[SCM] x264 packaging branch, ubuntu, updated. debian/0.svn20090621+git364d7d-0ubuntu2-10-g4af53c3

Wed Feb 17 23:29:34 UTC 2010

The following commit has been merged in the ubuntu branch:
commit 2c2e34a016dc6c32c1596ceb67d34e924177799d
Author: Reinhard Tartler <siretart at tauware.de>
Date:   Sat Feb 13 18:04:43 2010 +0100

    Imported Upstream version 0.svn20100213+gitfcf70c

diff --git a/.gitignore b/.gitignore
index 308b793..9d8cb70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
 .depend
 config.h
 config.mak
+config.log
 x264
 checkasm
 
diff --git a/AUTHORS b/AUTHORS
index 289a2a9..a0f3329 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -34,7 +34,7 @@ D: BeOS and MacOS X ports.
 S: France
 
 N: Gabriel Bouvigne
-E: gabriel.bouvigne AT joost DOT com
+E: bouvigne AT mp3-tech DOT org
 D: 2pass VBV
 
 N: Guillaume Poirier
diff --git a/Makefile b/Makefile
index 594e98b..3ac975d 100644
--- a/Makefile
+++ b/Makefile
@@ -10,9 +10,36 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
        common/quant.c common/vlc.c \
        encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
        encoder/set.c encoder/macroblock.c encoder/cabac.c \
-       encoder/cavlc.c encoder/encoder.c
+       encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
 
-SRCCLI = x264.c matroska.c muxers.c
+SRCCLI = x264.c input/yuv.c input/y4m.c output/raw.c \
+         output/matroska.c output/matroska_ebml.c \
+         output/flv.c output/flv_bytestream.c
+
+SRCSO =
+
+CONFIG := $(shell cat config.h)
+
+# Optional muxer module sources
+ifneq ($(findstring AVS_INPUT, $(CONFIG)),)
+SRCCLI += input/avs.c
+endif
+
+ifneq ($(findstring HAVE_PTHREAD, $(CONFIG)),)
+SRCCLI += input/thread.c
+endif
+
+ifneq ($(findstring LAVF_INPUT, $(CONFIG)),)
+SRCCLI += input/lavf.c
+endif
+
+ifneq ($(findstring FFMS_INPUT, $(CONFIG)),)
+SRCCLI += input/ffms.c
+endif
+
+ifneq ($(findstring MP4_OUTPUT, $(CONFIG)),)
+SRCCLI += output/mp4.c
+endif
 
 # Visualization sources
 ifeq ($(VIS),yes)
@@ -48,11 +75,20 @@ endif
 
 # AltiVec optims
 ifeq ($(ARCH),PPC)
-ALTIVECSRC += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
-              common/ppc/quant.c common/ppc/deblock.c \
-              common/ppc/predict.c
-SRCS += $(ALTIVECSRC)
-$(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+        common/ppc/quant.c common/ppc/deblock.c \
+        common/ppc/predict.c
+endif
+
+# NEON optims
+ifeq ($(ARCH),ARM)
+ifneq ($(AS),)
+ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
+          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
+          common/arm/predict-a.S
+SRCS   += common/arm/mc-c.c common/arm/predict-c.c
+OBJASM  = $(ASMSRC:%.S=%.o)
+endif
 endif
 
 # VIS optims
@@ -65,8 +101,15 @@ ifneq ($(HAVE_GETOPT_LONG),1)
 SRCS += extras/getopt.c
 endif
 
+ifneq ($(SONAME),)
+ifeq ($(SYS),MINGW)
+SRCSO += x264dll.c
+endif
+endif
+
 OBJS = $(SRCS:%.c=%.o)
 OBJCLI = $(SRCCLI:%.c=%.o)
+OBJSO = $(SRCSO:%.c=%.o)
 DEP  = depend
 
 .PHONY: all default fprofiled clean distclean install uninstall dox test testclean
@@ -77,23 +120,26 @@ libx264.a: .depend $(OBJS) $(OBJASM)
 	$(AR) rc libx264.a $(OBJS) $(OBJASM)
 	$(RANLIB) libx264.a
 
-$(SONAME): .depend $(OBJS) $(OBJASM)
-	$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
+$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+	$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
 
-x264$(EXE): $(OBJCLI) libx264.a 
-	$(CC) -o $@ $+ $(LDFLAGS)
+x264$(EXE): $(OBJCLI) libx264.a
+	$(CC) -o $@ $+ $(LDFLAGS) $(LDFLAGSCLI)
 
 checkasm: tools/checkasm.o libx264.a
 	$(CC) -o $@ $+ $(LDFLAGS)
 
 %.o: %.asm
 	$(AS) $(ASFLAGS) -o $@ $<
-# delete local/anonymous symbols, so they don't show up in oprofile
-	-@ $(STRIP) -x $@
+	-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
+
+%.o: %.S
+	$(AS) $(ASFLAGS) -o $@ $<
+	-@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
 
 .depend: config.mak
-	rm -f .depend
-	$(foreach SRC, $(SRCS) $(SRCCLI), $(CC) $(CFLAGS) $(ALTIVECFLAGS) $(SRC) -MT $(SRC:%.c=%.o) -MM -g0 1>> .depend;)
+	@rm -f .depend
+	@$(foreach SRC, $(SRCS) $(SRCCLI) $(SRCSO), $(CC) $(CFLAGS) $(SRC) -MT $(SRC:%.c=%.o) -MM -g0 1>> .depend;)
 
 config.mak:
 	./configure
@@ -105,12 +151,12 @@ endif
 
 SRC2 = $(SRCS) $(SRCCLI)
 # These should cover most of the important codepaths
-OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --no-ssim --no-psnr
-OPT1 = --crf 16 -b2 -m3 -r3 --me hex -8 --direct spatial --no-dct-decimate
-OPT2 = --crf 26 -b2 -m5 -r2 --me hex -8 -w --cqm jvt --nr 100
-OPT3 = --crf 18 -b3 -m9 -r5 --me umh -8 -t1 -A all --mixed-refs -w --b-pyramid --direct auto --no-fast-pskip
-OPT4 = --crf 22 -b3 -m7 -r4 --me esa -8 -t2 -A all --mixed-refs --psy-rd 1.0:1.0
-OPT5 = --frames 50 --crf 24 -b3 -m9 -r3 --me tesa -8 -t1 --mixed-refs
+OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb
+OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0  --slice-max-mbs 50
+OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500
+OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree
+OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4
+OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2
 OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall
 OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac
 
@@ -125,7 +171,7 @@ fprofiled:
 	mv config.mak config.mak2
 	sed -e 's/CFLAGS.*/& -fprofile-generate/; s/LDFLAGS.*/& -fprofile-generate/' config.mak2 > config.mak
 	$(MAKE) x264$(EXE)
-	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) $(V) --progress -o $(DEVNULL) ;))
+	$(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;))
 	rm -f $(SRC2:%.c=%.o)
 	sed -e 's/CFLAGS.*/& -fprofile-use/; s/LDFLAGS.*/& -fprofile-use/' config.mak2 > config.mak
 	$(MAKE)
@@ -134,13 +180,13 @@ fprofiled:
 endif
 
 clean:
-	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
+	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a x264 x264.exe .depend TAGS
 	rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
 	- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
 
 distclean: clean
-	rm -f config.mak config.h x264.pc
+	rm -f config.mak config.h config.log x264.pc
 	rm -rf test/
 
 install: x264$(EXE) $(SONAME)
diff --git a/build/win32/libx264.vcproj b/build/win32/libx264.vcproj
deleted file mode 100644
index 8497a35..0000000
--- a/build/win32/libx264.vcproj
+++ /dev/null
@@ -1,995 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
-	ProjectType="Visual C++"
-	Version="7.10"
-	Name="libx264"
-	ProjectGUID="{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}"
-	SccProjectName=""
-	SccLocalPath="">
-	<Platforms>
-		<Platform
-			Name="Win32"/>
-	</Platforms>
-	<Configurations>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory=".\Release"
-			IntermediateDirectory=".\Release"
-			ConfigurationType="4"
-			UseOfMFC="0"
-			ATLMinimizesCRunTimeLibraryUsage="FALSE"
-			CharacterSet="2">
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				InlineFunctionExpansion="2"
-				OptimizeForProcessor="2"
-				AdditionalIncludeDirectories="../../common;../../extras;../.."
-				PreprocessorDefinitions="NDEBUG;_LIB;WIN32;__X264__;HAVE_MMX;ARCH_X86;HAVE_STDINT_H"
-				StringPooling="TRUE"
-				RuntimeLibrary="4"
-				EnableFunctionLevelLinking="TRUE"
-				PrecompiledHeaderFile=".\Release/libx264.pch"
-				AssemblerListingLocation=".\Release/"
-				ObjectFile=".\Release/"
-				ProgramDataBaseFileName=".\Release/"
-				WarningLevel="3"
-				SuppressStartupBanner="TRUE"
-				CompileAs="0"/>
-			<Tool
-				Name="VCCustomBuildTool"/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="bin/libx264.lib"
-				SuppressStartupBanner="TRUE"/>
-			<Tool
-				Name="VCMIDLTool"/>
-			<Tool
-				Name="VCPostBuildEventTool"/>
-			<Tool
-				Name="VCPreBuildEventTool"/>
-			<Tool
-				Name="VCPreLinkEventTool"/>
-			<Tool
-				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="NDEBUG"
-				Culture="2052"/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"/>
-			<Tool
-				Name="VCManagedWrapperGeneratorTool"/>
-			<Tool
-				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
-		</Configuration>
-		<Configuration
-			Name="Debug|Win32"
-			OutputDirectory=".\Debug"
-			IntermediateDirectory=".\Debug"
-			ConfigurationType="4"
-			UseOfMFC="0"
-			ATLMinimizesCRunTimeLibraryUsage="FALSE"
-			CharacterSet="2">
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories="../../common;../../extras;../.."
-				PreprocessorDefinitions="_DEBUG;_LIB;WIN32;__X264__;HAVE_MMX;ARCH_X86;HAVE_STDINT_H"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="5"
-				PrecompiledHeaderFile=".\Debug/libx264.pch"
-				AssemblerListingLocation=".\Debug/"
-				ObjectFile=".\Debug/"
-				ProgramDataBaseFileName=".\Debug/"
-				WarningLevel="3"
-				SuppressStartupBanner="TRUE"
-				DebugInformationFormat="3"
-				CompileAs="0"/>
-			<Tool
-				Name="VCCustomBuildTool"/>
-			<Tool
-				Name="VCLibrarianTool"
-				OutputFile="bin/libx264.lib"
-				SuppressStartupBanner="TRUE"/>
-			<Tool
-				Name="VCMIDLTool"/>
-			<Tool
-				Name="VCPostBuildEventTool"/>
-			<Tool
-				Name="VCPreBuildEventTool"/>
-			<Tool
-				Name="VCPreLinkEventTool"/>
-			<Tool
-				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="_DEBUG"
-				Culture="2052"/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"/>
-			<Tool
-				Name="VCManagedWrapperGeneratorTool"/>
-			<Tool
-				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
-		</Configuration>
-	</Configurations>
-	<References>
-	</References>
-	<Files>
-		<Filter
-			Name="Enc"
-			Filter=".c">
-			<File
-				RelativePath="..\..\encoder\analyse.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\cabac.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\cavlc.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\encoder.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\macroblock.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\me.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\ratecontrol.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\encoder\set.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/enc_release/"
-						ProgramDataBaseFileName="obj/enc_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/enc_debug/"
-						ProgramDataBaseFileName="obj/enc_debug/"/>
-				</FileConfiguration>
-			</File>
-			<Filter
-				Name="enc_h"
-				Filter=".h">
-				<File
-					RelativePath="..\..\encoder\analyse.h">
-				</File>
-				<File
-					RelativePath="..\..\encoder\macroblock.h">
-				</File>
-				<File
-					RelativePath="..\..\encoder\me.h">
-				</File>
-				<File
-					RelativePath="..\..\encoder\ratecontrol.h">
-				</File>
-				<File
-					RelativePath="..\..\encoder\set.h">
-				</File>
-			</Filter>
-		</Filter>
-		<Filter
-			Name="Core"
-			Filter=".c;.h;">
-			<File
-				RelativePath="..\..\common\cabac.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\common.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\cpu.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\dct.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\vlc.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\frame.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\macroblock.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\x86\mc-c.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\mc.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\mdate.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\pixel.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\predict.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\x86\predict-c.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/core_release/"
-						ProgramDataBaseFileName="obj/core_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/core_debug/"
-						ProgramDataBaseFileName="obj/core_debug/"/>
-				</FileConfiguration>
-			</File>
-			<File
-				RelativePath="..\..\common\quant.c">
-			</File>
-			<File
-				RelativePath="..\..\common\set.c">
-			</File>
-			<Filter
-				Name="core_h"
-				Filter=".h">
-				<File
-					RelativePath="..\..\common\bs.h">
-				</File>
-				<File
-					RelativePath="..\..\common\cabac.h">
-				</File>
-				<File
-					RelativePath="..\..\common\common.h">
-				</File>
-				<File
-					RelativePath="..\..\common\cpu.h">
-				</File>
-				<File
-					RelativePath="..\..\common\dct.h">
-				</File>
-				<File
-					RelativePath="..\..\common\frame.h">
-				</File>
-				<File
-					RelativePath="..\..\common\macroblock.h">
-				</File>
-				<File
-					RelativePath="..\..\common\mc.h">
-				</File>
-				<File
-					RelativePath="..\..\common\osdep.h">
-				</File>
-				<File
-					RelativePath="..\..\common\pixel.h">
-				</File>
-				<File
-					RelativePath="..\..\common\predict.h">
-				</File>
-				<File
-					RelativePath="..\..\common\quant.h">
-				</File>
-				<File
-					RelativePath="..\..\common\set.h">
-				</File>
-				<File
-					RelativePath="..\..\common\vlc.h">
-				</File>
-			</Filter>
-			<Filter
-				Name="x86"
-				Filter=".asm">
-				<File
-					RelativePath="..\..\common\x86\cpu-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\dct-32.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\dct-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\cabac-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\deblock-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\x86inc.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\mc-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\mc-a2.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\pixel-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\pixel-32.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\predict-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\quant-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-				<File
-					RelativePath="..\..\common\x86\sad-a.asm">
-					<FileConfiguration
-						Name="Release|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-					<FileConfiguration
-						Name="Debug|Win32">
-						<Tool
-							Name="VCCustomBuildTool"
-							Description="Assembly $(InputPath)"
-							CommandLine="yasm -I..\..\common\x86\ -f win32 -O2 -DPREFIX -o &quot;$(IntDir)\$(InputName)&quot;.obj &quot;$(InputPath)&quot;"
-							Outputs="$(IntDir)\$(InputName).obj"/>
-					</FileConfiguration>
-				</File>
-			</Filter>
-		</Filter>
-		<Filter
-			Name="extras"
-			Filter=".c">
-			<File
-				RelativePath="..\..\extras\getopt.c">
-				<FileConfiguration
-					Name="Release|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="2"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						ObjectFile=".\obj/util_release/"
-						ProgramDataBaseFileName="obj/util_release/"/>
-				</FileConfiguration>
-				<FileConfiguration
-					Name="Debug|Win32">
-					<Tool
-						Name="VCCLCompilerTool"
-						Optimization="0"
-						AdditionalIncludeDirectories=""
-						PreprocessorDefinitions=""
-						BasicRuntimeChecks="3"
-						ObjectFile=".\obj/util_debug/"
-						ProgramDataBaseFileName="obj/util_debug/"/>
-				</FileConfiguration>
-			</File>
-			<Filter
-				Name="extras_h"
-				Filter=".h">
-				<File
-					RelativePath="..\..\extras\getopt.h">
-				</File>
-				<File
-					RelativePath="..\..\extras\stdint.h">
-				</File>
-			</Filter>
-		</Filter>
-	</Files>
-	<Globals>
-	</Globals>
-</VisualStudioProject>
diff --git a/build/win32/x264.sln b/build/win32/x264.sln
deleted file mode 100644
index 8fb518d..0000000
--- a/build/win32/x264.sln
+++ /dev/null
@@ -1,30 +0,0 @@
-Microsoft Visual Studio Solution File, Format Version 8.00
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libx264", "libx264.vcproj", "{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}"
-	ProjectSection(ProjectDependencies) = postProject
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "x264", "x264.vcproj", "{22E1814D-7955-4456-AEA5-0C9BA7500792}"
-	ProjectSection(ProjectDependencies) = postProject
-		{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6} = {A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}
-	EndProjectSection
-EndProject
-Global
-	GlobalSection(SolutionConfiguration) = preSolution
-		Debug = Debug
-		Release = Release
-	EndGlobalSection
-	GlobalSection(ProjectConfiguration) = postSolution
-		{22E1814D-7955-4456-AEA5-0C9BA7500792}.Debug.ActiveCfg = Debug|Win32
-		{22E1814D-7955-4456-AEA5-0C9BA7500792}.Debug.Build.0 = Debug|Win32
-		{22E1814D-7955-4456-AEA5-0C9BA7500792}.Release.ActiveCfg = Release|Win32
-		{22E1814D-7955-4456-AEA5-0C9BA7500792}.Release.Build.0 = Release|Win32
-		{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Debug.ActiveCfg = Debug|Win32
-		{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Debug.Build.0 = Debug|Win32
-		{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Release.ActiveCfg = Release|Win32
-		{A8D6E4CD-1885-4B03-8E41-5F3DB825BAA6}.Release.Build.0 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-	EndGlobalSection
-	GlobalSection(ExtensibilityAddIns) = postSolution
-	EndGlobalSection
-EndGlobal
diff --git a/build/win32/x264.vcproj b/build/win32/x264.vcproj
deleted file mode 100644
index c567265..0000000
--- a/build/win32/x264.vcproj
+++ /dev/null
@@ -1,178 +0,0 @@
-<?xml version="1.0" encoding="Windows-1252"?>
-<VisualStudioProject
-	ProjectType="Visual C++"
-	Version="7.10"
-	Name="x264"
-	ProjectGUID="{22E1814D-7955-4456-AEA5-0C9BA7500792}"
-	SccProjectName=""
-	SccLocalPath="">
-	<Platforms>
-		<Platform
-			Name="Win32"/>
-	</Platforms>
-	<Configurations>
-		<Configuration
-			Name="Debug|Win32"
-			OutputDirectory=".\obj/x264_Debug"
-			IntermediateDirectory=".\obj/x264_Debug"
-			ConfigurationType="1"
-			UseOfMFC="0"
-			ATLMinimizesCRunTimeLibraryUsage="FALSE"
-			CharacterSet="2">
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="0"
-				AdditionalIncludeDirectories="../../common;../../extras;../.."
-				PreprocessorDefinitions="_DEBUG;_CONSOLE;WIN32;__X264__;HAVE_MMX;HAVE_STDINT_H;AVIS_INPUT"
-				BasicRuntimeChecks="3"
-				RuntimeLibrary="5"
-				UsePrecompiledHeader="2"
-				PrecompiledHeaderFile=".\obj/x264_Debug/x264.pch"
-				AssemblerListingLocation=".\obj/x264_Debug/"
-				ObjectFile=".\obj/x264_Debug/"
-				ProgramDataBaseFileName=".\obj/x264_Debug/"
-				WarningLevel="3"
-				SuppressStartupBanner="TRUE"
-				DebugInformationFormat="3"
-				CompileAs="0"/>
-			<Tool
-				Name="VCCustomBuildTool"/>
-			<Tool
-				Name="VCLinkerTool"
-				AdditionalDependencies="vfw32.lib winmm.lib"
-				OutputFile="bin/x264.exe"
-				LinkIncremental="1"
-				SuppressStartupBanner="TRUE"
-				GenerateDebugInformation="TRUE"
-				ProgramDatabaseFile=".\obj/x264_Debug/x264.pdb"
-				SubSystem="1"
-				TargetMachine="1"/>
-			<Tool
-				Name="VCMIDLTool"
-				TypeLibraryName=".\obj/x264_Debug/x264.tlb"
-				HeaderFileName=""/>
-			<Tool
-				Name="VCPostBuildEventTool"/>
-			<Tool
-				Name="VCPreBuildEventTool"/>
-			<Tool
-				Name="VCPreLinkEventTool"/>
-			<Tool
-				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="_DEBUG"
-				Culture="2052"/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"/>
-			<Tool
-				Name="VCWebDeploymentTool"/>
-			<Tool
-				Name="VCManagedWrapperGeneratorTool"/>
-			<Tool
-				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
-		</Configuration>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory=".\obj/x264_Release"
-			IntermediateDirectory=".\obj/x264_Release"
-			ConfigurationType="1"
-			UseOfMFC="0"
-			ATLMinimizesCRunTimeLibraryUsage="FALSE"
-			CharacterSet="2">
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="../../common;../../extras;../.."
-				PreprocessorDefinitions="NDEBUG;_CONSOLE;WIN32;__X264__;HAVE_MMX;HAVE_STDINT_H;AVIS_INPUT"
-				StringPooling="TRUE"
-				RuntimeLibrary="0"
-				EnableFunctionLevelLinking="TRUE"
-				UsePrecompiledHeader="2"
-				PrecompiledHeaderFile=".\obj/x264_Release/x264.pch"
-				AssemblerListingLocation=".\obj/x264_Release/"
-				ObjectFile=".\obj/x264_Release/"
-				ProgramDataBaseFileName=".\obj/x264_Release/"
-				WarningLevel="3"
-				SuppressStartupBanner="TRUE"
-				CompileAs="0"/>
-			<Tool
-				Name="VCCustomBuildTool"/>
-			<Tool
-				Name="VCLinkerTool"
-				AdditionalDependencies="vfw32.lib winmm.lib"
-				OutputFile="bin/x264.exe"
-				LinkIncremental="1"
-				SuppressStartupBanner="TRUE"
-				ProgramDatabaseFile=".\obj/x264_Release/x264.pdb"
-				SubSystem="1"
-				TargetMachine="1"/>
-			<Tool
-				Name="VCMIDLTool"
-				TypeLibraryName=".\obj/x264_Release/x264.tlb"
-				HeaderFileName=""/>
-			<Tool
-				Name="VCPostBuildEventTool"/>
-			<Tool
-				Name="VCPreBuildEventTool"/>
-			<Tool
-				Name="VCPreLinkEventTool"/>
-			<Tool
-				Name="VCResourceCompilerTool"
-				PreprocessorDefinitions="NDEBUG"
-				Culture="2052"/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"/>
-			<Tool
-				Name="VCWebDeploymentTool"/>
-			<Tool
-				Name="VCManagedWrapperGeneratorTool"/>
-			<Tool
-				Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
-		</Configuration>
-	</Configurations>
-	<References>
-	</References>
-	<Files>
-		<File
-			RelativePath="..\..\matroska.c">
-		</File>
-		<File
-			RelativePath="..\..\matroska.h">
-		</File>
-		<File
-			RelativePath="..\..\muxers.c">
-		</File>
-		<File
-			RelativePath="..\..\muxers.h">
-		</File>
-		<File
-			RelativePath="..\..\x264.c">
-			<FileConfiguration
-				Name="Debug|Win32">
-				<Tool
-					Name="VCCLCompilerTool"
-					Optimization="0"
-					AdditionalIncludeDirectories=""
-					PreprocessorDefinitions=""
-					BasicRuntimeChecks="3"/>
-			</FileConfiguration>
-			<FileConfiguration
-				Name="Release|Win32">
-				<Tool
-					Name="VCCLCompilerTool"
-					Optimization="2"
-					AdditionalIncludeDirectories=""
-					PreprocessorDefinitions=""/>
-			</FileConfiguration>
-		</File>
-		<File
-			RelativePath="..\..\x264.h">
-		</File>
-	</Files>
-	<Globals>
-	</Globals>
-</VisualStudioProject>
diff --git a/common/arm/asm.S b/common/arm/asm.S
new file mode 100644
index 0000000..d163165
--- /dev/null
+++ b/common/arm/asm.S
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+        .macro require8, val=1          // emit EABI build attribute 24 = \val (Tag_ABI_align_needed per the ARM EABI addenda -- confirm)
+ELF     .eabi_attribute 24, \val        // ELF only: elsewhere the ELF prefix expands to @ and turns this line into a comment
+        .endm
+
+        .macro preserve8, val=1         // emit EABI build attribute 25 = \val (Tag_ABI_align_preserved per the ARM EABI addenda -- confirm)
+ELF     .eabi_attribute 25, \val        // ELF only: elsewhere the ELF prefix expands to @ and turns this line into a comment
+        .endm
+
+        .macro function name            // open an exported function \name; every user closes it with .endfunc
+        .global \name                   // make the symbol visible to the linker
+ELF     .hidden \name                   // ELF only: hidden visibility
+ELF     .type   \name, %function        // ELF only: mark the symbol as a function
+        .func   \name                   // start of function for debug info, paired with .endfunc at the use site
+\name:                                  // entry label
+        .endm
+
+        .macro movrel rd, val           // load the 32-bit value/address \val into register \rd
+#if defined(HAVE_ARMV6T2) && !defined(PIC)
+        movw            \rd, #:lower16:\val     // low 16 bits as an immediate (non-PIC builds with HAVE_ARMV6T2 only)
+        movt            \rd, #:upper16:\val     // high 16 bits, low half kept
+#else
+        ldr             \rd, =\val              // fallback: assembler literal-pool load
+#endif
+        .endm
+
+.macro movconst rd, val                 // load the assemble-time constant \val into register \rd
+#ifdef HAVE_ARMV6T2
+    movw        \rd, #:lower16:\val     // low 16 bits as an immediate
+.if \val >> 16                          // emit movt only when the upper half is non-zero
+    movt        \rd, #:upper16:\val     // high 16 bits, low half kept
+.endif
+#else
+    ldr         \rd, =\val              // fallback: assembler literal-pool load
+#endif
+.endm
+
+#define FENC_STRIDE 16
+#define FDEC_STRIDE 32
+
+.macro HORIZ_ADD dest, a, b             // horizontal sum of the u16 lanes of \a (plus \b if given) into \dest; \a is clobbered
+.ifnb \b                                // third argument is optional
+    vadd.u16    \a, \a, \b              // fold \b into \a lane by lane
+.endif
+    vpaddl.u16  \a, \a                  // add adjacent u16 pairs, widening to u32 lanes
+    vpaddl.u32  \dest, \a               // add adjacent u32 pairs, widening to u64 lanes
+.endm
+
+.macro SUMSUB_AB sum, diff, a, b        // butterfly on s16 lanes; \sum is written first, so it must not alias an input that \diff still needs
+    vadd.s16    \sum,  \a, \b           // sum  = a + b
+    vsub.s16    \diff, \a, \b           // diff = a - b
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d   // two independent butterflies
+    SUMSUB_AB   \s1, \d1, \a, \b        // s1 = a + b, d1 = a - b
+    SUMSUB_AB   \s2, \d2, \c, \d        // s2 = c + d, d2 = c - d
+.endm
+
+.macro ABS2 a b                         // in-place absolute value of the s16 lanes of two registers
+    vabs.s16 \a, \a                     // a = |a|
+    vabs.s16 \b, \b                     // b = |b|
+.endm
+
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
+// op = sumsub/amax (sum and diff / maximum of absolutes)
+// d1/2 = destination registers
+// s1/2 = source registers
+.macro HADAMARD dist, op, d1, d2, s1, s2    // interleave \s1/\s2 in place, then combine them into \d1/\d2
+.if \dist == 1
+    vtrn.16     \s1, \s2                // dist 1: interleave at 16-bit granularity
+.else                                   // any other dist value (including 0) takes this 32-bit path
+    vtrn.32     \s1, \s2
+.endif
+.ifc \op, sumsub                        // string compare on the op argument
+    SUMSUB_AB   \d1, \d2, \s1, \s2      // d1 = s1 + s2, d2 = s1 - s2
+.else                                   // any other op: only \d1 is produced, \d2 is left untouched
+    vabs.s16    \s1, \s1                // sources are overwritten with their absolute values
+    vabs.s16    \s2, \s2
+    vmax.s16    \d1, \s1, \s2           // d1 = max(|s1|, |s2|) per lane
+.endif
+.endm
+
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7     // in-place transpose of 8 rows of 8-bit elements, one row per register
+    vtrn.32         \r0, \r4            // stage 1: exchange 32-bit groups between rows n and n+4
+    vtrn.32         \r1, \r5
+    vtrn.32         \r2, \r6
+    vtrn.32         \r3, \r7
+    vtrn.16         \r0, \r2            // stage 2: exchange 16-bit groups between rows n and n+2
+    vtrn.16         \r1, \r3
+    vtrn.16         \r4, \r6
+    vtrn.16         \r5, \r7
+    vtrn.8          \r0, \r1            // stage 3: exchange bytes between adjacent rows
+    vtrn.8          \r2, \r3
+    vtrn.8          \r4, \r5
+    vtrn.8          \r6, \r7
+.endm
+
+.macro TRANSPOSE4x4 r0 r1 r2 r3         // in-place 4x4 transpose of 8-bit elements, done within each 32-bit group of the registers
+    vtrn.16         \r0, \r2            // exchange 16-bit groups between rows n and n+2
+    vtrn.16         \r1, \r3
+    vtrn.8          \r0, \r1            // exchange bytes between adjacent rows
+    vtrn.8          \r2, \r3
+.endm
+
+.macro TRANSPOSE4x4_16  d0 d1 d2 d3     // in-place 4x4 transpose of 16-bit elements, done within each 64-bit group of the registers
+    vtrn.32     \d0, \d2                // exchange 32-bit groups between rows n and n+2
+    vtrn.32     \d1, \d3
+    vtrn.16     \d0, \d1                // exchange 16-bit elements between adjacent rows
+    vtrn.16     \d2, \d3
+.endm
diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
new file mode 100644
index 0000000..40eff03
--- /dev/null
+++ b/common/arm/cpu-a.S
@@ -0,0 +1,106 @@
+/*****************************************************************************
+ * cpu-a.S: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.align
+
+// done in gas because .fpu neon overrides the refusal to assemble
+// instructions the selected -march/-mcpu doesn't support
+function x264_cpu_neon_test             // executes a single NEON instruction and returns; no arguments, no meaningful return value
+    vadd.i16    q0, q0, q0              // arbitrary NEON op; presumably the caller traps the fault on CPUs without NEON -- confirm in the C caller
+    bx          lr
+.endfunc
+
+// return: 0 on success
+//         1 if counters were already enabled
+//         9 if lo-res counters were already enabled
+function x264_cpu_enable_armv7_counter       // returns the previous counter state in r0 (codes listed above); clobbers r2
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
+    ands        r0, r2, #1                  // r0 = old enable bit; Z is set when the counters were off
+    andne       r0, r2, #9                  // already on: return the enable bit plus bit 3 of PMNC (1 or 9)
+
+    orr         r2, r2, #1                  // enable counters
+    bic         r2, r2, #8                  // full resolution
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC (only when the counters were off; flags still come from the ands)
+    mov         r2, #1 << 31                // enable cycle counter
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS (unconditional)
+    bx          lr
+.endfunc
+
+function x264_cpu_disable_armv7_counter     // clears the enable bit in PMNC; clobbers r0, which holds the written PMNC value on return
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
+    bic         r0, r0, #1                  // disable counters
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
+    bx          lr
+.endfunc
+
+
+.macro READ_TIME r                      // read the current timestamp into core register \r
+    mrc         p15, 0, \r, c9, c13, 0  // CP15 c9/c13/0; presumably the cycle counter enabled by x264_cpu_enable_armv7_counter -- confirm against the ARMv7 ARM
+.endm
+
+// return: 0 if neon -> arm transfers take more than 10 cycles
+//         nonzero otherwise
+function x264_cpu_fast_neon_mrc_test    // times NEON -> ARM register moves with the performance counter; result in r0
+    // check for user access to performance counters
+    mrc         p15, 0, r0, c9, c14, 0
+    cmp         r0, #0
+    bxeq        lr                      // register reads as zero: give up and return 0
+
+    push        {r4-r6,lr}              // lr is saved because it is used as scratch in the timing loop
+    bl          x264_cpu_enable_armv7_counter
+    ands        r1, r0, #8              // Z clear when the enable call reported the lo-res bit (return value 9)
+    mov         r3, #0                  // r3 = sum of accepted timings
+    mov         ip, #4                  // ip = accepted samples still needed
+    mov         r6, #4                  // r6 = countdown that keeps the outlier check below active
+    moveq       r5, #1                  // full resolution: one pass of the timed loop per sample
+    movne       r5, #64                 // lo-res: 64 passes per sample (presumably the counter ticks every 64 cycles -- confirm)
+
+average_loop:
+    mov         r4, r5                  // r4 = passes left in this sample
+    READ_TIME   r1                      // start timestamp
+1:  subs        r4, r4, #1
+.rept 8                                 // 8 transfers per pass
+    vmov.u32    lr, d0[0]               // NEON -> ARM transfer under test
+    add         lr, lr, lr              // consumes the transferred value, so the transfer must complete
+.endr
+    bgt         1b                      // flags come from the subs above; the loop body does not set them
+    READ_TIME   r2                      // end timestamp
+
+    subs        r6, r6, #1
+    sub         r2, r2, r1              // r2 = elapsed count for this sample
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
+    addle       r3, r3, r2              // accept the sample; once r6 has reached 0 the cmp is skipped and every sample is accepted
+    subles      ip, ip, #1
+    bgt         average_loop            // loop after a rejected sample, or while accepted samples are still needed
+
+    // disable counters if we enabled them
+    ands        r0, r0, #1              // r0 still holds the enable call's return value; bit 0 clear means the counters were off before
+    bleq        x264_cpu_disable_armv7_counter
+
+    lsr         r0, r3, #5              // average: sum / 32 (4 samples of 8 transfers)
+    cmp         r0, #10
+    movgt       r0, #0                  // more than 10 per transfer: report 0; otherwise r0 keeps the average
+    pop         {r4-r6,pc}
+.endfunc
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
new file mode 100644
index 0000000..0ed7238
--- /dev/null
+++ b/common/arm/dct-a.S
@@ -0,0 +1,663 @@
+/*****************************************************************************
+ * dct-a.S: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+
+scan4x4_frame:                          // 32-byte table of byte indices, mostly in consecutive pairs; its users lie outside this part of the file (presumably vtbl indices for the 4x4 frame scan -- confirm)
+.byte    0,1,   8,9,   2,3,   4,5
+.byte    2,3,   8,9,  16,17, 10,11
+.byte   12,13,  6,7,  14,15, 20,21
+.byte   10,11, 12,13,  6,7,  14,15
+
+.text
+
+// sum = a + (b>>shift)   sub = (a>>shift) - b
+.macro SUMSUB_SHR shift sum sub a b t0 t1   // \t0 and \t1 are scratch registers and get clobbered
+    vshr.s16    \t0,  \b, #\shift       // t0 = b >> shift (arithmetic)
+    vshr.s16    \t1,  \a, #\shift       // t1 = a >> shift, taken before \sum is written so \sum may alias \a
+    vadd.s16    \sum, \a, \t0           // sum = a + (b >> shift)
+    vsub.s16    \sub, \t1, \b           // sub = (a >> shift) - b
+.endm
+
+// sum = (a>>shift) + b   sub = a - (b>>shift)
+.macro SUMSUB_SHR2 shift sum sub a b t0 t1  // \t0 and \t1 are scratch registers and get clobbered
+    vshr.s16    \t0,  \a, #\shift       // t0 = a >> shift (arithmetic)
+    vshr.s16    \t1,  \b, #\shift       // t1 = b >> shift
+    vadd.s16    \sum, \t0, \b           // sum = (a >> shift) + b
+    vsub.s16    \sub, \a, \t1           // sub = a - (b >> shift)
+.endm
+
+// a += 1.5*ma   b -= 1.5*mb
+.macro SUMSUB_15 a b ma mb t0 t1        // \a and \b are updated in place; \t0 and \t1 are scratch
+    vshr.s16    \t0, \ma, #1            // t0 = ma >> 1
+    vshr.s16    \t1, \mb, #1            // t1 = mb >> 1
+    vadd.s16    \t0, \t0, \ma           // t0 = ma + (ma >> 1), the 1.5*ma term
+    vadd.s16    \t1, \t1, \mb           // t1 = mb + (mb >> 1), the 1.5*mb term
+    vadd.s16    \a,  \a,  \t0           // a += t0
+    vsub.s16    \b,  \b,  \t1           // b -= t1
+.endm
+
+
+function x264_dct4x4dc_neon             // in-place transform of the 16 s16 values at r0 (128-bit aligned); the last stage halves with rounding
+    vld1.64         {d0-d3}, [r0,:128]  // load 4 rows of 4 s16
+    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3  // first butterfly stage across rows
+    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7  // second butterfly stage across rows
+
+    vmov.s16        d31, #1             // rounding constant for the halving subtracts below
+    HADAMARD        1, sumsub, q2, q3, q0, q1   // vtrn.16 q0,q1 then q2 = q0 + q1, q3 = q0 - q1
+    vtrn.32         d4,  d5
+    vadd.s16        d16, d4,  d31       // d16 = d4 + 1
+    vtrn.32         d6,  d7
+    vadd.s16        d17, d6,  d31       // d17 = d6 + 1
+    vrhadd.s16      d0,  d4,  d5        // (d4 + d5 + 1) >> 1
+    vhsub.s16       d1,  d16, d5        // (d4 + 1 - d5) >> 1
+    vhsub.s16       d2,  d17, d7        // (d6 + 1 - d7) >> 1
+    vrhadd.s16      d3,  d6,  d7        // (d6 + d7 + 1) >> 1
+    vst1.64         {d0-d3}, [r0,:128]  // write the result back over the input
+    bx              lr
+.endfunc
+
+function x264_idct4x4dc_neon            // in-place transform of the 16 s16 values at r0 (128-bit aligned); same butterflies as x264_dct4x4dc_neon but no final halving
+    vld1.64         {d0-d3}, [r0,:128]  // load 4 rows of 4 s16
+    SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3  // first butterfly stage across rows
+    SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7  // second butterfly stage across rows
+
+    HADAMARD        1, sumsub, q2, q3, q0, q1   // vtrn.16 q0,q1 then q2 = q0 + q1, q3 = q0 - q1
+    HADAMARD        2, sumsub, d0, d1, d4, d5   // vtrn.32 d4,d5 then d0 = d4 + d5, d1 = d4 - d5
+    HADAMARD        2, sumsub, d3, d2, d6, d7   // vtrn.32 d6,d7 then d3 = d6 + d7, d2 = d6 - d7
+    vst1.64         {d0-d3}, [r0,:128]  // write the result back over the input
+    bx              lr
+.endfunc
+
+
+.macro DCT_1D d0 d1 d2 d3  d4 d5 d6 d7  // 4-point forward transform: outputs \d0..\d3, inputs \d4..\d7 (x0..x3), inputs are clobbered
+    SUMSUB_AB       \d1, \d6, \d5, \d6  // d1 = x1 + x2, d6 = x1 - x2
+    SUMSUB_AB       \d3, \d7, \d4, \d7  // d3 = x0 + x3, d7 = x0 - x3
+    vadd.s16        \d0, \d3, \d1       // out0 = (x0 + x3) + (x1 + x2)
+    vadd.s16        \d4, \d7, \d7       // d4 = 2 * (x0 - x3)
+    vadd.s16        \d5, \d6, \d6       // d5 = 2 * (x1 - x2)
+    vsub.s16        \d2, \d3, \d1       // out2 = (x0 + x3) - (x1 + x2)
+    vadd.s16        \d1, \d4, \d6       // out1 = 2 * (x0 - x3) + (x1 - x2)
+    vsub.s16        \d3, \d7, \d5       // out3 = (x0 - x3) - 2 * (x1 - x2)
+.endm
+
+function x264_sub4x4_dct_neon           // r0 = s16 output (16 coefficients), r1 = pixels read with FENC_STRIDE, r2 = pixels read with FDEC_STRIDE
+    mov             r3, #FENC_STRIDE
+    mov             ip, #FDEC_STRIDE
+    vld1.32         {d0[]}, [r1,:32], r3    // 4 pixels of row 0, replicated to both 32-bit lanes
+    vld1.32         {d1[]}, [r2,:32], ip
+    vld1.32         {d2[]}, [r1,:32], r3    // loads are interleaved with the subtracts that consume earlier rows
+    vsubl.u8        q8,  d0,  d1            // row 0 residual, widened to 16 bit
+    vld1.32         {d3[]}, [r2,:32], ip
+    vld1.32         {d4[]}, [r1,:32], r3
+    vsubl.u8        q9,  d2,  d3            // row 1 residual
+    vld1.32         {d5[]}, [r2,:32], ip
+    vld1.32         {d6[]}, [r1,:32], r3
+    vsubl.u8        q10, d4,  d5            // row 2 residual
+    vld1.32         {d7[]}, [r2,:32], ip
+    vsubl.u8        q11, d6,  d7            // row 3 residual
+
+    DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22  // first pass on the low halves of q8..q11
+    TRANSPOSE4x4_16 d0, d1, d2, d3
+    DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3      // second pass on the transposed data
+    vst1.64         {d4-d7}, [r0,:128]      // store 16 coefficients; r0 must be 128-bit aligned
+    bx              lr
+.endfunc
+
+function x264_sub8x4_dct_neon           // helper: r3 and ip must already hold the two strides (set by the callers below); advances r0 by 64 bytes and r1/r2 by 4 rows
+    vld1.64         {d0}, [r1,:64], r3      // 8 pixels per row from each source
+    vld1.64         {d1}, [r2,:64], ip
+    vsubl.u8        q8,  d0,  d1            // row 0 residual, widened to 16 bit
+    vld1.64         {d2}, [r1,:64], r3
+    vld1.64         {d3}, [r2,:64], ip
+    vsubl.u8        q9,  d2,  d3            // row 1 residual
+    vld1.64         {d4}, [r1,:64], r3
+    vld1.64         {d5}, [r2,:64], ip
+    vsubl.u8        q10, d4,  d5            // row 2 residual
+    vld1.64         {d6}, [r1,:64], r3
+    vld1.64         {d7}, [r2,:64], ip
+    vsubl.u8        q11, d6,  d7            // row 3 residual
+
+    DCT_1D          q0, q1, q2, q3,  q8, q9, q10, q11   // first pass over all 8 columns at once
+    TRANSPOSE4x4_16 q0, q1, q2, q3          // transposes the left and right 4x4 halves independently
+
+    SUMSUB_AB       q8,  q12, q0,  q3       // second pass, written out by hand: q8 = x0 + x3, q12 = x0 - x3
+    SUMSUB_AB       q9,  q10, q1,  q2       // q9 = x1 + x2, q10 = x1 - x2
+    vadd.i16        q13, q12, q12           // q13 = 2 * (x0 - x3)
+    vadd.i16        q11, q10, q10           // q11 = 2 * (x1 - x2)
+    vadd.i16        d0,  d16, d18           // low halves: rows 0..3 of the first 4x4 block go to d0..d3
+    vadd.i16        d1,  d26, d20
+    vsub.i16        d2,  d16, d18
+    vsub.i16        d3,  d24, d22
+    vst1.64         {d0-d1}, [r0,:128]!     // stores are interleaved with the arithmetic for the second block
+    vadd.i16        d4,  d17, d19           // high halves: rows 0..3 of the second 4x4 block go to d4..d7
+    vadd.i16        d5,  d27, d21
+    vst1.64         {d2-d3}, [r0,:128]!
+    vsub.i16        d6,  d17, d19
+    vsub.i16        d7,  d25, d23
+    vst1.64         {d4-d5}, [r0,:128]!
+    vst1.64         {d6-d7}, [r0,:128]!
+    bx              lr
+.endfunc
+
+function x264_sub8x8_dct_neon           // 8x8 block as two 8x4 calls; r0/r1/r2 are passed through to the helper
+    push            {lr}
+    mov             r3, #FENC_STRIDE    // strides expected by x264_sub8x4_dct_neon
+    mov             ip, #FDEC_STRIDE
+    bl              x264_sub8x4_dct_neon    // rows 0-3; the helper leaves r0/r1/r2 pointing at the next 4 rows
+    pop             {lr}
+    b               x264_sub8x4_dct_neon    // rows 4-7 as a tail call: the helper returns straight to our caller
+.endfunc
+
+function x264_sub16x16_dct_neon         // 16x16 block as eight 8x4 calls, visiting the 8x8 quadrants top-left, top-right, bottom-left, bottom-right
+    push            {lr}
+    mov             r3, #FENC_STRIDE    // strides expected by x264_sub8x4_dct_neon
+    mov             ip, #FDEC_STRIDE
+    bl              x264_sub8x4_dct_neon    // top-left quadrant, rows 0-3
+    bl              x264_sub8x4_dct_neon    // top-left quadrant, rows 4-7
+    sub             r1, r1, #8*FENC_STRIDE-8    // back up 8 rows and step 8 pixels right
+    sub             r2, r2, #8*FDEC_STRIDE-8
+    bl              x264_sub8x4_dct_neon    // top-right quadrant
+    bl              x264_sub8x4_dct_neon
+    sub             r1, r1, #8              // already 8 rows down; step 8 pixels back to the left edge
+    sub             r2, r2, #8
+    bl              x264_sub8x4_dct_neon    // bottom-left quadrant
+    bl              x264_sub8x4_dct_neon
+    sub             r1, r1, #8*FENC_STRIDE-8    // back up 8 rows and step 8 pixels right
+    sub             r2, r2, #8*FDEC_STRIDE-8
+    bl              x264_sub8x4_dct_neon    // bottom-right quadrant, rows 0-3
+    pop             {lr}
+    b               x264_sub8x4_dct_neon    // last 8x4 as a tail call
+.endfunc
+
+
+.macro DCT8_1D type                     // 8-point pass over q8..q15 in place; clobbers q0..q3; \type is not referenced in the body
+    SUMSUB_AB       q2,  q1,  q11, q12  // s34/d34
+    SUMSUB_AB       q3,  q11, q10, q13  // s25/d25
+    SUMSUB_AB       q13, q10, q9,  q14  // s16/d16
+    SUMSUB_AB       q14, q8,  q8,  q15  // s07/d07
+
+    SUMSUB_AB       q9,  q2,  q14, q2   // a0/a2
+    SUMSUB_AB       q12, q14, q13, q3   // a1/a3
+
+    SUMSUB_AB       q3,  q13, q8,  q1   // a6/a5
+    vshr.s16        q0,  q10, #1        // q0  = d16 >> 1
+    vshr.s16        q15, q11, #1        // q15 = d25 >> 1
+    vadd.s16        q0,  q0,  q10       // q0  = d16 + (d16 >> 1)
+    vadd.s16        q15, q15, q11       // q15 = d25 + (d25 >> 1)
+    vsub.s16        q3,  q3,  q0        // a6 -= q0
+    vsub.s16        q13, q13, q15       // a5 -= q15
+
+    SUMSUB_AB       q0,  q15, q10, q11  // a4/a7
+    vshr.s16        q10, q8,  #1        // q10 = d07 >> 1
+    vshr.s16        q11, q1,  #1        // q11 = d34 >> 1
+    vadd.s16        q10, q10, q8        // q10 = d07 + (d07 >> 1)
+    vadd.s16        q11, q11, q1        // q11 = d34 + (d34 >> 1)
+    vadd.s16        q10, q0,  q10       // a4 += q10
+    vadd.s16        q15, q15, q11       // a7 += q11
+
+    SUMSUB_AB       q8,  q12, q9,  q12  // q8 = a0 + a1, q12 = a0 - a1
+    SUMSUB_SHR   2, q9,  q15, q10, q15,  q0, q1     // q9 = a4 + (a7 >> 2), q15 = (a4 >> 2) - a7
+    SUMSUB_SHR   1, q10, q14, q2,  q14,  q0, q1     // q10 = a2 + (a3 >> 1), q14 = (a2 >> 1) - a3
+    SUMSUB_SHR2  2, q11, q13, q3,  q13,  q0, q1     // q11 = (a6 >> 2) + a5, q13 = a6 - (a5 >> 2)
+.endm
+
+function x264_sub8x8_dct8_neon          // r0 = s16 output (64 coefficients), r1 = pixels read with FENC_STRIDE, r2 = pixels read with FDEC_STRIDE; all three are advanced
+    mov             r3, #FENC_STRIDE
+    mov             ip, #FDEC_STRIDE
+    vld1.64         {d16}, [r1,:64], r3     // each row: 8 pixels from both sources, then a widening subtract into q8..q15
+    vld1.64         {d17}, [r2,:64], ip
+    vsubl.u8        q8,  d16, d17
+    vld1.64         {d18}, [r1,:64], r3
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21
+    vld1.64         {d22}, [r1,:64], r3
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23
+    vld1.64         {d24}, [r1,:64], r3
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25
+    vld1.64         {d26}, [r1,:64], r3
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29
+    vld1.64         {d30}, [r1,:64], r3
+    vld1.64         {d31}, [r2,:64], ip
+    vsubl.u8        q15, d30, d31
+
+    DCT8_1D row                 // first pass; the argument is only a label, the macro ignores it
+    vswp            d17, d24    // 8, 12
+    vswp            d21, d28    // 10,14
+    vtrn.32         q8,  q10    // the swaps and vtrn steps between the two passes rearrange q8..q15 (presumably a full 8x8 transpose of 16-bit elements -- confirm)
+    vtrn.32         q12, q14
+
+    vswp            d19, d26    // 9, 13
+    vswp            d23, d30    // 11,15
+    vtrn.32         q9,  q11
+    vtrn.32         q13, q15
+
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q8,  q9
+    vtrn.16         q14, q15
+    DCT8_1D col                 // second pass on the rearranged data
+
+    vst1.64         {d16-d19}, [r0,:128]!   // store q8..q15 = 128 bytes; r0 ends up past the block
+    vst1.64         {d20-d23}, [r0,:128]!
+    vst1.64         {d24-d27}, [r0,:128]!
+    vst1.64         {d28-d31}, [r0,:128]!
+    bx              lr
+.endfunc
+
+function x264_sub16x16_dct8_neon        // 16x16 block as four 8x8 calls: top-left, top-right, bottom-left, bottom-right
+    push            {lr}
+    bl              x264_sub8x8_dct8_neon   // top-left; the callee sets the strides itself and advances r0/r1/r2
+    sub             r1,  r1,  #FENC_STRIDE*8 - 8    // back up 8 rows and step 8 pixels right
+    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
+    bl              x264_sub8x8_dct8_neon   // top-right
+    sub             r1,  r1,  #8            // already 8 rows down; step 8 pixels back to the left edge
+    sub             r2,  r2,  #8
+    bl              x264_sub8x8_dct8_neon   // bottom-left
+    pop             {lr}                    // restore lr before the tail call
+    sub             r1,  r1,  #FENC_STRIDE*8 - 8    // back up 8 rows and step 8 pixels right
+    sub             r2,  r2,  #FDEC_STRIDE*8 - 8
+    b               x264_sub8x8_dct8_neon   // bottom-right as a tail call
+.endfunc
+
+
+// First part of IDCT (minus final SUMSUB_AB)
+.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3  // outputs \d4..\d7, inputs \d0..\d3 (x0..x3); the caller finishes with a butterfly on the outputs
+    SUMSUB_AB       \d4, \d5, \d0, \d2  // d4 = x0 + x2, d5 = x0 - x2
+    vshr.s16        \d7, \d1, #1        // d7 = x1 >> 1
+    vshr.s16        \d6, \d3, #1        // d6 = x3 >> 1
+    vsub.s16        \d7, \d7, \d3       // d7 = (x1 >> 1) - x3
+    vadd.s16        \d6, \d6, \d1       // d6 = (x3 >> 1) + x1
+.endm
+
+function x264_add4x4_idct_neon          // r0 = 4x4 pixel block (FDEC_STRIDE between rows, updated in place), r1 = 16 s16 coefficients (128-bit aligned)
+    mov             r2, #FDEC_STRIDE
+    vld1.64         {d0-d3}, [r1,:128]
+
+    IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3
+    vld1.32         {d30[0]}, [r0,:32], r2  // pixel row 0; pixel loads are interleaved with the arithmetic
+    SUMSUB_AB       q0, q1, q2, q3          // final butterfly: d0/d1 = rows 0/1, d2/d3 = rows 3/2 (note the swapped order)
+
+    TRANSPOSE4x4_16 d0, d1, d3, d2          // d3 before d2 to undo that swap
+
+    IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2
+    vld1.32         {d30[1]}, [r0,:32], r2  // pixel row 1
+    SUMSUB_AB       q0, q1, q2, q3          // again leaves rows 3/2 in d2/d3
+
+    vrshr.s16       q0, q0, #6              // rounding shift right by 6
+    vld1.32         {d31[1]}, [r0,:32], r2  // pixel row 2 goes in the high lane to match the row order in q1
+    vrshr.s16       q1, q1, #6
+    vld1.32         {d31[0]}, [r0,:32], r2  // pixel row 3 goes in the low lane
+
+    sub             r0, r0, r2, lsl #2      // rewind r0 by 4 rows
+    vaddw.u8        q0, q0, d30             // add the residual to the widened pixels
+    vaddw.u8        q1, q1, d31
+    vqmovun.s16     d0, q0                  // saturate back to unsigned 8 bit
+    vqmovun.s16     d2, q1
+
+    vst1.32         {d0[0]}, [r0,:32], r2   // row 0
+    vst1.32         {d0[1]}, [r0,:32], r2   // row 1
+    vst1.32         {d2[1]}, [r0,:32], r2   // row 2 (high lane, mirroring the loads)
+    vst1.32         {d2[0]}, [r0,:32], r2   // row 3
+    bx              lr
+.endfunc
+
+function x264_add8x4_idct_neon          // helper: r2 must already hold the pixel stride (set by the callers below); advances r1 by 64 bytes and r0 by 4 rows
+    vld1.64         {d0-d3}, [r1,:128]!     // coefficients of the first 4x4 block
+    IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3  // results go to the low halves of q8..q11
+    vld1.64         {d4-d7}, [r1,:128]!     // coefficients of the second 4x4 block
+    IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7  // results go to the high halves of q8..q11
+    SUMSUB_AB       q0,  q3,  q8,  q10      // final butterflies of the first pass, both blocks at once
+    SUMSUB_AB       q1,  q2,  q9,  q11
+
+    TRANSPOSE4x4_16 q0,  q1,  q2,  q3       // transposes both 4x4 halves independently
+
+    IDCT_1D         q8,  q9,  q10, q11, q0, q1, q2, q3  // second pass
+    SUMSUB_AB       q0,  q3,  q8,  q10
+    SUMSUB_AB       q1,  q2,  q9,  q11
+
+    vrshr.s16       q0,  q0,  #6            // rounding shift right by 6, interleaved with the pixel loads
+    vld1.32         {d28}, [r0,:64], r2     // 8 pixels of row 0
+    vrshr.s16       q1,  q1,  #6
+    vld1.32         {d29}, [r0,:64], r2
+    vrshr.s16       q2,  q2,  #6
+    vld1.32         {d30}, [r0,:64], r2
+    vrshr.s16       q3,  q3,  #6
+    vld1.32         {d31}, [r0,:64], r2
+
+    sub             r0,  r0,  r2,  lsl #2   // rewind r0 by 4 rows
+    vaddw.u8        q0,  q0,  d28           // add the residual to the widened pixels
+    vaddw.u8        q1,  q1,  d29
+    vaddw.u8        q2,  q2,  d30
+    vaddw.u8        q3,  q3,  d31
+
+    vqmovun.s16     d0,  q0                 // saturate back to unsigned 8 bit
+    vqmovun.s16     d1,  q1
+    vst1.32         {d0}, [r0,:64], r2      // stores leave r0 at the row after this 8x4 strip
+    vqmovun.s16     d2,  q2
+    vst1.32         {d1}, [r0,:64], r2
+    vqmovun.s16     d3,  q3
+    vst1.32         {d2}, [r0,:64], r2
+    vst1.32         {d3}, [r0,:64], r2
+    bx              lr
+.endfunc
+
+function x264_add8x8_idct_neon              // 8x8 pixel area handled as two stacked 8x4 calls; r0 = pixels, r1 = coefficients
+    mov             r2, #FDEC_STRIDE        // row stride consumed by x264_add8x4_idct_neon
+    mov             ip, lr                  // keep the return address, bl overwrites lr
+    bl              x264_add8x4_idct_neon   // rows 0-3; leaves r0 four rows down and r1 past the consumed coefficients
+    mov             lr, ip
+    b               x264_add8x4_idct_neon   // rows 4-7 as a tail call that returns straight to our caller
+.endfunc
+
+function x264_add16x16_idct_neon            // 16x16 pixel area handled as eight 8x4 calls; r0 = pixels, r1 = coefficients
+    mov             r2, #FDEC_STRIDE        // row stride consumed by x264_add8x4_idct_neon
+    mov             ip, lr                  // bl clobbers lr; the 8x4 routine does not touch ip
+    bl              x264_add8x4_idct_neon   // each call moves r0 down four rows and r1 forward by 32 coefficients
+    bl              x264_add8x4_idct_neon
+    sub             r0, r0, #8*FDEC_STRIDE-8    // back up eight rows and step eight pixels right
+    bl              x264_add8x4_idct_neon
+    bl              x264_add8x4_idct_neon
+    sub             r0, r0, #8              // now eight rows down: return to the left half
+    bl              x264_add8x4_idct_neon
+    bl              x264_add8x4_idct_neon
+    sub             r0, r0, #8*FDEC_STRIDE-8    // right half of the lower eight rows
+    bl              x264_add8x4_idct_neon
+    mov             lr, ip
+    b               x264_add8x4_idct_neon   // last call is a tail call
+.endfunc
+
+
+.macro IDCT8_1D type                        // one pass of the 8-point inverse transform, results in q8-q15; type is row or col
+.ifc \type, col
+    vswp            d21, d28                // col only: half-register swap; looks like part of the 8x8 transpose the caller starts - confirm
+.endif
+    SUMSUB_AB       q0,  q1,  q8,  q12              // a0/a2
+.ifc \type, row
+    vld1.64         {d28-d31}, [r1,:128]!   // row only: fetch the last 16 coefficients into q14/q15 while the first butterfly runs
+.else
+    vswp            d19, d26
+.endif
+    SUMSUB_SHR   1, q2,  q3,  q10, q14,  q8, q12    // a6/a4
+.ifc \type, col
+    vswp            d23, d30
+.endif
+    SUMSUB_AB       q8,  q10, q13, q11
+    SUMSUB_15       q8,  q10, q9,  q15,  q12, q14   // a7/a1
+    SUMSUB_AB       q14, q15, q15, q9
+    SUMSUB_15       q15, q14, q13, q11,  q12, q9    // a5/a3
+
+    SUMSUB_SHR   2, q13, q14, q14, q15,  q11, q9    // b3/b5
+    SUMSUB_SHR2  2, q12, q15, q8,  q10,  q11, q9    // b1/b7
+
+    SUMSUB_AB       q10, q2,  q0,  q2               // b0/b6
+    SUMSUB_AB       q11, q3,  q1,  q3               // b2/b4
+
+    SUMSUB_AB       q8,  q15, q10, q15      // final butterflies; the SUMSUB_* helpers are defined outside this chunk
+    SUMSUB_AB       q9,  q14, q11, q14
+    SUMSUB_AB       q10, q13, q3,  q13
+.ifc \type, row
+    vtrn.16         q8,  q9                 // row only: first transpose step, issued as soon as q8/q9 are final
+.endif
+    SUMSUB_AB       q11, q12, q2,  q12
+.endm
+
+function x264_add8x8_idct8_neon             // adds the 8x8 inverse transform of the 64 coefficients at r1 to the 8x8 pixels at r0
+    mov             r2,  #FDEC_STRIDE       // pixel row stride
+    vld1.64         {d16-d19}, [r1,:128]!   // first 48 coefficients into q8-q13; IDCT8_1D row loads the remaining 16
+    vld1.64         {d20-d23}, [r1,:128]!
+    vld1.64         {d24-d27}, [r1,:128]!
+
+    IDCT8_1D row
+    vtrn.16         q10, q11                // transpose of the 16-bit values between the passes (q8/q9 were already handled in the macro)
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vswp            d17, d24                // the remaining half-register swaps are issued inside IDCT8_1D col
+    IDCT8_1D col
+
+    vld1.64         {d0}, [r0,:64], r2      // eight pixel rows into d0-d7, interleaved with the rounding shifts
+    vrshr.s16       q8,  q8,  #6
+    vld1.64         {d1}, [r0,:64], r2
+    vrshr.s16       q9,  q9,  #6
+    vld1.64         {d2}, [r0,:64], r2
+    vrshr.s16       q10, q10, #6
+    vld1.64         {d3}, [r0,:64], r2
+    vrshr.s16       q11, q11, #6
+    vld1.64         {d4}, [r0,:64], r2
+    vrshr.s16       q12, q12, #6
+    vld1.64         {d5}, [r0,:64], r2
+    vrshr.s16       q13, q13, #6
+    vld1.64         {d6}, [r0,:64], r2
+    vrshr.s16       q14, q14, #6
+    vld1.64         {d7}, [r0,:64], r2
+    vrshr.s16       q15, q15, #6
+    sub             r0,  r0,  r2,  lsl #3   // rewind r0 by the eight rows just read
+
+    vaddw.u8        q8,  q8,  d0            // residual + zero-extended pixels
+    vaddw.u8        q9,  q9,  d1
+    vaddw.u8        q10, q10, d2
+    vqmovun.s16     d0,  q8                 // saturate to unsigned 8 bit
+    vqmovun.s16     d1,  q9
+    vqmovun.s16     d2,  q10
+    vaddw.u8        q11, q11, d3
+    vst1.64         {d0}, [r0,:64], r2      // stores are interleaved with the arithmetic for the later rows
+    vaddw.u8        q12, q12, d4
+    vst1.64         {d1}, [r0,:64], r2
+    vaddw.u8        q13, q13, d5
+    vst1.64         {d2}, [r0,:64], r2
+    vqmovun.s16     d3,  q11
+    vqmovun.s16     d4,  q12
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+    vst1.64         {d3}, [r0,:64], r2
+    vqmovun.s16     d5,  q13
+    vst1.64         {d4}, [r0,:64], r2
+    vqmovun.s16     d6,  q14
+    vqmovun.s16     d7,  q15
+    vst1.64         {d5}, [r0,:64], r2
+    vst1.64         {d6}, [r0,:64], r2
+    vst1.64         {d7}, [r0,:64], r2      // r0 ends eight rows below its entry value; x264_add16x16_idct8_neon relies on that
+    bx              lr
+.endfunc
+
+function x264_add16x16_idct8_neon           // 16x16 pixel area handled as four 8x8 calls; r0 = pixels, r1 = coefficients
+    mov             ip,  lr                 // bl clobbers lr; the 8x8 routine does not touch ip
+    bl              x264_add8x8_idct8_neon  // top-left; each call leaves r0 eight rows down and r1 past 64 coefficients
+    sub             r0,  r0,  #8*FDEC_STRIDE-8  // back up eight rows, step eight pixels right: top-right
+    bl              x264_add8x8_idct8_neon
+    sub             r0,  r0,  #8            // eight rows down already, return to the left: bottom-left
+    bl              x264_add8x8_idct8_neon
+    sub             r0,  r0,  #8*FDEC_STRIDE-8  // bottom-right
+    mov             lr,  ip
+    b               x264_add8x8_idct8_neon  // tail call returns to our caller
+.endfunc
+
+
+function x264_add8x8_idct_dc_neon           // adds four DC values (one per 4x4 quadrant) to an 8x8 pixel area; r0 = pixels, r1 = four int16 DCs
+    mov             r2,  #FDEC_STRIDE
+    vld1.64         {d16}, [r1,:64]         // the four DC values
+    vrshr.s16       d16, d16, #6            // rounding shift right by 6
+    vld1.64         {d0}, [r0,:64], r2      // pixel rows 0-7 go to d0-d7, interleaved with the DC set-up
+    vmov.i16        q15, #0
+    vld1.64         {d1}, [r0,:64], r2
+    vld1.64         {d2}, [r0,:64], r2
+    vdup.16         d20, d16[0]             // q10 = dc0 x4, dc1 x4 (upper four rows)
+    vld1.64         {d3}, [r0,:64], r2
+    vdup.16         d21, d16[1]
+    vld1.64         {d4}, [r0,:64], r2
+    vdup.16         d22, d16[2]             // q11 = dc2 x4, dc3 x4 (lower four rows)
+    vld1.64         {d5}, [r0,:64], r2
+    vdup.16         d23, d16[3]
+    vld1.64         {d6}, [r0,:64], r2
+    vsub.s16        q12, q15, q10           // q12 = -q10
+    vld1.64         {d7}, [r0,:64], r2
+    vsub.s16        q13, q15, q11           // q13 = -q11
+
+    sub             r0,  r0,  #8*FDEC_STRIDE    // rewind r0 to the first row
+
+    vqmovun.s16     d20, q10                // unsigned saturation: keeps the DC where it is positive, 0 elsewhere
+    vqmovun.s16     d22, q11
+    vqmovun.s16     d24, q12                // magnitude of the DC where it is negative, 0 elsewhere
+    vqmovun.s16     d26, q13
+
+    vmov            d21, d20                // duplicate so one q register covers two pixel rows
+    vqadd.u8        q0,  q0,  q10           // saturating add of the positive part
+    vmov            d23, d22
+    vqadd.u8        q1,  q1,  q10
+    vmov            d25, d24
+    vqadd.u8        q2,  q2,  q11
+    vmov            d27, d26
+    vqadd.u8        q3,  q3,  q11
+    vqsub.u8        q0,  q0,  q12           // saturating subtract of the negative magnitude
+    vqsub.u8        q1,  q1,  q12
+    vqsub.u8        q2,  q2,  q13
+
+    vst1.64         {d0}, [r0,:64], r2
+    vqsub.u8        q3,  q3,  q13
+    vst1.64         {d1}, [r0,:64], r2
+    vst1.64         {d2}, [r0,:64], r2
+    vst1.64         {d3}, [r0,:64], r2
+    vst1.64         {d4}, [r0,:64], r2
+    vst1.64         {d5}, [r0,:64], r2
+    vst1.64         {d6}, [r0,:64], r2
+    vst1.64         {d7}, [r0,:64], r2
+    bx              lr
+.endfunc
+
+.macro ADD16x4_IDCT_DC dc                   // adds the four DCs in \dc to four 16-pixel rows; r0 = read pointer, r2 = write pointer, r3 = stride, q15 must be zero
+    vld1.64         {d16-d17}, [r0,:128], r3    // four rows of 16 pixels into q8-q11
+    vld1.64         {d18-d19}, [r0,:128], r3
+    vdup.16         d4,  \dc[0]             // q2 = dc0 x4, dc1 x4
+    vdup.16         d5,  \dc[1]
+    vld1.64         {d20-d21}, [r0,:128], r3
+    vdup.16         d6,  \dc[2]             // q3 = dc2 x4, dc3 x4
+    vdup.16         d7,  \dc[3]
+    vld1.64         {d22-d23}, [r0,:128], r3
+    vsub.s16        q12, q15, q2            // negated copies (q15 is zero)
+    vsub.s16        q13, q15, q3
+
+    vqmovun.s16     d4,  q2                 // q2 = positive parts as bytes, one DC per group of four pixels
+    vqmovun.s16     d5,  q3
+    vqmovun.s16     d6,  q12                // q3 = magnitudes of the negative parts
+    vqmovun.s16     d7,  q13
+
+    vqadd.u8        q8,  q8,  q2            // saturating add of the positive part to every row
+    vqadd.u8        q9,  q9,  q2
+    vqadd.u8        q10, q10, q2
+    vqadd.u8        q11, q11, q2
+
+    vqsub.u8        q8,  q8,  q3            // saturating subtract of the negative magnitude
+    vqsub.u8        q9,  q9,  q3
+    vqsub.u8        q10, q10, q3
+    vst1.64         {d16-d17}, [r2,:128], r3    // write back through r2, which trails r0 by four rows
+    vqsub.u8        q11, q11, q3
+    vst1.64         {d18-d19}, [r2,:128], r3
+    vst1.64         {d20-d21}, [r2,:128], r3
+    vst1.64         {d22-d23}, [r2,:128], r3
+.endm
+
+function x264_add16x16_idct_dc_neon         // adds 16 DC values (one per 4x4 block) to a 16x16 pixel area; r0 = pixels, r1 = sixteen int16 DCs
+    mov             r2,  r0                 // r2 = write pointer, r0 stays the read pointer
+    mov             r3,  #FDEC_STRIDE
+    vmov.i16        q15, #0                 // zero constant used by the macro for negation
+
+    vld1.64         {d0-d3}, [r1,:64]       // all sixteen DC values
+    vrshr.s16       q0, #6                  // rounding shift right by 6
+    vrshr.s16       q1, #6
+
+    ADD16x4_IDCT_DC d0                      // rows 0-3 use DCs 0-3
+    ADD16x4_IDCT_DC d1                      // rows 4-7
+    ADD16x4_IDCT_DC d2                      // rows 8-11
+    ADD16x4_IDCT_DC d3                      // rows 12-15
+    bx              lr
+.endfunc
+
+function x264_sub8x8_dct_dc_neon            // writes the sum of (pix1 - pix2) over each 4x4 quadrant of an 8x8 area; r0 = four int16 out, r1 = pix1, r2 = pix2
+    mov             r3,  #FENC_STRIDE       // stride of pix1
+    mov             ip,  #FDEC_STRIDE       // stride of pix2
+    vld1.64         {d16}, [r1,:64], r3
+    vld1.64         {d17}, [r2,:64], ip
+    vsubl.u8        q8,  d16, d17           // row 0 difference widened to 16 bit
+    vld1.64         {d18}, [r1,:64], r3
+    vld1.64         {d19}, [r2,:64], ip
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d20}, [r1,:64], r3
+    vld1.64         {d21}, [r2,:64], ip
+    vsubl.u8        q10, d20, d21
+    vld1.64         {d22}, [r1,:64], r3
+    vadd.s16        q0,  q8,  q9            // q0 accumulates the column sums of rows 0-3
+    vld1.64         {d23}, [r2,:64], ip
+    vsubl.u8        q11, d22, d23
+    vld1.64         {d24}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q10
+    vld1.64         {d25}, [r2,:64], ip
+    vsubl.u8        q12, d24, d25
+    vld1.64         {d26}, [r1,:64], r3
+    vadd.s16        q0,  q0,  q11
+    vld1.64         {d27}, [r2,:64], ip
+    vsubl.u8        q13, d26, d27
+    vld1.64         {d28}, [r1,:64], r3
+    vld1.64         {d29}, [r2,:64], ip
+    vsubl.u8        q14, d28, d29
+    vld1.64         {d30}, [r1,:64], r3
+    vadd.s16        q1,  q12, q13           // q1 accumulates the column sums of rows 4-7
+    vld1.64         {d31}, [r2,:64], ip
+    vpadd.s16       d0,  d0,  d1            // pairwise add: eight column sums of the upper half become four
+    vadd.s16        q1,  q1,  q14
+    vsubl.u8        q15, d30, d31
+    vadd.s16        q1,  q1,  q15
+    vpadd.s16       d2,  d2,  d3            // same for the lower half
+    vpadd.s16       d0,  d0,  d2            // d0 = upper-left, upper-right, lower-left, lower-right quadrant sums
+    vst1.64         {d0}, [r0,:64]
+    bx              lr
+.endfunc
+
+
+function x264_zigzag_scan_4x4_frame_neon    // reorders 16 int16 coefficients with a byte-permute table; r0 = output, r1 = input
+    movrel      r2, scan4x4_frame           // table of byte indices, defined outside this chunk
+    vld1.64     {d0-d3},   [r1,:128]        // the 32 input bytes
+    vld1.64     {d16-d19}, [r2,:128]        // four 8-byte index vectors
+    vtbl.8      d4, {d0-d1}, d16            // each output d register picks bytes from a window of the input
+    vtbl.8      d5, {d1-d3}, d17            // the windows differ, so the indices in the table are relative to each window
+    vtbl.8      d6, {d0-d2}, d18
+    vtbl.8      d7, {d2-d3}, d19
+    vst1.64     {d4-d7},   [r0,:128]
+    bx          lr
+.endfunc
diff --git a/common/arm/dct.h b/common/arm/dct.h
new file mode 100644
index 0000000..55f53ce
--- /dev/null
+++ b/common/arm/dct.h
@@ -0,0 +1,49 @@
+/*****************************************************************************
+ * dct.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_DCT_H    // include guard; this header only declares the NEON transform routines written in assembly
+#define X264_ARM_DCT_H
+
+void x264_dct4x4dc_neon( int16_t d[16] );    // operates in place on 16 values (implementation not shown in this chunk)
+void x264_idct4x4dc_neon( int16_t d[16] );   // presumably the inverse of the above - confirm against dct-a.S
+
+void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );    // sub*: output is dct, inputs are two pixel blocks; presumably transform of pix1 - pix2
+void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );    // add*: inverse transform of dct is added to the pixels at p_dst
+void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );    // the assembly uses FDEC_STRIDE as the row stride of p_dst
+void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
+
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );    // DC-only variants: one value per 4x4 block
+void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );    // per-quadrant sums of pix1 - pix2; pix1 is read with FENC_STRIDE, pix2 with FDEC_STRIDE
+
+void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );    // 8x8 transform variants (64 coefficients per block)
+void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+
+void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
+void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
+
+void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );    // level = dct reordered through the scan4x4_frame table
+
+#endif
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
new file mode 100644
index 0000000..f124b55
--- /dev/null
+++ b/common/arm/deblock-a.S
@@ -0,0 +1,283 @@
+/*****************************************************************************
+ * deblock.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.macro h264_loop_filter_start               // loads the four tc0 bytes into d24[0] and returns from the enclosing function when all four are negative
+    ldr             ip,  [sp]               // first stack argument: pointer to the tc0 bytes
+    ldr             ip,  [ip]               // all four tc0 bytes in one word
+    vmov.32         d24[0], ip              // keep them for the filter macros
+    and             ip,  ip,  ip, lsl #16   // fold the word so that bit 31 ends up as the AND of the four sign bits
+    ands            ip,  ip,  ip, lsl #8    // N = every tc0 byte is negative
+    bxmi            lr                      // test N alone: ands leaves V untouched, so lt (N != V) would depend on a stale flag
+.endm
+
+.macro align_push_regs                      // saves d8-d15 below sp at a 16-byte aligned address; ip must survive until align_pop_regs
+    and             ip,  sp,  #15           // misalignment of sp
+    add             ip,  ip,  #32           // ip = bytes to drop for alignment plus the first 32-byte save slot
+    sub             sp,  sp,  ip            // sp is now 16-byte aligned
+    vst1.64         {d12-d15}, [sp,:128]
+    sub             sp,  sp,  #32
+    vst1.64         {d8-d11},  [sp,:128]
+.endm
+
+.macro align_pop_regs                       // undoes align_push_regs; needs ip exactly as that macro left it
+    vld1.64         {d8-d11},  [sp,:128]!   // sp += 32
+    vld1.64         {d12-d15}, [sp,:128], ip    // sp += ip restores the original, possibly unaligned, sp
+.endm
+
+.macro h264_loop_filter_luma                // in: q10/q9/q8 = p2/p1/p0, q0/q1/q2 = q0/q1/q2, d24[0] = four tc0 bytes, r2 = alpha, r3 = beta; out: q4 = new p1, q8 = new p0, q0 = new q0, q5 = new q1; clobbers q4-q7
+    vdup.8          q11, r2         @ alpha
+    vmovl.u8        q12, d24                // widen the tc0 bytes; with the vmovl.u16 and the two vsli below each one ends up replicated over four byte lanes
+    vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
+    vmovl.u16       q12, d24
+    vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
+    vsli.16         q12, q12, #8
+    vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
+    vsli.32         q12, q12, #16
+    vclt.u8         q6,  q6,  q11   @ < alpha
+    vdup.8          q11, r3         @ beta
+    vclt.s8         q7,  q12, #0            // lanes whose tc0 is negative
+    vclt.u8         q14, q14, q11   @ < beta
+    vclt.u8         q15, q15, q11   @ < beta
+    vbic            q6,  q6,  q7            // q6 collects the filter mask: drop lanes with negative tc0
+    vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
+    vand            q6,  q6,  q14
+    vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
+    vclt.u8         q4,  q4,  q11   @ < beta
+    vand            q6,  q6,  q15           // q6 = complete filter mask
+    vclt.u8         q5,  q5,  q11   @ < beta
+    vand            q4,  q4,  q6            // q4 = lanes where p1 gets modified
+    vand            q5,  q5,  q6            // q5 = lanes where q1 gets modified
+    vand            q12, q12, q6            // tc0 zeroed outside the filter mask
+    vrhadd.u8       q14, q8,  q0            // (p0 + q0 + 1) >> 1
+    vsub.i8         q6,  q12, q4            // q4/q5 lanes are 0 or -1, so the two subtractions add 1 to tc0 per set mask: q6 = clamp bound for the p0/q0 delta
+    vqadd.u8        q7,  q9,  q12           // p1 + tc0, upper bound for the new p1
+    vhadd.u8        q10, q10, q14
+    vsub.i8         q6,  q6,  q5
+    vhadd.u8        q14, q2,  q14
+    vmin.u8         q7,  q7,  q10
+    vqsub.u8        q11, q9,  q12           // p1 - tc0, lower bound for the new p1
+    vqadd.u8        q2,  q1,  q12           // q1 + tc0
+    vmax.u8         q7,  q7,  q11           // q7 = candidate for the new p1
+    vqsub.u8        q11, q1,  q12           // q1 - tc0
+    vmin.u8         q14, q2,  q14
+    vmovl.u8        q2,  d0                 // from here: delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3 in 16 bit, low half in q2 and high half in q10
+    vmax.u8         q14, q14, q11           // q14 = candidate for the new q1
+    vmovl.u8        q10, d1
+    vsubw.u8        q2,  q2,  d16
+    vsubw.u8        q10, q10, d17
+    vshl.i16        q2,  q2,  #2
+    vshl.i16        q10, q10, #2
+    vaddw.u8        q2,  q2,  d18
+    vaddw.u8        q10, q10, d19
+    vsubw.u8        q2,  q2,  d2
+    vsubw.u8        q10, q10, d3
+    vrshrn.i16      d4,  q2,  #3            // rounding narrow back to bytes: q2 = delta
+    vrshrn.i16      d5,  q10, #3
+    vbsl            q4,  q7,  q9            // q4 = new p1 where its mask is set, the old p1 elsewhere
+    vbsl            q5,  q14, q1            // q5 = new q1 likewise
+    vneg.s8         q7,  q6
+    vmovl.u8        q14, d16
+    vmin.s8         q2,  q2,  q6            // clamp delta to [-bound, bound]
+    vmovl.u8        q6,  d17
+    vmax.s8         q2,  q2,  q7
+    vmovl.u8        q11, d0
+    vmovl.u8        q12, d1
+    vaddw.s8        q14, q14, d4            // p0 + delta
+    vaddw.s8        q6,  q6,  d5
+    vsubw.s8        q11, q11, d4            // q0 - delta
+    vsubw.s8        q12, q12, d5
+    vqmovun.s16     d16, q14                // saturate back to bytes: q8 = new p0
+    vqmovun.s16     d17, q6
+    vqmovun.s16     d0,  q11                // q0 = new q0
+    vqmovun.s16     d1,  q12
+.endm
+
+function x264_deblock_v_luma_neon           // filters 16 pixels across a horizontal edge; r0 = first row below the edge, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer
+    h264_loop_filter_start
+
+    vld1.64         {d0, d1},  [r0,:128], r1    // q0, q1, q2 rows (below the edge)
+    vld1.64         {d2, d3},  [r0,:128], r1
+    vld1.64         {d4, d5},  [r0,:128], r1
+    sub             r0,  r0,  r1, lsl #2    // back six rows in total: three rows above the edge
+    sub             r0,  r0,  r1, lsl #1
+    vld1.64         {d20,d21}, [r0,:128], r1    // p2, p1, p0 rows; r0 ends on the edge row again
+    vld1.64         {d18,d19}, [r0,:128], r1
+    vld1.64         {d16,d17}, [r0,:128], r1
+
+    align_push_regs                         // the filter macro clobbers q4-q7 (d8-d15)
+
+    h264_loop_filter_luma
+
+    sub             r0,  r0,  r1, lsl #1    // r0 = p1 row
+    vst1.64         {d8, d9},  [r0,:128], r1    // new p1
+    vst1.64         {d16,d17}, [r0,:128], r1    // new p0
+    vst1.64         {d0, d1},  [r0,:128], r1    // new q0
+    vst1.64         {d10,d11}, [r0,:128]        // new q1
+
+    align_pop_regs
+    bx              lr
+.endfunc
+
+function x264_deblock_h_luma_neon           // filters 16 rows across a vertical edge; r0 = first pixel right of the edge, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer
+    h264_loop_filter_start
+
+    sub             r0,  r0,  #4            // start four pixels left of the edge
+    vld1.64         {d6},  [r0], r1         // 16 rows of 8 pixels; rows 0-7 fill the low halves, rows 8-15 the high halves
+    vld1.64         {d20}, [r0], r1
+    vld1.64         {d18}, [r0], r1
+    vld1.64         {d16}, [r0], r1
+    vld1.64         {d0},  [r0], r1
+    vld1.64         {d2},  [r0], r1
+    vld1.64         {d4},  [r0], r1
+    vld1.64         {d26}, [r0], r1
+    vld1.64         {d7},  [r0], r1
+    vld1.64         {d21}, [r0], r1
+    vld1.64         {d19}, [r0], r1
+    vld1.64         {d17}, [r0], r1
+    vld1.64         {d1},  [r0], r1
+    vld1.64         {d3},  [r0], r1
+    vld1.64         {d5},  [r0], r1
+    vld1.64         {d27}, [r0], r1
+
+    TRANSPOSE8x8    q3, q10, q9, q8, q0, q1, q2, q13    // macro defined outside this chunk; afterwards q10/q9/q8/q0/q1/q2 are used as p2/p1/p0/q0/q1/q2
+
+    align_push_regs                         // the filter macro clobbers q4-q7 (d8-d15)
+
+    h264_loop_filter_luma
+
+    TRANSPOSE4x4    q4, q8, q0, q5          // new p1, p0, q0, q1 back to row order
+
+    sub             r0,  r0,  r1, lsl #4    // rewind the 16 rows read above
+    add             r0,  r0,  #2            // only the four middle pixels are written: two either side of the edge
+    vst1.32         {d8[0]},  [r0], r1
+    vst1.32         {d16[0]}, [r0], r1
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d10[0]}, [r0], r1
+    vst1.32         {d8[1]},  [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0], r1
+    vst1.32         {d10[1]}, [r0], r1
+    vst1.32         {d9[0]},  [r0], r1
+    vst1.32         {d17[0]}, [r0], r1
+    vst1.32         {d1[0]},  [r0], r1
+    vst1.32         {d11[0]}, [r0], r1
+    vst1.32         {d9[1]},  [r0], r1
+    vst1.32         {d17[1]}, [r0], r1
+    vst1.32         {d1[1]},  [r0], r1
+    vst1.32         {d11[1]}, [r0], r1
+
+    align_pop_regs
+    bx              lr
+.endfunc
+
+.macro h264_loop_filter_chroma              // in: d18/d16 = p1/p0, d0/d2 = q0/q1, d24[0] = four tc bytes, r2 = alpha, r3 = beta; out: d16 = new p0, d0 = new q0
+    vdup.8          d22, r2         // alpha
+    vmovl.u8        q12, d24                // widen the tc bytes; the vsli below replicates each over two byte lanes
+    vabd.u8         d26, d16, d0    // abs(p0 - q0)
+    vmovl.u8        q2,  d0                 // q2 accumulates ((q0 - p0) << 2) + p1 - q1 in 16 bit
+    vabd.u8         d28, d18, d16   // abs(p1 - p0)
+    vsubw.u8        q2,  q2,  d16
+    vsli.16         d24, d24, #8
+    vshl.i16        q2,  q2,  #2
+    vabd.u8         d30, d2,  d0    // abs(q1 - q0)
+    vaddw.u8        q2,  q2,  d18
+    vclt.u8         d26, d26, d22   // < alpha
+    vsubw.u8        q2,  q2,  d2
+    vdup.8          d22, r3         // beta
+    vclt.s8         d25, d24, #0            // lanes whose tc is negative
+    vrshrn.i16      d4,  q2,  #3            // d4 = delta, rounded and narrowed to bytes
+    vclt.u8         d28, d28, d22   // < beta
+    vbic            d26, d26, d25           // d26 collects the filter mask
+    vclt.u8         d30, d30, d22   // < beta
+    vand            d26, d26, d28
+    vneg.s8         d25, d24                // d25 = -tc
+    vand            d26, d26, d30
+    vmin.s8         d4,  d4,  d24           // clamp delta to at most tc
+    vmovl.u8        q14, d16
+    vand            d4,  d4,  d26           // zero delta outside the filter mask
+    vmax.s8         d4,  d4,  d25           // clamp delta to at least -tc
+    vmovl.u8        q11, d0
+    vaddw.s8        q14, q14, d4            // p0 + delta
+    vsubw.s8        q11, q11, d4            // q0 - delta
+    vqmovun.s16     d16, q14                // saturate back to bytes
+    vqmovun.s16     d0,  q11
+.endm
+
+function x264_deblock_v_chroma_neon         // filters 8 pixels across a horizontal edge; r0 = first row below the edge, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc pointer
+    h264_loop_filter_start
+
+    sub             r0,  r0,  r1, lsl #1    // two rows above the edge
+    vld1.64         {d18}, [r0,:64], r1     // p1
+    vld1.64         {d16}, [r0,:64], r1     // p0
+    vld1.64         {d0},  [r0,:64], r1     // q0
+    vld1.64         {d2},  [r0,:64]         // q1, no post-increment
+
+    h264_loop_filter_chroma
+
+    sub             r0,  r0,  r1, lsl #1    // back to the p0 row
+    vst1.64         {d16}, [r0,:64], r1     // only p0 and q0 are modified
+    vst1.64         {d0},  [r0,:64], r1
+
+    bx              lr
+.endfunc
+
+function x264_deblock_h_chroma_neon         // filters 8 rows across a vertical edge; r0 = first pixel right of the edge, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc pointer
+    h264_loop_filter_start
+
+    sub             r0,  r0,  #2            // start two pixels left of the edge
+    vld1.32         {d18[]},  [r0], r1      // rows 0-3: four pixels each, replicated to both halves of the register
+    vld1.32         {d16[]},  [r0], r1
+    vld1.32         {d0[]},   [r0], r1
+    vld1.32         {d2[]},   [r0], r1
+    vld1.32         {d18[1]}, [r0], r1      // rows 4-7 overwrite the upper halves
+    vld1.32         {d16[1]}, [r0], r1
+    vld1.32         {d0[1]},  [r0], r1
+    vld1.32         {d2[1]},  [r0], r1
+
+    vtrn.16         d18, d0                 // 4x4 byte transposes: d18/d16/d0/d2 become the p1/p0/q0/q1 columns
+    vtrn.16         d16, d2
+    vtrn.8          d18, d16
+    vtrn.8          d0,  d2
+
+    h264_loop_filter_chroma
+
+    vtrn.16         d18, d0                 // same sequence again restores row order
+    vtrn.16         d16, d2
+    vtrn.8          d18, d16
+    vtrn.8          d0,  d2
+
+    sub             r0,  r0,  r1, lsl #3    // rewind the eight rows read above
+    vst1.32         {d18[0]}, [r0], r1      // write all four pixels of every row back
+    vst1.32         {d16[0]}, [r0], r1
+    vst1.32         {d0[0]},  [r0], r1
+    vst1.32         {d2[0]},  [r0], r1
+    vst1.32         {d18[1]}, [r0], r1
+    vst1.32         {d16[1]}, [r0], r1
+    vst1.32         {d0[1]},  [r0], r1
+    vst1.32         {d2[1]},  [r0], r1
+
+    bx              lr
+.endfunc
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
new file mode 100644
index 0000000..a62af39
--- /dev/null
+++ b/common/arm/mc-a.S
@@ -0,0 +1,1045 @@
+/*****************************************************************************
+ * mc.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *          Mans Rullgard <mans at mansr.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.text
+
+// Note: the prefetch routines assume a 64-byte cache line, which is true for the Cortex-A8.
+// They use no instructions newer than ARMv5TE, although pre-ARMv6 cores are not a target anyway.
+
+// void prefetch_ref( uint8_t *pix, int stride, int parity )
+function x264_prefetch_ref_arm               // issues pld for eight consecutive rows starting 64 bytes past pix
+    sub         r2, r2, #1                  // parity - 1: all ones for parity 0, zero for parity 1
+    add         r0, r0, #64
+    and         r2, r2, r1                  // r2 = stride for parity 0, 0 for parity 1
+    add         r0, r0, r2, lsl #3          // parity 0 starts eight rows further down
+    add         r2, r1, r1, lsl #1          // r2 = 3 * stride
+    pld         [r0]
+    pld         [r0, r1]
+    pld         [r0, r1, lsl #1]
+    add         r3, r0, r1, lsl #2          // r3 = four rows below r0
+    pld         [r0, r2]
+    pld         [r3]
+    pld         [r3, r1]
+    pld         [r3, r1, lsl #1]
+    pld         [r3, r2]
+    bx          lr
+.endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, int stride_y,
+//                     uint8_t *pix_uv, int stride_uv, int mb_x )
+function x264_prefetch_fenc_arm              // issues pld for four luma rows and four chroma rows, 64 bytes ahead, at a row offset derived from mb_x
+    ldr         ip, [sp]                    // mb_x (stack argument, read before the push moves sp)
+    push        {lr}                        // lr is used as a scratch register
+    and         lr, ip, #3
+    smulbb      lr, lr, r1      // note: this assumes stride_y is <= 16 bits signed
+    and         ip, ip, #6
+    smulbb      ip, ip, r3                  // ip = (mb_x & 6) * stride_uv, same 16-bit assumption
+    add         r0, r0, #64
+    add         r2, r2, #64
+    add         r0, r0, lr, lsl #2          // luma start = pix_y + 64 + 4 * (mb_x & 3) * stride_y
+    pld         [r0]
+    add         lr, r0, r1, lsl #1          // two rows further down
+    pld         [r0, r1]
+    pld         [lr]
+    add         r2, r2, ip, lsl #2          // chroma start = pix_uv + 64 + 4 * (mb_x & 6) * stride_uv
+    pld         [lr, r1]
+    pld         [r2]
+    add         ip, r2, r3, lsl #1
+    pld         [r2, r3]
+    pld         [ip]
+    pld         [ip, r3]
+    pop         {pc}                        // return by popping the saved lr into pc
+.endfunc
+
+
+// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
+function x264_memcpy_aligned_neon            // jumps to one of four copy routines chosen from the alignment bits of dst and src
+    orr         r3,  r0,  r1,  lsr #1       // dst bit 3 lands in bit 3, src bit 3 in bit 2. NOTE(review): src bit 4 also lands in bit 3, so an 8-byte variant can be picked for a 16-byte aligned dst - confirm callers cannot trigger this
+    movrel      ip,  memcpy_table
+    and         r3,  r3,  #0xc              // byte offset 0, 4, 8 or 12 into the table
+    ldr         pc,  [ip, r3]               // tail jump: r0-r2 and lr are passed on unchanged
+.endfunc
+
+.macro MEMCPY_ALIGNED srcalign dstalign     // emits memcpy_aligned_<dstalign>_<srcalign>_neon; r0 = dst (left intact as return value), r1 = src, r2 = n, assumed to be a multiple of 16
+function memcpy_aligned_\dstalign\()_\srcalign\()_neon
+    mov         r3, r0                      // write through r3 so that r0 still holds dst on return
+.if \srcalign == 8 && \dstalign == 8
+    sub         r2, #16                     // 8 head bytes and 8 tail bytes are copied outside the main path
+    vld1.64     {d0}, [r1,:64]!             // head: afterwards both pointers are handled as 16-byte aligned
+    vst1.64     {d0}, [r3,:64]!
+    .set r1align, 128
+    .set r3align, 128
+.else
+    .set r1align, \srcalign * 8             // alignment hints are given in bits
+    .set r3align, \dstalign * 8
+.endif
+    tst         r2, #16                     // odd multiple of 16: copy one 16-byte chunk
+    beq         32f
+    sub         r2, #16
+    vld1.64     {d0-d1}, [r1,:r1align]!
+    vst1.64     {d0-d1}, [r3,:r3align]!
+32: // n is a multiple of 32
+    tst         r2, #32                     // odd multiple of 32: copy one 32-byte chunk
+    beq         64f
+    sub         r2, #32
+    vld1.64     {d0-d3}, [r1,:r1align]!
+    vst1.64     {d0-d3}, [r3,:r3align]!
+    b           64f                         // enter the loop at its test: when nothing is left no further bytes may be copied
+640: vld1.64    {d0-d3}, [r1,:r1align]!     // 64 bytes per iteration
+    vld1.64     {d4-d7}, [r1,:r1align]!
+    vst1.64     {d0-d3}, [r3,:r3align]!
+    vst1.64     {d4-d7}, [r3,:r3align]!
+64: subs        r2, #64                     // n is a multiple of 64; a negative result means the copy is complete
+    bge         640b
+.if \srcalign == 8 && \dstalign == 8
+    vld1.64     {d0}, [r1,:64]!             // tail: the last 8 bytes
+    vst1.64     {d0}, [r3,:64]!
+.endif
+    bx          lr
+.endfunc
+.endm
+
+MEMCPY_ALIGNED 16, 16                       // arguments are srcalign, dstalign; the generated name is memcpy_aligned_<dstalign>_<srcalign>_neon
+MEMCPY_ALIGNED 16, 8
+MEMCPY_ALIGNED  8, 16
+MEMCPY_ALIGNED  8, 8
+
+.section .rodata
+memcpy_table:                               // indexed by x264_memcpy_aligned_neon with byte offset 0, 4, 8 or 12
+.word memcpy_aligned_16_16_neon             // offset 0: neither alignment bit set
+.word memcpy_aligned_16_8_neon              // offset 4: src bit set
+.word memcpy_aligned_8_16_neon              // offset 8: dst bit set
+.word memcpy_aligned_8_8_neon               // offset 12: both set
+.text
+
+.ltorg
+
+// void x264_memzero_aligned( void *dst, size_t n )
+function x264_memzero_aligned_neon           // zero-fills in 128-byte steps through 16-byte aligned stores
+    vmov.i8     q0, #0
+    vmov.i8     q1, #0
+memzero_loop:
+    subs        r1, #128                    // the loop is bottom-tested, so at least 128 bytes are always written; assumes n is a non-zero multiple of 128 - TODO confirm against callers
+.rept 4
+    vst1.64     {d0-d3}, [r0,:128]!         // 32 bytes per store, four stores per iteration
+.endr
+    bgt         memzero_loop
+    bx          lr
+.endfunc
+
+
+// void pixel_avg( uint8_t *dst, int dst_stride,
+//                 uint8_t *src1, int src1_stride,
+//                 uint8_t *src2, int src2_stride, int weight );
+.macro AVGH w h                             // emits the entry point x264_pixel_avg_<w>x<h>_neon, which sets up registers and branches to a worker chosen by weight
+function x264_pixel_avg_\w\()x\h\()_neon
+    ldr         ip, [sp, #8]                // weight (third stack argument)
+    push        {r4-r6,lr}                  // popped again by whichever worker finishes the call
+    cmp         ip, #32
+    ldrd        r4, [sp, #16]               // r4 = src2, r5 = src2_stride (the push moved the stack arguments up by 16 bytes)
+    mov         lr, #\h                     // lr = number of rows left, counted down by the workers
+    beq         x264_pixel_avg_w\w\()_neon  // weight == 32: plain rounded average
+    rsbs        r6,  ip,  #64               // r6 = 64 - weight, the weight applied to src2
+    blt         x264_pixel_avg_weight_w\w\()_add_sub_neon     // weight > 64
+    cmp         ip,  #0
+    bge         x264_pixel_avg_weight_w\w\()_add_add_neon
+    b           x264_pixel_avg_weight_w\w\()_sub_add_neon     // weight < 0
+.endfunc
+.endm
+
+AVGH  4, 2                                  // one entry point per supported block size
+AVGH  4, 4
+AVGH  4, 8
+AVGH  8, 4
+AVGH  8, 8
+AVGH  8, 16
+AVGH 16, 8
+AVGH 16, 16
+
+// 0 < weight < 64
+.macro load_weights_add_add                 // d30 = weight for src1 (ip), d31 = weight for src2 (r6 = 64 - weight)
+    vdup.8      d30, ip
+    vdup.8      d31, r6
+.endm
+
+.macro load_add_add d1 d2                   // one row from src1 (r2, stride r3) and one from src2 (r4, stride r5)
+    vld1.32     {\d1}, [r2], r3
+    vld1.32     {\d2}, [r4], r5
+.endm
+
+.macro weight_add_add dst s1 s2             // dst = s1 * d30 + s2 * d31, widened to 16 bit
+    vmull.u8    \dst, \s1, d30
+    vmlal.u8    \dst, \s2, d31
+.endm
+
+// weight > 64
+.macro load_weights_add_sub                 // r6 is negative here; negate it so both byte weights are positive
+    rsb         r6,  #0
+    vdup.8      d30, ip
+    vdup.8      d31, r6
+.endm
+
+.macro load_add_sub d1 d2                   // same loads as load_add_add
+    vld1.32     {\d1}, [r2], r3
+    vld1.32     {\d2}, [r4], r5
+.endm
+
+.macro weight_add_sub dst s1 s2             // dst = s1 * d30 - s2 * d31
+    vmull.u8    \dst, \s1, d30
+    vmlsl.u8    \dst, \s2, d31
+.endm
+
+// weight < 0
+.macro load_weights_sub_add                 // ip is negative here; negate it so both byte weights are positive
+    rsb         ip,  #0
+    vdup.8      d31, r6
+    vdup.8      d30, ip
+.endm
+
+.macro load_sub_add d1 d2                   // same data as the other variants, src2 is loaded first
+    vld1.32     {\d2}, [r4], r5
+    vld1.32     {\d1}, [r2], r3
+.endm
+
+.macro weight_sub_add dst s1 s2             // dst = s2 * d31 - s1 * d30
+    vmull.u8    \dst, \s2, d31
+    vmlsl.u8    \dst, \s1, d30
+.endm
+
+.macro AVG_WEIGHT ext                       // emits the w4/w8/w16 weighted-average workers for one sign variant; entered from AVGH with lr = rows, ip/r6 = weights, r4/r5 = src2 and its stride
+function x264_pixel_avg_weight_w4_\ext\()_neon
+    load_weights_\ext
+1:  // height loop
+    subs            lr,  lr,  #2            // two rows per iteration
+    load_\ext       d0[], d1[]              // four pixels per source, replicated across the register
+    weight_\ext     q8,  d0,  d1
+    load_\ext       d2[], d3[]
+    vqrshrun.s16    d0,  q8,  #6            // rounding shift right by 6 with saturation to unsigned 8 bit
+    weight_\ext     q9,  d2,  d3
+    vst1.32         {d0[0]}, [r0,:32], r1   // only the first four bytes are stored
+    vqrshrun.s16    d1,  q9,  #6
+    vst1.32         {d1[0]}, [r0,:32], r1
+    bgt             1b
+    pop             {r4-r6,pc}              // matches the push in the AVGH entry point
+.endfunc
+
+function x264_pixel_avg_weight_w8_\ext\()_neon
+    load_weights_\ext
+1:  // height loop
+    subs            lr,  lr,  #4            // four rows per iteration
+    load_\ext       d0,  d1
+    weight_\ext     q8,  d0,  d1
+    load_\ext       d2,  d3
+    weight_\ext     q9,  d2,  d3
+    load_\ext       d4,  d5
+    weight_\ext     q10, d4,  d5
+    load_\ext       d6,  d7
+    weight_\ext     q11, d6,  d7
+    vqrshrun.s16    d0,  q8,  #6
+    vqrshrun.s16    d1,  q9,  #6
+    vqrshrun.s16    d2,  q10, #6
+    vqrshrun.s16    d3,  q11, #6
+    vst1.64         {d0}, [r0,:64], r1
+    vst1.64         {d1}, [r0,:64], r1
+    vst1.64         {d2}, [r0,:64], r1
+    vst1.64         {d3}, [r0,:64], r1
+    bgt             1b
+    pop             {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_weight_w16_\ext\()_neon
+    load_weights_\ext
+1:  // height loop
+    subs            lr,  lr,  #2            // two rows per iteration
+    load_\ext       d0-d1, d2-d3            // 16 pixels per source
+    weight_\ext     q8,  d0,  d2            // low and high 8 pixels are weighted separately
+    weight_\ext     q9,  d1,  d3
+    load_\ext       d4-d5, d6-d7
+    weight_\ext     q10, d4,  d6
+    weight_\ext     q11, d5,  d7
+    vqrshrun.s16    d0,  q8,  #6
+    vqrshrun.s16    d1,  q9,  #6
+    vqrshrun.s16    d2,  q10, #6
+    vqrshrun.s16    d3,  q11, #6
+    vst1.64         {d0-d1}, [r0,:128], r1
+    vst1.64         {d2-d3}, [r0,:128], r1
+    bgt             1b
+    pop             {r4-r6,pc}
+.endfunc
+.endm
+
+AVG_WEIGHT add_add                          // 0 < weight < 64
+AVG_WEIGHT add_sub                          // weight > 64
+AVG_WEIGHT sub_add                          // weight < 0
+
+function x264_pixel_avg_w4_neon              // unweighted worker, 4 pixels wide; entered from AVGH with lr = rows, r2/r3 = src1 and stride, r4/r5 = src2 and stride
+    subs        lr,  lr,  #2                // two rows per iteration
+    vld1.32     {d0[]}, [r2], r3
+    vld1.32     {d2[]}, [r4], r5
+    vrhadd.u8   d0,  d0,  d2                // rounded average (a + b + 1) >> 1
+    vld1.32     {d1[]}, [r2], r3
+    vld1.32     {d3[]}, [r4], r5
+    vrhadd.u8   d1,  d1,  d3
+    vst1.32     {d0[0]}, [r0,:32], r1
+    vst1.32     {d1[0]}, [r0,:32], r1
+    bgt         x264_pixel_avg_w4_neon      // the function label doubles as the loop head
+    pop         {r4-r6,pc}                  // matches the push in the AVGH entry point
+.endfunc
+
+function x264_pixel_avg_w8_neon              // unweighted worker, 8 pixels wide; same register contract as x264_pixel_avg_w4_neon
+    subs        lr,  lr,  #4                // four rows per iteration
+    vld1.64     {d0}, [r2], r3
+    vld1.64     {d2}, [r4], r5
+    vrhadd.u8   d0,  d0,  d2                // rounded average (a + b + 1) >> 1
+    vld1.64     {d1}, [r2], r3
+    vld1.64     {d3}, [r4], r5
+    vrhadd.u8   d1,  d1,  d3
+    vst1.64     {d0}, [r0,:64], r1          // stores are interleaved with the loads of the next rows
+    vld1.64     {d2}, [r2], r3
+    vld1.64     {d4}, [r4], r5
+    vrhadd.u8   d2,  d2,  d4
+    vst1.64     {d1}, [r0,:64], r1
+    vld1.64     {d3}, [r2], r3
+    vld1.64     {d5}, [r4], r5
+    vrhadd.u8   d3,  d3,  d5
+    vst1.64     {d2}, [r0,:64], r1
+    vst1.64     {d3}, [r0,:64], r1
+    bgt         x264_pixel_avg_w8_neon      // the function label doubles as the loop head
+    pop         {r4-r6,pc}                  // matches the push in the AVGH entry point
+.endfunc
+
+function x264_pixel_avg_w16_neon             // unweighted worker, 16 pixels wide; same register contract as x264_pixel_avg_w4_neon
+    subs        lr,  lr,  #4                // four rows per iteration
+    vld1.64     {d0-d1}, [r2], r3
+    vld1.64     {d2-d3}, [r4], r5
+    vrhadd.u8   q0,  q0,  q1                // rounded average (a + b + 1) >> 1
+    vld1.64     {d2-d3}, [r2], r3
+    vld1.64     {d4-d5}, [r4], r5
+    vrhadd.u8   q1,  q1,  q2
+    vst1.64     {d0-d1}, [r0,:128], r1      // q0 is free again after this store and is reused for the last src2 row
+    vld1.64     {d4-d5}, [r2], r3
+    vld1.64     {d6-d7}, [r4], r5
+    vrhadd.u8   q2,  q2,  q3
+    vst1.64     {d2-d3}, [r0,:128], r1
+    vld1.64     {d6-d7}, [r2], r3
+    vld1.64     {d0-d1}, [r4], r5
+    vrhadd.u8   q3,  q3,  q0
+    vst1.64     {d4-d5}, [r0,:128], r1
+    vst1.64     {d6-d7}, [r0,:128], r1
+    bgt         x264_pixel_avg_w16_neon     // the function label doubles as the loop head
+    pop         {r4-r6,pc}                  // matches the push in the AVGH entry point
+.endfunc
+
+
+function x264_pixel_avg2_w4_neon             // rounded average of two sources sharing one stride, 4 pixels wide; r0/r1 = dst and stride, r2 = src1, r3 = source stride, stack = src2, height
+    ldr         ip,  [sp, #4]               // height (second stack argument)
+    push        {lr}                        // lr is reused as the src2 pointer
+    ldr         lr,  [sp, #4]               // src2 (first stack argument, shifted up by the push)
+avg2_w4_loop:
+    subs        ip,  ip,  #2                // two rows per iteration
+    vld1.32     {d0[]},  [r2], r3
+    vld1.32     {d2[]},  [lr], r3
+    vrhadd.u8   d0,  d0,  d2                // (a + b + 1) >> 1
+    vld1.32     {d1[]},  [r2], r3
+    vld1.32     {d3[]},  [lr], r3
+    vrhadd.u8   d1,  d1,  d3
+    vst1.32     {d0[0]}, [r0,:32], r1
+    vst1.32     {d1[0]}, [r0,:32], r1
+    bgt         avg2_w4_loop
+    pop         {pc}
+.endfunc
+
+function x264_pixel_avg2_w8_neon             // as x264_pixel_avg2_w4_neon, 8 pixels wide
+    ldr         ip,  [sp, #4]               // height (second stack argument)
+    push        {lr}                        // lr is reused as the src2 pointer
+    ldr         lr,  [sp, #4]               // src2 (first stack argument, shifted up by the push)
+avg2_w8_loop:
+    subs        ip,  ip,  #2                // two rows per iteration
+    vld1.64     {d0}, [r2], r3
+    vld1.64     {d2}, [lr], r3
+    vrhadd.u8   d0,  d0,  d2                // (a + b + 1) >> 1
+    vld1.64     {d1}, [r2], r3
+    vld1.64     {d3}, [lr], r3
+    vrhadd.u8   d1,  d1,  d3
+    vst1.64     {d0}, [r0,:64], r1
+    vst1.64     {d1}, [r0,:64], r1
+    bgt         avg2_w8_loop
+    pop         {pc}
+.endfunc
+
+function x264_pixel_avg2_w16_neon       // 16-byte rows: [r0] = rounding average of two sources that share the stride r3
+    ldr         ip,  [sp, #4]           // ip = 6th argument, used as the row counter
+    push        {lr}
+    ldr         lr,  [sp, #4]           // lr = 5th argument, the second source pointer (offset is +4 because of the push)
+avg2_w16_loop:
+    subs        ip,  ip,  #2            // two rows per iteration
+    vld1.64     {d0-d1}, [r2], r3       // 16 bytes of the first source
+    vld1.64     {d2-d3}, [lr], r3       // 16 bytes of the second source
+    vrhadd.u8   q0,  q0,  q1            // per byte: (a + b + 1) >> 1
+    vld1.64     {d4-d5}, [r2], r3
+    vld1.64     {d6-d7}, [lr], r3
+    vrhadd.u8   q2,  q2,  q3
+    vst1.64     {d0-d1}, [r0,:128], r1  // write 16 bytes (16-byte aligned destination), then r0 += r1
+    vst1.64     {d4-d5}, [r0,:128], r1
+    bgt         avg2_w16_loop
+    pop         {pc}                    // return through the saved lr
+.endfunc
+
+function x264_pixel_avg2_w20_neon       // 20-byte rows: [r0] = rounding average of two sources that share the stride r3
+    ldr         ip,  [sp, #4]           // ip = 6th argument, used as the row counter
+    push        {lr}
+    sub         r1,  r1,  #16           // each row is stored as 16 bytes with writeback plus 4 bytes, so the post-increment is stride - 16
+    ldr         lr,  [sp, #4]           // lr = 5th argument, the second source pointer (offset is +4 because of the push)
+avg2_w20_loop:
+    subs        ip,  ip,  #2            // two rows per iteration
+    vld1.64     {d0-d2},  [r2], r3      // 24 bytes are read per row; only the first 20 are used
+    vld1.64     {d4-d6},  [lr], r3
+    vrhadd.u8   q0,  q0,  q2            // bytes 0-15: (a + b + 1) >> 1
+    vrhadd.u8   d2,  d2,  d6            // bytes 16-23
+    vld1.64     {d4-d6},  [r2], r3      // second row
+    vld1.64     {d16-d18},[lr], r3
+    vrhadd.u8   q2,  q2,  q8
+    vst1.64     {d0-d1},  [r0,:128]!    // bytes 0-15 of the first row, r0 += 16
+    vrhadd.u8   d6,  d6,  d18
+    vst1.32     {d2[0]},  [r0,:32], r1  // bytes 16-19, then r0 moves to the start of the next row
+    vst1.64     {d4-d5},  [r0,:128]!
+    vst1.32     {d6[0]},  [r0,:32], r1
+    bgt         avg2_w20_loop
+    pop         {pc}                    // return through the saved lr
+.endfunc
+
+
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon           // copies 4-byte-wide rows, four rows per iteration
+    ldr         ip,  [sp]               // ip = height (5th argument, first one on the stack)
+copy_w4_loop:
+    subs        ip,  ip,  #4            // four rows per iteration
+    vld1.32     {d0[]},  [r2], r3       // read 4 bytes, then src += src_stride
+    vld1.32     {d1[]},  [r2], r3
+    vld1.32     {d2[]},  [r2], r3
+    vld1.32     {d3[]},  [r2], r3
+    vst1.32     {d0[0]}, [r0,:32], r1   // write 4 bytes (4-byte aligned destination), then dst += dst_stride
+    vst1.32     {d1[0]}, [r0,:32], r1
+    vst1.32     {d2[0]}, [r0,:32], r1
+    vst1.32     {d3[0]}, [r0,:32], r1
+    bgt         copy_w4_loop
+    bx          lr
+.endfunc
+
+function x264_mc_copy_w8_neon           // copies 8-byte-wide rows from [r2] (stride r3) to [r0] (stride r1)
+    ldr         ip,  [sp]               // ip = first stack argument, used as the row counter
+copy_w8_loop:
+    subs        ip,  ip,  #4            // four rows per iteration
+    vld1.32     {d0}, [r2], r3          // read 8 bytes, then r2 += r3
+    vld1.32     {d1}, [r2], r3
+    vld1.32     {d2}, [r2], r3
+    vld1.32     {d3}, [r2], r3
+    vst1.32     {d0}, [r0,:64], r1      // write 8 bytes (8-byte aligned destination), then r0 += r1
+    vst1.32     {d1}, [r0,:64], r1
+    vst1.32     {d2}, [r0,:64], r1
+    vst1.32     {d3}, [r0,:64], r1
+    bgt         copy_w8_loop
+    bx          lr
+.endfunc
+
+function x264_mc_copy_w16_neon          // copies 16-byte-wide rows; the source needs no particular alignment
+    ldr         ip,  [sp]               // ip = first stack argument, used as the row counter
+copy_w16_loop:
+    subs        ip,  ip,  #4            // four rows per iteration
+    vld1.32     {d0-d1}, [r2], r3       // read 16 bytes, then r2 += r3
+    vld1.32     {d2-d3}, [r2], r3
+    vld1.32     {d4-d5}, [r2], r3
+    vld1.32     {d6-d7}, [r2], r3
+    vst1.32     {d0-d1}, [r0,:128], r1  // write 16 bytes (16-byte aligned destination), then r0 += r1
+    vst1.32     {d2-d3}, [r0,:128], r1
+    vst1.32     {d4-d5}, [r0,:128], r1
+    vst1.32     {d6-d7}, [r0,:128], r1
+    bgt         copy_w16_loop
+    bx          lr
+.endfunc
+
+function x264_mc_copy_w16_aligned_neon  // as x264_mc_copy_w16_neon, but the loads also carry a 16-byte alignment hint
+    ldr         ip,  [sp]               // ip = first stack argument, used as the row counter
+copy_w16_aligned_loop:
+    subs        ip,  ip,  #4            // four rows per iteration
+    vld1.32     {d0-d1}, [r2,:128], r3  // source rows must be 16-byte aligned
+    vld1.32     {d2-d3}, [r2,:128], r3
+    vld1.32     {d4-d5}, [r2,:128], r3
+    vld1.32     {d6-d7}, [r2,:128], r3
+    vst1.32     {d0-d1}, [r0,:128], r1  // destination rows must be 16-byte aligned as well
+    vst1.32     {d2-d3}, [r0,:128], r1
+    vst1.32     {d4-d5}, [r0,:128], r1
+    vst1.32     {d6-d7}, [r0,:128], r1
+    bgt         copy_w16_aligned_loop
+    bx          lr
+.endfunc
+
+
+// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
+//                           uint8_t *src, int i_src_stride,
+//                           int dx, int dy, int i_width, int i_height );
+function x264_mc_chroma_neon
+    push            {r4-r6, lr}
+    ldrd            r4,  [sp, #16]          // r4 = dx, r5 = dy (first stack arguments, above the 16 bytes just pushed)
+    ldr             r6,  [sp, #24]          // r6 = i_width
+
+    asr             lr,  r5,  #3
+    mul             lr,  r3,  lr            // lr = (dy >> 3) * i_src_stride
+    add             r2,  r2,  r4,  asr #3   // src += dx >> 3
+    cmp             r6, #4                  // flags for the width dispatch below; nothing in between changes them
+    add             r2,  r2,  lr            // src += whole-row part of dy
+
+    and             r4, r4, #7              // keep only the fractional (1/8 sample) parts
+    and             r5, r5, #7
+    pld             [r2]
+    pld             [r2, r3]
+
+    bgt             mc_chroma_w8            // width > 4
+    beq             mc_chroma_w4            // width == 4; otherwise fall through into mc_chroma_w2 (first CHROMA_MC instantiation below)
+
+// calculate cA cB cC cD
+.macro CHROMA_MC_START r0 r1
+    muls            lr,  r4,  r5            // lr = cD = dx*dy; also sets Z for the beq below
+    rsb             r6,  lr,  r5,  lsl #3   // r6 = cC = 8*dy - dx*dy = (8-dx)*dy
+    rsb             ip,  lr,  r4,  lsl #3   // ip = cB = 8*dx - dx*dy = dx*(8-dy)
+    sub             r4,  lr,  r4,  lsl #3
+    sub             r4,  r4,  r5,  lsl #3
+    add             r4,  r4,  #64           // r4 = cA = 64 - 8*dx - 8*dy + dx*dy = (8-dx)*(8-dy)
+
+    beq             2f                      // dx*dy == 0: only one direction needs interpolating
+
+    add             r5,  r2,  r3            // r5 = pointer to the second source row
+
+    vdup.8          d0,  r4
+    lsl             r3,  r3,  #1            // stride *= 2: r2 and r5 each step two rows at a time
+    vdup.8          d1,  ip
+    vld1.64         {\r0}, [r2], r3
+    vdup.8          d2,  r6
+    vld1.64         {\r1}, [r5], r3
+    vdup.8          d3,  lr
+    ldr             r4,  [sp, #28]          // r4 = i_height, the row counter from here on
+
+    vext.8          d5,  d4,  d5,  #1       // d5 = bytes 1-7 of d4 followed by byte 0 of d5: the x+1 neighbours
+    vext.8          d7,  d6,  d7,  #1
+.endm
+
+.macro CHROMA_MC width, align
+mc_chroma_w\width:
+    CHROMA_MC_START d4,  d6
+// since the element size varies, there's a different index for the 2nd store
+.if \width == 4
+    .set st2, 1
+.else
+    .set st2, 2
+.endif
+
+    vtrn.32         d4,  d5                 // d4 = {px[0..3], px[1..4]} of the first row
+    vtrn.32         d6,  d7                 // same packing for the second row
+
+    vtrn.32         d0,  d1                 // d0 = {cA x4, cB x4}
+    vtrn.32         d2,  d3                 // d2 = {cC x4, cD x4}
+
+1:  // height loop, interpolate xy
+    pld             [r5]
+    vmull.u8        q8,  d4,  d0            // upper row: low half px*cA, high half px[x+1]*cB
+    vmlal.u8        q8,  d6,  d2            //   + lower row: px*cC / px[x+1]*cD
+    vld1.64         {d4},     [r2], r3      // load the row two below the one just used
+    vext.8          d5,  d4,  d5,  #1
+    vtrn.32         d4,  d5
+    vmull.u8        q9,  d6,  d0            // second output row: old lower row is now the upper row
+    vmlal.u8        q9,  d4,  d2
+    vld1.64         {d6},     [r5], r3
+    vadd.i16        d16, d16, d17           // add the x and x+1 halves: full sum for the first output row
+    vadd.i16        d17, d18, d19           // full sum for the second output row
+    vrshrn.u16      d16, q8,  #6            // (sum + 32) >> 6 narrowed to bytes: first row in bytes 0-3, second row in bytes 4-7
+    subs            r4,  r4,  #2            // two rows per iteration
+    pld             [r2]
+    vext.8          d7,  d6,  d7,  #1
+    vtrn.32         d6,  d7
+    vst1.\align     {d16[0]},   [r0,:\align], r1
+    vst1.\align     {d16[st2]}, [r0,:\align], r1   // element st2 starts at byte 4, where the second row lives
+    bgt             1b
+
+    pop             {r4-r6, pc}
+
+2:  // dx or dy are 0
+    tst             r6,  r6                 // Z set when cC == 0, i.e. dy == 0; tested by the beq below
+    add             ip,  ip,  r6            // cB + cC: at most one of them is non-zero on this path
+    vdup.8          d0,  r4
+    vdup.8          d1,  ip
+    vtrn.32         d0,  d1                 // d0 = d1 = {cA x4, (cB+cC) x4}
+    ldr             r4,  [sp, #28]          // r4 = i_height
+
+    beq             4f
+
+    vext.32         d1,  d0,  d1,  #1       // d1 = {(cB+cC) x4, cA x4}: weights in swapped order for the second output row
+    add             r5,  r2,  r3
+    lsl             r3,  r3,  #1            // r2 and r5 each step two rows at a time
+    vld1.32         {d4[0]},  [r2], r3      // d4 = {row n, row n+1}
+    vld1.32         {d4[1]},  [r5], r3
+
+3:  // vertical interpolation loop
+    pld             [r5]
+    vmull.u8        q8,  d4,  d0            // row n * cA (low half), row n+1 * (cB+cC) (high half)
+    vld1.32         {d4[0]},  [r2], r3      // replace row n with row n+2
+    vmull.u8        q9,  d4,  d1            // row n+2 * (cB+cC) (low half), row n+1 * cA (high half)
+    vld1.32         {d4[1]},  [r5], r3
+    vadd.i16        d16, d16, d17
+    vadd.i16        d17, d18, d19
+    vrshrn.u16      d16, q8,  #6            // (sum + 32) >> 6, narrowed to bytes
+    subs            r4,  r4,  #2
+    pld             [r2]
+    vst1.\align     {d16[0]},   [r0,:\align], r1
+    vst1.\align     {d16[st2]}, [r0,:\align], r1
+    bgt             3b
+
+    pop             {r4-r6, pc}
+
+4:  // dy is 0
+    vld1.64         {d4},     [r2], r3      // r3 is still the single-row stride on this path
+    vld1.64         {d6},     [r2], r3
+    vext.8          d5,  d4,  d5,  #1
+    vext.8          d7,  d6,  d7,  #1
+    vtrn.32         d4,  d5                 // d4 = {px[0..3], px[1..4]}
+    vtrn.32         d6,  d7
+
+5:  // horizontal interpolation loop
+    vmull.u8        q8,  d4,  d0            // px * cA (low half), px[x+1] * (cB+cC) (high half)
+    vmull.u8        q9,  d6,  d0
+    subs            r4,  r4,  #2
+    vld1.64         {d4},     [r2], r3
+    vext.8          d5,  d4,  d5,  #1
+    vtrn.32         d4,  d5
+    vadd.i16        d16, d16, d17
+    vadd.i16        d17, d18, d19
+    pld             [r2]
+    vrshrn.u16      d16, q8,  #6
+    vld1.64         {d6},     [r2], r3
+    vext.8          d7,  d6,  d7,  #1
+    vtrn.32         d6,  d7
+    pld             [r2]
+    vst1.\align     {d16[0]},   [r0,:\align], r1
+    vst1.\align     {d16[st2]}, [r0,:\align], r1
+    bgt             5b
+
+    pop             {r4-r6, pc}
+.endm
+
+    CHROMA_MC 2, 16
+    CHROMA_MC 4, 32
+
+// the optimal timing for width 8 is different enough that it's not
+// readable to put it in the same macro as width 2/4
+mc_chroma_w8:
+    CHROMA_MC_START d4-d5, d6-d7            // 16 bytes per row are loaded so d5/d7 hold a complete x+1 row after the vext
+
+1:  // height loop, interpolate xy
+    pld             [r5]
+    vmull.u8        q8,  d4,  d0            // first output row: upper px * cA
+    vmlal.u8        q8,  d5,  d1            //   + upper px[x+1] * cB
+    vld1.64         {d4, d5}, [r2], r3      // next upper row; the old d4/d5 have already been consumed
+    vmlal.u8        q8,  d6,  d2            //   + lower px * cC
+    vext.8          d5,  d4,  d5,  #1
+    vmlal.u8        q8,  d7,  d3            //   + lower px[x+1] * cD
+    vmull.u8        q9,  d6,  d0            // second output row: old lower row is now the upper row
+    subs            r4,  r4,  #2            // two rows per iteration
+    vmlal.u8        q9,  d7,  d1
+    vmlal.u8        q9,  d4,  d2
+    vmlal.u8        q9,  d5,  d3
+    vrshrn.u16      d16, q8,  #6            // (sum + 32) >> 6, narrowed to bytes
+    vld1.64         {d6, d7}, [r5], r3
+    pld             [r2]
+    vrshrn.u16      d17, q9,  #6
+    vext.8          d7,  d6,  d7,  #1
+    vst1.64         {d16}, [r0,:64], r1
+    vst1.64         {d17}, [r0,:64], r1
+    bgt             1b
+
+    pop             {r4-r6, pc}
+
+2:  // dx or dy are 0
+    tst             r6,  r6                 // Z set when cC == 0, i.e. dy == 0; tested by the beq below
+    add             ip,  ip,  r6            // cB + cC: at most one of them is non-zero on this path
+    vdup.8          d0,  r4                 // d0 = cA in every byte
+    vdup.8          d1,  ip                 // d1 = cB + cC in every byte
+    ldr             r4,  [sp, #28]          // r4 = i_height
+
+    beq             4f
+
+    add             r5,  r2,  r3
+    lsl             r3,  r3,  #1            // r2 and r5 each step two rows at a time
+    vld1.64         {d4}, [r2], r3
+    vld1.64         {d6}, [r5], r3
+
+3:  // vertical interpolation loop
+    pld             [r5]
+    vmull.u8        q8,  d4,  d0            // upper row * cA
+    vmlal.u8        q8,  d6,  d1            //   + lower row * (cB+cC)
+    vld1.64         {d4}, [r2], r3
+    vmull.u8        q9,  d6,  d0
+    vmlal.u8        q9,  d4,  d1
+    vld1.64         {d6}, [r5], r3
+    vrshrn.u16      d16, q8,  #6
+    vrshrn.u16      d17, q9,  #6
+    subs            r4,  r4,  #2
+    pld             [r2]
+    vst1.64         {d16}, [r0,:64], r1
+    vst1.64         {d17}, [r0,:64], r1
+    bgt             3b
+
+    pop             {r4-r6, pc}
+
+4:  // dy is 0
+    vld1.64         {d4, d5}, [r2], r3      // r3 is still the single-row stride on this path
+    vld1.64         {d6, d7}, [r2], r3
+    vext.8          d5,  d4,  d5,  #1
+    vext.8          d7,  d6,  d7,  #1
+
+5:  // horizontal interpolation loop
+    pld             [r2]
+    subs            r4,  r4,  #2
+    vmull.u8        q8,  d4,  d0            // px * cA
+    vmlal.u8        q8,  d5,  d1            //   + px[x+1] * (cB+cC)
+    vld1.64         {d4,  d5}, [r2], r3
+    vmull.u8        q9,  d6,  d0
+    vmlal.u8        q9,  d7,  d1
+    pld             [r2]
+    vext.8          d5,  d4,  d5,  #1
+    vrshrn.u16      d16, q8,  #6
+    vrshrn.u16      d17, q9,  #6
+    vld1.64         {d6, d7}, [r2], r3
+    vext.8          d7,  d6,  d7,  #1
+    vst1.64         {d16}, [r0,:64], r1
+    vst1.64         {d17}, [r0,:64], r1
+    bgt             5b
+
+    pop             {r4-r6, pc}
+.endfunc
+
+
+// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
+function x264_hpel_filter_v_neon
+    ldr             ip,  [sp]               // ip = width (5th argument)
+    sub             r1,  r1,  r3,  lsl #1   // src -= 2*stride: the six-row window starts two rows up
+    push            {lr}
+    add             lr,  r1,  ip            // lr = window start + width, used to rebuild r1 for each column block
+    vmov.u8         d30, #5                 // filter taps 5 and 20
+    vmov.u8         d31, #20
+
+filter_v_loop:
+    subs            ip,  ip,  #16           // 16 columns per iteration; flags are tested by the bgt at the end
+    vld1.64         {d0-d1},   [r1,:128], r3    // rows -2 .. +3 of this 16-column block
+    vld1.64         {d2-d3},   [r1,:128], r3
+    vld1.64         {d4-d5},   [r1,:128], r3
+    vld1.64         {d6-d7},   [r1,:128], r3
+    vld1.64         {d16-d17}, [r1,:128], r3
+    vld1.64         {d18-d19}, [r1,:128], r3
+    sub             r1,  lr,  ip            // back to the top row, 16 columns further right
+
+    vaddl.u8        q10, d0,  d18           // columns 0-7: row[-2] + row[3]
+    vmlsl.u8        q10, d2,  d30           //   - 5*row[-1]
+    vmlal.u8        q10, d4,  d31           //   + 20*row[0]
+    vmlal.u8        q10, d6,  d31           //   + 20*row[1]
+    vmlsl.u8        q10, d16, d30           //   - 5*row[2]
+
+    vaddl.u8        q11, d1,  d19           // columns 8-15, same taps
+    vmlsl.u8        q11, d3,  d30
+    vmlal.u8        q11, d5,  d31
+    vmlal.u8        q11, d7,  d31
+    vmlsl.u8        q11, d17, d30
+
+    vqrshrun.s16    d0,  q10, #5            // (sum + 16) >> 5, saturated to an unsigned byte
+    vst1.64         {d20-d21}, [r2,:128]!   // the unscaled 16-bit sums go to buf
+    vqrshrun.s16    d1,  q11, #5
+    vst1.64         {d22-d23}, [r2,:128]!
+    vst1.64         {d0-d1},   [r0,:128]!   // the filtered bytes go to dst
+    bgt             filter_v_loop
+    pop             {pc}
+.endfunc
+
+// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+function x264_hpel_filter_c_neon
+    sub             r1,  #16                // step back 16 bytes (8 int16 entries) so the left neighbours are loaded too
+    vld1.64         {d0-d3}, [r1,:128]!     // q0 = buf[-8..-1], q1 = buf[0..7]
+
+    // unrolled 2x: 4% faster
+filter_c_loop:
+    subs            r2,  r2,  #16           // 16 outputs per half; flags are tested by the bxle below
+    vld1.64         {d4-d7}, [r1,:128]!     // q2, q3 = the next 16 entries
+    vext.16         q8,  q0,  q1,  #6       // entries x-2
+    vext.16         q12, q1,  q2,  #3       // entries x+3
+    vadd.s16        q8,  q8,  q12           // a = buf[x-2] + buf[x+3]
+    vext.16         q9,  q0,  q1,  #7       // entries x-1
+    vext.16         q11, q1,  q2,  #2       // entries x+2
+    vadd.s16        q9,  q9,  q11           // b = buf[x-1] + buf[x+2]
+    vext.16         q10, q1,  q2,  #1       // entries x+1
+    vext.16         q11, q1,  q2,  #6       // start of the same computation for outputs 8-15
+    vadd.s16        q10, q1,  q10           // c = buf[x] + buf[x+1]
+    vsub.s16        q8,  q8,  q9    // a-b
+    vext.16         q15, q2,  q3,  #3
+    vsub.s16        q9,  q9,  q10   // b-c
+
+    vext.16         q12, q1,  q2,  #7
+    vshr.s16        q8,  q8,  #2    // (a-b)/4
+    vadd.s16        q11, q11, q15
+    vext.16         q14, q2,  q3,  #2
+    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
+    vadd.s16        q12, q12, q14
+    vext.16         q13, q2,  q3,  #1
+
+    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
+    vadd.s16        q13, q2,  q13
+    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vsub.s16        q11, q11, q12   // a-b
+    vsub.s16        q12, q12, q13   // b-c
+    vshr.s16        q11, q11, #2    // (a-b)/4
+    vqrshrun.s16    d30, q8,  #6            // (value + 32) >> 6, saturated to a byte: outputs 0-7
+    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
+    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
+    vld1.64         {d0-d3}, [r1,:128]!     // q0, q1 = the following 16 entries, for the second half
+    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+    vext.16         q8,  q2,  q3,  #6       // second half already starts here, with q2/q3/q0 in the roles of q0/q1/q2
+    vqrshrun.s16    d31, q11,  #6           // outputs 8-15
+    vext.16         q12, q3,  q0,  #3
+    vadd.s16        q8,  q8,  q12
+    vext.16         q9,  q2,  q3,  #7
+    vst1.64         {d30-d31}, [r0,:128]!   // 16 output bytes
+    bxle            lr                      // return if the subs at the loop top left nothing more to do
+    subs            r2,  r2,  #16           // flags are tested by the bgt at the end
+
+    vext.16         q11, q3,  q0,  #2
+    vadd.s16        q9,  q9,  q11
+    vext.16         q10, q3,  q0,  #1
+    vext.16         q11, q3,  q0,  #6
+    vadd.s16        q10, q3,  q10
+    vsub.s16        q8,  q8,  q9    // a-b
+    vext.16         q15, q0,  q1,  #3
+    vsub.s16        q9,  q9,  q10   // b-c
+
+    vext.16         q12, q3,  q0,  #7
+    vshr.s16        q8,  q8,  #2    // (a-b)/4
+    vadd.s16        q11, q11, q15
+    vext.16         q14, q0,  q1,  #2
+    vsub.s16        q8,  q8,  q9    // (a-b)/4-b+c
+    vadd.s16        q12, q12, q14
+    vext.16         q13, q0,  q1,  #1
+
+    vshr.s16        q8,  q8,  #2    // ((a-b)/4-b+c)/4
+    vadd.s16        q13, q0,  q13
+    vadd.s16        q8,  q8,  q10   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vsub.s16        q11, q11, q12   // a-b
+    vsub.s16        q12, q12, q13   // b-c
+    vshr.s16        q11, q11, #2    // (a-b)/4
+    vqrshrun.s16    d30, q8,  #6
+    vsub.s16        q11, q11, q12   // (a-b)/4-b+c
+    vshr.s16        q11, q11, #2    // ((a-b)/4-b+c)/4
+    vadd.s16        q11, q11, q13   // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+    vqrshrun.s16    d31, q11,  #6
+    vst1.64         {d30-d31}, [r0,:128]!   // 16 more output bytes
+    bgt             filter_c_loop
+    bx              lr
+.endfunc
+
+// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+function x264_hpel_filter_h_neon
+    sub             r1,  #16                // step back 16 bytes so the left neighbours are loaded too
+    vmov.u8         d30, #5                 // filter taps 5 and 20
+    vld1.64         {d0-d3}, [r1,:128]!     // q0 = src[-16..-1], q1 = src[0..15]
+    vmov.u8         d31, #20
+
+    // unrolled 3x because it's 5% faster, due to mitigating
+    // the high latency of multiplication and vqrshrun
+filter_h_loop:
+    subs            r2,  r2,  #16           // 16 outputs per third; flags are tested by the first bxle below
+    vld1.64         {d4-d5}, [r1,:128]!     // q2 = the next 16 source bytes
+    vext.8          q8,  q0,  q1,  #14      // bytes x-2
+    vext.8          q12, q1,  q2,  #3       // bytes x+3
+    vaddl.u8        q13, d16, d24           // outputs 0-7: src[x-2] + src[x+3]
+    vext.8          q9,  q0,  q1,  #15      // bytes x-1
+    vaddl.u8        q14, d17, d25           // outputs 8-15: src[x-2] + src[x+3]
+
+    vext.8          q10, q1,  q2,  #1       // bytes x+1
+    vmlal.u8        q13, d2,  d31           //   + 20*src[x]
+    vmlsl.u8        q13, d18, d30           //   - 5*src[x-1]
+    vext.8          q11, q1,  q2,  #2       // bytes x+2
+    vmlal.u8        q13, d20, d31           //   + 20*src[x+1]
+    vmlsl.u8        q13, d22, d30           //   - 5*src[x+2]
+
+    vmlsl.u8        q14, d19, d30           // same taps for outputs 8-15
+    vmlal.u8        q14, d3,  d31
+    vmlal.u8        q14, d21, d31
+    vmlsl.u8        q14, d23, d30
+    vqrshrun.s16    d6,  q13, #5            // (sum + 16) >> 5, saturated to an unsigned byte
+
+    vld1.64         {d0-d1}, [r1,:128]!     // q0 = following 16 bytes; the second third already starts with q1/q2/q0
+    vext.8          q8,  q1,  q2,  #14
+    vext.8          q12, q2,  q0,  #3
+    vaddl.u8        q13, d16, d24
+    vqrshrun.s16    d7,  q14, #5
+    vext.8          q9,  q1,  q2,  #15
+    vaddl.u8        q14, d17, d25
+
+    vst1.64         {d6-d7}, [r0,:128]!     // 16 output bytes
+    bxle            lr                      // return if the subs at the loop top left nothing more to do
+    subs            r2,  r2,  #16           // flags are tested by the second bxle below
+
+    vext.8          q10, q2,  q0,  #1
+    vmlal.u8        q13, d4,  d31
+    vmlsl.u8        q13, d18, d30
+    vext.8          q11, q2,  q0,  #2
+    vmlal.u8        q13, d20, d31
+    vmlsl.u8        q13, d22, d30
+
+    vmlsl.u8        q14, d19, d30
+    vmlal.u8        q14, d5,  d31
+    vmlal.u8        q14, d21, d31
+    vmlsl.u8        q14, d23, d30
+    vqrshrun.s16    d6,  q13, #5
+
+    vld1.64         {d2-d3}, [r1,:128]!     // q1 = following 16 bytes; the last third uses q2/q0/q1
+    vext.8          q8,  q2,  q0,  #14
+    vext.8          q12, q0,  q1,  #3
+    vaddl.u8        q13, d16, d24
+    vqrshrun.s16    d7,  q14, #5
+    vext.8          q9,  q2,  q0,  #15
+    vaddl.u8        q14, d17, d25
+
+    vst1.64         {d6-d7}, [r0,:128]!     // 16 more output bytes
+    bxle            lr
+    subs            r2,  r2,  #16           // flags are tested by the bgt at the end
+
+    vext.8          q10, q0,  q1,  #1
+    vmlal.u8        q13, d0,  d31
+    vmlsl.u8        q13, d18, d30
+    vext.8          q11, q0,  q1,  #2
+    vmlal.u8        q13, d20, d31
+    vmlsl.u8        q13, d22, d30
+
+    vmlsl.u8        q14, d19, d30
+    vmlal.u8        q14, d1,  d31
+    vmlal.u8        q14, d21, d31
+    vmlsl.u8        q14, d23, d30
+
+    vqrshrun.s16    d6, q13, #5
+    vqrshrun.s16    d7, q14, #5
+    vst1.64         {d6-d7}, [r0,:128]!     // q0/q1 are back in their loop-top roles for the next iteration
+    bgt             filter_h_loop
+    bx              lr
+.endfunc
+
+
+// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
+//                         uint8_t *dstc, int src_stride, int dst_stride, int width,
+//                         int height )
+function x264_frame_init_lowres_core_neon
+    push            {r4-r10,lr}
+    vpush           {d8-d15}
+    ldrd            r4,  [sp, #96]          // r4 = dstc, r5 = src_stride (stack arguments sit above 32 + 64 bytes of saved registers)
+    ldrd            r6,  [sp, #104]         // r6 = dst_stride, r7 = width
+    ldr             lr,  [sp, #112]         // lr = height, the row counter
+    sub             r10, r6,  r7            // dst_stride - width
+    and             r10, r10, #~15          // rounded down to a multiple of 16; added to each dst pointer after a row
+
+lowres_yloop:
+    mov             ip,  r7                 // width
+    mov             r6,  r0                 // src0
+    add             r8,  r0,  r5            // src1 = src0 + src_stride
+    add             r9,  r0,  r5,  lsl #1   // src2 = src1 + src_stride
+
+    vld2.8          {d8, d10}, [r6,:128]!   // de-interleave 16 bytes: d8 = even-indexed pixels, d10 = odd-indexed pixels
+    vld2.8          {d12,d14}, [r8,:128]!
+    vld2.8          {d16,d18}, [r9,:128]!
+
+lowres_xloop:
+    subs            ip,  ip,  #16           // 16 output pixels per half; flags are tested by the ble below
+
+    vld2.8          {d9, d11}, [r6,:128]!   // q4/q5 = even/odd pixels of src0 for 16 outputs
+    vld2.8          {d13,d15}, [r8,:128]!   // q6/q7 = even/odd pixels of src1
+    vrhadd.u8       q0,  q4,  q6            // even columns, rows 0+1
+    vld2.8          {d17,d19}, [r9,:128]!   // q8/q9 = even/odd pixels of src2
+    vrhadd.u8       q5,  q5,  q7            // odd columns, rows 0+1
+    vld2.8          {d20,d22}, [r6,:128]!   // first 16 source bytes of the next group (supplies the carry-in pixel below)
+    vrhadd.u8       q1,  q6,  q8            // even columns, rows 1+2
+    vld2.8          {d24,d26}, [r8,:128]!
+    vrhadd.u8       q7,  q7,  q9            // odd columns, rows 1+2
+    vext.8          q4,  q4,  q10, #1       // even pixels shifted by one: the even column to the right
+    vrhadd.u8       q0,  q0,  q5            // dst0 = avg( even rows 0+1, odd rows 0+1 )
+    vext.8          q6,  q6,  q12, #1
+    vrhadd.u8       q1,  q1,  q7            // dstv = avg( even rows 1+2, odd rows 1+2 )
+    vld2.8          {d28,d30}, [r9,:128]!
+    vrhadd.u8       q4,  q4,  q6            // next even column, rows 0+1
+    vext.8          q8,  q8,  q14, #1
+    vrhadd.u8       q6,  q6,  q8            // next even column, rows 1+2
+    vst1.64         {d0-d1},   [r1,:128]!   // r1 = dst0
+    vrhadd.u8       q2,  q4,  q5            // dsth = avg( next even rows 0+1, odd rows 0+1 )
+    vst1.64         {d2-d3},   [r3,:128]!   // r3 = dstv
+    vrhadd.u8       q3,  q6,  q7            // dstc = avg( next even rows 1+2, odd rows 1+2 )
+    vst1.64         {d4-d5},   [r2,:128]!   // r2 = dsth
+    vst1.64         {d6-d7},   [r4,:128]!   // r4 = dstc
+
+    ble             lowres_xloop_end
+    subs            ip,  ip,  #16           // flags are tested by the bgt below
+
+    vld2.8          {d21,d23}, [r6,:128]!   // second half: same computation with q10-q15 holding the current group
+    vld2.8          {d25,d27}, [r8,:128]!
+    vrhadd.u8       q0,  q10, q12
+    vld2.8          {d29,d31}, [r9,:128]!
+    vrhadd.u8       q11, q11, q13
+    vld2.8          {d8, d10}, [r6,:128]!   // and q4/q6/q8 preloaded for the next pass through lowres_xloop
+    vrhadd.u8       q1,  q12, q14
+    vld2.8          {d12,d14}, [r8,:128]!
+    vrhadd.u8       q13, q13, q15
+    vext.8          q10, q10, q4,  #1
+    vrhadd.u8       q0,  q0,  q11
+    vext.8          q12, q12, q6,  #1
+    vrhadd.u8       q1,  q1,  q13
+    vld2.8          {d16,d18}, [r9,:128]!
+    vrhadd.u8       q10, q10, q12
+    vext.8          q14, q14, q8,  #1
+    vrhadd.u8       q12, q12, q14
+    vst1.64         {d0-d1},   [r1,:128]!
+    vrhadd.u8       q2,  q10, q11
+    vst1.64         {d2-d3},   [r3,:128]!
+    vrhadd.u8       q3,  q12, q13
+    vst1.64         {d4-d5},   [r2,:128]!
+    vst1.64         {d6-d7},   [r4,:128]!
+
+    bgt             lowres_xloop
+
+lowres_xloop_end:
+    subs            lr,  lr,  #1            // one output row done; flags are tested by the bgt below
+    add             r0,  r0,  r5,  lsl #1   // src0 += 2*src_stride
+    add             r1,  r1,  r10           // move each dst pointer on by the rounded (dst_stride - width)
+    add             r2,  r2,  r10
+    add             r3,  r3,  r10
+    add             r4,  r4,  r10
+    bgt             lowres_yloop
+
+    vpop            {d8-d15}
+    pop             {r4-r10,pc}
+.endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
new file mode 100644
index 0000000..20cf151
--- /dev/null
+++ b/common/arm/mc-c.c
@@ -0,0 +1,196 @@
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_arm( uint8_t *, int, int );
+void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
+
+void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+
+void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+
+void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+
+void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =   // two-source average functions, indexed by i_width>>2
+{
+    NULL,                       // index 0: no function
+    x264_pixel_avg2_w4_neon,    // index 1: width 4
+    x264_pixel_avg2_w8_neon,    // index 2: width 8
+    x264_pixel_avg2_w16_neon,   // no slower than w12, so no point in a separate function
+    x264_pixel_avg2_w16_neon,   // index 4: width 16
+    x264_pixel_avg2_w20_neon,   // index 5: width 20
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =   // plain copy functions, indexed by i_width>>2
+{
+    NULL,                       // index 0: no function
+    x264_mc_copy_w4_neon,       // index 1: width 4
+    x264_mc_copy_w8_neon,       // index 2: width 8
+    NULL,                       // index 3 (width 12): no function
+    x264_mc_copy_w16_neon,      // index 4: width 16
+};
+
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};   // index into src[4] for the first reference plane, by qpel_idx = ((mvy&3)<<2) + (mvx&3)
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};   // index into src[4] for the second plane, averaged in when qpel_idx & 5
+
+static void mc_luma_neon( uint8_t *dst,    int i_dst_stride,     // writes the motion-compensated block for (mvx, mvy) into dst
+                          uint8_t *src[4], int i_src_stride,     // src[] is indexed through hpel_ref0/hpel_ref1
+                          int mvx, int mvy,                      // low two bits select the sub-position, the rest the whole-sample offset
+                          int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);                       // 0..15: sub-position within the sample
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);               // whole-sample displacement
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 )             // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](                    // dst = average of the two planes
+                dst, i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )                                   // weighting, when enabled, is then applied to dst in place
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+    }
+    else if( weight->weightfn )                                  // single plane, weighted straight from src1 into dst
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+    else                                                         // single plane, unweighted: plain copy
+        x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst,   int *i_dst_stride,    // returns a pointer to the reference block for (mvx, mvy)
+                              uint8_t *src[4], int i_src_stride,    // src[] is indexed through hpel_ref0/hpel_ref1
+                              int mvx, int mvy,                     // low two bits select the sub-position, the rest the whole-sample offset
+                              int i_width, int i_height, const x264_weight_t *weight )
+{
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);                          // 0..15: sub-position within the sample
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);                  // whole-sample displacement
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+    if ( (mvy&3) == 3 )             // explicit if() to force conditional add
+        src1 += i_src_stride;
+
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
+    {
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+        x264_pixel_avg_wtab_neon[i_width>>2](                       // dst = average of the two planes
+                dst, *i_dst_stride, src1, i_src_stride,
+                src2, i_height );
+        if( weight->weightfn )                                      // weighting, when enabled, is then applied to dst in place
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+        return dst;
+    }
+    else
+    {
+        *i_dst_stride = i_src_stride;                               // nothing was written to dst: hand back the source plane and its stride
+        return src1;
+    }
+}
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,   // runs the v, c and h NEON filters over `height` rows
+                              int stride, int width, int height, int16_t *buf )
+{
+    int realign = (intptr_t)src & 15;   // how far src lies past the previous 16-byte boundary
+    src -= realign;                     // move all four pointers back by that amount ...
+    dstv -= realign;
+    dstc -= realign;
+    dsth -= realign;
+    width += realign;                   // ... and widen the row so the original columns are still covered
+    while( height-- )
+    {
+        x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );   // also stores its unscaled 16-bit sums at buf+8
+        x264_hpel_filter_c_neon( dstc, buf+8, width );                // filters those sums; it starts reading 8 entries before buf+8
+        x264_hpel_filter_h_neon( dsth, src, width );
+        dsth += stride;
+        dstv += stride;
+        dstc += stride;
+        src  += stride;
+    }
+}
+
+void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )   // installs the ARM implementations that the cpu flags allow into pf
+{
+    if( !(cpu&X264_CPU_ARMV6) )         // without ARMv6, pf is left untouched
+        return;
+
+    pf->prefetch_fenc = x264_prefetch_fenc_arm;
+    pf->prefetch_ref  = x264_prefetch_ref_arm;
+
+    if( !(cpu&X264_CPU_NEON) )          // everything below is a NEON routine
+        return;
+
+    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;   // this variant puts a 16-byte alignment hint on its source loads too
+    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
+    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
+
+    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
+    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
+    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
+    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
+    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
+    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
+
+// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
+#ifndef SYS_MACOSX
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+#endif
+    pf->memzero_aligned = x264_memzero_aligned_neon;
+
+    pf->mc_chroma = x264_mc_chroma_neon;
+    pf->mc_luma = mc_luma_neon;
+    pf->get_ref = get_ref_neon;
+    pf->hpel_filter = hpel_filter_neon;
+    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+}
diff --git a/common/ppc/mc.h b/common/arm/mc.h
similarity index 80%
copy from common/ppc/mc.h
copy to common/arm/mc.h
index 0465dd9..6ee510e 100644
--- a/common/ppc/mc.h
+++ b/common/arm/mc.h
@@ -1,7 +1,9 @@
 /*****************************************************************************
- * mc.h: h264 encoder library
+ * mc.h: h264 encoder library (Motion Compensation)
  *****************************************************************************
- * Copyright (C) 2003-2008 Eric Petit <eric.petit at lapsus.org>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,9 +20,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_PPC_MC_H
-#define X264_PPC_MC_H
+#ifndef X264_ARM_MC_H
+#define X264_ARM_MC_H
 
-void x264_mc_altivec_init( x264_mc_functions_t *pf );
+void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf );
 
 #endif
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
new file mode 100644
index 0000000..4dd65ed
--- /dev/null
+++ b/common/arm/pixel-a.S
@@ -0,0 +1,1238 @@
+/*****************************************************************************
+ * pixel.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.section .rodata
+.align 4
+
+.rept 16 .byte 0xff    // 16 bytes of 0xff placed immediately before the mask_ff label
+.endr
+mask_ff:    // x264_pixel_ssim_end4_neon loads 16 bytes starting at (mask_ff - 4*r2) to build a lane mask
+.rept 16 .byte 0    // 16 zero bytes placed immediately after the label
+.endr
+
+mask_ac4:    // halfword lanes 0 and 4 are zero, all others all-ones; HADAMARD_AC loads this into q6
+.short 0, -1, -1, -1,  0, -1, -1, -1
+mask_ac8:    // only halfword lane 0 is zero; fetched into q7 by the same 32-byte load, so it must directly follow mask_ac4
+.short 0, -1, -1, -1, -1, -1, -1, -1
+
+.text
+
+.macro SAD4_ARMV6 h    // emits x264_pixel_sad_4x<h>_armv6: sum of absolute differences over h rows of 4 bytes, using usad8/usada8
+function x264_pixel_sad_4x\h\()_armv6
+    push        {r4-r6,lr}
+    ldr         r4, [r2], r3    // one 4-byte row from r2, pointer post-incremented by r3
+    ldr         r5, [r0], r1    // matching 4-byte row from r0, pointer post-incremented by r1
+    ldr         r6, [r2], r3    // second row is loaded ahead of use
+    ldr         lr, [r0], r1
+    usad8       ip, r4, r5    // ip = SAD of the first row
+.rept (\h - 2)/2    // each repetition loads two more rows and accumulates two rows
+    ldr         r4, [r2], r3
+    ldr         r5, [r0], r1
+    usada8      ip, r6, lr, ip
+    ldr         r6, [r2], r3
+    ldr         lr, [r0], r1
+    usada8      ip, r4, r5, ip
+.endr
+    usada8      r0, r6, lr, ip    // add the last loaded row; the total is left in r0
+    pop         {r4-r6,pc}    // restore registers and return by popping the saved lr into pc
+.endfunc
+.endm
+
+SAD4_ARMV6 4    // defines x264_pixel_sad_4x4_armv6
+SAD4_ARMV6 8    // defines x264_pixel_sad_4x8_armv6
+
+
+.macro SAD_START_4 align:vararg    // first row of a 4-wide SAD; align is an optional alignment qualifier applied to the r2 load
+    vld1.32     {d1[]}, [r2 \align], r3    // 4 bytes from r2 replicated to both 32-bit lanes of d1; r2 += r3
+    vld1.32     {d0[]}, [r0,:32], r1    // 4 bytes from r0 (declared 32-bit aligned); r0 += r1
+    vabdl.u8    q8,  d0,  d1    // q8 = widened |d0 - d1|; this overwrites q8, starting the accumulator
+.endm
+
+.macro SAD_4 align:vararg    // one further row of a 4-wide SAD, accumulated into q8
+    vld1.32     {d1[]}, [r2 \align], r3
+    vld1.32     {d0[]}, [r0,:32], r1
+    vabal.u8    q8,  d0,  d1    // q8 += widened |d0 - d1|
+.endm
+
+.macro SAD_START_8 align:vararg    // first row of an 8-wide SAD; align is an optional alignment qualifier applied to the r2 load
+    vld1.64     {d1}, [r2 \align], r3    // 8 bytes from r2; r2 += r3
+    vld1.64     {d0}, [r0,:64], r1    // 8 bytes from r0 (declared 64-bit aligned); r0 += r1
+    vabdl.u8    q8,  d0,  d1    // q8 = widened |d0 - d1|; this overwrites q8, starting the accumulator
+.endm
+
+.macro SAD_8 align:vararg    // one further row of an 8-wide SAD, accumulated into q8
+    vld1.64     {d1}, [r2 \align], r3
+    vld1.64     {d0}, [r0,:64], r1
+    vabal.u8    q8,  d0,  d1    // q8 += widened |d0 - d1|
+.endm
+
+.macro SAD_START_16 align:vararg    // starts a 16-wide SAD: accumulates row 0 and preloads row 1
+    vld1.64     {d2-d3}, [r2 \align], r3
+    vld1.64     {d0-d1}, [r0,:128], r1
+    vabdl.u8    q8,  d0,  d2    // q8 = |diff| of the low 8 bytes of row 0
+    vld1.64     {d6-d7}, [r2 \align], r3    // row 1 goes to d4-d7 and is NOT accumulated here
+    vabdl.u8    q9,  d1,  d3    // q9 = |diff| of the high 8 bytes of row 0
+    vld1.64     {d4-d5}, [r0,:128], r1    // row 1 is consumed by the next SAD_16 or by the tail of SAD_FUNC
+.endm
+
+.macro SAD_16 align:vararg    // 16-wide SAD step: accumulates two rows and leaves one more row preloaded in d4-d7
+    vabal.u8    q8,  d4,  d6    // accumulate the row preloaded by the previous step (low half)
+    vld1.64     {d2-d3}, [r2 \align], r3
+    vabal.u8    q9,  d5,  d7    // accumulate the preloaded row (high half)
+    vld1.64     {d0-d1}, [r0,:128], r1
+    vabal.u8    q8,  d0,  d2    // accumulate the row just loaded into d0-d3
+    vld1.64     {d6-d7}, [r2 \align], r3    // preload the next row for the following step
+    vabal.u8    q9,  d1,  d3
+    vld1.64     {d4-d5}, [r0,:128], r1
+.endm
+
+.macro SAD_FUNC w, h, name, align:vararg    // emits x264_pixel_sad<name>_<w>x<h>_neon; align is forwarded to the per-row macros
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
+.if \w == 16
+    .set r, \h / 2 - 1    // SAD_START_16 covers two rows and every SAD_16 covers two more
+.else
+    .set r, \h - 1    // narrower widths handle one row per macro invocation
+.endif
+
+    SAD_START_\w \align
+.rept r
+    SAD_\w \align
+.endr
+
+.if \w > 8
+    vabal.u8    q8,  d4,  d6    // accumulate the last row, still pending in d4-d7
+    vabal.u8    q9,  d5,  d7
+    vadd.u16    q8,  q8,  q9    // merge the low-half and high-half accumulators
+.endif
+.if \w > 4
+    vadd.u16    d16, d16, d17    // fold the 8 halfword lanes of q8 down to 4
+.endif
+    vpadd.u16   d0,  d16, d16    // pairwise add: 4 lanes -> 2 partial sums
+    vpaddl.u16  d0,  d0    // widen and add the pairs: lane 0 holds the 32-bit total
+    vmov.u32    r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+.endm
+
+SAD_FUNC  4,  4    // this group passes no name suffix and no alignment qualifier (x264_pixel_sad_<w>x<h>_neon); the group below emits the _aligned variants with an alignment qualifier on the r2 loads
+SAD_FUNC  4,  8
+SAD_FUNC  8,  4
+SAD_FUNC  8,  8
+SAD_FUNC  8,  16
+SAD_FUNC  16, 8
+SAD_FUNC  16, 16
+
+SAD_FUNC  4,  4,  _aligned, ,:32
+SAD_FUNC  4,  8,  _aligned, ,:32
+SAD_FUNC  8,  4,  _aligned, ,:64
+SAD_FUNC  8,  8,  _aligned, ,:64
+SAD_FUNC  8,  16, _aligned, ,:64
+SAD_FUNC  16, 8,  _aligned, ,:128
+SAD_FUNC  16, 16, _aligned, ,:128
+
+// If dual issue is possible, use additional accumulators to avoid
+// stalls from vabal's latency. This only matters for aligned.
+.macro SAD_DUAL_START_8    // first two rows of an 8-wide SAD, using two independent accumulators (q8 and q9)
+    SAD_START_8 ,:64
+    vld1.64     {d3}, [r2,:64], r3
+    vld1.64     {d2}, [r0,:64], r1
+    vabdl.u8    q9,  d2,  d3    // second row starts the second accumulator
+.endm
+
+.macro SAD_DUAL_8 align:vararg    // two more 8-wide rows, alternating between q8 and q9; the align argument is not referenced in the body
+    vld1.64     {d1}, [r2,:64], r3
+    vld1.64     {d0}, [r0,:64], r1
+    vabal.u8    q8,  d0,  d1    // first of the two rows goes to q8
+    vld1.64     {d3}, [r2,:64], r3
+    vld1.64     {d2}, [r0,:64], r1
+    vabal.u8    q9,  d2,  d3    // second of the two rows goes to q9
+.endm
+
+.macro SAD_DUAL_START_16    // starts a 16-wide SAD with four accumulators (q8-q11): rows 0 and 1 accumulated, row 2 preloaded
+    SAD_START_16 ,:128
+    vabdl.u8    q10, d4,  d6    // row 1 (preloaded by SAD_START_16) starts q10/q11
+    vld1.64     {d2-d3}, [r2,:128], r3    // preload row 2 into d0-d3
+    vabdl.u8    q11, d5,  d7
+    vld1.64     {d0-d1}, [r0,:128], r1
+.endm
+
+.macro SAD_DUAL_16    // 16-wide step: accumulates two rows (one into q8/q9, one into q10/q11) and preloads the next into d0-d3
+    vabal.u8    q8,  d0,  d2    // row preloaded in d0-d3 by the previous step
+    vld1.64     {d6-d7}, [r2,:128], r3
+    vabal.u8    q9,  d1,  d3
+    vld1.64     {d4-d5}, [r0,:128], r1
+    vabal.u8    q10, d4,  d6    // row just loaded into d4-d7
+    vld1.64     {d2-d3}, [r2,:128], r3    // preload for the following step
+    vabal.u8    q11, d5,  d7
+    vld1.64     {d0-d1}, [r0,:128], r1
+.endm
+
+.macro SAD_DUAL_END_16    // final 16-wide step: same as SAD_DUAL_16 but without preloading another row
+    vabal.u8    q8,  d0,  d2
+    vld1.64     {d6-d7}, [r2,:128], r3    // last row of the block
+    vabal.u8    q9,  d1,  d3
+    vld1.64     {d4-d5}, [r0,:128], r1
+    vabal.u8    q10, d4,  d6
+    vabal.u8    q11, d5,  d7
+.endm
+
+.macro SAD_FUNC_DUAL w, h    // emits x264_pixel_sad_aligned_<w>x<h>_neon_dual, the multi-accumulator variant of the aligned SAD
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
+    SAD_DUAL_START_\w
+.rept \h / 2 - \w / 8    // start covers 2 rows; for w == 16 SAD_DUAL_END_16 covers 2 more, hence one fewer repetition
+    SAD_DUAL_\w
+.endr
+
+.if \w > 8
+    SAD_DUAL_END_16
+    vadd.u16    q8,  q8,  q9    // reduce four accumulators to two
+    vadd.u16    q9,  q10, q11
+.endif
+.if \w > 4
+    vadd.u16    q8,  q8,  q9    // reduce two accumulators to one
+    vadd.u16    d16, d16, d17    // fold 8 halfword lanes to 4
+.endif
+    vpadd.u16   d0,  d16, d16
+    vpaddl.u16  d0,  d0    // lane 0 holds the 32-bit total
+    vmov.u32    r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+.endm
+
+SAD_FUNC_DUAL  8,  4    // dual variants are emitted for the 8- and 16-wide sizes only
+SAD_FUNC_DUAL  8,  8
+SAD_FUNC_DUAL  8,  16
+SAD_FUNC_DUAL  16, 8
+SAD_FUNC_DUAL  16, 16
+
+
+.macro SAD_X_START_4 x    // first row of a 4-wide multi-candidate SAD: one block at r0 against x (3 or 4) candidates
+    vld1.32     {d0[]}, [r0,:32], lr    // shared block row; r0 advances by lr
+    vld1.32     {d1[]}, [r1], r6    // each candidate pointer advances by r6
+    vabdl.u8    q8,  d1,  d0    // q8 starts the accumulator for the candidate at r1
+    vld1.32     {d2[]}, [r2], r6
+    vabdl.u8    q9,  d2,  d0    // q9: candidate at r2
+    vld1.32     {d3[]}, [r3], r6
+    vabdl.u8    q10, d3,  d0    // q10: candidate at r3
+.if \x == 4
+    vld1.32     {d4[]}, [r12], r6
+    vabdl.u8    q11, d4,  d0    // q11: fourth candidate at r12
+.endif
+.endm
+
+.macro SAD_X_4 x    // one further row of the 4-wide multi-candidate SAD, accumulated into q8-q10 (and q11 when x == 4)
+    vld1.32     {d0[]}, [r0,:32], lr
+    vld1.32     {d1[]}, [r1], r6
+    vabal.u8    q8,  d1,  d0
+    vld1.32     {d2[]}, [r2], r6
+    vabal.u8    q9,  d2,  d0
+    vld1.32     {d3[]}, [r3], r6
+    vabal.u8    q10, d3,  d0
+.if \x == 4
+    vld1.32     {d4[]}, [r12], r6
+    vabal.u8    q11, d4,  d0
+.endif
+.endm
+
+.macro SAD_X_START_8 x    // first row of an 8-wide multi-candidate SAD; starts accumulators q8-q10 (and q11 when x == 4)
+    vld1.64     {d0}, [r0,:64], lr    // shared block row; r0 advances by lr
+    vld1.64     {d1}, [r1], r6    // candidate loads carry no alignment qualifier
+    vabdl.u8    q8,  d1,  d0
+    vld1.64     {d2}, [r2], r6
+    vabdl.u8    q9,  d2,  d0
+    vld1.64     {d3}, [r3], r6
+    vabdl.u8    q10, d3,  d0
+.if \x == 4
+    vld1.64     {d4}, [r12], r6
+    vabdl.u8    q11, d4,  d0
+.endif
+.endm
+
+.macro SAD_X_8 x    // one further row of the 8-wide multi-candidate SAD, accumulated into q8-q10 (and q11 when x == 4)
+    vld1.64     {d0}, [r0,:64], lr
+    vld1.64     {d1}, [r1], r6
+    vabal.u8    q8,  d1,  d0
+    vld1.64     {d2}, [r2], r6
+    vabal.u8    q9,  d2,  d0
+    vld1.64     {d3}, [r3], r6
+    vabal.u8    q10, d3,  d0
+.if \x == 4
+    vld1.64     {d4}, [r12], r6
+    vabal.u8    q11, d4,  d0
+.endif
+.endm
+
+.macro SAD_X_START_16 x    // first row of a 16-wide multi-candidate SAD; low halves go to q8-q11, high halves to q12-q15
+    vld1.64     {d0-d1}, [r0,:128], lr    // shared block row; r0 advances by lr
+    vld1.64     {d2-d3}, [r1], r6
+    vabdl.u8    q8,  d2,  d0    // candidate at r1, low 8 bytes
+    vabdl.u8    q12, d3,  d1    // candidate at r1, high 8 bytes
+    vld1.64     {d4-d5}, [r2], r6
+    vabdl.u8    q9,  d4,  d0
+    vabdl.u8    q13, d5,  d1
+    vld1.64     {d6-d7}, [r3], r6
+    vabdl.u8    q10, d6,  d0
+    vabdl.u8    q14, d7,  d1
+.if \x == 4
+    vld1.64     {d2-d3}, [r12], r6    // d2-d3 are reused for the fourth candidate
+    vabdl.u8    q11, d2,  d0
+    vabdl.u8    q15, d3,  d1
+.endif
+.endm
+
+.macro SAD_X_16 x    // one further row of the 16-wide multi-candidate SAD, accumulated into q8-q11 (low) and q12-q15 (high)
+    vld1.64     {d0-d1}, [r0,:128], lr
+    vld1.64     {d2-d3}, [r1], r6
+    vabal.u8    q8,  d2,  d0
+    vabal.u8    q12, d3,  d1
+    vld1.64     {d4-d5}, [r2], r6
+    vabal.u8    q9,  d4,  d0
+    vabal.u8    q13, d5,  d1
+    vld1.64     {d6-d7}, [r3], r6
+    vabal.u8    q10, d6,  d0
+    vabal.u8    q14, d7,  d1
+.if \x == 4
+    vld1.64     {d2-d3}, [r12], r6    // d2-d3 are reused for the fourth candidate
+    vabal.u8    q11, d2,  d0
+    vabal.u8    q15, d3,  d1
+.endif
+.endm
+
+.macro SAD_X_FUNC x, w, h    // emits x264_pixel_sad_x<x>_<w>x<h>_neon: SAD of the block at r0 against x candidates, results stored through r7
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
+    push        {r6-r7,lr}    // 12 bytes pushed, so the caller's first stack argument is now at [sp, #12]
+.if \x == 3
+    ldrd        r6,  [sp, #12]    // r6 = first stack argument (used as candidate stride), r7 = second (output pointer)
+.else
+    ldrd        r6,  [sp, #16]    // with four candidates the stride/output pair sits one word further up
+    ldr         r12, [sp, #12]    // first stack argument is the fourth candidate pointer
+.endif
+    mov         lr,  #FENC_STRIDE    // row stride for r0; FENC_STRIDE is defined outside this chunk
+
+    SAD_X_START_\w \x
+.rept \h - 1
+    SAD_X_\w \x
+.endr
+
+// add up the sads
+.if \w > 8
+    vadd.u16    q8,  q8,  q12    // merge the high-half accumulators into the low-half ones
+    vadd.u16    q9,  q9,  q13
+    vadd.u16    q10, q10, q14
+.if \x == 4
+    vadd.u16    q11, q11, q15
+.endif
+.endif
+.if \w > 4
+    vadd.u16    d16, d16, d17    // fold each accumulator from 8 halfword lanes to 4
+    vadd.u16    d18, d18, d19
+    vadd.u16    d20, d20, d21
+.if \x == 4
+    vadd.u16    d22, d22, d23
+.endif
+.endif
+    vpadd.u16   d0,  d16, d18    // d0 = partial sums for candidates 0 and 1
+    vpadd.u16   d1,  d20, d22    // d1 = partial sums for candidates 2 and 3 (d22 is never written when x == 3)
+    vpaddl.u16  q0,  q0    // q0 = four 32-bit totals, one per candidate
+
+.if \x == 3
+    vst1.32     {d0},    [r7]!    // store the first two totals and advance r7
+    vst1.32     {d1[0]}, [r7,:32]    // store only the third total; lane d1[1] is not written out
+.else
+    vst1.32     {d0-d1}, [r7]    // store all four totals
+.endif
+    pop         {r6-r7,pc}
+.endfunc
+.endm
+
+SAD_X_FUNC  3, 4,  4    // three-candidate variants: x264_pixel_sad_x3_<w>x<h>_neon
+SAD_X_FUNC  3, 4,  8
+SAD_X_FUNC  3, 8,  4
+SAD_X_FUNC  3, 8,  8
+SAD_X_FUNC  3, 8,  16
+SAD_X_FUNC  3, 16, 8
+SAD_X_FUNC  3, 16, 16
+
+SAD_X_FUNC  4, 4,  4    // four-candidate variants: x264_pixel_sad_x4_<w>x<h>_neon
+SAD_X_FUNC  4, 4,  8
+SAD_X_FUNC  4, 8,  4
+SAD_X_FUNC  4, 8,  8
+SAD_X_FUNC  4, 8,  16
+SAD_X_FUNC  4, 16, 8
+SAD_X_FUNC  4, 16, 16
+
+
+.macro SSD_START_4    // starts a 4-wide sum of squared differences: row 0 squared into q0, row 1 preloaded
+    vld1.32     {d16[]}, [r0,:32], r1
+    vld1.32     {d17[]}, [r2,:32], r3
+    vsubl.u8    q2, d16, d17    // q2 = widened signed difference of row 0
+    vld1.32     {d16[]}, [r0,:32], r1    // preload row 1
+    vmull.s16   q0, d4, d4    // q0 = squares of the four differences in d4; d5 is not used
+    vld1.32     {d17[]}, [r2,:32], r3
+.endm
+
+.macro SSD_4    // 4-wide SSD step: accumulates the preloaded row into q0 and preloads the next one
+    vsubl.u8    q2, d16, d17
+    vld1.32     {d16[]}, [r0,:32], r1
+    vmlal.s16   q0, d4, d4    // q0 += squared differences
+    vld1.32     {d17[]}, [r2,:32], r3
+.endm
+
+.macro SSD_END_4    // final 4-wide SSD step: accumulates the last preloaded row without loading more
+    vsubl.u8    q2, d16, d17
+    vmlal.s16   q0, d4, d4
+.endm
+
+.macro SSD_START_8    // starts an 8-wide SSD: row 0 squared into q0, row 1 preloaded
+    vld1.64     {d16}, [r0,:64], r1
+    vld1.64     {d17}, [r2,:64], r3
+    vsubl.u8    q2, d16, d17    // q2 (d4:d5) = eight widened differences
+    vld1.64     {d16}, [r0,:64], r1    // preload row 1
+    vmull.s16   q0, d4, d4    // q0 = squares of differences 0-3
+    vmlal.s16   q0, d5, d5    // q0 += squares of differences 4-7
+    vld1.64     {d17}, [r2,:64], r3
+.endm
+
+.macro SSD_8    // 8-wide SSD step: accumulates the preloaded row into q0 and preloads the next one
+    vsubl.u8    q2, d16, d17
+    vld1.64     {d16}, [r0,:64], r1
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d17}, [r2,:64], r3
+.endm
+
+.macro SSD_END_8    // final 8-wide SSD step: accumulates the last preloaded row without loading more
+    vsubl.u8    q2, d16, d17
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+.endm
+
+.macro SSD_START_16    // starts a 16-wide SSD: row 0 squared into q0, row 1 preloaded into d16-d19
+    vld1.64     {d16-d17}, [r0,:128], r1
+    vld1.64     {d18-d19}, [r2,:128], r3
+    vsubl.u8    q2, d16, d18    // differences of the low 8 bytes
+    vsubl.u8    q3, d17, d19    // differences of the high 8 bytes
+    vld1.64     {d16-d17}, [r0,:128], r1    // preload row 1
+    vmull.s16   q0, d4, d4    // first product overwrites q0; the rest accumulate
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d18-d19}, [r2,:128], r3
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+.endm
+
+.macro SSD_16    // 16-wide SSD step: accumulates the preloaded row into q0 and preloads the next one
+    vsubl.u8    q2, d16, d18
+    vsubl.u8    q3, d17, d19
+    vld1.64     {d16-d17}, [r0,:128], r1
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vld1.64     {d18-d19}, [r2,:128], r3
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+.endm
+
+.macro SSD_END_16    // final 16-wide SSD step: accumulates the last preloaded row without loading more
+    vsubl.u8    q2, d16, d18
+    vsubl.u8    q3, d17, d19
+    vmlal.s16   q0, d4, d4
+    vmlal.s16   q0, d5, d5
+    vmlal.s16   q0, d6, d6
+    vmlal.s16   q0, d7, d7
+.endm
+
+.macro SSD_FUNC w h    // emits x264_pixel_ssd_<w>x<h>_neon: sum of squared differences between the blocks at r0 and r2
+function x264_pixel_ssd_\w\()x\h\()_neon
+    SSD_START_\w
+.rept \h-2    // START and END each account for one row, the loop body for the remaining h-2
+    SSD_\w
+.endr
+    SSD_END_\w
+    vadd.s32    d0, d0, d1    // fold the four 32-bit partial sums of q0 to two
+    vpadd.s32   d0, d0, d0    // and to one
+    vmov.32     r0, d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+.endm
+
+SSD_FUNC   4, 4    // one x264_pixel_ssd_<w>x<h>_neon per listed block size
+SSD_FUNC   4, 8
+SSD_FUNC   8, 4
+SSD_FUNC   8, 8
+SSD_FUNC   8, 16
+SSD_FUNC  16, 8
+SSD_FUNC  16, 16
+
+
+.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16    // one pipelined variance step; the last argument selects the reduction instruction (default accumulates)
+    vmull.u8        \qsqr, \dsrc, \dsrc    // squares of the 8 pixels in dsrc, consumed by a later step as qsqr_last
+    vaddw.u8        q0, q0, \dsrc    // q0 accumulates the plain pixel sum
+    \vpadal         \qsqr_sum, \qsqr_last    // pairwise-widen the squares from an earlier step into the 32-bit accumulator
+.endm
+
+function x264_pixel_var_8x8_neon    // pixel sum and sum of squares over 8 rows of 8 bytes at r0 (stride r1); finishes in x264_var_end
+    vld1.64         {d16}, [r0,:64], r1
+    vmull.u8        q1,  d16, d16    // row 0 squares
+    vmovl.u8        q0,  d16    // q0 = row 0 widened, starting the pixel-sum accumulator
+    vld1.64         {d18}, [r0,:64], r1
+    vmull.u8        q2,  d18, d18    // row 1 squares
+    vaddw.u8        q0,  q0,  d18
+
+    vld1.64         {d20}, [r0,:64], r1
+    VAR_SQR_SUM     q1,  q1,   q3,  d20, vpaddl.u16    // vpaddl overwrites q1 with its own widened pairs, turning it into the 32-bit accumulator
+    vld1.64         {d22}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q2,   q8,  d22, vpaddl.u16    // same for the second accumulator q2
+
+    vld1.64         {d24}, [r0,:64], r1
+    VAR_SQR_SUM     q1,  q3,   q9,  d24
+    vld1.64         {d26}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q8,   q10, d26
+    vld1.64         {d24}, [r0,:64], r1
+    VAR_SQR_SUM     q1,  q9,   q14, d24    // squares of the last two rows are left in q14/q15 for x264_var_end
+    vld1.64         {d26}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q10,  q15, d26
+    b               x264_var_end    // tail branch: x264_var_end returns to our caller
+.endfunc
+
+function x264_pixel_var_16x16_neon    // pixel sum and sum of squares over 16 rows of 16 bytes at r0 (stride r1); has no return of its own
+    vld1.64         {d16-d17}, [r0,:128], r1
+    vmull.u8        q12, d16, d16
+    vmovl.u8        q0,  d16    // q0 starts the pixel-sum accumulator
+    vmull.u8        q13, d17, d17
+    vaddw.u8        q0,  q0,  d17
+
+    vld1.64         {d18-d19}, [r0,:128], r1
+    VAR_SQR_SUM     q1,  q12,  q14, d18, vpaddl.u16    // vpaddl initialises q1 from the row 0 squares
+    VAR_SQR_SUM     q2,  q13,  q15, d19, vpaddl.u16    // and q2 likewise
+
+    mov             ip,  #7    // 7 iterations of 2 rows each; 2 rows were handled above
+var16_loop:
+    subs            ip,  ip,  #1    // sets the flags tested by the bgt at the bottom of the loop
+    vld1.64         {d16-d17}, [r0,:128], r1
+    VAR_SQR_SUM     q1,  q14,  q12, d16
+    VAR_SQR_SUM     q2,  q15,  q13, d17
+
+    vld1.64         {d18-d19}, [r0,:128], r1
+    VAR_SQR_SUM     q1,  q12,  q14, d18    // leaves the newest squares in q14/q15, as x264_var_end expects
+    VAR_SQR_SUM     q2,  q13,  q15, d19
+    bgt             var16_loop    // after the loop, execution runs on into x264_var_end, which must stay directly below
+.endfunc
+
+function x264_var_end    // shared tail of the var functions: expects the pixel sum in q0, square sums in q1/q2 and pending squares in q14/q15
+    vpaddl.u16      q8,  q14    // widen the squares not yet accumulated
+    vpaddl.u16      q9,  q15
+    vadd.u32        q1,  q1,  q8
+    vadd.u16        d0,  d0,  d1    // fold the pixel-sum lanes from 8 to 4
+    vadd.u32        q1,  q1,  q9
+    vadd.u32        q1,  q1,  q2    // q1 now holds all squares as four 32-bit partial sums
+    vpaddl.u16      d0,  d0    // pixel sum: 4 halfword lanes -> 2 word lanes
+    vadd.u32        d2,  d2,  d3
+    vpadd.u32       d0,  d0,  d2    // d0[0] = total pixel sum, d0[1] = total sum of squares
+
+    vmov            r0,  r1,  d0    // r0 = pixel sum, r1 = sum of squares
+    bx              lr
+.endfunc
+
+.macro DIFF_SUM diff da db lastdiff    // loads one 8-byte row from r0 and from r2, optionally adds an earlier difference row into q0, then computes the new difference
+    vld1.64         {\da}, [r0,:64], r1
+    vld1.64         {\db}, [r2,:64], r3
+.ifnb \lastdiff
+    vadd.s16        q0,  q0,  \lastdiff    // q0 is the running sum of differences
+.endif
+    vsubl.u8        \diff, \da, \db    // widened signed difference da - db
+.endm
+
+.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16    // adds the squares of the 16-bit lanes in d0 and d1 to acc; pass vmull.s16 as last argument to initialise acc instead
+    \vmlal          \acc, \d0, \d0
+    vmlal.s16       \acc, \d1, \d1
+.endm
+
+function x264_pixel_var2_8x8_neon    // over the 8x8 difference of the blocks at r0 and r2: stores the sum of squares through the first stack argument and returns sqr - (sum*sum >> 6)
+    DIFF_SUM        q0,  d0,  d1    // row 0 difference lands directly in q0, the running sum
+    DIFF_SUM        q8,  d16, d17
+    SQR_ACC         q1,  d0,  d1,  vmull.s16    // q1 initialised with row 0 squares (d0/d1 are the halves of q0)
+    DIFF_SUM        q9,  d18, d19, q8
+    SQR_ACC         q2,  d16, d17, vmull.s16    // q2 initialised with row 1 squares
+.rept 2    // each repetition handles two more rows, alternating q8/q9 and q1/q2
+    DIFF_SUM        q8,  d16, d17, q9
+    SQR_ACC         q1,  d18, d19
+    DIFF_SUM        q9,  d18, d19, q8
+    SQR_ACC         q2,  d16, d17
+.endr
+    DIFF_SUM        q8,  d16, d17, q9
+    SQR_ACC         q1,  d18, d19
+    vadd.s16        q0,  q0,  q8    // add the last row difference to the running sum
+    SQR_ACC         q2,  d16, d17
+
+    ldr             ip,  [sp]    // first stack argument: pointer that receives the sum of squares
+    vadd.s16        d0,  d0,  d1
+    vadd.s32        q1,  q1,  q2
+    vpaddl.s16      d0,  d0
+    vadd.s32        d1,  d2,  d3
+    vpadd.s32       d0,  d0,  d1    // d0[0] = sum of differences, d0[1] = sum of squared differences
+
+    vmov.32         r0,  r1,  d0
+    vst1.32         {d0[1]}, [ip,:32]    // write the sum of squares through ip
+    mul             r0,  r0,  r0    // sum * sum
+    sub             r0,  r1,  r0,  lsr #6    // return value: sqr - (sum*sum >> 6)
+    bx              lr
+.endfunc
+
+
+.macro LOAD_DIFF_8x4 q0 q1 q2 q3    // loads four 8-byte rows from r0 and r2 and writes their widened differences to the four named q registers
+    vld1.32     {d1}, [r2], r3    // r2 rows carry no alignment qualifier
+    vld1.32     {d0}, [r0,:64], r1    // r0 rows are declared 64-bit aligned
+    vsubl.u8    \q0, d0,  d1
+    vld1.32     {d3}, [r2], r3
+    vld1.32     {d2}, [r0,:64], r1
+    vsubl.u8    \q1, d2,  d3
+    vld1.32     {d5}, [r2], r3
+    vld1.32     {d4}, [r0,:64], r1
+    vsubl.u8    \q2, d4,  d5
+    vld1.32     {d7}, [r2], r3
+    vld1.32     {d6}, [r0,:64], r1
+    vsubl.u8    \q3, d6,  d7
+.endm
+
+function x264_pixel_satd_4x4_neon    // 4x4 SATD of the blocks at r0 (stride r1) and r2 (stride r3); result in r0
+    vld1.32     {d1[]},  [r2], r3    // rows 0 and 1 go to lane 0 (replicated) ...
+    vld1.32     {d0[]},  [r0,:32], r1
+    vld1.32     {d3[]},  [r2], r3
+    vld1.32     {d2[]},  [r0,:32], r1
+    vld1.32     {d1[1]}, [r2], r3    // ... rows 2 and 3 then overwrite lane 1 of the same registers
+    vld1.32     {d0[1]}, [r0,:32], r1
+    vld1.32     {d3[1]}, [r2], r3
+    vld1.32     {d2[1]}, [r0,:32], r1
+    vsubl.u8    q0,  d0,  d1    // q0 = differences of rows 0 and 2
+    vsubl.u8    q1,  d2,  d3    // q1 = differences of rows 1 and 3
+
+    SUMSUB_AB   q2, q3, q0, q1    // SUMSUB_*, HADAMARD and HORIZ_ADD are macros defined outside this chunk (presumably asm.S)
+    SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7
+    HADAMARD    1, sumsub, q2, q3, q0, q1
+    HADAMARD    2, amax,   q0,,    q2, q3
+
+    HORIZ_ADD   d0,  d0,  d1
+    vmov.32     r0,  d0[0]
+    bx          lr
+.endfunc
+
+function x264_pixel_satd_4x8_neon    // 4x8 SATD: builds difference rows in q0-q3, first butterflies into q8-q11, then uses the shared 4x8/8x4 tail
+    vld1.32     {d1[]},  [r2], r3    // rows 0-3 go to lane 0 of four register pairs
+    vld1.32     {d0[]},  [r0,:32], r1
+    vld1.32     {d3[]},  [r2], r3
+    vld1.32     {d2[]},  [r0,:32], r1
+    vld1.32     {d5[]},  [r2], r3
+    vld1.32     {d4[]},  [r0,:32], r1
+    vld1.32     {d7[]},  [r2], r3
+    vld1.32     {d6[]},  [r0,:32], r1
+
+    vld1.32     {d1[1]}, [r2], r3    // rows 4-7 fill lane 1, so each d register holds rows n and n+4
+    vld1.32     {d0[1]}, [r0,:32], r1
+    vsubl.u8    q0,  d0,  d1
+    vld1.32     {d3[1]}, [r2], r3
+    vld1.32     {d2[1]}, [r0,:32], r1
+    vsubl.u8    q1,  d2,  d3
+    vld1.32     {d5[1]}, [r2], r3
+    vld1.32     {d4[1]}, [r0,:32], r1
+    vsubl.u8    q2,  d4,  d5
+    vld1.32     {d7[1]}, [r2], r3
+    SUMSUB_AB   q8,  q9,  q0,  q1    // SUMSUB_AB is defined outside this chunk
+    vld1.32     {d6[1]}, [r0,:32], r1
+    vsubl.u8    q3,  d6,  d7
+    SUMSUB_AB   q10, q11, q2,  q3
+    b           x264_satd_4x8_8x4_end_neon    // tail branch: the shared tail returns to our caller
+.endfunc
+
+function x264_pixel_satd_8x4_neon    // 8x4 SATD front end; has no return and runs on into x264_satd_4x8_8x4_end_neon, which must stay directly below
+    vld1.64     {d1}, [r2], r3
+    vld1.64     {d0}, [r0,:64], r1
+    vsubl.u8    q0,  d0,  d1    // q0-q3 = difference rows 0-3
+    vld1.64     {d3}, [r2], r3
+    vld1.64     {d2}, [r0,:64], r1
+    vsubl.u8    q1,  d2,  d3
+    vld1.64     {d5}, [r2], r3
+    vld1.64     {d4}, [r0,:64], r1
+    vsubl.u8    q2,  d4,  d5
+    vld1.64     {d7}, [r2], r3
+    SUMSUB_AB   q8,  q9,  q0,  q1    // SUMSUB_AB is defined outside this chunk
+    vld1.64     {d6}, [r0,:64], r1
+    vsubl.u8    q3,  d6,  d7
+    SUMSUB_AB   q10, q11, q2,  q3
+.endfunc
+
+function x264_satd_4x8_8x4_end_neon    // shared tail of satd_4x8 and satd_8x4: takes the first butterfly stage in q8-q11 and returns the SATD in r0
+    vadd.s16    q0,  q8,  q10    // second butterfly stage
+    vadd.s16    q1,  q9,  q11
+    vsub.s16    q2,  q8,  q10
+    vsub.s16    q3,  q9,  q11
+
+    vtrn.16     q0,  q1    // interleave 16-bit lanes so the next butterfly pairs neighbouring elements
+    vadd.s16    q8,  q0,  q1
+    vtrn.16     q2,  q3
+    vsub.s16    q9,  q0,  q1
+    vadd.s16    q10, q2,  q3
+    vsub.s16    q11, q2,  q3
+    vtrn.32     q8,  q10
+    vabs.s16    q8,  q8
+    vtrn.32     q9,  q11
+    vabs.s16    q10, q10
+    vabs.s16    q9,  q9
+    vabs.s16    q11, q11
+    vmax.u16    q0,  q8,  q10    // lane-wise max of the absolute values stands in for the last butterfly stage
+    vmax.u16    q1,  q9,  q11
+
+    vadd.u16    q0,  q0,  q1
+    HORIZ_ADD   d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    vmov.32     r0,  d0[0]
+    bx          lr
+.endfunc
+
+function x264_pixel_satd_8x8_neon    // 8x8 SATD: sums the four partial vectors left in q12-q15 by x264_satd_8x8_neon
+    mov         ip,  lr    // bl overwrites lr, so the return address is parked in ip
+
+    bl x264_satd_8x8_neon
+    vadd.u16    q0,  q12, q13
+    vadd.u16    q1,  q14, q15
+
+    vadd.u16    q0,  q0,  q1
+    HORIZ_ADD   d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    mov         lr,  ip
+    vmov.32     r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+
+function x264_pixel_satd_8x16_neon    // 8x16 SATD: two consecutive 8x8 passes accumulated in q4/q5
+    vpush       {d8-d11}    // q4/q5 are used as accumulators and restored before returning
+    mov         ip,  lr    // bl overwrites lr, so the return address is parked in ip
+
+    bl x264_satd_8x8_neon
+    vadd.u16    q4,  q12, q13    // first pass initialises the accumulators
+    vadd.u16    q5,  q14, q15
+
+    bl x264_satd_8x8_neon    // r0/r2 were advanced by the first pass, so this covers the next 8 rows
+    vadd.u16    q4,  q4,  q12
+    vadd.u16    q5,  q5,  q13
+    vadd.u16    q4,  q4,  q14
+    vadd.u16    q5,  q5,  q15
+
+    vadd.u16    q0,  q4,  q5
+    HORIZ_ADD   d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    vpop        {d8-d11}
+    mov         lr,  ip
+    vmov.32     r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+
+function x264_satd_8x8_neon    // internal helper: loads 8 difference rows with interleaved butterflies; has no return and runs on into x264_satd_8x4v_8x8h_neon
+    LOAD_DIFF_8x4 q8,  q9,  q10, q11    // difference rows 0-3
+    vld1.64     {d7}, [r2], r3
+    SUMSUB_AB   q0,  q1,  q8,  q9    // SUMSUB_AB is defined outside this chunk
+    vld1.64     {d6}, [r0,:64], r1
+    vsubl.u8    q12, d6,  d7    // difference row 4
+    vld1.64     {d17}, [r2], r3
+    SUMSUB_AB   q2,  q3,  q10, q11
+    vld1.64     {d16}, [r0,:64], r1
+    vsubl.u8    q13, d16, d17    // difference row 5
+    vld1.64     {d19}, [r2], r3
+    SUMSUB_AB   q8,  q10, q0,  q2
+    vld1.64     {d18}, [r0,:64], r1
+    vsubl.u8    q14, d18, d19    // difference row 6
+    vld1.64     {d1}, [r2], r3
+    SUMSUB_AB   q9,  q11, q1,  q3
+    vld1.64     {d0}, [r0,:64], r1
+    vsubl.u8    q15, d0,  d1    // difference row 7
+.endfunc
+
+// one vertical hadamard pass and two horizontal
+function x264_satd_8x4v_8x8h_neon    // internal helper reached by fall-through or tail branch; consumes q8-q15 and leaves four partial-sum vectors in q12-q15
+    SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15    // SUMSUB_* and ABS2 are defined outside this chunk
+    vtrn.16     q8,  q9
+    SUMSUB_AB   q12, q14, q0,  q2
+    vtrn.16     q10, q11
+    SUMSUB_AB   q13, q15, q1,  q3
+    SUMSUB_AB   q0,  q1,  q8,  q9
+    vtrn.16     q12, q13
+    SUMSUB_AB   q2,  q3,  q10, q11
+    vtrn.16     q14, q15
+    SUMSUB_AB   q8,  q9,  q12, q13
+    vtrn.32     q0,  q2
+    SUMSUB_AB   q10, q11, q14, q15
+
+    vtrn.32     q1,  q3
+    ABS2        q0,  q2
+    vtrn.32     q8,  q10
+    ABS2        q1,  q3
+    vtrn.32     q9,  q11
+    ABS2        q8,  q10
+    ABS2        q9,  q11
+    vmax.s16    q12, q0,  q2    // results: callers add q12-q15 together to obtain the SATD
+    vmax.s16    q13, q1,  q3
+    vmax.s16    q14, q8,  q10
+    vmax.s16    q15, q9,  q11
+    bx          lr    // returns to whoever did the bl into the helper chain
+.endfunc
+
+function x264_pixel_satd_16x8_neon    // 16x8 SATD: two 16x4 passes accumulated in q4/q5
+    vpush       {d8-d11}    // q4/q5 are used as accumulators and restored before returning
+    mov         ip, lr    // bl overwrites lr, so the return address is parked in ip
+
+    bl          x264_satd_16x4_neon
+    vadd.u16    q4,  q12, q13    // first pass initialises the accumulators
+    vadd.u16    q5,  q14, q15
+
+    bl          x264_satd_16x4_neon    // r0/r2 were advanced by the first pass, so this covers the next 4 rows
+    vadd.u16    q4,  q4,  q12
+    vadd.u16    q5,  q5,  q13
+    vadd.u16    q4,  q4,  q14
+    vadd.u16    q5,  q5,  q15
+
+    vadd.u16    q0,  q4,  q5
+    HORIZ_ADD   d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    vpop        {d8-d11}
+    mov         lr,  ip
+    vmov.32     r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+
+function x264_pixel_satd_16x16_neon    // 16x16 SATD: four 16x4 passes accumulated in q4/q5
+    vpush       {d8-d11}    // q4/q5 are used as accumulators and restored before returning
+    mov         ip, lr    // bl overwrites lr, so the return address is parked in ip
+
+    bl          x264_satd_16x4_neon
+    vadd.u16    q4,  q12, q13    // first pass initialises the accumulators
+    vadd.u16    q5,  q14, q15
+
+    bl          x264_satd_16x4_neon    // each pass advances r0/r2 by 4 rows, so successive calls walk down the block
+    vadd.u16    q4,  q4,  q12
+    vadd.u16    q5,  q5,  q13
+    vadd.u16    q4,  q4,  q14
+    vadd.u16    q5,  q5,  q15
+
+    bl          x264_satd_16x4_neon
+    vadd.u16    q4,  q4,  q12
+    vadd.u16    q5,  q5,  q13
+    vadd.u16    q4,  q4,  q14
+    vadd.u16    q5,  q5,  q15
+
+    bl          x264_satd_16x4_neon
+    vadd.u16    q4,  q4,  q12
+    vadd.u16    q5,  q5,  q13
+    vadd.u16    q4,  q4,  q14
+    vadd.u16    q5,  q5,  q15
+
+    vadd.u16    q0,  q4,  q5
+    HORIZ_ADD   d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    vpop        {d8-d11}
+    mov         lr,  ip
+    vmov.32     r0,  d0[0]    // result is returned in r0
+    bx          lr
+.endfunc
+
+function x264_satd_16x4_neon    // internal helper: loads four 16-byte difference rows, then tail-branches into x264_satd_8x4v_8x8h_neon
+    vld1.64     {d2-d3}, [r2], r3
+    vld1.64     {d0-d1}, [r0,:128], r1
+    vsubl.u8    q8,  d0,  d2    // low halves of the four rows go to q8-q11
+    vld1.64     {d6-d7}, [r2], r3
+    vsubl.u8    q12, d1,  d3    // high halves go to q12-q15
+    vld1.64     {d4-d5}, [r0,:128], r1
+    vsubl.u8    q9,  d4,  d6
+    vld1.64     {d2-d3}, [r2], r3
+    vsubl.u8    q13, d5,  d7
+    vld1.64     {d0-d1}, [r0,:128], r1
+    vsubl.u8    q10, d0,  d2
+    vld1.64     {d6-d7}, [r2], r3
+    vsubl.u8    q14, d1,  d3
+    vadd.s16    q0,  q8,  q9    // butterflies on the low halves start while the last row is still loading
+    vld1.64     {d4-d5}, [r0,:128], r1
+    vsub.s16    q1,  q8,  q9
+    vsubl.u8    q11, d4,  d6
+    vsubl.u8    q15, d5,  d7
+    SUMSUB_AB   q2,  q3,  q10, q11    // SUMSUB_* are defined outside this chunk
+    SUMSUB_ABCD q8,  q10, q9,  q11, q0,  q2,  q1,  q3
+    b           x264_satd_8x4v_8x8h_neon    // tail branch: that helper returns to our caller with results in q12-q15
+.endfunc
+
+
+function x264_pixel_sa8d_8x8_neon    // 8x8 SA8D: sums the two vectors left in q8/q9 by x264_sa8d_8x8_neon and returns (sum + 1) >> 1
+    mov             ip,  lr    // bl overwrites lr, so the return address is parked in ip
+    bl              x264_sa8d_8x8_neon
+    vadd.u16        q0,  q8,  q9
+    HORIZ_ADD       d0,  d0,  d1    // HORIZ_ADD is defined outside this chunk
+    mov             lr,  ip
+    vmov.32         r0,  d0[0]
+    add             r0,  r0,  #1    // round ...
+    lsr             r0,  r0,  #1    // ... and halve
+    bx              lr
+.endfunc
+
+function x264_pixel_sa8d_16x16_neon    // 16x16 SA8D: four 8x8 passes accumulated as 32-bit lanes in q4/q5, returns (sum + 1) >> 1
+    vpush           {d8-d11}    // q4/q5 are used as accumulators and restored before returning
+    mov             ip,  lr    // bl overwrites lr, so the return address is parked in ip
+    bl              x264_sa8d_8x8_neon
+    vpaddl.u16      q4,  q8    // first pass initialises the widened accumulators
+    vpaddl.u16      q5,  q9
+    bl              x264_sa8d_8x8_neon    // pointers were advanced 8 rows, so this is the 8x8 block below the first
+    vpadal.u16      q4,  q8
+    vpadal.u16      q5,  q9
+    sub             r0,  r0,  r1,  lsl #4    // rewind both pointers by 16 rows ...
+    sub             r2,  r2,  r3,  lsl #4
+    add             r0,  r0,  #8    // ... and step 8 bytes to the right
+    add             r2,  r2,  #8
+    bl              x264_sa8d_8x8_neon
+    vpadal.u16      q4,  q8
+    vpadal.u16      q5,  q9
+    bl              x264_sa8d_8x8_neon
+    vpaddl.u16      q8,  q8    // last pass is widened in place and added directly
+    vpaddl.u16      q9,  q9
+    vadd.u32        q0,  q4,  q8
+    vadd.u32        q1,  q5,  q9
+    vadd.u32        q0,  q0,  q1
+    vadd.u32        d0,  d0,  d1
+    vpadd.u32       d0,  d0,  d0
+    vpop            {d8-d11}
+    mov             lr,  ip
+    vmov.32         r0,  d0[0]
+    add             r0,  r0,  #1    // round ...
+    lsr             r0,  r0,  #1    // ... and halve
+    bx              lr
+.endfunc
+
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4    // two chained SUMSUB_ABCD stages over r1-r4 using t1-t4 as temporaries; the result is written back to r1-r4
+    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4    // SUMSUB_ABCD is defined outside this chunk
+    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
+
+function x264_sa8d_8x8_neon    // internal helper: 8x8 transform of the difference of the blocks at r0 and r2; leaves two partial-sum vectors in q8/q9
+    LOAD_DIFF_8x4   q8,  q9,  q10, q11    // difference rows 0-3
+    vld1.64         {d7}, [r2], r3
+    SUMSUB_AB       q0,  q1,  q8,  q9    // SUMSUB_* and ABS2 are defined outside this chunk
+    vld1.64         {d6}, [r0,:64], r1
+    vsubl.u8        q12, d6,  d7    // difference row 4
+    vld1.64         {d17}, [r2], r3
+    SUMSUB_AB       q2,  q3,  q10, q11
+    vld1.64         {d16}, [r0,:64], r1
+    vsubl.u8        q13, d16, d17    // difference row 5
+    vld1.64         {d19}, [r2], r3
+    SUMSUB_AB       q8,  q10, q0,  q2
+    vld1.64         {d18}, [r0,:64], r1
+    vsubl.u8        q14, d18, d19    // difference row 6
+    vld1.64         {d1}, [r2], r3
+    SUMSUB_AB       q9,  q11, q1,  q3
+    vld1.64         {d0}, [r0,:64], r1
+    vsubl.u8        q15, d0,  d1    // difference row 7
+
+    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
+    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
+    SUMSUB_AB       q2,  q10, q10, q14
+    vtrn.16         q8,  q9
+    SUMSUB_AB       q3,  q11, q11, q15
+    vtrn.16         q0,  q1
+    SUMSUB_AB       q12, q13, q8,  q9
+    vtrn.16         q10, q11
+    SUMSUB_AB       q8,  q9,  q0,  q1
+    vtrn.16         q2,  q3
+    SUMSUB_AB       q14, q15, q10, q11
+    vadd.i16        q10, q2,  q3
+    vtrn.32         q12, q14
+    vsub.i16        q11, q2,  q3
+    vtrn.32         q13, q15
+    SUMSUB_AB       q0,  q2,  q12, q14
+    vtrn.32         q8,  q10
+    SUMSUB_AB       q1,  q3,  q13, q15
+    vtrn.32         q9,  q11
+    SUMSUB_AB       q12, q14, q8,  q10
+    SUMSUB_AB       q13, q15, q9,  q11
+
+    vswp            d1,  d24    // exchange 64-bit halves between register pairs before taking absolute values
+    ABS2            q0,  q12
+    vswp            d3,  d26
+    ABS2            q1,  q13
+    vswp            d5,  d28
+    ABS2            q2,  q14
+    vswp            d7,  d30
+    ABS2            q3,  q15
+    vmax.s16        q8,  q0,  q12    // lane-wise max of the absolute values
+    vmax.s16        q9,  q1,  q13
+    vmax.s16        q10, q2,  q14
+    vmax.s16        q11, q3,  q15
+    vadd.i16        q8,  q8,  q9    // outputs: q8 and q9, summed by the x264_pixel_sa8d_* callers
+    vadd.i16        q9,  q10, q11
+    bx              lr
+.endfunc
+
+
+.macro HADAMARD_AC w h    // emits x264_pixel_hadamard_ac_<w>x<h>_neon: runs the 8x8 helper over every 8x8 block and returns two totals in r0/r1
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
+    vpush           {d8-d15}    // q4-q7 are used across the helper calls and restored before returning
+    movrel          ip, mask_ac4    // movrel is defined outside this chunk
+    vmov.i8         q4, #0    // q4: first accumulator, cleared
+    // note: this assumes mask_ac8 is after mask_ac4 (so don't move it)
+    vld1.64         {d12-d15}, [ip,:128]    // q6 = mask_ac4, q7 = mask_ac8 in one 32-byte load
+    vmov.i8         q5, #0    // q5: second accumulator, cleared
+
+    mov             ip,  lr    // bl overwrites lr, so the return address is parked in ip
+    bl              x264_hadamard_ac_8x8_neon
+.if \h > 8
+    bl              x264_hadamard_ac_8x8_neon    // r0 was advanced 8 rows by the first call: block below
+.endif
+.if \w > 8
+    sub             r0,  r0,  r1,  lsl #3    // back up 8 rows and move 8 bytes right
+    add             r0,  r0,  #8
+    bl              x264_hadamard_ac_8x8_neon
+.endif
+.if \w * \h == 256
+    sub             r0,  r0,  r1,  lsl #4    // 16x16 only: back up 16 rows to reach the remaining top-right block
+    bl              x264_hadamard_ac_8x8_neon
+.endif
+
+    vadd.s32        d8,  d8,  d9    // reduce each accumulator to a single 32-bit total
+    vadd.s32        d10, d10, d11
+    vpadd.s32       d0,  d8,  d10    // d0[0] = total of q4, d0[1] = total of q5
+    vpop            {d8-d15}
+    mov             lr,  ip
+    vmov            r0,  r1,  d0
+    lsr             r0,  r0,  #1    // q4 total is returned halved in r0
+    lsr             r1,  r1,  #2    // q5 total is returned divided by four in r1
+    bx              lr
+.endfunc
+.endm
+
+HADAMARD_AC  8, 8    // one x264_pixel_hadamard_ac_<w>x<h>_neon per listed block size
+HADAMARD_AC  8, 16
+HADAMARD_AC 16, 8
+HADAMARD_AC 16, 16
+
+// q4: satd  q5: sa8d  q6: mask_ac4  q7: mask_ac8
+function x264_hadamard_ac_8x8_neon    // internal helper: transforms the 8x8 block at r0 (stride r1) and adds its masked absolute sums to q4 and q5
+    vld1.64         {d2},  [r0,:64], r1
+    vld1.64         {d3},  [r0,:64], r1
+    vaddl.u8        q0,  d2,  d3    // the first vertical butterfly is done while widening: sum of rows 0 and 1 ...
+    vld1.64         {d6},  [r0,:64], r1
+    vsubl.u8        q1,  d2,  d3    // ... and their difference
+    vld1.64         {d7},  [r0,:64], r1
+    vaddl.u8        q2,  d6,  d7
+    vld1.64         {d18}, [r0,:64], r1
+    vsubl.u8        q3,  d6,  d7
+    vld1.64         {d19}, [r0,:64], r1
+    vaddl.u8        q8,  d18, d19
+    vld1.64         {d22}, [r0,:64], r1
+    vsubl.u8        q9,  d18, d19
+    vld1.64         {d23}, [r0,:64], r1
+
+    SUMSUB_ABCD     q12, q14, q13, q15, q0,  q2,  q1,  q3    // SUMSUB_* are defined outside this chunk
+    vaddl.u8        q10, d22, d23
+    vsubl.u8        q11, d22, d23
+    vtrn.16         q12, q13
+    SUMSUB_ABCD     q0,  q2,  q1,  q3,  q8,  q10, q9,  q11
+
+    vtrn.16         q14, q15
+    SUMSUB_AB       q8,  q9,  q12, q13
+    vtrn.16         q0,  q1
+    SUMSUB_AB       q10, q11, q14, q15
+    vtrn.16         q2,  q3
+    SUMSUB_AB       q12, q13, q0,  q1
+    vtrn.32         q8,  q10
+    SUMSUB_AB       q14, q15, q2,  q3
+    vtrn.32         q9,  q11
+    SUMSUB_AB       q0,  q2,  q8,  q10
+    vtrn.32         q12, q14
+    SUMSUB_AB       q1,  q3,  q9,  q11
+    vtrn.32         q13, q15
+    SUMSUB_ABCD     q8,  q10, q9,  q11, q12, q14, q13, q15
+
+    vabs.s16        q12, q0    // from here the absolute values of q0-q3 and q8-q11 are summed into q12
+    vabs.s16        q13, q8
+    vabs.s16        q15, q1
+    vadd.s16        q12, q12, q13
+    vabs.s16        q14, q2
+    vand.s16        q12, q12, q6    // q6 (mask_ac4) clears lanes 0 and 4 of |q0| + |q8| before the other terms are added
+    vabs.s16        q13, q3
+    vadd.s16        q12, q12, q15
+    vabs.s16        q15, q9
+    vadd.s16        q12, q12, q14
+    vabs.s16        q14, q10
+    vadd.s16        q12, q12, q13
+    vabs.s16        q13, q11
+    vadd.s16        q12, q12, q15
+    vsub.s16        q15, q11, q3    // butterflies for the second sum are interleaved with the first sum
+    vadd.s16        q12, q12, q14
+    vadd.s16        q14, q11, q3
+    vadd.s16        q12, q12, q13
+    vsub.s16        q13, q10, q2
+    vadd.s16        q2,  q10, q2
+    vpadal.u16      q4,  q12    // accumulate the first sum into q4 as 32-bit lanes
+
+    SUMSUB_AB       q10, q11, q9,  q1
+    SUMSUB_AB       q9,  q8,  q0,  q8
+    vswp            d29, d30
+    vabs.s16        q14, q14
+    vabs.s16        q15, q15
+    vswp            d5,  d26
+    vabs.s16        q2,  q2
+    vabs.s16        q13, q13
+    vswp            d21, d22
+    vabs.s16        q10, q10
+    vabs.s16        q11, q11
+    vmax.s16        q3,  q14, q15
+    vmax.s16        q2,  q2,  q13
+    vmax.s16        q1,  q10, q11
+    vswp            d19, d16
+    SUMSUB_AB       q14, q15, q9,  q8
+
+    vadd.s16        q2,  q2,  q3
+    vadd.s16        q2,  q2,  q1
+    vand            q14, q14, q7    // q7 (mask_ac8) clears lane 0 of q14 only
+    vadd.s16        q2,  q2,  q2    // double the sum of maxima
+    vabs.s16        q15, q15
+    vabs.s16        q14, q14
+    vadd.s16        q2,  q2,  q15
+    vadd.s16        q2,  q2,  q14
+    vpadal.u16      q5,  q2    // accumulate the second sum into q5 as 32-bit lanes
+    bx              lr
+.endfunc
+
+
+.macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext    // pipelined SSIM row step: n selects first/middle/last behaviour, da is the r0 row already loaded, db receives the r2 row
+    vld1.64     {\db}, [r2], r3
+    vmull.u8    \ssa,  \da, \da    // squares of the r0 row
+    vmull.u8    \s12,  \da, \db    // cross products of the two rows
+.if \n == 1
+    vpaddl.u16  q2,  \lastssa    // first step: initialise the 32-bit accumulators q2/q3 from the previous row products
+    vpaddl.u16  q3,  \lasts12
+    vaddl.u8    q0,  d0,  \da    // first step: q0 = row in d0 + this row, starting the r0 pixel sum
+.else
+    vpadal.u16  q2,  \lastssa    // later steps accumulate instead
+    vpadal.u16  q3,  \lasts12
+    vaddw.u8    q0,  q0,  \da
+.endif
+    vpadal.u16  q2,  \lastssb    // q2 collects the squares of both sources
+.if \n < 3
+    vld1.64     {\dnext}, [r0], r1    // preload the next r0 row, except on the last step
+.endif
+.if \n == 1
+    vaddl.u8    q1,  d2,  \db    // first step: q1 = row in d2 + this row, starting the r2 pixel sum
+.else
+    vaddw.u8    q1,  q1,  \db
+.endif
+    vmull.u8    \ssb, \db, \db    // squares of the r2 row, consumed by the next step or by the caller
+.endm
+
+function x264_pixel_ssim_4x4x2_core_neon    // SSIM partial sums for two side-by-side 4x4 blocks (4 rows of 8 bytes) at r0 and r2; results stored through the first stack argument
+    ldr         ip, [sp]    // first stack argument: output pointer
+    vld1.64     {d0}, [r0], r1
+    vld1.64     {d2}, [r2], r3
+    vmull.u8    q2,  d0,  d0    // row 0: squares of the r0 row
+    vmull.u8    q3,  d0,  d2    // row 0: cross products
+    vld1.64     {d28}, [r0], r1    // preload row 1 of r0 for the first SSIM_ITER
+    vmull.u8    q15, d2,  d2    // row 0: squares of the r2 row
+
+    SSIM_ITER 1, q8, q9, q14,  q2, q3, q15,  d28, d29, d26
+    SSIM_ITER 2, q10,q11,q13,  q8, q9, q14,  d26, d27, d28
+    SSIM_ITER 3, q8, q9, q15,  q10,q11,q13,  d28, d29
+
+    vpadal.u16  q2,  q8    // flush the products of the last row, still pending after the final iteration
+    vpaddl.u16  q0,  q0    // widen the pixel sums to 32 bits
+    vpaddl.u16  q1,  q1
+    vpadal.u16  q2,  q15
+    vpadal.u16  q3,  q9
+
+    vpadd.u32   d0,  d0,  d1    // each d register now holds one value per 4x4 block: d0 = r0 pixel sums
+    vpadd.u32   d1,  d2,  d3    // d1 = r2 pixel sums
+    vpadd.u32   d2,  d4,  d5    // d2 = sums of squares of both sources
+    vpadd.u32   d3,  d6,  d7    // d3 = sums of cross products
+
+    vst4.32     {d0-d3}, [ip]    // interleaving store: the four values of block 0 first, then the four values of block 1
+    bx          lr
+.endfunc
+
+// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
+function x264_pixel_ssim_end4_neon    // combines five 4-word groups from each of the arrays at r0 and r1 into four SSIM values, sums the first r2 of them and returns the float bits in r0
+    vld1.32     {d16-d19}, [r0,:128]!    // groups 0 and 1 of each array
+    vld1.32     {d20-d23}, [r1,:128]!
+    vadd.s32    q0,  q8,  q10
+    vadd.s32    q1,  q9,  q11
+    vld1.32     {d24-d27}, [r0,:128]!    // groups 2 and 3
+    vadd.s32    q0,  q0,  q1    // q0 = groups 0+1 of both arrays
+    vld1.32     {d28-d31}, [r1,:128]!
+    vadd.s32    q2,  q12, q14
+    vadd.s32    q3,  q13, q15
+    vld1.32     {d16-d17}, [r0,:128]    // group 4
+    vadd.s32    q1,  q1,  q2    // q1 = groups 1+2
+    vld1.32     {d18-d19}, [r1,:128]
+    vadd.s32    q8,  q8,  q9
+    vadd.s32    q2,  q2,  q3    // q2 = groups 2+3
+    vadd.s32    q3,  q3,  q8    // q3 = groups 3+4
+
+    vtrn.32     q0,  q1    // 4x4 transpose so that each q register holds one statistic for all four positions
+    vtrn.32     q2,  q3
+    vswp        d1,  d4
+    vswp        d3,  d6
+
+//  s1=q0, s2=q1, ss=q2, s12=q3
+    vmul.s32    q8,  q0,  q1    // s1*s2
+    vmul.s32    q0,  q0,  q0
+    vmla.s32    q0,  q1,  q1    // s1*s1 + s2*s2
+
+    vshl.s32    q3,  q3,  #7    // s12 * 128
+    vshl.s32    q2,  q2,  #6    // ss * 64
+    vadd.s32    q1,  q8,  q8    // 2 * s1*s2
+
+    mov         r3, #416        // ssim_c1 = .01*.01*255*255*64
+    movconst    ip, 235963      // ssim_c2 = .03*.03*255*255*64*63
+    vdup.32     q14, r3
+    vdup.32     q15, ip
+
+    vsub.s32    q2,  q2,  q0    // vars
+    vsub.s32    q3,  q3,  q1    // covar*2
+    vadd.s32    q0,  q0,  q14
+    vadd.s32    q2,  q2,  q15
+    vadd.s32    q1,  q1,  q14
+    vadd.s32    q3,  q3,  q15
+
+    vcvt.f32.s32    q0,  q0
+    vcvt.f32.s32    q2,  q2
+    vcvt.f32.s32    q1,  q1
+    vcvt.f32.s32    q3,  q3
+
+    vmul.f32    q0,  q0,  q2    // denominators
+    vmul.f32    q1,  q1,  q3    // numerators
+
+    cmp         r2,  #4    // flags are consumed by the beq below; the vdiv instructions in between do not change them
+
+    vdiv.f32    s0,  s4,  s0    // s0-s3 are the lanes of q0, s4-s7 the lanes of q1: per-lane q1 / q0
+    vdiv.f32    s1,  s5,  s1
+    vdiv.f32    s2,  s6,  s2
+    vdiv.f32    s3,  s7,  s3
+
+    beq         ssim_skip    // r2 == 4: keep all four lanes
+    movrel      r3,  mask_ff    // movrel is defined outside this chunk
+    sub         r3,  r3,  r2,  lsl #2    // step back 4*r2 bytes into the 0xff run that precedes mask_ff
+    vld1.64     {d6-d7}, [r3]    // mask with the first r2 lanes all-ones and the rest zero
+    vand        q0,  q0,  q3    // zero the lanes beyond r2
+ssim_skip:
+    vadd.f32    d0,  d0,  d1
+    vpadd.f32   d0,  d0,  d0    // sum of the (masked) four lanes
+    vmov.32     r0,  d0[0]    // float result is returned as raw bits in r0
+    bx          lr
+.endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
new file mode 100644
index 0000000..0683520
--- /dev/null
+++ b/common/arm/pixel.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * pixel.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PIXEL_H
+#define X264_ARM_PIXEL_H
+
+/* Declare x264_pixel_<name>_<WxH>_<suffix> for 16x16..4x4 (no backslash on the last line: the macro ends there). */
+#define DECL_PIXELS( ret, name, suffix, args ) \
+    ret x264_pixel_##name##_16x16_##suffix args;\
+    ret x264_pixel_##name##_16x8_##suffix args;\
+    ret x264_pixel_##name##_8x16_##suffix args;\
+    ret x264_pixel_##name##_8x8_##suffix args;\
+    ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x8_##suffix args;\
+    ret x264_pixel_##name##_4x4_##suffix args;
+#define DECL_X1( name, suffix ) /* one int-returning prototype per block size */ \
+    DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
+
+#define DECL_X4( name, suffix ) /* void <name>_x3 and <name>_x4 prototypes per block size */ \
+    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
+
+int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int );
+
+DECL_X1( sad, neon )
+DECL_X1( sad_aligned, neon )
+DECL_X1( sad_aligned, neon_dual )
+DECL_X4( sad, neon )
+DECL_X1( satd, neon )
+DECL_X1( ssd, neon )
+
+int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
+
+uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
+int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
+
+uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int );
+
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
+                                      const uint8_t *, int,
+                                      int sums[2][4]);
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+
+#endif
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
new file mode 100644
index 0000000..9a91478
--- /dev/null
+++ b/common/arm/predict-a.S
@@ -0,0 +1,270 @@
+/*****************************************************************************
+ * predict-a.S: h264 encoder (ARMv6 and NEON intra prediction)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+
+pw_76543210: .short 7,6,5,4,3,2,1,0
+
+.text
+
+// predict_4x4_h( uint8_t *src ): hand-written because gcc doesn't believe in using the free shift in add
+function x264_predict_4x4_h_armv6
+    ldrb    r1, [r0, #0*FDEC_STRIDE-1]      // r1, r2, r3, ip = byte left of rows 0..3
+    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
+    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
+    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
+    add     r1, r1, r1, lsl #8              // duplicate the byte into the low halfword
+    add     r2, r2, r2, lsl #8
+    add     r3, r3, r3, lsl #8
+    add     ip, ip, ip, lsl #8
+    add     r1, r1, r1, lsl #16             // duplicate the halfword: 4 copies of the byte
+    str     r1, [r0, #0*FDEC_STRIDE]        // row 0 = 4 x its left pixel
+    add     r2, r2, r2, lsl #16
+    str     r2, [r0, #1*FDEC_STRIDE]        // row 1
+    add     r3, r3, r3, lsl #16
+    str     r3, [r0, #2*FDEC_STRIDE]        // row 2
+    add     ip, ip, ip, lsl #16
+    str     ip, [r0, #3*FDEC_STRIDE]        // row 3
+    bx      lr
+.endfunc
+
+function x264_predict_4x4_dc_armv6
+    mov     ip, #0
+    ldr     r1, [r0, #-FDEC_STRIDE]         // the 4 pixels above the block, as one word
+    ldrb    r2, [r0, #0*FDEC_STRIDE-1]      // pixel left of row 0
+    ldrb    r3, [r0, #1*FDEC_STRIDE-1]      // pixel left of row 1
+    usad8   r1, r1, ip                      // SAD against 0 = sum of the 4 top bytes
+    add     r2, r2, #4                      // +4: rounding term for the >>3 below
+    ldrb    ip, [r0, #2*FDEC_STRIDE-1]      // pixel left of row 2
+    add     r2, r2, r3
+    ldrb    r3, [r0, #3*FDEC_STRIDE-1]      // pixel left of row 3
+    add     r2, r2, ip
+    add     r2, r2, r3                      // r2 = sum of the 4 left pixels + 4
+    add     r1, r1, r2
+    lsr     r1, r1, #3                      // dc = (top sum + left sum + 4) >> 3
+    add     r1, r1, r1, lsl #8
+    add     r1, r1, r1, lsl #16             // replicate dc into all 4 bytes of the word
+    str     r1, [r0, #0*FDEC_STRIDE]        // fill the 4 rows with dc
+    str     r1, [r0, #1*FDEC_STRIDE]
+    str     r1, [r0, #2*FDEC_STRIDE]
+    str     r1, [r0, #3*FDEC_STRIDE]
+    bx      lr
+.endfunc
+
+// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2  (per byte; c1 and c2 are overwritten)
+.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
+    uhadd8  \a1, \a1, \c1                   // t = (a + c) >> 1 in each byte
+    uhadd8  \a2, \a2, \c2
+    uhadd8  \c1, \a1, \b1                   // c = (t + b) >> 1, rounded down
+    uhadd8  \c2, \a2, \b2
+    eor     \a1, \a1, \b1                   // low bit of t ^ b is set where t + b is odd
+    eor     \a2, \a2, \b2
+    and     \a1, \a1, \pb_1                 // keep only that bit (the caller passes 0x01010101)
+    and     \a2, \a2, \pb_1
+    uadd8   \a1, \a1, \c1                   // add it back: the halving now rounds up
+    uadd8   \a2, \a2, \c2
+.endm
+
+function x264_predict_4x4_ddr_armv6
+    ldr     r1, [r0, # -FDEC_STRIDE]        // r1 = the 4 pixels above the block
+    ldrb    r2, [r0, # -FDEC_STRIDE-1]      // r2 = pixel above-left of the block
+    ldrb    r3, [r0, #0*FDEC_STRIDE-1]      // r3 = pixel left of row 0
+    push    {r4-r6,lr}
+    add     r2, r2, r1, lsl #8              // each add below: previous word shifted up one byte,
+    ldrb    r4, [r0, #1*FDEC_STRIDE-1]      //   with the next edge pixel as the new low byte
+    add     r3, r3, r2, lsl #8
+    ldrb    r5, [r0, #2*FDEC_STRIDE-1]      // r5 = pixel left of row 2
+    ldrb    r6, [r0, #3*FDEC_STRIDE-1]      // r6 = pixel left of row 3
+    add     r4, r4, r3, lsl #8
+    add     r5, r5, r4, lsl #8
+    add     r6, r6, r5, lsl #8
+    ldr     ip, =0x01010101                 // 0x01 in every byte: the pb_1 argument of the macro
+    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
+    str     r1, [r0, #0*FDEC_STRIDE]        // row 0 = filtered (r1,r2,r3) word as is
+    lsl     r2, r1, #8
+    lsl     r3, r1, #16
+    lsl     r4, r4, #8                      // drop the top byte of the filtered (r4,r5,r6) word
+    lsl     r5, r1, #24
+    add     r2, r2, r4, lsr #24             // rows 1..3 = row 0 shifted up 1..3 bytes, with the
+    str     r2, [r0, #1*FDEC_STRIDE]        //   vacated low bytes filled from the top of r4
+    add     r3, r3, r4, lsr #16
+    str     r3, [r0, #2*FDEC_STRIDE]
+    add     r5, r5, r4, lsr #8
+    str     r5, [r0, #3*FDEC_STRIDE]
+    pop     {r4-r6,pc}
+.endfunc
+
+function x264_predict_4x4_ddl_neon
+    sub         r0, #FDEC_STRIDE            // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0], ip              // d0 = 8 pixels p[0..7] of that row; r0 -> row 0
+    vdup.8      d3, d0[7]                   // d3 = lane 7 of d0 in every lane
+    vext.8      d1, d0, d0, #1              // d1 = d0 advanced by one pixel
+    vext.8      d2, d0, d3, #2              // d2 = d0 advanced by two, padded from d3
+    vhadd.u8    d0, d0, d2                  // (p[i] + p[i+2]) >> 1
+    vrhadd.u8   d0, d0, d1                  // rounded average with p[i+1]: 3-tap lowpass f[i]
+    vst1.32     {d0[0]}, [r0,:32], ip       // row 0 = f[0..3]
+    vext.8      d1, d0, d0, #1
+    vext.8      d2, d0, d0, #2
+    vst1.32     {d1[0]}, [r0,:32], ip       // row 1 = f[1..4]
+    vext.8      d3, d0, d0, #3
+    vst1.32     {d2[0]}, [r0,:32], ip       // row 2 = f[2..5]
+    vst1.32     {d3[0]}, [r0,:32], ip       // row 3 = f[3..6]
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_dc_neon
+    mov     ip, #0
+    ldrd    r2, [r1, #8]                    // r2, r3 = edge[8..15] (r1 = edge)
+    push    {r4-r5,lr}
+    ldrd    r4, [r1, #16]                   // r4, r5 = edge[16..23]
+    lsl     r3, r3, #8                      // drop the top byte of r3 (edge[15] if little-endian)
+    ldrb    lr, [r1, #7]                    // lr = edge[7]
+    usad8   r2, r2, ip                      // SAD against 0 = sum of the 4 bytes of r2
+    usad8   r3, r3, ip                      // sum of the 3 bytes kept in r3
+    usada8  r2, r4, ip, r2                  // accumulate the byte sum of r4 ...
+    add     lr, lr, #8                      // +8: rounding term for the >>4 below
+    usada8  r3, r5, ip, r3                  // ... and of r5
+    add     r2, r2, lr
+    mov     ip, #FDEC_STRIDE
+    add     r2, r2, r3
+    lsr     r2, r2, #4                      // dc = (sum of 16 edge bytes + 8) >> 4
+
+    vdup.8   d0, r2
+.rept 8
+    vst1.64 {d0}, [r0,:64], ip              // fill each of the 8 rows with dc
+.endr
+    pop    {r4-r5,pc}
+.endfunc
+
+
+function x264_predict_8x8_h_neon
+    add         r1, r1, #7                  // r1 = &edge[7]
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d16}, [r1]                 // d16 = edge[7..14]
+    vdup.8      d0, d16[7]                  // row 0 <- lane 7 (edge[14] in little-endian lane order)
+    vdup.8      d1, d16[6]                  // row 1 <- lane 6
+    vst1.64     {d0}, [r0,:64], ip
+    vdup.8      d2, d16[5]                  // rows 2..7 <- lanes 5 down to 0
+    vst1.64     {d1}, [r0,:64], ip
+    vdup.8      d3, d16[4]
+    vst1.64     {d2}, [r0,:64], ip
+    vdup.8      d4, d16[3]
+    vst1.64     {d3}, [r0,:64], ip
+    vdup.8      d5, d16[2]
+    vst1.64     {d4}, [r0,:64], ip
+    vdup.8      d6, d16[1]
+    vst1.64     {d5}, [r0,:64], ip
+    vdup.8      d7, d16[0]
+    vst1.64     {d6}, [r0,:64], ip
+    vst1.64     {d7}, [r0,:64], ip          // row 7 <- lane 0 (edge[7])
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_h_neon
+    sub         r1, r0, #1                  // r1 -> pixel left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 4
+    vld1.8      {d0[]}, [r1], ip            // left pixel of this row replicated to all 8 lanes
+    vld1.8      {d2[]}, [r1], ip            // same for the next row
+    vst1.64     {d0}, [r0,:64], ip          // two rows per repetition, 8 rows in total
+    vst1.64     {d2}, [r0,:64], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_v_neon
+    sub         r0, r0, #FDEC_STRIDE        // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0,:64], ip          // d0 = the 8 pixels above; r0 -> row 0
+.rept 8
+    vst1.64     {d0}, [r0,:64], ip          // copy them into each of the 8 rows
+.endr
+    bx          lr
+.endfunc
+
+
+function x264_predict_16x16_dc_neon
+    sub         r3, r0, #FDEC_STRIDE        // r3 -> row above the block
+    sub         r0, r0, #1                  // r0 -> pixel left of row 0
+    vld1.64     {d0-d1}, [r3,:128]          // the 16 pixels above the block
+    ldrb        ip, [r0], #FDEC_STRIDE      // ip accumulates the 16 left pixels
+    vaddl.u8    q0, d0, d1                  // widen to u16 while adding the two halves
+    ldrb        r1, [r0], #FDEC_STRIDE
+    vadd.u16    d0, d0, d1
+    vpadd.u16   d0, d0, d0
+    vpadd.u16   d0, d0, d0                  // every lane of d0 = sum of the top row
+.rept 4
+    ldrb        r2, [r0], #FDEC_STRIDE
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE
+    add         ip, ip, r2
+    ldrb        r1, [r0], #FDEC_STRIDE
+    add         ip, ip, r3
+.endr
+    ldrb        r2, [r0], #FDEC_STRIDE
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE      // 16th and last left pixel
+    add         ip, ip, r2
+
+    sub         r0, r0, #FDEC_STRIDE*16     // undo the 16 post-increments
+    add         ip, ip, r3                  // ip = sum of the left column
+    vdup.16     d1, ip
+    vadd.u16    d0, d0, d1                  // top sum + left sum
+    mov         ip, #FDEC_STRIDE
+    add         r0, r0, #1                  // r0 = start of row 0 again
+    vrshr.u16   d0, d0, #5                  // dc = (sum of 32 pixels + 16) >> 5
+    vdup.8      q0, d0[0]
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], ip      // 16-byte rows: same :128 hint as the top-row load
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_h_neon
+    sub         r1, r0, #1                  // r1 -> pixel left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 8
+    vld1.8      {d0[]}, [r1], ip            // left pixel of this row in all 8 lanes of d0
+    vmov        d1, d0                      // extend to 16 lanes (q0)
+    vld1.8      {d2[]}, [r1], ip            // same for the next row (q1)
+    vmov        d3, d2
+    vst1.64     {d0-d1}, [r0,:128], ip      // two rows per repetition, 16 rows in total
+    vst1.64     {d2-d3}, [r0,:128], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_v_neon
+    sub         r0, r0, #FDEC_STRIDE        // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0-d1}, [r0,:128], ip      // q0 = the 16 pixels above; r0 -> row 0
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], ip      // copy them into each of the 16 rows
+.endr
+    bx          lr
+.endfunc
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
new file mode 100644
index 0000000..1f2cd52
--- /dev/null
+++ b/common/arm/predict-c.c
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ * predict-c.c: h264 encoder (ARM intra prediction function table setup)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[33] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[33] );
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
+{
+    /* Nothing is installed without ARMv6; NEON is only considered on top of it. */
+    if( cpu & X264_CPU_ARMV6 )
+    {
+        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
+        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
+
+        /* The DDL predictor additionally requires NEON. */
+        if( cpu & X264_CPU_NEON )
+            pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+    }
+}
+
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON available: override H and V, every other entry is left untouched */
+        pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
+        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
+    }
+}
+
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON available: override DC and H; predict_filter is not modified here */
+        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
+        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+    }
+}
+
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
+{
+    if( cpu & X264_CPU_NEON )
+    {   /* NEON available: override DC, H and V, every other entry is left untouched */
+        pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
+        pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
+        pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
+    }
+}
diff --git a/common/ppc/predict.h b/common/arm/predict.h
similarity index 69%
copy from common/ppc/predict.h
copy to common/arm/predict.h
index 29488aa..fe5ccda 100644
--- a/common/ppc/predict.h
+++ b/common/arm/predict.h
@@ -1,7 +1,9 @@
 /*****************************************************************************
  * predict.h: h264 encoder library
  *****************************************************************************
- * Copyright (C) 2007 Guillaume Poirier <gpoirier at mplayerhq.hu>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,10 +20,12 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_PPC_PREDICT_H
-#define X264_PPC_PREDICT_H
+#ifndef X264_ARM_PREDICT_H
+#define X264_ARM_PREDICT_H
 
-void x264_predict_16x16_init_altivec ( x264_predict_t pf[7] );
-void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] );
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
 
-#endif /* X264_PPC_PREDICT_H */
+#endif
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
new file mode 100644
index 0000000..0b49eb4
--- /dev/null
+++ b/common/arm/quant-a.S
@@ -0,0 +1,352 @@
+/*****************************************************************************
+ * quant-a.S: h264 encoder (ARM/NEON quantization, dequantization and coeff_last)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+pmovmskb_byte:
+.byte 1,2,4,8,16,32,64,128
+.byte 1,2,4,8,16,32,64,128
+
+.text
+
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+    vadd.u16    q8,  q8,  \bias0            // callers pass |coef| in q8/q9 and the raw coefs in q14/q15
+    vadd.u16    q9,  q9,  \bias1
+.ifc \load_mf, yes
+    vld1.64     {\mf0-\mf3}, [r1,:128]!     // optionally fetch the next 16 mf values from r1
+.endif
+    vmull.u16   q10, d16, \mf0              // 32-bit products (|coef| + bias) * mf
+    vmull.u16   q11, d17, \mf1
+    vmull.u16   q12, d18, \mf2
+    vmull.u16   q13, d19, \mf3
+    vshr.s16    q14, q14, #15               // sign masks: 0 or -1 per raw coef
+    vshr.s16    q15, q15, #15
+    vshrn.u32   d16, q10, #16               // keep product >> 16, narrowed to 16 bits
+    vshrn.u32   d17, q11, #16
+    vshrn.u32   d18, q12, #16
+    vshrn.u32   d19, q13, #16
+    veor        q8,  q8,  q14               // xor with the mask, then subtract the mask:
+    veor        q9,  q9,  q15               //   negates the lanes whose raw coef was negative
+    vsub.s16    q8,  q8,  q14
+    vsub.s16    q9,  q9,  q15
+    vorr        \bias0, q8,  q9             // first bias register is reused as the OR of the results
+    vst1.64     {d16-d19}, [r0,:128]!       // store the 16 quantized coefs and advance r0
+.endm
+
+.macro QUANT_END d
+    vmov        r2,  r3,  \d                // move the 64-bit OR accumulator into r2 and r3
+    orrs        r0,  r2,  r3                // r0 = 0 and Z set when no bit is set
+    movne       r0,  #1                     // otherwise return 1
+    bx          lr
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias ): returns 1 if any quantized coef is non-zero, else 0
+function x264_quant_2x2_dc_neon
+    vld1.64     {d0}, [r0,:64]              // d0 = the 4 coefs
+    vabs.s16    d3,  d0
+    vdup.16     d2,  r2                     // bias in every lane
+    vdup.16     d1,  r1                     // mf in every lane
+    vadd.u16    d3,  d3,  d2
+    vmull.u16   q3,  d3,  d1                // (|coef| + bias) * mf as 32 bits
+    vshr.s16    d0,  d0,  #15               // sign mask: 0 or -1 per coef
+    vshrn.u32   d3,  q3,  #16               // >> 16, narrowed back to 16 bits
+    veor        d3,  d3,  d0                // xor then subtract the mask: restores the sign
+    vsub.s16    d3,  d3,  d0
+    vst1.64     {d3}, [r0,:64]              // overwrite dct in place
+    QUANT_END   d3
+.endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias ): returns 1 if any quantized coef is non-zero, else 0
+function x264_quant_4x4_dc_neon
+    vld1.64     {d28-d31}, [r0,:128]        // q14, q15 = the 16 raw coefs
+    vabs.s16    q8,  q14                    // q8, q9 = |coef|
+    vabs.s16    q9,  q15
+    vdup.16     q0,  r2                     // scalar bias in every lane
+    vdup.16     q2,  r1                     // scalar mf in every lane (d4, d5)
+    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5
+    vorr        d0,  d0,  d1                // fold the OR accumulator down to 64 bits
+    QUANT_END   d0
+.endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ): returns 1 if any result is non-zero, else 0
+function x264_quant_4x4_neon
+    vld1.64     {d28-d31}, [r0,:128]        // q14, q15 = the 16 raw coefs
+    vabs.s16    q8,  q14                    // q8, q9 = |coef|
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3}, [r2,:128]          // q0, q1 = bias[0..15]
+    vld1.64     {d4-d7}, [r1,:128]          // d4..d7 = mf[0..15]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+    vorr        d0,  d0,  d1                // fold the OR accumulator down to 64 bits
+    QUANT_END   d0
+.endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ): returns 1 if any result is non-zero, else 0
+function x264_quant_8x8_neon
+    vld1.64     {d28-d31}, [r0,:128]        // first 16 raw coefs (r0 is advanced by the store in QUANT_TWO)
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},   [r2,:128]!       // bias[0..15]
+    vld1.64     {d4-d7},   [r1,:128]!       // mf[0..15]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+.rept 3
+    vld1.64     {d28-d31}, [r0,:128]        // next 16 raw coefs
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d2-d5},   [r2,:128]!       // next 16 bias values in q1/q2; mf is loaded inside QUANT_TWO
+    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7, yes
+    vorr        q0,  q0,  q1                // accumulate the non-zero bits of all groups in q0
+.endr
+    vorr        d0,  d0,  d1                // fold the OR accumulator down to 64 bits
+    QUANT_END   d0
+.endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+    mov         r3,  #0x2b                  // 0x2b = 43; (43 * i_qp) >> 8 stands in for i_qp / 6
+    mul         r3,  r3,  r2
+    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6
+    add         ip,  r3,  r3,  lsl #1       // ip = 3 * i_qbits
+    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
+.ifc \dc,no
+    add         r1,  r1,  r2, lsl #\mf_size  // dequant_mf[i_mf]
+.else
+    ldr         r1, [r1,  r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
+.endif
+    subs        r3,  r3,  #\offset      // r3 = i_qbits - offset (6 for 8x8); callers branch on these flags
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ); the macro is instantiated for 4x4 and 8x8
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon
+    DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+    mov         r2,  #4                     // 8x8: four passes of 16 coefs (mov leaves the flags alone)
+.endif
+    blt         dequant_\size\()_rshift
+
+    vdup.16     q15, r3                     // non-negative shift count in every 16-bit lane
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+    subs        r2,  r2,  #1                // pass counter; flags are consumed by the bgt below
+.endif
+    vld1.32     {d16-d17}, [r1,:128]!       // 4 x 32-bit dequant factors per load
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8                     // narrow the factors to 16 bits
+    vld1.32     {d20-d21}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d22-d23}, [r1,:128]!
+    vmovn.s32   d6,  q10
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vmovn.s32   d7,  q11
+    vmul.s16    q0,  q0,  q2                // coef * factor, low 16 bits
+    vmul.s16    q1,  q1,  q3
+    vshl.s16    q0,  q0,  q15               // << shift count
+    vshl.s16    q1,  q1,  q15
+    vst1.16     {d0-d3},   [r0,:128]!       // write back in place and advance
+.ifc \size, 8x8
+    bgt         dequant_\size\()_lshift_loop
+.endif
+    bx          lr
+
+dequant_\size\()_rshift:
+    vdup.32     q15, r3                     // negative count: vshl below shifts right
+    rsb         r3,  r3,  #0                // r3 = right shift amount
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3                // ip = 1 << (shift - 1): rounding offset
+
+.ifc \size, 8x8
+dequant_\size\()_rshift_loop:
+    subs        r2,  r2,  #1                // pass counter; flags are consumed by the bgt below
+.endif
+    vdup.32     q10, ip                     // accumulators start at the rounding offset
+    vld1.32     {d16-d17}, [r1,:128]!       // 4 x 32-bit dequant factors per load
+    vdup.32     q11, ip
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8                     // narrow the factors to 16 bits
+    vld1.32     {d16-d17}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d6,  q8
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vmovn.s32   d7,  q9
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4                // rounding offset + coef * factor, in 32 bits
+    vmlal.s16   q11, d1,  d5
+    vmlal.s16   q12, d2,  d6
+    vmlal.s16   q13, d3,  d7
+    vshl.s32    q10, q10, q15               // shift right by the negative count
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10                    // narrow the results back to 16 bits
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]!       // write back in place and advance
+.ifc \size, 8x8
+    bgt         dequant_\size\()_rshift_loop
+.endif
+    bx          lr
+.endfunc
+.endm
+
+DEQUANT 4x4, 4
+DEQUANT 8x8, 6
+
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp ): uses only dequant_mf[i_qp%6][0]
+function x264_dequant_4x4_dc_neon
+    DEQUANT_START 6, 6, yes
+    blt         dequant_4x4_dc_rshift       // i_qbits < 6: take the rounded right-shift path
+
+    lsl         r1,  r1,  r3                // fold the left shift into the single factor
+    vdup.16     q2,  r1
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vdup.16     q15, r3                     // NOTE(review): q15 is not read again on this path
+
+    vmul.s16    q0,  q0,  q2
+    vmul.s16    q1,  q1,  q2
+    vst1.16     {d0-d3},   [r0,:128]        // write back in place
+    bx          lr
+
+dequant_4x4_dc_rshift:
+    vdup.16     d4,  r1                     // the single factor in every lane
+    vdup.32     q15, r3                     // negative count: vshl below shifts right
+    rsb         r3,  r3,  #0                // r3 = right shift amount
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3                // ip = 1 << (shift - 1): rounding offset
+
+    vdup.32     q10, ip                     // accumulators start at the rounding offset
+    vdup.32     q11, ip
+    vld1.16     {d0-d3},   [r0,:128]        // 16 coefs
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4                // rounding offset + coef * factor, in 32 bits
+    vmlal.s16   q11, d1,  d4
+    vmlal.s16   q12, d2,  d4
+    vmlal.s16   q13, d3,  d4
+    vshl.s32    q10, q10, q15               // shift right by the negative count
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10                    // narrow the results back to 16 bits
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]        // write back in place
+    bx          lr
+.endfunc
+
+
+// int coeff_last( int16_t *l ): 4-coef variant, returns 0 when all four are zero
+function x264_coeff_last4_arm
+    ldrd        r2,  [r0]                   // r2 = l[0..1], r3 = l[2..3], one word each
+    subs        r0,  r3,  #0                // r0 = r3; Z is set when l[2] and l[3] are both zero
+    movne       r0,  #2                     // upper pair has a non-zero coef: base index 2
+    movne       r2,  r3                     //   and examine that pair instead of the lower one
+    lsrs        r2,  r2,  #16               // is the high halfword of the examined pair non-zero?
+    addne       r0,  r0,  #1                // yes: +1 (presumes little-endian halfword order)
+    bx          lr
+.endfunc
+
+.macro COEFF_LAST_1x size
+function x264_coeff_last\size\()_neon
+.if \size == 15
+    sub         r0,  r0,  #2                // start one coef early so that 16 coefs are loaded
+    vld1.64     {d0-d3}, [r0]               // no alignment hint on this load
+.else
+    vld1.64     {d0-d3}, [r0,:128]
+.endif
+    vtst.16     q0,  q0                     // 0xffff for each non-zero coef, else 0
+    vtst.16     q1,  q1
+    vshrn.u16   d0,  q0,  #8                // narrow the flags to one byte per coef
+    vshrn.u16   d1,  q1,  #8
+    vshrn.u16   d0,  q0,  #4                // narrow again: 4 bits per coef, 64 bits in d0
+    vclz.i32    d0,  d0                     // leading zeros of each 32-bit half
+    mov         ip,  #7
+    mov         r3,  #\size - 9
+    vmov        r0,  r1,  d0                // r0 = clz(low half), r1 = clz(high half)
+
+    subs        r1,  ip,  r1,  lsr #2       // 7 - clz/4; negative when the high half is all zero
+    addge       r0,  r1,  #\size - 8        // last non-zero coef is in the high half
+    sublts      r0,  r3,  r0,  lsr #2       // otherwise derive the index from the low half
+    movlt       r0,  #0                     // still negative: return 0
+    bx          lr
+.endfunc
+.endm
+
+COEFF_LAST_1x 15
+COEFF_LAST_1x 16
+
+function x264_coeff_last64_neon
+    vld1.64     {d16-d19}, [r0,:128]!       // coefs 0..15
+    vqmovn.u16  d16, q8                     // saturating narrow to bytes: non-zero stays non-zero
+    vqmovn.u16  d17, q9
+    vld1.64     {d20-d23}, [r0,:128]!       // coefs 16..31
+    vqmovn.u16  d18, q10
+    vqmovn.u16  d19, q11
+    vld1.64     {d24-d27}, [r0,:128]!       // coefs 32..47
+    vqmovn.u16  d20, q12
+    vqmovn.u16  d21, q13
+    vld1.64     {d28-d31}, [r0,:128]!       // coefs 48..63
+    vqmovn.u16  d22, q14
+    vqmovn.u16  d23, q15
+
+    movrel      r1, pmovmskb_byte
+    vld1.64     {d0-d1}, [r1,:128]          // q0 = bit weights 1,2,4,...,128, twice
+
+    vtst.8      q8,  q8                     // 0xff for each non-zero coef, else 0
+    vtst.8      q9,  q9
+    vtst.8      q10, q10
+    vtst.8      q11, q11
+
+    vand        q8,  q8,  q0                // keep one distinct bit per coef within each group of 8
+    vand        q9,  q9,  q0
+    vand        q10, q10, q0
+    vand        q11, q11, q0
+
+    vpadd.u8    d0,  d16, d17               // three rounds of pairwise adds pack the bits together
+    vpadd.u8    d1,  d18, d19
+    vpadd.u8    d2,  d20, d21
+    vpadd.u8    d3,  d22, d23
+    vpadd.u8    d0,  d0,  d1
+    vpadd.u8    d1,  d2,  d3
+    vpadd.u8    d0,  d0,  d1                // d0 = 64-bit mask, one bit per coef
+    vclz.i32    d0,  d0                     // leading zeros of each 32-bit half
+    mov         ip,  #31
+    vmov        r0,  r1,  d0                // r0 = clz(low half), r1 = clz(high half)
+
+    subs        r1,  ip,  r1                // 31 - clz; negative when the high half is all zero
+    addge       r0,  r1,  #32               // last non-zero coef is in the high half
+    sublts      r0,  ip,  r0                // otherwise derive the index from the low half
+    movlt       r0,  #0                     // still negative: return 0
+    bx          lr
+.endfunc
diff --git a/matroska.h b/common/arm/quant.h
similarity index 50%
copy from matroska.h
copy to common/arm/quant.h
index be6f530..dcfed63 100644
--- a/matroska.h
+++ b/common/arm/quant.h
@@ -1,7 +1,9 @@
 /*****************************************************************************
- * matroska.h:
+ * quant.h: h264 encoder library
  *****************************************************************************
- * Copyright (C) 2005 Mike Matsnev
+ * Copyright (C) 2005-2008 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,24 +20,23 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_MATROSKA_H
-#define X264_MATROSKA_H
+#ifndef X264_ARM_QUANT_H
+#define X264_ARM_QUANT_H
 
-typedef struct mk_Writer mk_Writer;
+int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
 
-mk_Writer *mk_createWriter( const char *filename );
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
-int  mk_writeHeader( mk_Writer *w, const char *writingApp,
-                     const char *codecID,
-                     const void *codecPrivate, unsigned codecPrivateSize,
-                     int64_t default_frame_duration,
-                     int64_t timescale,
-                     unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height );
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
-int  mk_startFrame( mk_Writer *w );
-int  mk_addFrameData( mk_Writer *w, const void *data, unsigned size );
-int  mk_setFrameFlags( mk_Writer *w, int64_t timestamp, int keyframe );
-int  mk_close( mk_Writer *w );
+int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
 
 #endif
diff --git a/common/bs.h b/common/bs.h
index eafa8f8..0773de6 100644
--- a/common/bs.h
+++ b/common/bs.h
@@ -73,24 +73,37 @@ extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
 
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
-    int offset = ((intptr_t)p_data & (WORD_SIZE-1));
+    int offset = ((intptr_t)p_data & 3);
     s->p       = s->p_start = (uint8_t*)p_data - offset;
     s->p_end   = (uint8_t*)p_data + i_data;
-    s->i_left  = offset ? 8*offset : (WORD_SIZE*8);
-    s->cur_bits = endian_fix( *(intptr_t*)s->p );
+    s->i_left  = (WORD_SIZE - offset)*8;
+    s->cur_bits = endian_fix32( M32(s->p) );
+    s->cur_bits >>= (4-offset)*8;
 }
 static inline int bs_pos( bs_t *s )
 {
     return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
 }
 
-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */
+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
 static inline void bs_flush( bs_t *s )
 {
-    *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
     s->p += WORD_SIZE - s->i_left / 8;
     s->i_left = WORD_SIZE*8;
 }
+/* Undo a bs_flush: if s->p is not 32-bit aligned, step back to the aligned word and reload it. */
+static inline void bs_realign( bs_t *s )
+{
+    int misalign = ((intptr_t)s->p & 3);
+    if( !misalign )
+        return; /* already on a 32-bit boundary: state is left as is */
+    /* Rewind to the containing word, then keep only the bytes that precede the old s->p. */
+    s->p        = (uint8_t*)s->p - misalign;
+    s->i_left   = (WORD_SIZE - misalign)*8;
+    s->cur_bits = endian_fix32( M32(s->p) );
+    s->cur_bits >>= (4-misalign)*8;
+}
 
 static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
 {
@@ -101,9 +114,9 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
         if( s->i_left <= 32 )
         {
 #ifdef WORDS_BIGENDIAN
-            *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
 #else
-            *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
 #endif
             s->i_left += 32;
             s->p += 4;
@@ -120,7 +133,7 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
         {
             i_count -= s->i_left;
             s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
-            *(uint32_t*)s->p = endian_fix( s->cur_bits );
+            M32( s->p ) = endian_fix( s->cur_bits );
             s->p += 4;
             s->cur_bits = i_bits;
             s->i_left = 32 - i_count;
@@ -143,7 +156,7 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
     s->i_left--;
     if( s->i_left == WORD_SIZE*8-32 )
     {
-        *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+        M32( s->p ) = endian_fix32( s->cur_bits );
         s->p += 4;
         s->i_left = WORD_SIZE*8;
     }
@@ -151,23 +164,19 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
 
 static inline void bs_align_0( bs_t *s )
 {
-    if( s->i_left&7 )
-    {
-        s->cur_bits <<= s->i_left&7;
-        s->i_left &= ~7;
-    }
+    bs_write( s, s->i_left&7, 0 );
     bs_flush( s );
 }
 static inline void bs_align_1( bs_t *s )
 {
-    if( s->i_left&7 )
-    {
-        s->cur_bits <<= s->i_left&7;
-        s->cur_bits |= (1 << (s->i_left&7)) - 1;
-        s->i_left &= ~7;
-    }
+    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
     bs_flush( s );
 }
+static inline void bs_align_10( bs_t *s )
+{   /* Write (i_left mod 8) bits: a single 1 followed by 0s; nothing is written when that count is 0. */
+    int pad = s->i_left&7;
+    if( pad ) bs_write( s, pad, 1 << ( pad - 1 ) );
+}
 
 /* golomb functions */
 
@@ -245,7 +254,7 @@ static inline void bs_write_te( bs_t *s, int x, int val )
 static inline void bs_rbsp_trailing( bs_t *s )
 {
     bs_write1( s, 1 );
-    bs_flush( s );
+    bs_write( s, s->i_left&7, 0  );
 }
 
 static inline int bs_size_ue( unsigned int val )
diff --git a/common/cabac.h b/common/cabac.h
index 9d0fddd..35871b4 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -39,7 +39,7 @@ typedef struct
     uint8_t *p_end;
 
     /* aligned for memcpy_aligned starting here */
-    DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+    ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
 
     /* context */
     uint8_t state[460];
diff --git a/common/common.c b/common/common.c
index d7d45d3..6d1d7f0 100644
--- a/common/common.c
+++ b/common/common.c
@@ -21,6 +21,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
+#include "common.h"
+#include "cpu.h"
+
 #include <stdarg.h>
 #include <ctype.h>
 
@@ -28,9 +31,6 @@
 #include <malloc.h>
 #endif
 
-#include "common.h"
-#include "cpu.h"
-
 static void x264_log_default( void *, int, const char *, va_list );
 
 /****************************************************************************
@@ -43,8 +43,9 @@ void    x264_param_default( x264_param_t *param )
 
     /* CPU autodetect */
     param->cpu = x264_cpu_detect();
-    param->i_threads = 1;
+    param->i_threads = X264_THREADS_AUTO;
     param->b_deterministic = 1;
+    param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
 
     /* Video properties */
     param->i_csp           = X264_CSP_I420;
@@ -62,16 +63,21 @@ void    x264_param_default( x264_param_t *param )
     param->i_fps_num       = 25;
     param->i_fps_den       = 1;
     param->i_level_idc     = -1;
+    param->i_slice_max_size = 0;
+    param->i_slice_max_mbs = 0;
+    param->i_slice_count = 0;
 
     /* Encoder parameters */
-    param->i_frame_reference = 1;
+    param->i_frame_reference = 3;
     param->i_keyint_max = 250;
     param->i_keyint_min = 25;
-    param->i_bframe = 0;
+    param->i_bframe = 3;
     param->i_scenecut_threshold = 40;
     param->i_bframe_adaptive = X264_B_ADAPT_FAST;
     param->i_bframe_bias = 0;
-    param->b_bframe_pyramid = 0;
+    param->i_bframe_pyramid = 0;
+    param->b_interlaced = 0;
+    param->b_constrained_intra = 0;
 
     param->b_deblocking_filter = 1;
     param->i_deblocking_filter_alphac0 = 0;
@@ -80,14 +86,14 @@ void    x264_param_default( x264_param_t *param )
     param->b_cabac = 1;
     param->i_cabac_init_idc = 0;
 
-    param->rc.i_rc_method = X264_RC_NONE;
+    param->rc.i_rc_method = X264_RC_CRF;
     param->rc.i_bitrate = 0;
     param->rc.f_rate_tolerance = 1.0;
     param->rc.i_vbv_max_bitrate = 0;
     param->rc.i_vbv_buffer_size = 0;
     param->rc.f_vbv_buffer_init = 0.9;
-    param->rc.i_qp_constant = 26;
-    param->rc.f_rf_constant = 0;
+    param->rc.i_qp_constant = 23;
+    param->rc.f_rf_constant = 23;
     param->rc.i_qp_min = 10;
     param->rc.i_qp_max = 51;
     param->rc.i_qp_step = 4;
@@ -95,6 +101,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.f_pb_factor = 1.3;
     param->rc.i_aq_mode = X264_AQ_VARIANCE;
     param->rc.f_aq_strength = 1.0;
+    param->rc.i_lookahead = 40;
 
     param->rc.b_stat_write = 0;
     param->rc.psz_stat_out = "x264_2pass.log";
@@ -104,6 +111,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.f_qblur = 0.5;
     param->rc.f_complexity_blur = 20;
     param->rc.i_zones = 0;
+    param->rc.b_mb_tree = 1;
 
     /* Log */
     param->pf_log = x264_log_default;
@@ -117,19 +125,25 @@ void    x264_param_default( x264_param_t *param )
     param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     param->analyse.i_me_method = X264_ME_HEX;
     param->analyse.f_psy_rd = 1.0;
+    param->analyse.b_psy = 1;
     param->analyse.f_psy_trellis = 0;
     param->analyse.i_me_range = 16;
-    param->analyse.i_subpel_refine = 6;
+    param->analyse.i_subpel_refine = 7;
+    param->analyse.b_mixed_references = 1;
     param->analyse.b_chroma_me = 1;
     param->analyse.i_mv_range_thread = -1;
     param->analyse.i_mv_range = -1; // set from level_idc
     param->analyse.i_chroma_qp_offset = 0;
     param->analyse.b_fast_pskip = 1;
+    param->analyse.b_weighted_bipred = 1;
+    param->analyse.i_weighted_pred = X264_WEIGHTP_SMART;
     param->analyse.b_dct_decimate = 1;
+    param->analyse.b_transform_8x8 = 1;
+    param->analyse.i_trellis = 1;
     param->analyse.i_luma_deadzone[0] = 21;
     param->analyse.i_luma_deadzone[1] = 11;
-    param->analyse.b_psnr = 1;
-    param->analyse.b_ssim = 1;
+    param->analyse.b_psnr = 0;
+    param->analyse.b_ssim = 0;
 
     param->i_cqm_preset = X264_CQM_FLAT;
     memset( param->cqm_4iy, 16, 16 );
@@ -140,7 +154,10 @@ void    x264_param_default( x264_param_t *param )
     memset( param->cqm_8py, 16, 64 );
 
     param->b_repeat_headers = 1;
+    param->b_annexb = 1;
     param->b_aud = 0;
+    param->b_vfr_input = 1;
+    param->b_dts_compress = 0;
 }
 
 static int parse_enum( const char *arg, const char * const *names, int *dst )
@@ -246,7 +263,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         if( b_error )
         {
             char *buf = strdup(value);
-            char *tok, UNUSED *saveptr, *init;
+            char *tok, UNUSED *saveptr=NULL, *init;
             b_error = 0;
             p->cpu = 0;
             for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL )
@@ -262,10 +279,19 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     OPT("threads")
     {
         if( !strcmp(value, "auto") )
-            p->i_threads = 0;
+            p->i_threads = X264_THREADS_AUTO;
         else
             p->i_threads = atoi(value);
     }
+    OPT("sliced-threads")
+        p->b_sliced_threads = atobool(value);
+    OPT("sync-lookahead")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
+        else
+            p->i_sync_lookahead = atoi(value);
+    }
     OPT2("deterministic", "n-deterministic")
         p->b_deterministic = atobool(value);
     OPT2("level", "level-idc")
@@ -331,6 +357,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
             p->i_scenecut_threshold = atoi(value);
         }
     }
+    OPT("intra-refresh")
+        p->b_intra_refresh = atobool(value);
     OPT("bframes")
         p->i_bframe = atoi(value);
     OPT("b-adapt")
@@ -345,7 +373,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     OPT("b-bias")
         p->i_bframe_bias = atoi(value);
     OPT("b-pyramid")
-        p->b_bframe_pyramid = atobool(value);
+        b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid );
     OPT("nf")
         p->b_deblocking_filter = !atobool(value);
     OPT2("filter", "deblock")
@@ -363,12 +391,20 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         else
             p->b_deblocking_filter = atobool(value);
     }
+    OPT("slice-max-size")
+        p->i_slice_max_size = atoi(value);
+    OPT("slice-max-mbs")
+        p->i_slice_max_mbs = atoi(value);
+    OPT("slices")
+        p->i_slice_count = atoi(value);
     OPT("cabac")
         p->b_cabac = atobool(value);
     OPT("cabac-idc")
         p->i_cabac_init_idc = atoi(value);
     OPT("interlaced")
         p->b_interlaced = atobool(value);
+    OPT("constrained-intra")
+        p->b_constrained_intra = atobool(value);
     OPT("cqm")
     {
         if( strstr( value, "flat" ) )
@@ -438,7 +474,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     }
     OPT("log")
         p->i_log_level = atoi(value);
-#ifdef VISUALIZE
+#ifdef HAVE_VISUALIZE
     OPT("visualize")
         p->b_visualize = atobool(value);
 #endif
@@ -460,6 +496,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->analyse.b_transform_8x8 = atobool(value);
     OPT2("weightb", "weight-b")
         p->analyse.b_weighted_bipred = atobool(value);
+    OPT("weightp")
+        p->analyse.i_weighted_pred = atoi(value);
     OPT2("direct", "direct-pred")
         b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred );
     OPT("chroma-qp-offset")
@@ -489,6 +527,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
             p->analyse.f_psy_trellis = 0;
         }
     }
+    OPT("psy")
+        p->analyse.b_psy = atobool(value);
     OPT("chroma-me")
         p->analyse.b_chroma_me = atobool(value);
     OPT("mixed-refs")
@@ -520,6 +560,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->rc.f_rf_constant = atof(value);
         p->rc.i_rc_method = X264_RC_CRF;
     }
+    OPT("rc-lookahead")
+        p->rc.i_lookahead = atoi(value);
     OPT2("qpmin", "qp-min")
         p->rc.i_qp_min = atoi(value);
     OPT2("qpmax", "qp-max")
@@ -555,6 +597,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
     }
     OPT("qcomp")
         p->rc.f_qcompress = atof(value);
+    OPT("mbtree")
+        p->rc.b_mb_tree = atobool(value);
     OPT("qblur")
         p->rc.f_qblur = atof(value);
     OPT2("cplxblur", "cplx-blur")
@@ -573,6 +617,10 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->b_repeat_headers = !atobool(value);
     OPT("repeat-headers")
         p->b_repeat_headers = atobool(value);
+    OPT("annexb")
+        p->b_annexb = atobool(value);
+    OPT("force-cfr")
+        p->b_vfr_input = !atobool(value);
     else
         return X264_PARAM_BAD_NAME;
 #undef OPT
@@ -593,11 +641,14 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
  ****************************************************************************/
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... )
 {
-    if( i_level <= h->param.i_log_level )
+    if( !h || i_level <= h->param.i_log_level ) /* a NULL handle skips the log-level filter */
     {
         va_list arg;
         va_start( arg, psz_fmt );
-        h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
+        if( !h ) /* no handle, so no user callback to read: use the built-in logger */
+            x264_log_default( NULL, i_level, psz_fmt, arg );
+        else
+            h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg );
         va_end( arg );
     }
 }
@@ -630,18 +681,22 @@ static void x264_log_default( void *p_unused, int i_level, const char *psz_fmt,
 /****************************************************************************
  * x264_picture_alloc:
  ****************************************************************************/
-void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
 {
     pic->i_type = X264_TYPE_AUTO;
     pic->i_qpplus1 = 0;
     pic->img.i_csp = i_csp;
     pic->img.i_plane = 3;
     pic->img.plane[0] = x264_malloc( 3 * i_width * i_height / 2 );
+    if( !pic->img.plane[0] )
+        return -1; /* allocation failed; plane[1], plane[2] and the strides are not filled in */
     pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
     pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
     pic->img.i_stride[0] = i_width;
     pic->img.i_stride[1] = i_width / 2;
     pic->img.i_stride[2] = i_width / 2;
+    pic->param = NULL;
+    return 0; /* success */
 }
 
 /****************************************************************************
@@ -658,23 +713,23 @@ void x264_picture_clean( x264_picture_t *pic )
 /****************************************************************************
  * x264_nal_encode:
  ****************************************************************************/
-int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal )
+int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal )
 {
-    uint8_t *dst = p_data;
     uint8_t *src = nal->p_payload;
-    uint8_t *end = &nal->p_payload[nal->i_payload];
-    int i_count = 0;
+    uint8_t *end = nal->p_payload + nal->i_payload;
+    uint8_t *orig_dst = dst;
+    int i_count = 0, size;
 
-    /* FIXME this code doesn't check overflow */
-
-    if( b_annexeb )
+    /* long nal start code (we always use long ones) */
+    if( b_annexb )
     {
-        /* long nal start code (we always use long ones)*/
         *dst++ = 0x00;
         *dst++ = 0x00;
         *dst++ = 0x00;
         *dst++ = 0x01;
     }
+    else /* save room for size later */
+        dst += 4;
 
     /* nal header */
     *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;
@@ -692,9 +747,19 @@ int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal
             i_count = 0;
         *dst++ = *src++;
     }
-    *pi_data = dst - (uint8_t*)p_data;
+    size = (dst - orig_dst) - 4;
 
-    return *pi_data;
+    /* Write the size header for mp4/etc */
+    if( !b_annexb )
+    {
+        /* Size doesn't include the size of the header we're writing now. */
+        orig_dst[0] = size>>24;
+        orig_dst[1] = size>>16;
+        orig_dst[2] = size>> 8;
+        orig_dst[3] = size>> 0;
+    }
+
+    return size+4;
 }
 
 
@@ -704,22 +769,25 @@ int x264_nal_encode( void *p_data, int *pi_data, int b_annexeb, x264_nal_t *nal
  ****************************************************************************/
 void *x264_malloc( int i_size )
 {
+    uint8_t *align_buf = NULL; /* result; still NULL at the end means the allocation failed */
 #ifdef SYS_MACOSX
     /* Mac OS X always returns 16 bytes aligned memory */
-    return malloc( i_size );
+    align_buf = malloc( i_size );
 #elif defined( HAVE_MALLOC_H )
-    return memalign( 16, i_size );
+    align_buf = memalign( 16, i_size );
 #else
-    uint8_t * buf;
-    uint8_t * align_buf;
-    buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
-              sizeof( int ) );
-    align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
-    align_buf -= (intptr_t) align_buf & 15;
-    *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
-    *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
-    return align_buf;
+    uint8_t *buf = malloc( i_size + 15 + sizeof(void **) + sizeof(int) ); /* payload + alignment slack + hidden header */
+    if( buf )
+    {
+        align_buf = buf + 15 + sizeof(void **) + sizeof(int); /* skip past the header and the worst-case slack */
+        align_buf -= (intptr_t) align_buf & 15; /* round down to a 16-byte boundary */
+        *( (void **) ( align_buf - sizeof(void **) ) ) = buf; /* stash the raw malloc() pointer just below the returned block */
+        *( (int *) ( align_buf - sizeof(void **) - sizeof(int) ) ) = i_size; /* and the requested size below that */
+    }
 #endif
+    if( !align_buf ) /* common failure path for all three variants */
+        x264_log( NULL, X264_LOG_ERROR, "malloc of size %d failed\n", i_size );
+    return align_buf;
 }
 
 /****************************************************************************
@@ -738,31 +806,6 @@ void x264_free( void *p )
 }
 
 /****************************************************************************
- * x264_realloc:
- ****************************************************************************/
-void *x264_realloc( void *p, int i_size )
-{
-#ifdef HAVE_MALLOC_H
-    return realloc( p, i_size );
-#else
-    int       i_old_size = 0;
-    uint8_t * p_new;
-    if( p )
-    {
-        i_old_size = *( (int*) ( (uint8_t*) p - sizeof( void ** ) -
-                         sizeof( int ) ) );
-    }
-    p_new = x264_malloc( i_size );
-    if( i_old_size > 0 && i_size > 0 )
-    {
-        memcpy( p_new, p, ( i_old_size < i_size ) ? i_old_size : i_size );
-    }
-    x264_free( p );
-    return p_new;
-#endif
-}
-
-/****************************************************************************
  * x264_reduce_fraction:
  ****************************************************************************/
 void x264_reduce_fraction( int *n, int *d )
@@ -775,9 +818,9 @@ void x264_reduce_fraction( int *n, int *d )
     c = a % b;
     while(c)
     {
-	a = b;
-	b = c;
-	c = a % b;
+        a = b;
+        b = c;
+        c = a % b;
     }
     *n /= b;
     *d /= b;
@@ -825,6 +868,8 @@ char *x264_param2string( x264_param_t *p, int b_res )
     if( p->rc.psz_zones )
         len += strlen(p->rc.psz_zones);
     buf = s = x264_malloc( len );
+    if( !buf )
+        return NULL;
 
     if( b_res )
     {
@@ -839,7 +884,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter );
     s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] );
     s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine );
-    s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
+    s += sprintf( s, " psy=%d", p->analyse.b_psy );
+    if( p->analyse.b_psy )
+        s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis );
     s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references );
     s += sprintf( s, " me_range=%d", p->analyse.i_me_range );
     s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me );
@@ -847,26 +894,39 @@ char *x264_param2string( x264_param_t *p, int b_res )
     s += sprintf( s, " 8x8dct=%d", p->analyse.b_transform_8x8 );
     s += sprintf( s, " cqm=%d", p->i_cqm_preset );
     s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] );
+    s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
     s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
     s += sprintf( s, " threads=%d", p->i_threads );
+    s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
+    if( p->i_slice_count )
+        s += sprintf( s, " slices=%d", p->i_slice_count );
+    if( p->i_slice_max_size )
+        s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size );
+    if( p->i_slice_max_mbs )
+        s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs );
     s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction );
     s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate );
     s += sprintf( s, " mbaff=%d", p->b_interlaced );
+    s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra );
 
     s += sprintf( s, " bframes=%d", p->i_bframe );
     if( p->i_bframe )
     {
         s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d wpredb=%d",
-                      p->b_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
+                      p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias,
                       p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred );
     }
+    s += sprintf( s, " wpredp=%d", p->analyse.i_weighted_pred > 0 ? p->analyse.i_weighted_pred : 0 );
+
+    s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d intra_refresh=%d",
+                  p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh );
 
-    s += sprintf( s, " keyint=%d keyint_min=%d scenecut=%d",
-                  p->i_keyint_max, p->i_keyint_min, p->i_scenecut_threshold );
+    if( p->rc.b_mb_tree || p->rc.i_vbv_buffer_size )
+        s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead );
 
-    s += sprintf( s, " rc=%s", p->rc.i_rc_method == X264_RC_ABR ?
+    s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ?
                                ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_buffer_size ? "cbr" : "abr" )
-                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp" );
+                               : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree );
     if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF )
     {
         if( p->rc.i_rc_method == X264_RC_CRF )
@@ -888,7 +948,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
     if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) )
     {
         s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor );
-        if( p->i_bframe )
+        if( p->i_bframe && !p->rc.b_mb_tree )
             s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor );
         s += sprintf( s, " aq=%d", p->rc.i_aq_mode );
         if( p->rc.i_aq_mode )
diff --git a/common/common.h b/common/common.h
index 1e46ae8..950f48f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -34,28 +34,39 @@
 #define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d)))
 #define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d)))
 #define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0)
+#define IS_DISPOSABLE(type) ( (type) == X264_TYPE_B ) /* argument parenthesized so expression arguments compare as a whole */
 #define FIX8(f) ((int)(f*(1<<8)+.5))
 
 #define CHECKED_MALLOC( var, size )\
-{\
+do { /* do/while(0) wrapper: the expansion is one statement and takes a trailing ';' */\
     var = x264_malloc( size );\
     if( !var )\
-    {\
-        x264_log( h, X264_LOG_ERROR, "malloc failed\n" );\
         goto fail;\
-    }\
-}
+} while( 0 ) /* the enclosing function must define a 'fail' label */
+#define CHECKED_MALLOCZERO( var, size )\
+do {\
+    CHECKED_MALLOC( var, size );\
+    memset( var, 0, size ); /* note: var and size are each expanded twice by this macro */\
+} while( 0 )
 
 #define X264_BFRAME_MAX 16
 #define X264_THREAD_MAX 128
-#define X264_SLICE_MAX 4
-#define X264_NAL_MAX (4 + X264_SLICE_MAX)
 #define X264_PCM_COST (386*8)
+#define X264_LOOKAHEAD_MAX 250
+// arbitrary, but low because SATD scores are 1/4 normal
+#define X264_LOOKAHEAD_QP 12
 
 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
 #define X264_THREAD_HEIGHT 24
 
+/* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled
+ * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly
+ * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when
+ * real weights are being used. */
+
+#define X264_WEIGHTP_FAKE (-1)
+
 /****************************************************************************
  * Includes
  ****************************************************************************/
@@ -65,6 +76,22 @@
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
+#include <limits.h>
+
+/* Unions for type-punning.
+ * Mn: load or store n bits, aligned, native-endian
+ * CPn: copy n bits, aligned, native-endian
+ * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
+typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
+typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
+typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+#define M16(src) (((x264_union16_t*)(src))->i)
+#define M32(src) (((x264_union32_t*)(src))->i)
+#define M64(src) (((x264_union64_t*)(src))->i)
+#define CP16(dst,src) M16(dst) = M16(src)
+#define CP32(dst,src) M32(dst) = M32(src)
+#define CP64(dst,src) M64(dst) = M64(src)
+
 #include "x264.h"
 #include "bs.h"
 #include "set.h"
@@ -77,12 +104,11 @@
 #include "quant.h"
 
 /****************************************************************************
- * Generals functions
+ * General functions
  ****************************************************************************/
 /* x264_malloc : will do or emulate a memalign
  * you have to use x264_free for buffers allocated with x264_malloc */
 void *x264_malloc( int );
-void *x264_realloc( void *p, int i_size );
 void  x264_free( void * );
 
 /* x264_slurp_file: malloc space for the whole file and read it */
@@ -95,6 +121,8 @@ int64_t x264_mdate( void );
  * the encoding options */
 char *x264_param2string( x264_param_t *p, int b_res );
 
+int x264_nal_encode( uint8_t *dst, int b_annexb, x264_nal_t *nal );
+
 /* log */
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
 
@@ -152,6 +180,26 @@ static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
     return amvd0 + (amvd1<<16);
 }
 
+extern const uint8_t x264_exp2_lut[64];
+extern const float x264_log2_lut[128];
+extern const float x264_log2_lz_lut[32];
+
+/* Not a general-purpose function; multiplies input by -1/6 to convert
+ * qp to qscale. */
+static ALWAYS_INLINE int x264_exp2fix8( float x )
+{
+    int i = x*(-64.f/6.f) + 512.5f; /* -x/6 in 1/64 steps, offset by 512 (= 8<<6), with +0.5 added before truncation */
+    if( i < 0 ) return 0; /* clamp: index below the table range */
+    if( i > 1023 ) return 0xffff; /* clamp: saturate at the top of the range */
+    return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8; /* low 6 bits index the table; the remaining bits are applied as a shift */
+}
+
+static ALWAYS_INLINE float x264_log2( uint32_t x )
+{
+    int lz = x264_clz( x ); /* presumably the leading-zero count; x264_clz is defined elsewhere - TODO confirm its result for x == 0 */
+    return x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz]; /* bits 30..24 of the shifted value index one table, lz indexes the other */
+}
+
 /****************************************************************************
  *
  ****************************************************************************/
@@ -199,11 +247,23 @@ typedef struct
 
     int b_ref_pic_list_reordering_l0;
     int b_ref_pic_list_reordering_l1;
-    struct {
+    struct
+    {
         int idc;
         int arg;
     } ref_pic_list_order[2][16];
 
+    /* P-frame weighting */
+    x264_weight_t weight[32][3];
+
+    int i_mmco_remove_from_end;
+    int i_mmco_command_count;
+    struct /* struct for future expansion */
+    {
+        int i_difference_of_pic_nums;
+        int i_poc;
+    } mmco[16];
+
     int i_cabac_init_idc;
 
     int i_qp;
@@ -218,6 +278,19 @@ typedef struct
 
 } x264_slice_header_t;
 
+typedef struct x264_lookahead_t
+{
+    volatile uint8_t              b_exit_thread;
+    uint8_t                       b_thread_active;
+    uint8_t                       b_analyse_keyframe;
+    int                           i_last_keyframe;
+    int                           i_slicetype_length;
+    x264_frame_t                  *last_nonb;
+    x264_synch_frame_list_t       ifbuf;
+    x264_synch_frame_list_t       next;
+    x264_synch_frame_list_t       ofbuf;
+} x264_lookahead_t;
+
 /* From ffmpeg
  */
 #define X264_SCAN8_SIZE (6*8)
@@ -262,36 +335,37 @@ struct x264_t
     /* encoder parameters */
     x264_param_t    param;
 
-    x264_t          *thread[X264_THREAD_MAX];
+    x264_t          *thread[X264_THREAD_MAX+1];
     x264_pthread_t  thread_handle;
     int             b_thread_active;
     int             i_thread_phase; /* which thread to use for the next frame */
+    int             i_threadslice_start; /* first row in this thread slice */
+    int             i_threadslice_end; /* row after the end of this thread slice */
 
     /* bitstream output */
     struct
     {
         int         i_nal;
-        x264_nal_t  nal[X264_NAL_MAX];
+        int         i_nals_allocated;
+        x264_nal_t  *nal;
         int         i_bitstream;    /* size of p_bitstream */
         uint8_t     *p_bitstream;   /* will hold data for all nal */
         bs_t        bs;
-        int         i_frame_size;
     } out;
 
+    uint8_t *nal_buffer;
+    int      nal_buffer_size;
+
     /**** thread synchronization starts here ****/
 
     /* frame number/poc */
     int             i_frame;
+    int             i_frame_num;
 
-    int             i_frame_offset; /* decoding only */
-    int             i_frame_num;    /* decoding only */
-    int             i_poc_msb;      /* decoding only */
-    int             i_poc_lsb;      /* decoding only */
-    int             i_poc;          /* decoding only */
-
-    int             i_thread_num;   /* threads only */
-    int             i_nal_type;     /* threads only */
-    int             i_nal_ref_idc;  /* threads only */
+    int             i_thread_frames; /* Number of different frames being encoded by threads;
+                                      * 1 when sliced-threads is on. */
+    int             i_nal_type;
+    int             i_nal_ref_idc;
 
     /* We use only one SPS and one PPS */
     x264_sps_t      sps_array[1];
@@ -300,9 +374,12 @@ struct x264_t
     x264_pps_t      *pps;
     int             i_idr_pic_id;
 
-    /* quantization matrix for decoding, [cqm][qp%6][coef_y][coef_x] */
-    int             (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
-    int             (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
+    /* Timebase multiplier for DTS compression */
+    int             i_dts_compress_multiplier;
+
+    /* quantization matrix for decoding, [cqm][qp%6][coef] */
+    int             (*dequant4_mf[4])[16];   /* [4][6][16] */
+    int             (*dequant8_mf[2])[64];   /* [2][6][64] */
     /* quantization matrix for trellis, [cqm][qp][coef] */
     int             (*unquant4_mf[4])[16];   /* [4][52][16] */
     int             (*unquant8_mf[2])[64];   /* [2][52][64] */
@@ -312,10 +389,16 @@ struct x264_t
     uint16_t        (*quant4_bias[4])[16];   /* [4][52][16] */
     uint16_t        (*quant8_bias[2])[64];   /* [2][52][64] */
 
+    /* mv/ref cost arrays.  Indexed by lambda instead of
+     * qp because, due to rounding, some quantizers share
+     * lambdas.  This saves memory. */
+    uint16_t *cost_mv[92];
+    uint16_t *cost_mv_fpel[92][4];
+
     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
 
-    DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
-    DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
+    ALIGNED_16( uint32_t nr_residual_sum[2][64] );
+    ALIGNED_16( uint16_t nr_offset[2][64] );
     uint32_t        nr_count[2];
 
     /* Slice header */
@@ -327,18 +410,17 @@ struct x264_t
     struct
     {
         /* Frames to be encoded (whose types have been decided) */
-        x264_frame_t *current[X264_BFRAME_MAX*4+3];
-        /* Temporary buffer (frames types not yet decided) */
-        x264_frame_t *next[X264_BFRAME_MAX*4+3];
-        /* Unused frames */
-        x264_frame_t *unused[X264_BFRAME_MAX*4 + X264_THREAD_MAX*2 + 16+4];
-        /* For adaptive B decision */
-        x264_frame_t *last_nonb;
+        x264_frame_t **current;
+        /* Unused frames: 0 = fenc, 1 = fdec */
+        x264_frame_t **unused[2];
+
+        /* Unused blank frames (for duplicates) */
+        x264_frame_t **blank_unused;
 
         /* frames used for reference + sentinels */
         x264_frame_t *reference[16+2];
 
-        int i_last_idr; /* Frame number of the last IDR */
+        int i_last_keyframe; /* Frame number of the last keyframe */
 
         int i_input;    /* Number of input frames already accepted */
 
@@ -346,6 +428,10 @@ struct x264_t
         int i_max_ref0;
         int i_max_ref1;
         int i_delay;    /* Number of frames buffered for B reordering */
+        int     i_bframe_delay;
+        int64_t i_bframe_delay_time;
+        int64_t i_init_delta;
+        int64_t i_prev_dts[2];
         int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
         int b_have_sub8x8_esa;
     } frames;
@@ -368,11 +454,11 @@ struct x264_t
     /* Current MB DCT coeffs */
     struct
     {
-        DECLARE_ALIGNED_16( int16_t luma16x16_dc[16] );
-        DECLARE_ALIGNED_16( int16_t chroma_dc[2][4] );
+        ALIGNED_16( int16_t luma16x16_dc[16] );
+        ALIGNED_16( int16_t chroma_dc[2][4] );
         // FIXME share memory?
-        DECLARE_ALIGNED_16( int16_t luma8x8[4][64] );
-        DECLARE_ALIGNED_16( int16_t luma4x4[16+8][16] );
+        ALIGNED_16( int16_t luma8x8[4][64] );
+        ALIGNED_16( int16_t luma4x4[16+8][16] );
     } dct;
 
     /* MB table and cache for current frame/mb */
@@ -418,6 +504,7 @@ struct x264_t
         unsigned int i_neighbour;
         unsigned int i_neighbour8[4];       /* neighbours of each 8x8 or 4x4 block that are available */
         unsigned int i_neighbour4[16];      /* at the time the block is coded */
+        unsigned int i_neighbour_intra;     /* for constrained intra pred */
         int     i_mb_type_top;
         int     i_mb_type_left;
         int     i_mb_type_topleft;
@@ -444,12 +531,14 @@ struct x264_t
         int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
         int8_t  *mb_transform_size;         /* transform_size_8x8_flag of each mb */
         uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
-        uint8_t (*nnz_backup)[16];          /* when using cavlc + 8x8dct, the deblocker uses a modified nnz */
+
+        /* buffer for weighted versions of the reference frames */
+        uint8_t *p_weight_buf[16];
 
         /* current value */
         int     i_type;
         int     i_partition;
-        DECLARE_ALIGNED_4( uint8_t i_sub_partition[4] );
+        ALIGNED_4( uint8_t i_sub_partition[4] );
         int     b_transform_8x8;
 
         int     i_cbp_luma;
@@ -466,28 +555,31 @@ struct x264_t
         /* skip flag for motion compensation */
         /* if we've already done MC, we don't need to do it again */
         int b_skip_mc;
+        /* set to true if we are re-encoding a macroblock. */
+        int b_reencode_mb;
+        int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */
 
         struct
         {
             /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
-            DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
-            DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
+            ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
+            ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
 
             /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
-            DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
-            DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
-            DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
-            DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+            ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
+            ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
+            ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
+            ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
             uint32_t i4x4_nnz_buf[4];
             uint32_t i8x8_nnz_buf[4];
             int i4x4_cbp;
             int i8x8_cbp;
 
             /* Psy trellis DCT data */
-            DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
-            DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+            ALIGNED_16( int16_t fenc_dct8[4][64] );
+            ALIGNED_16( int16_t fenc_dct4[16][16] );
 
             /* Psy RD SATD scores */
             int fenc_satd[4][4];
@@ -506,6 +598,7 @@ struct x264_t
             /* pointer over mb of the references */
             int i_fref[2];
             uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+            uint8_t *p_fref_w[32];  /* weighted fullpel luma */
             uint16_t *p_integral[2][16];
 
             /* fref stride */
@@ -516,24 +609,24 @@ struct x264_t
         struct
         {
             /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
-            int8_t  intra4x4_pred_mode[X264_SCAN8_SIZE];
+            ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_SIZE] );
 
             /* i_non_zero_count if available else 0x80 */
-            uint8_t non_zero_count[X264_SCAN8_SIZE];
+            ALIGNED_4( uint8_t non_zero_count[X264_SCAN8_SIZE] );
 
             /* -1 if unused, -2 if unavailable */
-            DECLARE_ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
+            ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
 
             /* 0 if not available */
-            DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
-            DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+            ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
+            ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
 
             /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
-            DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
+            ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
 
-            DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
-            DECLARE_ALIGNED_4( int8_t  direct_ref[2][X264_SCAN8_SIZE] );
-            DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
+            ALIGNED_4( int16_t direct_mv[2][4][2] );
+            ALIGNED_4( int8_t  direct_ref[2][4] );
+            ALIGNED_4( int16_t pskip_mv[2] );
 
             /* number of neighbors (top and left) that used 8x8 dct */
             int     i_neighbour_transform_size;
@@ -554,12 +647,19 @@ struct x264_t
         int     b_direct_auto_read; /* take stats for --direct auto from the 2pass log */
         int     b_direct_auto_write; /* analyse direct modes, to use and/or save */
 
+        /* lambda values */
+        int     i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */
+        int     i_psy_rd_lambda;
+        int     i_chroma_lambda2_offset;
+
         /* B_direct and weighted prediction */
         int16_t dist_scale_factor[16][2];
-        int16_t bipred_weight[32][4];
+        int8_t bipred_weight_buf[2][32][4];
+        int8_t (*bipred_weight)[4];
         /* maps fref1[0]'s ref indices into the current list0 */
-        int8_t  map_col_to_list0_buf[2]; // for negative indices
-        int8_t  map_col_to_list0[16];
+#define map_col_to_list0(col) h->mb.map_col_to_list0[col+2]
+        int8_t  map_col_to_list0[18];
+        int ref_blind_dupe; /* The index of the blind reference frame duplicate. */
     } mb;
 
     /* rate control encoding only */
@@ -586,6 +686,7 @@ struct x264_t
             int i_mb_count_ref[2][32];
             int i_mb_partition[17];
             int i_mb_cbp[6];
+            int i_mb_pred_mode[3][13];
             /* Adaptive direct mv pred */
             int i_direct_score[2];
             /* Metrics */
@@ -596,9 +697,9 @@ struct x264_t
         /* Cumulated stats */
 
         /* per slice info */
-        int     i_slice_count[5];
-        int64_t i_slice_size[5];
-        double  f_slice_qp[5];
+        int     i_frame_count[5];
+        int64_t i_frame_size[5];
+        double  f_frame_qp[5];
         int     i_consecutive_bframes[X264_BFRAME_MAX+1];
         /* */
         int64_t i_ssd_global[5];
@@ -613,9 +714,12 @@ struct x264_t
         int64_t i_mb_count_8x8dct[2];
         int64_t i_mb_count_ref[2][2][32];
         int64_t i_mb_cbp[6];
+        int64_t i_mb_pred_mode[3][13];
         /* */
         int     i_direct_score[2];
         int     i_direct_frames[2];
+        /* num p-frames weighted */
+        int     i_wpred[3];
 
     } stat;
 
@@ -635,9 +739,10 @@ struct x264_t
     x264_quant_function_t quantf;
     x264_deblock_function_t loopf;
 
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
     struct visualize_t *visualize;
 #endif
+    x264_lookahead_t *lookahead;
 };
 
 // included at the end because it needs x264_t
diff --git a/common/cpu.c b/common/cpu.c
index 1cb7080..9f2d5a6 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -22,8 +22,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
+#define _GNU_SOURCE // for sched_getaffinity
+#include "common.h"
+#include "cpu.h"
+
 #if defined(HAVE_PTHREAD) && defined(SYS_LINUX)
-#define _GNU_SOURCE
 #include <sched.h>
 #endif
 #ifdef SYS_BEOS
@@ -39,9 +42,6 @@
 #include <machine/cpu.h>
 #endif
 
-#include "common.h"
-#include "cpu.h"
-
 const x264_cpu_name_t x264_cpu_names[] = {
     {"Altivec", X264_CPU_ALTIVEC},
 //  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSEMisalign", X264_CPU_SSE_MISALIGN},
     {"LZCNT", X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
+    {"ARMv6", X264_CPU_ARMV6},
+    {"NEON",  X264_CPU_NEON},
+    {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
     {"", 0},
 };
 
+#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
+#include <signal.h>
+#include <setjmp.h>
+static sigjmp_buf jmpbuf;
+static volatile sig_atomic_t canjump = 0;
+
+static void sigill_handler( int sig )
+{
+    if( !canjump )
+    {
+        signal( sig, SIG_DFL );
+        raise( sig );
+    }
+
+    canjump = 0;
+    siglongjmp( jmpbuf, 1 );
+}
+#endif
 
 #ifdef HAVE_MMX
 extern int  x264_cpu_cpuid_test( void );
@@ -122,13 +143,17 @@ uint32_t x264_cpu_detect( void )
             if( ecx&0x00000040 ) /* SSE4a */
             {
                 cpu |= X264_CPU_SSE2_IS_FAST;
-                cpu |= X264_CPU_SSE_MISALIGN;
                 cpu |= X264_CPU_LZCNT;
                 cpu |= X264_CPU_SHUFFLE_IS_FAST;
-                x264_cpu_mask_misalign_sse();
             }
             else
                 cpu |= X264_CPU_SSE2_IS_SLOW;
+
+            if( ecx&0x00000080 ) /* Misalign SSE */
+            {
+                cpu |= X264_CPU_SSE_MISALIGN;
+                x264_cpu_mask_misalign_sse();
+            }
         }
     }
 
@@ -224,22 +249,6 @@ uint32_t x264_cpu_detect( void )
 }
 
 #elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-
-static void sigill_handler( int sig )
-{
-    if( !canjump )
-    {
-        signal( sig, SIG_DFL );
-        raise( sig );
-    }
-
-    canjump = 0;
-    siglongjmp( jmpbuf, 1 );
-}
 
 uint32_t x264_cpu_detect( void )
 {
@@ -265,6 +274,48 @@ uint32_t x264_cpu_detect( void )
 }
 #endif
 
+#elif defined( ARCH_ARM )
+
+void x264_cpu_neon_test();
+int x264_cpu_fast_neon_mrc_test();
+
+uint32_t x264_cpu_detect( void )
+{
+    int flags = 0;
+#ifdef HAVE_ARMV6
+    flags |= X264_CPU_ARMV6;
+
+    // don't do this hack if compiled with -mfpu=neon
+#ifndef HAVE_NEON
+    static void (* oldsig)( int );
+    oldsig = signal( SIGILL, sigill_handler );
+    if( sigsetjmp( jmpbuf, 1 ) )
+    {
+        signal( SIGILL, oldsig );
+        return flags;
+    }
+
+    canjump = 1;
+    x264_cpu_neon_test();
+    canjump = 0;
+    signal( SIGILL, oldsig );
+#endif
+
+    flags |= X264_CPU_NEON;
+
+    // fast neon -> arm (Cortex-A9) detection relies on user access to the
+    // cycle counter; this assumes ARMv7 performance counters.
+    // NEON requires at least ARMv7, ARMv8 may require changes here, but
+    // hopefully this hacky detection method will have been replaced by then.
+    // Note that there is potential for a race condition if another program or
+    // x264 instance disables or reinits the counters while x264 is using them,
+    // which may result in incorrect detection and the counters stuck enabled.
+    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
+    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#endif
+    return flags;
+}
+
 #else
 
 uint32_t x264_cpu_detect( void )
diff --git a/common/cpu.h b/common/cpu.h
index 4380a35..6901e1e 100644
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -33,12 +33,12 @@ void     x264_cpu_mask_misalign_sse( void );
  * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this
  * problem, but I don't want to require such a new version.
  * This applies only to x86_32, since other architectures that need alignment
- * also have ABIs that ensure aligned stack. */
+ * either have ABIs that ensure aligned stack, or don't support it at all. */
 #if defined(ARCH_X86) && defined(HAVE_MMX)
-int x264_stack_align( void (*func)(x264_t*), x264_t *arg );
-#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg)
+int x264_stack_align( void (*func)(), ... );
+#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
-#define x264_stack_align(func,arg) func(arg)
+#define x264_stack_align(func,...) func(__VA_ARGS__)
 #endif
 
 typedef struct {
diff --git a/common/dct.c b/common/dct.c
index 1f8f4b3..aa83ef4 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -28,79 +28,78 @@
 #ifdef ARCH_PPC
 #   include "ppc/dct.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/dct.h"
+#endif
 
 int x264_dct4_weight2_zigzag[2][16];
 int x264_dct8_weight2_zigzag[2][64];
 
-/*
- * XXX For all dct dc : input could be equal to output so ...
- */
-
-static void dct4x4dc( int16_t d[4][4] )
+static void dct4x4dc( int16_t d[16] )
 {
-    int16_t tmp[4][4];
+    int16_t tmp[16];
     int s01, s23;
     int d01, d23;
     int i;
 
     for( i = 0; i < 4; i++ )
     {
-        s01 = d[i][0] + d[i][1];
-        d01 = d[i][0] - d[i][1];
-        s23 = d[i][2] + d[i][3];
-        d23 = d[i][2] - d[i][3];
-
-        tmp[0][i] = s01 + s23;
-        tmp[1][i] = s01 - s23;
-        tmp[2][i] = d01 - d23;
-        tmp[3][i] = d01 + d23;
+        s01 = d[i*4+0] + d[i*4+1];
+        d01 = d[i*4+0] - d[i*4+1];
+        s23 = d[i*4+2] + d[i*4+3];
+        d23 = d[i*4+2] - d[i*4+3];
+
+        tmp[0*4+i] = s01 + s23;
+        tmp[1*4+i] = s01 - s23;
+        tmp[2*4+i] = d01 - d23;
+        tmp[3*4+i] = d01 + d23;
     }
 
     for( i = 0; i < 4; i++ )
     {
-        s01 = tmp[i][0] + tmp[i][1];
-        d01 = tmp[i][0] - tmp[i][1];
-        s23 = tmp[i][2] + tmp[i][3];
-        d23 = tmp[i][2] - tmp[i][3];
-
-        d[i][0] = ( s01 + s23 + 1 ) >> 1;
-        d[i][1] = ( s01 - s23 + 1 ) >> 1;
-        d[i][2] = ( d01 - d23 + 1 ) >> 1;
-        d[i][3] = ( d01 + d23 + 1 ) >> 1;
+        s01 = tmp[i*4+0] + tmp[i*4+1];
+        d01 = tmp[i*4+0] - tmp[i*4+1];
+        s23 = tmp[i*4+2] + tmp[i*4+3];
+        d23 = tmp[i*4+2] - tmp[i*4+3];
+
+        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
+        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
+        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
+        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
     }
 }
 
-static void idct4x4dc( int16_t d[4][4] )
+static void idct4x4dc( int16_t d[16] )
 {
-    int16_t tmp[4][4];
+    int16_t tmp[16];
     int s01, s23;
     int d01, d23;
     int i;
 
     for( i = 0; i < 4; i++ )
     {
-        s01 = d[i][0] + d[i][1];
-        d01 = d[i][0] - d[i][1];
-        s23 = d[i][2] + d[i][3];
-        d23 = d[i][2] - d[i][3];
-
-        tmp[0][i] = s01 + s23;
-        tmp[1][i] = s01 - s23;
-        tmp[2][i] = d01 - d23;
-        tmp[3][i] = d01 + d23;
+        s01 = d[i*4+0] + d[i*4+1];
+        d01 = d[i*4+0] - d[i*4+1];
+        s23 = d[i*4+2] + d[i*4+3];
+        d23 = d[i*4+2] - d[i*4+3];
+
+        tmp[0*4+i] = s01 + s23;
+        tmp[1*4+i] = s01 - s23;
+        tmp[2*4+i] = d01 - d23;
+        tmp[3*4+i] = d01 + d23;
     }
 
     for( i = 0; i < 4; i++ )
     {
-        s01 = tmp[i][0] + tmp[i][1];
-        d01 = tmp[i][0] - tmp[i][1];
-        s23 = tmp[i][2] + tmp[i][3];
-        d23 = tmp[i][2] - tmp[i][3];
-
-        d[i][0] = s01 + s23;
-        d[i][1] = s01 - s23;
-        d[i][2] = d01 - d23;
-        d[i][3] = d01 + d23;
+        s01 = tmp[i*4+0] + tmp[i*4+1];
+        d01 = tmp[i*4+0] - tmp[i*4+1];
+        s23 = tmp[i*4+2] + tmp[i*4+3];
+        d23 = tmp[i*4+2] - tmp[i*4+3];
+
+        d[i*4+0] = s01 + s23;
+        d[i*4+1] = s01 - s23;
+        d[i*4+2] = d01 - d23;
+        d[i*4+3] = d01 + d23;
     }
 }
 
@@ -119,42 +118,42 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
     }
 }
 
-static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
 {
-    int16_t d[4][4];
-    int16_t tmp[4][4];
+    int16_t d[16];
+    int16_t tmp[16];
     int i;
 
-    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 
     for( i = 0; i < 4; i++ )
     {
-        const int s03 = d[i][0] + d[i][3];
-        const int s12 = d[i][1] + d[i][2];
-        const int d03 = d[i][0] - d[i][3];
-        const int d12 = d[i][1] - d[i][2];
-
-        tmp[0][i] =   s03 +   s12;
-        tmp[1][i] = 2*d03 +   d12;
-        tmp[2][i] =   s03 -   s12;
-        tmp[3][i] =   d03 - 2*d12;
+        const int s03 = d[i*4+0] + d[i*4+3];
+        const int s12 = d[i*4+1] + d[i*4+2];
+        const int d03 = d[i*4+0] - d[i*4+3];
+        const int d12 = d[i*4+1] - d[i*4+2];
+
+        tmp[0*4+i] =   s03 +   s12;
+        tmp[1*4+i] = 2*d03 +   d12;
+        tmp[2*4+i] =   s03 -   s12;
+        tmp[3*4+i] =   d03 - 2*d12;
     }
 
     for( i = 0; i < 4; i++ )
     {
-        const int s03 = tmp[i][0] + tmp[i][3];
-        const int s12 = tmp[i][1] + tmp[i][2];
-        const int d03 = tmp[i][0] - tmp[i][3];
-        const int d12 = tmp[i][1] - tmp[i][2];
-
-        dct[i][0] =   s03 +   s12;
-        dct[i][1] = 2*d03 +   d12;
-        dct[i][2] =   s03 -   s12;
-        dct[i][3] =   d03 - 2*d12;
+        const int s03 = tmp[i*4+0] + tmp[i*4+3];
+        const int s12 = tmp[i*4+1] + tmp[i*4+2];
+        const int d03 = tmp[i*4+0] - tmp[i*4+3];
+        const int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+        dct[i*4+0] =   s03 +   s12;
+        dct[i*4+1] = 2*d03 +   d12;
+        dct[i*4+2] =   s03 -   s12;
+        dct[i*4+3] =   d03 - 2*d12;
     }
 }
 
-static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
 {
     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
@@ -162,7 +161,7 @@ static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 }
 
-static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
 {
     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
@@ -170,52 +169,70 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 }
 
+static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
+{
+    int16_t d[16];
+    int sum = 0;
+
+    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+
+    sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
+    sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
+
+    return sum;
+}
+
+static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
+{
+    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
+    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
+    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
+}
 
-static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
+static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
 {
-    int16_t d[4][4];
-    int16_t tmp[4][4];
+    int16_t d[16];
+    int16_t tmp[16];
     int x, y;
     int i;
 
     for( i = 0; i < 4; i++ )
     {
-        const int s02 =  dct[0][i]     +  dct[2][i];
-        const int d02 =  dct[0][i]     -  dct[2][i];
-        const int s13 =  dct[1][i]     + (dct[3][i]>>1);
-        const int d13 = (dct[1][i]>>1) -  dct[3][i];
-
-        tmp[i][0] = s02 + s13;
-        tmp[i][1] = d02 + d13;
-        tmp[i][2] = d02 - d13;
-        tmp[i][3] = s02 - s13;
+        const int s02 =  dct[0*4+i]     +  dct[2*4+i];
+        const int d02 =  dct[0*4+i]     -  dct[2*4+i];
+        const int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
+        const int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
+
+        tmp[i*4+0] = s02 + s13;
+        tmp[i*4+1] = d02 + d13;
+        tmp[i*4+2] = d02 - d13;
+        tmp[i*4+3] = s02 - s13;
     }
 
     for( i = 0; i < 4; i++ )
     {
-        const int s02 =  tmp[0][i]     +  tmp[2][i];
-        const int d02 =  tmp[0][i]     -  tmp[2][i];
-        const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
-        const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
-
-        d[0][i] = ( s02 + s13 + 32 ) >> 6;
-        d[1][i] = ( d02 + d13 + 32 ) >> 6;
-        d[2][i] = ( d02 - d13 + 32 ) >> 6;
-        d[3][i] = ( s02 - s13 + 32 ) >> 6;
+        const int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
+        const int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
+        const int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
+        const int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
+
+        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
+        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
+        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
+        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
     }
 
 
     for( y = 0; y < 4; y++ )
     {
         for( x = 0; x < 4; x++ )
-        {
-            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
-        }
+            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
         p_dst += FDEC_STRIDE;
     }
 }
 
-static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
+static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
 {
     add4x4_idct( &p_dst[0],               dct[0] );
     add4x4_idct( &p_dst[4],               dct[1] );
@@ -223,7 +240,7 @@ static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 }
 
-static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
+static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
 {
     add8x8_idct( &p_dst[0],               &dct[0] );
     add8x8_idct( &p_dst[8],               &dct[4] );
@@ -262,29 +279,29 @@ static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
     DST(7) = (a4>>2) - a7 ;\
 }
 
-static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
 {
     int i;
-    int16_t tmp[8][8];
+    int16_t tmp[64];
 
-    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
+    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 
-#define SRC(x) tmp[x][i]
-#define DST(x) tmp[x][i]
+#define SRC(x) tmp[x*8+i]
+#define DST(x) tmp[x*8+i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST
 
-#define SRC(x) tmp[i][x]
-#define DST(x) dct[x][i]
+#define SRC(x) tmp[i*8+x]
+#define DST(x) dct[x*8+i]
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST
 }
 
-static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 {
     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
@@ -319,20 +336,20 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
     DST(7, b0 - b7);\
 }
 
-static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
 {
     int i;
 
-    dct[0][0] += 32; // rounding for the >>6 at the end
+    dct[0] += 32; // rounding for the >>6 at the end
 
-#define SRC(x)     dct[x][i]
-#define DST(x,rhs) dct[x][i] = (rhs)
+#define SRC(x)     dct[x*8+i]
+#define DST(x,rhs) dct[x*8+i] = (rhs)
     for( i = 0; i < 8; i++ )
         IDCT8_1D
 #undef SRC
 #undef DST
 
-#define SRC(x)     dct[i][x]
+#define SRC(x)     dct[i*8+x]
 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
     for( i = 0; i < 8; i++ )
         IDCT8_1D
@@ -340,7 +357,7 @@ static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 #undef DST
 }
 
-static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
 {
     add8x8_idct8( &dst[0],               dct[0] );
     add8x8_idct8( &dst[8],               dct[1] );
@@ -361,23 +378,23 @@ static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
     }
 }
 
-static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
+static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
 {
-    add4x4_idct_dc( &p_dst[0],               dct[0][0] );
-    add4x4_idct_dc( &p_dst[4],               dct[0][1] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
-    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
+    add4x4_idct_dc( &p_dst[0],               dct[0] );
+    add4x4_idct_dc( &p_dst[4],               dct[1] );
+    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 }
 
-static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
+static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
 {
     int i;
-    for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
+    for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
     {
-        add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
-        add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
-        add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
-        add4x4_idct_dc( &p_dst[12], dct[i][3] );
+        add4x4_idct_dc( &p_dst[ 0], dct[0] );
+        add4x4_idct_dc( &p_dst[ 4], dct[1] );
+        add4x4_idct_dc( &p_dst[ 8], dct[2] );
+        add4x4_idct_dc( &p_dst[12], dct[3] );
     }
 }
 
@@ -391,6 +408,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->add4x4_idct   = add4x4_idct;
 
     dctf->sub8x8_dct    = sub8x8_dct;
+    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
     dctf->add8x8_idct   = add8x8_idct;
     dctf->add8x8_idct_dc = add8x8_idct_dc;
 
@@ -416,6 +434,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 
 #ifndef ARCH_X86_64
         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
@@ -434,6 +453,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     {
         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 
@@ -454,6 +474,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
     }
+
+    if( cpu&X264_CPU_SSE4 )
+        dctf->add4x4_idct   = x264_add4x4_idct_sse4;
+
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
@@ -474,6 +498,30 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_NEON )
+    {
+        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
+        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
+        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
+        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
+        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
+        dctf->dct4x4dc      = x264_dct4x4dc_neon;
+        dctf->idct4x4dc     = x264_idct4x4dc_neon;
+
+        dctf->add4x4_idct   = x264_add4x4_idct_neon;
+        dctf->add8x8_idct   = x264_add8x8_idct_neon;
+        dctf->add16x16_idct = x264_add16x16_idct_neon;
+
+        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
+        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
+
+        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
+        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
+    }
+#endif
 }
 
 void x264_dct_init_weights( void )
@@ -489,8 +537,7 @@ void x264_dct_init_weights( void )
 }
 
 
-// gcc pessimizes multi-dimensional arrays here, even with constant indices
-#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
+#define ZIG(i,y,x) level[i] = dct[x*8+y];
 #define ZIGZAG8_FRAME\
     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
@@ -528,42 +575,43 @@ void x264_dct_init_weights( void )
     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 
 #define ZIGZAG4_FRAME\
-    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
+    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 
 #define ZIGZAG4_FIELD\
-    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
+    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 
-static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
 {
     ZIGZAG8_FRAME
 }
 
-static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
+static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
 {
     ZIGZAG8_FIELD
 }
 
 #undef ZIG
-#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
+#define ZIG(i,y,x) level[i] = dct[x*4+y];
+#define ZIGDC(i,y,x) ZIG(i,y,x)
 
-static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 {
     ZIGZAG4_FRAME
 }
 
-static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
+static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 {
-    *(uint32_t*)level = *(uint32_t*)dct;
+    CP32( level, dct );
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
-    *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
-    *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
+    CP32( level+6, dct+6 );
+    CP64( level+8, dct+8 );
+    CP64( level+12, dct+12 );
 }
 
 #undef ZIG
@@ -571,43 +619,76 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
     int oe = x+y*FENC_STRIDE;\
     int od = x+y*FDEC_STRIDE;\
     level[i] = p_src[oe] - p_dst[od];\
+    nz |= level[i];\
 }
 #define COPY4x4\
-    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+    CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 #define COPY8x8\
-    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
+    CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+    CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+    CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+    CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+    CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
+
+static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+{
+    int nz = 0;
+    ZIGZAG4_FRAME
+    COPY4x4
+    return !!nz;
+}
 
-static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 {
+    int nz = 0;
+    ZIGZAG4_FIELD
+    COPY4x4
+    return !!nz;
+}
+
+#undef ZIGDC
+#define ZIGDC(i,y,x) {\
+    int oe = x+y*FENC_STRIDE;\
+    int od = x+y*FDEC_STRIDE;\
+    *dc = p_src[oe] - p_dst[od];\
+    level[0] = 0;\
+}
+
+static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
+{
+    int nz = 0;
     ZIGZAG4_FRAME
     COPY4x4
+    return !!nz;
 }
 
-static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 {
+    int nz = 0;
     ZIGZAG4_FIELD
     COPY4x4
+    return !!nz;
 }
 
-static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 {
+    int nz = 0;
     ZIGZAG8_FRAME
     COPY8x8
+    return !!nz;
 }
-static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 {
+    int nz = 0;
     ZIGZAG8_FIELD
     COPY8x8
+    return !!nz;
 }
 
 #undef ZIG
@@ -636,9 +717,18 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->scan_4x4   = zigzag_scan_4x4_field;
         pf->sub_8x8    = zigzag_sub_8x8_field;
         pf->sub_4x4    = zigzag_sub_4x4_field;
+        pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 #ifdef HAVE_MMX
         if( cpu&X264_CPU_MMXEXT )
+        {
             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
+            pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
+        }
+        if( cpu&X264_CPU_SSSE3 )
+        {
+            pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
+            pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
+        }
 #endif
 
 #ifdef ARCH_PPC
@@ -652,6 +742,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->scan_4x4   = zigzag_scan_4x4_frame;
         pf->sub_8x8    = zigzag_sub_8x8_frame;
         pf->sub_4x4    = zigzag_sub_4x4_frame;
+        pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 #ifdef HAVE_MMX
         if( cpu&X264_CPU_MMX )
             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
@@ -662,6 +753,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_SSSE3 )
         {
             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
+            pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
@@ -672,6 +764,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_ALTIVEC )
             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 #endif
+#ifdef HAVE_ARMV6
+        if( cpu&X264_CPU_NEON )
+            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+#endif
     }
 
     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
diff --git a/common/dct.h b/common/dct.h
index 3819ce1..6f282b9 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -91,34 +91,36 @@ typedef struct
     // pix1  stride = FENC_STRIDE
     // pix2  stride = FDEC_STRIDE
     // p_dst stride = FDEC_STRIDE
-    void (*sub4x4_dct)   ( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*add4x4_idct)  ( uint8_t *p_dst, int16_t dct[4][4] );
+    void (*sub4x4_dct)   ( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+    void (*add4x4_idct)  ( uint8_t *p_dst, int16_t dct[16] );
 
-    void (*sub8x8_dct)   ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*add8x8_idct)  ( uint8_t *p_dst, int16_t dct[4][4][4] );
-    void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[2][2] );
+    void (*sub8x8_dct)   ( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
+    void (*sub8x8_dct_dc)( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct)  ( uint8_t *p_dst, int16_t dct[4][16] );
+    void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[4] );
 
-    void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
-    void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
-    void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
+    void (*sub16x16_dct) ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][16] );
+    void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[16] );
 
-    void (*sub8x8_dct8)  ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
-    void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
+    void (*sub8x8_dct8)  ( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[64] );
 
-    void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-    void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );
+    void (*sub16x16_dct8) ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][64] );
 
-    void (*dct4x4dc) ( int16_t d[4][4] );
-    void (*idct4x4dc)( int16_t d[4][4] );
+    void (*dct4x4dc) ( int16_t d[16] );
+    void (*idct4x4dc)( int16_t d[16] );
 
 } x264_dct_function_t;
 
 typedef struct
 {
-    void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
-    void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
-    void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
-    void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+    void (*scan_8x8)( int16_t level[64], int16_t dct[64] );
+    void (*scan_4x4)( int16_t level[16], int16_t dct[16] );
+    int  (*sub_8x8)  ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
+    int  (*sub_4x4)  ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
+    int  (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
     void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
 
 } x264_zigzag_function_t;
diff --git a/common/frame.c b/common/frame.c
index cc4b1b3..40cc78f 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -26,9 +26,9 @@
 
 #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
 
-x264_frame_t *x264_frame_new( x264_t *h )
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 {
-    x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
+    x264_frame_t *frame;
     int i, j;
 
     int i_mb_count = h->mb.i_mb_count;
@@ -38,9 +38,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
     int chroma_plane_size;
     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
 
-    if( !frame ) return NULL;
-
-    memset( frame, 0, sizeof(x264_frame_t) );
+    CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
 
     /* allocate frame data (+64 for extra data for me) */
     i_width  = ALIGN( h->param.i_width, 16 );
@@ -50,60 +48,22 @@ x264_frame_t *x264_frame_new( x264_t *h )
     frame->i_plane = 3;
     for( i = 0; i < 3; i++ )
     {
-        frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 );
+        frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
         frame->i_width[i] = i_width >> !!i;
         frame->i_lines[i] = i_lines >> !!i;
     }
 
-    luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
-    chroma_plane_size = (frame->i_stride[1] * ( frame->i_lines[1] + 2*i_padv ));
+    luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
+    chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
     for( i = 1; i < 3; i++ )
     {
         CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
         frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
     }
-    /* all 4 luma planes allocated together, since the cacheline split code
-     * requires them to be in-phase wrt cacheline alignment. */
-    if( h->param.analyse.i_subpel_refine )
-    {
-        CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
-        for( i = 0; i < 4; i++ )
-            frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
-        frame->plane[0] = frame->filtered[0];
-    }
-    else
-    {
-        CHECKED_MALLOC( frame->buffer[0], luma_plane_size);
-        frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
-    }
-
-    if( h->frames.b_have_lowres )
-    {
-        frame->i_width_lowres = frame->i_width[0]/2;
-        frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
-        frame->i_lines_lowres = frame->i_lines[0]/2;
-
-        luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
-
-        CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
-        for( i = 0; i < 4; i++ )
-            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
-
-        for( j = 0; j <= !!h->param.i_bframe; j++ )
-            for( i = 0; i <= h->param.i_bframe; i++ )
-            {
-                CHECKED_MALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
-                memset( frame->lowres_mvs[j][i], 0, 2*h->mb.i_mb_count*sizeof(int16_t) );
-                CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
-            }
-    }
 
-    if( h->param.analyse.i_me_method >= X264_ME_ESA )
-    {
-        CHECKED_MALLOC( frame->buffer[3],
-                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
-        frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
-    }
+    for( i = 0; i < h->param.i_bframe + 2; i++ )
+        for( j = 0; j < h->param.i_bframe + 2; j++ )
+            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 
     frame->i_poc = -1;
     frame->i_type = X264_TYPE_AUTO;
@@ -112,73 +72,142 @@ x264_frame_t *x264_frame_new( x264_t *h )
     frame->i_frame = -1;
     frame->i_frame_num = -1;
     frame->i_lines_completed = -1;
+    frame->b_fdec = b_fdec;
+    frame->orig = frame;
 
-    CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
-    CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
-    CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
-    CHECKED_MALLOC( frame->i_intra_cost, i_mb_count * sizeof(uint16_t) );
-    if( h->param.i_bframe )
+    /* all 4 luma planes allocated together, since the cacheline split code
+     * requires them to be in-phase wrt cacheline alignment. */
+    if( h->param.analyse.i_subpel_refine && b_fdec )
     {
-        CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
-        CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+        CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
+        for( i = 0; i < 4; i++ )
+            frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+        frame->plane[0] = frame->filtered[0];
     }
     else
     {
-        frame->mv[1]  = NULL;
-        frame->ref[1] = NULL;
+        CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
+        frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
     }
 
-    CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
-    CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
-    for( i = 0; i < h->param.i_bframe + 2; i++ )
-        for( j = 0; j < h->param.i_bframe + 2; j++ )
-            CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
+    frame->b_duplicate = 0;
 
-    if( h->param.rc.i_aq_mode )
+    if( b_fdec ) /* fdec frame */
+    {
+        CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t));
+        CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) );
+        CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) );
+        if( h->param.i_bframe )
+        {
+            CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) );
+            CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) );
+        }
+        else
+        {
+            frame->mv[1]  = NULL;
+            frame->ref[1] = NULL;
+        }
+        CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) );
+        CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) );
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
+        {
+            CHECKED_MALLOC( frame->buffer[3],
+                            frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+        }
+    }
+    else /* fenc frame */
     {
-        CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
         if( h->frames.b_have_lowres )
-            CHECKED_MALLOC( frame->i_inv_qscale_factor, h->mb.i_mb_count * sizeof(uint16_t) );
+        {
+            frame->i_width_lowres = frame->i_width[0]/2;
+            frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
+            frame->i_lines_lowres = frame->i_lines[0]/2;
+
+            luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
+
+            CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
+            for( i = 0; i < 4; i++ )
+                frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+
+            for( j = 0; j <= !!h->param.i_bframe; j++ )
+                for( i = 0; i <= h->param.i_bframe; i++ )
+                {
+                    CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
+                    CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
+                }
+            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+            for( j = 0; j <= h->param.i_bframe+1; j++ )
+                for( i = 0; i <= h->param.i_bframe+1; i++ )
+                {
+                    CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+                    CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) );
+                }
+            frame->i_intra_cost = frame->lowres_costs[0][0];
+            memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) );
+        }
+        if( h->param.rc.i_aq_mode )
+        {
+            CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+            CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) );
+            if( h->frames.b_have_lowres )
+                /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */
+                CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) );
+        }
     }
 
-    x264_pthread_mutex_init( &frame->mutex, NULL );
-    x264_pthread_cond_init( &frame->cv, NULL );
+    if( x264_pthread_mutex_init( &frame->mutex, NULL ) )
+        goto fail;
+    if( x264_pthread_cond_init( &frame->cv, NULL ) )
+        goto fail;
 
     return frame;
 
 fail:
-    x264_frame_delete( frame );
+    x264_free( frame );
     return NULL;
 }
 
 void x264_frame_delete( x264_frame_t *frame )
 {
     int i, j;
-    for( i = 0; i < 4; i++ )
-        x264_free( frame->buffer[i] );
-    for( i = 0; i < 4; i++ )
-        x264_free( frame->buffer_lowres[i] );
-    for( i = 0; i < X264_BFRAME_MAX+2; i++ )
-        for( j = 0; j < X264_BFRAME_MAX+2; j++ )
-            x264_free( frame->i_row_satds[i][j] );
-    for( j = 0; j < 2; j++ )
-        for( i = 0; i <= X264_BFRAME_MAX; i++ )
-        {
-            x264_free( frame->lowres_mvs[j][i] );
-            x264_free( frame->lowres_mv_costs[j][i] );
-        }
-    x264_free( frame->f_qp_offset );
-    x264_free( frame->i_inv_qscale_factor );
-    x264_free( frame->i_intra_cost );
-    x264_free( frame->i_row_bits );
-    x264_free( frame->i_row_qp );
-    x264_free( frame->mb_type );
-    x264_free( frame->mv[0] );
-    x264_free( frame->mv[1] );
-    x264_free( frame->ref[0] );
-    x264_free( frame->ref[1] );
-    x264_pthread_mutex_destroy( &frame->mutex );
-    x264_pthread_cond_destroy( &frame->cv );
+    /* Duplicate frames are blank copies of real frames (including pointers),
+     * so freeing those pointers would cause a double free later. */
+    if( !frame->b_duplicate )
+    {
+        for( i = 0; i < 4; i++ )
+            x264_free( frame->buffer[i] );
+        for( i = 0; i < 4; i++ )
+            x264_free( frame->buffer_lowres[i] );
+        for( i = 0; i < X264_BFRAME_MAX+2; i++ )
+            for( j = 0; j < X264_BFRAME_MAX+2; j++ )
+                x264_free( frame->i_row_satds[i][j] );
+        for( j = 0; j < 2; j++ )
+            for( i = 0; i <= X264_BFRAME_MAX; i++ )
+            {
+                x264_free( frame->lowres_mvs[j][i] );
+                x264_free( frame->lowres_mv_costs[j][i] );
+            }
+        x264_free( frame->i_propagate_cost );
+        for( j = 0; j <= X264_BFRAME_MAX+1; j++ )
+            for( i = 0; i <= X264_BFRAME_MAX+1; i++ )
+            {
+                x264_free( frame->lowres_costs[j][i] );
+                x264_free( frame->lowres_inter_types[j][i] );
+            }
+        x264_free( frame->f_qp_offset );
+        x264_free( frame->f_qp_offset_aq );
+        x264_free( frame->i_inv_qscale_factor );
+        x264_free( frame->i_row_bits );
+        x264_free( frame->i_row_qp );
+        x264_free( frame->mb_type );
+        x264_free( frame->mv[0] );
+        x264_free( frame->mv[1] );
+        x264_free( frame->ref[0] );
+        x264_free( frame->ref[1] );
+        x264_pthread_mutex_destroy( &frame->mutex );
+        x264_pthread_cond_destroy( &frame->cv );
+    }
     x264_free( frame );
 }
 
@@ -194,7 +223,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 
     dst->i_type     = src->i_type;
     dst->i_qpplus1  = src->i_qpplus1;
-    dst->i_pts      = src->i_pts;
+    dst->i_pts      = dst->i_reordered_pts = src->i_pts;
+    dst->param      = src->param;
 
     for( i=0; i<3; i++ )
     {
@@ -298,7 +328,7 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
 {
     int i;
     for( i = 0; i < 4; i++ )
-        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_stride_lowres - 2*PADH, frame->i_lines_lowres, PADH, PADV, 1, 1 );
+        plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
 }
 
 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -309,8 +339,8 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
         int i_subsample = i ? 1 : 0;
         int i_width = h->param.i_width >> i_subsample;
         int i_height = h->param.i_height >> i_subsample;
-        int i_padx = ( h->sps->i_mb_width * 16 - h->param.i_width ) >> i_subsample;
-        int i_pady = ( h->sps->i_mb_height * 16 - h->param.i_height ) >> i_subsample;
+        int i_padx = (h->sps->i_mb_width * 16 - h->param.i_width) >> i_subsample;
+        int i_pady = (h->sps->i_mb_height * 16 - h->param.i_height) >> i_subsample;
 
         if( i_padx )
         {
@@ -631,9 +661,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int stride2y  = stridey << b_interlaced;
     int strideuv  = h->fdec->i_stride[1];
     int stride2uv = strideuv << b_interlaced;
+    uint8_t (*nnz_backup)[16] = h->scratch_buffer;
 
     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
-        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, munge_cavlc_nnz_row );
+        munge_cavlc_nnz( h, mb_y, nnz_backup, munge_cavlc_nnz_row );
 
     for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
     {
@@ -698,10 +729,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         {\
             /* *** Get bS for each 4px for the current edge *** */\
             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
-                *(uint32_t*)bS = 0x03030303;\
+                M32( bS ) = 0x03030303;\
             else\
             {\
-                *(uint32_t*)bS = 0x00000000;\
+                M32( bS ) = 0x00000000;\
                 for( i = 0; i < 4; i++ )\
                 {\
                     int x  = i_dir == 0 ? i_edge : i;\
@@ -717,15 +748,20 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                             bS[i] = bS[i-1];\
                         else\
                         {\
-                            /* FIXME: A given frame may occupy more than one position in\
-                             * the reference list. So we should compare the frame numbers,\
-                             * not the indices in the ref list.\
-                             * No harm yet, as we don't generate that case.*/\
                             int i8p= mb_8x8+(x>>1)+(y>>1)*s8x8;\
                             int i8q= mbn_8x8+(xn>>1)+(yn>>1)*s8x8;\
                             int i4p= mb_4x4+x+y*s4x4;\
                             int i4q= mbn_4x4+xn+yn*s4x4;\
-                            if((h->mb.ref[0][i8p] != h->mb.ref[0][i8q] ||\
+                            int refs_equal;\
+                            /* We don't use duplicate refs in B-frames, so we can take this shortcut for now. */ \
+                            if( h->sh.i_type == SLICE_TYPE_B || h->mb.ref[0][i8p] < 0 || h->mb.ref[0][i8q] < 0 )\
+                                refs_equal = h->mb.ref[0][i8p] == h->mb.ref[0][i8q];\
+                            else if( !h->mb.b_interlaced )\
+                                refs_equal = h->fref0[h->mb.ref[0][i8p]]->i_poc == h->fref0[h->mb.ref[0][i8q]]->i_poc;\
+                            else\
+                                refs_equal = h->fref0[h->mb.ref[0][i8p]>>1]->i_poc == h->fref0[h->mb.ref[0][i8q]>>1]->i_poc\
+                                           && (h->mb.ref[0][i8p]&1) == (h->mb.ref[0][i8q]&1);\
+                            if((!refs_equal ||\
                                 abs( h->mb.mv[0][i4p][0] - h->mb.mv[0][i4q][0] ) >= 4 ||\
                                 abs( h->mb.mv[0][i4p][1] - h->mb.mv[0][i4q][1] ) >= mvy_limit ) ||\
                                (h->sh.i_type == SLICE_TYPE_B &&\
@@ -747,7 +783,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         {\
             int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
             int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
-            DECLARE_ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
+            ALIGNED_4( uint8_t bS[4] );  /* filtering strength */\
             if( i_edge )\
                 i_edge+= b_8x8_transform;\
             else\
@@ -767,7 +803,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                     goto end##i_dir;\
                 }\
                 DEBLOCK_STRENGTH(i_dir);\
-                if( *(uint32_t*)bS )\
+                if( M32( bS ) )\
                     FILTER_DIR( , i_dir);\
                 end##i_dir:\
                 i_edge += b_8x8_transform+1;\
@@ -778,7 +814,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
             for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
             {\
                 DEBLOCK_STRENGTH(i_dir);\
-                if( *(uint32_t*)bS )\
+                if( M32( bS ) )\
                     FILTER_DIR( , i_dir);\
             }\
         }
@@ -788,7 +824,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     }
 
     if( !h->pps->b_cabac && h->pps->b_transform_8x8_mode )
-        munge_cavlc_nnz( h, mb_y, h->mb.nnz_backup, restore_cavlc_nnz_row );
+        munge_cavlc_nnz( h, mb_y, nnz_backup, restore_cavlc_nnz_row );
 }
 
 void x264_frame_deblock( x264_t *h )
@@ -832,6 +868,13 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta,
 void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 #endif // ARCH_PPC
 
+#ifdef HAVE_ARMV6 /* prototypes of the NEON deblocking routines that x264_deblock_init installs when cpu&X264_CPU_NEON */
+void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
+#endif // HAVE_ARMV6
+
 void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 {
     pf->deblock_v_luma = deblock_v_luma_c;
@@ -873,6 +916,16 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_h_luma = x264_deblock_h_luma_altivec;
    }
 #endif // ARCH_PPC
+
+#ifdef HAVE_ARMV6
+   if( cpu&X264_CPU_NEON )
+   {
+        pf->deblock_v_luma   = x264_deblock_v_luma_neon;
+        pf->deblock_h_luma   = x264_deblock_h_luma_neon;
+        pf->deblock_v_chroma = x264_deblock_v_chroma_neon;
+        pf->deblock_h_chroma = x264_deblock_h_chroma_neon;
+   }
+#endif
 }
 
 
@@ -937,20 +990,49 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame )
     assert( frame->i_reference_count > 0 );
     frame->i_reference_count--;
     if( frame->i_reference_count == 0 )
-        x264_frame_push( h->frames.unused, frame );
-    assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL );
+        x264_frame_push( h->frames.unused[frame->b_fdec], frame );
 }
 
-x264_frame_t *x264_frame_pop_unused( x264_t *h )
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec )
 {
     x264_frame_t *frame;
-    if( h->frames.unused[0] )
-        frame = x264_frame_pop( h->frames.unused );
+    if( h->frames.unused[b_fdec][0] )
+        frame = x264_frame_pop( h->frames.unused[b_fdec] );
     else
-        frame = x264_frame_new( h );
-    assert( frame->i_reference_count == 0 );
+        frame = x264_frame_new( h, b_fdec );
+    if( !frame )
+        return NULL;
+    frame->b_last_minigop_bframe = 0;
     frame->i_reference_count = 1;
     frame->b_intra_calculated = 0;
+    frame->b_scenecut = 1;
+    frame->b_keyframe = 0;
+
+    memset( frame->weight, 0, sizeof(frame->weight) );
+    memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) );
+
+    return frame;
+}
+
+/* Drop one reference to a blank frame; once unreferenced, park it on h->frames.blank_unused. */
+void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame )
+{
+    assert( frame->i_reference_count > 0 );
+    if( --frame->i_reference_count == 0 )
+        x264_frame_push( h->frames.blank_unused, frame );
+}
+
+/* Hand out a blank frame shell: a recycled one from h->frames.blank_unused if any, otherwise a newly
+ * allocated one whose remaining fields are not initialized here. Returns NULL if no frame is obtained. */
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h )
+{
+    x264_frame_t *frame = h->frames.blank_unused[0]
+                        ? x264_frame_pop( h->frames.blank_unused )
+                        : x264_malloc( sizeof(x264_frame_t) );
+    if( !frame )
+        return NULL;
+    frame->b_duplicate = 1;
+    frame->i_reference_count = 1;
     return frame;
 }
 
@@ -973,3 +1055,88 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
         }
     } while( !b_ok );
 }
+
+/* Run the weighting function selected by w over an i_width x i_height plane
+ * (src -> dst, judging by the argument names). h is not used here. */
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+                         int i_width, int i_height, x264_weight_t *w )
+{
+    int x;
+    /* Weight horizontal strips of height 16. This was found to be the optimal height
+     * in terms of the cache loads. */
+    while( i_height > 0 )
+    {
+        /* X264_MIN caps the strip height, so a final strip shorter than 16 rows is handled. */
+        for( x = 0; x < i_width; x += 16 )
+            w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) );
+        i_height -= 16;
+        dst += 16 * i_dst_stride;
+        src += 16 * i_src_stride;
+    }
+}
+
+/* Delete every frame of a NULL-terminated frame array, then free the array itself.
+ * A NULL list is accepted and ignored. */
+void x264_frame_delete_list( x264_frame_t **list )
+{
+    int i = 0;
+    if( !list )
+        return;
+    while( list[i] )
+        x264_frame_delete( list[i++] );
+    x264_free( list );
+}
+
+/* Initialize an empty synchronized frame list with room for max_size frames.
+ * Returns 0 on success and -1 on failure (negative max_size, allocation failure, or
+ * mutex/condvar init failure); on failure nothing allocated here is left behind. */
+int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size )
+{
+    if( max_size < 0 )
+        return -1;
+    slist->i_max_size = max_size;
+    slist->i_size = 0;
+    /* One extra zeroed slot keeps the array NULL-terminated for x264_frame_delete_list. */
+    CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) );
+    if( x264_pthread_mutex_init( &slist->mutex, NULL ) )
+        goto fail_list;
+    if( x264_pthread_cond_init( &slist->cv_fill, NULL ) )
+        goto fail_mutex;
+    if( x264_pthread_cond_init( &slist->cv_empty, NULL ) )
+        goto fail_cv_fill;
+    return 0;
+
+    /* Unwind in reverse order of construction so a failed init leaks nothing. */
+fail_cv_fill:
+    x264_pthread_cond_destroy( &slist->cv_fill );
+fail_mutex:
+    x264_pthread_mutex_destroy( &slist->mutex );
+fail_list:
+    x264_free( slist->list );
+    slist->list = NULL;
+fail:
+    return -1;
+}
+
+/* Destroy the list's mutex and condition variables, then delete every frame
+ * still stored in the list along with the list array. */
+void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist )
+{
+    x264_pthread_mutex_destroy( &slist->mutex );
+    x264_pthread_cond_destroy( &slist->cv_fill );
+    x264_pthread_cond_destroy( &slist->cv_empty );
+    x264_frame_delete_list( slist->list );
+}
+
+/* Append frame to the list, waiting on cv_empty while the list is full,
+ * then broadcast cv_fill to wake anything waiting for new entries. */
+void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame )
+{
+    x264_pthread_mutex_lock( &slist->mutex );
+    /* while, not if: the capacity check is repeated after every wakeup. */
+    while( slist->i_size == slist->i_max_size )
+        x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex );
+    slist->list[ slist->i_size++ ] = frame;
+    x264_pthread_mutex_unlock( &slist->mutex );
+    x264_pthread_cond_broadcast( &slist->cv_fill );
+}
diff --git a/common/frame.h b/common/frame.h
index aad77f5..b1852b3 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -28,16 +28,24 @@
 #define PADH 32
 #define PADV 32
 
-typedef struct
+typedef struct x264_frame
 {
     /* */
     int     i_poc;
     int     i_type;
     int     i_qpplus1;
     int64_t i_pts;
-    int     i_frame;    /* Presentation frame number */
-    int     i_frame_num; /* Coded frame number */
+    int64_t i_reordered_pts;
+    x264_param_t *param;
+
+    int     i_frame;     /* Presentation frame number */
+    int     i_coded;     /* Coded frame number */
+    int     i_frame_num; /* 7.4.3 frame_num */
     int     b_kept_as_ref;
+    int     b_keyframe;
+    uint8_t b_fdec;
+    uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */
+    uint8_t i_bframes;   /* number of bframes following this nonb in coded order */
     float   f_qp_avg_rc; /* QPs as decided by ratecontrol */
     float   f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */
 
@@ -59,10 +67,18 @@ typedef struct
     uint8_t *buffer[4];
     uint8_t *buffer_lowres[4];
 
+    x264_weight_t weight[16][3]; /* [ref_index][plane] */
+    uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
+    int b_duplicate;
+    struct x264_frame *orig;
+
     /* motion data */
     int8_t  *mb_type;
     int16_t (*mv[2])[2];
     int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2];
+    uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
+    /* Actually a width-2 bitfield with 4 values per uint8_t. */
+    uint8_t  (*lowres_inter_types[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]);
     int     *lowres_mv_costs[2][X264_BFRAME_MAX+1];
     int8_t  *ref[2];
     int     i_ref[2];
@@ -81,18 +97,44 @@ typedef struct
     int     *i_row_bits;
     int     *i_row_qp;
     float   *f_qp_offset;
+    float   *f_qp_offset_aq;
     int     b_intra_calculated;
     uint16_t *i_intra_cost;
+    uint16_t *i_propagate_cost;
     uint16_t *i_inv_qscale_factor;
+    int     b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
+    float   f_weighted_cost_delta[X264_BFRAME_MAX+2];
+    uint32_t i_pixel_sum;
+    uint64_t i_pixel_ssd;
+
+    /* vbv */
+    uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
+    int i_planned_satd[X264_LOOKAHEAD_MAX+1];
 
     /* threading */
     int     i_lines_completed; /* in pixels */
+    int     i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */
     int     i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */
     x264_pthread_mutex_t mutex;
     x264_pthread_cond_t  cv;
 
+    /* periodic intra refresh */
+    float   f_pir_position;
+    int     i_pir_start_col;
+    int     i_pir_end_col;
 } x264_frame_t;
 
+/* synchronized frame list: a bounded frame array guarded by a mutex and two condition variables */
+typedef struct
+{
+   x264_frame_t **list; /* frame array; x264_synch_frame_list_init allocates i_max_size+1 zeroed slots */
+   int i_max_size; /* capacity; x264_synch_frame_list_push waits while i_size == i_max_size */
+   int i_size; /* number of frames currently stored */
+   x264_pthread_mutex_t     mutex; /* held by x264_synch_frame_list_push while it updates list and i_size */
+   x264_pthread_cond_t      cv_fill;  /* event signaling that the list became fuller */
+   x264_pthread_cond_t      cv_empty; /* event signaling that the list became emptier */
+} x264_synch_frame_list_t;
+
 typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
 typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
 typedef struct
@@ -107,7 +149,7 @@ typedef struct
     x264_deblock_intra_t deblock_h_chroma_intra;
 } x264_deblock_function_t;
 
-x264_frame_t *x264_frame_new( x264_t *h );
+x264_frame_t *x264_frame_new( x264_t *h, int b_fdec );
 void          x264_frame_delete( x264_frame_t *frame );
 
 int           x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src );
@@ -133,8 +175,18 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list );
 void          x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame );
 x264_frame_t *x264_frame_shift( x264_frame_t **list );
 void          x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
-x264_frame_t *x264_frame_pop_unused( x264_t *h );
+void          x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
+x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
+void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+                              int i_width, int i_height, x264_weight_t *w );
+x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
 void          x264_frame_sort( x264_frame_t **list, int b_dts );
+void          x264_frame_delete_list( x264_frame_t **list );
+
+int           x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem );
+void          x264_synch_frame_list_delete( x264_synch_frame_list_t *slist );
+void          x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame );
+
 #define x264_frame_sort_dts(list) x264_frame_sort(list, 1)
 #define x264_frame_sort_pts(list) x264_frame_sort(list, 0)
 
diff --git a/common/macroblock.c b/common/macroblock.c
index 836d203..10f09ac 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -33,7 +33,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     int16_t *mv_a  = h->mb.cache.mv[i_list][i8 - 1];
     int     i_refb = h->mb.cache.ref[i_list][i8 - 8];
     int16_t *mv_b  = h->mb.cache.mv[i_list][i8 - 8];
-    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width ];
+    int     i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width];
     int16_t *mv_c  = h->mb.cache.mv[i_list][i8 - 8 + i_width];
 
     int i_count = 0;
@@ -50,7 +50,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
         {
             if( i_refb == i_ref )
             {
-                *(uint32_t*)mvp = *(uint32_t*)mv_b;
+                CP32( mvp, mv_b );
                 return;
             }
         }
@@ -58,7 +58,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
         {
             if( i_refa == i_ref )
             {
-                *(uint32_t*)mvp = *(uint32_t*)mv_a;
+                CP32( mvp, mv_a );
                 return;
             }
         }
@@ -69,7 +69,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
         {
             if( i_refa == i_ref )
             {
-                *(uint32_t*)mvp = *(uint32_t*)mv_a;
+                CP32( mvp, mv_a );
                 return;
             }
         }
@@ -77,7 +77,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
         {
             if( i_refc == i_ref )
             {
-                *(uint32_t*)mvp = *(uint32_t*)mv_c;
+                CP32( mvp, mv_c );
                 return;
             }
         }
@@ -95,14 +95,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -136,14 +136,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -157,10 +157,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b  = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | *(uint32_t*)mv_a ) ||
-        !( i_refb | *(uint32_t*)mv_b ) )
+        !( i_refa | M32( mv_a ) ) ||
+        !( i_refb | M32( mv_b ) ) )
     {
-        *(uint32_t*)mv = 0;
+        M32( mv ) = 0;
     }
     else
     {
@@ -173,7 +173,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
     int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
     int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
     int i8;
-    const int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
+    const int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 );
 
@@ -190,7 +190,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         const int x8 = i8%2;
         const int y8 = i8/2;
         const int i_part_8x8 = i_mb_8x8 + x8 + y8 * h->mb.i_b8_stride;
-        const int i_ref = h->mb.map_col_to_list0[ h->fref1[0]->ref[0][ i_part_8x8 ] ];
+        const int i_ref = map_col_to_list0(h->fref1[0]->ref[0][i_part_8x8]);
 
         if( i_ref >= 0 )
         {
@@ -221,7 +221,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
 static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
 {
     int ref[2];
-    DECLARE_ALIGNED_8( int16_t mv[2][2] );
+    ALIGNED_ARRAY_8( int16_t, mv,[2],[2] );
     int i_list;
     int i8;
     const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
@@ -259,11 +259,12 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     if( ref[0] >= 0 )
         x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
     else
-        *(uint32_t*)mv[0] = 0;
+        M32( mv[0] ) = 0;
+
     if( ref[1] >= 0 )
         x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
     else
-        *(uint32_t*)mv[1] = 0;
+        M32( mv[1] ) = 0;
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
@@ -325,56 +326,58 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
 
     if( b_changed != NULL && b_available )
     {
-        int type_col = h->fref1[0]->mb_type[ h->mb.i_mb_xy ];
-        if( IS_INTRA(type_col) || type_col == P_SKIP )
+        int type_col = h->fref1[0]->mb_type[h->mb.i_mb_xy];
+        int changed = 0;
+
+        if( IS_INTRA( type_col ) || type_col == P_SKIP )
         {
-            *b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
-                      || h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
-                      || *(uint32_t*)h->mb.cache.direct_mv[0][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[0][X264_SCAN8_0]
-                      || *(uint32_t*)h->mb.cache.direct_mv[1][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[1][X264_SCAN8_0];
+            changed |= M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][X264_SCAN8_0] );
+            changed |= M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][X264_SCAN8_0] );
+            changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][X264_SCAN8_0];
+            changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][X264_SCAN8_0];
         }
         else
         {
-            int i, l;
-            *b_changed = 0;
+            int l;
             for( l = 0; l < 2; l++ )
-                for( i = 0; i < 4; i++ )
-                    *b_changed |= h->mb.cache.direct_ref[l][i] != h->mb.cache.ref[l][x264_scan8[i*4]];
-            *b_changed = *b_changed || memcmp(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+            {
+                changed |= M32( h->mb.cache.direct_mv[l][0] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 0]] );
+                if( changed ) break;
+                changed |= M32( h->mb.cache.direct_mv[l][1] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 4]] );
+                changed |= M32( h->mb.cache.direct_mv[l][2] ) ^ M32( h->mb.cache.mv[l][x264_scan8[ 8]] );
+                changed |= M32( h->mb.cache.direct_mv[l][3] ) ^ M32( h->mb.cache.mv[l][x264_scan8[12]] );
+                if( changed ) break;
+                changed |= h->mb.cache.direct_ref[l][0] ^ h->mb.cache.ref[l][x264_scan8[ 0]];
+                changed |= h->mb.cache.direct_ref[l][1] ^ h->mb.cache.ref[l][x264_scan8[ 4]];
+                changed |= h->mb.cache.direct_ref[l][2] ^ h->mb.cache.ref[l][x264_scan8[ 8]];
+                changed |= h->mb.cache.direct_ref[l][3] ^ h->mb.cache.ref[l][x264_scan8[12]];
+            }
         }
-        if( !*b_changed )
+        *b_changed = changed;
+        if( !changed )
             return b_available;
     }
 
     /* cache ref & mv */
     if( b_available )
     {
-        int i, l;
+        int l;
         for( l = 0; l < 2; l++ )
-            for( i = 0; i < 4; i++ )
-                h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
-        h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+        {
+            CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] );
+            CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] );
+            CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] );
+            CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] );
+            h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]];
+            h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]];
+            h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]];
+            h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]];
+        }
     }
 
     return b_available;
 }
 
-void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
-{
-    const int x = 2*(idx%2);
-    const int y = 2*(idx/2);
-    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
-    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
-    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
-    *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
-    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
-    *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
-}
-
 /* This just improves encoder performance, it's not part of the spec */
 void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[9][2], int *i_mvc )
 {
@@ -382,7 +385,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     int i = 0;
 
 #define SET_MVP(mvp) { \
-        *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
+        CP32( mvc[i], mvp ); \
         i++; \
     }
 
@@ -397,7 +400,11 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     {
         int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
                                          : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
-        if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+        if( lowres_mv[0][0] != 0x7fff )
+        {
+            M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+            i++;
+        }
     }
 
     /* spatial predictors */
@@ -462,72 +469,83 @@ static void setup_inverse_delta_pocs( x264_t *h )
     }
 }
 
-static inline void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[0][i8];
-    const int mvx   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int       mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int       mvy   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
 
     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height, &h->sh.weight[i_ref][0] );
 
     // chroma is offset if MCing from a field of opposite parity
     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;
 
     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );
 
+    if( h->sh.weight[i_ref][1].weightfn )
+        h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                   &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                   &h->sh.weight[i_ref][1], height*2 );
+
     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
+
+    if( h->sh.weight[i_ref][2].weightfn )
+        h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                   &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                                   &h->sh.weight[i_ref][2],height*2 );
+
 }
-static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref = h->mb.cache.ref[1][i8];
-    const int mvx   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int       mvy   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int       mvy   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
 
     h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                    h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
-                   mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+                   mvx, mvy, 4*width, 4*height, weight_none );
 
     if( h->mb.b_interlaced & i_ref )
         mvy += (h->mb.i_mb_y & 1)*4 - 2;
 
     h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
                      mvx, mvy, 2*width, 2*height );
 
     h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
-                     &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
                      mvx, mvy, 2*width, 2*height );
 }
 
-static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
+static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
     const int i_ref0 = h->mb.cache.ref[0][i8];
     const int i_ref1 = h->mb.cache.ref[1][i8];
     const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-    const int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    const int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    int       mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
-    int       mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
+    const int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    const int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
+    int       mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
+    int       mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
     int       i_mode = x264_size2pixel[height][width];
     int       i_stride0 = 16, i_stride1 = 16;
-    DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
-    DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+    ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
+    ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
     uint8_t *src0, *src1;
 
     src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
-                          mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+                          mvx0, mvy0, 4*width, 4*height, weight_none );
     src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                          mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+                          mvx1, mvy1, 4*width, 4*height, weight_none );
     h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
                        src0, i_stride0, src1, i_stride1, weight );
 
@@ -536,14 +554,14 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     if( h->mb.b_interlaced & i_ref1 )
         mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
 
-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
-    h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
                      mvx0, mvy0, 2*width, 2*height );
-    h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+    h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
                      mvx1, mvy1, 2*width, 2*height );
     h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
 }
@@ -683,7 +701,6 @@ int x264_macroblock_cache_init( x264_t *h )
 
     /* all coeffs */
     CHECKED_MALLOC( h->mb.non_zero_count, i_mb_count * 24 * sizeof(uint8_t) );
-    CHECKED_MALLOC( h->mb.nnz_backup, h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t) );
 
     if( h->param.b_cabac )
     {
@@ -694,15 +711,61 @@ int x264_macroblock_cache_init( x264_t *h )
 
     for( i=0; i<2; i++ )
     {
-        int i_refs = X264_MIN(16, (i ? 1 : h->param.i_frame_reference) + h->param.b_bframe_pyramid) << h->param.b_interlaced;
+        int i_refs = X264_MIN(16, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << h->param.b_interlaced;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+            i_refs = X264_MIN(16, i_refs + 2); //smart weights add two duplicate frames
+        else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+            i_refs = X264_MIN(16, i_refs + 1); //blind weights add one duplicate frame
+
         for( j=0; j < i_refs; j++ )
             CHECKED_MALLOC( h->mb.mvr[i][j], 2 * i_mb_count * sizeof(int16_t) );
     }
 
+    if( h->param.analyse.i_weighted_pred )
+    {
+        int i_padv = PADV << h->param.b_interlaced;
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+        int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+        int i_stride, luma_plane_size;
+        int numweightbuf;
+
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
+        {
+            // only need buffer for lookahead
+            if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
+            {
+                // Fake analysis only works on lowres
+                i_stride = ALIGN( h->sps->i_mb_width*8 + 2*PADH, align );
+                luma_plane_size = i_stride * (h->sps->i_mb_height*8+2*i_padv);
+                // Only need 1 buffer for analysis
+                numweightbuf = 1;
+            }
+            else
+                numweightbuf = 0;
+        }
+        else
+        {
+            i_stride = ALIGN( h->sps->i_mb_width*16 + 2*PADH, align );
+            luma_plane_size = i_stride * (h->sps->i_mb_height*16+2*i_padv);
+
+            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+                //SMART can weight one ref and one offset -1
+                numweightbuf = 2;
+            else
+                //blind only has one weighted copy (offset -1)
+                numweightbuf = 1;
+        }
+
+        for( i = 0; i < numweightbuf; i++ )
+            CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
+#undef ALIGN
+    }
+
     for( i=0; i<=h->param.b_interlaced; i++ )
         for( j=0; j<3; j++ )
         {
-            CHECKED_MALLOC( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] );
+            /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
+            CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
             h->mb.intra_border_backup[i][j] += 8;
         }
 
@@ -710,41 +773,6 @@ int x264_macroblock_cache_init( x264_t *h )
     memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
     memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
 
-    /* fdec:      fenc:
-     * yyyyyyy
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * yYYYY      YYYY
-     * uuu vvv    UUVV
-     * uUU vVV    UUVV
-     * uUU vVV
-     */
-    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
-    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
-    h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
-    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-    h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
-
-    h->mb.i_neighbour4[6] =
-    h->mb.i_neighbour4[9] =
-    h->mb.i_neighbour4[12] =
-    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
-    h->mb.i_neighbour4[3] =
-    h->mb.i_neighbour4[7] =
-    h->mb.i_neighbour4[11] =
-    h->mb.i_neighbour4[13] =
-    h->mb.i_neighbour4[15] =
-    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
-
-    int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
-    int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
-    int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
-    int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-        ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
-    CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
-
     return 0;
 fail: return -1;
 }
@@ -757,6 +785,9 @@ void x264_macroblock_cache_end( x264_t *h )
     for( i=0; i<2; i++ )
         for( j=0; j<32; j++ )
             x264_free( h->mb.mvr[i][j] );
+    for( i=0; i<16; i++ )
+        x264_free( h->mb.p_weight_buf[i] );
+
     if( h->param.b_cabac )
     {
         x264_free( h->mb.chroma_pred_mode );
@@ -765,12 +796,10 @@ void x264_macroblock_cache_end( x264_t *h )
     }
     x264_free( h->mb.intra4x4_pred_mode );
     x264_free( h->mb.non_zero_count );
-    x264_free( h->mb.nnz_backup );
     x264_free( h->mb.mb_transform_size );
     x264_free( h->mb.skipbp );
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
-    x264_free( h->scratch_buffer );
 }
 void x264_macroblock_slice_init( x264_t *h )
 {
@@ -791,16 +820,16 @@ void x264_macroblock_slice_init( x264_t *h )
         for( i = 0; i < h->i_ref1; i++ )
             h->fdec->ref_poc[1][i] = h->fref1[i]->i_poc;
 
-        h->mb.map_col_to_list0[-1] = -1;
-        h->mb.map_col_to_list0[-2] = -2;
+        map_col_to_list0(-1) = -1;
+        map_col_to_list0(-2) = -2;
         for( i = 0; i < h->fref1[0]->i_ref[0]; i++ )
         {
             int poc = h->fref1[0]->ref_poc[0][i];
-            h->mb.map_col_to_list0[i] = -2;
+            map_col_to_list0(i) = -2;
             for( j = 0; j < h->i_ref0; j++ )
                 if( h->fref0[j]->i_poc == poc )
                 {
-                    h->mb.map_col_to_list0[i] = j;
+                    map_col_to_list0(i) = j;
                     break;
                 }
         }
@@ -809,6 +838,37 @@ void x264_macroblock_slice_init( x264_t *h )
         memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
 
     setup_inverse_delta_pocs( h );
+
+    h->mb.i_neighbour4[6] =
+    h->mb.i_neighbour4[9] =
+    h->mb.i_neighbour4[12] =
+    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
+    h->mb.i_neighbour4[3] =
+    h->mb.i_neighbour4[7] =
+    h->mb.i_neighbour4[11] =
+    h->mb.i_neighbour4[13] =
+    h->mb.i_neighbour4[15] =
+    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+}
+
+void x264_macroblock_thread_init( x264_t *h )
+{
+    /* fdec:      fenc:
+     * yyyyyyy
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * yYYYY      YYYY
+     * uuu vvv    UUVV
+     * uUU vVV    UUVV
+     * uUU vVV
+     */
+    h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
+    h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+    h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
+    h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
+    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
+    h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
 }
 
 void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
@@ -837,8 +897,10 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
     const int i_pix_offset = h->mb.b_interlaced
                            ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
                            : w * (i_mb_x + i_mb_y * i_stride);
+    const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+    const uint8_t *intra_fdec = h->param.b_sliced_threads ? plane_fdec-i_stride2 :
+                                &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
     int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
-    const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i];
     x264_frame_t **fref[2] = { h->fref0, h->fref1 };
     int j, k;
     if( h->mb.b_interlaced )
@@ -847,19 +909,25 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb
     h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
     h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
         h->mb.pic.p_fenc_plane[i], i_stride2, w );
-    memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
-    if( h->mb.b_interlaced )
-    {
-        const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+    if( i_mb_y > 0 )
+        memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+    else
+        memset( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], 0, w*3/2+1 );
+    if( h->mb.b_interlaced || h->mb.b_reencode_mb )
         for( j = 0; j < w; j++ )
             h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
-    }
     for( j = 0; j < h->mb.pic.i_fref[0]; j++ )
     {
         h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]];
         if( i == 0 )
+        {
             for( k = 1; k < 4; k++ )
                 h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]];
+            if( h->sh.weight[j][0].weightfn )
+                h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> h->mb.b_interlaced][ref_pix_offset[j&1]];
+            else
+                h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
+        }
     }
     if( h->sh.i_type == SLICE_TYPE_B )
         for( j = 0; j < h->mb.pic.i_fref[1]; j++ )
@@ -894,24 +962,28 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     h->mb.i_b4_xy = i_mb_4x4;
     h->mb.i_mb_top_xy = i_top_xy;
     h->mb.i_neighbour = 0;
+    h->mb.i_neighbour_intra = 0;
 
     /* load cache */
     if( i_top_xy >= h->sh.i_first_mb )
     {
         h->mb.i_mb_type_top =
-        i_top_type= h->mb.type[i_top_xy];
+        i_top_type = h->mb.type[i_top_xy];
         h->mb.cache.i_cbp_top = h->mb.cbp[i_top_xy];
 
         h->mb.i_neighbour |= MB_TOP;
 
+        if( !h->param.b_constrained_intra || IS_INTRA( i_top_type ) )
+            h->mb.i_neighbour_intra |= MB_TOP;
+
         /* load intra4x4 */
-        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
+        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[i_top_xy][0] );
 
         /* load non_zero_count */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[i_top_xy][12] );
         /* shift because x264_scan8[16] is misaligned */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][18] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][22] ) << 8;
     }
     else
     {
@@ -919,20 +991,12 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.cache.i_cbp_top = -1;
 
         /* load intra4x4 */
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[1] - 8] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[4] - 8] =
-        h->mb.cache.intra4x4_pred_mode[x264_scan8[5] - 8] = -1;
+        M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
 
         /* load non_zero_count */
-        h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[1] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[4] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[5] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[16+0] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[16+1] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 8] =
-        h->mb.cache.non_zero_count[x264_scan8[16+4+1] - 8] = 0x80;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[   0] - 8] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
     }
 
     if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -944,6 +1008,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 
         h->mb.i_neighbour |= MB_LEFT;
 
+        if( !h->param.b_constrained_intra || IS_INTRA( i_left_type ) )
+            h->mb.i_neighbour_intra |= MB_LEFT;
+
         /* load intra4x4 */
         h->mb.cache.intra4x4_pred_mode[x264_scan8[0 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][4];
         h->mb.cache.intra4x4_pred_mode[x264_scan8[2 ] - 1] = h->mb.intra4x4_pred_mode[i_left_xy][5];
@@ -987,6 +1054,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     {
         h->mb.i_neighbour |= MB_TOPRIGHT;
         h->mb.i_mb_type_topright = h->mb.type[ i_top_xy + 1 ];
+        if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) )
+            h->mb.i_neighbour_intra |= MB_TOPRIGHT;
     }
     else
         h->mb.i_mb_type_topright = -1;
@@ -994,6 +1063,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     {
         h->mb.i_neighbour |= MB_TOPLEFT;
         h->mb.i_mb_type_topleft = h->mb.type[ i_top_xy - 1 ];
+        if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) )
+            h->mb.i_neighbour_intra |= MB_TOPLEFT;
     }
     else
         h->mb.i_mb_type_topleft = -1;
@@ -1014,7 +1085,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
           + !!(h->mb.i_neighbour & MB_TOP);
     }
 
-    if( !h->mb.b_interlaced )
+    if( !h->mb.b_interlaced && !h->mb.b_reencode_mb )
     {
         copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
         copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
@@ -1060,13 +1131,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 const int ir = i_top_8x8 - 1;
                 const int iv = i_top_4x4 - 1;
                 h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
-                *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+                CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
             }
             else
             {
                 const int i8 = x264_scan8[0] - 1 - 1*8;
                 h->mb.cache.ref[i_list][i8] = -2;
-                *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+                M32( h->mb.cache.mv[i_list][i8] ) = 0;
             }
 
             if( h->mb.i_neighbour & MB_TOP )
@@ -1078,15 +1149,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
                 h->mb.cache.ref[i_list][i8+2] =
                 h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
-                *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
-                *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
+                CP64( h->mb.cache.mv[i_list][i8+0], h->mb.mv[i_list][iv+0] );
+                CP64( h->mb.cache.mv[i_list][i8+2], h->mb.mv[i_list][iv+2] );
             }
             else
             {
                 const int i8 = x264_scan8[0] - 8;
-                *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
-                *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
-                *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
+                M64( h->mb.cache.mv[i_list][i8+0] ) = 0;
+                M64( h->mb.cache.mv[i_list][i8+2] ) = 0;
+                M32( &h->mb.cache.ref[i_list][i8] ) = (uint8_t)(-2) * 0x01010101U;
             }
 
             if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1095,13 +1166,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 const int ir = i_top_8x8 + 2;
                 const int iv = i_top_4x4 + 4;
                 h->mb.cache.ref[i_list][i8]  = h->mb.ref[i_list][ir];
-                *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+                CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
             }
             else
             {
                 const int i8 = x264_scan8[0] + 4 - 1*8;
                 h->mb.cache.ref[i_list][i8] = -2;
-                *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+                M32( h->mb.cache.mv[i_list][i8] ) = 0;
             }
 
             if( h->mb.i_neighbour & MB_LEFT )
@@ -1114,10 +1185,10 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 h->mb.cache.ref[i_list][i8+2*8] =
                 h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
 
-                *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
-                *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
-                *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
-                *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
+                CP32( h->mb.cache.mv[i_list][i8+0*8], h->mb.mv[i_list][iv + 0*s4x4] );
+                CP32( h->mb.cache.mv[i_list][i8+1*8], h->mb.mv[i_list][iv + 1*s4x4] );
+                CP32( h->mb.cache.mv[i_list][i8+2*8], h->mb.mv[i_list][iv + 2*s4x4] );
+                CP32( h->mb.cache.mv[i_list][i8+3*8], h->mb.mv[i_list][iv + 3*s4x4] );
             }
             else
             {
@@ -1125,7 +1196,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 for( i = 0; i < 4; i++ )
                 {
                     h->mb.cache.ref[i_list][i8+i*8] = -2;
-                    *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
+                    M32( h->mb.cache.mv[i_list][i8+i*8] ) = 0;
                 }
             }
 
@@ -1135,45 +1206,49 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 {
                     const int i8 = x264_scan8[0] - 8;
                     const int iv = i_top_4x4;
-                    *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
-                    *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
+                    CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
+                    CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
                 }
                 else
                 {
                     const int i8 = x264_scan8[0] - 8;
-                    *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
-                    *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
+                    M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
+                    M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
                 }
 
                 if( i_left_type >= 0 )
                 {
                     const int i8 = x264_scan8[0] - 1;
                     const int iv = i_mb_4x4 - 1;
-                    *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
-                    *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
-                    *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
-                    *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
+                    CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+                    CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+                    CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+                    CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
                 }
                 else
                 {
                     const int i8 = x264_scan8[0] - 1;
                     for( i = 0; i < 4; i++ )
-                        *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
+                        M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
                 }
             }
         }
 
         /* load skip */
-        if( h->sh.i_type == SLICE_TYPE_B && h->param.b_cabac )
+        if( h->sh.i_type == SLICE_TYPE_B )
         {
-            uint8_t skipbp;
-            x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
-            skipbp = i_left_type >= 0 ? h->mb.skipbp[i_left_xy] : 0;
-            h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
-            h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
-            skipbp = i_top_type >= 0 ? h->mb.skipbp[i_top_xy] : 0;
-            h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
-            h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
+            h->mb.bipred_weight = h->mb.bipred_weight_buf[h->mb.b_interlaced&(i_mb_y&1)];
+            if( h->param.b_cabac )
+            {
+                uint8_t skipbp;
+                x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
+                skipbp = i_left_type >= 0 ? h->mb.skipbp[i_left_xy] : 0;
+                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
+                skipbp = i_top_type >= 0 ? h->mb.skipbp[i_top_xy] : 0;
+                h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
+                h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
+            }
         }
 
         if( h->sh.i_type == SLICE_TYPE_P )
@@ -1181,20 +1256,20 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
     }
 
     h->mb.i_neighbour4[0] =
-    h->mb.i_neighbour8[0] = (h->mb.i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT))
-                            | ((h->mb.i_neighbour & MB_TOP) ? MB_TOPRIGHT : 0);
+    h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT))
+                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0);
     h->mb.i_neighbour4[4] =
-    h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
+    h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0);
     h->mb.i_neighbour4[2] =
     h->mb.i_neighbour4[8] =
     h->mb.i_neighbour4[10] =
-    h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
+    h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0);
     h->mb.i_neighbour4[5] =
-    h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT)
-                            | ((h->mb.i_neighbour & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
+    h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT)
+                            | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0);
 }
 
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
 {
     int w = i ? 8 : 16;
     int i_stride = h->fdec->i_stride[!!i];
@@ -1221,7 +1296,7 @@ void x264_macroblock_cache_save( x264_t *h )
     int8_t *intra4x4_pred_mode = h->mb.intra4x4_pred_mode[i_mb_xy];
     uint8_t *non_zero_count = h->mb.non_zero_count[i_mb_xy];
 
-    int i, y;
+    int y;
 
     x264_macroblock_store_pic( h, 0 );
     x264_macroblock_store_pic( h, 1 );
@@ -1235,13 +1310,16 @@ void x264_macroblock_cache_save( x264_t *h )
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
-        *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
+    else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
+        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
     else
-        *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
+        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
+
 
     if( i_mb_type == I_PCM )
     {
@@ -1251,20 +1329,19 @@ void x264_macroblock_cache_save( x264_t *h )
         h->mb.i_cbp_luma = 0xf;
         h->mb.cbp[i_mb_xy] = 0x72f;   /* all set */
         h->mb.b_transform_8x8 = 0;
-        for( i = 0; i < 16 + 2*4; i++ )
-            non_zero_count[i] = 16;
+        memset( non_zero_count, 16, 24 );
     }
     else
     {
         /* save non zero count */
-        *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
-        *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
-        *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
-        *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
-        *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
 
         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
@@ -1287,8 +1364,8 @@ void x264_macroblock_cache_save( x264_t *h )
             h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
+                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
             }
             if( h->sh.i_type == SLICE_TYPE_B )
             {
@@ -1298,8 +1375,8 @@ void x264_macroblock_cache_save( x264_t *h )
                 h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
-                    *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
+                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
                 }
             }
         }
@@ -1308,12 +1385,12 @@ void x264_macroblock_cache_save( x264_t *h )
             int i_list;
             for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
             {
-                *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
-                *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+                M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+                M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
-                    *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
+                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
                 }
             }
         }
@@ -1330,28 +1407,28 @@ void x264_macroblock_cache_save( x264_t *h )
         {
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
+                CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
             }
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
+                    CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
                 }
         }
         else
         {
             for( y = 0; y < 4; y++ )
             {
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
-                *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
+                M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
             }
             if( h->sh.i_type == SLICE_TYPE_B )
                 for( y = 0; y < 4; y++ )
                 {
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
-                    *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
+                    M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
                 }
         }
 
@@ -1373,45 +1450,50 @@ void x264_macroblock_cache_save( x264_t *h )
     }
 }
 
+/* Fill h->mb.bipred_weight_buf (and, without MBAFF, h->mb.dist_scale_factor) for every ref0/ref1 pair from POC distances. */
 void x264_macroblock_bipred_init( x264_t *h )
 {
-    int i_ref0, i_ref1;
-    for( i_ref0 = 0; i_ref0 < h->i_ref0; i_ref0++ )
-    {
-        int poc0 = h->fref0[i_ref0]->i_poc;
-        for( i_ref1 = 0; i_ref1 < h->i_ref1; i_ref1++ )
+    int i_ref0, i_ref1, field;
+    for( field = 0; field <= h->sh.b_mbaff; field++ ) /* one pass (field 0) normally, two passes (fields 0 and 1) when b_mbaff is set */
+        for( i_ref0 = 0; i_ref0 < (h->i_ref0<<h->sh.b_mbaff); i_ref0++ ) /* the index range is doubled when b_mbaff is set */
         {
-            int dist_scale_factor;
-            int poc1 = h->fref1[i_ref1]->i_poc;
-            int td = x264_clip3( poc1 - poc0, -128, 127 );
-            if( td == 0 /* || pic0 is a long-term ref */ )
-                dist_scale_factor = 256;
-            else
+            int poc0 = h->fref0[i_ref0>>h->sh.b_mbaff]->i_poc; /* doubled index >> b_mbaff selects the underlying frame */
+            if( h->sh.b_mbaff && field^(i_ref0&1) ) /* low bit of the index differs from the field being processed */
+                poc0 += h->sh.i_delta_poc_bottom;
+            for( i_ref1 = 0; i_ref1 < (h->i_ref1<<h->sh.b_mbaff); i_ref1++ )
             {
-                int tb = x264_clip3( h->fdec->i_poc - poc0, -128, 127 );
-                int tx = (16384 + (abs(td) >> 1)) / td;
-                dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
-            }
-            h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+                int dist_scale_factor;
+                int poc1 = h->fref1[i_ref1>>h->sh.b_mbaff]->i_poc;
+                if( h->sh.b_mbaff && field^(i_ref1&1) ) /* same parity adjustment as for poc0 */
+                    poc1 += h->sh.i_delta_poc_bottom;
+                int cur_poc = h->fdec->i_poc + field*h->sh.i_delta_poc_bottom; /* depends on field only, not on either ref index */
+                int td = x264_clip3( poc1 - poc0, -128, 127 );
+                if( td == 0 /* || pic0 is a long-term ref */ )
+                    dist_scale_factor = 256;
+                else
+                {
+                    int tb = x264_clip3( cur_poc - poc0, -128, 127 );
+                    int tx = (16384 + (abs(td) >> 1)) / td;
+                    dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
+                }
 
-            dist_scale_factor >>= 2;
-            if( h->param.analyse.b_weighted_bipred
-                  && dist_scale_factor >= -64
-                  && dist_scale_factor <= 128 )
-            {
-                h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor;
-                // ssse3 implementation of biweight doesn't support the extrema.
-                // if we ever generate them, we'll have to drop that optimization.
-                assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+                // FIXME: will need this if we ever do temporal MV pred with interlaced
+                if( !h->sh.b_mbaff )
+                    h->mb.dist_scale_factor[i_ref0][i_ref1] = dist_scale_factor;
+
+                dist_scale_factor >>= 2; /* the weight below is derived from the factor divided by 4 */
+                if( h->param.analyse.b_weighted_bipred
+                      && dist_scale_factor >= -64
+                      && dist_scale_factor <= 128 )
+                {
+                    h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 64 - dist_scale_factor;
+                    // ssse3 implementation of biweight doesn't support the extrema.
+                    // if we ever generate them, we'll have to drop that optimization.
+                    assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
+                }
+                else
+                    h->mb.bipred_weight_buf[field][i_ref0][i_ref1] = 32; /* fixed weight when weighting is off or the factor is out of range; presumably half of a total of 64 -- confirm */
             }
-            else
-                h->mb.bipred_weight[i_ref0][i_ref1] = 32;
         }
-    }
-    if( h->sh.b_mbaff )
-    {
-        for( i_ref0 = 2*h->i_ref0-1; i_ref0 >= 0; i_ref0-- )
-            for( i_ref1 = 2*h->i_ref1-1; i_ref1 >= 0; i_ref1-- )
-                h->mb.bipred_weight[i_ref0][i_ref1] = h->mb.bipred_weight[i_ref0>>1][i_ref1>>1];
-    }
 }
+
diff --git a/common/macroblock.h b/common/macroblock.h
index d16b8de..48f3105 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -263,6 +263,7 @@ enum cabac_ctx_block_cat_e
 
 int  x264_macroblock_cache_init( x264_t *h );
 void x264_macroblock_slice_init( x264_t *h );
+void x264_macroblock_thread_init( x264_t *h );
 void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y );
 void x264_macroblock_cache_save( x264_t *h );
 void x264_macroblock_cache_end( x264_t *h );
@@ -291,10 +292,6 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
  *      if b_changed != NULL, set it to whether refs or mvs differ from
  *      before this functioncall. */
 int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed );
-/* x264_mb_load_mv_direct8x8:
- *      set h->mb.cache.mv and h->mb.cache.ref for B_DIRECT
- *      must be called only after x264_mb_predict_mv_direct16x16 */
-void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
 /* x264_mb_predict_mv_ref16x16:
  *      set mvc with D_16x16 prediction.
  *      uses all neighbors, even those that didn't end up using this ref.
@@ -338,21 +335,22 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
 {
+    uint32_t *d = dst; /* used by BOTH branches: d+k is always 4*k bytes past dst, so d+2 is 8 bytes on (presumably one cache row -- confirm) */
     if( width == 4 )
     {
         uint32_t val2 = val * 0x01010101;
-                          ((uint32_t*)dst)[0] = val2;
-        if( height >= 2 ) ((uint32_t*)dst)[2] = val2;
-        if( height == 4 ) ((uint32_t*)dst)[4] = val2;
-        if( height == 4 ) ((uint32_t*)dst)[6] = val2;
+                          M32( d+0 ) = val2; /* four copies of val in one 32-bit store */
+        if( height >= 2 ) M32( d+2 ) = val2;
+        if( height == 4 ) M32( d+4 ) = val2;
+        if( height == 4 ) M32( d+6 ) = val2;
     }
     else // 2
     {
         uint32_t val2 = val * 0x0101;
-                          ((uint16_t*)dst)[ 0] = val2;
-        if( height >= 2 ) ((uint16_t*)dst)[ 4] = val2;
-        if( height == 4 ) ((uint16_t*)dst)[ 8] = val2;
-        if( height == 4 ) ((uint16_t*)dst)[12] = val2;
+                          M16( d+0 ) = val2; /* 16-bit store through the uint32_t pointer: d+2k is the same byte offset as the old ((uint16_t*)dst)[4k] */
+        if( height >= 2 ) M16( d+2 ) = val2;
+        if( height == 4 ) M16( d+4 ) = val2;
+        if( height == 4 ) M16( d+6 ) = val2;
     }
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
@@ -360,25 +358,27 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int
     int dy;
     if( width == 1 || WORD_SIZE < 8 )
     {
+        uint32_t *d = dst; /* 32-bit path: row dy starts 8 uint32_t (32 bytes) after row dy-1 */
         for( dy = 0; dy < height; dy++ )
         {
-                             ((uint32_t*)dst)[8*dy+0] = val;
-            if( width >= 2 ) ((uint32_t*)dst)[8*dy+1] = val;
-            if( width == 4 ) ((uint32_t*)dst)[8*dy+2] = val;
-            if( width == 4 ) ((uint32_t*)dst)[8*dy+3] = val;
+                             M32( d+8*dy+0 ) = val;
+            if( width >= 2 ) M32( d+8*dy+1 ) = val;
+            if( width == 4 ) M32( d+8*dy+2 ) = val;
+            if( width == 4 ) M32( d+8*dy+3 ) = val;
         }
     }
     else
     {
         uint64_t val64 = val + ((uint64_t)val<<32);
+        uint64_t *d = dst; /* 64-bit path: each store writes two 32-bit entries; 4 uint64_t per row is the same 32-byte row stride */
         for( dy = 0; dy < height; dy++ )
         {
-                             ((uint64_t*)dst)[4*dy+0] = val64;
-            if( width == 4 ) ((uint64_t*)dst)[4*dy+1] = val64;
+                             M64( d+4*dy+0 ) = val64;
+            if( width == 4 ) M64( d+4*dy+1 ) = val64;
         }
     }
 }
-#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
 static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
     x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
@@ -401,22 +401,20 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
     cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
 }
 #define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-#define array_non_zero_int array_non_zero_int_c
-static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
+#define array_non_zero_int array_non_zero_int
+static ALWAYS_INLINE int array_non_zero_int( int16_t *v, int i_count ) /* returns 1 if any of the i_count BYTES at v is non-zero; array_non_zero() passes sizeof(a) */
 {
-    union {uint16_t s[4]; uint64_t l;} *x = v;
     if(i_count == 8)
-        return !!x[0].l;
+        return !!M64( &v[0] ); /* 8 bytes = one 64-bit word */
     else if(i_count == 16)
-        return !!(x[0].l|x[1].l);
+        return !!(M64( &v[0] ) | M64( &v[4] )); /* v is int16_t*, so consecutive 64-bit words are 4 elements apart */
     else if(i_count == 32)
-        return !!(x[0].l|x[1].l|x[2].l|x[3].l);
+        return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] ));
     else
     {
         int i;
-        i_count /= sizeof(uint64_t);
-        for( i = 0; i < i_count; i++ )
-            if( x[i].l ) return 1;
+        for( i = 0; i < i_count/(int)sizeof(uint64_t); i++ ) /* i_count is bytes: test i_count/8 words as the removed loop did; bounding an int16_t index by i_count would read 2*i_count bytes */
+            if( M64( &v[4*i] ) ) return 1; /* word i starts at element 4*i */
         return 0;
     }
 }
@@ -462,7 +460,7 @@ static inline int x264_mb_transform_8x8_allowed( x264_t *h )
         return 0;
     if( h->mb.i_type != P_8x8 )
         return partition_tab[h->mb.i_type];
-    return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
+    return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
 }
 
 #endif
diff --git a/common/mc.c b/common/mc.c
index e5d6cc8..ac740cf 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -29,6 +29,9 @@
 #ifdef ARCH_PPC
 #include "ppc/mc.h"
 #endif
+#ifdef ARCH_ARM
+#include "arm/mc.h"
+#endif
 
 
 static inline void pixel_avg( uint8_t *dst,  int i_dst_stride,
@@ -117,6 +120,67 @@ PIXEL_AVG_C( pixel_avg_4x2,   4, 2 )
 PIXEL_AVG_C( pixel_avg_2x4,   2, 4 )
 PIXEL_AVG_C( pixel_avg_2x2,   2, 2 )
 
+static void x264_weight_cache( x264_t *h, x264_weight_t *w ) /* C version of the weight-cache hook: nothing is precomputed */
+{
+    w->weightfn = h->mc.weight; /* a non-NULL weightfn is what mc_luma/get_ref test before applying mc_weight */
+}
+#define opscale(x) dst[x] = x264_clip_uint8( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
+#define opscale_noden(x) dst[x] = x264_clip_uint8( src[x] * weight->i_scale + weight->i_offset )
+static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+{
+    /* Weight an i_width x i_height block of src into dst (the two may alias:
+     * mc_luma passes dst for both).  With i_denom >= 1 the scaled value is
+     * rounded and shifted; otherwise only scale and offset are applied. */
+    const int b_shift = weight->i_denom >= 1;
+    int col, row;
+    for( row = 0; row < i_height; row++ )
+    {
+        if( b_shift )
+            for( col = 0; col < i_width; col++ )
+                opscale( col );
+        else
+            for( col = 0; col < i_width; col++ )
+                opscale_noden( col );
+        dst += i_dst_stride;
+        src += i_src_stride;
+    }
+}
+
+#define MC_WEIGHT_C( name, lx ) /* defines static name(): same arithmetic as mc_weight, with the width fixed at lx */ \
+    static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
+{ \
+    int x, y; \
+    if( weight->i_denom >= 1 ) /* rounding-and-shift form */ \
+    { \
+        for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+            for( x = 0; x < lx; x++ ) \
+                opscale( x ); \
+    } \
+    else /* scale and offset only, no shift */ \
+    { \
+        for( y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
+            for( x = 0; x < lx; x++ ) \
+                opscale_noden( x ); \
+    } \
+}
+
+MC_WEIGHT_C( mc_weight_w20, 20 )
+MC_WEIGHT_C( mc_weight_w16, 16 )
+MC_WEIGHT_C( mc_weight_w12, 12 )
+MC_WEIGHT_C( mc_weight_w8,   8 )
+MC_WEIGHT_C( mc_weight_w4,   4 )
+MC_WEIGHT_C( mc_weight_w2,   2 )
+
+static weight_fn_t x264_mc_weight_wtab[6] = /* fixed-width weight functions in increasing width order; NOTE(review): the index-to-width mapping is chosen by code not visible here */
+{
+    mc_weight_w2,
+    mc_weight_w4,
+    mc_weight_w8,
+    mc_weight_w12,
+    mc_weight_w16,
+    mc_weight_w20,
+};
+const x264_weight_t weight_none[3] = { {{0}} }; /* all members zero-initialised, so weightfn is null and mc_luma/get_ref skip weighting */
 static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
 {
     int y;
@@ -160,7 +224,7 @@ static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
 static void mc_luma( uint8_t *dst,    int i_dst_stride,
                      uint8_t *src[4], int i_src_stride,
                      int mvx, int mvy,
-                     int i_width, int i_height )
+                     int i_width, int i_height, const x264_weight_t *weight )
 {
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -171,17 +235,19 @@ static void mc_luma( uint8_t *dst,    int i_dst_stride,
         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
         pixel_avg( dst, i_dst_stride, src1, i_src_stride,
                    src2, i_src_stride, i_width, i_height );
+        if( weight->weightfn )
+            mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height );
     }
+    else if( weight->weightfn )
+        mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
     else
-    {
         mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
-    }
 }
 
 static uint8_t *get_ref( uint8_t *dst,   int *i_dst_stride,
                          uint8_t *src[4], int i_src_stride,
                          int mvx, int mvy,
-                         int i_width, int i_height )
+                         int i_width, int i_height, const x264_weight_t *weight )
 {
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -192,6 +258,13 @@ static uint8_t *get_ref( uint8_t *dst,   int *i_dst_stride,
         uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
         pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
                    src2, i_src_stride, i_width, i_height );
+        if( weight->weightfn )
+            mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height );
         return dst;
     }
     else
@@ -314,7 +387,7 @@ void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
     // duplicate last row and column so that their interpolation doesn't have to be special-cased
     for( y=0; y<i_height; y++ )
         src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
-    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), i_width );
+    memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), i_width+1 );
     h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
                                   i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
     x264_frame_expand_border_lowres( frame );
@@ -356,6 +429,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
     }
 }
 
+#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
+// gcc isn't smart enough to use the "idiv" instruction for a 64-bit by 32-bit signed divide
+static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) { // returns the 32-bit quotient x / y
+    int32_t quotient, remainder; // remainder is produced by idiv but never read
+    asm("idiv %4" // %4 is the divisor operand y
+        :"=a"(quotient), "=d"(remainder) // outputs: eax -> quotient, edx -> remainder
+        :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y) // inputs: eax = low half of x, edx = high half, y in another register
+    ); // NOTE(review): idiv faults when y == 0 or the quotient does not fit in 32 bits -- confirm callers rule this out
+    return quotient;
+}
+#else
+#define div_64_32(x,y) ((x)/(y)) // portable fallback: ordinary C division
+#endif
+
+/* Estimate the total amount of influence on future quality that could be had if we
+ * were to improve the reference samples used to inter predict any given macroblock. */
+static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len ) // all arrays are indexed 0..len-1
+{
+    int i;
+    for( i=0; i<len; i++ ) // each element is computed independently
+    {
+        int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8); // incoming amount + intra cost * inv_qscale, rounded and >>8
+        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - inter_costs[i]), intra_costs[i]); // amount * (intra-inter)/intra; NOTE(review): assumes intra_costs[i] != 0 -- confirm at call sites
+    }
+}
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma;
@@ -373,6 +473,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     pf->avg[PIXEL_2x4]  = pixel_avg_2x4;
     pf->avg[PIXEL_2x2]  = pixel_avg_2x2;
 
+    pf->weight    = x264_mc_weight_wtab;
+    pf->offsetadd = x264_mc_weight_wtab;
+    pf->offsetsub = x264_mc_weight_wtab;
+    pf->weight_cache = x264_weight_cache;
+
     pf->copy_16x16_unaligned = mc_copy_w16;
     pf->copy[PIXEL_16x16] = mc_copy_w16;
     pf->copy[PIXEL_8x8]   = mc_copy_w8;
@@ -392,6 +497,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     pf->integral_init4v = integral_init4v;
     pf->integral_init8v = integral_init8v;
 
+    pf->mbtree_propagate_cost = mbtree_propagate_cost;
+
 #ifdef HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
 #endif
@@ -399,6 +506,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     if( cpu&X264_CPU_ALTIVEC )
         x264_mc_altivec_init( pf );
 #endif
+#ifdef HAVE_ARMV6
+    x264_mc_init_arm( cpu, pf );
+#endif
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
diff --git a/common/mc.h b/common/mc.h
index 594940f..68bba48 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -21,6 +21,33 @@
 #ifndef X264_MC_H
 #define X264_MC_H
 
+struct x264_weight_t;
+typedef void (* weight_fn_t)( uint8_t *, int, uint8_t *,int, const struct x264_weight_t *, int ); /* args: dst, dst stride, src, src stride, weight, height */
+typedef struct x264_weight_t
+{
+    /* aligning the first member is a gcc hack to force the struct to be
+     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
+    ALIGNED_16( int16_t cachea[8] ); /* NOTE(review): presumably scratch data prepared by weight_cache -- confirm */
+    int16_t cacheb[8];
+    int32_t i_denom; /* the C weight functions switch code path on i_denom >= 1 */
+    int32_t i_scale;
+    int32_t i_offset;
+    weight_fn_t *weightfn; /* table of weight functions; NULL means no weighting is applied */
+} ALIGNED_16( x264_weight_t );
+
+extern const x264_weight_t weight_none[3];
+
+#define SET_WEIGHT( w, b, s, d, o ) /* store scale s, denom d, offset o in w; requires an x264_t *h in scope */\
+{\
+    (w).i_scale = (s);\
+    (w).i_denom = (d);\
+    (w).i_offset = (o);\
+    if( b )\
+        h->mc.weight_cache( h, &(w) ); /* weighting enabled: pass w to the weight_cache hook */\
+    else\
+        (w).weightfn = NULL; /* a NULL weightfn makes mc_luma/get_ref skip weighting */\
+}
+
 /* Do the MC
  * XXX: Only width = 4, 8 or 16 are valid
  * width == 4 -> height == 4 or 8
@@ -32,12 +59,12 @@ typedef struct
 {
     void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
                     int mvx, int mvy,
-                    int i_width, int i_height );
+                    int i_width, int i_height, const x264_weight_t *weight );
 
     /* may round up the dimensions if they're not a power of 2 */
     uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
                         int mvx, int mvy,
-                        int i_width, int i_height );
+                        int i_width, int i_height, const x264_weight_t *weight );
 
     /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
      * so it must be run from left to right. */
@@ -74,6 +101,13 @@ typedef struct
 
     void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                     int src_stride, int dst_stride, int width, int height );
+    weight_fn_t *weight;
+    weight_fn_t *offsetadd;
+    weight_fn_t *offsetsub;
+    void (*weight_cache)( x264_t *, x264_weight_t * );
+
+    void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/common/mdate.c b/common/mdate.c
index 1a02cdf..7a1c8a5 100644
--- a/common/mdate.c
+++ b/common/mdate.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#ifndef __MINGW32__
 #include <sys/time.h>
 #else
 #include <sys/types.h>
@@ -31,9 +31,8 @@
 
 int64_t x264_mdate( void )
 {
-#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#ifndef __MINGW32__
     struct timeval tv_date;
-
     gettimeofday( &tv_date, NULL );
     return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );
 #else
diff --git a/common/osdep.h b/common/osdep.h
index 168d6b2..7f680ed 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -27,6 +27,9 @@
 #define _LARGEFILE_SOURCE 1
 #define _FILE_OFFSET_BITS 64
 #include <stdio.h>
+#include <sys/stat.h>
+
+#include "config.h"
 
 #ifdef HAVE_STDINT_H
 #include <stdint.h>
@@ -34,30 +37,18 @@
 #include <inttypes.h>
 #endif
 
+#ifndef HAVE_LOG2F
+#define log2f(x) (logf((x))/0.693147180559945f)
+#endif
+
 #ifdef _WIN32
 #include <io.h>    // _setmode()
 #include <fcntl.h> // _O_BINARY
 #endif
 
-#ifdef _MSC_VER
-#define inline __inline
-#define strcasecmp stricmp
-#define strncasecmp strnicmp
-#define snprintf _snprintf
-#define fseek _fseeki64
-#define ftell _ftelli64
-#define isfinite _finite
-#define strtok_r strtok_s
-#define _CRT_SECURE_NO_DEPRECATE
-#define X264_VERSION "" // no configure script for msvc
-#endif
-
 #if (defined(SYS_OPENBSD) && !defined(isfinite)) || defined(SYS_SunOS)
 #define isfinite finite
 #endif
-#if defined(_MSC_VER) || defined(SYS_SunOS) || defined(SYS_MACOSX)
-#define sqrtf sqrt
-#endif
 #ifdef _WIN32
 #define rename(src,dst) (unlink(dst), rename(src,dst)) // POSIX says that rename() removes the destination, but win32 doesn't.
 #ifndef strtok_r
@@ -65,33 +56,63 @@
 #endif
 #endif
 
-#ifdef _MSC_VER
-#define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
-#else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
+#define ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
+
+// ARM compilers don't reliably align stack variables
+// - EABI requires only 8 byte stack alignment to be maintained
+// - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function
+// - armcc can't either, but is nice enough to actually tell you so
+// - Apple gcc only maintains 4 byte alignment
+// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...
+#if defined(ARCH_ARM) && defined(SYS_MACOSX)
+#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
+    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 7]; \
+    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+7) & ~7)
+#else
+#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
+    ALIGNED_8( type name sub1 __VA_ARGS__ )
+#endif
+
+#ifdef ARCH_ARM
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 15];\
+    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+15) & ~15)
+#else
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+    ALIGNED_16( type name sub1 __VA_ARGS__ )
 #endif
-#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define DECLARE_ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
-#define DECLARE_ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
 
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
 #define UNUSED __attribute__((unused))
 #define ALWAYS_INLINE __attribute__((always_inline)) inline
 #define NOINLINE __attribute__((noinline))
+#define MAY_ALIAS __attribute__((may_alias))
+#define x264_constant_p(x) __builtin_constant_p(x)
 #else
 #define UNUSED
 #define ALWAYS_INLINE inline
 #define NOINLINE
+#define MAY_ALIAS
+#define x264_constant_p(x) 0
 #endif
 
 /* threads */
 #if defined(SYS_BEOS)
 #include <kernel/OS.h>
 #define x264_pthread_t               thread_id
-#define x264_pthread_create(t,u,f,d) { *(t)=spawn_thread(f,"",10,d); \
-                                       resume_thread(*(t)); }
+static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(void *), void *d ) /* BeOS version; a is ignored */
+{
+     *t = spawn_thread( f, "", 10, d ); /* empty thread name; NOTE(review): 10 is presumably the thread priority -- confirm against the BeOS Kernel Kit */
+     if( *t < B_NO_ERROR )
+         return -1; /* spawn failed: report an error and do not resume anything */
+     resume_thread( *t ); /* start the newly spawned thread */
+     return 0; /* success */
+}
 #define x264_pthread_join(t,s)       { long tmp; \
-                                       wait_for_thread(t,(s)?(long*)(s):&tmp); }
+                                       wait_for_thread(t,(s)?(long*)(*(s)):&tmp); }
 #ifndef usleep
 #define usleep(t)                    snooze(t)
 #endif
@@ -103,7 +124,7 @@
 
 #else
 #define x264_pthread_t               int
-#define x264_pthread_create(t,u,f,d)
+#define x264_pthread_create(t,u,f,d) 0
 #define x264_pthread_join(t,s)
 #endif //SYS_*
 
@@ -121,39 +142,53 @@
 #define x264_pthread_cond_destroy    pthread_cond_destroy
 #define x264_pthread_cond_broadcast  pthread_cond_broadcast
 #define x264_pthread_cond_wait       pthread_cond_wait
+#define x264_pthread_attr_t          pthread_attr_t
+#define x264_pthread_attr_init       pthread_attr_init
+#define x264_pthread_attr_destroy    pthread_attr_destroy
+#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
 #else
 #define x264_pthread_mutex_t         int
-#define x264_pthread_mutex_init(m,f)
+#define x264_pthread_mutex_init(m,f) 0
 #define x264_pthread_mutex_destroy(m)
 #define x264_pthread_mutex_lock(m)
 #define x264_pthread_mutex_unlock(m)
 #define x264_pthread_cond_t          int
-#define x264_pthread_cond_init(c,f)
+#define x264_pthread_cond_init(c,f)  0
 #define x264_pthread_cond_destroy(c)
 #define x264_pthread_cond_broadcast(c)
 #define x264_pthread_cond_wait(c,m)
+#define x264_pthread_attr_t          int
+#define x264_pthread_attr_init(a)    0
+#define x264_pthread_attr_destroy(a)
+#define X264_PTHREAD_MUTEX_INITIALIZER 0
 #endif
 
 #define WORD_SIZE sizeof(void*)
 
+#define asm __asm__
+
 #if !defined(_WIN64) && !defined(__LP64__)
-#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#if defined(__INTEL_COMPILER)
 #define BROKEN_STACK_ALIGNMENT /* define it if stack is not mod16 */
 #endif
 #endif
 
 #ifdef WORDS_BIGENDIAN
 #define endian_fix(x) (x)
+#define endian_fix64(x) (x)
 #define endian_fix32(x) (x)
-#elif defined(__GNUC__) && defined(HAVE_MMX)
+#define endian_fix16(x) (x)
+#else
+#if defined(__GNUC__) && defined(HAVE_MMX)
 static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
 {
     asm("bswap %0":"+r"(x));
     return x;
 }
-static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+#elif defined(__GNUC__) && defined(HAVE_ARMV6)
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
 {
-    asm("bswap %0":"+r"(x));
+    asm("rev %0, %0":"+r"(x));
     return x;
 }
 #else
@@ -161,12 +196,26 @@ static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
 {
     return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
 }
+#endif
+#if defined(__GNUC__) && defined(ARCH_X86_64) // gcc on x86-64: a single bswap on a 64-bit register
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+    asm("bswap %0":"+r"(x)); // x is both input and output ("+r")
+    return x;
+}
+#else // everything else: build the 64-bit swap from two 32-bit swaps
+static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x )
+{
+    return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32); // swapped high half becomes the low half and vice versa
+}
+#endif
 static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
 {
-    if( WORD_SIZE == 8 )
-        return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
-    else
-        return endian_fix32(x);
+    return WORD_SIZE == 8 ? endian_fix64(x) : endian_fix32(x);
+}
+static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x )
+{
+    return (x<<8)|(x>>8);
 }
 #endif
 
@@ -176,7 +225,7 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
 static int ALWAYS_INLINE x264_clz( uint32_t x )
 {
     static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
-    int y, z = ((x - 0x10000) >> 27) & 16;
+    int y, z = (((x >> 16) - 1) >> 27) & 16;
     x >>= z^16;
     z += y = ((x - 0x100) >> 28) & 8;
     x >>= y^8;
@@ -186,4 +235,31 @@ static int ALWAYS_INLINE x264_clz( uint32_t x )
 }
 #endif
 
+#ifdef USE_REAL_PTHREAD
+#ifdef SYS_MINGW
+#define x264_lower_thread_priority(p) /* lower the calling thread's sched_priority by p */\
+{\
+    x264_pthread_t handle = pthread_self();\
+    struct sched_param sp;\
+    int policy = SCHED_OTHER;\
+    pthread_getschedparam( handle, &policy, &sp );\
+    sp.sched_priority -= p;\
+    pthread_setschedparam( handle, policy, &sp );\
+}
+#else /* !SYS_MINGW */
+#include <unistd.h>
+#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); } /* nice_ret only receives the return value; it is never read */
+#endif /* SYS_MINGW */
+#else /* !USE_REAL_PTHREAD */
+#define x264_lower_thread_priority(p) /* expands to nothing */
+#endif /* USE_REAL_PTHREAD */
+
+static inline uint8_t x264_is_regular_file( FILE *filehandle ) /* nonzero iff fstat succeeds and reports a regular file */
+{
+    struct stat file_stat;
+    if( fstat( fileno( filehandle ), &file_stat ) )
+        return 0; /* fstat failed: treat the stream as not a regular file */
+    return S_ISREG( file_stat.st_mode ); /* test the file-type bits of st_mode */
+}
+
 #endif /* X264_OSDEP_H */
diff --git a/common/pixel.c b/common/pixel.c
index 5932f07..7c60237 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -29,6 +29,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/pixel.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/pixel.h"
+#endif
 #ifdef ARCH_UltraSparc
 #   include "sparc/pixel.h"
 #endif
@@ -139,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
 /****************************************************************************
  * pixel_var_wxh
  ****************************************************************************/
-#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride ) \
+#define PIXEL_VAR_C( name, w ) \
+static uint64_t name( uint8_t *pix, int i_stride ) \
 {                                             \
-    uint32_t var = 0, sum = 0, sqr = 0;       \
+    uint32_t sum = 0, sqr = 0;                \
     int x, y;                                 \
     for( y = 0; y < w; y++ )                  \
     {                                         \
@@ -153,12 +156,35 @@ static int name( uint8_t *pix, int i_stride ) \
         }                                     \
         pix += i_stride;                      \
     }                                         \
-    var = sqr - (sum * sum >> shift);         \
-    return var;                               \
+    return sum + ((uint64_t)sqr << 32);       \
 }
 
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
-PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8,    8 )
+
+/****************************************************************************
+ * pixel_var2_wxh: over the 8x8 difference pix1-pix2, returns sqr - (sum*sum >> 6) and stores sqr in *ssd
+ ****************************************************************************/
+static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+{
+    uint32_t var = 0, sum = 0, sqr = 0; /* sum: running total of differences; sqr: running total of squared differences */
+    int x, y;
+    for( y = 0; y < 8; y++ ) /* 8 rows */
+    {
+        for( x = 0; x < 8; x++ ) /* 8 pixels per row */
+        {
+            int diff = pix1[x] - pix2[x]; /* signed per-pixel difference */
+            sum += diff;
+            sqr += diff * diff;
+        }
+        pix1 += i_stride1; /* each input advances by its own stride */
+        pix2 += i_stride2;
+    }
+    sum = abs(sum); /* take the magnitude before squaring */
+    var = sqr - (sum * sum >> 6); /* >> 6 divides by the 64 pixels of the block */
+    *ssd = sqr; /* sum of squared differences handed back through the out parameter */
+    return var;
+}
 
 
 #define HADAMARD4(d0,d1,d2,d3,s0,s1,s2,s3) {\
@@ -429,6 +455,10 @@ SATD_X_DECL7( _ssse3 )
 SATD_X_DECL7( _sse4 )
 #endif
 
+#ifdef HAVE_ARMV6
+SATD_X_DECL7( _neon )
+#endif
+
 /****************************************************************************
  * structural similarity metric
  ****************************************************************************/
@@ -611,6 +641,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
+    pixf->var2_8x8 = pixel_var2_8x8;
 
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
@@ -636,6 +667,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
+        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
 
         if( cpu&X264_CPU_CACHELINE_32 )
         {
@@ -682,6 +714,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -761,6 +794,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
+        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
@@ -787,6 +821,47 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     }
 #endif //HAVE_MMX
 
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_ARMV6 )
+    {
+        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
+        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
+    }
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT5( sad, _neon );
+        INIT5( sad_aligned, _neon );
+        INIT7( sad_x3, _neon );
+        INIT7( sad_x4, _neon );
+        INIT7( ssd, _neon );
+        INIT7( satd, _neon );
+        INIT7( satd_x3, _neon );
+        INIT7( satd_x4, _neon );
+        INIT4( hadamard_ac, _neon );
+        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
+        pixf->var2_8x8          = x264_pixel_var2_8x8_neon;
+
+        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
+        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
+
+        if( cpu&X264_CPU_FAST_NEON_MRC )
+        {
+            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
+            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
+            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
+            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
+        }
+        else    // really just scheduled for dual issue / A8
+        {
+            INIT5( sad_aligned, _neon_dual );
+        }
+    }
+#endif
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC )
     {
diff --git a/common/pixel.h b/common/pixel.h
index 207c74f..1102642 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -73,8 +73,9 @@ typedef struct
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+    int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
 
-    int (*var[4])( uint8_t *pix, int stride );
+    uint64_t (*var[4])( uint8_t *pix, int stride );
     uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index 3531812..14171e6 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -41,7 +41,7 @@
 static inline void write16x4(uint8_t *dst, int dst_stride,
                              register vec_u8_t r0, register vec_u8_t r1,
                              register vec_u8_t r2, register vec_u8_t r3) {
-    DECLARE_ALIGNED_16(unsigned char result[64]);
+    ALIGNED_16(unsigned char result[64]);
     uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
     int int_dst_stride = dst_stride/4;
 
@@ -220,7 +220,7 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
 }
 
 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
-    DECLARE_ALIGNED_16(unsigned char temp[16]);                                              \
+    ALIGNED_16(unsigned char temp[16]);                                              \
     register vec_u8_t alphavec;                                                              \
     register vec_u8_t betavec;                                                               \
     register vec_u8_t mask;                                                                  \
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 56ec9c1..a588d8f 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -181,7 +181,7 @@ static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst,
 static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
                              uint8_t *src[4], int i_src_stride,
                              int mvx, int mvy,
-                             int i_width, int i_height )
+                             int i_width, int i_height, const x264_weight_t *weight )
 {
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -201,8 +201,11 @@ static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
         default:
             x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
         }
-
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
     }
+    else if( weight->weightfn )
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
     else
     {
         switch(i_width) {
@@ -224,7 +227,7 @@ static void mc_luma_altivec( uint8_t *dst,    int i_dst_stride,
 static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
                                  uint8_t *src[4], int i_src_stride,
                                  int mvx, int mvy,
-                                 int i_width, int i_height )
+                                 int i_width, int i_height, const x264_weight_t *weight )
 {
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);
@@ -248,6 +251,13 @@ static uint8_t *get_ref_altivec( uint8_t *dst,   int *i_dst_stride,
             x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
             break;
         }
+        if( weight->weightfn )
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+        return dst;
+    }
+    else if( weight->weightfn )
+    {
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
         return dst;
     }
     else
@@ -303,7 +313,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
     int d8x = mvx & 0x07;
     int d8y = mvy & 0x07;
 
-    DECLARE_ALIGNED_16( uint16_t coeff[4] );
+    ALIGNED_16( uint16_t coeff[4] );
     coeff[0] = (8-d8x)*(8-d8y);
     coeff[1] = d8x    *(8-d8y);
     coeff[2] = (8-d8x)*d8y;
@@ -384,7 +394,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
     int d8x = mvx & 0x07;
     int d8y = mvy & 0x07;
 
-    DECLARE_ALIGNED_16( uint16_t coeff[4] );
+    ALIGNED_16( uint16_t coeff[4] );
     coeff[0] = (8-d8x)*(8-d8y);
     coeff[1] = d8x    *(8-d8y);
     coeff[2] = (8-d8x)*d8y;
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 360e71d..64d4c49 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -33,7 +33,7 @@ static int name( uint8_t *pix1, int i_pix1,            \
                  uint8_t *pix2, int i_pix2 )           \
 {                                                      \
     int y;                                             \
-    DECLARE_ALIGNED_16( int sum );                     \
+    ALIGNED_16( int sum );                     \
                                                        \
     LOAD_ZERO;                                         \
     PREP_LOAD;                                         \
@@ -118,7 +118,7 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
 static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     PREP_DIFF;
     PREP_LOAD_SRC( pix1 );
@@ -163,7 +163,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v;
@@ -217,7 +217,7 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -271,7 +271,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                                    uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -331,7 +331,7 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v,
@@ -415,7 +415,7 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     LOAD_ZERO;
     PREP_LOAD;
@@ -499,7 +499,7 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
 static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                      uint8_t *pix2, int i_pix2 )
 {
-    DECLARE_ALIGNED_16( int i_satd );
+    ALIGNED_16( int i_satd );
 
     LOAD_ZERO;
     PREP_LOAD;
@@ -630,10 +630,10 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                         uint8_t *pix2, uint8_t *pix3,
                                         int i_stride, int scores[4] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
-    DECLARE_ALIGNED_16( int sum3 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum3 );
     int y;
 
     LOAD_ZERO;
@@ -751,9 +751,9 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                         uint8_t *pix1, uint8_t *pix2,
                                         int i_stride, int scores[3] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
     int y;
 
     LOAD_ZERO;
@@ -846,10 +846,10 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
 
 static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
-    DECLARE_ALIGNED_16( int sum3 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum3 );
     int y;
 
     LOAD_ZERO;
@@ -964,9 +964,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        int i_stride, int scores[3] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
     int y;
 
     LOAD_ZERO;
@@ -1062,10 +1062,10 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
                                        uint8_t *pix2, uint8_t *pix3,
                                        int i_stride, int scores[4] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
-    DECLARE_ALIGNED_16( int sum3 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum3 );
     int y;
 
     LOAD_ZERO;
@@ -1183,9 +1183,9 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        int i_stride, int scores[3] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
     int y;
 
     LOAD_ZERO;
@@ -1283,10 +1283,10 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
                                       uint8_t *pix2, uint8_t *pix3,
                                       int i_stride, int scores[4] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
-    DECLARE_ALIGNED_16( int sum3 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum3 );
     int y;
 
     LOAD_ZERO;
@@ -1404,9 +1404,9 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       int i_stride, int scores[3] )
 {
-    DECLARE_ALIGNED_16( int sum0 );
-    DECLARE_ALIGNED_16( int sum1 );
-    DECLARE_ALIGNED_16( int sum2 );
+    ALIGNED_16( int sum0 );
+    ALIGNED_16( int sum1 );
+    ALIGNED_16( int sum2 );
     int y;
 
     LOAD_ZERO;
@@ -1506,7 +1506,7 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
 static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
                                      uint8_t *pix2, int i_stride_pix2)
 {
-    DECLARE_ALIGNED_16( int sum );
+    ALIGNED_16( int sum );
 
     int y;
     LOAD_ZERO;
@@ -1586,7 +1586,7 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
 static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
                                    uint8_t *pix2, int i_stride_pix2)
 {
-    DECLARE_ALIGNED_16( int sum );
+    ALIGNED_16( int sum );
 
     int y;
     LOAD_ZERO;
@@ -1636,10 +1636,10 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
 /****************************************************************************
  * variance
  ****************************************************************************/
-static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
 {
-    DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
-    DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+    ALIGNED_16(uint32_t sum_tab[4]);
+    ALIGNED_16(uint32_t sqr_tab[4]);
 
     LOAD_ZERO;
     vec_u32_t sqr_v = zero_u32v;
@@ -1661,14 +1661,13 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
 
     uint32_t sum = sum_tab[3];
     uint32_t sqr = sqr_tab[3];
-    uint32_t var = sqr - (sum * sum >> 8);
-    return var;
+    return sum + ((uint64_t)sqr<<32);
 }
 
-static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
 {
-    DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
-    DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+    ALIGNED_16(uint32_t sum_tab[4]);
+    ALIGNED_16(uint32_t sqr_tab[4]);
 
     LOAD_ZERO;
     vec_u32_t sqr_v = zero_u32v;
@@ -1700,8 +1699,7 @@ static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
 
     uint32_t sum = sum_tab[3];
     uint32_t sqr = sqr_tab[3];
-    uint32_t var = sqr - (sum * sum >> 6);
-    return var;
+    return sum + ((uint64_t)sqr<<32);
 }
 
 
@@ -1870,8 +1868,8 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
 
 static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
 {
-    DECLARE_ALIGNED_16( int32_t sum4_tab[4] );
-    DECLARE_ALIGNED_16( int32_t sum8_tab[4] );
+    ALIGNED_16( int32_t sum4_tab[4] );
+    ALIGNED_16( int32_t sum8_tab[4] );
     LOAD_ZERO;
 
     VEC_LOAD_HIGH( pix, 0 );
@@ -1937,7 +1935,7 @@ static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u
 
     int sum8 = sum8_tab[3];
 
-    DECLARE_ALIGNED_16( int16_t tmp0_4_tab[8] );
+    ALIGNED_16( int16_t tmp0_4_tab[8] );
     vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
 
     sum4 -= tmp0_4_tab[0];
@@ -1997,7 +1995,7 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
                                      const uint8_t *pix2, int stride2,
                                      int sums[2][4] )
 {
-    DECLARE_ALIGNED_16( int temp[4] );
+    ALIGNED_16( int temp[4] );
 
     int y;
     vec_u8_t pix1v, pix2v;
diff --git a/common/predict.c b/common/predict.c
index ce4b9bf..0718c81 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -33,6 +33,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/predict.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -41,11 +44,10 @@
 #define PREDICT_16x16_DC(v) \
     for( i = 0; i < 16; i++ )\
     {\
-        uint32_t *p = (uint32_t*)src;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
+        M32( src+ 0 ) = v;\
+        M32( src+ 4 ) = v;\
+        M32( src+ 8 ) = v;\
+        M32( src+12 ) = v;\
         src += FDEC_STRIDE;\
     }
 
@@ -101,32 +103,28 @@ static void predict_16x16_h( uint8_t *src )
     for( i = 0; i < 16; i++ )
     {
         const uint32_t v = 0x01010101 * src[-1];
-        uint32_t *p = (uint32_t*)src;
-
-        *p++ = v;
-        *p++ = v;
-        *p++ = v;
-        *p++ = v;
-
+        M32( src+ 0 ) = v;
+        M32( src+ 4 ) = v;
+        M32( src+ 8 ) = v;
+        M32( src+12 ) = v;
         src += FDEC_STRIDE;
 
     }
 }
 static void predict_16x16_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[ 0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[ 4-FDEC_STRIDE];
-    uint32_t v2 = *(uint32_t*)&src[ 8-FDEC_STRIDE];
-    uint32_t v3 = *(uint32_t*)&src[12-FDEC_STRIDE];
+    uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
+    uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
+    uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
+    uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
     int i;
 
     for( i = 0; i < 16; i++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v0;
-        *p++ = v1;
-        *p++ = v2;
-        *p++ = v3;
+        M32( src+ 0 ) = v0;
+        M32( src+ 4 ) = v1;
+        M32( src+ 8 ) = v2;
+        M32( src+12 ) = v3;
         src += FDEC_STRIDE;
     }
 }
@@ -175,9 +173,8 @@ static void predict_8x8c_dc_128( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = 0x80808080;
-        *p++ = 0x80808080;
+        M32( src+0 ) = 0x80808080;
+        M32( src+4 ) = 0x80808080;
         src += FDEC_STRIDE;
     }
 }
@@ -196,16 +193,14 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc0;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc0;
         src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc1;
-        *p++ = dc1;
+        M32( src+0 ) = dc1;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
 
@@ -225,9 +220,8 @@ static void predict_8x8c_dc_top( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc1;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
 }
@@ -261,17 +255,15 @@ static void predict_8x8c_dc( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc0;
-        *p++ = dc1;
+        M32( src+0 ) = dc0;
+        M32( src+4 ) = dc1;
         src += FDEC_STRIDE;
     }
 
     for( y = 0; y < 4; y++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = dc2;
-        *p++ = dc3;
+        M32( src+0 ) = dc2;
+        M32( src+4 ) = dc3;
         src += FDEC_STRIDE;
     }
 }
@@ -282,23 +274,21 @@ static void predict_8x8c_h( uint8_t *src )
     for( i = 0; i < 8; i++ )
     {
         uint32_t v = 0x01010101 * src[-1];
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v;
-        *p++ = v;
+        M32( src+0 ) = v;
+        M32( src+4 ) = v;
         src += FDEC_STRIDE;
     }
 }
 static void predict_8x8c_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[4-FDEC_STRIDE];
+    uint32_t v0 = M32( src+0-FDEC_STRIDE );
+    uint32_t v1 = M32( src+4-FDEC_STRIDE );
     int i;
 
     for( i = 0; i < 8; i++ )
     {
-        uint32_t *p = (uint32_t*)src;
-        *p++ = v0;
-        *p++ = v1;
+        M32( src+0 ) = v0;
+        M32( src+4 ) = v1;
         src += FDEC_STRIDE;
     }
 }
@@ -340,7 +330,7 @@ static void predict_8x8c_p( uint8_t *src )
  ****************************************************************************/
 
 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
+#define SRC32(x,y) M32( &SRC(x,y) )
 
 #define PREDICT_4x4_DC(v)\
     SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
@@ -532,7 +522,7 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
             }
             else
             {
-                *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+                M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
                 edge[32] = SRC(7,-1);
             }
         }
@@ -558,8 +548,8 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
         src += FDEC_STRIDE; \
     }
 
@@ -590,17 +580,17 @@ static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
 static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_LOAD_LEFT
-#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
-               ((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
+#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) =\
+               M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
 #undef ROW
 }
 static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
 {
-    const uint64_t top = *(uint64_t*)(edge+16);
+    const uint64_t top = M64( edge+16 );
     int y;
     for( y = 0; y < 8; y++ )
-        *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+        M64( src+y*FDEC_STRIDE ) = top;
 }
 static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
 {
@@ -770,6 +760,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
         x264_predict_16x16_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_16x16_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -792,6 +786,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
         x264_predict_8x8c_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8c_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -813,6 +811,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
 #ifdef HAVE_MMX
     x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8_init_arm( cpu, pf, predict_filter );
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -833,5 +835,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
 #ifdef HAVE_MMX
     x264_predict_4x4_init_mmx( cpu, pf );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_4x4_init_arm( cpu, pf );
+#endif
 }
 
diff --git a/common/quant.c b/common/quant.c
index daf2b5a..7434a3d 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -29,6 +29,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/quant.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -39,141 +42,101 @@
     nz |= (coef); \
 }
 
-static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+static int quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 {
     int i, nz = 0;
     for( i = 0; i < 64; i++ )
-        QUANT_ONE( dct[0][i], mf[i], bias[i] );
+        QUANT_ONE( dct[i], mf[i], bias[i] );
     return !!nz;
 }
 
-static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+static int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 {
     int i, nz = 0;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], mf[i], bias[i] );
+        QUANT_ONE( dct[i], mf[i], bias[i] );
     return !!nz;
 }
 
-static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+static int quant_4x4_dc( int16_t dct[16], int mf, int bias )
 {
     int i, nz = 0;
     for( i = 0; i < 16; i++ )
-        QUANT_ONE( dct[0][i], mf, bias );
+        QUANT_ONE( dct[i], mf, bias );
     return !!nz;
 }
 
-static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+static int quant_2x2_dc( int16_t dct[4], int mf, int bias )
 {
     int nz = 0;
-    QUANT_ONE( dct[0][0], mf, bias );
-    QUANT_ONE( dct[0][1], mf, bias );
-    QUANT_ONE( dct[0][2], mf, bias );
-    QUANT_ONE( dct[0][3], mf, bias );
+    QUANT_ONE( dct[0], mf, bias );
+    QUANT_ONE( dct[1], mf, bias );
+    QUANT_ONE( dct[2], mf, bias );
+    QUANT_ONE( dct[3], mf, bias );
     return !!nz;
 }
 
 #define DEQUANT_SHL( x ) \
-    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) << i_qbits
+    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits
 
 #define DEQUANT_SHR( x ) \
-    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)
+    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
 
-static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 {
     const int i_mf = i_qp%6;
     const int i_qbits = i_qp/6 - 4;
-    int y;
+    int i;
 
     if( i_qbits >= 0 )
     {
-        for( y = 0; y < 4; y++ )
-        {
-            DEQUANT_SHL( 0 );
-            DEQUANT_SHL( 1 );
-            DEQUANT_SHL( 2 );
-            DEQUANT_SHL( 3 );
-        }
+        for( i = 0; i < 16; i++ )
+            DEQUANT_SHL( i );
     }
     else
     {
         const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 4; y++ )
-        {
-            DEQUANT_SHR( 0 );
-            DEQUANT_SHR( 1 );
-            DEQUANT_SHR( 2 );
-            DEQUANT_SHR( 3 );
-        }
+        for( i = 0; i < 16; i++ )
+            DEQUANT_SHR( i );
     }
 }
 
-static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+static void dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
 {
     const int i_mf = i_qp%6;
     const int i_qbits = i_qp/6 - 6;
-    int y;
+    int i;
 
     if( i_qbits >= 0 )
     {
-        for( y = 0; y < 8; y++ )
-        {
-            DEQUANT_SHL( 0 );
-            DEQUANT_SHL( 1 );
-            DEQUANT_SHL( 2 );
-            DEQUANT_SHL( 3 );
-            DEQUANT_SHL( 4 );
-            DEQUANT_SHL( 5 );
-            DEQUANT_SHL( 6 );
-            DEQUANT_SHL( 7 );
-        }
+        for( i = 0; i < 64; i++ )
+            DEQUANT_SHL( i );
     }
     else
     {
         const int f = 1 << (-i_qbits-1);
-        for( y = 0; y < 8; y++ )
-        {
-            DEQUANT_SHR( 0 );
-            DEQUANT_SHR( 1 );
-            DEQUANT_SHR( 2 );
-            DEQUANT_SHR( 3 );
-            DEQUANT_SHR( 4 );
-            DEQUANT_SHR( 5 );
-            DEQUANT_SHR( 6 );
-            DEQUANT_SHR( 7 );
-        }
+        for( i = 0; i < 64; i++ )
+            DEQUANT_SHR( i );
     }
 }
 
-static void dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+static void dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
 {
     const int i_qbits = i_qp/6 - 6;
-    int y;
+    int i;
 
     if( i_qbits >= 0 )
     {
-        const int i_dmf = dequant_mf[i_qp%6][0][0] << i_qbits;
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] *= i_dmf;
-            dct[y][1] *= i_dmf;
-            dct[y][2] *= i_dmf;
-            dct[y][3] *= i_dmf;
-        }
+        const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
+        for( i = 0; i < 16; i++ )
+            dct[i] *= i_dmf;
     }
     else
     {
-        const int i_dmf = dequant_mf[i_qp%6][0][0];
+        const int i_dmf = dequant_mf[i_qp%6][0];
         const int f = 1 << (-i_qbits-1);
-
-        for( y = 0; y < 4; y++ )
-        {
-            dct[y][0] = ( dct[y][0] * i_dmf + f ) >> (-i_qbits);
-            dct[y][1] = ( dct[y][1] * i_dmf + f ) >> (-i_qbits);
-            dct[y][2] = ( dct[y][2] * i_dmf + f ) >> (-i_qbits);
-            dct[y][3] = ( dct[y][3] * i_dmf + f ) >> (-i_qbits);
-        }
+        for( i = 0; i < 16; i++ )
+            dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
     }
 }
 
@@ -215,7 +178,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
     int idx = i_max - 1;
 
     /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned.  idx>=0 instead of 1 works correctly for the same reason */
-    while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 )
+    while( idx >= 0 && M32( &dct[idx-1] ) == 0 )
         idx -= 2;
     if( idx >= 0 && dct[idx] == 0 )
         idx--;
@@ -255,7 +218,7 @@ static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
 {
     int i_last;
     for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( *(uint64_t*)(l+i_last-3) )
+        if( M64( l+i_last-3 ) )
             break;
     while( i_last >= 0 && l[i_last] == 0 )
         i_last--;
@@ -428,6 +391,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->dequant_8x8 = x264_dequant_8x8_altivec;
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_ARMV6 )
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
+        pf->quant_4x4      = x264_quant_4x4_neon;
+        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
+        pf->quant_8x8      = x264_quant_8x8_neon;
+        pf->dequant_4x4    = x264_dequant_4x4_neon;
+        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
+        pf->dequant_8x8    = x264_dequant_8x8_neon;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+    }
+#endif
     pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
     pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
     pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
diff --git a/common/quant.h b/common/quant.h
index b8a7b98..1cfe95d 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -25,14 +25,14 @@
 
 typedef struct
 {
-    int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-    int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-    int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
-    int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
+    int (*quant_8x8)( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+    int (*quant_4x4)( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+    int (*quant_4x4_dc)( int16_t dct[16], int mf, int bias );
+    int (*quant_2x2_dc)( int16_t dct[4], int mf, int bias );
 
-    void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-    void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-    void (*dequant_4x4_dc)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+    void (*dequant_8x8)( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+    void (*dequant_4x4)( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+    void (*dequant_4x4_dc)( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 
     void (*denoise_dct)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 
diff --git a/common/set.c b/common/set.c
index 6c7ddc4..f9379f0 100644
--- a/common/set.c
+++ b/common/set.c
@@ -20,7 +20,7 @@
 
 #include "common.h"
 
-#define SHIFT(x,s) ((s)<0 ? (x)<<-(s) : (s)==0 ? (x) : ((x)+(1<<((s)-1)))>>(s))
+#define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
 #define DIV(n,d) (((n) + ((d)>>1)) / (d))
 
 static const int dequant4_scale[6][3] =
@@ -71,13 +71,14 @@ int x264_cqm_init( x264_t *h )
     int def_quant8[6][64];
     int def_dequant4[6][16];
     int def_dequant8[6][64];
-    int quant4_mf[4][6][4][4];
-    int quant8_mf[2][6][8][8];
+    int quant4_mf[4][6][16];
+    int quant8_mf[2][6][64];
     int q, i, j, i_list;
     int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1],
                         32 - h->param.analyse.i_luma_deadzone[0],
                         32 - 11, 32 - 21 };
     int max_qp_err = -1;
+    int max_chroma_qp_err = -1;
 
     for( i = 0; i < 6; i++ )
     {
@@ -93,9 +94,9 @@ int x264_cqm_init( x264_t *h )
         }
         else
         {
-            h->  quant4_mf[i] = x264_malloc(52*size*sizeof(uint16_t) );
-            h->dequant4_mf[i] = x264_malloc( 6*size*sizeof(int) );
-            h->unquant4_mf[i] = x264_malloc(52*size*sizeof(int) );
+            CHECKED_MALLOC( h->  quant4_mf[i], 52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->dequant4_mf[i],  6*size*sizeof(int) );
+            CHECKED_MALLOC( h->unquant4_mf[i], 52*size*sizeof(int) );
         }
 
         for( j = (i<4 ? 0 : 4); j < i; j++ )
@@ -105,7 +106,7 @@ int x264_cqm_init( x264_t *h )
         if( j < i )
             h->quant4_bias[i] = h->quant4_bias[j];
         else
-            h->quant4_bias[i] = x264_malloc(52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->quant4_bias[i], 52*size*sizeof(uint16_t) );
     }
 
     for( q = 0; q < 6; q++ )
@@ -129,14 +130,14 @@ int x264_cqm_init( x264_t *h )
         for( i_list = 0; i_list < 4; i_list++ )
             for( i = 0; i < 16; i++ )
             {
-                h->dequant4_mf[i_list][q][0][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
-                     quant4_mf[i_list][q][0][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
+                h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->pps->scaling_list[i_list][i];
+                     quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->pps->scaling_list[i_list][i]);
             }
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
             {
-                h->dequant8_mf[i_list][q][0][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
-                     quant8_mf[i_list][q][0][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
+                h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->pps->scaling_list[4+i_list][i];
+                     quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
             }
     }
     for( q = 0; q < 52; q++ )
@@ -144,19 +145,21 @@ int x264_cqm_init( x264_t *h )
         for( i_list = 0; i_list < 4; i_list++ )
             for( i = 0; i < 16; i++ )
             {
-                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][0][i];
-                h->  quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][0][i], q/6 - 1);
+                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
+                h->  quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
                 // round to nearest, unless that would cause the deadzone to be negative
                 h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
-                if( j > 0xffff && q > max_qp_err )
+                if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
                     max_qp_err = q;
+                if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
+                    max_chroma_qp_err = q;
             }
         if( h->param.analyse.b_transform_8x8 )
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 64; i++ )
             {
-                h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][0][i];
-                h->  quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][0][i], q/6);
+                h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
+                h->  quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
                 h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err )
                     max_qp_err = q;
@@ -165,34 +168,46 @@ int x264_cqm_init( x264_t *h )
 
     if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min )
     {
-        x264_log( h, X264_LOG_ERROR, "Quantization overflow.\n" );
-        x264_log( h, X264_LOG_ERROR, "Your CQM is incompatible with QP < %d, but min QP is set to %d\n",
-                  max_qp_err+1, h->param.rc.i_qp_min );
+        x264_log( h, X264_LOG_ERROR, "Quantization overflow.  Your CQM is incompatible with QP < %d,\n", max_qp_err+1 );
+        x264_log( h, X264_LOG_ERROR, "but min QP is set to %d.\n", h->param.rc.i_qp_min );
+        return -1;
+    }
+    if( !h->mb.b_lossless && max_chroma_qp_err >= h->chroma_qp_table[h->param.rc.i_qp_min] )
+    {
+        x264_log( h, X264_LOG_ERROR, "Quantization overflow.  Your CQM is incompatible with QP < %d,\n", max_chroma_qp_err+1 );
+        x264_log( h, X264_LOG_ERROR, "but min chroma QP is implied to be %d.\n", h->chroma_qp_table[h->param.rc.i_qp_min] );
         return -1;
     }
     return 0;
+fail:
+    x264_cqm_delete( h );
+    return -1;
 }
 
+#define CQM_DELETE( n, max )\
+    for( i = 0; i < max; i++ )\
+    {\
+        for( j = 0; j < i; j++ )\
+            if( h->quant##n##_mf[i] == h->quant##n##_mf[j] )\
+                break;\
+        if( j == i )\
+        {\
+            x264_free( h->  quant##n##_mf[i] );\
+            x264_free( h->dequant##n##_mf[i] );\
+            x264_free( h->unquant##n##_mf[i] );\
+        }\
+        for( j = 0; j < i; j++ )\
+            if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\
+                break;\
+        if( j == i )\
+            x264_free( h->quant##n##_bias[i] );\
+    }
+
 void x264_cqm_delete( x264_t *h )
 {
     int i, j;
-    for( i = 0; i < 6; i++ )
-    {
-        for( j = 0; j < i; j++ )
-            if( h->quant4_mf[i] == h->quant4_mf[j] )
-                break;
-        if( j == i )
-        {
-            x264_free( h->  quant4_mf[i] );
-            x264_free( h->dequant4_mf[i] );
-            x264_free( h->unquant4_mf[i] );
-        }
-        for( j = 0; j < i; j++ )
-            if( h->quant4_bias[i] == h->quant4_bias[j] )
-                break;
-        if( j == i )
-            x264_free( h->quant4_bias[i] );
-    }
+    CQM_DELETE( 4, 4 );
+    CQM_DELETE( 8, 2 );
 }
 
 static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
diff --git a/common/visualize.c b/common/visualize.c
index f7100f0..1d3dd84 100644
--- a/common/visualize.c
+++ b/common/visualize.c
@@ -94,10 +94,13 @@ static void mv(int x0, int y0, int16_t dmv[2], int ref, int zoom, char *col)
 /* }}} */
 
-/* {{{ [fold] void x264_visualize_init( x264_t *h ) */
+/* {{{ [fold] int x264_visualize_init( x264_t *h ) */
-void x264_visualize_init( x264_t *h )
+int x264_visualize_init( x264_t *h )
 {
     int mb = h->sps->i_mb_width * h->sps->i_mb_height;
-    h->visualize = x264_malloc(mb * sizeof(visualize_t));
+    CHECKED_MALLOC( h->visualize, mb * sizeof(visualize_t) );
+    return 0;
+fail:
+    return -1;
 }
 /* }}} */
 /* {{{ [fold] void x264_visualize_mb( x264_t *h ) */
diff --git a/common/visualize.h b/common/visualize.h
index b611f6c..f9753d7 100644
--- a/common/visualize.h
+++ b/common/visualize.h
@@ -23,7 +23,7 @@
 
 #include "common/common.h"
 
-void x264_visualize_init( x264_t *h );
+int  x264_visualize_init( x264_t *h );
 void x264_visualize_mb( x264_t *h );
 void x264_visualize_show( x264_t *h );
 void x264_visualize_close( x264_t *h );
diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm
index 20eb7b8..990f0ee 100644
--- a/common/x86/cpu-a.asm
+++ b/common/x86/cpu-a.asm
@@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6
 cglobal x264_stack_align
     push ebp
     mov  ebp, esp
-    sub  esp, 4
+    sub  esp, 8
     and  esp, ~15
     mov  ecx, [ebp+8]
     mov  edx, [ebp+12]
     mov  [esp], edx
+    mov  edx, [ebp+16]
+    mov  [esp+4], edx
     call ecx
     leave
     ret
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 6e92df6..d4a0cae 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -26,16 +26,30 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
+%macro SHUFFLE_16BIT 8
+    %rep 8
+        db %1*2
+        db %1*2+1
+        %rotate 1
+    %endrep
+%endmacro
+
 SECTION_RODATA
+pw_32_0: times 4 dw 32
+         times 4 dw 0
 pw_32: times 8 dw 32
 pw_8000: times 8 dw 0x8000
 hsub_mul: times 8 db 1, -1
+
 pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
-pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
-pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
+pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
+pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
+pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
 pb_1: times 16 db 1
+pw_1: times 8 dw 1
 
 SECTION .text
 
@@ -145,6 +159,59 @@ cglobal x264_add4x4_idct_mmx, 2,2
     STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
     RET
 
+INIT_XMM
+cglobal x264_add4x4_idct_sse4, 2,2,6
+    mova      m0, [r1+0x00]     ; row1/row0
+    mova      m2, [r1+0x10]     ; row3/row2
+    mova      m1, m0            ; row1/row0
+    psraw     m0, 1             ; row1>>1/...
+    mova      m3, m2            ; row3/row2
+    psraw     m2, 1             ; row3>>1/...
+    movsd     m0, m1            ; row1>>1/row0
+    movsd     m2, m3            ; row3>>1/row2
+    psubw     m0, m3            ; row1>>1-row3/row0-2
+    paddw     m2, m1            ; row3>>1+row1/row0+2
+    SBUTTERFLY2 wd, 0, 2, 1
+    SUMSUB_BA m2, m0, m1
+    pshuflw   m1, m2, 10110001b
+    pshufhw   m2, m2, 10110001b
+    punpckldq m1, m0
+    punpckhdq m2, m0
+    SWAP 0, 1
+
+    mova      m1, [pw_32_0 GLOBAL]
+    paddw     m1, m0            ; row1/row0 corrected
+    psraw     m0, 1             ; row1>>1/...
+    mova      m3, m2            ; row3/row2
+    psraw     m2, 1             ; row3>>1/...
+    movsd     m0, m1            ; row1>>1/row0
+    movsd     m2, m3            ; row3>>1/row2
+    psubw     m0, m3            ; row1>>1-row3/row0-2
+    paddw     m2, m1            ; row3>>1+row1/row0+2
+    SBUTTERFLY2 qdq, 0, 2, 1
+    SUMSUB_BA m2, m0, m1
+
+    movd      m4, [r0+FDEC_STRIDE*0]
+    movd      m1, [r0+FDEC_STRIDE*1]
+    movd      m3, [r0+FDEC_STRIDE*2]
+    movd      m5, [r0+FDEC_STRIDE*3]
+    punpckldq m1, m4            ; row0/row1
+    pxor      m4, m4
+    punpckldq m3, m5            ; row3/row2
+    punpcklbw m1, m4
+    psraw     m2, 6
+    punpcklbw m3, m4
+    psraw     m0, 6
+    paddsw    m2, m1
+    paddsw    m0, m3
+    packuswb  m0, m2            ; row0/row1/row3/row2
+    pextrd   [r0+FDEC_STRIDE*0], m0, 3
+    pextrd   [r0+FDEC_STRIDE*1], m0, 2
+    movd     [r0+FDEC_STRIDE*2], m0
+    pextrd   [r0+FDEC_STRIDE*3], m0, 1
+    RET
+
+INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
@@ -428,6 +495,79 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
     ret
 
 ;-----------------------------------------------------------------------------
+; void x264_sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+
+%macro DCTDC_2ROW_MMX 3                     ; (left result, right result, first row); m7 is zeroed by the caller
+    movq      %1, [r1+FENC_STRIDE*(0+%3)]  ; r1 block: 8 bytes of the first row
+    movq      m1, [r1+FENC_STRIDE*(1+%3)]  ; r1 block: 8 bytes of the next row
+    movq      m2, [r2+FDEC_STRIDE*(0+%3)]  ; r2 block: 8 bytes of the first row
+    movq      m3, [r2+FDEC_STRIDE*(1+%3)]  ; r2 block: 8 bytes of the next row
+    movq      %2, %1
+    punpckldq %1, m1                       ; r1 block: left 4 bytes of both rows
+    punpckhdq %2, m1                       ; r1 block: right 4 bytes of both rows
+    movq      m1, m2
+    punpckldq m2, m3                       ; r2 block: left 4 bytes of both rows
+    punpckhdq m1, m3                       ; r2 block: right 4 bytes of both rows
+    psadbw    %1, m7                       ; SAD against zero = sum of the 8 bytes
+    psadbw    %2, m7
+    psadbw    m2, m7
+    psadbw    m1, m7
+    psubw     %1, m2                       ; left half:  sum(r1 block) - sum(r2 block)
+    psubw     %2, m1                       ; right half: sum(r1 block) - sum(r2 block)
+%endmacro
+
+INIT_MMX
+cglobal x264_sub8x8_dct_dc_mmxext, 3,3
+    pxor      m7, m7                ; zero operand for the psadbw byte sums
+    call .loop                      ; pass 1: rows 0-3, result stored at [r0]
+    add       r1, FENC_STRIDE*4     ; move both inputs down to rows 4-7
+    add       r2, FDEC_STRIDE*4
+    add       r0, 4                 ; next two int16 outputs
+.loop:                              ; pass 2 falls through into the same body
+    DCTDC_2ROW_MMX m0, m4, 0
+    DCTDC_2ROW_MMX m5, m6, 2
+    paddw     m0, m5                ; left-half difference summed over 4 rows
+    paddw     m4, m6                ; right-half difference summed over 4 rows
+    punpcklwd m0, m4                ; low dword = left, right
+    movd    [r0], m0                ; store the two int16 values
+    ret                             ; pass 1: back to after the call; pass 2: back to the caller
+
+INIT_XMM
+%macro DCTDC_2ROW_SSE2 3            ; (first row, accumulate flag, r1-block accumulator); m6 accumulates the r2 block
+    movq      m0, [r1+FENC_STRIDE*(0+%1)]
+    movq      m1, [r1+FENC_STRIDE*(1+%1)]
+    movq      m2, [r2+FDEC_STRIDE*(0+%1)]
+    movq      m3, [r2+FDEC_STRIDE*(1+%1)]
+    punpckldq m0, m1                ; low qword: left 4 bytes of both rows, high qword: right 4 bytes
+    punpckldq m2, m3
+    psadbw    m0, m7                ; m7 == 0 (set by the caller): one byte sum per qword
+    psadbw    m2, m7
+%if %2
+    paddw     %3, m0                ; later rows: add to the running sums
+    paddw     m6, m2
+%else
+    SWAP      %3, m0                ; first rows: the sums become the accumulators
+    SWAP      m6, m2
+%endif
+%endmacro
+
+cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+    pxor     m7, m7                 ; zero operand for psadbw inside DCTDC_2ROW_SSE2
+    DCTDC_2ROW_SSE2 0, 0, m4        ; rows 0-1: initialise m4 / m6
+    DCTDC_2ROW_SSE2 2, 1, m4        ; rows 2-3: accumulate
+    add      r1, FENC_STRIDE*4      ; move both inputs down to rows 4-7
+    add      r2, FDEC_STRIDE*4
+    psubq    m4, m6                 ; top half: left / right differences, one per qword
+    DCTDC_2ROW_SSE2 0, 0, m5
+    DCTDC_2ROW_SSE2 2, 1, m5
+    psubq    m5, m6                 ; bottom half: left / right differences
+    packssdw m4, m5                 ; narrow the four 64-bit differences ...
+    packssdw m4, m4                 ; ... down to four int16 values
+    movq   [r0], m4                 ; store 4 x int16
+    RET
+
+;-----------------------------------------------------------------------------
 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
 %macro SCAN_8x8 1
@@ -704,9 +844,106 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
     RET
 
 ;-----------------------------------------------------------------------------
+; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+
+; Output order:
+;  0  1  2  8  9  3  4 10
+; 16 11  5  6  7 12 17 24
+; 18 13 14 15 19 25 32 26
+; 20 21 22 23 27 33 40 34
+; 28 29 30 31 35 41 48 42
+; 36 37 38 39 43 49 50 44
+; 45 46 47 51 56 57 52 53
+; 54 55 58 59 60 61 62 63
+
+cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+    movq       mm0, [r1+2*0]        ; 03 02 01 00
+    movq       mm1, [r1+2*4]        ; 07 06 05 04
+    movq       mm2, [r1+2*8]        ; 11 10 09 08
+    pshufw     mm3, mm0, 011111111b ; 03 03 03 03
+    movd        r2, mm2             ; 09 08
+    pshufw     mm2, mm2, 000111001b ; 08 11 10 09
+    punpcklwd  mm3, mm1             ; 05 03 04 03
+    pinsrw     mm0, r2, 3           ; 08 02 01 00
+    movq       mm4, mm2             ; 08 11 10 09 (kept for the [r0+2*8] store)
+    punpcklwd  mm2, mm3             ; 04 10 03 09
+    pshufw     mm2, mm2, 010110100b ; 10 04 03 09
+    movq  [r0+2*0], mm0             ; 08 02 01 00
+    movq  [r0+2*4], mm2             ; 10 04 03 09
+    movq       mm3, [r1+2*12]       ; 15 14 13 12
+    movq       mm5, [r1+2*16]       ; 19 18 17 16
+    punpckldq  mm6, mm5             ; 17 16 XX XX
+    psrlq      mm1, 16              ; XX 07 06 05
+    punpckhwd  mm6, mm4             ; 08 17 11 16
+    punpckldq  mm6, mm1             ; 06 05 11 16
+    movq  [r0+2*8], mm6             ; 06 05 11 16
+    psrlq      mm1, 16              ; XX XX 07 06
+    punpcklwd  mm1, mm5             ; 17 07 16 06
+    movq       mm0, [r1+2*20]       ; 23 22 21 20
+    movq       mm2, [r1+2*24]       ; 27 26 25 24
+    movq       mm6, mm3             ; 15 14 13 12
+    punpckhdq  mm1, mm1             ; 17 07 17 07
+    punpcklwd  mm6, mm2             ; 25 13 24 12
+    pextrw      r2, mm5, 2          ; 18
+    movq [r0+2*24], mm0             ; 23 22 21 20
+    punpcklwd  mm1, mm6             ; 24 17 12 07
+    movq [r0+2*12], mm1             ; 24 17 12 07
+    pinsrw     mm3, r2, 0           ; 15 14 13 18
+    movq [r0+2*16], mm3             ; 15 14 13 18
+    movq       mm7, [r1+2*28]       ; 31 30 29 28
+    movq       mm0, [r1+2*32]       ; 35 34 33 32
+    psrlq      mm5, 48              ; XX XX XX 19
+    pshufw     mm1, mm2, 011111001b ; 27 27 26 25
+    punpcklwd  mm5, mm0             ; 33 XX 32 19
+    psrlq      mm2, 48              ; XX XX XX 27
+    punpcklwd  mm5, mm1             ; 26 32 25 19
+    movq [r0+2*32], mm7             ; 31 30 29 28
+    movq [r0+2*20], mm5             ; 26 32 25 19
+    movq       mm7, [r1+2*36]       ; 39 38 37 36
+    movq       mm1, [r1+2*40]       ; 43 42 41 40
+    pshufw     mm3, mm0, 011111001b ; 35 35 34 33
+    punpcklwd  mm2, mm1             ; 41 XX 40 27
+    movq [r0+2*40], mm7             ; 39 38 37 36
+    punpcklwd  mm2, mm3             ; 34 40 33 27
+    movq [r0+2*28], mm2             ; 34 40 33 27
+    movq       mm7, [r1+2*44]       ; 47 46 45 44
+    movq       mm2, [r1+2*48]       ; 51 50 49 48
+    psrlq      mm0, 48              ; XX XX XX 35
+    punpcklwd  mm0, mm2             ; 49 XX 48 35
+    pshufw     mm3, mm1, 011111001b ; 43 43 42 41
+    punpcklwd  mm0, mm3             ; 42 48 41 35
+    movq [r0+2*36], mm0             ; 42 48 41 35
+    pextrw      r2, mm2, 3          ; 51
+    psrlq      mm1, 48              ; XX XX XX 43
+    punpcklwd  mm1, mm7             ; 45 XX 44 43
+    psrlq      mm2, 16              ; XX 51 50 49
+    punpcklwd  mm1, mm2             ; 50 44 49 43
+    pshufw     mm1, mm1, 010110100b ; 44 50 49 43
+    movq [r0+2*44], mm1             ; 44 50 49 43
+    psrlq      mm7, 16              ; XX 47 46 45
+    pinsrw     mm7, r2, 3           ; 51 47 46 45
+    movq [r0+2*48], mm7             ; 51 47 46 45
+    movq       mm0, [r1+2*56]       ; 59 58 57 56
+    movq       mm1, [r1+2*52]       ; 55 54 53 52
+    movq       mm2, mm0             ; 59 58 57 56
+    movq       mm7, [r1+2*60]       ; 63 62 61 60
+    punpckldq  mm2, mm1             ; 53 52 57 56
+    punpckhdq  mm1, mm0             ; 59 58 55 54
+    movq [r0+2*52], mm2             ; 53 52 57 56
+    movq [r0+2*56], mm1             ; 59 58 55 54
+    movq [r0+2*60], mm7             ; 63 62 61 60
+    RET
+
+;-----------------------------------------------------------------------------
 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
 ;-----------------------------------------------------------------------------
-cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
+%macro ZIGZAG_SUB_4x4 2             ; (ac or empty, frame or field)
+%ifidn %1, ac
+cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8   ; ac variant: 4th argument (r3) receives the first coefficient
+%else
+cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+%endif
     movd      xmm0, [r1+0*FENC_STRIDE]
     movd      xmm1, [r1+1*FENC_STRIDE]
     movd      xmm2, [r1+2*FENC_STRIDE]
@@ -725,7 +962,11 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
     punpckldq xmm6, xmm7
     punpcklqdq xmm0, xmm2
     punpcklqdq xmm4, xmm6
+%ifidn %2, frame
     movdqa    xmm7, [pb_sub4frame GLOBAL]
+%else
+    movdqa    xmm7, [pb_sub4field GLOBAL]   ; field variant: same code, different pshufb table
+%endif
     pshufb    xmm0, xmm7
     pshufb    xmm4, xmm7
     pxor      xmm6, xmm6
@@ -737,9 +978,28 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
     punpckhbw xmm5, xmm6
     psubw     xmm0, xmm4
     psubw     xmm1, xmm5
+%ifidn %1, ac
+    movd       r2d, xmm0            ; keep the low dword; its low word is level[0]
+    pand      xmm0, [pb_subacmask GLOBAL]   ; presumably clears level[0] - mask is defined outside this hunk, confirm
+%endif
     movdqa    [r0], xmm0
+    pxor      xmm2, xmm2            ; zero for the comparison below
     movdqa [r0+16], xmm1
+    por       xmm0, xmm1            ; OR of all 16 stored coefficients
+    pcmpeqb   xmm0, xmm2            ; 0xff in every byte that is zero
+    pmovmskb   eax, xmm0            ; 0xffff iff every byte is zero
+%ifidn %1, ac
+    mov       [r3], r2w             ; write the unmasked first coefficient through the 4th argument
+%endif
+    sub        eax, 0xffff          ; 0 when all zero, negative otherwise
+    shr        eax, 31              ; return value: 1 if any stored coefficient is nonzero, else 0
     RET
+%endmacro
+
+ZIGZAG_SUB_4x4   , frame
+ZIGZAG_SUB_4x4 ac, frame
+ZIGZAG_SUB_4x4   , field
+ZIGZAG_SUB_4x4 ac, field
 
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 4451821..a8f46ca 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -24,50 +24,56 @@
 #ifndef X264_I386_DCT_H
 #define X264_I386_DCT_H
 
-void x264_sub4x4_dct_mmx     ( int16_t dct[ 4][4]   ,  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_mmx     ( int16_t dct[ 4][4][4],  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_mmx   ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_sse2    ( int16_t dct[ 4][4][4],  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2  ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3   ( int16_t dct[ 4][4]   ,  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3   ( int16_t dct[ 4][4][4],  uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3 ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx      ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_mmx      ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_mmx    ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_sse2     ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2   ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3    ( int16_t dct    [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3    ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmxext( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2  ( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
 
+void x264_add4x4_idct_mmx       ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add4x4_idct_sse4      ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add8x8_idct_mmx       ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add8x8_idct_dc_mmx    ( uint8_t *p_dst, int16_t dct    [ 4] );
+void x264_add16x16_idct_mmx     ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_mmx  ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add8x8_idct_sse2      ( uint8_t *p_dst, int16_t dct[ 4][16] );
+void x264_add16x16_idct_sse2    ( uint8_t *p_dst, int16_t dct[16][16] );
+void x264_add16x16_idct_dc_sse2 ( uint8_t *p_dst, int16_t dct    [16] );
+void x264_add8x8_idct_dc_ssse3  ( uint8_t *p_dst, int16_t dct    [ 4] );
+void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct    [16] );
 
-void x264_add4x4_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4]    );
-void x264_add8x8_idct_mmx    ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_mmx  ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_sse2   ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
-void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
-void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
-void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
-void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_dct4x4dc_mmx       ( int16_t d[16] );
+void x264_idct4x4dc_mmx      ( int16_t d[16] );
 
-void x264_dct4x4dc_mmx       ( int16_t d[4][4] );
-void x264_idct4x4dc_mmx      ( int16_t d[4][4] );
+void x264_sub8x8_dct8_mmx    ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_mmx  ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_sse2   ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_sse2 ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_ssse3  ( int16_t dct   [64], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
 
-void x264_sub8x8_dct8_mmx    ( int16_t dct[8][8]   , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_mmx  ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_sse2   ( int16_t dct[8][8]   , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_sse2 ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct8_ssse3  ( int16_t dct[8][8]   , uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct8_ssse3( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
 
+void x264_add8x8_idct8_mmx   ( uint8_t *dst, int16_t dct   [64] );
+void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
+void x264_add8x8_idct8_sse2  ( uint8_t *dst, int16_t dct   [64] );
+void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
 
-void x264_add8x8_idct8_mmx   ( uint8_t *dst, int16_t dct[8][8]    );
-void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][8][8] );
-void x264_add8x8_idct8_sse2  ( uint8_t *dst, int16_t dct[8][8]    );
-void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
-
-void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_sse2  ( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
-void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_frame_mmx   ( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
-void x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2  ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_mmx   ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
+int  x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int  x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
+int  x264_zigzag_sub_4x4_field_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
+int  x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
 void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
 void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
 
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 6b435d8..f486a8d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -6,6 +6,7 @@
 ;* Authors: Loren Merritt <lorenm at u.washington.edu>
 ;*          Jason Garrett-Glaser <darkshikari at gmail.com>
 ;*          Laurent Aimar <fenrir at via.ecp.fr>
+;*          Dylan Yudaken <dyudaken at gmail.com>
 ;*          Min Chen <chenm001.163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -25,8 +26,10 @@
 
 %include "x86inc.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
+ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
+pw_1:  times 8 dw  1
 pw_4:  times 8 dw  4
 pw_8:  times 8 dw  8
 pw_32: times 8 dw 32
@@ -36,9 +39,8 @@ sw_64: dd 64
 SECTION .text
 
 ;=============================================================================
-; weighted prediction
+; implicit weighted biprediction
 ;=============================================================================
-; implicit bipred only:
 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
 %ifdef ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,10,11
@@ -63,12 +65,12 @@ SECTION .text
     %endmacro
 %endif
 
-%macro SPLATW 2
+%macro SPLATW 2-3 0
 %if mmsize==16
-    pshuflw  %1, %2, 0
+    pshuflw  %1, %2, %3*0x55      ; optional 3rd arg picks the source word: index*0x55 repeats it in all four 2-bit selectors
     punpcklqdq %1, %1
 %else
-    pshufw   %1, %2, 0
+    pshufw   %1, %2, %3*0x55      ; same selector for the 64-bit register case
 %endif
 %endmacro
 
@@ -174,6 +176,225 @@ INIT_XMM
 AVG_WEIGHT ssse3, 8,  7
 AVG_WEIGHT ssse3, 16, 7
 
+;=============================================================================
+; P frame explicit weighted prediction
+;=============================================================================
+
+%macro WEIGHT_START 1               ; (width) load the constants the WEIGHT macro reads from the struct at r4
+    mova     m3, [r4]               ; multiplier used by pmullw in WEIGHT
+    mova     m6, [r4+16]            ; rounding/offset term added by paddsw in WEIGHT
+    movd     m5, [r4+32]            ; shift count used by psraw in WEIGHT
+    pxor     m2, m2                 ; zero for byte->word unpacking
+%if (%1 == 20 || %1 == 12) && mmsize == 16
+    movdq2q mm3, xmm3               ; WEIGHT_COL switches to MMX registers for a 4-wide remainder: mirror the constants
+    movdq2q mm4, xmm4               ; NOTE(review): xmm4 is not loaded here and WEIGHT does not read m4 - looks redundant, confirm
+    movdq2q mm5, xmm5
+    movdq2q mm6, xmm6
+    pxor    mm2, mm2
+%endif
+%endmacro
+
+%macro WEIGHT_START_SSSE3 1         ; (width) load the constants the WEIGHT_SSSE3 macro reads from the struct at r4
+    mova     m3, [r4]               ; pmulhrsw multiplier
+    mova     m4, [r4+16]            ; term added by paddw after the multiply
+    pxor     m2, m2                 ; zero for byte->word unpacking
+%if %1 == 20 || %1 == 12
+    movdq2q mm3, xmm3               ; mirror the constants into MMX registers for the 4-wide remainder
+    movdq2q mm4, xmm4
+    pxor    mm2, mm2
+%endif
+%endmacro
+
+;; macro to weight mmsize bytes taking half from %1 and half from %2
+%macro WEIGHT 2             ; (src1,src2)
+    movh      m0, [%1]      ;first half of the pixels
+    movh      m1, [%2]      ;second half of the pixels
+    punpcklbw m0, m2        ;setup
+    punpcklbw m1, m2        ;setup
+    pmullw    m0, m3        ;scale
+    pmullw    m1, m3        ;scale
+    paddsw    m0, m6        ;1<<(denom-1)+(offset<<denom)
+    paddsw    m1, m6        ;1<<(denom-1)+(offset<<denom)
+    psraw     m0, m5        ;denom
+    psraw     m1, m5        ;denom
+%endmacro
+
+%macro WEIGHT_SSSE3 2       ; (src1,src2) same interface as WEIGHT; uses m3/m4 from WEIGHT_START_SSSE3
+    movh      m0, [%1]
+    movh      m1, [%2]
+    punpcklbw m0, m2        ;bytes -> words (m2 is zero)
+    punpcklbw m1, m2
+    psllw     m0, 7         ;pre-scale the pixels for the rounding high multiply
+    psllw     m1, 7
+    pmulhrsw  m0, m3        ;multiply by the constant in m3, keeping the rounded high part
+    pmulhrsw  m1, m3
+    paddw     m0, m4        ;add the constant in m4
+    paddw     m1, m4
+%endmacro
+
+%macro WEIGHT_SAVE_ROW 3        ;(src,dst,width)
+%if %3 == 16
+    mova     [%2], %1       ; full 16-byte store
+%elif %3 == 8
+    movq     [%2], %1       ; 8-byte store
+%else
+    movd     [%2], %1       ; width 2 can write garbage for last 2 bytes
+%endif
+%endmacro
+
+%macro WEIGHT_ROW 3         ; (src,dst,width)
+    ;; load weights
+    WEIGHT           %1, (%1+(mmsize/2))    ; both halves come from the same row
+    packuswb         m0, m1        ;put bytes into m0
+    WEIGHT_SAVE_ROW  m0, %2, %3
+%endmacro
+
+%macro WEIGHT_SAVE_COL 2        ;(dst,size)
+%if %2 == 8
+    packuswb     m0, m1    ; low half = m0 result, high half = m1 result
+    movq       [%1], m0    ; low half to the first row
+    movhps  [%1+r1], m0    ; high half to the row below (r1 is added as the row step)
+%else
+    packuswb     m0, m0
+    packuswb     m1, m1
+    movd       [%1], m0    ; width 2 can write garbage for last 2 bytes
+    movd    [%1+r1], m1    ; row below
+%endif
+%endmacro
+
+%macro WEIGHT_COL 3     ; (src,dst,width)
+%if %3 <= 4 && mmsize == 16
+    INIT_MMX                        ; narrow remainder of an XMM function: do it with MMX registers
+    ;; load weights
+    WEIGHT           %1, (%1+r3)    ; one half from this row, the other from the row at +r3
+    WEIGHT_SAVE_COL  %2, %3
+    INIT_XMM                        ; back to XMM registers for whatever follows
+%else
+    WEIGHT           %1, (%1+r3)    ; one half from this row, the other from the row at +r3
+    WEIGHT_SAVE_COL  %2, %3
+%endif
+
+%endmacro
+
+%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+    WEIGHT_ROW    (%1+x),    (%2+x), mmsize     ; weight 1 mmsize
+    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize     ; weight 1 mmsize
+    %assign x (x+mmsize)
+%else
+    WEIGHT_COL (%1+x),(%2+x),(%3-x)             ; remainder narrower than mmsize: both rows in one go
+    %exitrep
+%endif
+%if x >= %3
+    %exitrep
+%endif
+%endrep
+%endmacro
+
+
+;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src,int i_src_stride, x264_weight_t *weight,int h)
+
+%ifdef ARCH_X86_64
+%define NUMREGS 6
+%define LOAD_HEIGHT
+%define HEIGHT_REG r5d
+%else
+%define NUMREGS 5
+%define LOAD_HEIGHT mov r4d, r5m
+%define HEIGHT_REG r4d
+%endif
+
+%macro WEIGHTER 2                   ; (width, cpu suffix)
+    cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+    WEIGHT_START %1                 ; read the constants through r4 before LOAD_HEIGHT may reuse r4
+    LOAD_HEIGHT                     ; non-x86_64 builds: fetch the height from r5m into r4d
+.loop:
+    WEIGHT_TWO_ROW r2, r0, %1       ; r2 = source, r0 = destination
+    lea  r0, [r0+r1*2]              ; advance destination by two rows
+    lea  r2, [r2+r3*2]              ; advance source by two rows
+    sub HEIGHT_REG, 2               ; two rows per iteration
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+WEIGHTER  4, mmxext
+WEIGHTER  8, mmxext
+WEIGHTER 12, mmxext
+WEIGHTER 16, mmxext
+WEIGHTER 20, mmxext
+INIT_XMM
+WEIGHTER  8, sse2
+WEIGHTER 16, sse2
+WEIGHTER 20, sse2
+%define WEIGHT WEIGHT_SSSE3
+%define WEIGHT_START WEIGHT_START_SSSE3
+INIT_MMX                            ; the 4-wide SSSE3 version is built with MMX registers
+WEIGHTER  4, ssse3
+INIT_XMM
+WEIGHTER  8, ssse3
+WEIGHTER 16, ssse3
+WEIGHTER 20, ssse3
+
+%macro OFFSET_OP 7                  ; (src row 0, src row 1, dst row 0, dst row 1, add|sub, load suffix, store suffix)
+    mov%6        m0, [%1]
+    mov%6        m1, [%2]
+    p%5usb       m0, m2             ; paddusb / psubusb: unsigned saturating byte op with the constant in m2
+    p%5usb       m1, m2
+    mov%7      [%3], m0
+    mov%7      [%4], m1
+%endmacro
+
+%macro OFFSET_TWO_ROW 4             ; (src, dst, width, add|sub)
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a     ; full register: movu loads, mova stores
+    %assign x (x+mmsize)
+%else
+    OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d      ; remainder: 4 bytes per row via movd
+    %exitrep
+%endif
+%if x >= %3
+    %exitrep
+%endif
+%endrep
+%endmacro
+
+;void x264_mc_offset{add,sub}_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, x264_weight_t *w, int h )
+%macro OFFSET 3                     ; (width, cpu suffix, add|sub)
+    cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+    mova m2, [r4]                   ; per-byte constant, read before LOAD_HEIGHT may reuse r4
+    LOAD_HEIGHT                     ; non-x86_64 builds: fetch the height from r5m into r4d
+.loop:
+    OFFSET_TWO_ROW r2, r0, %1, %3   ; r2 = source, r0 = destination
+    lea  r0, [r0+r1*2]              ; advance destination by two rows
+    lea  r2, [r2+r3*2]              ; advance source by two rows
+    sub HEIGHT_REG, 2               ; two rows per iteration
+    jg .loop
+    REP_RET
+%endmacro
+
+%macro OFFSETPN 2                   ; (width, cpu suffix) emit both the add and the sub function
+       OFFSET %1, %2, add
+       OFFSET %1, %2, sub
+%endmacro
+INIT_MMX
+OFFSETPN  4, mmxext
+OFFSETPN  8, mmxext
+OFFSETPN 12, mmxext
+OFFSETPN 16, mmxext
+OFFSETPN 20, mmxext
+INIT_XMM
+OFFSETPN 12, sse2                   ; NOTE(review): 12 < mmsize here, so OFFSET_TWO_ROW emits only its 4-byte movd path - confirm this variant is unused
+OFFSETPN 16, sse2
+OFFSETPN 20, sse2
+%undef LOAD_HEIGHT
+%undef HEIGHT_REG
+%undef NUMREGS
+
 
 
 ;=============================================================================
@@ -510,6 +731,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
 AVG_CACHELINE_CHECK 16, 64, sse2
 AVG_CACHELINE_CHECK 20, 64, sse2
 
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment (palignr shift for [r2], palignr shift for [r2+r4])
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %2&15==0
+    movdqa  xmm1, [r2+16]
+    palignr xmm1, [r2], %1          ; 16 bytes starting at r2 + first shift
+    pavgb   xmm1, [r2+r4]           ; second shift is a multiple of 16: average straight from memory
+%else
+    movdqa  xmm1, [r2+16]
+    movdqa  xmm2, [r2+r4+16]
+    palignr xmm1, [r2], %1          ; 16 bytes starting at r2 + first shift
+    palignr xmm2, [r2+r4], %2       ; 16 bytes starting at r2 + r4 + second shift
+    pavgb   xmm1, xmm2
+%endif
+    movdqa  [r0], xmm1              ; aligned 16-byte store to the destination
+    add    r2, r3                   ; next source row
+    add    r0, r1                   ; next destination row
+    dec    r5d                      ; one row per iteration
+    jg     avg_w16_align%1_%2_ssse3
+    rep ret
+%endmacro
+
+%assign j 1
+%assign k 2
+%rep 15
+AVG16_CACHELINE_LOOP_SSSE3 j, j     ; both sources share the shift j
+AVG16_CACHELINE_LOOP_SSSE3 j, k     ; second source shifted one byte further (k = j+1)
+%assign j j+1
+%assign k k+1
+%endrep
+
+cglobal x264_pixel_avg2_w16_cache64_ssse3
+    mov    eax, r2m                 ; 3rd argument: first source pointer
+    and    eax, 0x3f                ; offset inside a 64-byte line
+    cmp    eax, 0x30
+    jle x264_pixel_avg2_w16_sse2    ; offset <= 0x30: 16 bytes fit inside the line, use the SSE2 version
+    PROLOGUE 6,7
+    lea    r6, [r4+r2]              ; sum of both source addresses; its low bits pick the loop variant
+    and    r4, ~0xf                 ; align the second source down to 16
+    and    r6, 0x1f
+    and    r2, ~0xf                 ; align the first source down to 16
+    lea    r6, [r6*3]    ;(offset + align*2)*3
+    sub    r4, r2                   ; r4 = aligned second source - aligned first source, used as [r2+r4]
+    shl    r6, 4         ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+    lea   r11, [avg_w16_addr GLOBAL]
+    add    r6, r11
+%else
+    lea    r6, [avg_w16_addr + r6 GLOBAL]
+%endif
+%ifdef UNIX64
+    jmp    r6                       ; tail-jump: the selected loop returns to our caller
+%else
+    call   r6                       ; other ABIs: come back here so RET can run
+    RET
+%endif
+
+
 ;=============================================================================
 ; pixel copy
 ;=============================================================================
@@ -869,8 +1150,9 @@ MC_CHROMA mmxext
 INIT_XMM
 MC_CHROMA sse2, 8
 
+%macro MC_CHROMA_SSSE3 2            ; (function name suffix, XMM register count passed to cglobal)
 INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,8
+cglobal x264_mc_chroma_ssse3%1, 0,6,%2
     MC_CHROMA_START
     and       r4d, 7
     and       r5d, 7
@@ -887,19 +1169,27 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
     mova       m5, [pw_32 GLOBAL]
     movd       m6, r5d
     movd       m7, r4d
-    movifnidn r0,  r0mp
+    movifnidn  r0, r0mp
     movifnidn r1d, r1m
     movifnidn r4d, r7m
     SPLATW     m6, m6
     SPLATW     m7, m7
-    movh       m0, [r2]
-    punpcklbw  m0, [r2+1]
-    add r2, r3
+    mov        r5, r2               ; keep the unaligned source address
+    and        r2, ~3               ; align the source down to 4 bytes
+    and        r5, 3                ; byte offset that was masked off
+%ifdef PIC
+    lea       r11, [ch_shuffle GLOBAL]
+    movu       m5, [r11 + r5*2]     ; shuffle mask for that offset; m5 no longer holds pw_32
+%else
+    movu       m5, [ch_shuffle + r5*2 GLOBAL]   ; shuffle mask for that offset; m5 no longer holds pw_32
+%endif
+    movu       m0, [r2]             ; one load from the aligned-down address
+    pshufb     m0, m5               ; build the (x, x+1) byte pairs the removed punpcklbw produced
 .loop4:
-    movh       m1, [r2]
-    movh       m3, [r2+r3]
-    punpcklbw  m1, [r2+1]
-    punpcklbw  m3, [r2+r3+1]
+    movu       m1, [r2+r3]          ; next row
+    pshufb     m1, m5
+    movu       m3, [r2+2*r3]        ; row after that
+    pshufb     m3, m5
     lea        r2, [r2+2*r3]
     mova       m2, m1
     mova       m4, m3
@@ -907,8 +1197,8 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
     pmaddubsw  m1, m6
     pmaddubsw  m2, m7
     pmaddubsw  m3, m6
-    paddw      m0, m5
-    paddw      m2, m5
+    paddw      m0, [pw_32 GLOBAL]   ; rounding constant from memory, since m5 holds the shuffle mask
+    paddw      m2, [pw_32 GLOBAL]
     paddw      m1, m0
     paddw      m3, m2
     mova       m0, m4
@@ -925,23 +1215,28 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
 
 INIT_XMM
 .width8:
-    mova       m5, [pw_32 GLOBAL]
     movd       m6, r5d
     movd       m7, r4d
-    movifnidn r0,  r0mp
+    movifnidn  r0, r0mp
     movifnidn r1d, r1m
     movifnidn r4d, r7m
     SPLATW     m6, m6
     SPLATW     m7, m7
+%ifidn %1, _cache64
+    mov        r5, r2
+    and        r5, 0x3f             ; source offset inside a 64-byte line
+    cmp        r5, 0x38
+    jge .split                      ; offset >= 0x38: take the single-load pshufb path
+%endif
+    mova       m5, [pw_32 GLOBAL]   ; rounding constant for the regular 8-wide loop
     movh       m0, [r2]
     movh       m1, [r2+1]
     punpcklbw  m0, m1
-    add r2, r3
 .loop8:
-    movh       m1, [r2]
-    movh       m2, [r2+1]
-    movh       m3, [r2+r3]
-    movh       m4, [r2+r3+1]
+    movh       m1, [r2+1*r3]        ; r2 is no longer pre-advanced, so rows are addressed at +r3 and +2*r3
+    movh       m2, [r2+1*r3+1]
+    movh       m3, [r2+2*r3]
+    movh       m4, [r2+2*r3+1]
     punpcklbw  m1, m2
     punpcklbw  m3, m4
     lea        r2, [r2+2*r3]
@@ -965,6 +1260,53 @@ INIT_XMM
     lea        r0, [r0+2*r1]
     jg .loop8
     REP_RET
-
+%ifidn %1, _cache64
+.split:
+    and        r2, ~7               ; align the source down to 8 bytes
+    and        r5, 7                ; byte offset that was masked off
+%ifdef PIC
+    lea       r11, [ch_shuffle GLOBAL]
+    movu       m5, [r11 + r5*2]     ; shuffle mask for that offset
+%else
+    movu       m5, [ch_shuffle + r5*2 GLOBAL]   ; shuffle mask for that offset
+%endif
+    movu       m0, [r2]             ; one load from the aligned-down address
+    pshufb     m0, m5               ; build the (x, x+1) byte pairs
+%ifdef ARCH_X86_64
+    mova       m8, [pw_32 GLOBAL]   ; ninth register: rounding constant kept in m8
+    %define round m8
+%else
+    %define round [pw_32 GLOBAL]
+%endif
+.splitloop8:
+    movu       m1, [r2+r3]          ; next row
+    pshufb     m1, m5
+    movu       m3, [r2+2*r3]        ; row after that
+    pshufb     m3, m5
+    lea        r2, [r2+2*r3]
+    mova       m2, m1
+    mova       m4, m3
+    pmaddubsw  m0, m7
+    pmaddubsw  m1, m6
+    pmaddubsw  m2, m7
+    pmaddubsw  m3, m6
+    paddw      m0, round
+    paddw      m2, round
+    paddw      m1, m0
+    paddw      m3, m2
+    mova       m0, m4               ; carry the last row into the next iteration
+    psrlw      m1, 6
+    psrlw      m3, 6
+    packuswb   m1, m3
+    movh     [r0], m1               ; first output row
+    movhps [r0+r1], m1              ; second output row
+    sub       r4d, 2                ; two rows per iteration
+    lea        r0, [r0+2*r1]
+    jg .splitloop8
+    REP_RET
+%endif
 ; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
+%endmacro
 
+MC_CHROMA_SSSE3 , 8
+MC_CHROMA_SSSE3 _cache64, 9
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 9745ac6..245c09f 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5
 pw_1:  times 8 dw 1
 pw_16: times 8 dw 16
 pw_32: times 8 dw 32
+pd_128: times 4 dd 128
 
 SECTION .text
 
@@ -1081,3 +1082,43 @@ INIT_XMM
 FRAME_INIT_LOWRES sse2, 12
 %define PALIGNR PALIGNR_SSSE3
 FRAME_INIT_LOWRES ssse3, 12
+
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+;-----------------------------------------------------------------------------
+cglobal x264_mbtree_propagate_cost_sse2, 6,6
+    shl r5d, 1                  ; len*2 = byte length of each uint16_t input
+    lea r0, [r0+r5*2]           ; dst holds ints: its end is len*4 bytes away
+    add r1, r5                  ; point every input at its end ...
+    add r2, r5
+    add r3, r5
+    add r4, r5
+    neg r5                      ; ... and index upward from -len*2 to 0
+    pxor      xmm5, xmm5        ; zero for word->dword unpacking
+    movdqa    xmm4, [pd_128 GLOBAL] ; rounding term for the >>8 below
+.loop:
+    movq      xmm2, [r2+r5] ; intra
+    movq      xmm0, [r4+r5] ; invq
+    punpcklwd xmm2, xmm5
+    punpcklwd xmm0, xmm5
+    pmaddwd   xmm0, xmm2
+    paddd     xmm0, xmm4
+    psrld     xmm0, 8       ; intra*invq>>8
+    movq      xmm1, [r1+r5] ; prop
+    movq      xmm3, [r3+r5] ; inter
+    punpcklwd xmm1, xmm5
+    punpcklwd xmm3, xmm5
+    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
+    cvtdq2ps  xmm1, xmm2    ; intra
+    psubd     xmm2, xmm3    ; intra - inter
+    cvtdq2ps  xmm0, xmm0
+    cvtdq2ps  xmm2, xmm2
+    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
+    divps     xmm0, xmm1    ; / intra
+    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
+    movdqa [r0+r5*2], xmm0  ; four int results, aligned store
+    add r5, 8               ; four uint16_t elements per iteration
+    jl .loop
+    REP_RET
+
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index aede5b8..b3683a3 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -42,6 +42,32 @@ DECL_SUF( x264_pixel_avg_8x4,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int
 DECL_SUF( x264_pixel_avg_4x8,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
 DECL_SUF( x264_pixel_avg_4x4,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
 DECL_SUF( x264_pixel_avg_4x2,   ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
+
+#define MC_WEIGHT(w,type) /* prototype of the explicit-weight kernel for width w */ \
+    extern void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+
+#define MC_WEIGHT_OFFSET(w,type) /* offset-add and offset-sub prototypes plus the matching weight prototype */ \
+    extern void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    extern void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
+    MC_WEIGHT(w,type)
+
+MC_WEIGHT_OFFSET( 4, mmxext )
+MC_WEIGHT_OFFSET( 8, mmxext )
+MC_WEIGHT_OFFSET( 12, mmxext )
+MC_WEIGHT_OFFSET( 16, mmxext )
+MC_WEIGHT_OFFSET( 20, mmxext )
+MC_WEIGHT_OFFSET( 12, sse2 )
+MC_WEIGHT_OFFSET( 16, sse2 )
+MC_WEIGHT_OFFSET( 20, sse2 )
+MC_WEIGHT( 8, sse2  )
+MC_WEIGHT( 4, ssse3 )
+MC_WEIGHT( 8, ssse3 )
+MC_WEIGHT( 12, ssse3 )
+MC_WEIGHT( 16, ssse3 )
+MC_WEIGHT( 20, ssse3 )
+#undef MC_WEIGHT_OFFSET /* both declaration helpers are dropped once the prototypes exist */
+#undef MC_WEIGHT
+
 extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
@@ -59,6 +85,9 @@ extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
 extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
                                   uint8_t *dst, int i_dst_stride,
                                   int dx, int dy, int i_width, int i_height );
+extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
+                                  uint8_t *dst, int i_dst_stride,
+                                  int dx, int dy, int i_width, int i_height );
 extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
 extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
 extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
@@ -71,6 +100,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid
 extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
 extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 extern void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                             uint16_t *inter_costs, uint16_t *inv_qscales, int len );
 #define LOWRES(cpu) \
 extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                                int src_stride, int dst_stride, int width, int height );
@@ -91,6 +122,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
 PIXEL_AVG_WALL(cache64_sse2)
 PIXEL_AVG_WALL(sse2)
 PIXEL_AVG_WALL(sse2_misalign)
+PIXEL_AVG_WALL(cache64_ssse3)
 
 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
 static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -116,6 +148,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
 PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -130,6 +163,70 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
 
+#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version) /* defines the 6-entry table x264_mc_<function>_wtab_<instr>; users index it with i_width>>2 */\
+    static void (* x264_mc_##function##_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+{\
+    x264_mc_##function##_w4_##name1, /* [0]: same w4 routine as [1] */\
+    x264_mc_##function##_w4_##name1, /* [1]: w4, flavour name1 */\
+    x264_mc_##function##_w8_##name2, /* [2]: w8, flavour name2 */\
+    x264_mc_##function##_w##w12version##_##instr, /* [3]: w12 or w16 routine, selected by w12version */\
+    x264_mc_##function##_w16_##instr, /* [4]: w16 */\
+    x264_mc_##function##_w20_##instr, /* [5]: w20 */\
+};
+
+MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12) /* mmxext tables use a dedicated w12 routine in slot [3] */
+MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,16) /* sse2 tables: w4 stays mmxext, slot [3] reuses the w16 sse2 routine */
+MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,mmxext,16) /* offset tables keep mmxext for both w4 and w8 */
+MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,mmxext,16)
+MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16) /* only the weight table has an ssse3 flavour; no ssse3 offset tables are defined here */
+
+static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w ) /* picks w->weightfn from h->mc and fills w->cachea/cacheb for it */
+{
+    int i;
+    int16_t den1; /* value replicated into cacheb on the full-weight path */
+
+    if( w->i_scale == 1<<w->i_denom ) /* scale equals 1<<denom: use the offset-only tables */
+    {
+        if( w->i_offset < 0 )
+            w->weightfn = h->mc.offsetsub; /* negative offset */
+        else
+            w->weightfn = h->mc.offsetadd; /* zero or positive offset */
+        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) ); /* every byte of cachea = low 8 bits of |i_offset| */
+        return; /* cacheb is left untouched on this path */
+    }
+    w->weightfn = h->mc.weight;
+    den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom; /* NOTE(review): shift count is -1 when i_denom == 0 -- confirm callers never reach here with denom 0 */
+    for( i = 0; i < 8; i++ ) /* replicate both constants into the first 8 elements */
+    {
+        w->cachea[i] = w->i_scale;
+        w->cacheb[i] = den1;
+    }
+}
+
+static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w ) /* ssse3 counterpart of x264_weight_cache_mmxext: same offset-only path, different constants on the weight path */
+{
+    int i, den1;
+    if( w->i_scale == 1<<w->i_denom ) /* scale equals 1<<denom: use the offset-only tables */
+    {
+        if( w->i_offset < 0 )
+            w->weightfn = h->mc.offsetsub; /* negative offset */
+        else
+            w->weightfn = h->mc.offsetadd; /* zero or positive offset */
+
+        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) ); /* every byte of cachea = low 8 bits of |i_offset| */
+        return; /* cacheb is left untouched on this path */
+    }
+    w->weightfn = h->mc.weight;
+    den1 = w->i_scale << (8 - w->i_denom); /* scale pre-shifted by (8 - denom); NOTE(review): negative shift if i_denom > 8 -- confirm the valid range */
+    for(i = 0;i<8;i++) /* replicate both constants into the first 8 elements */
+    {
+        w->cachea[i] = den1;
+        w->cacheb[i] = w->i_offset; /* raw offset, not folded into den1 as in the mmxext variant */
+    }
+}
+
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
 
@@ -137,7 +234,7 @@ static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
 static void mc_luma_##name( uint8_t *dst,    int i_dst_stride,\
                   uint8_t *src[4], int i_src_stride,\
                   int mvx, int mvy,\
-                  int i_width, int i_height )\
+                  int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -148,12 +245,13 @@ static void mc_luma_##name( uint8_t *dst,    int i_dst_stride,\
         x264_pixel_avg_wtab_##instr1[i_width>>2](\
                 dst, i_dst_stride, src1, i_src_stride,\
                 src2, i_height );\
+        if( weight->weightfn )\
+            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
     }\
+    else if( weight->weightfn )\
+        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
     else\
-    {\
-        x264_mc_copy_wtab_##instr2[i_width>>2](\
-                dst, i_dst_stride, src1, i_src_stride, i_height );\
-    }\
+        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
 }
 
 MC_LUMA(mmxext,mmxext,mmx)
@@ -163,12 +261,13 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
 
 #define GET_REF(name)\
 static uint8_t *get_ref_##name( uint8_t *dst,   int *i_dst_stride,\
                          uint8_t *src[4], int i_src_stride,\
                          int mvx, int mvy,\
-                         int i_width, int i_height )\
+                         int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -179,6 +278,13 @@ static uint8_t *get_ref_##name( uint8_t *dst,   int *i_dst_stride,\
         x264_pixel_avg_wtab_##name[i_width>>2](\
                 dst, *i_dst_stride, src1, i_src_stride,\
                 src2, i_height );\
+        if( weight->weightfn )\
+            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
+        return dst;\
+    }\
+    else if( weight->weightfn )\
+    {\
+        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
         return dst;\
     }\
     else\
@@ -196,6 +302,7 @@ GET_REF(cache64_mmxext)
 GET_REF(sse2)
 GET_REF(sse2_misalign)
 GET_REF(cache64_sse2)
+GET_REF(cache64_ssse3)
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -257,6 +364,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->get_ref = get_ref_mmxext;
     pf->mc_chroma = x264_mc_chroma_mmxext;
 
+    pf->weight = x264_mc_weight_wtab_mmxext;
+    pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
+    pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
+    pf->weight_cache = x264_weight_cache_mmxext;
+
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
     pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;
@@ -296,10 +408,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->integral_init4v = x264_integral_init4v_sse2;
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
 
     if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;
 
+    pf->weight = x264_mc_weight_wtab_sse2;
+    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
+    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
+
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
@@ -340,6 +457,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->hpel_filter = x264_hpel_filter_ssse3;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
+    if( cpu&X264_CPU_CACHELINE_64 )
+    {
+        pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+        pf->mc_luma = mc_luma_cache64_ssse3;
+        pf->get_ref = get_ref_cache64_ssse3;
+
+        /* ssse3 weight is slower on Nehalem, so only assign here. */
+        pf->weight_cache = x264_weight_cache_ssse3;
+        pf->weight = x264_mc_weight_wtab_ssse3;
+    }
 
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->integral_init4v = x264_integral_init4v_ssse3;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6a19c40..d94daaf 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -77,13 +77,16 @@ SECTION .text
 ;=============================================================================
 
 %macro SSD_LOAD_FULL 5
-    mova      m1, [r0+%1]
-    mova      m2, [r2+%2]
-    mova      m3, [r0+%3]
-    mova      m4, [r2+%4]
-%if %5
-    lea       r0, [r0+2*r1]
-    lea       r2, [r2+2*r3]
+    mova      m1, [t0+%1]
+    mova      m2, [t2+%2]
+    mova      m3, [t0+%3]
+    mova      m4, [t2+%4]
+%if %5==1
+    add       t0, t1
+    add       t2, t3
+%elif %5==2
+    lea       t0, [t0+2*t1]
+    lea       t2, [t2+2*t3]
 %endif
 %endmacro
 
@@ -91,7 +94,7 @@ SECTION .text
     movh      m%1, %3
     movh      m%2, %4
 %if %5
-    lea       r0, [r0+2*r1]
+    lea       t0, [t0+2*t1]
 %endif
 %endmacro
 
@@ -99,7 +102,7 @@ SECTION .text
     movh      m%3, %5
     movh      m%4, %6
 %if %7
-    lea       r2, [r2+2*r3]
+    lea       t2, [t2+2*t3]
 %endif
     punpcklbw m%1, m7
     punpcklbw m%3, m7
@@ -113,7 +116,7 @@ SECTION .text
     movh      m%3, %5
     movh      m%4, %6
 %if %7
-    lea       r2, [r2+2*r3]
+    lea       t2, [t2+2*t3]
 %endif
     punpcklqdq m%1, m%2
     punpcklqdq m%3, m%4
@@ -126,17 +129,17 @@ SECTION .text
     movh      m%3, %5
     movh      m%4, %6
 %if %7
-    lea       r2, [r2+2*r3]
+    lea       t2, [t2+2*t3]
 %endif
     punpcklbw m%1, m%3
     punpcklbw m%2, m%4
 %endmacro
 
 %macro SSD_LOAD_HALF 5
-    LOAD      1, 2, [r0+%1], [r0+%3], 1
-    JOIN      1, 2, 3, 4, [r2+%2], [r2+%4], 1
-    LOAD      3, 4, [r0+%1], [r0+%3], %5
-    JOIN      3, 4, 5, 6, [r2+%2], [r2+%4], %5
+    LOAD      1, 2, [t0+%1], [t0+%3], 1
+    JOIN      1, 2, 3, 4, [t2+%2], [t2+%4], 1
+    LOAD      3, 4, [t0+%1], [t0+%3], %5
+    JOIN      3, 4, 5, 6, [t2+%2], [t2+%4], %5
 %endmacro
 
 %macro SSD_CORE 7-8
@@ -152,8 +155,8 @@ SECTION .text
     mova      m%2, m%1
     mova      m%4, m%3
     punpckhbw m%1, m%5
-    punpckhbw m%3, m%5
     punpcklbw m%2, m%5
+    punpckhbw m%3, m%5
     punpcklbw m%4, m%5
 %endif
     pmaddwd   m%1, m%1
@@ -167,11 +170,11 @@ SECTION .text
     DEINTB %6, %1, %7, %2, %5
     psubw m%6, m%7
     psubw m%1, m%2
-    SWAP %2, %6
+    SWAP %6, %2, %1
     DEINTB %6, %3, %7, %4, %5
     psubw m%6, m%7
     psubw m%3, m%4
-    SWAP %4, %6
+    SWAP %6, %4, %3
 %endif
     pmaddwd   m%1, m%1
     pmaddwd   m%2, m%2
@@ -187,7 +190,7 @@ SECTION .text
     punpcklbw m%3, m%4
     punpckhbw m%6, m%2
     punpckhbw m%7, m%4
-    SWAP %6, %2
+    SWAP %6, %2, %3
     SWAP %7, %4
 %endif
     pmaddubsw m%1, m%5
@@ -200,28 +203,46 @@ SECTION .text
     pmaddwd   m%4, m%4
 %endmacro
 
-%macro SSD_END 1
+%macro SSD_ITER 6
+    SSD_LOAD_%1 %2,%3,%4,%5,%6
+    SSD_CORE  1, 2, 3, 4, 7, 5, 6, %1
     paddd     m1, m2
     paddd     m3, m4
-%if %1
     paddd     m0, m1
-%else
-    SWAP      0, 1
-%endif
     paddd     m0, m3
 %endmacro
 
-%macro SSD_ITER 7
-    SSD_LOAD_%1 %2,%3,%4,%5,%7
-    SSD_CORE  1, 2, 3, 4, 7, 5, 6, %1
-    SSD_END  %6
-%endmacro
-
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SSD 3-4 0
-cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
+%if %1 != %2
+    %assign function_align 8
+%else
+    %assign function_align 16
+%endif
+cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+    mov     al, %1*%2/mmsize/2
+
+%if %1 != %2
+    jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
+%else
+
+.startloop:
+%ifdef ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3
+%ifnidn %3, mmx
+    PROLOGUE 0,0,8
+%endif
+%else
+    PROLOGUE 0,5
+    DECLARE_REG_TMP 1,2,3,4
+    mov t0, r0m
+    mov t1, r1m
+    mov t2, r2m
+    mov t3, r3m
+%endif
+
 %ifidn %3, ssse3
     mova    m7, [hsub_mul GLOBAL]
 %elifidn %3, sse2
@@ -229,57 +250,57 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
 %elif %1 >= mmsize
     pxor    m7, m7
 %endif
-%assign i 0
-%rep %2/4
+    pxor    m0, m0
+
+ALIGN 16
+.loop:
 %if %1 > mmsize
-    SSD_ITER FULL, 0,  0,     mmsize,    mmsize, i, 0
-    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
-    SSD_ITER FULL, 0,  0,     mmsize,    mmsize, 1, 0
-    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
+    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
 %elif %1 == mmsize
-    SSD_ITER FULL, 0, 0, r1, r3, i, 1
-    SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
+    SSD_ITER FULL, 0, 0, t1, t3, 2
 %else
-    SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
+    SSD_ITER HALF, 0, 0, t1, t3, 2
 %endif
-%assign i i+1
-%endrep
+    dec     al
+    jg .loop
     HADDD   m0, m1
     movd   eax, m0
     RET
+%endif
 %endmacro
 
 INIT_MMX
 SSD 16, 16, mmx
 SSD 16,  8, mmx
-SSD  8, 16, mmx
 SSD  8,  8, mmx
+SSD  8, 16, mmx
+SSD  4,  4, mmx
 SSD  8,  4, mmx
 SSD  4,  8, mmx
-SSD  4,  4, mmx
 INIT_XMM
 SSD 16, 16, sse2slow, 8
+SSD  8,  8, sse2slow, 8
 SSD 16,  8, sse2slow, 8
 SSD  8, 16, sse2slow, 8
-SSD  8,  8, sse2slow, 8
 SSD  8,  4, sse2slow, 8
 %define SSD_CORE SSD_CORE_SSE2
 %define JOIN JOIN_SSE2
 SSD 16, 16, sse2, 8
+SSD  8,  8, sse2, 8
 SSD 16,  8, sse2, 8
 SSD  8, 16, sse2, 8
-SSD  8,  8, sse2, 8
 SSD  8,  4, sse2, 8
 %define SSD_CORE SSD_CORE_SSSE3
 %define JOIN JOIN_SSSE3
 SSD 16, 16, ssse3, 8
+SSD  8,  8, ssse3, 8
 SSD 16,  8, ssse3, 8
 SSD  8, 16, ssse3, 8
-SSD  8,  8, ssse3, 8
 SSD  8,  4, ssse3, 8
 INIT_MMX
-SSD  4,  8, ssse3
 SSD  4,  4, ssse3
+SSD  4,  8, ssse3
+%assign function_align 16
 
 ;=============================================================================
 ; variance
@@ -295,14 +316,15 @@ SSD  4,  4, ssse3
 %endif
 %endmacro
 
-%macro VAR_END 1
+%macro VAR_END 0
     HADDW   m5, m7
-    movd   r1d, m5
-    imul   r1d, r1d
+    movd   eax, m5
     HADDD   m6, m1
-    shr    r1d, %1
-    movd   eax, m6
-    sub    eax, r1d  ; sqr - (sum * sum >> shift)
+    movd   edx, m6
+%ifdef ARCH_X86_64
+    shl    rdx, 32
+    add    rax, rdx
+%endif
     RET
 %endmacro
 
@@ -349,12 +371,12 @@ INIT_MMX
 cglobal x264_pixel_var_16x16_mmxext, 2,3
     VAR_START 0
     VAR_2ROW 8, 16
-    VAR_END 8
+    VAR_END
 
 cglobal x264_pixel_var_8x8_mmxext, 2,3
     VAR_START 0
     VAR_2ROW r1, 4
-    VAR_END 6
+    VAR_END
 
 INIT_XMM
 cglobal x264_pixel_var_16x16_sse2, 2,3,8
@@ -368,7 +390,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8
     VAR_CORE
     dec r2d
     jg .loop
-    VAR_END 8
+    VAR_END
 
 cglobal x264_pixel_var_8x8_sse2, 2,4,8
     VAR_START 1
@@ -384,8 +406,121 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
     VAR_CORE
     dec r2d
     jg .loop
-    VAR_END 6
+    VAR_END
 
+%macro VAR2_END 0
+    HADDW   m5, m7 ; shared tail of the var2_8x8 functions; presumably reduces the word sums in m5 to one value (HADDW is defined outside this view)
+    movd   r1d, m5 ; r1d = reduced sum
+    imul   r1d, r1d ; r1d = sum * sum
+    HADDD   m6, m1 ; presumably reduces the dword accumulators in m6, with m1 as scratch -- confirm against HADDD
+    shr    r1d, 6 ; shift is fixed at 6 (2^6 = 64 = pixels in the 8x8 callers)
+    movd   eax, m6 ; eax = reduced sum of squares
+    mov   [r4], eax ; store the sum of squares through r4 before the correction below
+    sub    eax, r1d  ; sqr - (sum * sum >> shift)
+    RET ; the macro itself returns, so a RET placed after VAR2_END at a call site is never reached
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * )
+;-----------------------------------------------------------------------------
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal x264_pixel_var2_8x8_mmxext, 5,6 ; MMX version, assembled only when ARCH_X86_64 is not defined; one 8-pixel row per iteration
+    VAR_START 0 ; defined outside this view; assumed to clear m5/m6 and zero m7 -- TODO confirm
+    mov      r5d, 8 ; loop counter: 8 rows
+.loop:
+    movq      m0, [r0] ; 8 bytes from the first block
+    movq      m1, m0 ; copy for the high half
+    movq      m4, m0 ; NOTE(review): m4 is not read again anywhere in the visible code
+    movq      m2, [r2] ; 8 bytes from the second block
+    movq      m3, m2 ; copy for the high half
+    punpcklbw m0, m7 ; interleave low bytes with m7 (bytes -> words if m7 is zero)
+    punpckhbw m1, m7 ; same for the high bytes
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    psubw     m0, m2 ; word differences, low half
+    psubw     m1, m3 ; word differences, high half
+    paddw     m5, m0 ; m5 accumulates the differences
+    paddw     m5, m1
+    pmaddwd   m0, m0 ; square the differences and add adjacent pairs
+    pmaddwd   m1, m1
+    paddd     m6, m0 ; m6 accumulates the squared differences
+    paddd     m6, m1
+    add       r0, r1 ; advance the first pointer by r1
+    add       r2, r3 ; advance the second pointer by r3
+    dec       r5d
+    jg .loop
+    VAR2_END ; reduces m5/m6, stores through r4 and returns
+    RET ; never reached: VAR2_END already ends in RET
+%endif
+
+INIT_XMM
+cglobal x264_pixel_var2_8x8_sse2, 5,6,8 ; SSE2 version: two 8-pixel rows per iteration
+    VAR_START 1 ; defined outside this view; assumed to clear m5/m6 and set up m7 for DEINTB -- TODO confirm
+    mov      r5d, 4 ; loop counter: 4 iterations x 2 rows
+.loop:
+    movq      m1, [r0] ; first block, row 0 in the low half
+    movhps    m1, [r0+r1] ; first block, row 1 in the high half
+    movq      m3, [r2] ; second block, row 0 in the low half
+    movhps    m3, [r2+r3] ; second block, row 1 in the high half
+    DEINTB    0, 1, 2, 3, 7 ; macro defined outside this view; presumably splits the bytes of m1/m3 into words in m0..m3 -- confirm
+    psubw     m0, m2 ; word differences
+    psubw     m1, m3
+    paddw     m5, m0 ; m5 accumulates the differences
+    paddw     m5, m1
+    pmaddwd   m0, m0 ; square the differences and add adjacent pairs
+    pmaddwd   m1, m1
+    paddd     m6, m0 ; m6 accumulates the squared differences
+    paddd     m6, m1
+    lea       r0, [r0+r1*2] ; advance the first pointer by two rows
+    lea       r2, [r2+r3*2] ; advance the second pointer by two rows
+    dec      r5d
+    jg .loop
+    VAR2_END ; reduces m5/m6, stores through r4 and returns
+    RET ; never reached: VAR2_END already ends in RET
+
+cglobal x264_pixel_var2_8x8_ssse3, 5,6,8 ; SSSE3 version: four 8-pixel rows per iteration
+    pxor      m5, m5    ; sum
+    pxor      m6, m6    ; sum squared
+    mova      m7, [hsub_mul GLOBAL] ; multiplier constant for pmaddubsw (defined outside this view)
+    mov      r5d, 2 ; loop counter: 2 iterations x 4 rows
+.loop:
+    movq      m0, [r0] ; first block, row 0
+    movq      m2, [r2] ; second block, row 0
+    movq      m1, [r0+r1] ; first block, row 1
+    movq      m3, [r2+r3] ; second block, row 1
+    lea       r0, [r0+r1*2] ; advance both pointers by two rows
+    lea       r2, [r2+r3*2]
+    punpcklbw m0, m2 ; interleave bytes of both blocks, row 0
+    punpcklbw m1, m3 ; interleave bytes of both blocks, row 1
+    movq      m2, [r0] ; first block, row 2
+    movq      m3, [r2] ; second block, row 2
+    punpcklbw m2, m3 ; interleave bytes of both blocks, row 2
+    movq      m3, [r0+r1] ; first block, row 3
+    movq      m4, [r2+r3] ; second block, row 3
+    punpcklbw m3, m4 ; interleave bytes of both blocks, row 3
+    pmaddubsw m0, m7 ; multiply-add each byte pair by hsub_mul; presumably yields per-pixel differences as words -- confirm against hsub_mul
+    pmaddubsw m1, m7
+    pmaddubsw m2, m7
+    pmaddubsw m3, m7
+    paddw     m5, m0 ; accumulate the four rows into the sum
+    paddw     m5, m1
+    paddw     m5, m2
+    paddw     m5, m3
+    pmaddwd   m0, m0 ; square the words and add adjacent pairs
+    pmaddwd   m1, m1
+    pmaddwd   m2, m2
+    pmaddwd   m3, m3
+    paddd     m6, m0 ; accumulate the four rows into the sum of squares
+    paddd     m6, m1
+    paddd     m6, m2
+    paddd     m6, m3
+    lea       r0, [r0+r1*2] ; advance both pointers by two more rows
+    lea       r2, [r2+r3*2]
+    dec      r5d
+    jg .loop
+    VAR2_END ; reduces m5/m6, stores through r4 and returns
+    RET ; never reached: VAR2_END already ends in RET
 
 ;=============================================================================
 ; SATD
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 312aca8..9bba683 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -69,8 +69,8 @@ DECL_X4( sad, cache64_mmxext );
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );
 
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
-DECL_PIXELS( int, var, sse2,   ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, ssse3,  ( uint8_t *pix, int i_stride ))
@@ -102,6 +102,9 @@ void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
 void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
                                       const uint8_t *pix2, int stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
+int  x264_pixel_var2_8x8_mmxext( uint8_t *, int, uint8_t *, int, int * ); /* asm body in pixel-a.asm is assembled only when ARCH_X86_64 is not defined */
+int  x264_pixel_var2_8x8_sse2( uint8_t *, int, uint8_t *, int, int * ); /* returns sqr - (sum*sum >> 6); the asm also writes sqr through the final int* */
+int  x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * ); /* same contract as the sse2 version */
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 0248038..602ddcd 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -75,9 +75,9 @@ extern void predict_16x16_dc_left_core_sse2( uint8_t *src, int i_dc_left );
 extern void predict_16x16_v_sse2( uint8_t *src );
 extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
 
-DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
 
 #define PREDICT_P_SUM(j,i)\
     H += i * ( src[j+i - FDEC_STRIDE ]  - src[j-i - FDEC_STRIDE ] );\
@@ -266,12 +266,12 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc0;
+        M64( src ) = dc0;
         src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc1;
+        M64( src ) = dc1;
         src += FDEC_STRIDE;
     }
 
@@ -296,8 +296,8 @@ static void predict_8x8c_dc_left( uint8_t *src )
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
         src += FDEC_STRIDE; \
     }
 
@@ -332,7 +332,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
     PREDICT_8x8_LOAD_TOP\
     PREDICT_8x8_LOAD_LEFT\
     int t;\
-    DECLARE_ALIGNED_16( int16_t sa8d_1d[2][8] );\
+    ALIGNED_16( int16_t sa8d_1d[2][8] );\
     SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
     SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
     SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 284cbbb..52e121a 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -305,6 +305,7 @@ QUANT_AC x264_quant_8x8_sse4, 8
 
 %macro DEQUANT16_FLAT 2-5
     mova   m0, %1
+    psllw  m0, m4
 %assign i %0-2
 %rep %0-1
 %if i
@@ -313,7 +314,6 @@ QUANT_AC x264_quant_8x8_sse4, 8
 %else
     pmullw m0, [r0+%2]
 %endif
-    psllw  m %+ i, m4
     mova   [r0+%2], m %+ i
     %assign i i-1
     %rotate 1
diff --git a/common/x86/quant.h b/common/x86/quant.h
index dff60a8..4e42b81 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -24,30 +24,30 @@
 #ifndef X264_I386_QUANT_H
 #define X264_I386_QUANT_H
 
-int x264_quant_2x2_dc_mmxext( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_mmxext( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_mmx( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_mmx( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse2( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse2( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse2( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_2x2_dc_ssse3( int16_t dct[2][2], int mf, int bias );
-int x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_ssse3( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_ssse3( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-int x264_quant_4x4_dc_sse4( int16_t dct[4][4], int mf, int bias );
-int x264_quant_4x4_sse4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
-int x264_quant_8x8_sse4( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
-void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_mmxext( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
-void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
-void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+int x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_2x2_dc_ssse3( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_ssse3( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_ssse3( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_ssse3( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+int x264_quant_4x4_dc_sse4( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_sse4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_sse4( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 void x264_denoise_dct_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 void x264_denoise_dct_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 68d8584..342a984 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -28,9 +28,8 @@
 
 SECTION_RODATA
 pb_3: times 16 db 3
+pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
 pw_8: times 4 dw 8
-pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
-pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
 sw_64: dd 64
 
 SECTION .text
@@ -450,16 +449,32 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
     psrlw       m0, 2
     pavgw       m0, m7 ; s0+s2, s1, s3, s1+s3
 %ifidn %1, ssse3
-    movq        m1, m0
-    pshufb      m0, [pb_shuf8x8c0 GLOBAL]
-    pshufb      m1, [pb_shuf8x8c1 GLOBAL]
+    movq2dq   xmm0, m0
+    pshufb    xmm0, [pb_shuf8x8c GLOBAL]
+    movq      xmm1, [r0+FENC_STRIDE*0]
+    movq      xmm2, [r0+FENC_STRIDE*1]
+    movq      xmm3, [r0+FENC_STRIDE*2]
+    movq      xmm4, [r0+FENC_STRIDE*3]
+    movhps    xmm1, [r0+FENC_STRIDE*4]
+    movhps    xmm2, [r0+FENC_STRIDE*5]
+    movhps    xmm3, [r0+FENC_STRIDE*6]
+    movhps    xmm4, [r0+FENC_STRIDE*7]
+    psadbw    xmm1, xmm0
+    psadbw    xmm2, xmm0
+    psadbw    xmm3, xmm0
+    psadbw    xmm4, xmm0
+    paddw     xmm1, xmm2
+    paddw     xmm1, xmm3
+    paddw     xmm1, xmm4
+    movhlps   xmm0, xmm1
+    paddw     xmm1, xmm0
+    movd      [r2], xmm1
 %else
     packuswb    m0, m0
     punpcklbw   m0, m0
     movq        m1, m0
     punpcklbw   m0, m0 ; 4x dc0 4x dc1
     punpckhbw   m1, m1 ; 4x dc2 4x dc3
-%endif
     movq        m2, [r0+FENC_STRIDE*0]
     movq        m3, [r0+FENC_STRIDE*1]
     movq        m4, [r0+FENC_STRIDE*2]
@@ -483,6 +498,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
     paddw       m6, m0
     paddw       m2, m6
     movd      [r2], m2
+%endif
     RET
 %endmacro
 
diff --git a/common/x86/util.h b/common/x86/util.h
index ab1e208..efc700a 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -38,8 +38,8 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
         "pminsw %%mm2, %%mm0 \n"
         "pmaxsw %%mm1, %%mm0 \n"
         "movd   %%mm0, %0    \n"
-        :"=m"(*(uint32_t*)dst)
-        :"m"(*(uint32_t*)a), "m"(*(uint32_t*)b), "m"(*(uint32_t*)c)
+        :"=m"(*(x264_union32_t*)dst)
+        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
     );
 }
 #define x264_predictor_difference x264_predictor_difference_mmxext
@@ -69,44 +69,11 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "jg 1b                \n"
         "movq    %%mm4, %0    \n"
         :"=m"(output), "+r"(i_mvc)
-        :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
+        :"r"(mvc), "m"(M64( mvc ))
     );
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
 }
-#undef array_non_zero_int
-#define array_non_zero_int array_non_zero_int_mmx
-static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
-{
-    if(i_count == 128)
-    {
-        int nonzero = 0;
-        asm(
-            "movq     (%1),    %%mm0 \n"
-            "por      8(%1),   %%mm0 \n"
-            "por      16(%1),  %%mm0 \n"
-            "por      24(%1),  %%mm0 \n"
-            "por      32(%1),  %%mm0 \n"
-            "por      40(%1),  %%mm0 \n"
-            "por      48(%1),  %%mm0 \n"
-            "por      56(%1),  %%mm0 \n"
-            "por      64(%1),  %%mm0 \n"
-            "por      72(%1),  %%mm0 \n"
-            "por      80(%1),  %%mm0 \n"
-            "por      88(%1),  %%mm0 \n"
-            "por      96(%1),  %%mm0 \n"
-            "por      104(%1), %%mm0 \n"
-            "por      112(%1), %%mm0 \n"
-            "por      120(%1), %%mm0 \n"
-            "packsswb %%mm0,   %%mm0 \n"
-            "movd     %%mm0,   %0    \n"
-            :"=r"(nonzero)
-            :"r"(v), "m"(*(struct {int16_t x[64];} *)v)
-        );
-        return !!nonzero;
-    }
-    else return array_non_zero_int_c( v, i_count );
-}
 #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
 static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
 {
@@ -131,7 +98,7 @@ static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16
         "pminsw    %5, %%mm0 \n"
         "movd   %%mm0, %0    \n"
         :"=r"(amvd)
-        :"m"(*(uint32_t*)mvdleft),"m"(*(uint32_t*)mvdtop),
+        :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
          "m"(pw_28),"m"(pw_2184),"m"(pw_2)
     );
     return amvd;
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index fced5c6..2a91084 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -6,21 +6,32 @@
 ;* Authors: Loren Merritt <lorenm at u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster at narod.ru>
 ;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
 ;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 ;*****************************************************************************
 
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible.  Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well.  Send patches or ideas
+; to x264-devel at videolan.org .
+
 %ifdef ARCH_X86_64
     %ifidn __OUTPUT_FORMAT__,win32
         %define WIN64
@@ -29,6 +40,12 @@
     %endif
 %endif
 
+%ifdef PREFIX ; PREFIX defined: mangle(x) glues a leading underscore onto x
+    %define mangle(x) _ %+ x
+%else ; PREFIX not defined: mangle(x) leaves the name unchanged
+    %define mangle(x) x
+%endif
+
 ; FIXME: All of the 64bit asm functions that take a stride as an argument
 ; via register, assume that the high dword of that register is filled with 0.
 ; This is true in practice (since we never do any 64bit arithmetic on strides,
@@ -37,14 +54,14 @@
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
     %ifidn __OUTPUT_FORMAT__,macho64
-        SECTION .text align=16
+        SECTION .text align=%1
     %elifidn __OUTPUT_FORMAT__,macho
-        SECTION .text align=16
+        SECTION .text align=%1
         fakegot:
     %else
-        SECTION .rodata align=16
+        SECTION .rodata align=%1
     %endif
 %endmacro
 
@@ -85,7 +102,7 @@
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3, dst, src, tmp
+; cglobal foo, 2,3,0, dst, src, tmp
 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
 
 ; TODO Some functions can use some args directly from the stack. If they're the
@@ -222,6 +239,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
             CAT_UNDEF arg_name %+ %%i, d
             CAT_UNDEF arg_name %+ %%i, w
             CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
             CAT_UNDEF arg_name, %%i
             %assign %%i %%i+1
         %endrep
@@ -233,6 +251,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
         %xdefine %1d r %+ %%i %+ d
         %xdefine %1w r %+ %%i %+ w
         %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
         CAT_XDEFINE arg_name, %%i, %1
         %assign %%i %%i+1
         %rotate 1
@@ -258,15 +277,11 @@ DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
     %endif
 %endmacro
 
-%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     %assign regs_used %2
     ASSERT regs_used <= 7
-    %if %0 > 2
-        %assign xmm_regs_used %3
-    %else
-        %assign xmm_regs_used 0
-    %endif
+    %assign xmm_regs_used %3
     ASSERT xmm_regs_used <= 16
     %if regs_used > 4
         push r4
@@ -387,7 +402,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
     %endif
 %endmacro
 
-%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
     ASSERT %2 >= %1
     %assign regs_used %2
     ASSERT regs_used <= 7
@@ -433,10 +448,8 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
 
 ; Symbol prefix for C linkage
 %macro cglobal 1-2+
-    %ifdef PREFIX
-        %xdefine %1.skip_prologue _%1.skip_prologue
-        %xdefine %1 _%1
-    %endif
+    %xdefine %1 mangle(%1)
+    %xdefine %1.skip_prologue %1 %+ .skip_prologue
     %ifidn __OUTPUT_FORMAT__,elf
         global %1:function hidden
     %else
@@ -452,9 +465,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
 %endmacro
 
 %macro cextern 1
-    %ifdef PREFIX
-        %xdefine %1 _%1
-    %endif
+    %xdefine %1 mangle(%1)
     extern %1
 %endmacro
 
@@ -464,9 +475,6 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
 SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %endif
 
-%assign FENC_STRIDE 16
-%assign FDEC_STRIDE 32
-
 ; merge mmx and sse*
 
 %macro CAT_XDEFINE 3
@@ -573,7 +581,10 @@ INIT_MMX
 %endrep
 %endmacro
 
-%macro SAVE_MM_PERMUTATION 1
+; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
+; function name, then any later calls to that function will automatically
+; load the permutation, so values can be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 1 ; name to save as
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE %1_m, %%i, m %+ %%i
@@ -581,7 +592,7 @@ INIT_MMX
     %endrep
 %endmacro
 
-%macro LOAD_MM_PERMUTATION 1
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, %1_m %+ %%i
@@ -597,7 +608,7 @@ INIT_MMX
     %endif
 %endmacro
 
-;Substitutions that reduce instruction size but are functionally equivalent
+; Substitutions that reduce instruction size but are functionally equivalent
 %macro add 2
     %ifnum %2
         %if %2==128
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index cfd7767..b822688 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -21,6 +21,9 @@
 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 ;*****************************************************************************
 
+%assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
+
 %macro SBUTTERFLY 4
     mova      m%4, m%2
     punpckl%1 m%2, m%3
@@ -28,6 +31,13 @@
     SWAP %3, %4
 %endmacro
 
+%macro SBUTTERFLY2 4 ; %1 = punpck size suffix, %2/%3 = input regs, %4 = scratch reg
+    mova      m%4, m%2 ; copy m%2 so it is still available for the low-half unpack
+    punpckh%1 m%2, m%3 ; m%2 = high halves of m%2 and m%3 interleaved
+    punpckl%1 m%4, m%3 ; m%4 = low halves of (original m%2) and m%3 interleaved
+    SWAP %2, %4, %3 ; rename registers; final order follows SWAP's 3-arg rule, defined outside this hunk (TODO confirm)
+%endmacro
+
 %macro TRANSPOSE4x4W 5
     SBUTTERFLY wd, %1, %2, %5
     SBUTTERFLY wd, %3, %4, %5
@@ -386,10 +396,10 @@
 %macro SUMSUBD2_AB 4
     mova    %4, %1
     mova    %3, %2
-    psraw   %2, 1
-    psraw   %1, 1
-    paddw   %2, %4
-    psubw   %1, %3
+    psraw   %2, 1  ; %2: %2>>1
+    psraw   %1, 1  ; %1: %1>>1
+    paddw   %2, %4 ; %2: %2>>1+%1
+    psubw   %1, %3 ; %1: %1>>1-%2
 %endmacro
 
 %macro DCT4_1D 5
@@ -410,14 +420,24 @@
 %macro IDCT4_1D 5-6
 %ifnum %5
     SUMSUBD2_AB m%2, m%4, m%6, m%5
+    ; %2: %2>>1-%4 %4: %2+%4>>1
     SUMSUB_BA   m%3, m%1, m%6
+    ; %3: %1+%3 %1: %1-%3
     SUMSUB_BADC m%4, m%3, m%2, m%1, m%6
+    ; %4: %1+%3 + (%2+%4>>1)
+    ; %3: %1+%3 - (%2+%4>>1)
+    ; %2: %1-%3 + (%2>>1-%4)
+    ; %1: %1-%3 - (%2>>1-%4)
 %else
     SUMSUBD2_AB m%2, m%4, [%5], [%5+16]
     SUMSUB_BA   m%3, m%1
     SUMSUB_BADC m%4, m%3, m%2, m%1
 %endif
     SWAP %1, %4, %3
+    ; %1: %1+%3 + (%2+%4>>1) row0
+    ; %2: %1-%3 + (%2>>1-%4) row1
+    ; %3: %1-%3 - (%2>>1-%4) row2
+    ; %4: %1+%3 - (%2+%4>>1) row3
 %endmacro
 
 
diff --git a/config.guess b/config.guess
index 0f0fe71..e792aac 100755
--- a/config.guess
+++ b/config.guess
@@ -1,10 +1,10 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
 #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
-#   Inc.
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+#   Free Software Foundation, Inc.
 
-timestamp='2007-03-06'
+timestamp='2009-09-18'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -27,16 +27,16 @@ timestamp='2007-03-06'
 # the same distribution terms that you use for the rest of that program.
 
 
-# Originally written by Per Bothner <per at bothner.com>.
-# Please send patches to <config-patches at gnu.org>.  Submit a context
-# diff and a properly formatted ChangeLog entry.
+# Originally written by Per Bothner.  Please send patches (context
+# diff format) to <config-patches at gnu.org> and include a ChangeLog
+# entry.
 #
 # This script attempts to guess a canonical system name similar to
 # config.sub.  If it succeeds, it prints the system name on stdout, and
 # exits with 0.  Otherwise, it exits with 1.
 #
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit build system type.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 
 me=`echo "$0" | sed -e 's,.*/,,'`
 
@@ -56,8 +56,8 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -170,7 +170,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
 		eval $set_cc_for_build
 		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep __ELF__ >/dev/null
+			| grep -q __ELF__
 		then
 		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
 		    # Return netbsd for either.  FIX?
@@ -324,14 +324,30 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	case `/usr/bin/uname -p` in
 	    sparc) echo sparc-icl-nx7; exit ;;
 	esac ;;
+    s390x:SunOS:*:*)
+	echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+	exit ;;
     sun4H:SunOS:5.*:*)
 	echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
 	exit ;;
     sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
 	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
 	exit ;;
-    i86pc:SunOS:5.*:*)
-	echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+	eval $set_cc_for_build
+	SUN_ARCH="i386"
+	# If there is a compiler, see if it is configured for 64-bit objects.
+	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+	# This test works for both compilers.
+	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		grep IS_64BIT_ARCH >/dev/null
+	    then
+		SUN_ARCH="x86_64"
+	    fi
+	fi
+	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
 	exit ;;
     sun4*:SunOS:6*:*)
 	# According to config.sub, this is the proper way to canonicalize
@@ -532,7 +548,7 @@ EOF
 		echo rs6000-ibm-aix3.2
 	fi
 	exit ;;
-    *:AIX:*:[45])
+    *:AIX:*:[456])
 	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
 	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
 		IBM_ARCH=rs6000
@@ -640,7 +656,7 @@ EOF
 	    # => hppa64-hp-hpux11.23
 
 	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep __LP64__ >/dev/null
+		grep -q __LP64__
 	    then
 		HP_ARCH="hppa2.0w"
 	    else
@@ -793,16 +809,22 @@ EOF
 	exit ;;
     *:Interix*:[3456]*)
     	case ${UNAME_MACHINE} in
-	    x86) 
+	    x86)
 		echo i586-pc-interix${UNAME_RELEASE}
 		exit ;;
-	    EM64T | authenticamd)
+	    EM64T | authenticamd | genuineintel)
 		echo x86_64-unknown-interix${UNAME_RELEASE}
 		exit ;;
+	    IA64)
+		echo ia64-unknown-interix${UNAME_RELEASE}
+		exit ;;
 	esac ;;
     [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
 	echo i${UNAME_MACHINE}-pc-mks
 	exit ;;
+    8664:Windows_NT:*)
+	echo x86_64-pc-mks
+	exit ;;
     i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
 	# How do we know it's Interix rather than the generic POSIX subsystem?
 	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
@@ -832,8 +854,29 @@ EOF
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+        esac
+	objdump --private-headers /bin/sh | grep -q ld.so.1
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
     arm*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	eval $set_cc_for_build
+	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_EABI__
+	then
+	    echo ${UNAME_MACHINE}-unknown-linux-gnu
+	else
+	    echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+	fi
 	exit ;;
     avr32*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
@@ -847,6 +890,9 @@ EOF
     frv:Linux:*:*)
     	echo frv-unknown-linux-gnu
 	exit ;;
+    i*86:Linux:*:*)
+	echo ${UNAME_MACHINE}-pc-linux-gnu
+	exit ;;
     ia64:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
@@ -856,40 +902,17 @@ EOF
     m68*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
-    mips:Linux:*:*)
+    mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
 	sed 's/^	//' << EOF >$dummy.c
 	#undef CPU
-	#undef mips
-	#undef mipsel
+	#undef ${UNAME_MACHINE}
+	#undef ${UNAME_MACHINE}el
 	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=mipsel
+	CPU=${UNAME_MACHINE}el
 	#else
 	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=mips
-	#else
-	CPU=
-	#endif
-	#endif
-EOF
-	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
-	    /^CPU/{
-		s: ::g
-		p
-	    }'`"
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
-	;;
-    mips64:Linux:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#undef CPU
-	#undef mips64
-	#undef mips64el
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=mips64el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=mips64
+	CPU=${UNAME_MACHINE}
 	#else
 	CPU=
 	#endif
@@ -905,25 +928,11 @@ EOF
     or32:Linux:*:*)
 	echo or32-unknown-linux-gnu
 	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
+    padre:Linux:*:*)
+	echo sparc-unknown-linux-gnu
 	exit ;;
-    alpha:Linux:*:*)
-	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-        esac
-	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
 	exit ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
@@ -933,8 +942,11 @@ EOF
 	  *)    echo hppa-unknown-linux-gnu ;;
 	esac
 	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
 	exit ;;
     s390:Linux:*:* | s390x:Linux:*:*)
 	echo ${UNAME_MACHINE}-ibm-linux
@@ -954,72 +966,9 @@ EOF
     x86_64:Linux:*:*)
 	echo x86_64-unknown-linux-gnu
 	exit ;;
-    xtensa:Linux:*:*)
-    	echo xtensa-unknown-linux-gnu
+    xtensa*:Linux:*:*)
+    	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
-    i*86:Linux:*:*)
-	# The BFD linker knows what the default object file format is, so
-	# first see if it will tell us. cd to the root directory to prevent
-	# problems with other programs or directories called `ld' in the path.
-	# Set LC_ALL=C to ensure ld outputs messages in English.
-	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
-			 | sed -ne '/supported targets:/!d
-				    s/[ 	][ 	]*/ /g
-				    s/.*supported targets: *//
-				    s/ .*//
-				    p'`
-        case "$ld_supported_targets" in
-	  elf32-i386)
-		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
-		;;
-	  a.out-i386-linux)
-		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
-		exit ;;
-	  coff-i386)
-		echo "${UNAME_MACHINE}-pc-linux-gnucoff"
-		exit ;;
-	  "")
-		# Either a pre-BFD a.out linker (linux-gnuoldld) or
-		# one that does not give us useful --help.
-		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
-		exit ;;
-	esac
-	# Determine whether the default compiler is a.out or elf
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#include <features.h>
-	#ifdef __ELF__
-	# ifdef __GLIBC__
-	#  if __GLIBC__ >= 2
-	LIBC=gnu
-	#  else
-	LIBC=gnulibc1
-	#  endif
-	# else
-	LIBC=gnulibc1
-	# endif
-	#else
-	#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-	LIBC=gnu
-	#else
-	LIBC=gnuaout
-	#endif
-	#endif
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
-	    /^LIBC/{
-		s: ::g
-		p
-	    }'`"
-	test x"${LIBC}" != x && {
-		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
-		exit
-	}
-	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
-	;;
     i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
 	# earlier versions are messed up and put the nodename in both
@@ -1048,7 +997,7 @@ EOF
     i*86:syllable:*:*)
 	echo ${UNAME_MACHINE}-pc-syllable
 	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
 	echo i386-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
     i*86:*DOS:*:*)
@@ -1092,8 +1041,11 @@ EOF
     pc:*:*:*)
 	# Left here for compatibility:
         # uname -m prints for DJGPP always 'pc', but it prints nothing about
-        # the processor, so we play safe by assuming i386.
-	echo i386-pc-msdosdjgpp
+        # the processor, so we play safe by assuming i586.
+	# Note: whatever this is, it MUST be the same as what config.sub
+	# prints for the "djgpp" host, or else GDB configury will decide that
+	# this is a cross-build.
+	echo i586-pc-msdosdjgpp
         exit ;;
     Intel:Mach:3*:*)
 	echo i386-pc-mach3
@@ -1131,6 +1083,16 @@ EOF
     3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
         /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
           && { echo i486-ncr-sysv4; exit; } ;;
+    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+	OS_REL='.3'
+	test -r /etc/.relid \
+	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	    && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+	    && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
     m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
 	echo m68k-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
@@ -1143,7 +1105,7 @@ EOF
     rs6000:LynxOS:2.*:*)
 	echo rs6000-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
 	echo powerpc-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
     SM[BE]S:UNIX_SV:*:*)
@@ -1206,6 +1168,9 @@ EOF
     BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
 	echo i586-pc-beos
 	exit ;;
+    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
+	echo i586-pc-haiku
+	exit ;;
     SX-4:SUPER-UX:*:*)
 	echo sx4-nec-superux${UNAME_RELEASE}
 	exit ;;
@@ -1233,6 +1198,16 @@ EOF
     *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
 	case $UNAME_PROCESSOR in
+	    i386)
+		eval $set_cc_for_build
+		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		      grep IS_64BIT_ARCH >/dev/null
+		  then
+		      UNAME_PROCESSOR="x86_64"
+		  fi
+		fi ;;
 	    unknown) UNAME_PROCESSOR=powerpc ;;
 	esac
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
@@ -1314,6 +1289,9 @@ EOF
     i*86:rdos:*:*)
 	echo ${UNAME_MACHINE}-pc-rdos
 	exit ;;
+    i*86:AROS:*:*)
+	echo ${UNAME_MACHINE}-pc-aros
+	exit ;;
 esac
 
 #echo '(No uname command or uname output not recognized.)' 1>&2
@@ -1474,9 +1452,9 @@ This script, last modified $timestamp, has failed to recognize
 the operating system you are using. It is advised that you
 download the most up to date version of the config scripts from
 
-  http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.guess
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 and
-  http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.sub
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
 
 If the version you run ($0) is already up to date, please
 send the following data and any information you think might be
diff --git a/config.sub b/config.sub
index 5defff6..8ca084b 100755
--- a/config.sub
+++ b/config.sub
@@ -1,10 +1,10 @@
 #! /bin/sh
 # Configuration validation subroutine script.
 #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation,
-#   Inc.
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+#   Free Software Foundation, Inc.
 
-timestamp='2007-01-18'
+timestamp='2009-08-19'
 
 # This file is (in principle) common to ALL GNU software.
 # The presence of a machine in this file suggests that SOME GNU software
@@ -32,13 +32,16 @@ timestamp='2007-01-18'
 
 
 # Please send patches to <config-patches at gnu.org>.  Submit a context
-# diff and a properly formatted ChangeLog entry.
+# diff and a properly formatted GNU ChangeLog entry.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
 # If it is invalid, we print an error message on stderr and exit with code 1.
 # Otherwise, we print the canonical config type on stdout and succeed.
 
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
 # This file is supposed to be the same for all GNU packages
 # and recognize all the CPU types, system types and aliases
 # that are meaningful with *any* GNU software.
@@ -72,8 +75,8 @@ Report bugs and patches to <config-patches at gnu.org>."
 version="\
 GNU config.sub ($timestamp)
 
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
-Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -122,6 +125,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
   nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \
   uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \
+  kopensolaris*-gnu* | \
   storm-chaos* | os2-emx* | rtmk-nova*)
     os=-$maybe_os
     basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
@@ -148,10 +152,13 @@ case $os in
 	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
 	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
 	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray)
+	-apple | -axis | -knuth | -cray | -microblaze)
 		os=
 		basic_machine=$1
 		;;
+        -bluegene*)
+	        os=-cnk
+		;;
 	-sim | -cisco | -oki | -wec | -winbond)
 		os=
 		basic_machine=$1
@@ -249,13 +256,16 @@ case $basic_machine in
 	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
 	| i370 | i860 | i960 | ia64 \
 	| ip2k | iq2000 \
+	| lm32 \
 	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | mcore | mep \
+	| maxq | mb | microblaze | mcore | mep | metag \
 	| mips | mipsbe | mipseb | mipsel | mipsle \
 	| mips16 \
 	| mips64 | mips64el \
-	| mips64vr | mips64vrel \
+	| mips64octeon | mips64octeonel \
 	| mips64orion | mips64orionel \
+	| mips64r5900 | mips64r5900el \
+	| mips64vr | mips64vrel \
 	| mips64vr4100 | mips64vr4100el \
 	| mips64vr4300 | mips64vr4300el \
 	| mips64vr5000 | mips64vr5000el \
@@ -268,6 +278,7 @@ case $basic_machine in
 	| mipsisa64sr71k | mipsisa64sr71kel \
 	| mipstx39 | mipstx39el \
 	| mn10200 | mn10300 \
+	| moxie \
 	| mt \
 	| msp430 \
 	| nios | nios2 \
@@ -277,7 +288,7 @@ case $basic_machine in
 	| powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \
 	| pyramid \
 	| score \
-	| sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
 	| sh64 | sh64le \
 	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
 	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@@ -286,7 +297,7 @@ case $basic_machine in
 	| v850 | v850e \
 	| we32k \
 	| x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \
-	| z8k)
+	| z8k | z80)
 		basic_machine=$basic_machine-unknown
 		;;
 	m6811 | m68hc11 | m6812 | m68hc12)
@@ -329,14 +340,17 @@ case $basic_machine in
 	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
 	| i*86-* | i860-* | i960-* | ia64-* \
 	| ip2k-* | iq2000-* \
+	| lm32-* \
 	| m32c-* | m32r-* | m32rle-* \
 	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
 	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
 	| mips16-* \
 	| mips64-* | mips64el-* \
-	| mips64vr-* | mips64vrel-* \
+	| mips64octeon-* | mips64octeonel-* \
 	| mips64orion-* | mips64orionel-* \
+	| mips64r5900-* | mips64r5900el-* \
+	| mips64vr-* | mips64vrel-* \
 	| mips64vr4100-* | mips64vr4100el-* \
 	| mips64vr4300-* | mips64vr4300el-* \
 	| mips64vr5000-* | mips64vr5000el-* \
@@ -358,20 +372,24 @@ case $basic_machine in
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \
 	| pyramid-* \
 	| romp-* | rs6000-* \
-	| sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
+	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
 	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
 	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
 	| sparclite-* \
 	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \
 	| tahoe-* | thumb-* \
-	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* | tile-* \
 	| tron-* \
 	| v850-* | v850e-* | vax-* \
 	| we32k-* \
 	| x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \
-	| xstormy16-* | xtensa-* \
+	| xstormy16-* | xtensa*-* \
 	| ymp-* \
-	| z8k-*)
+	| z8k-* | z80-*)
+		;;
+	# Recognize the basic CPU types without company name, with glob match.
+	xtensa*)
+		basic_machine=$basic_machine-unknown
 		;;
 	# Recognize the various machine names and aliases which stand
 	# for a CPU type and a company and sometimes even an OS.
@@ -435,6 +453,10 @@ case $basic_machine in
 		basic_machine=m68k-apollo
 		os=-bsd
 		;;
+	aros)
+		basic_machine=i386-pc
+		os=-aros
+		;;
 	aux)
 		basic_machine=m68k-apple
 		os=-aux
@@ -443,10 +465,26 @@ case $basic_machine in
 		basic_machine=ns32k-sequent
 		os=-dynix
 		;;
+	blackfin)
+		basic_machine=bfin-unknown
+		os=-linux
+		;;
+	blackfin-*)
+		basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
+	bluegene*)
+		basic_machine=powerpc-ibm
+		os=-cnk
+		;;
 	c90)
 		basic_machine=c90-cray
 		os=-unicos
 		;;
+        cegcc)
+		basic_machine=arm-unknown
+		os=-cegcc
+		;;
 	convex-c1)
 		basic_machine=c1-convex
 		os=-bsd
@@ -475,8 +513,8 @@ case $basic_machine in
 		basic_machine=craynv-cray
 		os=-unicosmp
 		;;
-	cr16c)
-		basic_machine=cr16c-unknown
+	cr16)
+		basic_machine=cr16-unknown
 		os=-elf
 		;;
 	crds | unos)
@@ -514,6 +552,10 @@ case $basic_machine in
 		basic_machine=m88k-motorola
 		os=-sysv3
 		;;
+	dicos)
+		basic_machine=i686-pc
+		os=-dicos
+		;;
 	djgpp)
 		basic_machine=i586-pc
 		os=-msdosdjgpp
@@ -668,6 +710,14 @@ case $basic_machine in
 		basic_machine=m68k-isi
 		os=-sysv
 		;;
+	m68knommu)
+		basic_machine=m68k-unknown
+		os=-linux
+		;;
+	m68knommu-*)
+		basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
 	m88k-omron*)
 		basic_machine=m88k-omron
 		;;
@@ -679,10 +729,17 @@ case $basic_machine in
 		basic_machine=ns32k-utek
 		os=-sysv
 		;;
+        microblaze)
+		basic_machine=microblaze-xilinx
+		;;
 	mingw32)
 		basic_machine=i386-pc
 		os=-mingw32
 		;;
+	mingw32ce)
+		basic_machine=arm-unknown
+		os=-mingw32ce
+		;;
 	miniframe)
 		basic_machine=m68000-convergent
 		;;
@@ -809,6 +866,14 @@ case $basic_machine in
 		basic_machine=i860-intel
 		os=-osf
 		;;
+	parisc)
+		basic_machine=hppa-unknown
+		os=-linux
+		;;
+	parisc-*)
+		basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
+		os=-linux
+		;;
 	pbd)
 		basic_machine=sparc-tti
 		;;
@@ -1017,6 +1082,10 @@ case $basic_machine in
 		basic_machine=tic6x-unknown
 		os=-coff
 		;;
+	tile*)
+		basic_machine=tile-unknown
+		os=-linux-gnu
+		;;
 	tx39)
 		basic_machine=mipstx39-unknown
 		;;
@@ -1092,6 +1161,10 @@ case $basic_machine in
 		basic_machine=z8k-unknown
 		os=-sim
 		;;
+	z80-*-coff)
+		basic_machine=z80-unknown
+		os=-sim
+		;;
 	none)
 		basic_machine=none-none
 		os=-none
@@ -1130,7 +1203,7 @@ case $basic_machine in
 	we32k)
 		basic_machine=we32k-att
 		;;
-	sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele)
+	sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
 		basic_machine=sh-unknown
 		;;
 	sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
@@ -1200,10 +1273,11 @@ case $os in
 	# Each alternative MUST END IN A *, to match a version number.
 	# -sysv* is not here because it comes later, after sysvr4.
 	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
-	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
 	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+	      | -kopensolaris* \
 	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* \
+	      | -aos* | -aros* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
 	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
 	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
@@ -1212,7 +1286,7 @@ case $os in
 	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
 	      | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
-	      | -chorusos* | -chorusrdb* \
+	      | -chorusos* | -chorusrdb* | -cegcc* \
 	      | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
 	      | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \
 	      | -uxpv* | -beos* | -mpeix* | -udk* \
@@ -1352,6 +1426,9 @@ case $os in
 	-zvmoe)
 		os=-zvmoe
 		;;
+	-dicos*)
+		os=-dicos
+		;;
 	-none)
 		;;
 	*)
@@ -1549,7 +1626,7 @@ case $basic_machine in
 			-sunos*)
 				vendor=sun
 				;;
-			-aix*)
+			-cnk*|-aix*)
 				vendor=ibm
 				;;
 			-beos*)
diff --git a/configure b/configure
index 53198e8..b254383 100755
--- a/configure
+++ b/configure
@@ -7,10 +7,12 @@ echo ""
 echo "available options:"
 echo ""
 echo "  --help                   print this message"
-echo "  --disable-avis-input     disables avisynth input (win32 only)"
+echo "  --disable-avs-input      disables avisynth input (win32 only)"
+echo "  --disable-lavf-input     disables libavformat input"
+echo "  --disable-ffms-input     disables ffmpegsource input"
 echo "  --disable-mp4-output     disables mp4 output (using gpac)"
 echo "  --disable-pthread        disables multithreaded encoding"
-echo "  --disable-asm            disables assembly optimizations on x86"
+echo "  --disable-asm            disables assembly optimizations on x86 and arm"
 echo "  --enable-debug           adds -g, doesn't strip"
 echo "  --enable-gprof           adds -pg, doesn't strip"
 echo "  --enable-visualize       enables visualization (X11 only)"
@@ -25,24 +27,83 @@ echo ""
 exit 1
 fi
 
+log_check() {  # start a "checking <$1>... " entry in config.log (no trailing newline)
+    echo -n "checking $1... " >> config.log
+}
+
+log_ok() {  # finish the current config.log entry with "yes"
+    echo "yes" >> config.log
+}
+
+log_fail() {  # finish the current config.log entry with "no"
+    echo "no" >> config.log
+}
+
+log_msg() {  # append one free-form line ($1) to config.log
+    echo "$1" >> config.log
+}
+
 cc_check() {
+    if [ -z "$3" ]; then  # $1 = header, $2 = extra compiler/linker flags, $3 = statement placed in main()
+        if [ -z "$1$2" ]; then
+            log_check "whether $CC works"
+        elif [ -z "$1" ]; then
+            log_check "for $2"
+        else
+            log_check "for $1"
+        fi
+    elif [ -z "$1" ]; then
+        log_check "whether $CC supports $3"
+    else
+        log_check "for $3 in $1";
+    fi
     rm -f conftest.c
     [ -n "$1" ] && echo "#include <$1>" > conftest.c
     echo "int main () { $3 return 0; }" >> conftest.c
-    $CC conftest.c $CFLAGS $LDFLAGS $2 -o conftest 2>$DEVNULL
+    if $CC conftest.c $CFLAGS $LDFLAGS $LDFLAGSCLI $2 -o conftest >conftest.log 2>&1; then
+        res=$?
+        log_ok
+    else
+        res=$?  # exit status of the failed compile; returned to the caller below
+        log_fail
+        log_msg "Failed commandline was:"
+        log_msg "--------------------------------------------------"
+        log_msg "$CC conftest.c $CFLAGS $LDFLAGS $LDFLAGSCLI $2 -o conftest"  # log the exact command that was run
+        cat conftest.log >> config.log  # keep the compiler diagnostics in the log
+        log_msg "--------------------------------------------------"
+    fi
+    return $res
 }
 
 as_check() {
+    log_check "whether $AS supports $1"  # $1 = assembly snippet to try, $2 = optional extra assembler flags
     echo "$1" > conftest.asm
-    $AS conftest.asm $ASFLAGS $2 -o conftest.o 2>$DEVNULL
+    if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
+        res=$?
+        log_ok
+    else
+        res=$?  # exit status of the failed assembler run; returned to the caller below
+        log_fail
+        log_msg "Failed commandline was:"
+        log_msg "--------------------------------------------------"
+        log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
+        cat conftest.log >> config.log  # keep the assembler diagnostics in the log
+        log_msg "--------------------------------------------------"
+    fi
+    return $res
+}
+
+define() {  # append "#define $1" (followed by " $2" when a value is given) to config.h
+    echo "#define $1$([ -n "$2" ] && echo " $2")" >> config.h
 }
 
 die() {
+    log_msg "DIED: $@"
     echo "$@"
     exit 1
 }
 
-rm -f config.h config.mak x264.pc conftest*
+rm -f config.h config.mak config.log x264.pc conftest*
 
 prefix='/usr/local'
 exec_prefix='${prefix}'
@@ -51,7 +112,9 @@ libdir='${exec_prefix}/lib'
 includedir='${prefix}/include'
 DEVNULL='/dev/null'
 
-avis_input="auto"
+avs_input="auto"
+lavf_input="auto"
+ffms_input="auto"
 mp4_output="auto"
 pthread="auto"
 asm="yes"
@@ -63,6 +126,7 @@ shared="no"
 
 CFLAGS="$CFLAGS -Wall -I."
 LDFLAGS="$LDFLAGS"
+LDFLAGSCLI="$LDFLAGSCLI"
 ASFLAGS="$ASFLAGS"
 HAVE_GETOPT_LONG=1
 cross_prefix=""
@@ -95,11 +159,23 @@ for opt do
         --disable-asm)
             asm="no"
             ;;
-        --enable-avis-input)
-            avis_input="yes"
+        --enable-avs-input)
+            avs_input="auto"
+            ;;
+        --disable-avs-input)
+            avs_input="no"
+            ;;
+        --enable-lavf-input)
+            lavf_input="auto"
+            ;;
+        --disable-lavf-input)
+            lavf_input="no"
             ;;
-        --disable-avis-input)
-            avis_input="no"
+        --enable-ffms-input)
+            ffms_input="auto"
+            ;;
+        --disable-ffms-input)
+            ffms_input="no"
             ;;
         --enable-mp4-output)
             mp4_output="yes"
@@ -138,7 +214,7 @@ for opt do
             ;;
         --enable-visualize)
             LDFLAGS="$LDFLAGS -L/usr/X11R6/lib -lX11"
-            CFLAGS="$CFLAGS -DVISUALIZE=1"
+            define HAVE_VISUALIZE
             vis="yes"
             ;;
         --host=*)
@@ -157,7 +233,6 @@ CC="${CC-${cross_prefix}gcc}"
 AR="${AR-${cross_prefix}ar}"
 RANLIB="${RANLIB-${cross_prefix}ranlib}"
 STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
 
 if [ "x$host" = x ]; then
     host=`./config.guess`
@@ -174,14 +249,14 @@ host_os="${host#*-}"
 case $host_os in
   beos*)
     SYS="BEOS"
-    CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+    define HAVE_MALLOC_H
     ;;
   darwin*)
     SYS="MACOSX"
     CFLAGS="$CFLAGS -falign-loops=16"
-    LDFLAGS="$LDFLAGS -lm -lmx"
+    LDFLAGS="$LDFLAGS -lm"
     if [ "$pic" = "no" ]; then
-        CFLAGS="$CFLAGS -mdynamic-no-pic"
+        cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
     fi
     ;;
   freebsd*)
@@ -190,7 +265,7 @@ case $host_os in
     ;;
   kfreebsd*-gnu)
     SYS="FREEBSD"
-    CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+    define HAVE_MALLOC_H
     LDFLAGS="$LDFLAGS -lm"
     ;;
   netbsd*)
@@ -204,7 +279,7 @@ case $host_os in
     ;;
   *linux*)
     SYS="LINUX"
-    CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+    define HAVE_MALLOC_H
     LDFLAGS="$LDFLAGS -lm"
     ;;
   cygwin*)
@@ -223,7 +298,7 @@ case $host_os in
     ;;
   sunos*|solaris*)
     SYS="SunOS"
-    CFLAGS="$CFLAGS -DHAVE_MALLOC_H"
+    define HAVE_MALLOC_H
     LDFLAGS="$LDFLAGS -lm"
     HAVE_GETOPT_LONG=0
     ;;
@@ -240,6 +315,9 @@ case $host_cpu in
     if [[ "$asm" == yes && "$CFLAGS" != *-march* ]]; then
       CFLAGS="$CFLAGS -march=i686"
     fi
+    if [[ "$asm" == yes && "$CFLAGS" != *-mfpmath* ]]; then
+      CFLAGS="$CFLAGS -mfpmath=sse -msse"
+    fi
     if [ "$SYS" = MACOSX ]; then
       ASFLAGS="$ASFLAGS -f macho -DPREFIX"
     elif [ "$SYS" = MINGW ]; then
@@ -253,8 +331,10 @@ case $host_cpu in
     AS="yasm"
     if [ "$SYS" = MACOSX ];then
       ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX"
-      CFLAGS="$CFLAGS -arch x86_64"
-      LDFLAGS="$LDFLAGS -arch x86_64"
+      if cc_check '' "-arch x86_64"; then
+        CFLAGS="$CFLAGS -arch x86_64"
+        LDFLAGS="$LDFLAGS -arch x86_64"
+      fi
     elif [ "$SYS" = MINGW ]; then
       ASFLAGS="$ASFLAGS -f win32 -m amd64 -DPREFIX"
     else
@@ -265,9 +345,10 @@ case $host_cpu in
     ARCH="PPC"
     if [ $SYS = MACOSX ]
     then
-      ALTIVECFLAGS="$ALTIVECFLAGS -faltivec -fastf -mcpu=G4"
+      CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4"
     else
-      ALTIVECFLAGS="$ALTIVECFLAGS -maltivec -mabi=altivec -DHAVE_ALTIVEC_H"
+      CFLAGS="$CFLAGS -maltivec -mabi=altivec"
+      define HAVE_ALTIVEC_H
     fi
     ;;
   sparc)
@@ -286,6 +367,7 @@ case $host_cpu in
     ;;
   arm*)
     ARCH="ARM"
+    AS="${AS-${cross_prefix}gcc}"
     ;;
   s390|s390x)
     ARCH="S390"
@@ -298,23 +380,39 @@ case $host_cpu in
     ;;
 esac
 
+log_msg "x264 configure script"
+if [ -n "$*" ]; then  # only record a command line when options were actually given
+    msg="Command line options:"
+    for i in "$@"; do  # quoted, so an option containing whitespace is logged as a single word
+        msg="$msg \"$i\""
+    done
+    log_msg "$msg"
+fi
+log_msg ""
+
 # check requirements
 
 cc_check || die "No working C compiler found."
 
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \) ] ; then
+if cc_check '' -std=gnu99 ; then
+    CFLAGS="$CFLAGS -std=gnu99"
+elif cc_check '' -std=c99 ; then
+    CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE"
+fi
+
+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" \) ] ; then
     pic="yes"
 fi
 
 if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
-    if ! as_check "pinsrd xmm0, [esp], 0" ; then
+    if ! as_check "lzcnt eax, eax" ; then
         VER=`($AS --version || echo no assembler) 2>$DEVNULL | head -n 1`
         echo "Found $VER"
-        echo "Minimum version is yasm-0.6.1"
+        echo "Minimum version is yasm-0.6.2"
         echo "If you really want to compile without asm, configure with --disable-asm."
         exit 1
     fi
-    if ! cc_check '' '' 'asm("pabsw %xmm0, %xmm0");' ; then
+    if ! cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' ; then
         VER=`(as --version || echo no gnu as) 2>$DEVNULL | head -n 1`
         echo "Found $VER"
         echo "Minimum version is binutils-2.17"
@@ -322,16 +420,37 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
         echo "If you really want to compile without asm, configure with --disable-asm."
         exit 1
     fi
-    CFLAGS="$CFLAGS -DHAVE_MMX"
+    define HAVE_MMX
 fi
+
+if [ $asm = yes -a $ARCH = ARM ] ; then
+    # set flags so neon is built by default
+    echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-mfloat-abi)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp"
+
+    if  cc_check '' '' '__asm__("rev ip, ip");' ; then      define HAVE_ARMV6   && ASFLAGS="$ASFLAGS -DHAVE_ARMV6"
+        cc_check '' '' '__asm__("movt r0, #0");'         && define HAVE_ARMV6T2 && ASFLAGS="$ASFLAGS -DHAVE_ARMV6T2"
+        cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON    && ASFLAGS="$ASFLAGS -DHAVE_NEON"
+        ASFLAGS="$ASFLAGS -c"
+    else
+        echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS."
+        echo "If you really want to run on such a CPU, configure with --disable-asm."
+        exit 1
+    fi
+fi
+
 [ $asm = no ] && AS=""
 [ "x$AS" = x ] && asm="no"
 
-CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
+define ARCH_$ARCH
+define SYS_$SYS
 
-echo "unsigned int endian = 'B' << 24 | 'I' << 16 | 'G' << 8 | 'E';" > conftest.c
+echo "int i = 0x42494745; double f = 0x1.0656e6469616ep+102;" > conftest.c  # int bytes spell "BIGE"; the double's bit pattern 0x4650656e6469616e spells "FPendian"
 $CC $CFLAGS conftest.c -c -o conftest.o 2>$DEVNULL || die "endian test failed"
-grep -q BIGE conftest.o && CFLAGS="$CFLAGS -DWORDS_BIGENDIAN"
+if grep -q BIGE conftest.o && grep -q FPendian conftest.o ; then  # int and double are both stored most-significant byte first
+    define WORDS_BIGENDIAN
+elif !(grep -q EGIB conftest.o && grep -q naidnePF conftest.o) ; then  # not consistently little-endian either (int and double disagree, or neither string found)
+    die "endian test failed"
+fi
 
 # autodetect options that weren't forced nor disabled
 
@@ -352,11 +471,11 @@ if test "$pthread" = "auto" ; then
             elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
                 pthread="yes"
                 libpthread="-lpthreadGC2 -lwsock32"
-                CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
+                define PTW32_STATIC_LIB
             elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
                 pthread="yes"
                 libpthread="-lpthreadGC2 -lws2_32"
-                CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
+                define PTW32_STATIC_LIB
             fi
             ;;
         OPENBSD)
@@ -368,10 +487,82 @@ if test "$pthread" = "auto" ; then
     esac
 fi
 if test "$pthread" = "yes" ; then
-    CFLAGS="$CFLAGS -DHAVE_PTHREAD"
+    define HAVE_PTHREAD
     LDFLAGS="$LDFLAGS $libpthread"
 fi
 
+if cc_check "math.h" "-Werror" "return log2f(2);" ; then
+    define HAVE_LOG2F
+fi
+
+if [ "$lavf_input" = "auto" ] ; then
+    lavf_input="no"
+    if ${cross_prefix}pkg-config --exists libavformat libavcodec libswscale 2>$DEVNULL; then
+        LAVF_LIBS="$LAVF_LIBS $(${cross_prefix}pkg-config --libs libavformat libavcodec libswscale)"
+        LAVF_CFLAGS="$LAVF_CFLAGS $(${cross_prefix}pkg-config --cflags libavformat libavcodec libswscale)"
+    fi
+    if [ -z "$LAVF_LIBS" -a -z "$LAVF_CFLAGS" ]; then
+        LAVF_LIBS="-lavformat -lswscale"
+        for lib in -lpostproc -lavcodec -lavutil -lm -lz -lbz2 $libpthread -lavifil32; do
+            cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib"
+        done
+    fi
+    LAVF_LIBS="-L. $LAVF_LIBS"
+    if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" && \
+       cc_check libswscale/swscale.h "$LAVF_CFLAGS $LAVF_LIBS" ; then
+        # avcodec_decode_video2 is currently the most recently added function that we use; it was added in r18351
+        if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "avcodec_decode_video2( NULL, NULL, NULL, NULL );" ; then
+            lavf_input="yes"
+            define LAVF_INPUT
+        else
+            echo "Warning: libavformat is too old, update to ffmpeg r18351+"
+        fi
+    fi
+fi
+
+if [ "$ffms_input" = "auto" ] ; then
+    ffms_major="2"; ffms_minor="13"; ffms_micro="1"; ffms_bump="0"  # minimum ffms2 version required
+
+    ffms_input="no"
+    [ $ffms_micro -gt 0 -o $ffms_bump -gt 0 ] && vmicro=".$ffms_micro"
+    [ $ffms_bump -gt 0 ] && vbump=".$ffms_bump"
+    if ${cross_prefix}pkg-config --atleast-version="$ffms_major.$ffms_minor$vmicro$vbump" ffms2 2>$DEVNULL; then
+        FFMS2_LIBS="$FFMS2_LIBS $(${cross_prefix}pkg-config --libs ffms2)"
+        FFMS2_CFLAGS="$FFMS2_CFLAGS $(${cross_prefix}pkg-config --cflags ffms2)"  # extend FFMS2_CFLAGS, not FFMS2_LIBS, so linker flags stay out of CFLAGS
+        api_check="no"  # pkg-config already enforced the minimum version
+    else
+        api_check="yes"  # version unknown: verify FFMS_VERSION from ffms.h below
+    fi
+    [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2"  # fall back to the bare library name
+
+    if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" "FFMS_DestroyVideoSource(0);" ; then
+        ffms_input="yes"
+    elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then  # retry with the C++ runtime and the lavf libraries on the link line
+        ffms_input="yes"
+        FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS"
+    fi
+
+    if [ $api_check = "yes" -a $ffms_input = "yes" ]; then
+        log_check "whether ffms2 version is at least $ffms_major.$ffms_minor$vmicro$vbump"
+        $CC $CFLAGS $FFMS2_CFLAGS -c -o conftest -x c - >$DEVNULL 2>&1 <<EOF
+#include <ffms.h>
+#if FFMS_VERSION < (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)
+#error Requires ffms2 version 2.13.1
+#endif
+EOF
+        [ $? = 0 ] && log_ok || { ffms_input="no"; log_fail; }  # compile failure: header too old or unusable
+    fi
+fi
+
+if [ "$ffms_input" = "yes" ]; then
+    LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI"
+    [ -n "$FFMS2_CFLAGS" ] && CFLAGS="$CFLAGS $FFMS2_CFLAGS"
+    define FFMS_INPUT
+elif [ "$lavf_input" = "yes" ]; then
+    LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI"
+    [ -n "$LAVF_CFLAGS" ] && CFLAGS="$CFLAGS $LAVF_CFLAGS"
+fi
+
 MP4_LDFLAGS="-lgpac_static"
 if [ $SYS = MINGW ]; then
     MP4_LDFLAGS="$MP4_LDFLAGS -lwinmm"
@@ -381,26 +572,19 @@ if [ "$mp4_output" = "auto" ] ; then
     cc_check gpac/isomedia.h "$MP4_LDFLAGS" && mp4_output="yes"
 fi
 if [ "$mp4_output" = "yes" ] ; then
-    echo "#define MP4_OUTPUT" >> config.h
-    LDFLAGS="$LDFLAGS $MP4_LDFLAGS"
+    define MP4_OUTPUT
+    LDFLAGSCLI="$LDFLAGSCLI $MP4_LDFLAGS"
 fi
 
-if [ "$avis_input" = "auto" ] ; then
-    if [ $SYS = MINGW ]; then
-        avis_input="yes"
-    else
-        avis_input="no";
-    fi
-fi
-if [ "$avis_input" = "yes" ] ; then
-    if cc_check "stdlib.h" -lvfw32 ; then
-        echo "#define AVIS_INPUT" >> config.h
-        LDFLAGS="$LDFLAGS -lvfw32"
-    elif cc_check "stdlib.h" -lavifil32 ; then
-        echo "#define AVIS_INPUT" >> config.h
-        LDFLAGS="$LDFLAGS -lavifil32"
-    else
-        avis_input="no";
+if [ "$avs_input" = "auto" ] ; then
+    avs_input=no
+    if [ $SYS = MINGW ] && cc_check avisynth_c.h ; then
+        avs_input="yes"
+        define AVS_INPUT
+        define HAVE_AVISYNTH_C_H
+    elif [ $SYS = MINGW ] && cc_check extras/avisynth_c.h ; then
+        avs_input="yes"
+        define AVS_INPUT
     fi
 fi
 
@@ -418,16 +602,20 @@ fi
 
 if [ "$debug" = "yes" ]; then
     CFLAGS="-O1 -g $CFLAGS"
+elif [ $ARCH = ARM ]; then
+    # arm-gcc-4.2 produces incorrect output with -ffast-math
+    # and it doesn't save any speed anyway on 4.4, so disable it
+    CFLAGS="-O3 -fno-fast-math $CFLAGS"
 else
-    CFLAGS="-O4 -ffast-math $CFLAGS"
+    CFLAGS="-O3 -ffast-math $CFLAGS"
 fi
 
 if cc_check "stdio.h" "" "fseeko(stdin,0,0);" ; then
-    echo "#define fseek fseeko" >> config.h
-    echo "#define ftell ftello" >> config.h
+    define fseek fseeko
+    define ftell ftello
 elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then
-    echo "#define fseek fseeko64" >> config.h
-    echo "#define ftell ftello64" >> config.h
+    define fseek fseeko64
+    define ftell ftello64
 fi
 
 rm -f conftest*
@@ -444,8 +632,8 @@ ARCH=$ARCH
 SYS=$SYS
 CC=$CC
 CFLAGS=$CFLAGS
-ALTIVECFLAGS=$ALTIVECFLAGS
 LDFLAGS=$LDFLAGS
+LDFLAGSCLI=$LDFLAGSCLI
 AR=$AR
 RANLIB=$RANLIB
 STRIP=$STRIP
@@ -496,18 +684,27 @@ Libs: $pclibs
 Cflags: -I$includedir
 EOF
 
+cat > conftest.log <<EOF
+Platform:   $ARCH
+System:     $SYS
+asm:        $asm
+avs input:  $avs_input
+lavf input: $lavf_input
+ffms input: $ffms_input
+mp4 output: $mp4_output
+pthread:    $pthread
+debug:      $debug
+gprof:      $gprof
+PIC:        $pic
+shared:     $shared
+visualize:  $vis
+EOF
+
+echo >> config.log
+cat conftest.log >> config.log
+cat conftest.log
+rm conftest.log
 
-echo "Platform:   $ARCH"
-echo "System:     $SYS"
-echo "asm:        $asm"
-echo "avis input: $avis_input"
-echo "mp4 output: $mp4_output"
-echo "pthread:    $pthread"
-echo "debug:      $debug"
-echo "gprof:      $gprof"
-echo "PIC:        $pic"
-echo "shared:     $shared"
-echo "visualize:  $vis"
 echo
 echo "You can run 'make' or 'make fprofiled' now."
 
diff --git a/doc/standards.txt b/doc/standards.txt
index 4ebb165..db9a691 100644
--- a/doc/standards.txt
+++ b/doc/standards.txt
@@ -1,5 +1,5 @@
-x264 is written in C. The particular variant of C is: intersection of gcc-2.95 and msvc. This means C89 + a few C99 features.
-The extra utilities (mostly checkasm) are written in C99, with no attempt at compatibility with old compilers.
+x264 is written in C. The particular variant of C is: intersection of C99 and gcc>=3.4.
+checkasm is written in GNU C (the dialect accepted by gcc), with no attempt at compatibility with anything else.
 
 We make the following additional assumptions which are true of real systems but not guaranteed by C99:
 * Two's complement.
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 7ec43c8..666596b 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -24,10 +24,7 @@
 
 #define _ISOC99_SOURCE
 #include <math.h>
-#include <limits.h>
-#ifndef _MSC_VER
 #include <unistd.h>
-#endif
 
 #include "common/common.h"
 #include "common/cpu.h"
@@ -47,7 +44,7 @@ typedef struct
     /* 8x8 */
     int       i_cost8x8;
     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
-    DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
+    ALIGNED_4( int16_t mvc[32][5][2] );
     x264_me_t me8x8[4];
 
     /* Sub 4x4 */
@@ -78,15 +75,15 @@ typedef struct
     int i_lambda;
     int i_lambda2;
     int i_qp;
-    int16_t *p_cost_mv;
-    uint16_t *p_cost_ref0;
-    uint16_t *p_cost_ref1;
+    uint16_t *p_cost_mv;
+    uint16_t *p_cost_ref[2];
     int i_mbrd;
 
 
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
+    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
     int b_try_pskip;
 
     /* Luma part */
@@ -106,7 +103,7 @@ typedef struct
 
     /* Chroma part */
     int i_satd_i8x8chroma;
-    int i_satd_i8x8chroma_dir[4];
+    int i_satd_i8x8chroma_dir[7];
     int i_predict8x8chroma;
 
     /* II: Inter part P/B frame */
@@ -135,7 +132,7 @@ typedef struct
 } x264_mb_analysis_t;
 
 /* lambda = pow(2,qp/6-2) */
-const int x264_lambda_tab[52] = {
+const uint8_t x264_lambda_tab[52] = {
    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
    1, 1, 1, 1,              /*  8-11 */
    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
@@ -156,82 +153,217 @@ const int x264_lambda2_tab[52] = {
 943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
 };
 
+const uint8_t x264_exp2_lut[64] = {
+      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
+     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
+    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
+    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
+};
+
+const float x264_log2_lut[128] = {
+    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
+    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
+    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
+    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
+    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
+    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
+    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
+    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
+    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
+    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
+    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
+    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
+    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
+    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
+    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
+    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
+};
+
+/* Avoid an int/float conversion. */
+const float x264_log2_lz_lut[32] = {
+    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+};
+
+// should the intra and inter lambdas be different?
+// I'm just matching the behaviour of deadzone quant.
+static const int x264_trellis_lambda2_tab[2][52] = {
+    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
+    {    46,      58,      73,      92,     117,     147,
+        185,     233,     294,     370,     466,     587,
+        740,     932,    1174,    1480,    1864,    2349,
+       2959,    3728,    4697,    5918,    7457,    9395,
+      11837,   14914,   18790,   23674,   29828,   37581,
+      47349,   59656,   75163,   94699,  119313,  150326,
+     189399,  238627,  300652,  378798,  477255,  601304,
+     757596,  954511, 1202608, 1515192, 1909022, 2405217,
+    3030384, 3818045, 4810435, 6060769 },
+    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
+    {    27,      34,      43,      54,      68,      86,
+        108,     136,     172,     216,     273,     343,
+        433,     545,     687,     865,    1090,    1374,
+       1731,    2180,    2747,    3461,    4361,    5494,
+       6922,    8721,   10988,   13844,   17442,   21976,
+      27688,   34885,   43953,   55377,   69771,   87906,
+     110755,  139543,  175813,  221511,  279087,  351627,
+     443023,  558174,  703255,  886046, 1116348, 1406511,
+    1772093, 2232697, 2813022, 3544186 }
+};
+
+static const uint16_t x264_chroma_lambda2_offset_tab[] = {
+       16,    20,    25,    32,    40,    50,
+       64,    80,   101,   128,   161,   203,
+      256,   322,   406,   512,   645,   812,
+     1024,  1290,  1625,  2048,  2580,  3250,
+     4096,  5160,  6501,  8192, 10321, 13003,
+    16384, 20642, 26007, 32768, 41285, 52015,
+    65535
+};
+
 /* TODO: calculate CABAC costs */
-static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
+static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 };
-static const int i_mb_b16x8_cost_table[17] = {
+static const uint8_t i_mb_b16x8_cost_table[17] = {
     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 };
-static const int i_sub_mb_b_cost_table[13] = {
+static const uint8_t i_sub_mb_b_cost_table[13] = {
     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 };
-static const int i_sub_mb_p_cost_table[4] = {
+static const uint8_t i_sub_mb_p_cost_table[4] = {
     5, 3, 3, 1
 };
 
 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 
-/* Indexed by lambda instead of qp because, due to rounding,
- * some quantizers share lambdas.  This saves memory. */
-uint16_t *x264_cost_mv_fpel[92][4];
-uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[92][3][33];
+static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 
-/* initialize an array of lambda*nbits for all possible mvs */
-static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+int x264_analyse_init_costs( x264_t *h, int qp )
 {
-    static int16_t *p_cost_mv[92];
     int i, j;
-
-    if( !p_cost_mv[a->i_lambda] )
+    int lambda = x264_lambda_tab[qp];
+    if( h->cost_mv[lambda] )
+        return 0;
+    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
+    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
+    h->cost_mv[lambda] += 2*4*2048;
+    for( i = 0; i <= 2*4*2048; i++ )
+    {
+        h->cost_mv[lambda][-i] =
+        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+    }
+    x264_pthread_mutex_lock( &cost_ref_mutex );
+    for( i = 0; i < 3; i++ )
+        for( j = 0; j < 33; j++ )
+            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+    x264_pthread_mutex_unlock( &cost_ref_mutex );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
     {
-        x264_emms();
-        /* could be faster, but isn't called many times */
-        /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
-        p_cost_mv[a->i_lambda] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
-        p_cost_mv[a->i_lambda] += 2*4*2048;
-        for( i = 0; i <= 2*4*2048; i++ )
+        for( j=0; j<4; j++ )
         {
-            p_cost_mv[a->i_lambda][-i] =
-            p_cost_mv[a->i_lambda][i]  = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
+            h->cost_mv_fpel[lambda][j] += 2*2048;
+            for( i = -2*2048; i < 2*2048; i++ )
+                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
         }
-        for( i = 0; i < 3; i++ )
-            for( j = 0; j < 33; j++ )
-                x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
     }
-    a->p_cost_mv = p_cost_mv[a->i_lambda];
-    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
-    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+    return 0;
+fail:
+    return -1;
+}
 
-    /* FIXME is this useful for all me methods? */
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
+void x264_analyse_free_costs( x264_t *h )
+{
+    int i, j;
+    for( i = 0; i < 92; i++ ) /* one slot per lambda value, matching the first dimension of x264_cost_ref */
     {
-        for( j=0; j<4; j++ )
+        if( h->cost_mv[i] )
+            x264_free( h->cost_mv[i] - 2*4*2048 ); /* undo the centering offset applied in x264_analyse_init_costs */
+        for( j = 0; j < 4; j++ )
+            if( h->cost_mv_fpel[i][j] ) /* test each table: init can fail after allocating only some of the four */
+                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+    }
+}
+
+void x264_analyse_weight_frame( x264_t *h, int end )
+{
+    int j;
+    for( j=0; j<h->i_ref0; j++ )
+    {
+        if( h->sh.weight[j][0].weightfn )
         {
-            x264_cost_mv_fpel[a->i_lambda][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
-            x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
-            for( i = -2*2048; i < 2*2048; i++ )
-                x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
+            x264_frame_t *frame = h->fref0[j];
+            int width = frame->i_width[0] + 2*PADH;
+            int i_padv = PADV << h->param.b_interlaced;
+            int offset, height;
+            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+            int k;
+            height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
+            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
+            h->fenc->i_lines_weighted += height;
+            if( height )
+            {
+                for( k = j; k < h->i_ref0; k++ )
+                    if( h->sh.weight[k][0].weightfn )
+                    {
+                        uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
+                                                 src + offset, frame->i_stride[0],
+                                                 width, height, &h->sh.weight[k][0] );
+                    }
+            }
+            break;
         }
     }
 }
 
-static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+/* point the analysis context at the mv and ref cost tables built by x264_analyse_init_costs() for its lambda */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
+    a->p_cost_mv = h->cost_mv[a->i_lambda];
+    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; /* row index: active list-0 ref count minus one, clamped to 0..2 */
+    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; /* same for list 1 */
+}
+
+static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 {
-    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
-    /* mbrd == 1 -> RD mode decision */
-    /* mbrd == 2 -> RD refinement */
-    a->i_mbrd = (i>=6) + (i>=8);
     /* conduct the analysis using this lamda and QP */
     a->i_qp = h->mb.i_qp = i_qp;
     h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
+
     a->i_lambda = x264_lambda_tab[i_qp];
     a->i_lambda2 = x264_lambda2_tab[i_qp];
+
+    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
+    if( h->param.analyse.i_trellis )
+    {
+        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
+        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
+        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
+        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
+    }
+    h->mb.i_psy_rd_lambda = a->i_lambda;
+    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
+    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
+
+}
+
+static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
+{
+    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
+
+    /* mbrd == 1 -> RD mode decision */
+    /* mbrd == 2 -> RD refinement */
+    /* mbrd == 3 -> QPRD */
+    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
+
+    x264_mb_analyse_init_qp( h, a, i_qp );
+
     h->mb.i_me_method = h->param.analyse.i_me_method;
     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                         && h->mb.i_subpel_refine >= 5;
-    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
+
     h->mb.b_transform_8x8 = 0;
     h->mb.b_noise_reduction = 0;
 
@@ -257,8 +389,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
         // limit motion search to a slightly smaller range than the theoretical limit,
         // since the search may go a few iterations past its given range
-        int i_fpel_border = 5; // umh unconditional radius
-        int i_spel_border = 8; // 1.5 for subpel_satd, 1.5 for subpel_rd, 2 for bime, round up
+        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 
         /* Calculate max allowed MV range */
 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
@@ -266,15 +397,23 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
         h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
         h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
+        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
+        {
+            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
+            int max_mv = max_x - 4*16*h->mb.i_mb_x;
+            /* If we're left of the refresh bar, don't reference right of it. */
+            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
+                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
+        }
         h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
         h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
-        if( h->mb.i_mb_x == 0)
+        if( h->mb.i_mb_x == 0 )
         {
             int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
             int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
             int thread_mvy_range = i_fmv_range;
 
-            if( h->param.i_threads > 1 )
+            if( h->i_thread_frames > 1 )
             {
                 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
@@ -284,19 +423,22 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
                     int i_ref = i ? h->i_ref1 : h->i_ref0;
                     for( j=0; j<i_ref; j++ )
                     {
-                        x264_frame_cond_wait( fref[j], thresh );
-                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
+                        x264_frame_cond_wait( fref[j]->orig, thresh );
+                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                     }
                 }
+
                 if( h->param.b_deterministic )
                     thread_mvy_range = h->param.analyse.i_mv_range_thread;
                 if( h->mb.b_interlaced )
                     thread_mvy_range >>= 1;
+
+                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
             }
 
             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
-            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], X264_MAX(4*(-512+i_spel_border), -i_fmv_range), i_fmv_range );
+            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
@@ -361,154 +503,80 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
             }
         }
         h->mb.b_skip_mc = 0;
+        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
+            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
+        {
+            a->b_force_intra = 1;
+            a->b_fast_intra = 0;
+        }
+        else
+            a->b_force_intra = 0;
     }
 }
 
+/* Prediction modes allowed for various combinations of neighbors. */
+/* Terminated by a -1. */
+/* In order, no neighbors, left, top, top/left, top/left/topleft */
+static const int8_t i16x16_mode_available[5][5] =
+{
+    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
+    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
+    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
+    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
+    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
+};
 
+static const int8_t i8x8chroma_mode_available[5][5] =
+{
+    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
+    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
+    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
+    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
+    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
+};
 
-/*
- * Handle intra mb
- */
-/* Max = 4 */
-static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+static const int8_t i4x4_mode_available[5][10] =
 {
-    if( i_neighbour & MB_TOPLEFT )
-    {
-        /* top and left available */
-        *mode++ = I_PRED_16x16_V;
-        *mode++ = I_PRED_16x16_H;
-        *mode++ = I_PRED_16x16_DC;
-        *mode++ = I_PRED_16x16_P;
-        *pi_count = 4;
-    }
-    else if( i_neighbour & MB_LEFT )
-    {
-        /* left available*/
-        *mode++ = I_PRED_16x16_DC_LEFT;
-        *mode++ = I_PRED_16x16_H;
-        *pi_count = 2;
-    }
-    else if( i_neighbour & MB_TOP )
-    {
-        /* top available*/
-        *mode++ = I_PRED_16x16_DC_TOP;
-        *mode++ = I_PRED_16x16_V;
-        *pi_count = 2;
-    }
-    else
-    {
-        /* none available */
-        *mode = I_PRED_16x16_DC_128;
-        *pi_count = 1;
-    }
-}
+    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
+    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
+    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
+    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
+};
 
-/* Max = 4 */
-static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
+static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
 {
-    if( i_neighbour & MB_TOPLEFT )
-    {
-        /* top and left available */
-        *mode++ = I_PRED_CHROMA_V;
-        *mode++ = I_PRED_CHROMA_H;
-        *mode++ = I_PRED_CHROMA_DC;
-        *mode++ = I_PRED_CHROMA_P;
-        *pi_count = 4;
-    }
-    else if( i_neighbour & MB_LEFT )
-    {
-        /* left available*/
-        *mode++ = I_PRED_CHROMA_DC_LEFT;
-        *mode++ = I_PRED_CHROMA_H;
-        *pi_count = 2;
-    }
-    else if( i_neighbour & MB_TOP )
-    {
-        /* top available*/
-        *mode++ = I_PRED_CHROMA_DC_TOP;
-        *mode++ = I_PRED_CHROMA_V;
-        *pi_count = 2;
-    }
-    else
-    {
-        /* none available */
-        *mode = I_PRED_CHROMA_DC_128;
-        *pi_count = 1;
-    }
+    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+    return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
 }
 
-/* MAX = 9 */
-static void predict_4x4_mode_available( unsigned int i_neighbour,
-                                        int *mode, int *pi_count )
+static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
 {
-    int b_l = i_neighbour & MB_LEFT;
-    int b_t = i_neighbour & MB_TOP;
+    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+    return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
+}
 
-    if( b_l && b_t )
-    {
-        *pi_count = 6;
-        *mode++ = I_PRED_4x4_DC;
-        *mode++ = I_PRED_4x4_H;
-        *mode++ = I_PRED_4x4_V;
-        *mode++ = I_PRED_4x4_DDL;
-        if( i_neighbour & MB_TOPLEFT )
-        {
-            *mode++ = I_PRED_4x4_DDR;
-            *mode++ = I_PRED_4x4_VR;
-            *mode++ = I_PRED_4x4_HD;
-            *pi_count += 3;
-        }
-        *mode++ = I_PRED_4x4_VL;
-        *mode++ = I_PRED_4x4_HU;
-    }
-    else if( b_l )
-    {
-        *mode++ = I_PRED_4x4_DC_LEFT;
-        *mode++ = I_PRED_4x4_H;
-        *mode++ = I_PRED_4x4_HU;
-        *pi_count = 3;
-    }
-    else if( b_t )
-    {
-        *mode++ = I_PRED_4x4_DC_TOP;
-        *mode++ = I_PRED_4x4_V;
-        *mode++ = I_PRED_4x4_DDL;
-        *mode++ = I_PRED_4x4_VL;
-        *pi_count = 4;
-    }
-    else
-    {
-        *mode++ = I_PRED_4x4_DC_128;
-        *pi_count = 1;
-    }
+static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
+{
+    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
+    return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
 }
 
 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
 {
-    DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
-    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
-    DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
-    int i;
+    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
 
     if( do_both_dct || h->mb.b_transform_8x8 )
-    {
-        h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
-        for( i = 0; i < 4; i++ )
-            h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
-    }
+        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
     if( do_both_dct || !h->mb.b_transform_8x8 )
-    {
-        h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
-        for( i = 0; i < 16; i++ )
-            h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
-    }
+        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 }
 
 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
 static inline void x264_mb_cache_fenc_satd( x264_t *h )
 {
-    DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0};
+    ALIGNED_16( static uint8_t zero[16] ) = {0};
     uint8_t *fenc;
     int x, y, satd_sum = 0, sa8d_sum = 0;
     if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
@@ -537,71 +605,55 @@ static inline void x264_mb_cache_fenc_satd( x264_t *h )
 
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 {
-    int i;
-
-    int i_max;
-    int predict_mode[4];
     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
 
-    uint8_t *p_dstc[2], *p_srcc[2];
-
     if( a->i_satd_i8x8chroma < COST_MAX )
         return;
 
-    /* 8x8 prediction selection for chroma */
-    p_dstc[0] = h->mb.pic.p_fdec[1];
-    p_dstc[1] = h->mb.pic.p_fdec[2];
-    p_srcc[0] = h->mb.pic.p_fenc[1];
-    p_srcc[1] = h->mb.pic.p_fenc[2];
+    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 
-    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    a->i_satd_i8x8chroma = COST_MAX;
-    if( i_max == 4 && b_merged_satd )
+    /* 8x8 prediction selection for chroma */
+    if( predict_mode[3] >= 0 && b_merged_satd )
     {
         int satdu[4], satdv[4];
-        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
-        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
-        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
-        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
-        satdu[I_PRED_CHROMA_P] =
-            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
-        satdv[I_PRED_CHROMA_P] =
-            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
-
-        for( i=0; i<i_max; i++ )
+        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
+        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
+        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
+        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
+        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
+        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+
+        for( ; *predict_mode >= 0; predict_mode++ )
         {
-            int i_mode = predict_mode[i];
-            int i_satd = satdu[i_mode] + satdv[i_mode]
-                       + a->i_lambda * bs_size_ue(i_mode);
+            int i_mode = *predict_mode;
+            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 
-            a->i_satd_i8x8chroma_dir[i] = i_satd;
+            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
         }
     }
     else
     {
-        for( i=0; i<i_max; i++ )
+        for( ; *predict_mode >= 0; predict_mode++ )
         {
             int i_satd;
-            int i_mode = predict_mode[i];
+            int i_mode = *predict_mode;
 
             /* we do the prediction */
             if( h->mb.b_lossless )
                 x264_predict_lossless_8x8_chroma( h, i_mode );
             else
             {
-                h->predict_8x8c[i_mode]( p_dstc[0] );
-                h->predict_8x8c[i_mode]( p_dstc[1] );
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
             }
 
             /* we calculate the cost */
-            i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
-                                               p_srcc[0], FENC_STRIDE ) +
-                     h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
-                                               p_srcc[1], FENC_STRIDE ) +
+            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
+                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 
-            a->i_satd_i8x8chroma_dir[i] = i_satd;
+            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
         }
     }
@@ -616,16 +668,14 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
     int i, idx;
-    int i_max;
-    int predict_mode[9];
     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
 
     /*---------------- Try all mode and calculate their score ---------------*/
 
     /* 16x16 prediction selection */
-    predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
 
-    if( b_merged_satd && i_max == 4 )
+    if( b_merged_satd && predict_mode[3] >= 0 )
     {
         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
         h->predict_16x16[I_PRED_16x16_P]( p_dst );
@@ -639,10 +689,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     }
     else
     {
-        for( i = 0; i < i_max; i++ )
+        for( ; *predict_mode >= 0; predict_mode++ )
         {
             int i_satd;
-            int i_mode = predict_mode[i];
+            int i_mode = *predict_mode;
 
             if( h->mb.b_lossless )
                 x264_predict_lossless_16x16( h, i_mode );
@@ -665,7 +715,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     /* 8x8 prediction selection */
     if( flags & X264_ANALYSE_I8x8 )
     {
-        DECLARE_ALIGNED_16( uint8_t edge[33] );
+        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
         x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
         int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
         int i_cost = 0;
@@ -685,10 +735,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 
-            if( b_merged_satd && i_max == 9 )
+            if( b_merged_satd && predict_mode[8] >= 0 )
             {
                 int satd[9];
                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
@@ -698,23 +748,22 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
                     int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                 }
-                i = 3;
+                predict_mode += 3;
             }
-            else
-                i = 0;
 
-            for( ; i<i_max; i++ )
+            for( ; *predict_mode >= 0; predict_mode++ )
             {
                 int i_satd;
-                int i_mode = predict_mode[i];
+                int i_mode = *predict_mode;
 
                 if( h->mb.b_lossless )
                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                 else
                     h->predict_8x8[i_mode]( p_dst_by, edge );
 
-                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
-                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
+                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+                    i_satd -= a->i_lambda * 3;
 
                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
@@ -737,10 +786,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-                h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-                h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-                h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-                h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                 if( h->mb.i_skip_intra == 2 )
                     h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -777,41 +826,39 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
-            if( b_merged_satd && i_max >= 6 )
+            if( b_merged_satd && predict_mode[5] >= 0 )
             {
                 int satd[9];
                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                 satd[i_pred_mode] -= 3 * a->i_lambda;
                 for( i=2; i>=0; i-- )
-                    COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
-                                 a->i_predict4x4[idx], i );
-                i = 3;
+                    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
+                predict_mode += 3;
             }
-            else
-                i = 0;
 
-            for( ; i<i_max; i++ )
+            for( ; *predict_mode >= 0; predict_mode++ )
             {
                 int i_satd;
-                int i_mode = predict_mode[i];
+                int i_mode = *predict_mode;
+
                 if( h->mb.b_lossless )
                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                 else
                     h->predict_4x4[i_mode]( p_dst_by );
 
-                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
-                                                   p_src_by, FENC_STRIDE )
-                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+                    i_satd -= a->i_lambda * 3;
 
                 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
             }
-            i_cost += i_best;
+            i_cost += i_best + 4 * a->i_lambda;
 
             if( i_cost > i_satd_thresh || idx == 15 )
                 break;
@@ -828,10 +875,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             if( h->mb.i_skip_intra )
             {
                 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-                h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-                h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-                h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-                h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                 if( h->mb.i_skip_intra == 2 )
                     h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
@@ -877,21 +924,20 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 {
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    int i, j, idx, x, y;
-    int i_max, i_mode, i_thresh;
+    int i, idx, x, y;
+    int i_mode, i_thresh;
     uint64_t i_satd, i_best;
-    int predict_mode[9];
     h->mb.i_skip_intra = 0;
 
     if( h->mb.i_type == I_16x16 )
     {
         int old_pred_mode = a->i_predict16x16;
+        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
         i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
         i_best = a->i_satd_i16x16;
-        predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-        for( i = 0; i < i_max; i++ )
+        for( ; *predict_mode >= 0; predict_mode++ )
         {
-            int i_mode = predict_mode[i];
+            int i_mode = *predict_mode;
             if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                 continue;
             h->mb.i_intra16x16_pred_mode = i_mode;
@@ -901,18 +947,19 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
     }
 
     /* RD selection for chroma prediction */
-    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    if( i_max > 1 )
+    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+    if( predict_mode[1] >= 0 )
     {
+        int8_t predict_mode_sorted[4];
+        int i_max;
         i_thresh = a->i_satd_i8x8chroma * 5/4;
 
-        for( i = j = 0; i < i_max; i++ )
-            if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
-                predict_mode[i] != a->i_predict8x8chroma )
-            {
-                predict_mode[j++] = predict_mode[i];
-            }
-        i_max = j;
+        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
+        {
+            i_mode = *predict_mode;
+            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
+                predict_mode_sorted[i_max++] = i_mode;
+        }
 
         if( i_max > 0 )
         {
@@ -924,7 +971,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
             for( i = 0; i < i_max; i++ )
             {
-                i_mode = predict_mode[i];
+                i_mode = predict_mode_sorted[i];
                 if( h->mb.b_lossless )
                     x264_predict_lossless_8x8_chroma( h, i_mode );
                 else
@@ -952,15 +999,15 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
             i_best = COST_MAX64;
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
-            for( i = 0; i < i_max; i++ )
+            for( ; *predict_mode >= 0; predict_mode++ )
             {
-                i_mode = predict_mode[i];
+                i_mode = *predict_mode;
                 if( h->mb.b_lossless )
                     x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                 else
@@ -971,18 +1018,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                 {
                     a->i_predict4x4[idx] = i_mode;
                     i_best = i_satd;
-                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
-                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
-                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
-                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
+                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
+                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
+                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                 }
             }
 
-            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
-            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
-            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
-            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
 
             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
@@ -990,12 +1037,12 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
     }
     else if( h->mb.i_type == I_8x8 )
     {
-        DECLARE_ALIGNED_16( uint8_t edge[33] );
+        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
         for( idx = 0; idx < 4; idx++ )
         {
             uint64_t pels_h = 0;
             uint8_t pels_v[7];
-            uint16_t i_nnz[2];
+            uint16_t i_nnz[2] = {0}; //shut up gcc
             uint8_t *p_dst_by;
             int j;
             int cbp_luma_new = 0;
@@ -1006,14 +1053,15 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             y = idx>>1;
 
             p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 
-            for( i = 0; i < i_max; i++ )
+            for( ; *predict_mode >= 0; predict_mode++ )
             {
-                i_mode = predict_mode[i];
+                i_mode = *predict_mode;
                 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                     continue;
+
                 if( h->mb.b_lossless )
                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                 else
@@ -1027,21 +1075,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                     cbp_luma_new = h->mb.i_cbp_luma;
                     i_best = i_satd;
 
-                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
+                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                     if( !(idx&1) )
                         for( j=0; j<7; j++ )
                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
-                    i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
-                    i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
+                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
+                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
                 }
             }
             a->i_cbp_i8x8_luma = cbp_luma_new;
-            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
+            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
             if( !(idx&1) )
                 for( j=0; j<7; j++ )
                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
+            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
+            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
 
             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
         }
@@ -1049,6 +1097,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 }
 
 #define LOAD_FENC( m, src, xoff, yoff) \
+    (m)->p_cost_mv = a->p_cost_mv; \
     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
@@ -1056,28 +1105,33 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
 
 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
-    (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
+    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
-    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
+    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
+    (m)->weight = weight_none; \
+    (m)->i_ref = ref;
+
+#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
+    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
+    (m)->weight = h->sh.weight[i_ref];
 
 #define REF_COST(list, ref) \
-    (a->p_cost_ref##list[ref])
+    (a->p_cost_ref[list][ref])
 
 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     int i_ref, i_mvc;
-    DECLARE_ALIGNED_4( int16_t mvc[8][2] );
+    ALIGNED_4( int16_t mvc[8][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
 
     /* 16x16 Search on all ref frame */
     m.i_pixel = PIXEL_16x16;
-    m.p_cost_mv = a->p_cost_mv;
     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 
     a->l0.me16x16.cost = INT_MAX;
@@ -1086,13 +1140,21 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         const int i_ref_cost = REF_COST( 0, i_ref );
         i_halfpel_thresh -= i_ref_cost;
         m.i_ref_cost = i_ref_cost;
-        m.i_ref = i_ref;
 
         /* search with ref */
         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
+        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
+
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
-        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
+
+        if( h->mb.ref_blind_dupe == i_ref )
+        {
+            CP32( m.mv, a->l0.mvc[0][0] );
+            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
+        }
+        else
+            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         /* early termination
          * SSD threshold would probably be better than SATD */
@@ -1105,7 +1167,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         {
             h->mb.i_type = P_SKIP;
             x264_analyse_update_cache( h, a );
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
             return;
         }
 
@@ -1116,22 +1178,24 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
 
         /* save mv for predicting neighbors */
-        *(uint32_t*)a->l0.mvc[i_ref][0] =
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( a->l0.mvc[i_ref][0], m.mv );
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
 
     h->mb.i_type = P_L0;
     if( a->i_mbrd )
     {
         x264_mb_cache_fenc_satd( h );
-        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
         {
             h->mb.i_partition = D_16x16;
             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
             a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
+                h->mb.i_type = P_SKIP;
         }
     }
 }
@@ -1148,22 +1212,29 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
 
     h->mb.i_partition = D_8x8;
 
+    #define CHECK_NEIGHBOUR(i)\
+    {\
+        int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
+        if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
+            i_maxref = ref;\
+    }
+
     /* early termination: if 16x16 chose ref 0, then evalute no refs older
      * than those used by the neighbors */
-    if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
+    if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
         h->mb.i_mb_type_top && h->mb.i_mb_type_left )
     {
         i_maxref = 0;
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
-        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
+        CHECK_NEIGHBOUR(  -8 - 1 );
+        CHECK_NEIGHBOUR(  -8 + 0 );
+        CHECK_NEIGHBOUR(  -8 + 2 );
+        CHECK_NEIGHBOUR(  -8 + 4 );
+        CHECK_NEIGHBOUR(   0 - 1 );
+        CHECK_NEIGHBOUR( 2*8 - 1 );
     }
 
     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
-         *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
+        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1172,34 +1243,45 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
         const int y8 = i/2;
 
         m.i_pixel = PIXEL_8x8;
-        m.p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
         l0m->cost = INT_MAX;
-        for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+        for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
         {
             const int i_ref_cost = REF_COST( 0, i_ref );
-            i_halfpel_thresh -= i_ref_cost;
             m.i_ref_cost = i_ref_cost;
-            m.i_ref = i_ref;
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-            x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
+            if( h->mb.ref_blind_dupe == i_ref )
+            {
+                CP32( m.mv, a->l0.mvc[0][i+1] );
+                x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
+            }
+            else
+                x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
 
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
-            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+            CP32( a->l0.mvc[i_ref][i+1], m.mv );
 
             if( m.cost < l0m->cost )
                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
+            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
+                i_ref = h->mb.ref_blind_dupe;
+            else
+                i_ref++;
         }
         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
 
-        /* mb type cost */
-        l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
+           are effectively zero. */
+        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
     }
 
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
@@ -1214,9 +1296,11 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
 
 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    const int i_ref = a->l0.me16x16.i_ref;
+    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
+     * reference frame flags.  Thus, if we're not doing mixedrefs, just
+     * don't bother analysing the dupes. */
+    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
     const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][i_ref];
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
     int i_mvc;
     int16_t (*mvc)[2] = a->l0.mvc[i_ref];
@@ -1226,7 +1310,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
+    CP32( mvc[0], a->l0.me16x16.mv );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1235,23 +1319,24 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
         const int y8 = i/2;
 
         m->i_pixel = PIXEL_8x8;
-        m->p_cost_mv = a->p_cost_mv;
         m->i_ref_cost = i_ref_cost;
-        m->i_ref = i_ref;
 
         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
-        LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
+        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
+
         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
         x264_me_search( h, m, mvc, i_mvc );
 
         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
 
-        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
+        CP32( mvc[i_mvc], m->mv );
         i_mvc++;
 
         /* mb type cost */
         m->cost += i_ref_cost;
-        m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
+            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
     }
 
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
@@ -1268,7 +1353,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+    ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1277,11 +1362,12 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
     for( i = 0; i < 2; i++ )
     {
         x264_me_t *l0m = &a->l0.me16x8[i];
-        const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
+        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
+        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
+        const int ref8[2] = { minref, maxref };
         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 
         m.i_pixel = PIXEL_16x8;
-        m.p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( &m, p_fenc, 0, 8*i );
         l0m->cost = INT_MAX;
@@ -1290,17 +1376,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
             const int i_ref = ref8[j];
             const int i_ref_cost = REF_COST( 0, i_ref );
             m.i_ref_cost = i_ref_cost;
-            m.i_ref = i_ref;
 
             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
-            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+            CP32( mvc[0], a->l0.mvc[i_ref][0] );
+            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
+
             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
-            x264_me_search( h, &m, mvc, 3 );
+            /* We can only take this shortcut if the first search was performed on ref0. */
+            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
+            {
+                /* We can just leave the MV from the previous ref search. */
+                x264_me_refine_qpel_refdupe( h, &m, NULL );
+            }
+            else
+                x264_me_search( h, &m, mvc, 3 );
 
             m.cost += i_ref_cost;
 
@@ -1318,7 +1412,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+    ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1327,11 +1421,12 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
     for( i = 0; i < 2; i++ )
     {
         x264_me_t *l0m = &a->l0.me8x16[i];
-        const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
+        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
+        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
+        const int ref8[2] = { minref, maxref };
         const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 
         m.i_pixel = PIXEL_8x16;
-        m.p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( &m, p_fenc, 8*i, 0 );
         l0m->cost = INT_MAX;
@@ -1340,16 +1435,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
             const int i_ref = ref8[j];
             const int i_ref_cost = REF_COST( 0, i_ref );
             m.i_ref_cost = i_ref_cost;
-            m.i_ref = i_ref;
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
-            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+            CP32( mvc[0], a->l0.mvc[i_ref][0] );
+            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
+
             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-            x264_me_search( h, &m, mvc, 3 );
+            /* We can only take this shortcut if the first search was performed on ref0. */
+            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
+            {
+                /* We can just leave the MV from the previous ref search. */
+                x264_me_refine_qpel_refdupe( h, &m, NULL );
+            }
+            else
+                x264_me_search( h, &m, mvc, 3 );
 
             m.cost += i_ref_cost;
 
@@ -1365,32 +1468,43 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 
 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
 {
-    DECLARE_ALIGNED_8( uint8_t pix1[16*8] );
+    ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
     uint8_t *pix2 = pix1+8;
     const int i_stride = h->mb.pic.i_stride[1];
     const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
     const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
+    const int i_ref = a->l0.me8x8[i8x8].i_ref;
+    const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    x264_weight_t *weight = h->sh.weight[i_ref];
 
 #define CHROMA4x4MC( width, height, me, x, y ) \
-    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
-    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
+    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+    if( weight[1].weightfn ) \
+        weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+    if( weight[2].weightfn ) \
+        weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
+
 
     if( pixel == PIXEL_4x4 )
     {
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
-        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
+        x264_me_t *m = a->l0.me4x4[i8x8];
+        CHROMA4x4MC( 2,2, m[0], 0,0 );
+        CHROMA4x4MC( 2,2, m[1], 2,0 );
+        CHROMA4x4MC( 2,2, m[2], 0,2 );
+        CHROMA4x4MC( 2,2, m[3], 2,2 );
     }
     else if( pixel == PIXEL_8x4 )
     {
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
-        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
+        x264_me_t *m = a->l0.me8x4[i8x8];
+        CHROMA4x4MC( 4,2, m[0], 0,0 );
+        CHROMA4x4MC( 4,2, m[1], 0,2 );
     }
     else
     {
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
-        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
+        x264_me_t *m = a->l0.me4x8[i8x8];
+        CHROMA4x4MC( 2,4, m[0], 0,0 );
+        CHROMA4x4MC( 2,4, m[1], 2,0 );
     }
 
     return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
@@ -1417,10 +1531,10 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
 
         m->i_pixel = PIXEL_4x4;
-        m->p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
 
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
@@ -1457,10 +1571,10 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
 
         m->i_pixel = PIXEL_8x4;
-        m->p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
 
         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
@@ -1494,10 +1608,10 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
 
         m->i_pixel = PIXEL_4x8;
-        m->p_cost_mv = a->p_cost_mv;
 
         LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
         LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
+        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
 
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
@@ -1534,33 +1648,31 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     }
 }
 
-#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
-{ \
-    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-}
-
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
-    DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
+    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
+    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
     uint8_t *src0, *src1;
     int stride0 = 16, stride1 = 16;
 
     x264_me_t m;
     int i_ref, i_mvc;
-    DECLARE_ALIGNED_4( int16_t mvc[9][2] );
+    ALIGNED_4( int16_t mvc[9][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
 
     /* 16x16 Search on all ref frame */
     m.i_pixel = PIXEL_16x16;
-    m.p_cost_mv = a->p_cost_mv;
+    m.weight = weight_none;
+
     LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
 
     /* ME for List 0 */
     a->l0.me16x16.cost = INT_MAX;
     for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
     {
+        const int i_ref_cost = REF_COST( 0, i_ref );
+        m.i_ref_cost = i_ref_cost;
         /* search with ref */
         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
@@ -1568,7 +1680,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         /* add ref cost */
-        m.cost += REF_COST( 0, i_ref );
+        m.cost += i_ref_cost;
 
         if( m.cost < a->l0.me16x16.cost )
         {
@@ -1577,10 +1689,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
-    /* subtract ref cost, so we don't have to add it for the other MB types */
-    a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
+    a->l0.me16x16.i_ref = a->l0.i_ref;
 
     /* ME for list 1 */
     i_halfpel_thresh = INT_MAX;
@@ -1588,6 +1699,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     a->l1.me16x16.cost = INT_MAX;
     for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
     {
+        const int i_ref_cost = REF_COST( 0, i_ref );
+        m.i_ref_cost = i_ref_cost;
         /* search with ref */
         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
@@ -1595,7 +1708,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         /* add ref cost */
-        m.cost += REF_COST( 1, i_ref );
+        m.cost += i_ref_cost;
 
         if( m.cost < a->l1.me16x16.cost )
         {
@@ -1604,22 +1717,17 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
     }
-    /* subtract ref cost, so we don't have to add it for the other MB types */
-    a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
-
-    /* Set global ref, needed for other modes? */
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+    a->l1.me16x16.i_ref = a->l1.i_ref;
 
     /* get cost of BI mode */
     src0 = h->mc.get_ref( pix0, &stride0,
-                           h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                           a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+                          h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
     src1 = h->mc.get_ref( pix1, &stride1,
-                           h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                           a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
+                          h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
 
     h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
 
@@ -1665,6 +1773,16 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
     }
 }
 
+static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
+{
+    const int x = 2*(idx&1);
+    const int y = 2*(idx>>1);
+    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
+    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
+    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
+    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
+}
+
 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
     if( x264_mb_partition_listX_table[0][part] ) \
     { \
@@ -1725,7 +1843,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref[2] =
         { h->mb.pic.p_fref[0][a->l0.i_ref],
           h->mb.pic.p_fref[1][a->l1.i_ref] };
-    DECLARE_ALIGNED_8( uint8_t pix[2][8*8] );
+    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
     int i, l;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1745,24 +1863,26 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         for( l = 0; l < 2; l++ )
         {
             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+            const int i_ref_cost = REF_COST( l, lX->i_ref );
             x264_me_t *m = &lX->me8x8[i];
 
             m->i_pixel = PIXEL_8x8;
-            m->p_cost_mv = a->p_cost_mv;
+            m->i_ref_cost = i_ref_cost;
 
             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
 
+            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, &lX->me16x16.mv, 1 );
+            m->cost += i_ref_cost;
 
             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
 
             /* BI mode */
             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
-                                    m->mv[0], m->mv[1], 8, 8 );
-            i_part_cost_bi += m->cost_mv;
-            /* FIXME: ref cost */
+                                    m->mv[0], m->mv[1], 8, 8, weight_none );
+            i_part_cost_bi += m->cost_mv + i_ref_cost;
         }
         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
@@ -1790,8 +1910,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref[2] =
         { h->mb.pic.p_fref[0][a->l0.i_ref],
           h->mb.pic.p_fref[1][a->l1.i_ref] };
-    DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
-    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
+    ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_16x8;
@@ -1808,25 +1928,27 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
         for( l = 0; l < 2; l++ )
         {
             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+            const int i_ref_cost = REF_COST( l, lX->i_ref );
             x264_me_t *m = &lX->me16x8[i];
 
             m->i_pixel = PIXEL_16x8;
-            m->p_cost_mv = a->p_cost_mv;
+            m->i_ref_cost = i_ref_cost;
 
             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
-            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+            CP32( mvc[0], lX->me8x8[2*i].mv );
+            CP32( mvc[1], lX->me8x8[2*i+1].mv );
 
-            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
+            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
+            x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
             x264_me_search( h, m, mvc, 2 );
+            m->cost += i_ref_cost;
 
             /* BI mode */
             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
-                                    m->mv[0], m->mv[1], 16, 8 );
-            /* FIXME: ref cost */
-            i_part_cost_bi += m->cost_mv;
+                                    m->mv[0], m->mv[1], 16, 8, weight_none );
+            i_part_cost_bi += m->cost_mv + i_ref_cost;
         }
         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
         i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
@@ -1860,8 +1982,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref[2] =
         { h->mb.pic.p_fref[0][a->l0.i_ref],
           h->mb.pic.p_fref[1][a->l1.i_ref] };
-    DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
-    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
+    ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_8x16;
@@ -1877,25 +1999,27 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
         for( l = 0; l < 2; l++ )
         {
             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+            const int i_ref_cost = REF_COST( l, lX->i_ref );
             x264_me_t *m = &lX->me8x16[i];
 
             m->i_pixel = PIXEL_8x16;
-            m->p_cost_mv = a->p_cost_mv;
+            m->i_ref_cost = i_ref_cost;
 
             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
 
-            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
-            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
+            CP32( mvc[0], lX->me8x8[i].mv );
+            CP32( mvc[1], lX->me8x8[i+2].mv );
 
+            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, mvc, 2 );
+            m->cost += i_ref_cost;
 
             /* BI mode */
             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref,  m->i_stride[0],
-                                    m->mv[0], m->mv[1], 8, 16 );
-            /* FIXME: ref cost */
-            i_part_cost_bi += m->cost_mv;
+                                    m->mv[0], m->mv[1], 8, 16, weight_none );
+            i_part_cost_bi += m->cost_mv + i_ref_cost;
         }
 
         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
@@ -1936,7 +2060,6 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
         x264_analyse_update_cache( h, a );
         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
     }
-    a->l0.me16x16.cost = a->l0.i_rd16x16;
 
     if( a->l0.i_cost16x8 <= thresh )
     {
@@ -1985,8 +2108,11 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                     COPY2_IF_LT( bcost, cost, btype, subtype );
                 }
-                h->mb.i_sub_partition[i] = btype;
-                x264_mb_cache_mv_p8x8( h, a, i );
+                if( h->mb.i_sub_partition[i] != btype )
+                {
+                    h->mb.i_sub_partition[i] = btype;
+                    x264_mb_cache_mv_p8x8( h, a, i );
+                }
             }
         }
         else
@@ -2077,25 +2203,25 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
 
     switch( h->mb.i_partition )
     {
-    case D_16x16:
-        if( h->mb.i_type == B_BI_BI )
-            x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
-        break;
-    case D_16x8:
-        for( i=0; i<2; i++ )
-            if( a->i_mb_partition16x8[i] == D_BI_8x8 )
-                x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
-        break;
-    case D_8x16:
-        for( i=0; i<2; i++ )
-            if( a->i_mb_partition8x16[i] == D_BI_8x8 )
-                x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
-        break;
-    case D_8x8:
-        for( i=0; i<4; i++ )
-            if( h->mb.i_sub_partition[i] == D_BI_8x8 )
-                x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
-        break;
+        case D_16x16:
+            if( h->mb.i_type == B_BI_BI )
+                x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
+            break;
+        case D_16x8:
+            for( i=0; i<2; i++ )
+                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
+                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
+            break;
+        case D_8x16:
+            for( i=0; i<2; i++ )
+                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
+                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
+            break;
+        case D_8x8:
+            for( i=0; i<4; i++ )
+                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
+                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
+            break;
     }
 }
 
@@ -2123,7 +2249,7 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *
     {
         int i_rd8;
         x264_analyse_update_cache( h, a );
-        h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+        h->mb.b_transform_8x8 ^= 1;
         /* FIXME only luma is needed, but the score for comparison already includes chroma */
         i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
 
@@ -2134,10 +2260,87 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *
             *i_rd = i_rd8;
         }
         else
-            h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+            h->mb.b_transform_8x8 ^= 1;
     }
 }
 
+/* Rate-distortion optimal QP selection: starting from the current h->mb.i_qp, try
+ * neighbouring QPs with x264_rd_cost_mb() and leave the cheapest one in h->mb.i_qp
+ * (h->mb.i_chroma_qp is updated to match); h->mb.b_transform_8x8 may be flipped too.
+ * FIXME: More than half of the benefit of this function seems to be in the way it
+ * improves the coding of chroma DC (by decimating or finding a better way to code a
+ * single DC coefficient).  There must be a more efficient way to get that portion of
+ * the benefit without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
+static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
+{
+    int bcost, cost, direction, failures, prevcost, origcost;
+    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
+    int last_qp_tried = 0;
+    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
+
+    /* Pass 1 raises QP (direction +1), pass 2 lowers it (-1).  If CBP is already zero, skip pass 1: don't raise the quantizer any higher. */
+    for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
+    {
+        /* Without psy-RD, require monotonicity when moving quant away from previous
+         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
+         * With psy-RD, allow 1 failure when moving quant away from previous quant,
+         * allow 2 failures when moving quant towards previous quant.
+         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
+        int threshold = (!!h->mb.i_psy_rd);
+        /* Raise the threshold for failures if this pass moves from orig_qp towards h->mb.i_last_qp. */
+        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
+            ( h->mb.i_last_qp > orig_qp && direction ==  1 ) )
+            threshold++;
+        h->mb.i_qp = orig_qp; /* each pass restarts from the original QP and cost */
+        failures = 0;
+        prevcost = origcost;
+        h->mb.i_qp += direction;
+        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
+        {
+            if( h->mb.i_last_qp == h->mb.i_qp )
+                last_qp_tried = 1;
+            h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+            cost = x264_rd_cost_mb( h, a->i_lambda2 );
+            COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+
+            /* We can't assume that the costs are monotonic over QPs, so count consecutive
+             * non-improving steps instead.  Tie case-as-failure seems to give better results. */
+            if( cost < prevcost )
+                failures = 0;
+            else
+                failures++;
+            prevcost = cost;
+
+            if( failures > threshold )
+                break;
+            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] ) /* CBP is now zero: stop raising QP */
+                break;
+            h->mb.i_qp += direction;
+        }
+    }
+
+    /* Always try the last block's QP (h->mb.i_last_qp) if neither pass above evaluated it. */
+    if( !last_qp_tried )
+    {
+        h->mb.i_qp = h->mb.i_last_qp;
+        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+        cost = x264_rd_cost_mb( h, a->i_lambda2 );
+        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
+    }
+
+    h->mb.i_qp = bqp; /* commit the best QP found, and the matching chroma QP */
+    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
+
+    /* Check transform again if QP changed; decision from before may no longer be optimal.  The flip is undone only if it costs more than bcost. */
+    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
+        x264_mb_transform_8x8_allowed( h ) )
+    {
+        h->mb.b_transform_8x8 ^= 1;
+        cost = x264_rd_cost_mb( h, a->i_lambda2 );
+        if( cost > bcost )
+            h->mb.b_transform_8x8 ^= 1;
+    }
+}
 
 /*****************************************************************************
  * x264_macroblock_analyse:
@@ -2150,13 +2353,20 @@ void x264_macroblock_analyse( x264_t *h )
 
     h->mb.i_qp = x264_ratecontrol_qp( h );
     if( h->param.rc.i_aq_mode )
+    {
         x264_adaptive_quant( h );
+        /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
+         * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
+        if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
+            h->mb.i_qp = h->mb.i_last_qp;
+    }
 
     x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
 
     /*--------------------------- Do the analysis ---------------------------*/
     if( h->sh.i_type == SLICE_TYPE_I )
     {
+intra_analysis:
         if( analysis.i_mbrd )
             x264_mb_cache_fenc_satd( h );
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
@@ -2179,20 +2389,31 @@ void x264_macroblock_analyse( x264_t *h )
 
         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
 
-        /* Fast P_SKIP detection */
         analysis.b_try_pskip = 0;
-        if( h->param.analyse.b_fast_pskip )
+        if( analysis.b_force_intra )
         {
-            if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
-                // FIXME don't need to check this if the reference frame is done
-                {}
-            else if( h->param.analyse.i_subpel_refine >= 3 )
-                analysis.b_try_pskip = 1;
-            else if( h->mb.i_mb_type_left == P_SKIP ||
-                     h->mb.i_mb_type_top == P_SKIP ||
-                     h->mb.i_mb_type_topleft == P_SKIP ||
-                     h->mb.i_mb_type_topright == P_SKIP )
-                b_skip = x264_macroblock_probe_pskip( h );
+            if( !h->param.analyse.b_psy )
+            {
+                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
+                goto intra_analysis;
+            }
+        }
+        else
+        {
+            /* Fast P_SKIP detection */
+            if( h->param.analyse.b_fast_pskip )
+            {
+                if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
+                    // FIXME don't need to check this if the reference frame is done
+                    {}
+                else if( h->param.analyse.i_subpel_refine >= 3 )
+                    analysis.b_try_pskip = 1;
+                else if( h->mb.i_mb_type_left == P_SKIP ||
+                         h->mb.i_mb_type_top == P_SKIP ||
+                         h->mb.i_mb_type_topleft == P_SKIP ||
+                         h->mb.i_mb_type_topright == P_SKIP )
+                    b_skip = x264_macroblock_probe_pskip( h );
+            }
         }
 
         h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
@@ -2201,7 +2422,7 @@ void x264_macroblock_analyse( x264_t *h )
         {
             h->mb.i_type = P_SKIP;
             h->mb.i_partition = D_16x16;
-            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
+            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
         }
         else
         {
@@ -2281,7 +2502,7 @@ void x264_macroblock_analyse( x264_t *h )
 
             /* refine qpel */
             //FIXME mb_type costs?
-            if( analysis.i_mbrd )
+            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
             {
                 /* refine later */
             }
@@ -2365,7 +2586,7 @@ void x264_macroblock_analyse( x264_t *h )
                 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                 i_type = P_L0;
                 i_partition = D_16x16;
-                i_cost = analysis.l0.me16x16.cost;
+                i_cost = analysis.l0.i_rd16x16;
                 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
@@ -2383,6 +2604,19 @@ void x264_macroblock_analyse( x264_t *h )
 
             h->mb.i_type = i_type;
 
+            if( analysis.b_force_intra && !IS_INTRA(i_type) )
+            {
+                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
+                 * it was an inter block. */
+                x264_analyse_update_cache( h, &analysis );
+                x264_macroblock_encode( h );
+                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
+                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
+                goto intra_analysis;
+            }
+
             if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
             {
                 if( IS_INTRA( h->mb.i_type ) )
@@ -2392,6 +2626,7 @@ void x264_macroblock_analyse( x264_t *h )
                 else if( i_partition == D_16x16 )
                 {
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
+                    analysis.l0.me16x16.cost = i_cost;
                     x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                 }
                 else if( i_partition == D_16x8 )
@@ -2424,20 +2659,20 @@ void x264_macroblock_analyse( x264_t *h )
                         }
                         else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                         {
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                         }
                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                         {
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                         }
                         else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                         {
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
-                           x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
+                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                         }
                     }
                 }
@@ -2500,7 +2735,7 @@ void x264_macroblock_analyse( x264_t *h )
             const unsigned int flags = h->param.analyse.inter;
             int i_type;
             int i_partition;
-            int i_satd_inter = 0; // shut up uninitialized warning
+            int i_satd_inter;
             h->mb.b_skip_mc = 0;
 
             x264_mb_analyse_load_costs( h, &analysis );
@@ -2561,7 +2796,7 @@ void x264_macroblock_analyse( x264_t *h )
                 }
             }
 
-            if( analysis.i_mbrd )
+            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
             {
                 /* refine later */
             }
@@ -2644,9 +2879,10 @@ void x264_macroblock_analyse( x264_t *h )
                 }
             }
 
+            i_satd_inter = i_cost;
+
             if( analysis.i_mbrd )
             {
-                i_satd_inter = i_cost;
                 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                 i_type = B_SKIP;
                 i_cost = i_bskip_cost;
@@ -2692,9 +2928,15 @@ void x264_macroblock_analyse( x264_t *h )
                 if( i_partition == D_16x16 )
                 {
                     if( i_type == B_L0_L0 )
+                    {
+                        analysis.l0.me16x16.cost = i_cost;
                         x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
+                    }
                     else if( i_type == B_L1_L1 )
+                    {
+                        analysis.l1.me16x16.cost = i_cost;
                         x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
+                    }
                     else if( i_type == B_BI_BI )
                         x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
                 }
@@ -2742,9 +2984,25 @@ void x264_macroblock_analyse( x264_t *h )
 
     x264_analyse_update_cache( h, &analysis );
 
+    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
+     * without realizing it.  Check for this and account for it if necessary. */
+    if( analysis.i_mbrd >= 2 )
+    {
+        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
+        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
+        int list = check_mv_lists[h->mb.i_type] - 1;
+        if( list >= 0 && h->mb.i_partition != D_16x16 &&
+            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
+            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
+                h->mb.i_partition = D_16x16;
+    }
+
     if( !analysis.i_mbrd )
         x264_mb_analyse_transform( h );
 
+    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
+        x264_mb_analyse_qp_rd( h, &analysis );
+
     h->mb.b_trellis = h->param.analyse.i_trellis;
     h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
@@ -2885,7 +3143,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
     }
 
 #ifndef NDEBUG
-    if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
+    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
     {
         int l;
         for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
@@ -2894,7 +3152,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
             int ref = h->mb.cache.ref[l][x264_scan8[0]];
             if( ref < 0 )
                 continue;
-            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
+            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
             if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
             {
                 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
diff --git a/encoder/analyse.h b/encoder/analyse.h
index b8c828f..7c2c22c 100644
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -24,7 +24,21 @@
 #ifndef X264_ANALYSE_H
 #define X264_ANALYSE_H
 
+int x264_analyse_init_costs( x264_t *h, int qp );
+void x264_analyse_free_costs( x264_t *h );
+void x264_analyse_weight_frame( x264_t *h, int end );
 void x264_macroblock_analyse( x264_t *h );
 void x264_slicetype_decide( x264_t *h );
 
+void x264_slicetype_analyse( x264_t *h, int keyframe );
+
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
+
+int  x264_lookahead_init( x264_t *h, int i_slicetype_length );
+int  x264_lookahead_is_empty( x264_t *h );
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame );
+void x264_lookahead_get_frames( x264_t *h );
+void x264_lookahead_delete( x264_t *h );
+
 #endif
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 97defa0..271f527 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -36,11 +36,13 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_
     {
         x264_cabac_encode_decision_noup( cb, ctx0, 0 );
     }
+#if !RDO_SKIP_BS
     else if( i_mb_type == I_PCM )
     {
         x264_cabac_encode_decision_noup( cb, ctx0, 1 );
         x264_cabac_encode_flush( h, cb );
     }
+#endif
     else
     {
         int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode];
@@ -86,24 +88,9 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
         /* prefix: 14, suffix: 17 */
         if( i_mb_type == P_L0 )
         {
-            if( h->mb.i_partition == D_16x16 )
-            {
-                x264_cabac_encode_decision_noup( cb, 14, 0 );
-                x264_cabac_encode_decision_noup( cb, 15, 0 );
-                x264_cabac_encode_decision_noup( cb, 16, 0 );
-            }
-            else if( h->mb.i_partition == D_16x8 )
-            {
-                x264_cabac_encode_decision_noup( cb, 14, 0 );
-                x264_cabac_encode_decision_noup( cb, 15, 1 );
-                x264_cabac_encode_decision_noup( cb, 17, 1 );
-            }
-            else if( h->mb.i_partition == D_8x16 )
-            {
-                x264_cabac_encode_decision_noup( cb, 14, 0 );
-                x264_cabac_encode_decision_noup( cb, 15, 1 );
-                x264_cabac_encode_decision_noup( cb, 17, 0 );
-            }
+            x264_cabac_encode_decision_noup( cb, 14, 0 );
+            x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 );
+            x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 );
         }
         else if( i_mb_type == P_8x8 )
         {
@@ -129,10 +116,14 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
             ctx++;
 
         if( i_mb_type == B_DIRECT )
+        {
             x264_cabac_encode_decision_noup( cb, 27+ctx, 0 );
-        else if( i_mb_type == B_8x8 )
+            return;
+        }
+        x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
+
+        if( i_mb_type == B_8x8 )
         {
-            x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
             x264_cabac_encode_decision_noup( cb, 27+3,   1 );
             x264_cabac_encode_decision_noup( cb, 27+4,   1 );
             x264_cabac_encode_decision( cb, 27+5,   1 );
@@ -142,7 +133,6 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
         else if( IS_INTRA( i_mb_type ) )
         {
             /* prefix */
-            x264_cabac_encode_decision_noup( cb, 27+ctx, 1 );
             x264_cabac_encode_decision_noup( cb, 27+3,   1 );
             x264_cabac_encode_decision_noup( cb, 27+4,   1 );
             x264_cabac_encode_decision( cb, 27+5,   1 );
@@ -154,39 +144,32 @@ static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
         }
         else
         {
-            static const int i_mb_len[9*3] =
-            {
-                6, 6, 3,    /* L0 L0 */
-                6, 6, 0,    /* L0 L1 */
-                7, 7, 0,    /* L0 BI */
-                6, 6, 0,    /* L1 L0 */
-                6, 6, 3,    /* L1 L1 */
-                7, 7, 0,    /* L1 BI */
-                7, 7, 0,    /* BI L0 */
-                7, 7, 0,    /* BI L1 */
-                7, 7, 6,    /* BI BI */
-            };
-            static const int i_mb_bits[9*3][7] =
+            static const uint8_t i_mb_bits[9*3] =
             {
-                { 1,1,0,0,0,1   }, { 1,1,0,0,1,0,  }, { 1,0,0 },       /* L0 L0 */
-                { 1,1,0,1,0,1   }, { 1,1,0,1,1,0   }, {0},             /* L0 L1 */
-                { 1,1,1,0,0,0,0 }, { 1,1,1,0,0,0,1 }, {0},             /* L0 BI */
-                { 1,1,0,1,1,1   }, { 1,1,1,1,1,0   }, {0},             /* L1 L0 */
-                { 1,1,0,0,1,1   }, { 1,1,0,1,0,0   }, { 1,0,1 },       /* L1 L1 */
-                { 1,1,1,0,0,1,0 }, { 1,1,1,0,0,1,1 }, {0},             /* L1 BI */
-                { 1,1,1,0,1,0,0 }, { 1,1,1,0,1,0,1 }, {0},             /* BI L0 */
-                { 1,1,1,0,1,1,0 }, { 1,1,1,0,1,1,1 }, {0},             /* BI L1 */
-                { 1,1,1,1,0,0,0 }, { 1,1,1,1,0,0,1 }, { 1,1,0,0,0,0 }, /* BI BI */
+                0x31, 0x29, 0x4, /* L0 L0 */
+                0x35, 0x2d, 0,   /* L0 L1 */
+                0x43, 0x63, 0,   /* L0 BI */
+                0x3d, 0x2f, 0,   /* L1 L0 */
+                0x39, 0x25, 0x6, /* L1 L1 */
+                0x53, 0x73, 0,   /* L1 BI */
+                0x4b, 0x6b, 0,   /* BI L0 */
+                0x5b, 0x7b, 0,   /* BI L1 */
+                0x47, 0x67, 0x21 /* BI BI */
             };
 
             const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8);
-            int i;
+            int bits = i_mb_bits[idx];
 
-            x264_cabac_encode_decision_noup( cb, 27+ctx, i_mb_bits[idx][0] );
-            x264_cabac_encode_decision_noup( cb, 27+3,   i_mb_bits[idx][1] );
-            x264_cabac_encode_decision( cb, 27+5-i_mb_bits[idx][1], i_mb_bits[idx][2] );
-            for( i = 3; i < i_mb_len[idx]; i++ )
-                x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][i] );
+            x264_cabac_encode_decision_noup( cb, 27+3, bits&1 );
+            x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2;
+            if( bits != 1 )
+            {
+                x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+                x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+                x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1;
+                if( bits != 1 )
+                    x264_cabac_encode_decision_noup( cb, 27+5, bits&1 );
+            }
         }
     }
 }
@@ -231,10 +214,10 @@ static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
     int cbp = h->mb.i_cbp_luma;
     int cbp_l = h->mb.cache.i_cbp_left;
     int cbp_t = h->mb.cache.i_cbp_top;
-    x264_cabac_encode_decision( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (h->mb.i_cbp_luma >> 0) & 1 );
-    x264_cabac_encode_decision( cb, 76 - ((cbp   >> 0) & 1) - ((cbp_t >> 2) & 2), (h->mb.i_cbp_luma >> 1) & 1 );
-    x264_cabac_encode_decision( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp   << 1) & 2), (h->mb.i_cbp_luma >> 2) & 1 );
-    x264_cabac_encode_decision_noup( cb, 76 - ((cbp   >> 2) & 1) - ((cbp   >> 0) & 2), (h->mb.i_cbp_luma >> 3) & 1 );
+    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (cbp >> 0) & 1 );
+    x264_cabac_encode_decision     ( cb, 76 - ((cbp   >> 0) & 1) - ((cbp_t >> 2) & 2), (cbp >> 1) & 1 );
+    x264_cabac_encode_decision     ( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp   << 1) & 2), (cbp >> 2) & 1 );
+    x264_cabac_encode_decision_noup( cb, 76 - ((cbp   >> 2) & 1) - ((cbp   >> 0) & 2), (cbp >> 3) & 1 );
 }
 
 static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
@@ -260,7 +243,6 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
 
 static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
 {
-    int i_mbn_xy = h->mb.i_mb_prev_xy;
     int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
     int ctx;
 
@@ -273,9 +255,9 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
         i_dqp = 0;
     }
 
-    /* No need to test for PCM / SKIP */
-    ctx = h->mb.i_last_dqp &&
-        ( h->mb.type[i_mbn_xy] == I_16x16 || (h->mb.cbp[i_mbn_xy]&0x3f) );
+    /* Since, per the above, empty-CBP I16x16 blocks never have delta quants,
+     * we don't have to check for them. */
+    ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy];
 
     if( i_dqp != 0 )
     {
@@ -283,11 +265,11 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
         /* dqp is interpreted modulo 52 */
         if( val >= 51 && val != 52 )
             val = 103 - val;
-        while( val-- )
+        do
         {
             x264_cabac_encode_decision( cb, 60 + ctx, 1 );
             ctx = 2+(ctx>>1);
-        }
+        } while( --val );
     }
     x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 );
 }
@@ -305,61 +287,38 @@ void x264_cabac_mb_skip( x264_t *h, int b_skip )
 static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub )
 {
     if( i_sub == D_L0_8x8 )
-        x264_cabac_encode_decision( cb, 21, 1 );
-    else if( i_sub == D_L0_8x4 )
-    {
-        x264_cabac_encode_decision( cb, 21, 0 );
-        x264_cabac_encode_decision( cb, 22, 0 );
-    }
-    else if( i_sub == D_L0_4x8 )
     {
-        x264_cabac_encode_decision( cb, 21, 0 );
-        x264_cabac_encode_decision( cb, 22, 1 );
-        x264_cabac_encode_decision( cb, 23, 1 );
+        x264_cabac_encode_decision( cb, 21, 1 );
+        return;
     }
-    else if( i_sub == D_L0_4x4 )
+    x264_cabac_encode_decision( cb, 21, 0 );
+    if( i_sub == D_L0_8x4 )
+        x264_cabac_encode_decision( cb, 22, 0 );
+    else
     {
-        x264_cabac_encode_decision( cb, 21, 0 );
         x264_cabac_encode_decision( cb, 22, 1 );
-        x264_cabac_encode_decision( cb, 23, 0 );
+        x264_cabac_encode_decision( cb, 23, i_sub == D_L0_4x8 );
     }
 }
 
-static NOINLINE void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
+static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
 {
-    static const uint8_t part_bits[12][7] = {
-        {6,1,1,1,0,1,1}, // D_L0_4x4
-        {5,1,1,0,0,1},   // D_L0_8x4
-        {5,1,1,0,1,0},   // D_L0_4x8
-        {3,1,0,0},       // D_L0_8x8
-        {5,1,1,1,1,0},   // D_L1_4x4
-        {5,1,1,0,1,1},   // D_L1_8x4
-        {6,1,1,1,0,0,0}, // D_L1_4x8
-        {3,1,0,1},       // D_L1_8x8
-        {5,1,1,1,1,1},   // D_BI_4x4
-        {6,1,1,1,0,0,1}, // D_BI_8x4
-        {6,1,1,1,0,1,0}, // D_BI_4x8
-        {5,1,1,0,0,0},   // D_BI_8x8
-    };
-    int len;
     if( i_sub == D_DIRECT_8x8 )
     {
         x264_cabac_encode_decision( cb, 36, 0 );
         return;
     }
-    len = part_bits[i_sub][0];
-    x264_cabac_encode_decision( cb, 36, part_bits[i_sub][1] );
-    x264_cabac_encode_decision( cb, 37, part_bits[i_sub][2] );
-    if( len == 3 )
-        x264_cabac_encode_decision( cb, 39, part_bits[i_sub][3] );
-    else
+    x264_cabac_encode_decision( cb, 36, 1 );
+    if( i_sub == D_BI_8x8 )
     {
-        x264_cabac_encode_decision( cb, 38, part_bits[i_sub][3] );
-        x264_cabac_encode_decision( cb, 39, part_bits[i_sub][4] );
-        x264_cabac_encode_decision( cb, 39, part_bits[i_sub][5] );
-        if( len == 6 )
-            x264_cabac_encode_decision( cb, 39, part_bits[i_sub][6] );
+        x264_cabac_encode_decision( cb, 37, 1 );
+        x264_cabac_encode_decision( cb, 38, 0 );
+        x264_cabac_encode_decision( cb, 39, 0 );
+        x264_cabac_encode_decision( cb, 39, 0 );
+        return;
     }
+    x264_cabac_encode_decision( cb, 37, 0 );
+    x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 );
 }
 
 static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
@@ -376,9 +335,9 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
     int i_ref  = h->mb.cache.ref[i_list][i8];
     int ctx  = 0;
 
-    if( i_refa > 0 && !h->mb.cache.skip[i8 - 1])
+    if( i_refa > 0 && !h->mb.cache.skip[i8 - 1] )
         ctx++;
-    if( i_refb > 0 && !h->mb.cache.skip[i8 - 8])
+    if( i_refb > 0 && !h->mb.cache.skip[i8 - 8] )
         ctx += 2;
 
     while( i_ref > 0 )
@@ -392,54 +351,68 @@ static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx
 
 static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
 {
-    static const uint8_t ctxes[9] = { 0,3,4,5,6,6,6,6,6 };
     const int i_abs = abs( mvd );
     const int ctxbase = l ? 47 : 40;
     int i;
-
+#if RDO_SKIP_BS
     if( i_abs == 0 )
         x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
-    else if( i_abs < 9 )
+    else
     {
         x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
-#if RDO_SKIP_BS
-        if( i_abs > 4 )
+        if( i_abs <= 3 )
         {
-            for( i = 1; i < 4; i++ )
-                x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
-            cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
-            cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
+            for( i = 1; i < i_abs; i++ )
+                x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 );
+            x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 );
+            x264_cabac_encode_bypass( cb, mvd < 0 );
         }
         else
-#endif
         {
-            for( i = 1; i < i_abs; i++ )
-                x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
-            x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs], 0 );
-            x264_cabac_encode_bypass( cb, mvd < 0 );
+            x264_cabac_encode_decision( cb, ctxbase + 3, 1 );
+            x264_cabac_encode_decision( cb, ctxbase + 4, 1 );
+            x264_cabac_encode_decision( cb, ctxbase + 5, 1 );
+            if( i_abs < 9 )
+            {
+                cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]];
+                cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]];
+            }
+            else
+            {
+                cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
+                cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
+                x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+            }
         }
     }
+#else
+    static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 };
+
+    if( i_abs == 0 )
+        x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
     else
     {
         x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
-#if RDO_SKIP_BS
-        for( i = 1; i < 4; i++ )
-            x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
-        cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]];
-        cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]];
-        x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
-#else
-        for( i = 1; i < 9; i++ )
-            x264_cabac_encode_decision( cb, ctxbase + ctxes[i], 1 );
-        x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+        if( i_abs < 9 )
+        {
+            for( i = 1; i < i_abs; i++ )
+                x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
+            x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 );
+        }
+        else
+        {
+            for( i = 1; i < 9; i++ )
+                x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 );
+            x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 );
+        }
         x264_cabac_encode_bypass( cb, mvd < 0 );
-#endif
     }
+#endif
 }
 
-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
+static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
 {
-    DECLARE_ALIGNED_4( int16_t mvp[2] );
+    ALIGNED_4( int16_t mvp[2] );
     uint32_t amvd;
     int mdx, mdy;
 
@@ -458,43 +431,35 @@ static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_l
 }
 
 #define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
+do\
 {\
-    uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width,height);\
+    uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width); /* same name is not re-expanded inside the macro: this calls the function */\
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
-}
+} while(0) /* do/while(0) makes the expansion one statement, so it is safe as the body of an unbraced if */
 
-static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
+static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) /* list 0 only: writes the mvd(s) of 8x8 block i according to its sub-partition type */
 {
-    if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        return;
-
     switch( h->mb.i_sub_partition[i] )
     {
         case D_L0_8x8:
-        case D_L1_8x8:
-        case D_BI_8x8:
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i, 2, 2 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 ); /* 8x8: a single mvd at block index 4*i */
             break;
         case D_L0_8x4:
-        case D_L1_8x4:
-        case D_BI_8x4:
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 2, 1 );
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 2, 1 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 2, 1 ); /* 8x4: two mvds, at offsets 0 and 2 */
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 2, 1 );
             break;
         case D_L0_4x8:
-        case D_L1_4x8:
-        case D_BI_4x8:
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 2 );
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 2 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 2 ); /* 4x8: two mvds, at offsets 0 and 1 */
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 2 );
             break;
         case D_L0_4x4:
-        case D_L1_4x4:
-        case D_BI_4x4:
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 1 );
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 1 );
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 1, 1 );
-            x264_cabac_mb_mvd( h, cb, i_list, 4*i+3, 1, 1 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 1 ); /* 4x4: four mvds, at offsets 0..3 */
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 1 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 1, 1 );
+            x264_cabac_mb_mvd( h, cb, 0, 4*i+3, 1, 1 );
             break;
+        default:
+            assert(0); /* D_L1_* and D_BI_* types are no longer handled here; reaching this is a caller bug */
     }
 }
 
@@ -519,9 +484,14 @@ static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int
             /* no need to test for skip/pcm */
             i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
             i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
-            i_nza &= 0x7f + (b_intra << 7);
-            i_nzb &= 0x7f + (b_intra << 7);
-            return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza;
+            if( x264_constant_p(b_intra) && !b_intra )
+                return 85 + 4*i_cat + ((2*i_nzb + i_nza)&0x7f);
+            else
+            {
+                i_nza &= 0x7f + (b_intra << 7);
+                i_nzb &= 0x7f + (b_intra << 7);
+                return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza;
+            }
         case DCT_LUMA_DC:
             i_nza = (h->mb.cache.i_cbp_left >> 8) & 1;
             i_nzb = (h->mb.cache.i_cbp_top  >> 8) & 1;
@@ -861,7 +831,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
             x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
             x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
         }
-        else if( h->mb.i_partition == D_8x16 )
+        else //if( h->mb.i_partition == D_8x16 )
         {
             if( h->mb.pic.i_fref[0] > 1 )
             {
@@ -875,10 +845,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
     else if( i_mb_type == P_8x8 )
     {
         /* sub mb type */
-        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[0] );
-        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[1] );
-        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[2] );
-        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[3] );
+        for( i = 0; i < 4; i++ )
+            x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] );
 
         /* ref 0 */
         if( h->mb.pic.i_fref[0] > 1 )
@@ -890,57 +858,50 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         }
 
         for( i = 0; i < 4; i++ )
-            x264_cabac_mb8x8_mvd( h, cb, 0, i );
+            x264_cabac_mb8x8_mvd( h, cb, i );
     }
     else if( i_mb_type == B_8x8 )
     {
         /* sub mb type */
-        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[0] );
-        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[1] );
-        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[2] );
-        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[3] );
+        for( i = 0; i < 4; i++ )
+            x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] );
 
         /* ref */
-        for( i_list = 0; i_list < 2; i_list++ )
-        {
-            if( ( i_list ? h->mb.pic.i_fref[1] : h->mb.pic.i_fref[0] ) == 1 )
-                continue;
+        if( h->mb.pic.i_fref[0] > 1 )
             for( i = 0; i < 4; i++ )
-                if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-                    x264_cabac_mb_ref( h, cb, i_list, 4*i );
-        }
+                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                    x264_cabac_mb_ref( h, cb, 0, 4*i );
+
+        if( h->mb.pic.i_fref[1] > 1 )
+            for( i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                    x264_cabac_mb_ref( h, cb, 1, 4*i );
 
         for( i = 0; i < 4; i++ )
-            x264_cabac_mb8x8_mvd( h, cb, 0, i );
+            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 );
+
         for( i = 0; i < 4; i++ )
-            x264_cabac_mb8x8_mvd( h, cb, 1, i );
+            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 );
     }
     else if( i_mb_type != B_DIRECT )
     {
         /* All B mode */
         const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
-
-        for( i_list = 0; i_list < 2; i_list++ )
+        if( h->mb.pic.i_fref[0] > 1 )
         {
-            const int i_ref_max = h->mb.pic.i_fref[i_list];
-
-            if( i_ref_max > 1 )
-            {
-                if( h->mb.i_partition == D_16x16 )
-                {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
-                }
-                else if( h->mb.i_partition == D_16x8 )
-                {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
-                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 8 );
-                }
-                else if( h->mb.i_partition == D_8x16 )
-                {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
-                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 4 );
-                }
-            }
+            if( b_list[0][0] )
+                x264_cabac_mb_ref( h, cb, 0, 0 );
+            if( b_list[0][1] && h->mb.i_partition != D_16x16 )
+                x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) );
+        }
+        if( h->mb.pic.i_fref[1] > 1 )
+        {
+            if( b_list[1][0] )
+                x264_cabac_mb_ref( h, cb, 1, 0 );
+            if( b_list[1][1] && h->mb.i_partition != D_16x16 )
+                x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) );
         }
         for( i_list = 0; i_list < 2; i_list++ )
         {
@@ -953,7 +914,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
                 if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
                 if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
             }
-            else if( h->mb.i_partition == D_8x16 )
+            else //if( h->mb.i_partition == D_8x16 )
             {
                 if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
                 if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
@@ -1010,11 +971,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         {
             block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra );
             block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra );
-        }
-        if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-        {
-            for( i = 16; i < 24; i++ )
-                block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
+            if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
+                for( i = 16; i < 24; i++ )
+                    block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
         }
     }
 
@@ -1027,9 +986,9 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 /*****************************************************************************
  * RD only; doesn't generate a valid bitstream
  * doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref or subpartition (never varies between calls, so no point in doing so)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
  * works on all partition sizes except 16x16
- * for sub8x8, call once per 8x8 block
  *****************************************************************************/
 static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
 {
@@ -1038,11 +997,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
     int j;
 
     if( i_mb_type == P_8x8 )
-        x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
-    else if( i_mb_type == P_L0 )
     {
-        x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
+        x264_cabac_mb8x8_mvd( h, cb, i8 );
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
     }
+    else if( i_mb_type == P_L0 )
+        x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
     else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
     {
         if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 );
@@ -1050,8 +1010,10 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
     }
     else //if( i_mb_type == B_8x8 )
     {
-        x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
-        x264_cabac_mb8x8_mvd( h, cb, 1, i8 );
+        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
+            x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 );
+        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 );
     }
 
     for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 1b0b5d1..c65c9bd 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -61,8 +61,9 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
 /****************************************************************************
  * block_residual_write_cavlc:
  ****************************************************************************/
-static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_suffix_length, int level )
+static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_length, int level )
 {
+    bs_t *s = &h->out.bs;
     static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
     int i_level_prefix = 15;
     int mask = level >> 15;
@@ -112,8 +113,9 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_s
     return i_suffix_length;
 }
 
-static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int16_t *l, int nC )
+static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, int16_t *l, int nC )
 {
+    bs_t *s = &h->out.bs;
     static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
     static const int count_cat[5] = {16, 15, 16, 4, 15};
     x264_run_level_t runlevel;
@@ -157,7 +159,7 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
             i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
         }
         else
-            i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+            i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
         for( i = i_trailing+1; i < i_total; i++ )
         {
             val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
@@ -167,7 +169,7 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
                 i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
             }
             else
-                i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+                i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
         }
     }
 
@@ -191,18 +193,19 @@ static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, in
 
 static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
 
-#define block_residual_write_cavlc(h,s,cat,idx,l)\
+#define block_residual_write_cavlc(h,cat,idx,l)\
 {\
     int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 0 : idx )];\
     uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
     if( !*nnz )\
-        bs_write_vlc( s, x264_coeff0_token[nC] );\
+        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] ); /* cached count is zero: only the zero-coefficient token is written */\
     else\
-        *nnz = block_residual_write_cavlc(h,s,cat,l,nC);\
+        *nnz = block_residual_write_cavlc(h,cat,l,nC); /* not re-expanded: calls the function and stores its result as the cached count. NOTE(review): macro is a bare { } block, not do/while(0) - confirm no call site precedes an else */\
 }
 
-static void cavlc_qp_delta( x264_t *h, bs_t *s )
+static void cavlc_qp_delta( x264_t *h )
 {
+    bs_t *s = &h->out.bs;
     int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
 
     /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
@@ -225,50 +228,40 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
     bs_write_se( s, i_dqp );
 }
 
-static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
+static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
 {
-    DECLARE_ALIGNED_4( int16_t mvp[2] );
+    bs_t *s = &h->out.bs; /* output is always the encoder's own bitstream; callers no longer pass one */
+    ALIGNED_4( int16_t mvp[2] ); /* filled in by x264_mb_predict_mv; both components are subtracted from the cached mv below */
     x264_mb_predict_mv( h, i_list, idx, width, mvp );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
 }
 
-static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i )
+static inline void cavlc_mb8x8_mvd( x264_t *h, int i ) /* list 0 only: writes the mvd(s) of 8x8 block i according to its sub-partition type */
 {
-    if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        return;
-
     switch( h->mb.i_sub_partition[i] )
     {
         case D_L0_8x8:
-        case D_L1_8x8:
-        case D_BI_8x8:
-            cavlc_mb_mvd( h, s, i_list, 4*i, 2 );
+            cavlc_mb_mvd( h, 0, 4*i, 2 ); /* 8x8: a single mvd at block index 4*i */
             break;
         case D_L0_8x4:
-        case D_L1_8x4:
-        case D_BI_8x4:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 2 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+2, 2 );
+            cavlc_mb_mvd( h, 0, 4*i+0, 2 ); /* 8x4: two mvds, at offsets 0 and 2 */
+            cavlc_mb_mvd( h, 0, 4*i+2, 2 );
             break;
         case D_L0_4x8:
-        case D_L1_4x8:
-        case D_BI_4x8:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+0, 1 ); /* 4x8: two mvds, at offsets 0 and 1 */
+            cavlc_mb_mvd( h, 0, 4*i+1, 1 );
             break;
         case D_L0_4x4:
-        case D_L1_4x4:
-        case D_BI_4x4:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+2, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+3, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+0, 1 ); /* 4x4: four mvds, at offsets 0..3 */
+            cavlc_mb_mvd( h, 0, 4*i+1, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+2, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+3, 1 ); /* NOTE(review): no default case, so any non-D_L0 type writes nothing (the CABAC counterpart asserts) - confirm intended */
             break;
     }
 }
 
-static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8start, int i8end )
+static inline void x264_macroblock_luma_write_cavlc( x264_t *h, int i8start, int i8end )
 {
     int i8, i4;
     if( h->mb.b_transform_8x8 )
@@ -282,20 +275,23 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
     for( i8 = i8start; i8 <= i8end; i8++ )
         if( h->mb.i_cbp_luma & (1 << i8) )
             for( i4 = 0; i4 < 4; i4++ )
-                block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
+                block_residual_write_cavlc( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
 }
 
 /*****************************************************************************
  * x264_macroblock_write:
  *****************************************************************************/
-void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+void x264_macroblock_write_cavlc( x264_t *h )
 {
+    bs_t *s = &h->out.bs;
     const int i_mb_type = h->mb.i_type;
-    static const int i_offsets[3] = {5,23,0};
+    static const uint8_t i_offsets[3] = {5,23,0};
     int i_mb_i_offset = i_offsets[h->sh.i_type];
     int i;
 
-#if !RDO_SKIP_BS
+#if RDO_SKIP_BS
+    s->i_bits_encoded = 0;
+#else
     const int i_mb_pos_start = bs_pos( s );
     int       i_mb_pos_tex;
 #endif
@@ -309,6 +305,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
 #if !RDO_SKIP_BS
     if( i_mb_type == I_PCM )
     {
+        uint8_t *p_start = s->p_start;
         bs_write_ue( s, i_mb_i_offset + 25 );
         i_mb_pos_tex = bs_pos( s );
         h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
@@ -324,6 +321,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
         s->p += 64;
 
+        bs_init( s, s->p, s->p_end - s->p );
+        s->p_start = p_start;
+
         /* if PCM is chosen, we need to store reconstructed frame data */
         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
         h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
@@ -366,17 +366,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     }
     else if( i_mb_type == P_L0 )
     {
-        DECLARE_ALIGNED_4( int16_t mvp[2] );
-
         if( h->mb.i_partition == D_16x16 )
         {
             bs_write1( s, 1 );
 
             if( h->mb.pic.i_fref[0] > 1 )
                 bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+            cavlc_mb_mvd( h, 0, 0, 4 );
         }
         else if( h->mb.i_partition == D_16x8 )
         {
@@ -386,14 +382,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
                 bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                 bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
             }
-
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 8, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+            cavlc_mb_mvd( h, 0, 0, 4 );
+            cavlc_mb_mvd( h, 0, 8, 4 );
         }
         else if( h->mb.i_partition == D_8x16 )
         {
@@ -403,14 +393,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
                 bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                 bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
             }
-
-            x264_mb_predict_mv( h, 0, 0, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 4, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+            cavlc_mb_mvd( h, 0, 0, 2 );
+            cavlc_mb_mvd( h, 0, 4, 2 );
         }
     }
     else if( i_mb_type == P_8x8 )
@@ -445,7 +429,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         }
 
         for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 0, i );
+            cavlc_mb8x8_mvd( h, i );
     }
     else if( i_mb_type == B_8x8 )
     {
@@ -467,80 +451,47 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
 
         /* mvd */
         for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 0, i );
+            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                cavlc_mb_mvd( h, 0, 4*i, 2 );
         for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 1, i );
+            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                cavlc_mb_mvd( h, 1, 4*i, 2 );
     }
     else if( i_mb_type != B_DIRECT )
     {
         /* All B mode */
         /* Motion Vector */
-        int i_list;
-        DECLARE_ALIGNED_4( int16_t mvp[2] );
         const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
+        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
+        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
 
         bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
-
-        for( i_list = 0; i_list < 2; i_list++ )
+        if( h->mb.i_partition == D_16x16 )
         {
-            const int i_ref_max = (i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]) - 1;
-
-            if( i_ref_max )
-                switch( h->mb.i_partition )
-                {
-                    case D_16x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        break;
-                    case D_16x8:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[8]] );
-                        break;
-                    case D_8x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[4]] );
-                        break;
-                }
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
+            if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+            if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
         }
-        for( i_list = 0; i_list < 2; i_list++ )
+        else
         {
-            switch( h->mb.i_partition )
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
+            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
+            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
+            if( h->mb.i_partition == D_16x8 )
             {
-                case D_16x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    break;
-                case D_16x8:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 8, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
-                    }
-                    break;
-                case D_8x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 4, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
-                    }
-                    break;
+                if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
+            }
+            else //if( h->mb.i_partition == D_8x16 )
+            {
+                if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
             }
         }
     }
@@ -565,29 +516,29 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     /* write residual */
     if( i_mb_type == I_16x16 )
     {
-        cavlc_qp_delta( h, s );
+        cavlc_qp_delta( h );
 
         /* DC Luma */
-        block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
+        block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
 
         /* AC Luma */
         if( h->mb.i_cbp_luma )
             for( i = 0; i < 16; i++ )
-                block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
+                block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
     }
     else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
     {
-        cavlc_qp_delta( h, s );
-        x264_macroblock_luma_write_cavlc( h, s, 0, 3 );
+        cavlc_qp_delta( h );
+        x264_macroblock_luma_write_cavlc( h, 0, 3 );
     }
     if( h->mb.i_cbp_chroma )
     {
         /* Chroma DC residual present */
-        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
-        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
             for( i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
     }
 
 #if !RDO_SKIP_BS
@@ -599,37 +550,42 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
 /*****************************************************************************
  * RD only; doesn't generate a valid bitstream
  * doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref or subpartition (never varies between calls, so no point in doing so)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
  * works on all partition sizes except 16x16
- * for sub8x8, call once per 8x8 block
  *****************************************************************************/
 static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
 {
+    bs_t *s = &h->out.bs;
     const int i_mb_type = h->mb.i_type;
     int b_8x16 = h->mb.i_partition == D_8x16;
     int j;
-    h->out.bs.i_bits_encoded = 0;
 
     if( i_mb_type == P_8x8 )
-        cavlc_mb8x8_mvd( h, &h->out.bs, 0, i8 );
+    {
+        cavlc_mb8x8_mvd( h, i8 );
+        bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
+    }
     else if( i_mb_type == P_L0 )
-        cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
+        cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
     else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
     {
-        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
-        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 1, 4*i8, 4>>b_8x16 );
+        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 );
+        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 );
     }
     else //if( i_mb_type == B_8x8 )
     {
-        cavlc_mb8x8_mvd( h, &h->out.bs, 0, i8 );
-        cavlc_mb8x8_mvd( h, &h->out.bs, 1, i8 );
+        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
+            cavlc_mb_mvd( h, 0, 4*i8, 2 );
+        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            cavlc_mb_mvd( h, 1, 4*i8, 2 );
     }
 
     for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
     {
-        x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
+        x264_macroblock_luma_write_cavlc( h, i8, i8 );
+        block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+        block_residual_write_cavlc( h, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
         i8 += x264_pixel_size[i_pixel].h >> 3;
     }
 
@@ -640,12 +596,12 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
 {
     int b_8x4 = i_pixel == PIXEL_8x4;
     h->out.bs.i_bits_encoded = 0;
-    cavlc_mb_mvd( h, &h->out.bs, 0, i4, 1+b_8x4 );
-    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    cavlc_mb_mvd( h, 0, i4, 1+b_8x4 );
+    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
     if( i_pixel != PIXEL_4x4 )
     {
         i4 += 2-b_8x4;
-        block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+        block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
     }
 
     return h->out.bs.i_bits_encoded;
@@ -663,14 +619,14 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
 {
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
     bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
-    x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
+    x264_macroblock_luma_write_cavlc( h, i8, i8 );
     return h->out.bs.i_bits_encoded;
 }
 
 static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
 {
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
     return h->out.bs.i_bits_encoded;
 }
 
@@ -679,14 +635,14 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
     h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     if( h->mb.i_cbp_chroma )
     {
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
 
         if( h->mb.i_cbp_chroma == 2 )
         {
             int i;
             for( i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
         }
     }
     return h->out.bs.i_bits_encoded;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 176443b..d873cd0 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -31,8 +31,9 @@
 #include "analyse.h"
 #include "ratecontrol.h"
 #include "macroblock.h"
+#include "me.h"
 
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
 #include "common/visualize.h"
 #endif
 
@@ -42,9 +43,9 @@
 
 #define bs_write_ue bs_write_ue_big
 
-static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
-                                    x264_nal_t **pp_nal, int *pi_nal,
-                                    x264_picture_t *pic_out );
+static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+                                   x264_nal_t **pp_nal, int *pi_nal,
+                                   x264_picture_t *pic_out );
 
 /****************************************************************************
  *
@@ -67,7 +68,7 @@ static void x264_frame_dump( x264_t *h )
     if( !f )
         return;
     /* Write the frame in display order */
-    fseek( f, h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
     for( i = 0; i < h->fdec->i_plane; i++ )
         for( y = 0; y < h->param.i_height >> !!i; y++ )
             fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
@@ -88,7 +89,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     sh->pps = pps;
 
     sh->i_first_mb  = 0;
-    sh->i_last_mb   = h->sps->i_mb_width * h->sps->i_mb_height;
+    sh->i_last_mb   = h->mb.i_mb_count - 1;
     sh->i_pps_id    = pps->i_id;
 
     sh->i_frame_num = i_frame;
@@ -175,12 +176,12 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
     bs_write_ue( s, sh->i_type + 5 );   /* same type things */
     bs_write_ue( s, sh->i_pps_id );
-    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num );
+    bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<<sh->sps->i_log2_max_frame_num)-1) );
 
     if( !sh->sps->b_frame_mbs_only )
     {
         bs_write1( s, sh->b_field_pic );
-        if ( sh->b_field_pic )
+        if( sh->b_field_pic )
             bs_write1( s, sh->b_bottom_field );
     }
 
@@ -191,7 +192,7 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
     if( sh->sps->i_poc_type == 0 )
     {
-        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb );
+        bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc_lsb & ((1<<sh->sps->i_log2_max_poc_lsb)-1) );
         if( sh->pps->b_pic_order && !sh->b_field_pic )
         {
             bs_write_se( s, sh->i_delta_poc_bottom );
@@ -257,10 +258,36 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
         }
     }
 
-    if( ( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) ) ||
-        ( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) )
+    if( sh->pps->b_weighted_pred && ( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_SP ) )
     {
-        /* FIXME */
+        /* pred_weight_table() */
+        bs_write_ue( s, sh->weight[0][0].i_denom );
+        bs_write_ue( s, sh->weight[0][1].i_denom );
+        for( i = 0; i < sh->i_num_ref_idx_l0_active; i++ )
+        {
+            int luma_weight_l0_flag = !!sh->weight[i][0].weightfn;
+            int chroma_weight_l0_flag = !!sh->weight[i][1].weightfn || !!sh->weight[i][2].weightfn;
+            bs_write1( s, luma_weight_l0_flag );
+            if( luma_weight_l0_flag )
+            {
+                bs_write_se( s, sh->weight[i][0].i_scale );
+                bs_write_se( s, sh->weight[i][0].i_offset );
+            }
+            bs_write1( s, chroma_weight_l0_flag );
+            if( chroma_weight_l0_flag )
+            {
+                int j;
+                for( j = 1; j < 3; j++ )
+                {
+                    bs_write_se( s, sh->weight[i][j].i_scale );
+                    bs_write_se( s, sh->weight[i][j].i_offset );
+                }
+            }
+        }
+    }
+    else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B )
+    {
+      /* TODO */
     }
 
     if( i_nal_ref_idc != 0 )
@@ -272,7 +299,17 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
         }
         else
         {
-            bs_write1( s, 0 );  /* adaptive_ref_pic_marking_mode_flag */
+            bs_write1( s, sh->i_mmco_command_count > 0 ); /* adaptive_ref_pic_marking_mode_flag */
+            if( sh->i_mmco_command_count > 0 )
+            {
+                int i;
+                for( i = 0; i < sh->i_mmco_command_count; i++ )
+                {
+                    bs_write_ue( s, 1 ); /* mark short term ref as unused */
+                    bs_write_ue( s, sh->mmco[i].i_difference_of_pic_nums - 1 );
+                }
+                bs_write_ue( s, 0 ); /* end command list */
+            }
         }
     }
 
@@ -295,17 +332,18 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
 /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */
 /* reallocate, adding an arbitrary amount of space (100 kilobytes). */
-static void x264_bitstream_check_buffer( x264_t *h )
+static int x264_bitstream_check_buffer( x264_t *h )
 {
+    uint8_t *bs_bak = h->out.p_bitstream;
     if( ( h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500) )
      || ( h->out.bs.p_end - h->out.bs.p < 2500 ) )
     {
-        uint8_t *bs_bak = h->out.p_bitstream;
         intptr_t delta;
         int i;
 
         h->out.i_bitstream += 100000;
-        h->out.p_bitstream = x264_realloc( h->out.p_bitstream, h->out.i_bitstream );
+        CHECKED_MALLOC( h->out.p_bitstream, h->out.i_bitstream );
+        h->mc.memcpy_aligned( h->out.p_bitstream, bs_bak, (h->out.i_bitstream - 100000) & ~15 );
         delta = h->out.p_bitstream - bs_bak;
 
         h->out.bs.p_start += delta;
@@ -318,7 +356,12 @@ static void x264_bitstream_check_buffer( x264_t *h )
 
         for( i = 0; i <= h->out.i_nal; i++ )
             h->out.nal[i].p_payload += delta;
+        x264_free( bs_bak );
     }
+    return 0;
+fail:
+    x264_free( bs_bak );
+    return -1;
 }
 
 /****************************************************************************
@@ -332,9 +375,9 @@ static void x264_bitstream_check_buffer( x264_t *h )
 static int x264_validate_parameters( x264_t *h )
 {
 #ifdef HAVE_MMX
-    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    if( !(x264_cpu_detect() & X264_CPU_SSE) )
     {
-        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm support\n");
         x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
         return -1;
     }
@@ -352,14 +395,15 @@ static int x264_validate_parameters( x264_t *h )
                   h->param.i_width, h->param.i_height );
         return -1;
     }
-    if( h->param.i_csp != X264_CSP_I420 )
+    int i_csp = h->param.i_csp & X264_CSP_MASK;
+    if( i_csp != X264_CSP_I420 && i_csp != X264_CSP_YV12 )
     {
-        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420 supported)\n" );
+        x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12 supported)\n" );
         return -1;
     }
 
-    if( h->param.i_threads == 0 )
-        h->param.i_threads = x264_cpu_num_processors() * 3/2;
+    if( h->param.i_threads == X264_THREADS_AUTO )
+        h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
     if( h->param.i_threads > 1 )
     {
@@ -367,7 +411,17 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_WARNING, "not compiled with pthread support!\n");
         h->param.i_threads = 1;
 #endif
+        /* Avoid absurdly small thread slices as they can reduce performance and VBV
+         * compliance.  Capped at an arbitrary 4 rows per thread, but never below 1 thread. */
+        if( h->param.b_sliced_threads )
+        {
+            int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); /* the division alone yields 0 below 4 MB rows */
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
+        }
     }
+    else
+        h->param.b_sliced_threads = 0;
+    h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
 
     if( h->param.b_interlaced )
     {
@@ -381,6 +435,31 @@ static int x264_validate_parameters( x264_t *h )
             x264_log( h, X264_LOG_WARNING, "interlace + direct=temporal is not implemented\n" );
             h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
         }
+        if( h->param.analyse.i_weighted_pred > 0 )
+        {
+            x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" );
+            h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+        }
+    }
+
+    /* Detect default ffmpeg settings and terminate with an error. */
+    {
+        int score = 0;
+        score += h->param.analyse.i_me_range == 0;
+        score += h->param.rc.i_qp_step == 3;
+        score += h->param.i_keyint_max == 12;
+        score += h->param.rc.i_qp_min == 2;
+        score += h->param.rc.i_qp_max == 31;
+        score += h->param.rc.f_qcompress == 0.5;
+        score += fabs(h->param.rc.f_ip_factor - 1.25) < 0.01;
+        score += fabs(h->param.rc.f_pb_factor - 1.25) < 0.01;
+        score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8;
+        if( score >= 5 )
+        {
+            x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" );
+            x264_log( h, X264_LOG_ERROR, "use an encoding preset (vpre)\n" );
+            return -1;
+        }
     }
 
     if( h->param.rc.i_rc_method < 0 || h->param.rc.i_rc_method > 2 )
@@ -391,7 +470,10 @@ static int x264_validate_parameters( x264_t *h )
     h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, 51 );
     h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
     if( h->param.rc.i_rc_method == X264_RC_CRF )
+    {
         h->param.rc.i_qp_constant = h->param.rc.f_rf_constant;
+        h->param.rc.i_bitrate = 0;
+    }
     if( (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF)
         && h->param.rc.i_qp_constant == 0 )
     {
@@ -421,19 +503,31 @@ static int x264_validate_parameters( x264_t *h )
         h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
         h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
         h->param.rc.i_aq_mode = 0;
+        h->param.rc.b_mb_tree = 0;
     }
     h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
 
-    if( ( h->param.i_width % 16 || h->param.i_height % 16 )
-        && h->param.i_height != 1080 && !h->mb.b_lossless )
+    int max_slices = (h->param.i_height+((16<<h->param.b_interlaced)-1))/(16<<h->param.b_interlaced);
+    if( h->param.b_sliced_threads )
+        h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices );
+    else
     {
-        // There's nothing special about 1080 in that the warning still applies to it,
-        // but chances are the user can't help it if his content is already 1080p,
-        // so there's no point in warning in that case.
-        x264_log( h, X264_LOG_WARNING,
-                  "width or height not divisible by 16 (%dx%d), compression will suffer.\n",
-                  h->param.i_width, h->param.i_height );
+        h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices );
+        h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 );
+        h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 );
+        if( h->param.b_interlaced && h->param.i_slice_max_size )
+        {
+            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-size is not implemented\n" );
+            h->param.i_slice_max_size = 0;
+        }
+        if( h->param.b_interlaced && h->param.i_slice_max_mbs )
+        {
+            x264_log( h, X264_LOG_WARNING, "interlaced + slice-max-mbs is not implemented\n" );
+            h->param.i_slice_max_mbs = 0;
+        }
+        if( h->param.i_slice_max_mbs || h->param.i_slice_max_size )
+            h->param.i_slice_count = 0;
     }
 
     h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
@@ -441,18 +535,68 @@ static int x264_validate_parameters( x264_t *h )
         h->param.i_keyint_max = 1;
     if( h->param.i_scenecut_threshold < 0 )
         h->param.i_scenecut_threshold = 0;
-    h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
     if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
     {
         x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" );
         h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     }
     h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_BFRAME_MAX );
+    if( h->param.i_keyint_max == 1 )
+    {
+        h->param.i_bframe = 0;
+        h->param.b_intra_refresh = 0;
+    }
     h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
-    h->param.b_bframe_pyramid = h->param.b_bframe_pyramid && h->param.i_bframe > 1;
+    if( h->param.i_bframe <= 1 )
+        h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE;
+    h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL );
     if( !h->param.i_bframe )
+    {
         h->param.i_bframe_adaptive = X264_B_ADAPT_NONE;
-    h->param.analyse.b_weighted_bipred = h->param.analyse.b_weighted_bipred && h->param.i_bframe > 0;
+        h->param.analyse.i_direct_mv_pred = 0;
+        h->param.analyse.b_weighted_bipred = 0;
+    }
+    if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL )
+    {
+        x264_log( h, X264_LOG_WARNING, "b-pyramid normal + intra-refresh is not supported\n" );
+        h->param.i_bframe_pyramid = X264_B_PYRAMID_STRICT;
+    }
+    if( h->param.b_intra_refresh && h->param.i_frame_reference > 1 )
+    {
+        x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" );
+        h->param.i_frame_reference = 1;
+    }
+    if( h->param.b_intra_refresh )
+        h->param.i_keyint_max = X264_MIN( h->param.i_keyint_max, (h->param.i_width+15)/16 - 1 );
+    h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 );
+    h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX );
+    {
+        int maxrate = X264_MAX( h->param.rc.i_vbv_max_bitrate, h->param.rc.i_bitrate );
+        float bufsize = maxrate ? (float)h->param.rc.i_vbv_buffer_size / maxrate : 0;
+        float fps = h->param.i_fps_num > 0 && h->param.i_fps_den > 0 ? (float) h->param.i_fps_num / h->param.i_fps_den : 25.0;
+        h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) );
+    }
+
+    if( !h->param.i_timebase_num || !h->param.i_timebase_den )
+    {
+        h->param.i_timebase_num = h->param.i_fps_den;
+        h->param.i_timebase_den = h->param.i_fps_num;
+    }
+
+    h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
+    if( !h->param.rc.i_lookahead || h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
+        h->param.rc.b_mb_tree = 0;
+    if( h->param.rc.b_stat_read )
+        h->param.rc.i_lookahead = 0;
+#ifdef HAVE_PTHREAD
+    if( h->param.i_sync_lookahead )
+        h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->i_thread_frames + h->param.i_bframe, X264_LOOKAHEAD_MAX );
+    if( h->param.rc.b_stat_read || h->i_thread_frames == 1 )
+        h->param.i_sync_lookahead = 0;
+#else
+    h->param.i_sync_lookahead = 0;
+#endif
+
     h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
                                 && h->param.i_bframe
                                 && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read );
@@ -477,7 +621,7 @@ static int x264_validate_parameters( x264_t *h )
     if( h->param.analyse.i_me_method == X264_ME_TESA &&
         (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
         h->param.analyse.i_me_method = X264_ME_ESA;
-    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 9 );
+    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 10 );
     h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
     h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
                               X264_ANALYSE_I4x4|X264_ANALYSE_I8x8;
@@ -493,6 +637,11 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.b_cabac )
         h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
+    if( !h->param.analyse.b_psy )
+    {
+        h->param.analyse.f_psy_rd = 0;
+        h->param.analyse.f_psy_trellis = 0;
+    }
     if( !h->param.analyse.i_trellis )
         h->param.analyse.f_psy_trellis = 0;
     h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 );
@@ -513,11 +662,19 @@ static int x264_validate_parameters( x264_t *h )
     else
         h->mb.i_psy_trellis = 0;
     h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
-    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
+    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
     h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 );
     if( h->param.rc.f_aq_strength == 0 )
         h->param.rc.i_aq_mode = 0;
+    /* MB-tree requires AQ to be on, even if the strength is zero. */
+    if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree )
+    {
+        h->param.rc.i_aq_mode = 1;
+        h->param.rc.f_aq_strength = 0;
+    }
     h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
+    if( h->param.analyse.i_subpel_refine == 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) )
+        h->param.analyse.i_subpel_refine = 9;
 
     {
         const x264_level_t *l = x264_levels;
@@ -548,7 +705,11 @@ static int x264_validate_parameters( x264_t *h )
             h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> h->param.b_interlaced);
     }
 
-    if( h->param.i_threads > 1 )
+    h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, 0, X264_WEIGHTP_SMART );
+    if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy && !h->param.b_interlaced )
+        h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE;
+
+    if( h->i_thread_frames > 1 )
     {
         int r = h->param.analyse.i_mv_range_thread;
         int r2;
@@ -558,7 +719,7 @@ static int x264_validate_parameters( x264_t *h )
             // the rest is allocated to whichever thread is far enough ahead to use it.
             // reserving more space increases quality for some videos, but costs more time
             // in thread synchronization.
-            int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->param.i_threads - X264_THREAD_HEIGHT;
+            int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->i_thread_frames - X264_THREAD_HEIGHT;
             r = max_range / 2;
         }
         r = X264_MAX( r, h->param.analyse.i_me_range );
@@ -587,13 +748,28 @@ static int x264_validate_parameters( x264_t *h )
     /* ensure the booleans are 0 or 1 so they can be used in math */
 #define BOOLIFY(x) h->param.x = !!h->param.x
     BOOLIFY( b_cabac );
+    BOOLIFY( b_constrained_intra );
     BOOLIFY( b_deblocking_filter );
+    BOOLIFY( b_deterministic );
+    BOOLIFY( b_sliced_threads );
     BOOLIFY( b_interlaced );
+    BOOLIFY( b_intra_refresh );
+    BOOLIFY( b_visualize );
+    BOOLIFY( b_aud );
+    BOOLIFY( b_repeat_headers );
+    BOOLIFY( b_annexb );
     BOOLIFY( analyse.b_transform_8x8 );
+    BOOLIFY( analyse.b_weighted_bipred );
     BOOLIFY( analyse.b_chroma_me );
+    BOOLIFY( analyse.b_mixed_references );
     BOOLIFY( analyse.b_fast_pskip );
+    BOOLIFY( analyse.b_dct_decimate );
+    BOOLIFY( analyse.b_psy );
+    BOOLIFY( analyse.b_psnr );
+    BOOLIFY( analyse.b_ssim );
     BOOLIFY( rc.b_stat_write );
     BOOLIFY( rc.b_stat_read );
+    BOOLIFY( rc.b_mb_tree );
 #undef BOOLIFY
 
     return 0;
@@ -613,94 +789,121 @@ static void mbcmp_init( x264_t *h )
     memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
 }
 
+static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
+{
+    /* VUI: normalize the sample aspect ratio requested in param and store it in h->param; no-op unless both terms are > 0 */
+    if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 )
+    {
+        int i_w = param->vui.i_sar_width;
+        int i_h = param->vui.i_sar_height;
+        int old_w = h->param.vui.i_sar_width;  /* values currently held by the encoder, used for change detection */
+        int old_h = h->param.vui.i_sar_height;
+
+        x264_reduce_fraction( &i_w, &i_h ); /* presumably reduces i_w/i_h to lowest terms (helper not shown here) */
+
+        while( i_w > 65535 || i_h > 65535 ) /* halve both terms until neither exceeds 65535 */
+        {
+            i_w /= 2;
+            i_h /= 2;
+        }
+
+        x264_reduce_fraction( &i_w, &i_h ); /* reduce again: the truncating halving above can leave a common factor */
+
+        if( i_w != old_w || i_h != old_h || initial ) /* update only on a real change, or unconditionally on the initial call */
+        {
+            h->param.vui.i_sar_width = 0;  /* cleared first so a failed conversion leaves the SAR as 0/0 */
+            h->param.vui.i_sar_height = 0;
+            if( i_w == 0 || i_h == 0 ) /* a term can reach 0 when the other one forced repeated halving */
+                x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
+            else
+            {
+                x264_log( h, initial?X264_LOG_INFO:X264_LOG_DEBUG, "using SAR=%d/%d\n", i_w, i_h ); /* INFO on the initial call, DEBUG afterwards */
+                h->param.vui.i_sar_width = i_w;
+                h->param.vui.i_sar_height = i_h;
+            }
+        }
+    }
+}
+
 /****************************************************************************
  * x264_encoder_open:
  ****************************************************************************/
-x264_t *x264_encoder_open   ( x264_param_t *param )
+x264_t *x264_encoder_open( x264_param_t *param )
 {
-    x264_t *h = x264_malloc( sizeof( x264_t ) );
+    x264_t *h;
     char buf[1000], *p;
-    int i;
+    int i, qp, i_slicetype_length;
 
-    memset( h, 0, sizeof( x264_t ) );
+    CHECKED_MALLOCZERO( h, sizeof(x264_t) );
 
     /* Create a copy of param */
-    memcpy( &h->param, param, sizeof( x264_param_t ) );
+    memcpy( &h->param, param, sizeof(x264_param_t) );
+
+    if( param->param_free )
+        param->param_free( param );
 
     if( x264_validate_parameters( h ) < 0 )
-    {
-        x264_free( h );
-        return NULL;
-    }
+        goto fail;
 
     if( h->param.psz_cqm_file )
         if( x264_cqm_parse_file( h, h->param.psz_cqm_file ) < 0 )
-        {
-            x264_free( h );
-            return NULL;
-        }
+            goto fail;
 
     if( h->param.rc.psz_stat_out )
         h->param.rc.psz_stat_out = strdup( h->param.rc.psz_stat_out );
     if( h->param.rc.psz_stat_in )
         h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in );
 
-    /* VUI */
-    if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 )
-    {
-        int i_w = param->vui.i_sar_width;
-        int i_h = param->vui.i_sar_height;
-
-        x264_reduce_fraction( &i_w, &i_h );
-
-        while( i_w > 65535 || i_h > 65535 )
-        {
-            i_w /= 2;
-            i_h /= 2;
-        }
-
-        h->param.vui.i_sar_width = 0;
-        h->param.vui.i_sar_height = 0;
-        if( i_w == 0 || i_h == 0 )
-        {
-            x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" );
-        }
-        else
-        {
-            x264_log( h, X264_LOG_INFO, "using SAR=%d/%d\n", i_w, i_h );
-            h->param.vui.i_sar_width = i_w;
-            h->param.vui.i_sar_height = i_h;
-        }
-    }
+    x264_set_aspect_ratio( h, &h->param, 1 );
 
     x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den );
+    x264_reduce_fraction( &h->param.i_timebase_num, &h->param.i_timebase_den );
 
     /* Init x264_t */
-    h->i_frame = 0;
+    h->i_frame = -1;
     h->i_frame_num = 0;
     h->i_idr_pic_id = 0;
+    if( h->param.b_dts_compress )
+    {
+        /* h->i_dts_compress_multiplier == h->frames.i_bframe_delay + 1 */
+        h->i_dts_compress_multiplier = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 3 : 2) : 1;
+        if( h->i_dts_compress_multiplier != 1 )
+            x264_log( h, X264_LOG_DEBUG, "DTS compresion changed timebase: %d/%d -> %d/%d\n",
+                      h->param.i_timebase_num, h->param.i_timebase_den,
+                      h->param.i_timebase_num, h->param.i_timebase_den * h->i_dts_compress_multiplier );
+        h->param.i_timebase_den *= h->i_dts_compress_multiplier;
+    }
+    else
+        h->i_dts_compress_multiplier = 1;
 
     h->sps = &h->sps_array[0];
     x264_sps_init( h->sps, h->param.i_sps_id, &h->param );
 
     h->pps = &h->pps_array[0];
-    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps);
+    x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps );
 
     x264_validate_levels( h, 1 );
 
+    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
+
     if( x264_cqm_init( h ) < 0 )
-    {
-        x264_free( h );
-        return NULL;
-    }
+        goto fail;
 
     h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
 
     /* Init frames. */
     if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
-        h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1;
+        h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4;
     else
-        h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+        h->frames.i_delay = h->param.i_bframe;
+    if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size )
+        h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead );
+    i_slicetype_length = h->frames.i_delay;
+    h->frames.i_delay += h->i_thread_frames - 1;
+    h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX );
+    h->frames.i_delay += h->param.i_sync_lookahead;
+    h->frames.i_bframe_delay = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 2 : 1) : 0;
+
     h->frames.i_max_ref0 = h->param.i_frame_reference;
     h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
     h->frames.i_max_dpb  = h->sps->vui.i_max_dec_frame_buffering;
@@ -708,20 +911,26 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         && ( h->param.rc.i_rc_method == X264_RC_ABR
           || h->param.rc.i_rc_method == X264_RC_CRF
           || h->param.i_bframe_adaptive
-          || h->param.i_scenecut_threshold );
-    h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+          || h->param.i_scenecut_threshold
+          || h->param.rc.b_mb_tree
+          || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART );
+    h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0;
     h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 
-    h->frames.i_last_idr = - h->param.i_keyint_max;
+    h->frames.i_last_keyframe = - h->param.i_keyint_max;
     h->frames.i_input    = 0;
-    h->frames.last_nonb  = NULL;
 
+    CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
+    /* Allocate room for max refs plus a few extra just in case. */
+    CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + 20) * sizeof(x264_frame_t *) );
+    CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe
+                        + h->i_thread_frames + 3) * sizeof(x264_frame_t *) );
+    if( h->param.analyse.i_weighted_pred > 0 )
+        CHECKED_MALLOCZERO( h->frames.blank_unused, h->i_thread_frames * 4 * sizeof(x264_frame_t *) );
     h->i_ref0 = 0;
     h->i_ref1 = 0;
 
-    h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset;
-
-    x264_rdo_init( );
+    x264_rdo_init();
 
     /* init CPU functions */
     x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
@@ -744,57 +953,105 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     for( i=0; x264_cpu_names[i].flags; i++ )
     {
         if( !strcmp(x264_cpu_names[i].name, "SSE2")
-            && param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
+            && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
             continue;
         if( !strcmp(x264_cpu_names[i].name, "SSE3")
-            && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
+            && (h->param.cpu & X264_CPU_SSSE3 || !(h->param.cpu & X264_CPU_CACHELINE_64)) )
             continue;
         if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
-            && (param->cpu & X264_CPU_SSE42) )
+            && (h->param.cpu & X264_CPU_SSE42) )
             continue;
-        if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
+        if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
     }
-    if( !param->cpu )
+    if( !h->param.cpu )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
 
+    for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+        if( x264_analyse_init_costs( h, qp ) )
+            goto fail;
+    if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+        goto fail;
+    if( h->cost_mv[1][2013] != 24 )
+    {
+        x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
+        goto fail;
+    }
+
     h->out.i_nal = 0;
     h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
         * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
           : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor )));
 
+    CHECKED_MALLOC( h->nal_buffer, h->out.i_bitstream * 3/2 + 4 );
+    h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4;
+
     h->thread[0] = h;
-    h->i_thread_num = 0;
-    for( i = 1; i < h->param.i_threads; i++ )
-        h->thread[i] = x264_malloc( sizeof(x264_t) );
+    for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+        CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+
+    if( x264_lookahead_init( h, i_slicetype_length ) )
+        goto fail;
 
     for( i = 0; i < h->param.i_threads; i++ )
     {
+        int init_nal_count = h->param.i_slice_count + 3;
+        int allocate_threadlocal_data = !h->param.b_sliced_threads || !i;
         if( i > 0 )
             *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h );
-        h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream );
-        if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
-            return NULL;
+
+        if( allocate_threadlocal_data )
+        {
+            h->thread[i]->fdec = x264_frame_pop_unused( h, 1 );
+            if( !h->thread[i]->fdec )
+                goto fail;
+        }
+        else
+            h->thread[i]->fdec = h->thread[0]->fdec;
+
+        CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream );
+        /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */
+        CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) );
+        h->thread[i]->out.i_nals_allocated = init_nal_count;
+
+        if( allocate_threadlocal_data && x264_macroblock_cache_init( h->thread[i] ) < 0 )
+            goto fail;
+    }
+
+    /* Allocate scratch buffer */
+    for( i = 0; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
+    {
+        int buf_hpel = (h->param.i_width+48) * sizeof(int16_t);
+        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+        int buf_mbtree = h->param.rc.b_mb_tree * ((h->sps->i_mb_width+3)&~3) * sizeof(int);
+        int buf_nnz = !h->param.b_cabac * h->pps->b_transform_8x8_mode * (h->sps->i_mb_width * 4 * 16 * sizeof(uint8_t));
+        int scratch_size = X264_MAX4( buf_hpel, buf_ssim, buf_tesa, X264_MAX( buf_mbtree, buf_nnz ) );
+        CHECKED_MALLOC( h->thread[i]->scratch_buffer, scratch_size );
     }
 
     if( x264_ratecontrol_new( h ) < 0 )
-        return NULL;
+        goto fail;
 
     if( h->param.psz_dump_yuv )
     {
         /* create or truncate the reconstructed video file */
         FILE *f = fopen( h->param.psz_dump_yuv, "w" );
-        if( f )
-            fclose( f );
-        else
+        if( !f )
+        {
+            x264_log( h, X264_LOG_ERROR, "dump_yuv: can't write to %s\n", h->param.psz_dump_yuv );
+            goto fail;
+        }
+        else if( !x264_is_regular_file( f ) )
         {
-            x264_log( h, X264_LOG_ERROR, "can't write to fdec.yuv\n" );
-            x264_free( h );
-            return NULL;
+            x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv );
+            goto fail;
         }
+        fclose( f );
     }
 
     x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
@@ -804,6 +1061,9 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
 
     return h;
+fail:
+    x264_free( h );
+    return NULL;
 }
 
 /****************************************************************************
@@ -811,6 +1071,8 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
  ****************************************************************************/
 int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
 {
+    h = h->thread[h->i_thread_phase];
+    x264_set_aspect_ratio( h, param, 0 );
 #define COPY(var) h->param.var = param->var
     COPY( i_frame_reference ); // but never uses more refs than initially specified
     COPY( i_bframe_bias );
@@ -819,8 +1081,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     COPY( b_deblocking_filter );
     COPY( i_deblocking_filter_alphac0 );
     COPY( i_deblocking_filter_beta );
-    COPY( analyse.intra );
     COPY( analyse.inter );
+    COPY( analyse.intra );
     COPY( analyse.i_direct_mv_pred );
     /* Scratch buffer prevents me_range from being increased for esa/tesa */
     if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range )
@@ -844,7 +1106,10 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     if( h->pps->b_transform_8x8_mode )
         COPY( analyse.b_transform_8x8 );
     if( h->frames.i_max_ref1 > 1 )
-        COPY( b_bframe_pyramid );
+        COPY( i_bframe_pyramid );
+    COPY( i_slice_max_size );
+    COPY( i_slice_max_mbs );
+    COPY( i_slice_count );
 #undef COPY
 
     mbcmp_init( h );
@@ -852,6 +1117,14 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     return x264_validate_parameters( h );
 }
 
+/****************************************************************************
+ * x264_encoder_parameters: copy h->thread[h->i_thread_phase]->param to *param
+ ****************************************************************************/
+void x264_encoder_parameters( x264_t *h, x264_param_t *param )
+{
+    memcpy( param, &h->thread[h->i_thread_phase]->param, sizeof(x264_param_t) ); /* same thread context x264_encoder_reconfig() writes to */
+}
+
 /* internal usage */
 static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
 {
@@ -863,11 +1136,57 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
     nal->i_payload= 0;
     nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
 }
-static void x264_nal_end( x264_t *h )
+/* Double the capacity of h->out.nal once every allocated slot is used; returns 0, or -1 if allocation fails. */
+static int x264_nal_check_buffer( x264_t *h )
+{
+    const int old_count = h->out.i_nals_allocated;
+    /* Nothing to do while at least one free slot remains. */
+    if( h->out.i_nal < old_count )
+        return 0;
+    x264_nal_t *grown = x264_malloc( sizeof(x264_nal_t) * (old_count*2) );
+    if( !grown )
+        return -1;
+    memcpy( grown, h->out.nal, sizeof(x264_nal_t) * (old_count) );
+    x264_free( h->out.nal );
+    h->out.nal = grown;
+    h->out.i_nals_allocated = old_count * 2;
+    return 0;
+}
+static int x264_nal_end( x264_t *h )
 {
     x264_nal_t *nal = &h->out.nal[h->out.i_nal];
     nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
     h->out.i_nal++;
+    /* The NAL is closed (payload length recorded, count bumped); make sure a slot exists for the next one. Returns 0, or -1 if the array can't grow. */
+    return x264_nal_check_buffer( h );
+}
+
+/* Encode every pending NAL back-to-back into h->nal_buffer; returns the total byte count, or -1 on alloc failure. */
+static int x264_encoder_encapsulate_nals( x264_t *h )
+{
+    int nal_size = 0, i;
+    for( i = 0; i < h->out.i_nal; i++ )
+        nal_size += h->out.nal[i].i_payload;
+    /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. */
+    if( h->nal_buffer_size < nal_size * 3/2 + h->out.i_nal * 4 )
+    {
+        uint8_t *buf = x264_malloc( nal_size * 2 + h->out.i_nal * 4 );
+        if( !buf )
+            return -1;
+        x264_free( h->nal_buffer );
+        h->nal_buffer = buf;
+        h->nal_buffer_size = nal_size * 2 + h->out.i_nal * 4; /* record the new capacity; a stale size would force a realloc on every later call */
+    }
+
+    uint8_t *nal_buffer = h->nal_buffer;
+    for( i = 0; i < h->out.i_nal; i++ )
+    {
+        int size = x264_nal_encode( nal_buffer, h->param.b_annexb, &h->out.nal[i] );
+        h->out.nal[i].i_payload = size;
+        h->out.nal[i].p_payload = nal_buffer; /* payload now points into h->nal_buffer */
+        nal_buffer += size;
+    }
+
+    return nal_buffer - h->nal_buffer;
 }
 
 /****************************************************************************
@@ -875,44 +1194,175 @@ static void x264_nal_end( x264_t *h )
  ****************************************************************************/
 int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
 {
+    int frame_size = 0; /* value returned by x264_encoder_encapsulate_nals() */
     /* init bitstream context */
     h->out.i_nal = 0;
     bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
 
-    /* Put SPS and PPS */
-    if( h->i_frame == 0 )
-    {
-        /* identify ourself */
-        x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-        x264_sei_version_write( h, &h->out.bs );
-        x264_nal_end( h );
+    /* Write the version SEI, then SPS and PPS (no longer limited to frame 0); any failure returns -1. */
+    x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+    if( x264_sei_version_write( h, &h->out.bs ) )
+        return -1;
+    if( x264_nal_end( h ) )
+        return -1;
 
-        /* generate sequence parameters */
-        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
-        x264_sps_write( &h->out.bs, h->sps );
-        x264_nal_end( h );
+    /* generate sequence parameters */
+    x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+    x264_sps_write( &h->out.bs, h->sps );
+    if( x264_nal_end( h ) )
+        return -1;
+
+    /* generate picture parameters */
+    x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+    x264_pps_write( &h->out.bs, h->pps );
+    if( x264_nal_end( h ) )
+        return -1;
+
+    frame_size = x264_encoder_encapsulate_nals( h ); /* -1 if the NAL buffer could not be grown */
 
-        /* generate picture parameters */
-        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-        x264_pps_write( &h->out.bs, h->pps );
-        x264_nal_end( h );
-    }
     /* now set output*/
     *pi_nal = h->out.i_nal;
     *pp_nal = &h->out.nal[0];
     h->out.i_nal = 0;
 
-    return 0;
+    return frame_size; /* encapsulated size in bytes, or -1 on failure (previously always 0) */
 }
 
-static inline void x264_reference_build_list( x264_t *h, int i_poc )
+/* Flag ref list 0 for explicit reordering when any adjacent pair of entries is in
+ * ascending order: by frame_num for P slices, by POC otherwise (the defaults differ). */
+static inline void x264_reference_check_reorder( x264_t *h )
+{
+    int ref, b_pslice = h->sh.i_type == SLICE_TYPE_P;
+    for( ref = 1; ref < h->i_ref0; ref++ )
+    {
+        const x264_frame_t *prev = h->fref0[ref-1], *cur = h->fref0[ref];
+        if( b_pslice ? prev->i_frame_num < cur->i_frame_num : prev->i_poc < cur->i_poc )
+            break;
+    }
+    if( ref < h->i_ref0 ) /* the scan stopped early, i.e. an out-of-default-order pair was found */
+        h->b_ref_reorder[0] = 1;
+}
+
+/* Insert a duplicate of fref0[i_ref] carrying weight w into ref list 0; return its index, or -1 on failure. */
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w )
+{
+    int i = h->i_ref0, j;
+    x264_frame_t *newframe;
+    if( i <= 1 ) /* empty list, definitely can't duplicate frame */
+        return -1;
+
+    /* Find a place to insert the duplicate in the reference list. */
+    for( j = 0; j < i; j++ )
+        if( h->fref0[i_ref]->i_frame != h->fref0[j]->i_frame )
+        {
+            /* found a place, after j, make sure there is not already a duplicate there */
+            if( j == i-1 || ( h->fref0[j+1] && h->fref0[i_ref]->i_frame != h->fref0[j+1]->i_frame ) )
+                break;
+        }
+
+    if( j == i ) /* No room in the reference list for the duplicate. */
+        return -1;
+    j++;
+
+    newframe = x264_frame_pop_blank_unused( h );
+    if( !newframe ) /* no blank frame available: fail before anything is modified */
+        return -1;
+    //FIXME: probably don't need to copy everything
+    *newframe = *h->fref0[i_ref];
+    newframe->i_reference_count = 1;
+    newframe->orig = h->fref0[i_ref];
+    newframe->b_duplicate = 1;
+    memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[i]) );
+
+    /* shift the frames to make space for the dupe. */
+    h->b_ref_reorder[0] = 1;
+    if( h->i_ref0 < 16 )
+        ++h->i_ref0;
+    h->fref0[15] = NULL;
+    x264_frame_unshift( &h->fref0[j], newframe );
+
+    return j;
+}
+
+static void x264_weighted_pred_init( x264_t *h )
 {
+    int i_ref;
     int i;
-    int b_ok;
+    /* Fill in the slice's weight table (h->sh.weight) and the plane each list-0 reference is read through (h->fenc->weighted). */
+    /* Default: every reference is read through its unweighted filtered[0] plane. */
+    for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
+        h->fenc->weighted[i_ref] = h->fref0[i_ref]->filtered[0];
+
+    // FIXME: This only supports weighting of one reference frame
+    // and duplicates of that frame.
+    h->fenc->i_lines_weighted = 0;
+
+    for( i_ref = 0; i_ref < (h->i_ref0 << h->sh.b_mbaff); i_ref++ )
+        for( i = 0; i < 3; i++ )
+            h->sh.weight[i_ref][i].weightfn = NULL;
+
+    /* Everything below applies only to P slices with weighted prediction enabled. */
+    if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 )
+        return;
+
+    int i_padv = PADV << h->param.b_interlaced;
+    int denom = -1; /* luma denominator of the first useful weight; later weights are asserted to match it */
+    int weightluma = 0; /* set once any reference keeps a non-trivial luma weight */
+    int buffer_next = 0; /* next entry of h->mb.p_weight_buf to hand out */
+    int j;
+    //FIXME: when chroma support is added, move this into loop
+    h->sh.weight[0][1].weightfn = h->sh.weight[0][2].weightfn = NULL;
+    h->sh.weight[0][1].i_denom = h->sh.weight[0][2].i_denom = 0;
+    for( j = 0; j < h->i_ref0; j++ )
+    {
+        if( h->fenc->weight[j][0].weightfn )
+        {
+            h->sh.weight[j][0] = h->fenc->weight[j][0];
+            // if weight is useless, don't write it to stream
+            if( h->sh.weight[j][0].i_scale == 1<<h->sh.weight[j][0].i_denom && h->sh.weight[j][0].i_offset == 0 )
+                h->sh.weight[j][0].weightfn = NULL;
+            else
+            {
+                if( !weightluma )
+                {
+                    weightluma = 1;
+                    h->sh.weight[0][0].i_denom = denom = h->sh.weight[j][0].i_denom;
+                    assert( x264_clip3( denom, 0, 7 ) == denom );
+                }
+                assert( h->sh.weight[j][0].i_denom == denom );
+                assert( x264_clip3( h->sh.weight[j][0].i_scale, 0, 127 ) == h->sh.weight[j][0].i_scale );
+                assert( x264_clip3( h->sh.weight[j][0].i_offset, -128, 127 ) == h->sh.weight[j][0].i_offset );
+                h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] +
+                    h->fenc->i_stride[0] * i_padv + PADH; /* skip the top/left padding so the pointer addresses the first visible pixel */
+            }
+        }
+
+        //scale full resolution frame
+        if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 ) /* whole-plane scaling happens here only when single-threaded */
+        {
+            uint8_t *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
+            uint8_t *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+            int stride = h->fenc->i_stride[0];
+            int width = h->fenc->i_width[0] + PADH*2;
+            int height = h->fenc->i_lines[0] + i_padv*2;
+            x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
+            h->fenc->i_lines_weighted = height;
+        }
+    }
+    if( !weightluma )
+        h->sh.weight[0][0].i_denom = 0;
+}
+
+static inline void x264_reference_build_list( x264_t *h, int i_poc )
+{
+    int i, b_ok;
 
     /* build ref list 0/1 */
-    h->i_ref0 = 0;
-    h->i_ref1 = 0;
+    h->mb.pic.i_fref[0] = h->i_ref0 = 0;
+    h->mb.pic.i_fref[1] = h->i_ref1 = 0;
+    if( h->sh.i_type == SLICE_TYPE_I )
+        return;
+
     for( i = 0; h->frames.reference[i]; i++ )
     {
         if( h->frames.reference[i]->i_poc < i_poc )
@@ -939,6 +1389,15 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
             }
         }
     } while( !b_ok );
+
+    if( h->sh.i_mmco_remove_from_end )
+        for( i = h->i_ref0-1; i >= h->i_ref0 - h->sh.i_mmco_remove_from_end; i-- )
+        {
+            int diff = h->i_frame_num - h->fref0[i]->i_frame_num;
+            h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref0[i]->i_poc;
+            h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff;
+        }
+
     /* Order ref1 from lower to higher poc (bubble sort) for B-frame */
     do
     {
@@ -954,23 +1413,57 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc )
         }
     } while( !b_ok );
 
-    /* In the standard, a P-frame's ref list is sorted by frame_num.
-     * We use POC, but check whether explicit reordering is needed */
-    h->b_ref_reorder[0] =
-    h->b_ref_reorder[1] = 0;
-    if( h->sh.i_type == SLICE_TYPE_P )
+    x264_reference_check_reorder( h );
+
+    h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
+    h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
+    h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
+
+    /* add duplicates */
+    if( h->fenc->i_type == X264_TYPE_P )
     {
-        for( i = 0; i < h->i_ref0 - 1; i++ )
-            if( h->fref0[i]->i_frame_num < h->fref0[i+1]->i_frame_num )
+        int idx = -1;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        {
+            x264_weight_t w[3];
+            w[1].weightfn = w[2].weightfn = NULL;
+            if( h->param.rc.b_stat_read )
+                x264_ratecontrol_set_weights( h, h->fenc );
+
+            if( !h->fenc->weight[0][0].weightfn )
             {
-                h->b_ref_reorder[0] = 1;
-                break;
+                h->fenc->weight[0][0].i_denom = 0;
+                SET_WEIGHT( w[0], 1, 1, 0, -1 );
+                idx = x264_weighted_reference_duplicate( h, 0, w );
             }
+            else
+            {
+                if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom )
+                {
+                    SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset );
+                }
+                x264_weighted_reference_duplicate( h, 0, weight_none );
+                if( h->fenc->weight[0][0].i_offset > -128 )
+                {
+                    w[0] = h->fenc->weight[0][0];
+                    w[0].i_offset--;
+                    h->mc.weight_cache( h, &w[0] );
+                    idx = x264_weighted_reference_duplicate( h, 0, w );
+                }
+            }
+        }
+        else if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_BLIND )
+        {
+            //weighted offset=-1
+            x264_weight_t w[3];
+            SET_WEIGHT( w[0], 1, 1, 0, -1 );
+            h->fenc->weight[0][0].i_denom = 0;
+            w[1].weightfn = w[2].weightfn = NULL;
+            idx = x264_weighted_reference_duplicate( h, 0, w );
+        }
+        h->mb.ref_blind_dupe = idx;
     }
 
-    h->i_ref1 = X264_MIN( h->i_ref1, h->frames.i_max_ref1 );
-    h->i_ref0 = X264_MIN( h->i_ref0, h->frames.i_max_ref0 );
-    h->i_ref0 = X264_MIN( h->i_ref0, h->param.i_frame_reference ); // if reconfig() has lowered the limit
     assert( h->i_ref0 + h->i_ref1 <= 16 );
     h->mb.pic.i_fref[0] = h->i_ref0;
     h->mb.pic.i_fref[1] = h->i_ref1;
@@ -990,7 +1483,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     if( min_y < 0 )
         return;
 
-    if( !b_end )
+    if( !b_end && !h->param.b_sliced_threads )
     {
         int i, j;
         for( j=0; j<=h->sh.b_mbaff; j++ )
@@ -1019,10 +1512,8 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
         }
     }
 
-    if( h->param.i_threads > 1 && h->fdec->b_kept_as_ref )
-    {
+    if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref )
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
-    }
 
     min_y = X264_MAX( min_y*16-8, 0 );
     max_y = b_end ? h->param.i_height : mb_y*16-8;
@@ -1052,39 +1543,35 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     }
 }
 
-static inline void x264_reference_update( x264_t *h )
+static inline int x264_reference_update( x264_t *h )
 {
-    int i;
-
-    if( h->fdec->i_frame >= 0 )
-        h->i_frame++;
-
+    int i, j;
     if( !h->fdec->b_kept_as_ref )
     {
-        if( h->param.i_threads > 1 )
+        if( h->i_thread_frames > 1 )
         {
             x264_frame_push_unused( h, h->fdec );
-            h->fdec = x264_frame_pop_unused( h );
+            h->fdec = x264_frame_pop_unused( h, 1 );
+            if( !h->fdec )
+                return -1; /* no replacement fdec could be obtained */
         }
-        return;
-    }
-
-    /* move lowres copy of the image to the ref frame */
-    for( i = 0; i < 4; i++)
-    {
-        XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] );
-        XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] );
+        return 0; /* frame is not kept as a reference: the reference list is left untouched */
     }
 
-    /* adaptive B decision needs a pointer, since it can't use the ref lists */
-    if( h->sh.i_type != SLICE_TYPE_B )
-        h->frames.last_nonb = h->fdec;
+    /* apply mmco from previous frame: each reference whose POC matches a recorded command is removed and recycled. */
+    for( i = 0; i < h->sh.i_mmco_command_count; i++ )
+        for( j = 0; h->frames.reference[j]; j++ )
+            if( h->frames.reference[j]->i_poc == h->sh.mmco[i].i_poc )
+                x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[j] ) );
 
     /* move frame in the buffer */
     x264_frame_push( h->frames.reference, h->fdec );
-    if( h->frames.reference[h->frames.i_max_dpb] )
+    if( h->frames.reference[h->sps->i_num_ref_frames] ) /* list now holds more than sps->i_num_ref_frames entries: recycle the head */
         x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) );
-    h->fdec = x264_frame_pop_unused( h );
+    h->fdec = x264_frame_pop_unused( h, 1 );
+    if( !h->fdec )
+        return -1; /* no replacement fdec could be obtained */
+    return 0;
 }
 
 static inline void x264_reference_reset( x264_t *h )
@@ -1095,6 +1582,41 @@ static inline void x264_reference_reset( x264_t *h )
     h->fenc->i_poc = 0;
 }
 
+/* With B-pyramid enabled: queue an MMCO evicting the previous BREF (STRICT mode) and reserve DPB room for delayed frames. */
+static inline void x264_reference_hierarchy_reset( x264_t *h )
+{
+    int i, ref, b_hasdelayframe = 0;
+    if( !h->param.i_bframe_pyramid )
+        return;
+
+    /* look for delay frames -- chain must only contain frames that are disposable */
+    for( i = 0; h->frames.current[i] && IS_DISPOSABLE( h->frames.current[i]->i_type ); i++ )
+        b_hasdelayframe |= h->frames.current[i]->i_coded
+                        != h->frames.current[i]->i_frame + h->sps->vui.i_num_reorder_frames;
+
+    if( h->param.i_bframe_pyramid != X264_B_PYRAMID_STRICT && !b_hasdelayframe )
+        return;
+
+    /* Remove last BREF. There will never be old BREFs in the
+     * dpb during a BREF decode when pyramid == STRICT */
+    for( ref = 0; h->frames.reference[ref]; ref++ )
+    {
+        if( h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT
+            && h->frames.reference[ref]->i_type == X264_TYPE_BREF )
+        {
+            int diff = h->i_frame_num - h->frames.reference[ref]->i_frame_num;
+            h->sh.mmco[h->sh.i_mmco_command_count].i_difference_of_pic_nums = diff;
+            h->sh.mmco[h->sh.i_mmco_command_count++].i_poc = h->frames.reference[ref]->i_poc;
+            x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[ref] ) ); /* evict exactly the frame the MMCO names, not whatever is at the tail */
+            h->b_ref_reorder[0] = 1;
+            break;
+        }
+    }
+
+    /* Prepare room in the dpb for the delayed display time of the later b-frames */
+    h->sh.i_mmco_remove_from_end = X264_MAX( ref + 2 - h->frames.i_max_dpb, 0 );
+}
+
 static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
 {
     /* ------------------------ Create slice header  ----------------------- */
@@ -1120,7 +1642,7 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
     if( h->sps->i_poc_type == 0 )
     {
         h->sh.i_poc_lsb = h->fdec->i_poc & ( (1 << h->sps->i_log2_max_poc_lsb) - 1 );
-        h->sh.i_delta_poc_bottom = 0;   /* XXX won't work for field */
+        h->sh.i_delta_poc_bottom = 0;
     }
     else if( h->sps->i_poc_type == 1 )
     {
@@ -1134,19 +1656,24 @@ static inline void x264_slice_init( x264_t *h, int i_nal_type, int i_global_qp )
     x264_macroblock_slice_init( h );
 }
 
-static void x264_slice_write( x264_t *h )
+static int x264_slice_write( x264_t *h )
 {
     int i_skip;
     int mb_xy, i_mb_x, i_mb_y;
-    int i, i_list, i_ref;
-
-    /* init stats */
-    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+    int i, i_list, i_ref, i_skip_bak = 0; /* Shut up GCC. */
+    bs_t bs_bak;
+    x264_cabac_t cabac_bak;
+    uint8_t cabac_prevbyte_bak = 0; /* Shut up GCC. */
+    /* Assume no more than 3 bytes of NALU escaping. */
+    int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX;
+    int starting_bits = bs_pos(&h->out.bs);
+    bs_realign( &h->out.bs );
 
     /* Slice */
     x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
 
     /* Slice header */
+    x264_macroblock_thread_init( h );
     x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
     if( h->param.b_cabac )
     {
@@ -1164,26 +1691,41 @@ static void x264_slice_write( x264_t *h )
     i_mb_x = h->sh.i_first_mb % h->sps->i_mb_width;
     i_skip = 0;
 
-    while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) < h->sh.i_last_mb )
+    while( (mb_xy = i_mb_x + i_mb_y * h->sps->i_mb_width) <= h->sh.i_last_mb )
     {
         int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+        if( h->param.i_slice_max_size > 0 )
+        {
+            /* We don't need the contexts because flushing the CABAC encoder has no context
+             * dependency and macroblocks are only re-encoded in the case where a slice is
+             * ended (and thus the content of all contexts are thrown away). */
+            if( h->param.b_cabac )
+            {
+                memcpy( &cabac_bak, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) );
+                /* x264's CABAC writer modifies the previous byte during carry, so it has to be
+                 * backed up. */
+                cabac_prevbyte_bak = h->cabac.p[-1];
+            }
+            else
+            {
+                bs_bak = h->out.bs;
+                i_skip_bak = i_skip;
+            }
+        }
 
-        if( i_mb_x == 0 )
+        if( i_mb_x == 0 && !h->mb.b_reencode_mb && !h->param.b_sliced_threads )
             x264_fdec_filter_row( h, i_mb_y );
 
         /* load cache */
         x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
 
-        /* analyse parameters
-         * Slice I: choose I_4x4 or I_16x16 mode
-         * Slice P: choose between using P mode or intra (4x4 or 16x16)
-         * */
         x264_macroblock_analyse( h );
 
         /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
         x264_macroblock_encode( h );
 
-        x264_bitstream_check_buffer( h );
+        if( x264_bitstream_check_buffer( h ) )
+            return -1;
 
         if( h->param.b_cabac )
         {
@@ -1210,11 +1752,42 @@ static void x264_slice_write( x264_t *h )
                     bs_write_ue( &h->out.bs, i_skip );  /* skip run */
                     i_skip = 0;
                 }
-                x264_macroblock_write_cavlc( h, &h->out.bs );
+                x264_macroblock_write_cavlc( h );
+            }
+        }
+
+        int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac);
+        int mb_size = total_bits - mb_spos;
+
+        /* We'll just re-encode this last macroblock if we go over the max slice size. */
+        if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb )
+        {
+            if( mb_xy != h->sh.i_first_mb )
+            {
+                if( h->param.b_cabac )
+                {
+                    memcpy( &h->cabac, &cabac_bak, offsetof(x264_cabac_t, f8_bits_encoded) );
+                    h->cabac.p[-1] = cabac_prevbyte_bak;
+                }
+                else
+                {
+                    h->out.bs = bs_bak;
+                    i_skip = i_skip_bak;
+                }
+                h->mb.b_reencode_mb = 1;
+                h->sh.i_last_mb = mb_xy-1;
+                break;
+            }
+            else
+            {
+                h->sh.i_last_mb = mb_xy;
+                h->mb.b_reencode_mb = 0;
             }
         }
+        else
+            h->mb.b_reencode_mb = 0;
 
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
         if( h->param.b_visualize )
             x264_visualize_mb( h );
 #endif
@@ -1224,13 +1797,14 @@ static void x264_slice_write( x264_t *h )
 
         /* accumulate mb stats */
         h->stat.frame.i_mb_count[h->mb.i_type]++;
-        if( !IS_SKIP(h->mb.i_type) && !IS_INTRA(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
+
+        if( !IS_INTRA(h->mb.i_type) && !IS_SKIP(h->mb.i_type) && !IS_DIRECT(h->mb.i_type) )
         {
             if( h->mb.i_partition != D_8x8 )
-                h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
-            else
-                for( i = 0; i < 4; i++ )
-                    h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
+                    h->stat.frame.i_mb_partition[h->mb.i_partition] += 4;
+                else
+                    for( i = 0; i < 4; i++ )
+                        h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++;
             if( h->param.i_frame_reference > 1 )
                 for( i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ )
                     for( i = 0; i < 4; i++ )
@@ -1240,22 +1814,37 @@ static void x264_slice_write( x264_t *h )
                             h->stat.frame.i_mb_count_ref[i_list][i_ref] ++;
                     }
         }
-        if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
-        {
-            int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
-                       + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
-            int b_intra = IS_INTRA(h->mb.i_type);
-            h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
-            h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
-            h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
-        }
-        if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
+
+        if( h->param.i_log_level >= X264_LOG_INFO )
         {
-            h->stat.frame.i_mb_count_8x8dct[0] ++;
-            h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+            if( h->mb.i_cbp_luma || h->mb.i_cbp_chroma )
+            {
+                int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1)
+                           + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3);
+                int b_intra = IS_INTRA(h->mb.i_type);
+                h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum;
+                h->stat.frame.i_mb_cbp[!b_intra + 2] += h->mb.i_cbp_chroma >= 1;
+                h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma == 2;
+            }
+            if( h->mb.i_cbp_luma && !IS_INTRA(h->mb.i_type) )
+            {
+                h->stat.frame.i_mb_count_8x8dct[0] ++;
+                h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8;
+            }
+            if( IS_INTRA(h->mb.i_type) && h->mb.i_type != I_PCM )
+            {
+                if( h->mb.i_type == I_16x16 )
+                    h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++;
+                else if( h->mb.i_type == I_8x8 )
+                    for( i = 0; i < 16; i += 4 )
+                        h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+                else //if( h->mb.i_type == I_4x4 )
+                    for( i = 0; i < 16; i++ )
+                        h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
+            }
         }
 
-        x264_ratecontrol_mb( h, bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac) - mb_spos );
+        x264_ratecontrol_mb( h, mb_size );
 
         if( h->sh.b_mbaff )
         {
@@ -1264,7 +1853,7 @@ static void x264_slice_write( x264_t *h )
         }
         else
             i_mb_x++;
-        if(i_mb_x == h->sps->i_mb_width)
+        if( i_mb_x == h->sps->i_mb_width )
         {
             i_mb_y++;
             i_mb_x = 0;
@@ -1282,26 +1871,31 @@ static void x264_slice_write( x264_t *h )
             bs_write_ue( &h->out.bs, i_skip );  /* last skip run */
         /* rbsp_slice_trailing_bits */
         bs_rbsp_trailing( &h->out.bs );
+        bs_flush( &h->out.bs );
     }
+    if( x264_nal_end( h ) )
+        return -1;
 
-    x264_nal_end( h );
-
-    x264_fdec_filter_row( h, h->sps->i_mb_height );
+    if( h->sh.i_last_mb == h->mb.i_mb_count-1 )
+    {
+        h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
+                                  + (h->out.i_nal*NALU_OVERHEAD * 8)
+                                  - h->stat.frame.i_tex_bits
+                                  - h->stat.frame.i_mv_bits;
+        if( !h->param.b_sliced_threads )
+            x264_fdec_filter_row( h, h->sps->i_mb_height );
+    }
 
-    /* Compute misc bits */
-    h->stat.frame.i_misc_bits = bs_pos( &h->out.bs )
-                              + NALU_OVERHEAD * 8
-                              - h->stat.frame.i_tex_bits
-                              - h->stat.frame.i_mv_bits;
+    return 0;
 }
 
 static void x264_thread_sync_context( x264_t *dst, x264_t *src )
 {
-    x264_frame_t **f;
     if( dst == src )
         return;
 
     // reference counting
+    x264_frame_t **f;
     for( f = src->frames.reference; *f; f++ )
         (*f)->i_reference_count++;
     for( f = dst->frames.reference; *f; f++ )
@@ -1311,6 +1905,7 @@ static void x264_thread_sync_context( x264_t *dst, x264_t *src )
 
     // copy everything except the per-thread pointers and the constants.
     memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.type) - offsetof(x264_t, i_frame) );
+    dst->param = src->param;
     dst->stat = src->stat;
 }
 
@@ -1318,12 +1913,15 @@ static void x264_thread_sync_stat( x264_t *dst, x264_t *src )
 {
     if( dst == src )
         return;
-    memcpy( &dst->stat.i_slice_count, &src->stat.i_slice_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
+    memcpy( &dst->stat.i_frame_count, &src->stat.i_frame_count, sizeof(dst->stat) - sizeof(dst->stat.frame) );
 }
 
-static int x264_slices_write( x264_t *h )
+static void *x264_slices_write( x264_t *h )
 {
-    int i_frame_size;
+    int i_slice_num = 0;
+    int last_thread_mb = h->sh.i_last_mb;
+    if( h->param.i_sync_lookahead )
+        x264_lower_thread_priority( 10 );
 
 #ifdef HAVE_MMX
     /* Misalign mask has to be set separately for each thread. */
@@ -1331,15 +1929,34 @@ static int x264_slices_write( x264_t *h )
         x264_cpu_mask_misalign_sse();
 #endif
 
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
     if( h->param.b_visualize )
-        x264_visualize_init( h );
+        if( x264_visualize_init( h ) )
+            return (void *)-1;
 #endif
 
-    x264_stack_align( x264_slice_write, h );
-    i_frame_size = h->out.nal[h->out.i_nal-1].i_payload;
+    /* init stats */
+    memset( &h->stat.frame, 0, sizeof(h->stat.frame) );
+    h->mb.b_reencode_mb = 0;
+    while( h->sh.i_first_mb <= last_thread_mb )
+    {
+        h->sh.i_last_mb = last_thread_mb;
+        if( h->param.i_slice_max_mbs )
+            h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1;
+        else if( h->param.i_slice_count && !h->param.b_sliced_threads )
+        {
+            int height = h->sps->i_mb_height >> h->param.b_interlaced;
+            int width = h->sps->i_mb_width << h->param.b_interlaced;
+            i_slice_num++;
+            h->sh.i_last_mb = (height * i_slice_num + h->param.i_slice_count/2) / h->param.i_slice_count * width - 1;
+        }
+        h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb );
+        if( x264_stack_align( x264_slice_write, h ) )
+            return (void *)-1;
+        h->sh.i_first_mb = h->sh.i_last_mb + 1;
+    }
 
-#if VISUALIZE
+#ifdef HAVE_VISUALIZE
     if( h->param.b_visualize )
     {
         x264_visualize_show( h );
@@ -1347,7 +1964,69 @@ static int x264_slices_write( x264_t *h )
     }
 #endif
 
-    h->out.i_frame_size = i_frame_size;
+    return (void *)0;
+}
+
+static int x264_threaded_slices_write( x264_t *h ) /* encodes one frame with one worker thread per h->thread[i]; returns 0 on success, nonzero on failure */
+{
+    int i, j;
+    void *ret = NULL; /* receives each worker's exit value from x264_pthread_join */
+    /* set first/last mb and sync contexts */
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        if( i ) /* thread[0] is not re-synced; presumably it is h itself -- TODO confirm */
+        {
+            t->param = h->param;
+            memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) ); /* copies the x264_t fields from i_frame up to, but not including, rc */
+        }
+        int height = h->sps->i_mb_height >> h->param.b_interlaced; /* halved when b_interlaced is 1; the << below scales the row bounds back */
+        t->i_threadslice_start = ((height *  i    + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced; /* NOTE(review): rounding term is i_slice_count/2 while the divisor is i_threads -- confirm the two are equal on this path */
+        t->i_threadslice_end   = ((height * (i+1) + h->param.i_slice_count/2) / h->param.i_threads) << h->param.b_interlaced; /* same formula for i+1, so it equals the next thread's start */
+        t->sh.i_first_mb = t->i_threadslice_start * h->sps->i_mb_width; /* row index -> macroblock index */
+        t->sh.i_last_mb  =   t->i_threadslice_end * h->sps->i_mb_width - 1; /* inclusive: x264_slices_write loops while i_first_mb <= this value */
+    }
+
+    x264_analyse_weight_frame( h, h->sps->i_mb_height*16 + 16 );
+
+    x264_threads_distribute_ratecontrol( h );
+
+    /* dispatch: start one x264_slices_write worker per thread context, then wait for all of them */
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        if( x264_pthread_create( &h->thread[i]->thread_handle, NULL, (void*)x264_slices_write, (void*)h->thread[i] ) )
+            return -1; /* NOTE(review): workers started in earlier iterations are not joined on this path */
+        h->thread[i]->b_thread_active = 1;
+    }
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_pthread_join( h->thread[i]->thread_handle, &ret );
+        h->thread[i]->b_thread_active = 0;
+        if( (intptr_t)ret ) /* x264_slices_write returns (void *)-1 on failure and (void *)0 on success */
+            return (intptr_t)ret; /* NOTE(review): returns on the first failure; later threads stay unjoined with b_thread_active still 1 */
+    }
+
+    /* deblocking and hpel filtering (x264_slice_write skips its own final x264_fdec_filter_row call when b_sliced_threads is set) */
+    for( i = 0; i <= h->sps->i_mb_height; i++ ) /* inclusive bound: i_mb_height + 1 calls */
+        x264_fdec_filter_row( h, i );
+
+    for( i = 1; i < h->param.i_threads; i++ ) /* fold the output of threads 1..n-1 into h */
+    {
+        x264_t *t = h->thread[i];
+        for( j = 0; j < t->out.i_nal; j++ )
+        {
+            h->out.nal[h->out.i_nal] = t->out.nal[j]; /* copies the nal entry by value */
+            h->out.i_nal++;
+            x264_nal_check_buffer( h ); /* NOTE(review): called after the slot was written; its result, if any, is not checked here */
+        }
+        /* All entries in stat.frame are ints except for ssd/ssim,
+         * which are only calculated in the main thread. */
+        for( j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ ) /* indexing from &stat.frame assumes i_mv_bits is its first member -- TODO confirm */
+            ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j];
+    }
+
+    x264_threads_merge_ratecontrol( h );
+
     return 0;
 }
 
@@ -1370,18 +2049,14 @@ int     x264_encoder_encode( x264_t *h,
                              x264_picture_t *pic_out )
 {
     x264_t *thread_current, *thread_prev, *thread_oldest;
-    int     i_nal_type;
-    int     i_nal_ref_idc;
-
-    int   i_global_qp;
+    int i_nal_type, i_nal_ref_idc, i_global_qp, i;
 
-    if( h->param.i_threads > 1)
+    if( h->i_thread_frames > 1 )
     {
-        int i = ++h->i_thread_phase;
-        int t = h->param.i_threads;
-        thread_current = h->thread[ i%t ];
-        thread_prev    = h->thread[ (i-1)%t ];
-        thread_oldest  = h->thread[ (i+1)%t ];
+        thread_prev    = h->thread[ h->i_thread_phase ];
+        h->i_thread_phase = (h->i_thread_phase + 1) % h->i_thread_frames;
+        thread_current = h->thread[ h->i_thread_phase ];
+        thread_oldest  = h->thread[ (h->i_thread_phase + 1) % h->i_thread_frames ];
         x264_thread_sync_context( thread_current, thread_prev );
         x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest );
         h = thread_current;
@@ -1394,7 +2069,8 @@ int     x264_encoder_encode( x264_t *h,
     }
 
     // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0
-    x264_reference_update( h );
+    if( x264_reference_update( h ) )
+        return -1;
     h->fdec->i_lines_completed = -1;
 
     /* no data out */
@@ -1405,7 +2081,9 @@ int     x264_encoder_encode( x264_t *h,
     if( pic_in != NULL )
     {
         /* 1: Copy the picture to a frame and move it to a buffer */
-        x264_frame_t *fenc = x264_frame_pop_unused( h );
+        x264_frame_t *fenc = x264_frame_pop_unused( h, 0 );
+        if( !fenc )
+            return -1;
 
         if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 )
             return -1;
@@ -1416,95 +2094,102 @@ int     x264_encoder_encode( x264_t *h,
 
         fenc->i_frame = h->frames.i_input++;
 
-        x264_frame_push( h->frames.next, fenc );
+        if( h->frames.i_bframe_delay && fenc->i_frame == h->frames.i_bframe_delay )
+            h->frames.i_bframe_delay_time = fenc->i_pts;
 
         if( h->frames.b_have_lowres )
+        {
+            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+                x264_weight_plane_analyse( h, fenc );
             x264_frame_init_lowres( h, fenc );
+        }
 
-        if( h->param.rc.i_aq_mode )
+        if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
+        {
+            if( x264_macroblock_tree_read( h, fenc ) )
+                return -1;
+        }
+        else if( h->param.rc.i_aq_mode )
             x264_adaptive_quant_frame( h, fenc );
 
-        if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
+        /* 2: Place the frame into the queue for its slice type decision */
+        x264_lookahead_put_frame( h, fenc );
+
+        if( h->frames.i_input <= h->frames.i_delay + 1 - h->i_thread_frames )
         {
-            /* Nothing yet to encode */
-            /* waiting for filling bframe buffer */
+            /* Nothing yet to encode, waiting for filling of buffers */
             pic_out->i_type = X264_TYPE_AUTO;
             return 0;
         }
     }
-
-    if( h->frames.current[0] == NULL )
+    else
     {
-        int bframes = 0;
-        /* 2: Select frame types */
-        if( h->frames.next[0] == NULL )
-        {
-            x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
-            return 0;
-        }
+        /* signal kills for lookahead thread */
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        h->lookahead->b_exit_thread = 1;
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill );
+        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    }
 
-        x264_stack_align( x264_slicetype_decide, h );
+    h->i_frame++;
+    /* 3: The picture is analyzed in the lookahead */
+    if( !h->frames.current[0] )
+        x264_lookahead_get_frames( h );
 
-        /* 3: move some B-frames and 1 non-B to encode queue */
-        while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) )
-            bframes++;
-        x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) );
-        /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */
-        if( h->param.b_bframe_pyramid && bframes > 1 )
-        {
-            x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] );
-            mid->i_type = X264_TYPE_BREF;
-            x264_frame_push( h->frames.current, mid );
-            bframes--;
-        }
-        while( bframes-- )
-            x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) );
-    }
+    if( !h->frames.current[0] && x264_lookahead_is_empty( h ) )
+        return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
 
     /* ------------------- Get frame to be encoded ------------------------- */
     /* 4: get picture to encode */
     h->fenc = x264_frame_shift( h->frames.current );
-    if( h->fenc == NULL )
+    if( h->fenc->param )
     {
-        /* Nothing yet to encode (ex: waiting for I/P with B frames) */
-        /* waiting for filling bframe buffer */
-        pic_out->i_type = X264_TYPE_AUTO;
-        return 0;
+        x264_encoder_reconfig( h, h->fenc->param );
+        if( h->fenc->param->param_free )
+            h->fenc->param->param_free( h->fenc->param );
     }
 
-    if( h->fenc->i_type == X264_TYPE_IDR )
+    if( h->fenc->b_keyframe )
     {
-        h->frames.i_last_idr = h->fenc->i_frame;
+        h->frames.i_last_keyframe = h->fenc->i_frame;
+        if( h->fenc->i_type == X264_TYPE_IDR )
+            h->i_frame_num = 0;
     }
+    h->sh.i_mmco_command_count =
+    h->sh.i_mmco_remove_from_end = 0;
+    h->b_ref_reorder[0] =
+    h->b_ref_reorder[1] = 0;
 
     /* ------------------- Setup frame context ----------------------------- */
     /* 5: Init data dependent of frame type */
     if( h->fenc->i_type == X264_TYPE_IDR )
     {
         /* reset ref pictures */
-        x264_reference_reset( h );
-
         i_nal_type    = NAL_SLICE_IDR;
         i_nal_ref_idc = NAL_PRIORITY_HIGHEST;
         h->sh.i_type = SLICE_TYPE_I;
+        x264_reference_reset( h );
     }
     else if( h->fenc->i_type == X264_TYPE_I )
     {
         i_nal_type    = NAL_SLICE;
         i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
         h->sh.i_type = SLICE_TYPE_I;
+        x264_reference_hierarchy_reset( h );
     }
     else if( h->fenc->i_type == X264_TYPE_P )
     {
         i_nal_type    = NAL_SLICE;
         i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/
         h->sh.i_type = SLICE_TYPE_P;
+        x264_reference_hierarchy_reset( h );
     }
     else if( h->fenc->i_type == X264_TYPE_BREF )
     {
         i_nal_type    = NAL_SLICE;
-        i_nal_ref_idc = NAL_PRIORITY_HIGH; /* maybe add MMCO to forget it? -> low */
+        i_nal_ref_idc = h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT ? NAL_PRIORITY_LOW : NAL_PRIORITY_HIGH;
         h->sh.i_type = SLICE_TYPE_B;
+        x264_reference_hierarchy_reset( h );
     }
     else    /* B frame */
     {
@@ -1514,7 +2199,7 @@ int     x264_encoder_encode( x264_t *h,
     }
 
     h->fdec->i_poc =
-    h->fenc->i_poc = 2 * (h->fenc->i_frame - h->frames.i_last_idr);
+    h->fenc->i_poc = 2 * (h->fenc->i_frame - h->frames.i_last_keyframe);
     h->fdec->i_type = h->fenc->i_type;
     h->fdec->i_frame = h->fenc->i_frame;
     h->fenc->b_kept_as_ref =
@@ -1526,99 +2211,174 @@ int     x264_encoder_encode( x264_t *h,
     /* build ref list 0/1 */
     x264_reference_build_list( h, h->fdec->i_poc );
 
-    /* Init the rate control */
-    x264_ratecontrol_start( h, h->fenc->i_qpplus1 );
-    i_global_qp = x264_ratecontrol_qp( h );
-
-    pic_out->i_qpplus1 =
-    h->fdec->i_qpplus1 = i_global_qp + 1;
-
-    if( h->sh.i_type == SLICE_TYPE_B )
-        x264_macroblock_bipred_init( h );
-
-    /* ------------------------ Create slice header  ----------------------- */
-    x264_slice_init( h, i_nal_type, i_global_qp );
-
-    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
-        h->i_frame_num++;
-
     /* ---------------------- Write the bitstream -------------------------- */
     /* Init bitstream context */
-    h->out.i_nal = 0;
-    bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+    if( h->param.b_sliced_threads )
+    {
+        for( i = 0; i < h->param.i_threads; i++ )
+        {
+            bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream );
+            h->thread[i]->out.i_nal = 0;
+        }
+    }
+    else
+    {
+        bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream );
+        h->out.i_nal = 0;
+    }
 
-    if(h->param.b_aud){
+    if( h->param.b_aud )
+    {
         int pic_type;
 
-        if(h->sh.i_type == SLICE_TYPE_I)
+        if( h->sh.i_type == SLICE_TYPE_I )
             pic_type = 0;
-        else if(h->sh.i_type == SLICE_TYPE_P)
+        else if( h->sh.i_type == SLICE_TYPE_P )
             pic_type = 1;
-        else if(h->sh.i_type == SLICE_TYPE_B)
+        else if( h->sh.i_type == SLICE_TYPE_B )
             pic_type = 2;
         else
             pic_type = 7;
 
-        x264_nal_start(h, NAL_AUD, NAL_PRIORITY_DISPOSABLE);
-        bs_write(&h->out.bs, 3, pic_type);
-        bs_rbsp_trailing(&h->out.bs);
-        x264_nal_end(h);
+        x264_nal_start( h, NAL_AUD, NAL_PRIORITY_DISPOSABLE );
+        bs_write( &h->out.bs, 3, pic_type );
+        bs_rbsp_trailing( &h->out.bs );
+        if( x264_nal_end( h ) )
+            return -1;
     }
 
     h->i_nal_type = i_nal_type;
     h->i_nal_ref_idc = i_nal_ref_idc;
 
+    int overhead = NALU_OVERHEAD;
+
+    if( h->param.b_intra_refresh && h->fenc->i_type == X264_TYPE_P )
+    {
+        int pocdiff = (h->fdec->i_poc - h->fref0[0]->i_poc)/2;
+        float increment = ((float)h->sps->i_mb_width-1) / h->param.i_keyint_max;
+        if( IS_X264_TYPE_I( h->fref0[0]->i_type ) )
+            h->fdec->f_pir_position = 0;
+        else
+        {
+            if( h->fref0[0]->i_pir_end_col == h->sps->i_mb_width - 1 )
+            {
+                h->fdec->f_pir_position = 0;
+                h->fenc->b_keyframe = 1;
+            }
+            else
+                h->fdec->f_pir_position = h->fref0[0]->f_pir_position;
+        }
+        h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5;
+        h->fdec->f_pir_position += increment * pocdiff;
+        h->fdec->i_pir_end_col = X264_MIN( h->fdec->f_pir_position+0.5, h->sps->i_mb_width-1 );
+    }
+
     /* Write SPS and PPS */
-    if( i_nal_type == NAL_SLICE_IDR && h->param.b_repeat_headers )
+    if( h->fenc->b_keyframe )
     {
-        if( h->fenc->i_frame == 0 )
+        if( h->param.b_repeat_headers )
+        {
+            if( h->fenc->i_frame == 0 )
+            {
+                /* identify ourself */
+                x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
+                if( x264_sei_version_write( h, &h->out.bs ) )
+                    return -1;
+                if( x264_nal_end( h ) )
+                    return -1;
+                overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+            }
+
+            /* generate sequence parameters */
+            x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
+            x264_sps_write( &h->out.bs, h->sps );
+            if( x264_nal_end( h ) )
+                return -1;
+            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+
+            /* generate picture parameters */
+            x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
+            x264_pps_write( &h->out.bs, h->pps );
+            if( x264_nal_end( h ) )
+                return -1;
+            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
+        }
+
+        if( h->fenc->i_type != X264_TYPE_IDR )
         {
-            /* identify ourself */
             x264_nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE );
-            x264_sei_version_write( h, &h->out.bs );
+            x264_sei_recovery_point_write( h, &h->out.bs, h->param.i_keyint_max );
             x264_nal_end( h );
+            overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD;
         }
+    }
 
-        /* generate sequence parameters */
-        x264_nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST );
-        x264_sps_write( &h->out.bs, h->sps );
-        x264_nal_end( h );
+    /* Init the rate control */
+    /* FIXME: Include slice header bit cost. */
+    x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 );
+    i_global_qp = x264_ratecontrol_qp( h );
+
+    pic_out->i_qpplus1 =
+    h->fdec->i_qpplus1 = i_global_qp + 1;
 
-        /* generate picture parameters */
-        x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
-        x264_pps_write( &h->out.bs, h->pps );
-        x264_nal_end( h );
+    if( h->param.rc.b_stat_read && h->sh.i_type != SLICE_TYPE_I )
+    {
+        x264_reference_build_list_optimal( h );
+        x264_reference_check_reorder( h );
     }
 
+    if( h->sh.i_type == SLICE_TYPE_B )
+        x264_macroblock_bipred_init( h );
+
+    /*------------------------- Weights -------------------------------------*/
+    x264_weighted_pred_init( h );
+
+    /* ------------------------ Create slice header  ----------------------- */
+    x264_slice_init( h, i_nal_type, i_global_qp );
+
+    if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE )
+        h->i_frame_num++;
+
     /* Write frame */
-    if( h->param.i_threads > 1 )
+    h->i_threadslice_start = 0;
+    h->i_threadslice_end = h->sps->i_mb_height;
+    if( h->i_thread_frames > 1 )
     {
-        x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h );
+        if( x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ) )
+            return -1;
         h->b_thread_active = 1;
     }
+    else if( h->param.b_sliced_threads )
+    {
+        if( x264_threaded_slices_write( h ) )
+            return -1;
+    }
     else
-        x264_slices_write( h );
+        if( (intptr_t)x264_slices_write( h ) )
+            return -1;
 
-    x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
-    return 0;
+    return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out );
 }
 
-static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
-                                    x264_nal_t **pp_nal, int *pi_nal,
-                                    x264_picture_t *pic_out )
+static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
+                                   x264_nal_t **pp_nal, int *pi_nal,
+                                   x264_picture_t *pic_out )
 {
-    int i, i_list;
+    int i, j, i_list, frame_size;
     char psz_message[80];
 
     if( h->b_thread_active )
     {
-        x264_pthread_join( h->thread_handle, NULL );
+        void *ret = NULL;
+        x264_pthread_join( h->thread_handle, &ret );
         h->b_thread_active = 0;
+        if( (intptr_t)ret )
+            return (intptr_t)ret;
     }
     if( !h->out.i_nal )
     {
         pic_out->i_type = X264_TYPE_AUTO;
-        return;
+        return 0;
     }
 
     x264_frame_push_unused( thread_current, h->fenc );
@@ -1626,6 +2386,9 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     /* End bitstream, set output  */
     *pi_nal = h->out.i_nal;
     *pp_nal = h->out.nal;
+
+    frame_size = x264_encoder_encapsulate_nals( h );
+
     h->out.i_nal = 0;
 
     /* Set output picture properties */
@@ -1635,7 +2398,32 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
         pic_out->i_type = X264_TYPE_P;
     else
         pic_out->i_type = X264_TYPE_B;
-    pic_out->i_pts = h->fenc->i_pts;
+
+    pic_out->b_keyframe = h->fenc->b_keyframe;
+
+    pic_out->i_pts = h->fenc->i_pts *= h->i_dts_compress_multiplier;
+    if( h->frames.i_bframe_delay )
+    {
+        int64_t *i_prev_dts = thread_current->frames.i_prev_dts;
+        if( h->i_frame <= h->frames.i_bframe_delay )
+        {
+            if( h->i_dts_compress_multiplier == 1 )
+                pic_out->i_dts = h->fenc->i_reordered_pts - h->frames.i_bframe_delay_time;
+            else
+            {
+                /* DTS compression */
+                if( h->i_frame == 1 )
+                    thread_current->frames.i_init_delta = h->fenc->i_reordered_pts * h->i_dts_compress_multiplier;
+                pic_out->i_dts = h->i_frame * thread_current->frames.i_init_delta / h->i_dts_compress_multiplier;
+            }
+        }
+        else
+            pic_out->i_dts = i_prev_dts[ (h->i_frame - h->frames.i_bframe_delay) % h->frames.i_bframe_delay ];
+        i_prev_dts[ h->i_frame % h->frames.i_bframe_delay ] = h->fenc->i_reordered_pts * h->i_dts_compress_multiplier;
+    }
+    else
+        pic_out->i_dts = h->fenc->i_reordered_pts;
+    assert( pic_out->i_pts >= pic_out->i_dts );
 
     pic_out->img.i_plane = h->fdec->i_plane;
     for(i = 0; i < 3; i++)
@@ -1648,10 +2436,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     /* update rc */
     x264_emms();
-    x264_ratecontrol_end( h, h->out.i_frame_size * 8 );
-
-    /* restore CPU state (before using float again) */
-    x264_emms();
+    if( x264_ratecontrol_end( h, frame_size * 8 ) < 0 )
+        return -1;
 
     x264_noise_reduction_update( thread_current );
 
@@ -1659,9 +2445,9 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     x264_thread_sync_stat( h, h->thread[0] );
 
     /* Slice stat */
-    h->stat.i_slice_count[h->sh.i_type]++;
-    h->stat.i_slice_size[h->sh.i_type] += h->out.i_frame_size + NALU_OVERHEAD;
-    h->stat.f_slice_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
+    h->stat.i_frame_count[h->sh.i_type]++;
+    h->stat.i_frame_size[h->sh.i_type] += frame_size;
+    h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq;
 
     for( i = 0; i < X264_MBTYPE_MAX; i++ )
         h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i];
@@ -1671,12 +2457,27 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
         h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i];
     for( i = 0; i < 6; i++ )
         h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i];
+    for( i = 0; i < 3; i++ )
+        for( j = 0; j < 13; j++ )
+            h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j];
     if( h->sh.i_type != SLICE_TYPE_I )
         for( i_list = 0; i_list < 2; i_list++ )
             for( i = 0; i < 32; i++ )
                 h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i];
     if( h->sh.i_type == SLICE_TYPE_P )
+    {
         h->stat.i_consecutive_bframes[h->fdec->i_frame - h->fref0[0]->i_frame - 1]++;
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+        {
+            for( i = 0; i < 3; i++ )
+                for( j = 0; j < h->i_ref0; j++ )
+                    if( h->sh.weight[0][i].i_denom != 0 )
+                    {
+                        h->stat.i_wpred[i]++;
+                        break;
+                    }
+        }
+    }
     if( h->sh.i_type == SLICE_TYPE_B )
     {
         h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++;
@@ -1734,7 +2535,7 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
               h->stat.frame.i_mb_count_i,
               h->stat.frame.i_mb_count_p,
               h->stat.frame.i_mb_count_skip,
-              h->out.i_frame_size,
+              frame_size,
               psz_message );
 
     // keep stats all in one place
@@ -1760,8 +2561,19 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 }
 #endif
 
+    /* Remove duplicates, must be done near the end as breaks h->fref0 array
+     * by freeing some of its pointers. */
+     for( i = 0; i < h->i_ref0; i++ )
+         if( h->fref0[i] && h->fref0[i]->b_duplicate )
+         {
+             x264_frame_push_blank_unused( h, h->fref0[i] );
+             h->fref0[i] = 0;
+         }
+
     if( h->param.psz_dump_yuv )
         x264_frame_dump( h );
+
+    return frame_size;
 }
 
 static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra )
@@ -1788,16 +2600,32 @@ void    x264_encoder_close  ( x264_t *h )
                    || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                    || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
 
-    for( i=0; i<h->param.i_threads; i++ )
+    x264_lookahead_delete( h );
+
+    if( h->param.i_threads > 1 )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
-        if( h->thread[i]->b_thread_active )
+        for( i = 0; i < h->param.i_threads; i++ )
+            if( h->thread[i]->b_thread_active )
+                x264_pthread_join( h->thread[i]->thread_handle, NULL );
+        if( h->i_thread_frames > 1 )
         {
-            x264_pthread_join( h->thread[i]->thread_handle, NULL );
-            assert( h->thread[i]->fenc->i_reference_count == 1 );
-            x264_frame_delete( h->thread[i]->fenc );
+            for( i = 0; i < h->i_thread_frames; i++ )
+            {
+                if( h->thread[i]->b_thread_active )
+                {
+                    assert( h->thread[i]->fenc->i_reference_count == 1 );
+                    x264_frame_delete( h->thread[i]->fenc );
+                }
+            }
+
+            x264_t *thread_prev = h->thread[h->i_thread_phase];
+            x264_thread_sync_ratecontrol( h, thread_prev, h );
+            x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
+            h->i_frame = thread_prev->i_frame + 1 - h->i_thread_frames;
         }
     }
+    h->i_frame++;
 
     /* Slices used and PSNR */
     for( i=0; i<5; i++ )
@@ -1806,17 +2634,17 @@ void    x264_encoder_close  ( x264_t *h )
         static const char *slice_name[] = { "P", "B", "I", "SP", "SI" };
         int i_slice = slice_order[i];
 
-        if( h->stat.i_slice_count[i_slice] > 0 )
+        if( h->stat.i_frame_count[i_slice] > 0 )
         {
-            const int i_count = h->stat.i_slice_count[i_slice];
+            const int i_count = h->stat.i_frame_count[i_slice];
             if( h->param.analyse.b_psnr )
             {
                 x264_log( h, X264_LOG_INFO,
-                          "slice %s:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
+                          "frame %s:%-5d Avg QP:%5.2f  size:%6.0f  PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n",
                           slice_name[i_slice],
                           i_count,
-                          h->stat.f_slice_qp[i_slice] / i_count,
-                          (double)h->stat.i_slice_size[i_slice] / i_count,
+                          h->stat.f_frame_qp[i_slice] / i_count,
+                          (double)h->stat.i_frame_size[i_slice] / i_count,
                           h->stat.f_psnr_mean_y[i_slice] / i_count, h->stat.f_psnr_mean_u[i_slice] / i_count, h->stat.f_psnr_mean_v[i_slice] / i_count,
                           h->stat.f_psnr_average[i_slice] / i_count,
                           x264_psnr( h->stat.i_ssd_global[i_slice], i_count * i_yuv_size ) );
@@ -1824,15 +2652,15 @@ void    x264_encoder_close  ( x264_t *h )
             else
             {
                 x264_log( h, X264_LOG_INFO,
-                          "slice %s:%-5d Avg QP:%5.2f  size:%6.0f\n",
+                          "frame %s:%-5d Avg QP:%5.2f  size:%6.0f\n",
                           slice_name[i_slice],
                           i_count,
-                          h->stat.f_slice_qp[i_slice] / i_count,
-                          (double)h->stat.i_slice_size[i_slice] / i_count );
+                          h->stat.f_frame_qp[i_slice] / i_count,
+                          (double)h->stat.i_frame_size[i_slice] / i_count );
             }
         }
     }
-    if( h->param.i_bframe && h->stat.i_slice_count[SLICE_TYPE_P] )
+    if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_P] )
     {
         char *p = buf;
         int den = 0;
@@ -1852,17 +2680,17 @@ void    x264_encoder_close  ( x264_t *h )
         }
 
     /* MB types used */
-    if( h->stat.i_slice_count[SLICE_TYPE_I] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0;
         x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
         x264_log( h, X264_LOG_INFO, "mb I  %s\n", buf );
     }
-    if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0;
         int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P];
         x264_print_intra( i_mb_count, i_count, b_print_pcm, buf );
         x264_log( h, X264_LOG_INFO,
@@ -1875,10 +2703,10 @@ void    x264_encoder_close  ( x264_t *h )
                   i_mb_size[PIXEL_4x4] / (i_count*4),
                   i_mb_count[P_SKIP] / i_count );
     }
-    if( h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
     {
         int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B];
-        double i_count = h->stat.i_slice_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
+        double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0;
         double i_mb_list_count;
         int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B];
         int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
@@ -1911,7 +2739,7 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_ratecontrol_summary( h );
 
-    if( h->stat.i_slice_count[SLICE_TYPE_I] + h->stat.i_slice_count[SLICE_TYPE_P] + h->stat.i_slice_count[SLICE_TYPE_B] > 0 )
+    if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 )
     {
 #define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B])
 #define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o])
@@ -1919,35 +2747,76 @@ void    x264_encoder_close  ( x264_t *h )
         int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 )
                                  + SUM3b( h->stat.i_mb_count, I_16x16 );
         int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM);
-        const int i_count = h->stat.i_slice_count[SLICE_TYPE_I] +
-                            h->stat.i_slice_count[SLICE_TYPE_P] +
-                            h->stat.i_slice_count[SLICE_TYPE_B];
+        const int i_count = h->stat.i_frame_count[SLICE_TYPE_I] +
+                            h->stat.i_frame_count[SLICE_TYPE_P] +
+                            h->stat.i_frame_count[SLICE_TYPE_B];
         int64_t i_mb_count = i_count * h->mb.i_mb_count;
         float fps = (float) h->param.i_fps_num / h->param.i_fps_den;
-        float f_bitrate = fps * SUM3(h->stat.i_slice_size) / i_count / 125;
+        float f_bitrate = fps * SUM3(h->stat.i_frame_size) / i_count / 125;
 
         if( h->pps->b_transform_8x8_mode )
         {
-            x264_log( h, X264_LOG_INFO, "8x8 transform  intra:%.1f%%  inter:%.1f%%\n",
-                      100. * i_i8x8 / i_intra,
-                      100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+            buf[0] = 0;
+            if( h->stat.i_mb_count_8x8dct[0] )
+                sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] );
+            x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / i_intra, buf );
         }
 
         if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO
-            && h->stat.i_slice_count[SLICE_TYPE_B] )
+            && h->stat.i_frame_count[SLICE_TYPE_B] )
         {
-            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%%  temporal:%.1f%%\n",
-                      h->stat.i_direct_frames[1] * 100. / h->stat.i_slice_count[SLICE_TYPE_B],
-                      h->stat.i_direct_frames[0] * 100. / h->stat.i_slice_count[SLICE_TYPE_B] );
+            x264_log( h, X264_LOG_INFO, "direct mvs  spatial:%.1f%% temporal:%.1f%%\n",
+                      h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B],
+                      h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] );
         }
 
-        x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra:%.1f%% %.1f%% %.1f%% inter:%.1f%% %.1f%% %.1f%%\n",
+        buf[0] = 0;
+        if( i_mb_count != i_all_intra )
+            sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%",
+                     h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
+                     h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)  ),
+                     h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+        x264_log( h, X264_LOG_INFO, "coded y,uvDC,uvAC intra: %.1f%% %.1f%% %.1f%%%s\n",
                   h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4),
                   h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra  ),
-                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra  ),
-                  h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4),
-                  h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)  ),
-                  h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)) );
+                  h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra  ), buf );
+
+        int64_t fixed_pred_modes[3][9] = {{0}};
+        int64_t sum_pred_modes[3] = {0};
+        for( i = 0; i <= I_PRED_16x16_DC_128; i++ )
+        {
+            fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i];
+            sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i];
+        }
+        if( sum_pred_modes[0] )
+            x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n",
+                      fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0],
+                      fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] );
+        for( i = 1; i <= 2; i++ )
+        {
+            for( j = 0; j <= I_PRED_8x8_DC_128; j++ )
+            {
+                fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j];
+                sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j];
+            }
+            if( sum_pred_modes[i] )
+                x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4,
+                          fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i],
+                          fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] );
+        }
+
+        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->stat.i_frame_count[SLICE_TYPE_P] > 0 )
+            x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%\n",
+                      h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] );
 
         for( i_list = 0; i_list < 2; i_list++ )
         {
@@ -1967,7 +2836,7 @@ void    x264_encoder_close  ( x264_t *h )
                     continue;
                 for( i = 0; i <= i_max; i++ )
                     p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den );
-                x264_log( h, X264_LOG_INFO, "ref %c L%d %s\n", "PB"[i_slice], i_list, buf );
+                x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf );
             }
         }
 
@@ -1989,7 +2858,7 @@ void    x264_encoder_close  ( x264_t *h )
                       f_bitrate );
         }
         else
-            x264_log( h, X264_LOG_INFO, "kb/s:%.1f\n", f_bitrate );
+            x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate );
     }
 
     /* rc */
@@ -2002,26 +2871,17 @@ void    x264_encoder_close  ( x264_t *h )
         free( h->param.rc.psz_stat_in );
 
     x264_cqm_delete( h );
+    x264_free( h->nal_buffer );
+    x264_analyse_free_costs( h );
 
-    if( h->param.i_threads > 1)
-        h = h->thread[ h->i_thread_phase % h->param.i_threads ];
+    if( h->i_thread_frames > 1)
+        h = h->thread[h->i_thread_phase];
 
     /* frames */
-    for( i = 0; h->frames.current[i]; i++ )
-    {
-        assert( h->frames.current[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.current[i] );
-    }
-    for( i = 0; h->frames.next[i]; i++ )
-    {
-        assert( h->frames.next[i]->i_reference_count == 1 );
-        x264_frame_delete( h->frames.next[i] );
-    }
-    for( i = 0; h->frames.unused[i]; i++ )
-    {
-        assert( h->frames.unused[i]->i_reference_count == 0 );
-        x264_frame_delete( h->frames.unused[i] );
-    }
+    x264_frame_delete_list( h->frames.unused[0] );
+    x264_frame_delete_list( h->frames.unused[1] );
+    x264_frame_delete_list( h->frames.current );
+    x264_frame_delete_list( h->frames.blank_unused );
 
     h = h->thread[0];
 
@@ -2029,21 +2889,50 @@ void    x264_encoder_close  ( x264_t *h )
     {
         x264_frame_t **frame;
 
-        for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+        if( !h->param.b_sliced_threads || i == 0 )
         {
+            for( frame = h->thread[i]->frames.reference; *frame; frame++ )
+            {
+                assert( (*frame)->i_reference_count > 0 );
+                (*frame)->i_reference_count--;
+                if( (*frame)->i_reference_count == 0 )
+                    x264_frame_delete( *frame );
+            }
+            frame = &h->thread[i]->fdec;
             assert( (*frame)->i_reference_count > 0 );
             (*frame)->i_reference_count--;
             if( (*frame)->i_reference_count == 0 )
                 x264_frame_delete( *frame );
+            x264_macroblock_cache_end( h->thread[i] );
         }
-        frame = &h->thread[i]->fdec;
-        assert( (*frame)->i_reference_count > 0 );
-        (*frame)->i_reference_count--;
-        if( (*frame)->i_reference_count == 0 )
-            x264_frame_delete( *frame );
-
-        x264_macroblock_cache_end( h->thread[i] );
+        x264_free( h->thread[i]->scratch_buffer );
         x264_free( h->thread[i]->out.p_bitstream );
+        x264_free( h->thread[i]->out.nal);
         x264_free( h->thread[i] );
     }
 }
+
+/****************************************************************************
+ * x264_encoder_delayed_frames:
+ ****************************************************************************/
+int x264_encoder_delayed_frames( x264_t *h )
+{   /* Sum of: b_thread_active over the frame threads, entries in frames.current, and the three lookahead list sizes. */
+    int n_pending = 0;
+    int t, c = 0;
+    if( h->i_thread_frames > 1 )
+    {
+        for( t = 0; t < h->i_thread_frames; t++ )
+            n_pending += h->thread[t]->b_thread_active;
+        h = h->thread[h->i_thread_phase]; /* the remaining counts are read from this thread context */
+    }
+    while( h->frames.current[c] ) /* frames.current is walked up to its first NULL entry */
+        c++;
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    n_pending += c + h->lookahead->ofbuf.i_size + h->lookahead->next.i_size + h->lookahead->ifbuf.i_size;
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+    return n_pending;
+}
diff --git a/encoder/lookahead.c b/encoder/lookahead.c
new file mode 100644
index 0000000..b66eedc
--- /dev/null
+++ b/encoder/lookahead.c
@@ -0,0 +1,244 @@
+/*****************************************************************************
+ * lookahead.c: Lookahead slicetype decisions for x264
+ *****************************************************************************
+ * Lookahead.c and associated modifications:
+ *     Copyright (C) 2008 Avail Media
+ *
+ * Authors: Michael Kazmier <mkazmier at availmedia.com>
+ *          Alex Giladi <agiladi at availmedia.com>
+ *          Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+/* LOOKAHEAD (threaded and non-threaded mode)
+ *
+ * Lookahead types:
+ *     [1] Slice type / scene cut;
+ *
+ * In non-threaded mode, we run the existing slicetype decision code as it was.
+ * In threaded mode, we run in a separate thread, that lives between the calls
+ * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for
+ * the number of frames specified in rc_lookahead.  Recommended setting is
+ * # of bframes + # of threads.
+ */
+#include "common/common.h"
+#include "common/cpu.h"
+#include "analyse.h"
+
+static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count )
+{   /* Append `count` frames obtained via x264_frame_shift( src->list ) to dst; if any moved, wake waiters on both lists. */
+    int n_left;
+    for( n_left = count; n_left; n_left-- )
+    {
+        assert( dst->i_size < dst->i_max_size );
+        assert( src->i_size );
+        x264_frame_t *moved = x264_frame_shift( src->list );
+        dst->list[ dst->i_size++ ] = moved;
+        src->i_size--;
+    }
+    if( !count )
+        return;
+    x264_pthread_cond_broadcast( &dst->cv_fill );
+    x264_pthread_cond_broadcast( &src->cv_empty );
+}
+
+static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb )
+{   /* Hand the previous last_nonb (if any) to x264_frame_push_unused, then store new_nonb and bump its reference count. */
+    x264_frame_t *prev_nonb = h->lookahead->last_nonb;
+    if( prev_nonb ) x264_frame_push_unused( h, prev_nonb );
+    h->lookahead->last_nonb = new_nonb;
+    new_nonb->i_reference_count += 1;
+}
+
+#ifdef HAVE_PTHREAD
+static void x264_lookahead_slicetype_decide( x264_t *h )
+{   /* Run the slicetype decision, record next.list[0] as last_nonb, then move frames from `next` to `ofbuf`. */
+    x264_stack_align( x264_slicetype_decide, h );
+
+    x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
+
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) /* wait on cv_empty while ofbuf is at i_max_size */
+        x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex );
+
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex ); /* taken while ofbuf.mutex is still held: order is ofbuf, then next */
+    x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+
+    /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+    if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+        x264_stack_align( x264_slicetype_analyse, h, 1 ); /* runs with ofbuf.mutex still held */
+
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+}
+
+static void *x264_lookahead_thread( x264_t *h )
+{   /* Lookahead thread entry point; returns void * (always NULL) so its type matches the start routine pthread_create calls. */
+#ifdef HAVE_MMX
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif
+    while( !h->lookahead->b_exit_thread )
+    {
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+        int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size ); /* free room in `next`, capped by what ifbuf holds */
+        x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift );
+        x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+        if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length )
+        {   /* not more than i_slicetype_length frames queued: sleep until input arrives or exit is requested */
+            while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread )
+                x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex );
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        }
+        else
+        {   /* ifbuf.mutex is released before the decision so it is not held across that call */
+            x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+            x264_lookahead_slicetype_decide( h );
+        }
+    }   /* end of input frames */
+    x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size ); /* move everything left in ifbuf */
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+    while( h->lookahead->next.i_size ) /* keep deciding until `next` is empty */
+        x264_lookahead_slicetype_decide( h );
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    h->lookahead->b_thread_active = 0;
+    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill ); /* wake anything waiting on cv_fill so it can observe b_thread_active == 0 */
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+    return NULL;
+}
+#endif
+
+int x264_lookahead_init( x264_t *h, int i_slicetype_length )
+{   /* Allocate the lookahead state, attach it to thread[0..i_threads-1], and start the lookahead thread if i_sync_lookahead is set. Returns 0 on success, -1 on failure. */
+    x264_lookahead_t *look;
+    CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) ); /* presumably jumps to `fail` on allocation failure -- confirm against the macro */
+    int i;
+    for( i = 0; i < h->param.i_threads; i++ )
+        h->thread[i]->lookahead = look; /* every thread context points at the same x264_lookahead_t */
+
+    look->i_last_keyframe = - h->param.i_keyint_max;
+    look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead))
+                               && !h->param.rc.b_stat_read;
+    look->i_slicetype_length = i_slicetype_length;
+
+    /* init frame lists */
+    if( x264_synch_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) ||
+        x264_synch_frame_list_init( &look->next, h->frames.i_delay+3 ) ||
+        x264_synch_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) )
+        goto fail;
+
+    if( !h->param.i_sync_lookahead )
+        return 0; /* no lookahead thread requested: only the lists are set up */
+
+    x264_t *look_h = h->thread[h->param.i_threads]; /* the slot just past the i_threads contexts is used for the lookahead thread */
+    *look_h = *h; /* struct copy of h */
+    if( x264_macroblock_cache_init( look_h ) )
+        goto fail;
+
+    if( x264_pthread_create( &look_h->thread_handle, NULL, (void *)x264_lookahead_thread, look_h ) )
+        goto fail;
+    look->b_thread_active = 1;
+
+    return 0;
+fail: /* NOTE(review): only `look` is freed; thread[i]->lookahead still points at it and lists initialised above are not deleted -- confirm callers */
+    x264_free( look );
+    return -1;
+}
+
+void x264_lookahead_delete( x264_t *h )
+{   /* Stop and join the lookahead thread (when i_sync_lookahead is set), then release the three frame lists and the lookahead state. */
+    if( h->param.i_sync_lookahead )
+    {
+        x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex );
+        h->lookahead->b_exit_thread = 1; /* set under ifbuf.mutex, the mutex the thread's cv_fill wait uses */
+        x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill ); /* wake the thread if it is waiting for input */
+        x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex );
+        x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL ); /* thread context lives at index i_threads */
+        x264_macroblock_cache_end( h->thread[h->param.i_threads] );
+        x264_free( h->thread[h->param.i_threads]->scratch_buffer );
+        x264_free( h->thread[h->param.i_threads] );
+    }
+    x264_synch_frame_list_delete( &h->lookahead->ifbuf );
+    x264_synch_frame_list_delete( &h->lookahead->next );
+    if( h->lookahead->last_nonb ) /* hand the last_nonb frame, if any, to x264_frame_push_unused */
+        x264_frame_push_unused( h, h->lookahead->last_nonb );
+    x264_synch_frame_list_delete( &h->lookahead->ofbuf );
+    x264_free( h->lookahead ); /* NOTE(review): other thread contexts appear to share this pointer (see x264_lookahead_init) -- call once only */
+}
+
+void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame )
+{
+    /* Push the frame onto ifbuf when i_sync_lookahead is non-zero, otherwise directly onto next. */
+    x264_synch_frame_list_t *queue = h->param.i_sync_lookahead ? &h->lookahead->ifbuf
+                                                                : &h->lookahead->next;
+    x264_synch_frame_list_push( queue, frame );
+}
+
+int x264_lookahead_is_empty( x264_t *h )
+{   /* Returns 1 when both next and ofbuf have i_size == 0, otherwise 0; ifbuf is not inspected. */
+    int b_any_queued;
+    x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+    x264_pthread_mutex_lock( &h->lookahead->next.mutex );
+    b_any_queued = h->lookahead->next.i_size || h->lookahead->ofbuf.i_size;
+    x264_pthread_mutex_unlock( &h->lookahead->next.mutex );
+    x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+    return !b_any_queued;
+}
+
+static void x264_lookahead_encoder_shift( x264_t *h )
+{   /* If ofbuf is non-empty, push ofbuf.list[0]->i_bframes + 1 frames from ofbuf onto frames.current and signal cv_empty. */
+    int n_left;
+    if( h->lookahead->ofbuf.i_size == 0 )
+        return;
+    for( n_left = h->lookahead->ofbuf.list[0]->i_bframes + 1; n_left; n_left-- )
+    {
+        x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) );
+        h->lookahead->ofbuf.i_size--;
+    }
+    x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty );
+}
+
+void x264_lookahead_get_frames( x264_t *h )
+{   /* Refill h->frames.current from the lookahead, either from the lookahead thread's ofbuf or by deciding slice types inline. */
+    if( h->param.i_sync_lookahead )
+    {   /* We have a lookahead thread, so get frames from there */
+        x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex );
+        while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active ) /* wait for output unless the thread has cleared b_thread_active */
+            x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex );
+        x264_lookahead_encoder_shift( h ); /* called with ofbuf.mutex held */
+        x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex );
+    }
+    else
+    {   /* We are not running a lookahead thread, so perform the slicetype decision on the fly */
+
+        if( h->frames.current[0] || !h->lookahead->next.i_size ) /* nothing to do: current already has a frame, or next is empty */
+            return;
+
+        x264_stack_align( x264_slicetype_decide, h );
+        x264_lookahead_update_last_nonb( h, h->lookahead->next.list[0] );
+        x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, h->lookahead->next.list[0]->i_bframes + 1 );
+
+        /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */
+        if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) )
+            x264_stack_align( x264_slicetype_analyse, h, 1 );
+
+        x264_lookahead_encoder_shift( h ); /* no mutex is taken in this branch */
+    }
+}
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 1f346e0..e4edb8a 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -27,8 +27,8 @@
 
 /* These chroma DC functions don't have assembly versions and are only used here. */
 
-#define ZIG(i,y,x) level[i] = dct[x][y];
-static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
+#define ZIG(i,y,x) level[i] = dct[x*2+y];
+static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[4] )
 {
     ZIG(0,0,0)
     ZIG(1,0,1)
@@ -38,11 +38,11 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
 #undef ZIG
 
 #define IDCT_DEQUANT_START \
-    int d0 = dct[0][0] + dct[0][1]; \
-    int d1 = dct[1][0] + dct[1][1]; \
-    int d2 = dct[0][0] - dct[0][1]; \
-    int d3 = dct[1][0] - dct[1][1]; \
-    int dmf = dequant_mf[i_qp%6][0][0]; \
+    int d0 = dct[0] + dct[1]; \
+    int d1 = dct[2] + dct[3]; \
+    int d2 = dct[0] - dct[1]; \
+    int d3 = dct[2] - dct[3]; \
+    int dmf = dequant_mf[i_qp%6][0]; \
     int qbits = i_qp/6 - 5; \
     if( qbits > 0 ) \
     { \
@@ -50,50 +50,62 @@ static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
         qbits = 0; \
     }
 
-static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dc( int16_t dct[4], int16_t dct4x4[4][16], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
-    dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
-    dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
-    dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
+    dct4x4[0][0] = (d0 + d1) * dmf >> -qbits;
+    dct4x4[1][0] = (d0 - d1) * dmf >> -qbits;
+    dct4x4[2][0] = (d2 + d3) * dmf >> -qbits;
+    dct4x4[3][0] = (d2 - d3) * dmf >> -qbits;
 }
 
-static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
+static inline void idct_dequant_2x2_dconly( int16_t out[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
 {
     IDCT_DEQUANT_START
-    dct[0][0] = (d0 + d1) * dmf >> -qbits;
-    dct[0][1] = (d0 - d1) * dmf >> -qbits;
-    dct[1][0] = (d2 + d3) * dmf >> -qbits;
-    dct[1][1] = (d2 - d3) * dmf >> -qbits;
+    out[0] = (d0 + d1) * dmf >> -qbits;
+    out[1] = (d0 - d1) * dmf >> -qbits;
+    out[2] = (d2 + d3) * dmf >> -qbits;
+    out[3] = (d2 - d3) * dmf >> -qbits;
 }
 
-static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
+static inline void dct2x2dc( int16_t d[4], int16_t dct4x4[4][16] )
 {
-    int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
-    int d1 = dct4x4[2][0][0] + dct4x4[3][0][0];
-    int d2 = dct4x4[0][0][0] - dct4x4[1][0][0];
-    int d3 = dct4x4[2][0][0] - dct4x4[3][0][0];
-    d[0][0] = d0 + d1;
-    d[1][0] = d2 + d3;
-    d[0][1] = d0 - d1;
-    d[1][1] = d2 - d3;
-    dct4x4[0][0][0] = 0;
-    dct4x4[1][0][0] = 0;
-    dct4x4[2][0][0] = 0;
-    dct4x4[3][0][0] = 0;
+    int d0 = dct4x4[0][0] + dct4x4[1][0];
+    int d1 = dct4x4[2][0] + dct4x4[3][0];
+    int d2 = dct4x4[0][0] - dct4x4[1][0];
+    int d3 = dct4x4[2][0] - dct4x4[3][0];
+    d[0] = d0 + d1;
+    d[2] = d2 + d3;
+    d[1] = d0 - d1;
+    d[3] = d2 - d3;
+    dct4x4[0][0] = 0;
+    dct4x4[1][0] = 0;
+    dct4x4[2][0] = 0;
+    dct4x4[3][0] = 0;
 }
 
-static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+static inline void dct2x2dc_dconly( int16_t d[4] )
+{   /* In-place butterfly on four values: sums/differences of (d[0],d[1]) and (d[2],d[3]), then of those. */
+    int s01 = d[0] + d[1];
+    int s23 = d[2] + d[3];
+    int t01 = d[0] - d[1];
+    int t23 = d[2] - d[3];
+    d[0] = s01 + s23;
+    d[1] = s01 - s23;
+    d[2] = t01 + t23;
+    d[3] = t01 - t23;
+}
+
+static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
     if( h->mb.b_trellis )
-        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
+        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, 0, idx );
     else
         return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 }
 
-static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
+static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[64], int i_qp, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
     if( h->mb.b_trellis )
@@ -118,12 +130,11 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
     int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
-    DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
 
     if( h->mb.b_lossless )
     {
-        h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
-        nz = array_non_zero( h->dct.luma4x4[idx] );
+        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
         h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
         h->mb.i_cbp_luma |= nz<<(idx>>2);
         return;
@@ -144,8 +155,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
 
 #define STORE_8x8_NNZ(idx,nz)\
 {\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
+}
+
+#define CLEAR_16x16_NNZ \
+{\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
 }
 
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
@@ -155,12 +174,11 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
     int nz;
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+    ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
 
     if( h->mb.b_lossless )
     {
-        h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
-        nz = array_non_zero( h->dct.luma8x8[idx] );
+        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
         STORE_8x8_NNZ(idx,nz);
         h->mb.i_cbp_luma |= nz<<idx;
         return;
@@ -186,8 +204,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
-    DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
+    ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
 
     int i, nz;
     int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
@@ -199,10 +217,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         {
             int oe = block_idx_xy_fenc[i];
             int od = block_idx_xy_fdec[i];
-            h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
-            dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
-            h->dct.luma4x4[i][0] = 0;
-            nz = array_non_zero( h->dct.luma4x4[i] );
+            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
             h->mb.i_cbp_luma |= nz;
         }
@@ -217,8 +232,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     for( i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
-        dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
-        dct4x4[i][0][0] = 0;
+        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
+        dct4x4[i][0] = 0;
 
         /* quant/scan/dequant */
         nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
@@ -237,15 +252,12 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     if( decimate_score < 6 )
     {
         h->mb.i_cbp_luma = 0;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+        CLEAR_16x16_NNZ
     }
 
     h->dctf.dct4x4dc( dct_dc4x4 );
     if( h->mb.b_trellis )
-        nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
+        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
     else
         nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );
 
@@ -259,7 +271,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
         if( h->mb.i_cbp_luma )
             for( i = 0; i < 16; i++ )
-                dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
+                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
     }
 
     /* put pixels to fdec */
@@ -269,12 +281,120 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
 }
 
+static inline int idct_dequant_round_2x2_dc( int16_t ref[4], int16_t dct[4], int dequant_mf[6][16], int i_qp )
+{
+    int16_t out[4];
+    idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
+    return ((ref[0] ^ (out[0]+32))
+          | (ref[1] ^ (out[1]+32))
+          | (ref[2] ^ (out[2]+32))
+          | (ref[3] ^ (out[3]+32))) >> 6;
+}
+
+/* Round down coefficients losslessly in DC-only chroma blocks.
+ * Unlike luma blocks, this can't be done with a lookup table or
+ * other shortcut technique because of the interdependencies
+ * between the coefficients due to the chroma DC transform. */
+static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, int16_t dct2x2[4] )
+{
+    int16_t dct2x2_orig[4];
+    int coeff;
+    int nz = 0;
+
+    /* If the QP is too high, there's no benefit to rounding optimization. */
+    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
+        return 1;
+
+    idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+    dct2x2_orig[0] += 32;
+    dct2x2_orig[1] += 32;
+    dct2x2_orig[2] += 32;
+    dct2x2_orig[3] += 32;
+
+    /* If the DC coefficients already round to zero, terminate early. */
+    if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
+        return 0;
+
+    /* Start with the highest frequency coefficient... is this the best option? */
+    for( coeff = 3; coeff >= 0; coeff-- )
+    {
+        int sign = dct2x2[coeff] < 0 ? -1 : 1;
+        int level = dct2x2[coeff];
+
+        if( !level )
+            continue;
+
+        while( level )
+        {
+            dct2x2[coeff] = level - sign;
+            if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+                break;
+            level -= sign;
+        }
+
+        nz |= level;
+        dct2x2[coeff] = level;
+    }
+
+    return !!nz;
+}
+
 void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
     int i, ch, nz, nz_dc;
     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
+    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
     h->mb.i_cbp_chroma = 0;
 
+    /* Early termination: check variance of chroma residual before encoding.
+     * Don't bother trying early termination at low QPs.
+     * Values are experimentally derived. */
+    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
+    {
+        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int ssd[2];
+        int score  = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+        if( score < thresh*4 )
+        {
+            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[25]] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[26]] = 0;
+            for( ch = 0; ch < 2; ch++ )
+            {
+                if( ssd[ch] > thresh )
+                {
+                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+                    dct2x2dc_dconly( dct2x2 );
+                    if( h->mb.b_trellis )
+                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
+                    else
+                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<
+    1 );
+
+                    if( nz_dc )
+                    {
+                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+                            continue;
+                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
+                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
+                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+                        h->mb.i_cbp_chroma = 1;
+                    }
+                }
+            }
+            return;
+        }
+    }
+
     for( ch = 0; ch < 2; ch++ )
     {
         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
@@ -282,8 +402,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         int i_decimate_score = 0;
         int nz_ac = 0;
 
-        DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
-        DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+        ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
 
         if( h->mb.b_lossless )
         {
@@ -291,10 +410,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             {
                 int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                 int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
-                h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
-                h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
-                h->dct.luma4x4[16+i+ch*4][0] = 0;
-                nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
+                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
                 h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
                 h->mb.i_cbp_chroma |= nz;
             }
@@ -308,7 +424,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         for( i = 0; i < 4; i++ )
         {
             if( h->mb.b_trellis )
-                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
+                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
             else
                 nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
             h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
@@ -323,7 +439,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         }
 
         if( h->mb.b_trellis )
-            nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
+            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
         else
             nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
 
@@ -338,9 +454,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
             if( !nz_dc ) /* Whole block is empty */
                 continue;
+            if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+            {
+                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
+                continue;
+            }
             /* DC-only */
             zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-            idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
             h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
         }
         else
@@ -387,15 +508,25 @@ static void x264_macroblock_encode_pskip( x264_t *h )
     {
         h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                        h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
-                       mvx, mvy, 16, 16 );
+                       mvx, mvy, 16, 16, &h->sh.weight[0][0] );
 
         h->mc.mc_chroma( h->mb.pic.p_fdec[1],       FDEC_STRIDE,
                          h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                          mvx, mvy, 8, 8 );
 
+        if( h->sh.weight[0][1].weightfn )
+            h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+                                               h->mb.pic.p_fdec[1], FDEC_STRIDE,
+                                               &h->sh.weight[0][1], 8 );
+
         h->mc.mc_chroma( h->mb.pic.p_fdec[2],       FDEC_STRIDE,
                          h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
                          mvx, mvy, 8, 8 );
+
+        if( h->sh.weight[0][2].weightfn )
+            h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                               h->mb.pic.p_fdec[2], FDEC_STRIDE,
+                                               &h->sh.weight[0][2], 8 );
     }
 
     x264_macroblock_encode_skip( h );
@@ -529,16 +660,16 @@ void x264_macroblock_encode( x264_t *h )
     }
     else if( h->mb.i_type == I_8x8 )
     {
-        DECLARE_ALIGNED_16( uint8_t edge[33] );
+        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
         h->mb.b_transform_8x8 = 1;
         /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
             h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
             /* In RD mode, restore the now-overwritten DCT data. */
             if( h->mb.i_skip_intra == 2 )
@@ -565,10 +696,10 @@ void x264_macroblock_encode( x264_t *h )
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
-            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
+            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
             h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
             /* In RD mode, restore the now-overwritten DCT data. */
             if( h->mb.i_skip_intra == 2 )
@@ -581,7 +712,7 @@ void x264_macroblock_encode( x264_t *h )
 
             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
 
             if( h->mb.b_lossless )
                 x264_predict_lossless_4x4( h, p_dst, i, i_mode );
@@ -606,27 +737,25 @@ void x264_macroblock_encode( x264_t *h )
                 {
                     int x = 8*(i8x8&1);
                     int y = 8*(i8x8>>1);
-                    h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
+                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
                                         h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
                                         h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
-                    nz = array_non_zero( h->dct.luma8x8[i8x8] );
                     STORE_8x8_NNZ(i8x8,nz);
                     h->mb.i_cbp_luma |= nz << i8x8;
                 }
             else
                 for( i4x4 = 0; i4x4 < 16; i4x4++ )
                 {
-                    h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
+                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                         h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                         h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
-                    nz = array_non_zero( h->dct.luma4x4[i4x4] );
                     h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
                     h->mb.i_cbp_luma |= nz << (i4x4>>2);
                 }
         }
         else if( h->mb.b_transform_8x8 )
         {
-            DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+            ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
             h->nr_count[1] += h->mb.b_noise_reduction * 4;
@@ -634,7 +763,7 @@ void x264_macroblock_encode( x264_t *h )
             for( idx = 0; idx < 4; idx++ )
             {
                 if( h->mb.b_noise_reduction )
-                    h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
+                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                 nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );
 
                 if( nz )
@@ -655,10 +784,7 @@ void x264_macroblock_encode( x264_t *h )
             if( i_decimate_mb < 6 && b_decimate )
             {
                 h->mb.i_cbp_luma = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                CLEAR_16x16_NNZ
             }
             else
             {
@@ -677,7 +803,7 @@ void x264_macroblock_encode( x264_t *h )
         }
         else
         {
-            DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+            ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
             h->nr_count[0] += h->mb.b_noise_reduction * 16;
 
@@ -692,7 +818,7 @@ void x264_macroblock_encode( x264_t *h )
                     idx = i8x8 * 4 + i4x4;
 
                     if( h->mb.b_noise_reduction )
-                        h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
+                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                     nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                     h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
 
@@ -727,10 +853,7 @@ void x264_macroblock_encode( x264_t *h )
                 if( i_decimate_mb < 6 )
                 {
                     h->mb.i_cbp_luma = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                    CLEAR_16x16_NNZ
                 }
                 else
                 {
@@ -775,7 +898,7 @@ void x264_macroblock_encode( x264_t *h )
     {
         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
             !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
-            *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
             && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
         {
             h->mb.i_type = P_SKIP;
@@ -796,9 +919,9 @@ void x264_macroblock_encode( x264_t *h )
  *****************************************************************************/
 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 {
-    DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
-    DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
-    DECLARE_ALIGNED_16( int16_t dctscan[16] );
+    ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
+    ALIGNED_ARRAY_16( int16_t, dct2x2,[4] );
+    ALIGNED_ARRAY_16( int16_t, dctscan,[16] );
 
     int i_qp = h->mb.i_qp;
     int mvp[2];
@@ -816,7 +939,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
         /* Motion compensation */
         h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                        h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
-                       mvp[0], mvp[1], 16, 16 );
+                       mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
     }
 
     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
@@ -852,6 +975,11 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
             h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
                              h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
                              mvp[0], mvp[1], 8, 8 );
+
+            if( h->sh.weight[0][1+ch].weightfn )
+                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+                                                      &h->sh.weight[0][1+ch], 8 );
         }
 
         /* there is almost never a termination during chroma, but we can't avoid the check entirely */
@@ -928,15 +1056,15 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
     int nnz8x8 = 0;
     int ch, nz;
 
-    x264_mb_mc_8x8( h, i8 );
+    if( !h->mb.b_skip_mc )
+        x264_mb_mc_8x8( h, i8 );
 
     if( h->mb.b_lossless )
     {
         int i4;
         if( h->mb.b_transform_8x8 )
         {
-            h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
-            nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+            nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
             STORE_8x8_NNZ(i8,nnz8x8);
         }
         else
@@ -944,28 +1072,27 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             for( i4 = i8*4; i4 < i8*4+4; i4++ )
             {
                 int nz;
-                h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
+                nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                     h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                     h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
-                nz = array_non_zero( h->dct.luma4x4[i4] );
                 h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
                 nnz8x8 |= nz;
             }
         }
         for( ch = 0; ch < 2; ch++ )
         {
+            int16_t dc;
             p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
-            h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
-            h->dct.luma4x4[16+i8+ch*4][0] = 0;
-            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
+            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
+            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
         }
     }
     else
     {
         if( h->mb.b_transform_8x8 )
         {
-            DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+            ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
             h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
             nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
             if( nnz8x8 )
@@ -991,7 +1118,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             int i4;
             int i_decimate_8x8 = 0;
-            DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+            ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[16] );
             h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
             for( i4 = 0; i4 < 4; i4++ )
             {
@@ -1020,15 +1147,15 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
 
         for( ch = 0; ch < 2; ch++ )
         {
-            DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+            ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
             p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
-            dct4x4[0][0] = 0;
+            dct4x4[0] = 0;
 
             if( h->mb.b_trellis )
-                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
+                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
             else
                 nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
 
@@ -1054,21 +1181,18 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
     int i_qp = h->mb.i_qp;
     uint8_t *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
     uint8_t *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
-    const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]];
-    const int mvx   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
-    const int mvy   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
     int nz;
 
-    h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
+    /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
 
     if( h->mb.b_lossless )
     {
-        h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
-        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
+        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
+        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
     }
     else
     {
-        DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+        ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
         nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
         h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 7b9f08a..25beb18 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -27,7 +27,7 @@
 #include "common/macroblock.h"
 
 extern const int x264_lambda2_tab[52];
-extern const int x264_lambda_tab[52];
+extern const uint8_t x264_lambda_tab[52];
 
 void x264_rdo_init( void );
 
@@ -45,7 +45,7 @@ void x264_predict_lossless_16x16( x264_t *h, int i_mode );
 
 void x264_macroblock_encode      ( x264_t *h );
 void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
-void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cavlc ( x264_t *h );
 
 void x264_macroblock_encode_p8x8( x264_t *h, int i8 );
 void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
@@ -56,10 +56,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
 int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
-                             int i_qp, int i_ctxBlockCat, int b_intra );
-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
-                             int i_qp, int i_ctxBlockCat, int b_intra, int idx );
-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma );
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx );
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                              int i_qp, int b_intra, int idx );
 
 void x264_noise_reduction_update( x264_t *h );
diff --git a/encoder/me.c b/encoder/me.c
index f13e84b..f58a6a8 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -32,7 +32,7 @@
  * and refine_* are run only on the winner.
  * the subme=8,9 values are much higher because any amount of satd search makes
  * up its time by reducing the number of qpel-rd iterations. */
-static const int subpel_iterations[][4] =
+static const uint8_t subpel_iterations[][4] =
    {{0,0,0,0},
     {1,1,0,0},
     {0,1,1,0},
@@ -42,10 +42,11 @@ static const int subpel_iterations[][4] =
     {0,0,2,2},
     {0,0,2,2},
     {0,0,4,10},
+    {0,0,4,10},
     {0,0,4,10}};
 
 /* (x-1)%6 */
-static const int mod6m1[8] = {5,0,1,2,3,4,5,0};
+static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
 static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
 static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
@@ -58,7 +59,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV( mx, my )\
 {\
     int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\
-                   &p_fref[(my)*stride+(mx)], stride )\
+                   &p_fref_w[(my)*stride+(mx)], stride )\
              + BITS_MVD(mx,my);\
     COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
 }
@@ -66,7 +67,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV_HPEL( mx, my ) \
 { \
     int stride2 = 16; \
-    uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \
+    uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
     COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
@@ -74,7 +75,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
 {\
-    uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+    uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
     h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
         pix_base + (m0x) + (m0y)*stride,\
         pix_base + (m1x) + (m1y)*stride,\
@@ -87,7 +88,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
 {\
-    uint8_t *pix_base = p_fref + bmx + bmy*stride;\
+    uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
     h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
         pix_base + (m0x) + (m0y)*stride,\
         pix_base + (m1x) + (m1y)*stride,\
@@ -102,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
 {\
-    uint8_t *pix_base = p_fref + omx + omy*stride;\
+    uint8_t *pix_base = p_fref_w + omx + omy*stride;\
     h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
         pix_base + (m0x) + (m0y)*stride,\
         pix_base + (m1x) + (m1y)*stride,\
@@ -122,9 +123,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
 {\
     h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
-        p_fref + (m0x) + (m0y)*stride,\
-        p_fref + (m1x) + (m1y)*stride,\
-        p_fref + (m2x) + (m2y)*stride,\
+        p_fref_w + (m0x) + (m0y)*stride,\
+        p_fref_w + (m1x) + (m1y)*stride,\
+        p_fref_w + (m2x) + (m2y)*stride,\
         stride, costs );\
     costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
     costs[1] += p_cost_mvx[(m1x)<<2];\
@@ -180,8 +181,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
     int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
     int omx, omy, pmx, pmy;
     uint8_t *p_fenc = m->p_fenc[0];
-    uint8_t *p_fref = m->p_fref[0];
-    DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+    uint8_t *p_fref_w = m->p_fref_w;
+    ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
 
     int i, j;
     int dir;
@@ -194,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
 
 #define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
 
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
 
     bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
     bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
@@ -210,7 +211,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
         COST_MV_HPEL( bmx, bmy );
         for( i = 0; i < i_mvc; i++ )
         {
-            if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
+            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
             {
                 int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                 int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
@@ -451,8 +452,8 @@ me_hex2:
 
             /* hexagon grid */
             omx = bmx; omy = bmy;
-            const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
-            const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
             i = 1;
             do
             {
@@ -477,7 +478,7 @@ me_hex2:
                 else
                 {
                     int dir = 0;
-                    uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride;
+                    uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride;
                     int dy = i*stride;
 #define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
                     h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
@@ -535,7 +536,7 @@ me_hex2:
                     }
                 }
             } while( ++i <= i_me_range/4 );
-            if( bmy <= mv_y_max )
+            if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min )
                 goto me_hex2;
             break;
         }
@@ -561,15 +562,14 @@ me_hex2:
              * because sum(abs(diff)) >= abs(diff(sum)). */
             uint16_t *sums_base = m->integral;
             /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
-             * unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any
-             * SSE instructions and the only loss is a tiny bit of performance. */
-            DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
-            DECLARE_ALIGNED_16( int enc_dc[4] );
+             * this is not a problem because it is not used for any SSE instructions. */
+            ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+            ALIGNED_ARRAY_16( int, enc_dc,[4] );
             int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
             int delta = x264_pixel_size[sad_size].w;
             int16_t *xs = h->scratch_buffer;
             int xn;
-            uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
 
             h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                 p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
@@ -587,7 +587,7 @@ me_hex2:
                 mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
                 int nmvsad = 0, limit;
                 int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
-                int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+                int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
                          + BITS_MVD( bmx, bmy );
                 for( my = min_y; my <= max_y; my++ )
                 {
@@ -599,7 +599,7 @@ me_hex2:
                                                cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
                     for( i=0; i<xn-2; i+=3 )
                     {
-                        uint8_t *ref = p_fref+min_x+my*stride;
+                        uint8_t *ref = p_fref_w+min_x+my*stride;
                         int sads[3];
                         h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
                         for( j=0; j<3; j++ )
@@ -609,8 +609,8 @@ me_hex2:
                             {
                                 COPY1_IF_LT( bsad, sad );
                                 mvsads[nmvsad].sad = sad + ycost;
-                                mvsads[nmvsad].mx = min_x+xs[i+j];
-                                mvsads[nmvsad].my = my;
+                                mvsads[nmvsad].mv[0] = min_x+xs[i+j];
+                                mvsads[nmvsad].mv[1] = my;
                                 nmvsad++;
                             }
                         }
@@ -618,14 +618,14 @@ me_hex2:
                     for( ; i<xn; i++ )
                     {
                         int mx = min_x+xs[i];
-                        int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride )
+                        int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
                                 + cost_fpel_mvx[xs[i]];
                         if( sad < bsad*sad_thresh>>3 )
                         {
                             COPY1_IF_LT( bsad, sad );
                             mvsads[nmvsad].sad = sad + ycost;
-                            mvsads[nmvsad].mx = mx;
-                            mvsads[nmvsad].my = my;
+                            mvsads[nmvsad].mv[0] = mx;
+                            mvsads[nmvsad].mv[1] = my;
                             nmvsad++;
                         }
                     }
@@ -633,42 +633,47 @@ me_hex2:
                 }
 
                 limit = i_me_range / 2;
-                if( nmvsad > limit*2 )
+                sad_thresh = bsad*sad_thresh>>3;
+                while( nmvsad > limit*2 && sad_thresh > bsad )
                 {
                     // halve the range if the domain is too large... eh, close enough
-                    bsad = bsad*(sad_thresh+8)>>4;
-                    for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+                    sad_thresh = (sad_thresh + bsad) >> 1;
+                    for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ );
                     for( j=i; j<nmvsad; j++ )
-                        if( mvsads[j].sad <= bsad )
+                    {
+                        uint32_t sad;
+                        if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
                         {
-                            /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
-                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                                *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
-                            else
-                                mvsads[i++] = mvsads[j];
+                            uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
+#ifdef WORDS_BIGENDIAN
+                            mvsad >>= 32;
+#endif
+                            sad = mvsad;
                         }
-                    nmvsad = i;
-                }
-                if( nmvsad > limit )
-                {
-                    for( i=0; i<limit; i++ )
-                    {
-                        int bj = i;
-                        int bsad = mvsads[bj].sad;
-                        for( j=i+1; j<nmvsad; j++ )
-                            COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
-                        if( bj > i )
+                        else
                         {
-                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                                XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
-                            else
-                                XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+                            sad = mvsads[j].sad;
+                            CP32( mvsads[i].mv, mvsads[j].mv );
+                            mvsads[i].sad = sad;
                         }
+                        i += (sad - (sad_thresh+1)) >> 31;
                     }
-                    nmvsad = limit;
+                    nmvsad = i;
+                }
+                while( nmvsad > limit )
+                {
+                    int bi = 0;
+                    for( i=1; i<nmvsad; i++ )
+                        if( mvsads[i].sad > mvsads[bi].sad )
+                            bi = i;
+                    nmvsad--;
+                    if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+                        CP64( &mvsads[bi], &mvsads[nmvsad] );
+                    else
+                        mvsads[bi] = mvsads[nmvsad];
                 }
                 for( i=0; i<nmvsad; i++ )
-                    COST_MV( mvsads[i].mx, mvsads[i].my );
+                    COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
             }
             else
             {
@@ -719,8 +724,6 @@ me_hex2:
         int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
         refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
     }
-    else if( m->mv[1] > h->mb.mv_max_spel[1] )
-        m->mv[1] = h->mb.mv_max_spel[1];
 }
 #undef COST_MV
 
@@ -729,16 +732,21 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
     int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
     int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
 
-    if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
+    if( m->i_pixel <= PIXEL_8x8 )
         m->cost -= m->i_ref_cost;
-	
+
     refine_subpel( h, m, hpel, qpel, NULL, 1 );
 }
 
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
+{
+    refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 );
+}
+
 #define COST_MV_SAD( mx, my ) \
 { \
     int stride = 16; \
-    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
     COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
@@ -748,17 +756,23 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 if( b_refine_qpel || (dir^1) != odir ) \
 { \
     int stride = 16; \
-    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+    uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
     if( b_chroma_me && cost < bcost ) \
     { \
-        h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
+        h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
+        if( m->weight[1].weightfn ) \
+            m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+                                                                  &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
         cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
         if( cost < bcost ) \
         { \
-            h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \
+            h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
             cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
+            if( m->weight[2].weightfn ) \
+                m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \
+                                                                      &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
         } \
     } \
     if( cost < bcost ) \
@@ -774,12 +788,13 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
     const int i_pixel = m->i_pixel;
     const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
+    const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
 
-    DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment
+    ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] );   // really 17x17, but round up for alignment
     int omx, omy;
     int i;
 
@@ -791,8 +806,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     /* try the subpel component of the predicted mv */
     if( hpel_iters && h->mb.i_subpel_refine < 3 )
     {
-        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
+        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
+        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
         if( (mx-bmx)|(my-bmy) )
             COST_MV_SAD( mx, my );
     }
@@ -804,8 +819,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         int costs[4];
         int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
         uint8_t *src0, *src1, *src2, *src3;
-        src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
-        src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
+        src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
+        src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
         src1 = src0 + stride;
         src3 = src2 + 1;
         h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
@@ -819,9 +834,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
     if( !b_refine_qpel )
     {
-        /* check for mvrange */
-        if( bmy > h->mb.mv_max_spel[1] )
-            bmy = h->mb.mv_max_spel[1];
         bcost = COST_MAX;
         COST_MV_SATD( bmx, bmy, -1 );
     }
@@ -845,6 +857,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     bdir = -1;
     for( i = qpel_iters; i > 0; i-- )
     {
+        if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
+            break;
         odir = bdir;
         omx = bmx;
         omy = bmy;
@@ -856,88 +870,32 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
             break;
     }
 
-    /* check for mvrange */
-    if( bmy > h->mb.mv_max_spel[1] )
-    {
-        bmy = h->mb.mv_max_spel[1];
-        bcost = COST_MAX;
-        COST_MV_SATD( bmx, bmy, -1 );
-    }
-
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;
     m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
 }
 
-#define BIME_CACHE( dx, dy ) \
+#define BIME_CACHE( dx, dy, list ) \
 { \
+    x264_me_t *m = m##list;\
     int i = 4 + 3*dx + dy; \
-    stride0[i] = bw;\
-    stride1[i] = bw;\
-    src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
-    src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+    int mvx = om##list##x+dx;\
+    int mvy = om##list##y+dy;\
+    stride##list[i] = bw;\
+    src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
+    if( rd )\
+    {\
+        h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+        h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+    }\
 }
 
-#define BIME_CACHE2(a,b) \
-    BIME_CACHE(a,b) \
-    BIME_CACHE(-(a),-(b))
-
 #define SATD_THRESH 17/16
 
-#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \
-if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
-{ \
-    int cost; \
-    int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
-    int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
-    visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
-    h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \
-    cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
-         + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
-         + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
-    if( rd ) \
-    { \
-        if( cost < bcost * SATD_THRESH ) \
-        { \
-            uint64_t costrd; \
-            if( cost < bcost ) \
-                bcost = cost; \
-            *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \
-            *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \
-            costrd = x264_rd_cost_part( h, i_lambda2, i8, m0->i_pixel ); \
-            if( costrd < bcostrd ) \
-            {\
-                bcostrd = costrd;\
-                bm0x = m0x;      \
-                bm0y = m0y;      \
-                bm1x = m1x;      \
-                bm1y = m1y;      \
-            }\
-        } \
-    } \
-    else if( cost < bcost ) \
-    {                  \
-        bcost = cost;  \
-        bm0x = m0x;    \
-        bm0y = m0y;    \
-        bm1x = m1x;    \
-        bm1y = m1y;    \
-    } \
-}
-
-#define CHECK_BIDIR(a,b,c,d) \
-    COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d)
-
-#define CHECK_BIDIR2(a,b,c,d) \
-    CHECK_BIDIR(a,b,c,d) \
-    CHECK_BIDIR(-(a),-(b),-(c),-(d))
-
-#define CHECK_BIDIR8(a,b,c,d) \
-    CHECK_BIDIR2(a,b,c,d) \
-    CHECK_BIDIR2(b,c,d,a) \
-    CHECK_BIDIR2(c,d,a,b) \
-    CHECK_BIDIR2(d,a,b,c)
+/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
+ * other than making its iteration count not a compile-time constant. */
+int x264_iter_kludge = 0;
 
 static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
 {
@@ -949,15 +907,22 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-    const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-    const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-    const int16_t *p_cost_m1y = m1->p_cost_mv - x264_clip3( m1->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-    DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
-    DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
-    DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+    const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+    const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+    const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+    const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+    ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
+    ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
     uint8_t *src0[9];
     uint8_t *src1[9];
+    uint8_t *pix  = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
+    uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
+    const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
+    const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+    const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
     int stride0[9];
     int stride1[9];
     int bm0x = m0->mv[0], om0x = bm0x;
@@ -966,19 +931,31 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     int bm1y = m1->mv[1], om1y = bm1y;
     int bcost = COST_MAX;
     int pass = 0;
+    int j;
+    int mc_list0 = 1, mc_list1 = 1;
     uint64_t bcostrd = COST_MAX64;
-
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
-
-    if( bm0y > h->mb.mv_max_spel[1] - 8 ||
-        bm1y > h->mb.mv_max_spel[1] - 8 )
+    ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
+    /* all permutations of an offset in up to 2 of the dimensions */
+    static const int8_t dia4d[33][4] = {
+        {0,0,0,0},
+        {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
+        {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0},
+        {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0},
+        {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1},
+        {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0},
+        {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0},
+        {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1},
+        {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0},
+    };
+
+    if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 ||
+        bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ||
+        bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 ||
+        bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 )
         return;
 
-    h->mc.memzero_aligned( visited, sizeof(visited) );
-
-    BIME_CACHE( 0, 0 );
-    CHECK_BIDIR( 0, 0, 0, 0 );
+    h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
 
     for( pass = 0; pass < 8; pass++ )
     {
@@ -986,27 +963,57 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
         /* doesn't do chroma ME. this probably doesn't matter, as the gains
          * from bidir ME are the same with and without chroma ME. */
 
-        BIME_CACHE2( 1, 0 );
-        BIME_CACHE2( 0, 1 );
-        BIME_CACHE2( 1, 1 );
-        BIME_CACHE2( 1,-1 );
+        if( mc_list0 )
+            for( j = x264_iter_kludge; j < 9; j++ )
+                BIME_CACHE( square1[j][0], square1[j][1], 0 );
+
+        if( mc_list1 )
+            for( j = x264_iter_kludge; j < 9; j++ )
+                BIME_CACHE( square1[j][0], square1[j][1], 1 );
 
-        CHECK_BIDIR8( 0, 0, 0, 1 );
-        CHECK_BIDIR8( 0, 0, 1, 1 );
-        CHECK_BIDIR2( 0, 1, 0, 1 );
-        CHECK_BIDIR2( 1, 0, 1, 0 );
-        CHECK_BIDIR8( 0, 0,-1, 1 );
-        CHECK_BIDIR2( 0,-1, 0, 1 );
-        CHECK_BIDIR2(-1, 0, 1, 0 );
+        for( j = !!pass; j < 33; j++ )
+        {
+            int m0x = dia4d[j][0] + om0x;
+            int m0y = dia4d[j][1] + om0y;
+            int m1x = dia4d[j][2] + om1x;
+            int m1y = dia4d[j][3] + om1y;
+            if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) )
+            {
+                int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y);
+                int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y);
+                visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
+                h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight );
+                int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+                         + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
+                if( rd )
+                {
+                    if( cost < bcost * SATD_THRESH )
+                    {
+                        bcost = X264_MIN( cost, bcost );
+                        M32( cache0_mv  ) = pack16to32_mask(m0x,m0y);
+                        M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y);
+                        M32( cache1_mv  ) = pack16to32_mask(m1x,m1y);
+                        M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y);
+                        h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+                        h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+                        uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
+                        COPY5_IF_LT( bcostrd, costrd, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+                    }
+                }
+                else
+                    COPY5_IF_LT( bcost, cost, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y );
+            }
+        }
 
-        if( om0x == bm0x && om0y == bm0y && om1x == bm1x && om1y == bm1y )
+        mc_list0 = (om0x-bm0x)|(om0y-bm0y);
+        mc_list1 = (om1x-bm1x)|(om1y-bm1y);
+        if( !mc_list0 && !mc_list1 )
             break;
 
         om0x = bm0x;
         om0y = bm0y;
         om1x = bm1x;
         om1y = bm1y;
-        BIME_CACHE( 0, 0 );
     }
 
     m0->mv[0] = bm0x;
@@ -1022,7 +1029,11 @@ void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_w
 
 void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 )
 {
+    /* Motion compensation is done as part of bidir_rd; don't repeat
+     * it in encoding. */
+    h->mb.b_skip_mc = 1;
     x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 );
+    h->mb.b_skip_mc = 0;
 }
 
 #undef COST_MV_SATD
@@ -1030,9 +1041,8 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
 { \
     if( !avoid_mvp || !(mx == pmx && my == pmy) ) \
     { \
-        int stride = 16; \
-        uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
-        dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+        h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+        dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \
             + p_cost_mvx[mx] + p_cost_mvy[my]; \
         COPY1_IF_LT( bsatd, dst ); \
     } \
@@ -1045,7 +1055,13 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
     if( satd <= bsatd * SATD_THRESH ) \
     { \
         uint64_t cost; \
-        *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+        M32( cache_mv  ) = pack16to32_mask(mx,my); \
+        M32( cache_mv2 ) = pack16to32_mask(mx,my); \
+        if( m->i_pixel <= PIXEL_8x8 )\
+        {\
+            h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+            h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
+        }\
         cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
         COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
     } \
@@ -1057,29 +1073,38 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
     static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
     int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
     int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
-    const int16_t *p_cost_mvx, *p_cost_mvy;
-    const int bw = x264_pixel_size[m->i_pixel].w>>2;
-    const int bh = x264_pixel_size[m->i_pixel].h>>2;
+    const uint16_t *p_cost_mvx, *p_cost_mvy;
+    const int bw = x264_pixel_size[m->i_pixel].w;
+    const int bh = x264_pixel_size[m->i_pixel].h;
     const int i_pixel = m->i_pixel;
+    const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
 
-    DECLARE_ALIGNED_16( uint8_t pix[16*16] );
-    uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
+    uint64_t bcost = COST_MAX64;
     int bmx = m->mv[0];
     int bmy = m->mv[1];
     int omx, omy, pmx, pmy, i, j;
     unsigned bsatd;
-    int satd = 0;
+    int satd;
     int dir = -2;
-    int satds[8];
+    int i8 = i4>>2;
+
+    uint8_t *pix  = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+    uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+    uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+
+    h->mb.b_skip_mc = 1;
 
     if( m->i_pixel != PIXEL_16x16 && i4 != 0 )
-        x264_mb_predict_mv( h, i_list, i4, bw, m->mvp );
+        x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp );
     pmx = m->mvp[0];
     pmy = m->mvp[1];
     p_cost_mvx = m->p_cost_mv - pmx;
     p_cost_mvy = m->p_cost_mv - pmy;
     COST_MV_SATD( bmx, bmy, bsatd, 0 );
-    COST_MV_RD( bmx, bmy, 0, 0, 0 );
+    if( m->i_pixel != PIXEL_16x16 )
+        COST_MV_RD( bmx, bmy, 0, 0, 0 )
+    else
+        bcost = m->cost;
 
     /* check the predicted mv */
     if( (bmx != pmx || bmy != pmy)
@@ -1087,7 +1112,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
         && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
     {
         COST_MV_SATD( pmx, pmy, satd, 0 );
-        COST_MV_RD( pmx, pmy, satd, 0,0 );
+        COST_MV_RD  ( pmx, pmy, satd, 0, 0 );
         /* The hex motion search is guaranteed to not repeat the center candidate,
          * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */
         if( bmx == pmx && bmy == pmy )
@@ -1097,12 +1122,22 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
         }
     }
 
+    if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ||
+        bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 )
+    {
+        h->mb.b_skip_mc = 0;
+        return;
+    }
+
     /* subpel hex search, same pattern as ME HEX. */
     dir = -2;
     omx = bmx;
     omy = bmy;
-    for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 );
-    for( j=0; j<6; j++ ) COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j );
+    for( j=0; j<6; j++ )
+    {
+        COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 );
+        COST_MV_RD  ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j );
+    }
 
     if( dir != -2 )
     {
@@ -1110,29 +1145,35 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
         for( i = 1; i < 10; i++ )
         {
             const int odir = mod6m1[dir+1];
-            if( bmy > h->mb.mv_max_spel[1] - 2 ||
-                bmy < h->mb.mv_min_spel[1] - 2 )
+            if( bmy < h->mb.mv_min_spel[1] + 3 ||
+                bmy > h->mb.mv_max_spel[1] - 3 )
                 break;
             dir = -2;
             omx = bmx;
             omy = bmy;
-            for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 );
-            for( j=0; j<3; j++ ) COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j );
+            for( j=0; j<3; j++ )
+            {
+                COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 );
+                COST_MV_RD  ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j );
+            }
             if( dir == -2 )
                 break;
         }
     }
 
-    /* square refine, same as pattern as ME HEX. */
+    /* square refine, same pattern as ME HEX. */
     omx = bmx;
     omy = bmy;
-    for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 );
-    for( i=0; i<8; i++ ) COST_MV_RD  ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 );
+    for( i=0; i<8; i++ )
+    {
+        COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 );
+        COST_MV_RD  ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 );
+    }
 
-    bmy = x264_clip3( bmy, h->mb.mv_min_spel[1],  h->mb.mv_max_spel[1] );
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;
-    x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) );
-    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+    x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
+    x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+    h->mb.b_skip_mc = 0;
 }
diff --git a/encoder/me.h b/encoder/me.h
index 3910f74..2f19e61 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -29,28 +29,32 @@
 
 typedef struct
 {
+    /* aligning the first member is a gcc hack to force the struct to be
+     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
     /* input */
-    int      i_pixel;   /* PIXEL_WxH */
-    int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+    ALIGNED_16( int i_pixel );   /* PIXEL_WxH */
+    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
     int      i_ref_cost;
     int      i_ref;
+    const x264_weight_t *weight;
 
     uint8_t *p_fref[6];
+    uint8_t *p_fref_w;
     uint8_t *p_fenc[3];
     uint16_t *integral;
     int      i_stride[2];
 
-    DECLARE_ALIGNED_4( int16_t mvp[2] );
+    ALIGNED_4( int16_t mvp[2] );
 
     /* output */
     int cost_mv;        /* lambda * nbits for the chosen mv */
     int cost;           /* satd + lambda * nbits */
-    DECLARE_ALIGNED_4( int16_t mv[2] );
-} DECLARE_ALIGNED_16( x264_me_t );
+    ALIGNED_4( int16_t mv[2] );
+} ALIGNED_16( x264_me_t );
 
 typedef struct {
     int sad;
-    int16_t mx, my;
+    int16_t mv[2];
 } mvsad_t;
 
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
@@ -58,6 +62,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i
     { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
 
 void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh );
 void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list );
 void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 );
 void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
@@ -84,13 +89,30 @@ if((y)<(x))\
     (c)=(d);\
 }
 
-#define COPY4_IF_LT(x,y,a,b,c,d,f,e)\
+#define COPY4_IF_LT(x,y,a,b,c,d,e,f)\
 if((y)<(x))\
 {\
     (x)=(y);\
     (a)=(b);\
     (c)=(d);\
-    (f)=(e);\
+    (e)=(f);\
+}
+
+#define COPY5_IF_LT(x,y,a,b,c,d,e,f,g,h) /* if y < x: x=y, a=b, c=d, e=f, g=h */\
+if((y)<(x)) /* expands to a bare if statement; x and y are evaluated again in the body */\
+{\
+    (x)=(y);\
+    (a)=(b);\
+    (c)=(d);\
+    (e)=(f);\
+    (g)=(h);\
+}
+
+#define COPY2_IF_GT(x,y,a,b) /* if y > x: x=y, a=b */\
+if((y)>(x)) /* strictly greater, so a tie leaves x and a unchanged */\
+{\
+    (x)=(y);\
+    (a)=(b);\
 }
 
 #endif
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 2dd34d0..63b3be6 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -27,30 +27,34 @@
 #define _ISOC99_SOURCE
 #undef NDEBUG // always check asserts, the speed effect is far too small to disable them
 #include <math.h>
-#include <limits.h>
-#include <assert.h>
 
 #include "common/common.h"
 #include "common/cpu.h"
 #include "ratecontrol.h"
+#include "me.h"
 
 typedef struct
 {
     int pict_type;
+    int frame_type;
     int kept_as_ref;
-    float qscale;
+    double qscale;
     int mv_bits;
     int tex_bits;
     int misc_bits;
     uint64_t expected_bits; /*total expected bits up to the current frame (current one excluded)*/
     double expected_vbv;
-    float new_qscale;
+    double new_qscale;
     int new_qp;
     int i_count;
     int p_count;
     int s_count;
     float blurred_complexity;
     char direct_mode;
+    int16_t weight[2];
+    int16_t i_weight_denom;
+    int refcount[16];
+    int refs;
 } ratecontrol_entry_t;
 
 typedef struct
@@ -58,6 +62,7 @@ typedef struct
     double coeff;
     double count;
     double decay;
+    double offset;
 } predictor_t;
 
 struct x264_ratecontrol_t
@@ -70,6 +75,7 @@ struct x264_ratecontrol_t
     double fps;
     double bitrate;
     double rate_tolerance;
+    double qcompress;
     int nmb;                    /* number of macroblocks in a frame */
     int qp_constant[5];
 
@@ -80,6 +86,7 @@ struct x264_ratecontrol_t
     float f_qpm;                /* qp for current macroblock: precise float for AQ */
     float qpa_rc;               /* average of macroblocks' qp before aq */
     float qpa_aq;               /* average of macroblocks' qp after aq */
+    float qp_novbv;             /* QP for the current frame if 1-pass VBV was disabled. */
     int qp_force;
 
     /* VBV stuff */
@@ -88,6 +95,7 @@ struct x264_ratecontrol_t
     double buffer_fill;         /* planned buffer, if all in-progress frames hit their bit budget */
     double buffer_rate;         /* # of bits added to buffer_fill after each frame */
     predictor_t *pred;          /* predict frame size from satd */
+    int single_frame_vbv;
 
     /* ABR stuff */
     int    last_satd;
@@ -105,6 +113,10 @@ struct x264_ratecontrol_t
     /* 2pass stuff */
     FILE *p_stat_file_out;
     char *psz_stat_file_tmpname;
+    FILE *p_mbtree_stat_file_out;
+    char *psz_mbtree_stat_file_tmpname;
+    char *psz_mbtree_stat_file_name;
+    FILE *p_mbtree_stat_file_in;
 
     int num_entries;            /* number of ratecontrol_entry_ts */
     ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */
@@ -117,12 +129,16 @@ struct x264_ratecontrol_t
     double lmin[5];             /* min qscale by frame type */
     double lmax[5];
     double lstep;               /* max change (multiply) in qscale per frame */
+    uint16_t *qp_buffer[2];     /* Global buffers for converting MB-tree quantizer data. */
+    int qpbuf_pos;              /* In order to handle pyramid reordering, QP buffer acts as a stack.
+                                 * This value is the current position (0 or 1). */
 
     /* MBRC stuff */
     double frame_size_estimated;
     double frame_size_planned;
-    predictor_t *row_pred;
-    predictor_t row_preds[5];
+    double slice_size_planned;
+    predictor_t (*row_pred)[2];
+    predictor_t row_preds[5][2];
     predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */
     int bframes;                /* # consecutive B-frames before this P-frame */
     int bframe_bits;            /* total cost of those frames */
@@ -137,10 +153,19 @@ static int parse_zones( x264_t *h );
 static int init_pass2(x264_t *);
 static float rate_estimate_qscale( x264_t *h );
 static void update_vbv( x264_t *h, int bits );
-static void update_vbv_plan( x264_t *h );
+static void update_vbv_plan( x264_t *h, int overhead );
 static double predict_size( predictor_t *p, double q, double var );
 static void update_predictor( predictor_t *p, double q, double var, double bits );
 
+#define CMP_OPT_FIRST_PASS( opt, param_val ) /* return -1 from the caller if "opt=N" in opts differs from param_val */\
+{ /* uses the caller's h, opts, p and i; an absent or non-numeric option is silently accepted */\
+    if( ( p = strstr( opts, opt "=" ) ) && sscanf( p, opt "=%d" , &i ) == 1 && param_val != i ) /* == 1: sscanf yields EOF (non-zero) on "opt=" at end of line */\
+    {\
+        x264_log( h, X264_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i );\
+        return -1;\
+    }\
+}
+
 /* Terminology:
  * qp = h.264's quantizer
  * qscale = linearized quantizer = Lagrange multiplier
@@ -167,82 +192,90 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale)
            + rce->misc_bits;
 }
 
+/* AC energy of plane i of macroblock (mb_x,mb_y): sum of squares minus sum^2/pixel-count, both unpacked from pixf.var's 64-bit result. */
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
+{
+    const int b_interlaced = h->mb.b_interlaced;
+    const int size = i ? 8 : 16;        /* plane 0 uses 16x16 blocks, the other planes 8x8 */
+    const int log2_pixels = i ? 6 : 8;  /* 8*8 = 1<<6, 16*16 = 1<<8 */
+    const int line = frame->i_stride[i];
+    int offset = size * (mb_x + (b_interlaced ? (mb_y&~1) : mb_y) * line);
+    if( b_interlaced )
+        offset += (mb_y&1) * line;      /* odd mb_y starts one line lower; the stride passed below is doubled */
+    uint64_t packed = h->pixf.var[i ? PIXEL_8x8 : PIXEL_16x16]( frame->plane[i] + offset, line << b_interlaced );
+    uint32_t sum = (uint32_t)packed;    /* low 32 bits */
+    uint32_t sqr = packed >> 32;        /* high 32 bits */
+    return sqr - (sum * sum >> log2_pixels);
+}
+
 // Find the total AC energy of the block in all planes.
-static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
+static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
 {
     /* This function contains annoying hacks because GCC has a habit of reordering emms
      * and putting it after floating point ops.  As a result, we put the emms at the end of the
      * function and make sure that its always called before the float math.  Noinline makes
      * sure no reordering goes on. */
-    unsigned int var = 0, i;
-    for( i = 0; i < 3; i++ )
-    {
-        int w = i ? 8 : 16;
-        int stride = frame->i_stride[i];
-        int offset = h->mb.b_interlaced
-            ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
-            : w * (mb_x + mb_y * stride);
-        int pix = i ? PIXEL_8x8 : PIXEL_16x16;
-        stride <<= h->mb.b_interlaced;
-        var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
-    }
-    var = X264_MAX(var,1);
+    uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
+    var         += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
+    var         += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
     x264_emms();
     return var;
 }
 
-static const float log2_lut[128] = {
-    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
-    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
-    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
-    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
-    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
-    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
-    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
-    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
-    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
-    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
-    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
-    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
-    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
-    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
-    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
-    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
-};
-
-static const uint8_t exp2_lut[64] = {
-      1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
-     50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
-    108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
-    177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
-};
-
-static int x264_exp2fix8( float x )
-{
-    int i, f;
-    x += 8;
-    if( x <= 0 ) return 0;
-    if( x >= 16 ) return 0xffff;
-    i = x;
-    f = (x-i)*64;
-    return (exp2_lut[f]+256) << i >> 8;
-}
-
 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 {
     /* constants chosen to result in approximately the same overall bitrate as without AQ.
      * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
-    float strength = h->param.rc.f_aq_strength * 1.0397;
     int mb_x, mb_y;
+    float strength;
+    float avg_adj = 0.f;
+    /* Need to init it anyways for MB tree. */
+    if( h->param.rc.f_aq_strength == 0 )
+    {
+        int mb_xy;
+        memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) );
+        memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) );
+        if( h->frames.b_have_lowres )
+            for( mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ )
+                frame->i_inv_qscale_factor[mb_xy] = 256;
+        return;
+    }
+
+    if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
+    {
+        for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
+            for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
+            {
+                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+                float qp_adj = x264_log2( energy + 2 );
+                qp_adj *= qp_adj;
+                frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
+                avg_adj += qp_adj;
+            }
+        avg_adj /= h->mb.i_mb_count;
+        strength = h->param.rc.f_aq_strength * avg_adj * (1.f / 6000.f);
+    }
+    else
+        strength = h->param.rc.f_aq_strength * 1.0397f;
+
     for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
         for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
         {
-            uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
-            int lz = x264_clz( energy );
-            float qp_adj = strength * (log2_lut[(energy<<lz>>24)&0x7f] - lz + 16.573f);
-            frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
+            float qp_adj;
+            if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
+            {
+                qp_adj = frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride];
+                qp_adj = strength * (qp_adj - avg_adj);
+            }
+            else
+            {
+                uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+                qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
+            }
+            frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] =
+            frame->f_qp_offset_aq[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
             if( h->frames.b_have_lowres )
-                frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj*(-1.f/6.f));
+                frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj);
         }
 }
 
@@ -257,22 +290,113 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 void x264_adaptive_quant( x264_t *h )
 {
     x264_emms();
-    h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
-    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
-     * to lower the bit cost of the qp_delta. */
-    if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
-        h->mb.i_qp = h->mb.i_last_qp;
+    /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
+    float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
+    h->mb.i_qp = x264_clip3( h->rc->f_qpm + qp_offset + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+}
+
+/* Fill frame's QP offsets: from the mbtree stats file if the stats mark the frame kept_as_ref, else via x264_adaptive_quant_frame. Returns 0, or -1 on error. */
+int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame )
+{
+    x264_ratecontrol_t *rc = h->rc;
+    uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type;
+    int i;
+    if( rc->entry[frame->i_frame].kept_as_ref )
+    {
+        uint8_t i_type;
+        if( rc->qpbuf_pos < 0 ) /* nothing buffered from an earlier call */
+        {
+            do
+            {
+                rc->qpbuf_pos++;
+                /* Each record is one type byte followed by i_mb_count 16-bit values. */
+                if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) )
+                    goto fail;
+                if( fread( rc->qp_buffer[rc->qpbuf_pos], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_in ) != h->mb.i_mb_count )
+                    goto fail;
+                /* A mismatching record stays in slot 0 and the next goes to slot 1; give up if slot 1 was never allocated rather than read into NULL. */
+                if( i_type != i_type_actual && (rc->qpbuf_pos == 1 || !rc->qp_buffer[1]) )
+                {
+                    x264_log(h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type, i_type_actual);
+                    return -1;
+                }
+            } while( i_type != i_type_actual );
+        }
+        /* Stored values are signed 16-bit with 8 fractional bits (scaled by 1/256 after endian_fix16). */
+        for( i = 0; i < h->mb.i_mb_count; i++ )
+        {
+            frame->f_qp_offset[i] = ((float)(int16_t)endian_fix16( rc->qp_buffer[rc->qpbuf_pos][i] )) * (1/256.0);
+            if( h->frames.b_have_lowres )
+                frame->i_inv_qscale_factor[i] = x264_exp2fix8(frame->f_qp_offset[i]);
+        }
+        rc->qpbuf_pos--; /* pop the slot just consumed; a record left in slot 0 is used by the next call */
+    }
+    else
+        x264_adaptive_quant_frame( h, frame );
+    return 0;
+fail:
+    x264_log(h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n");
+    return -1;
+}
+
+/* Reorder h->fref0[1..] and the matching h->fenc->weight rows by descending rce->refcount; returns -1 if rce->refs != h->i_ref0, else 0. */
+int x264_reference_build_list_optimal( x264_t *h )
+{
+    ratecontrol_entry_t *rce = h->rc->rce;
+    x264_frame_t *frames[16];
+    x264_weight_t weights[16][3];
+    int refcount[16];
+    int ref, i;
+
+    if( rce->refs != h->i_ref0 )
+        return -1;
+    /* Work on snapshots: h->fref0 and h->fenc->weight are overwritten below (weights of refs 1..15 are cleared first). */
+    memcpy( frames, h->fref0, sizeof(frames) );
+    memcpy( refcount, rce->refcount, sizeof(refcount) );
+    memcpy( weights, h->fenc->weight, sizeof(weights) );
+    memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) );
+
+    /* For now don't reorder ref 0; it seems to lower quality
+       in most cases due to skips. */
+    for( ref = 1; ref < h->i_ref0; ref++ )
+    {
+        int max = -1;
+        int bestref = 1;
+        /* Pick the highest remaining refcount, skipping b_duplicate frames whose i_frame equals that of the ref just placed at ref-1. */
+        for( i = 1; i < h->i_ref0; i++ )
+            if( !frames[i]->b_duplicate || frames[i]->i_frame != h->fref0[ref-1]->i_frame )
+                /* Favor lower POC as a tiebreaker. */
+                COPY2_IF_GT( max, refcount[i], bestref, i );
+        /* FIXME: If there are duplicates from frames other than ref0 then it is possible
+         * that the optimal ordering doesn't place every duplicate. */
+        /* -1 never beats max's initial -1, so a placed ref is not picked again; if nothing qualifies, bestref stays 1. */
+        refcount[bestref] = -1;
+        h->fref0[ref] = frames[bestref];
+        memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) );
+    }
+
+    return 0;
+}
+
+/* Return a new x264_malloc'd string holding input followed by suffix, or NULL if the allocation fails. */
+static char *x264_strcat_filename( char *input, char *suffix )
+{
+    size_t input_len = strlen( input );
+    char *output = x264_malloc( input_len + strlen( suffix ) + 1 );
+    if( output ) /* strcpy returns its destination, so the suffix lands on input's terminator */
+        strcpy( strcpy( output, input ) + input_len, suffix );
+    return output;
 }
 
 int x264_ratecontrol_new( x264_t *h )
 {
     x264_ratecontrol_t *rc;
-    int i;
+    int i, j;
 
     x264_emms();
 
-    rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
-    memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) );
+    CHECKED_MALLOCZERO( h->rc, h->param.i_threads * sizeof(x264_ratecontrol_t) );
+    rc = h->rc;
 
     rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
     rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
@@ -283,6 +407,14 @@ int x264_ratecontrol_new( x264_t *h )
     else
         rc->fps = 25.0;
 
+    if( h->param.rc.b_mb_tree )
+    {
+        h->param.rc.f_pb_factor = 1;
+        rc->qcompress = 1;
+    }
+    else
+        rc->qcompress = h->param.rc.f_qcompress;
+
     rc->bitrate = h->param.rc.i_bitrate * 1000.;
     rc->rate_tolerance = h->param.rc.f_rate_tolerance;
     rc->nmb = h->mb.i_mb_count;
@@ -304,8 +436,16 @@ int x264_ratecontrol_new( x264_t *h )
         }
         else if( h->param.rc.i_vbv_max_bitrate == 0 )
         {
-            x264_log( h, X264_LOG_DEBUG, "VBV maxrate unspecified, assuming CBR\n" );
-            h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+            if( h->param.rc.i_rc_method == X264_RC_ABR )
+            {
+                x264_log( h, X264_LOG_INFO, "VBV maxrate unspecified, assuming CBR\n" );
+                h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate;
+            }
+            else
+            {
+                x264_log( h, X264_LOG_INFO, "VBV bufsize set but maxrate unspecified, ignored\n" );
+                h->param.rc.i_vbv_buffer_size = 0;
+            }
         }
     }
     if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate &&
@@ -314,16 +454,18 @@ int x264_ratecontrol_new( x264_t *h )
     else if( h->param.rc.i_vbv_max_bitrate > 0 &&
              h->param.rc.i_vbv_buffer_size > 0 )
     {
-        if( h->param.rc.i_vbv_buffer_size < 3 * h->param.rc.i_vbv_max_bitrate / rc->fps )
+        if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) )
         {
-            h->param.rc.i_vbv_buffer_size = 3 * h->param.rc.i_vbv_max_bitrate / rc->fps;
-            x264_log( h, X264_LOG_WARNING, "VBV buffer size too small, using %d kbit\n",
+            h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps;
+            x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n",
                       h->param.rc.i_vbv_buffer_size );
         }
         if( h->param.rc.f_vbv_buffer_init > 1. )
             h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 );
         rc->buffer_rate = h->param.rc.i_vbv_max_bitrate * 1000. / rc->fps;
         rc->buffer_size = h->param.rc.i_vbv_buffer_size * 1000.;
+        rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size;
+        h->param.rc.f_vbv_buffer_init = X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size );
         rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init;
         rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size
                       * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate);
@@ -352,17 +494,19 @@ int x264_ratecontrol_new( x264_t *h )
         rc->accum_p_norm = .01;
         rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm;
         /* estimated ratio that produces a reasonable QP for the first I-frame */
-        rc->cplxr_sum = .01 * pow( 7.0e5, h->param.rc.f_qcompress ) * pow( h->mb.i_mb_count, 0.5 );
+        rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 );
         rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps;
         rc->last_non_b_pict_type = SLICE_TYPE_I;
     }
 
     if( h->param.rc.i_rc_method == X264_RC_CRF )
     {
-        /* arbitrary rescaling to make CRF somewhat similar to QP */
+        /* Arbitrary rescaling to make CRF somewhat similar to QP.
+         * Try to compensate for MB-tree's effects as well. */
         double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
-        rc->rate_factor_constant = pow( base_cplx, 1 - h->param.rc.f_qcompress )
-                                 / qp2qscale( h->param.rc.f_rf_constant );
+        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
+        rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress )
+                                 / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset );
     }
 
     rc->ip_offset = 6.0 * log(h->param.rc.f_ip_factor) / log(2.0);
@@ -370,11 +514,12 @@ int x264_ratecontrol_new( x264_t *h )
     rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
     rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, 51 );
     rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, 51 );
+    h->mb.ip_offset = rc->ip_offset + 0.5;
 
     rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
     rc->last_qscale = qp2qscale(26);
-    rc->pred = x264_malloc( 5*sizeof(predictor_t) );
-    rc->pred_b_from_p = x264_malloc( sizeof(predictor_t) );
+    CHECKED_MALLOC( rc->pred, 5*sizeof(predictor_t) );
+    CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) );
     for( i = 0; i < 5; i++ )
     {
         rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP );
@@ -383,9 +528,14 @@ int x264_ratecontrol_new( x264_t *h )
         rc->pred[i].coeff= 2.0;
         rc->pred[i].count= 1.0;
         rc->pred[i].decay= 0.5;
-        rc->row_preds[i].coeff= .25;
-        rc->row_preds[i].count= 1.0;
-        rc->row_preds[i].decay= 0.5;
+        rc->pred[i].offset= 0.0;
+        for( j = 0; j < 2; j++ )
+        {
+            rc->row_preds[i][j].coeff= .25;
+            rc->row_preds[i][j].count= 1.0;
+            rc->row_preds[i][j].decay= 0.5;
+            rc->row_preds[i][j].offset= 0.0;
+        }
     }
     *rc->pred_b_from_p = rc->pred[0];
 
@@ -408,35 +558,47 @@ int x264_ratecontrol_new( x264_t *h )
             x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n");
             return -1;
         }
+        if( h->param.rc.b_mb_tree )
+        {
+            char *mbtree_stats_in = x264_strcat_filename( h->param.rc.psz_stat_in, ".mbtree" );
+            if( !mbtree_stats_in )
+                return -1;
+            rc->p_mbtree_stat_file_in = fopen( mbtree_stats_in, "rb" );
+            x264_free( mbtree_stats_in );
+            if( !rc->p_mbtree_stat_file_in )
+            {
+                x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+                return -1;
+            }
+        }
 
         /* check whether 1st pass options were compatible with current options */
         if( !strncmp( stats_buf, "#options:", 9 ) )
         {
-            int i;
+            int i, j;
             char *opts = stats_buf;
             stats_in = strchr( stats_buf, '\n' );
             if( !stats_in )
                 return -1;
             *stats_in = '\0';
             stats_in++;
-
-            if( ( p = strstr( opts, "bframes=" ) ) && sscanf( p, "bframes=%d", &i )
-                && h->param.i_bframe != i )
+            if( sscanf( opts, "#options: %dx%d", &i, &j ) != 2 )
             {
-                x264_log( h, X264_LOG_ERROR, "different number of B-frames than 1st pass (%d vs %d)\n",
-                          h->param.i_bframe, i );
+                x264_log( h, X264_LOG_ERROR, "resolution specified in stats file not valid\n" );
+                return -1;
+            }
+            else if( h->param.rc.b_mb_tree && (i != h->param.i_width || j != h->param.i_height)  )
+            {
+                x264_log( h, X264_LOG_ERROR, "MB-tree doesn't support different resolution than 1st pass (%dx%d vs %dx%d)\n",
+                          h->param.i_width, h->param.i_height, i, j );
                 return -1;
             }
 
-            /* since B-adapt doesn't (yet) take into account B-pyramid,
-             * the converse is not a problem */
-            if( strstr( opts, "b_pyramid=1" ) && !h->param.b_bframe_pyramid )
-                x264_log( h, X264_LOG_WARNING, "1st pass used B-pyramid, 2nd doesn't\n" );
-
-            if( ( p = strstr( opts, "keyint=" ) ) && sscanf( p, "keyint=%d", &i )
-                && h->param.i_keyint_max != i )
-                x264_log( h, X264_LOG_WARNING, "different keyint than 1st pass (%d vs %d)\n",
-                          h->param.i_keyint_max, i );
+            CMP_OPT_FIRST_PASS( "wpredp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) );
+            CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe );
+            CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid );
+            CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh );
+            CMP_OPT_FIRST_PASS( "keyint", h->param.i_keyint_max );
 
             if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR )
                 x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" );
@@ -454,6 +616,9 @@ int x264_ratecontrol_new( x264_t *h )
                 x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" );
                 return -1;
             }
+
+            if( h->param.rc.b_mb_tree && ( p = strstr( opts, "rc_lookahead=" ) ) && sscanf( p, "rc_lookahead=%d", &i ) )
+                h->param.rc.i_lookahead = i;
         }
 
         /* find number of pics */
@@ -479,8 +644,7 @@ int x264_ratecontrol_new( x264_t *h )
             return -1;
         }
 
-        rc->entry = (ratecontrol_entry_t*) x264_malloc(rc->num_entries * sizeof(ratecontrol_entry_t));
-        memset(rc->entry, 0, rc->num_entries * sizeof(ratecontrol_entry_t));
+        CHECKED_MALLOCZERO( rc->entry, rc->num_entries * sizeof(ratecontrol_entry_t) );
 
         /* init all to skipped p frames */
         for(i=0; i<rc->num_entries; i++)
@@ -502,6 +666,7 @@ int x264_ratecontrol_new( x264_t *h )
             int e;
             char *next;
             float qp;
+            int ref;
 
             next= strchr(p, ';');
             if(next)
@@ -524,17 +689,56 @@ int x264_ratecontrol_new( x264_t *h )
                    &rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count,
                    &rce->s_count, &rce->direct_mode);
 
-            switch(pict_type)
+            p = strstr( p, "ref:" );
+            if( !p )
+                goto parse_error;
+            p += 4;
+            for( ref = 0; ref < 16; ref++ )
             {
-                case 'I': rce->kept_as_ref = 1;
-                case 'i': rce->pict_type = SLICE_TYPE_I; break;
-                case 'P': rce->pict_type = SLICE_TYPE_P; break;
-                case 'B': rce->kept_as_ref = 1;
-                case 'b': rce->pict_type = SLICE_TYPE_B; break;
+                if( sscanf( p, " %d", &rce->refcount[ref] ) != 1 )
+                    break;
+                p = strchr( p+1, ' ' );
+                if( !p )
+                    goto parse_error;
+            }
+            rce->refs = ref;
+
+            /* find weights */
+            rce->i_weight_denom = -1;
+            char *w = strchr( p, 'w' );
+            if( w )
+                if( sscanf( w, "w:%hd,%hd,%hd", &rce->i_weight_denom, &rce->weight[0], &rce->weight[1] ) != 3 )
+                    rce->i_weight_denom = -1;
+
+            if( pict_type != 'b' )
+                rce->kept_as_ref = 1;
+            switch( pict_type )
+            {
+                case 'I':
+                    rce->frame_type = X264_TYPE_IDR;
+                    rce->pict_type  = SLICE_TYPE_I;
+                    break;
+                case 'i':
+                    rce->frame_type = X264_TYPE_I;
+                    rce->pict_type  = SLICE_TYPE_I;
+                    break;
+                case 'P':
+                    rce->frame_type = X264_TYPE_P;
+                    rce->pict_type  = SLICE_TYPE_P;
+                    break;
+                case 'B':
+                    rce->frame_type = X264_TYPE_BREF;
+                    rce->pict_type  = SLICE_TYPE_B;
+                    break;
+                case 'b':
+                    rce->frame_type = X264_TYPE_B;
+                    rce->pict_type  = SLICE_TYPE_B;
+                    break;
                 default:  e = -1; break;
             }
             if(e < 10)
             {
+parse_error:
                 x264_log(h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
                 return -1;
             }
@@ -556,10 +760,9 @@ int x264_ratecontrol_new( x264_t *h )
     if( h->param.rc.b_stat_write )
     {
         char *p;
-
-        rc->psz_stat_file_tmpname = x264_malloc( strlen(h->param.rc.psz_stat_out) + 6 );
-        strcpy( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out );
-        strcat( rc->psz_stat_file_tmpname, ".temp" );
+        rc->psz_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".temp" );
+        if( !rc->psz_stat_file_tmpname )
+            return -1;
 
         rc->p_stat_file_out = fopen( rc->psz_stat_file_tmpname, "wb" );
         if( rc->p_stat_file_out == NULL )
@@ -569,8 +772,31 @@ int x264_ratecontrol_new( x264_t *h )
         }
 
         p = x264_param2string( &h->param, 1 );
-        fprintf( rc->p_stat_file_out, "#options: %s\n", p );
+        if( p )
+            fprintf( rc->p_stat_file_out, "#options: %s\n", p );
         x264_free( p );
+        if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
+        {
+            rc->psz_mbtree_stat_file_tmpname = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree.temp" );
+            rc->psz_mbtree_stat_file_name = x264_strcat_filename( h->param.rc.psz_stat_out, ".mbtree" );
+            if( !rc->psz_mbtree_stat_file_tmpname || !rc->psz_mbtree_stat_file_name )
+                return -1;
+
+            rc->p_mbtree_stat_file_out = fopen( rc->psz_mbtree_stat_file_tmpname, "wb" );
+            if( rc->p_mbtree_stat_file_out == NULL )
+            {
+                x264_log(h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n");
+                return -1;
+            }
+        }
+    }
+
+    if( h->param.rc.b_mb_tree && (h->param.rc.b_stat_read || h->param.rc.b_stat_write) )
+    {
+        CHECKED_MALLOC( rc->qp_buffer[0], h->mb.i_mb_count * sizeof(uint16_t) );
+        if( h->param.i_bframe_pyramid && h->param.rc.b_stat_read )
+            CHECKED_MALLOC( rc->qp_buffer[1], h->mb.i_mb_count * sizeof(uint16_t) );
+        rc->qpbuf_pos = -1;
     }
 
     for( i=0; i<h->param.i_threads; i++ )
@@ -579,18 +805,20 @@ int x264_ratecontrol_new( x264_t *h )
         if( i )
         {
             rc[i] = rc[0];
-            memcpy( &h->thread[i]->param, &h->param, sizeof( x264_param_t ) );
+            h->thread[i]->param = h->param;
             h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp;
         }
     }
 
     return 0;
+fail:
+    return -1;
 }
 
 static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
 {
     int len = 0;
-    char *tok, UNUSED *saveptr;
+    char *tok, UNUSED *saveptr=NULL;
     z->param = NULL;
     z->f_bitrate_factor = 1;
     if( 3 <= sscanf(p, "%u,%u,q=%u%n", &z->i_start, &z->i_end, &z->i_qp, &len) )
@@ -607,8 +835,9 @@ static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
     p += len;
     if( !*p )
         return 0;
-    z->param = x264_malloc( sizeof(x264_param_t) );
+    CHECKED_MALLOC( z->param, sizeof(x264_param_t) );
     memcpy( z->param, &h->param, sizeof(x264_param_t) );
+    z->param->param_free = x264_free;
     while( (tok = strtok_r( p, ",", &saveptr )) )
     {
         char *val = strchr( tok, '=' );
@@ -625,6 +854,8 @@ static int parse_zone( x264_t *h, x264_zone_t *z, char *p )
         p = NULL;
     }
     return 0;
+fail:
+    return -1;
 }
 
 static int parse_zones( x264_t *h )
@@ -633,20 +864,21 @@ static int parse_zones( x264_t *h )
     int i;
     if( h->param.rc.psz_zones && !h->param.rc.i_zones )
     {
-        char *p, *tok, UNUSED *saveptr;
-        char *psz_zones = x264_malloc( strlen(h->param.rc.psz_zones)+1 );
+        char *psz_zones, *p;
+        CHECKED_MALLOC( psz_zones, strlen( h->param.rc.psz_zones )+1 );
         strcpy( psz_zones, h->param.rc.psz_zones );
         h->param.rc.i_zones = 1;
         for( p = psz_zones; *p; p++ )
             h->param.rc.i_zones += (*p == '/');
-        h->param.rc.zones = x264_malloc( h->param.rc.i_zones * sizeof(x264_zone_t) );
+        CHECKED_MALLOC( h->param.rc.zones, h->param.rc.i_zones * sizeof(x264_zone_t) );
         p = psz_zones;
         for( i = 0; i < h->param.rc.i_zones; i++ )
         {
-            tok = strtok_r( p, "/", &saveptr );
-            if( !tok || parse_zone( h, &h->param.rc.zones[i], tok ) )
+            int i_tok = strcspn( p, "/" );
+            p[i_tok] = 0;
+            if( parse_zone( h, &h->param.rc.zones[i], p ) )
                 return -1;
-            p = NULL;
+            p += i_tok + 1;
         }
         x264_free( psz_zones );
     }
@@ -671,7 +903,7 @@ static int parse_zones( x264_t *h )
         }
 
         rc->i_zones = h->param.rc.i_zones + 1;
-        rc->zones = x264_malloc( rc->i_zones * sizeof(x264_zone_t) );
+        CHECKED_MALLOC( rc->zones, rc->i_zones * sizeof(x264_zone_t) );
         memcpy( rc->zones+1, h->param.rc.zones, (rc->i_zones-1) * sizeof(x264_zone_t) );
 
         // default zone to fall back to if none of the others match
@@ -679,7 +911,7 @@ static int parse_zones( x264_t *h )
         rc->zones[0].i_end = INT_MAX;
         rc->zones[0].b_force_qp = 0;
         rc->zones[0].f_bitrate_factor = 1;
-        rc->zones[0].param = x264_malloc( sizeof(x264_param_t) );
+        CHECKED_MALLOC( rc->zones[0].param, sizeof(x264_param_t) );
         memcpy( rc->zones[0].param, &h->param, sizeof(x264_param_t) );
         for( i = 1; i < rc->i_zones; i++ )
         {
@@ -689,6 +921,8 @@ static int parse_zones( x264_t *h )
     }
 
     return 0;
+fail:
+    return -1;
 }
 
 static x264_zone_t *get_zone( x264_t *h, int frame_num )
@@ -709,9 +943,10 @@ void x264_ratecontrol_summary( x264_t *h )
     if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 )
     {
         double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80);
+        double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0;
         x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n",
-                  qscale2qp( pow( base_cplx, 1 - h->param.rc.f_qcompress )
-                             * rc->cplxr_sum / rc->wanted_bits_window ) );
+                  qscale2qp( pow( base_cplx, 1 - rc->qcompress )
+                             * rc->cplxr_sum / rc->wanted_bits_window ) - mbtree_offset );
     }
 }
 
@@ -719,11 +954,13 @@ void x264_ratecontrol_delete( x264_t *h )
 {
     x264_ratecontrol_t *rc = h->rc;
     int i;
+    int b_regular_file;
 
     if( rc->p_stat_file_out )
     {
+        b_regular_file = x264_is_regular_file( rc->p_stat_file_out );
         fclose( rc->p_stat_file_out );
-        if( h->i_frame >= rc->num_entries )
+        if( h->i_frame >= rc->num_entries && b_regular_file )
             if( rename( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ) != 0 )
             {
                 x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
@@ -731,16 +968,32 @@ void x264_ratecontrol_delete( x264_t *h )
             }
         x264_free( rc->psz_stat_file_tmpname );
     }
+    if( rc->p_mbtree_stat_file_out )
+    {
+        b_regular_file = x264_is_regular_file( rc->p_mbtree_stat_file_out );
+        fclose( rc->p_mbtree_stat_file_out );
+        if( h->i_frame >= rc->num_entries && b_regular_file )
+            if( rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 )
+            {
+                x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n",
+                          rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name );
+            }
+        x264_free( rc->psz_mbtree_stat_file_tmpname );
+        x264_free( rc->psz_mbtree_stat_file_name );
+    }
+    if( rc->p_mbtree_stat_file_in )
+        fclose( rc->p_mbtree_stat_file_in );
     x264_free( rc->pred );
     x264_free( rc->pred_b_from_p );
     x264_free( rc->entry );
+    x264_free( rc->qp_buffer[0] );
+    x264_free( rc->qp_buffer[1] );
     if( rc->zones )
     {
         x264_free( rc->zones[0].param );
-        if( h->param.rc.psz_zones )
-            for( i=1; i<rc->i_zones; i++ )
-                if( rc->zones[i].param != rc->zones[0].param )
-                    x264_free( rc->zones[i].param );
+        for( i=1; i<rc->i_zones; i++ )
+            if( rc->zones[i].param != rc->zones[0].param && rc->zones[i].param->param_free )
+                rc->zones[i].param->param_free( rc->zones[i].param );
         x264_free( rc->zones );
     }
     x264_free( rc );
@@ -775,7 +1028,7 @@ static void accum_p_qp_update( x264_t *h, float qp )
 }
 
 /* Before encoding a frame, choose a QP for it */
-void x264_ratecontrol_start( x264_t *h, int i_force_qp )
+void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
 {
     x264_ratecontrol_t *rc = h->rc;
     ratecontrol_entry_t *rce = NULL;
@@ -808,15 +1061,11 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
     {
         memset( h->fdec->i_row_bits, 0, h->sps->i_mb_height * sizeof(int) );
         rc->row_pred = &rc->row_preds[h->sh.i_type];
-        update_vbv_plan( h );
+        update_vbv_plan( h, overhead );
     }
 
     if( h->sh.i_type != SLICE_TYPE_B )
-    {
-        rc->bframes = 0;
-        while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) )
-            rc->bframes++;
-    }
+        rc->bframes = h->fenc->i_bframes;
 
     if( i_force_qp )
     {
@@ -847,6 +1096,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
         }
     }
 
+    q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+
     rc->qpa_rc =
     rc->qpa_aq = 0;
     h->fdec->f_qp_avg_rc =
@@ -857,12 +1108,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
     if( rce )
         rce->new_qp = rc->qp;
 
-    /* accum_p_qp needs to be here so that future frames can benefit from the
-     * data before this frame is done. but this only works because threading
-     * guarantees to not re-encode any frames. so the non-threaded case does
-     * accum_p_qp later. */
-    if( h->param.i_threads > 1 )
-        accum_p_qp_update( h, rc->qp );
+    accum_p_qp_update( h, rc->qp );
 
     if( h->sh.i_type != SLICE_TYPE_B )
         rc->last_non_b_pict_type = h->sh.i_type;
@@ -873,27 +1119,36 @@ static double predict_row_size( x264_t *h, int y, int qp )
     /* average between two predictors:
      * absolute SATD, and scaled bit cost of the colocated row in the previous frame */
     x264_ratecontrol_t *rc = h->rc;
-    double pred_s = predict_size( rc->row_pred, qp2qscale(qp), h->fdec->i_row_satd[y] );
+    double pred_s = predict_size( rc->row_pred[0], qp2qscale(qp), h->fdec->i_row_satd[y] );
     double pred_t = 0;
-    if( h->sh.i_type != SLICE_TYPE_I
-        && h->fref0[0]->i_type == h->fdec->i_type
-        && h->fref0[0]->i_row_satd[y] > 0
-        && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
+    if( h->sh.i_type == SLICE_TYPE_I || qp >= h->fref0[0]->i_row_qp[y] )
+    {
+        if( h->sh.i_type == SLICE_TYPE_P
+            && h->fref0[0]->i_type == h->fdec->i_type
+            && h->fref0[0]->i_row_satd[y] > 0
+            && (abs(h->fref0[0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2))
+        {
+            pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
+                     * qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
+        }
+        if( pred_t == 0 )
+            pred_t = pred_s;
+        return (pred_s + pred_t) / 2;
+    }
+    /* Our QP is lower than the reference! */
+    else
     {
-        pred_t = h->fref0[0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref0[0]->i_row_satd[y]
-                 * qp2qscale(h->fref0[0]->i_row_qp[y]) / qp2qscale(qp);
+        double pred_intra = predict_size( rc->row_pred[1], qp2qscale(qp), h->fdec->i_row_satds[0][0][y] );
+        /* Sum: better to overestimate than underestimate by using only one of the two predictors. */
+        return pred_intra + pred_s;
     }
-    if( pred_t == 0 )
-        pred_t = pred_s;
-
-    return (pred_s + pred_t) / 2;
 }
 
 static double row_bits_so_far( x264_t *h, int y )
 {
     int i;
     double bits = 0;
-    for( i = 0; i <= y; i++ )
+    for( i = h->i_threadslice_start; i <= y; i++ )
         bits += h->fdec->i_row_bits[i];
     return bits;
 }
@@ -902,7 +1157,7 @@ static double predict_row_size_sum( x264_t *h, int y, int qp )
 {
     int i;
     double bits = row_bits_so_far(h, y);
-    for( i = y+1; i < h->sps->i_mb_height; i++ )
+    for( i = y+1; i < h->i_threadslice_end; i++ )
         bits += predict_row_size( h, i, qp );
     return bits;
 }
@@ -919,83 +1174,84 @@ void x264_ratecontrol_mb( x264_t *h, int bits )
     rc->qpa_rc += rc->f_qpm;
     rc->qpa_aq += h->mb.i_qp;
 
-    if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv)
+    if( h->mb.i_mb_x != h->sps->i_mb_width - 1 || !rc->b_vbv )
         return;
 
     h->fdec->i_row_qp[y] = rc->qpm;
 
-    if( h->sh.i_type == SLICE_TYPE_B )
+    update_predictor( rc->row_pred[0], qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
+    if( h->sh.i_type == SLICE_TYPE_P && rc->qpm < h->fref0[0]->i_row_qp[y] )
+        update_predictor( rc->row_pred[1], qp2qscale(rc->qpm), h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] );
+
+    /* tweak quality based on difference from predicted size */
+    if( y < h->i_threadslice_end-1 )
     {
-        /* B-frames shouldn't use lower QP than their reference frames.
-         * This code is a bit overzealous in limiting B-frame quantizers, but it helps avoid
-         * underflows due to the fact that B-frames are not explicitly covered by VBV. */
-        if( y < h->sps->i_mb_height-1 )
+        int prev_row_qp = h->fdec->i_row_qp[y];
+        int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
+        int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
+
+        /* B-frames shouldn't use lower QP than their reference frames. */
+        if( h->sh.i_type == SLICE_TYPE_B )
         {
-            int i_estimated;
-            int avg_qp = X264_MAX(h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1])
-                       + rc->pb_offset * ((h->fenc->i_type == X264_TYPE_BREF) ? 0.5 : 1);
-            rc->qpm = X264_MIN(X264_MAX( rc->qp, avg_qp), 51); //avg_qp could go higher than 51 due to pb_offset
-            i_estimated = row_bits_so_far(h, y); //FIXME: compute full estimated size
-            if (i_estimated > h->rc->frame_size_planned)
-                x264_ratecontrol_set_estimated_size(h, i_estimated);
+            i_qp_min = X264_MAX( i_qp_min, X264_MAX( h->fref0[0]->i_row_qp[y+1], h->fref1[0]->i_row_qp[y+1] ) );
+            rc->qpm = X264_MAX( rc->qpm, i_qp_min );
         }
-    }
-    else
-    {
-        update_predictor( rc->row_pred, qp2qscale(rc->qpm), h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] );
 
-        /* tweak quality based on difference from predicted size */
-        if( y < h->sps->i_mb_height-1 && h->stat.i_slice_count[h->sh.i_type] > 0 )
+        float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
+        float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned;
+        float size_of_other_slices = rc->frame_size_planned - slice_size_planned;
+        /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
+        float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance;
+        float max_frame_error = X264_MAX( 0.05, 1.0 / h->sps->i_mb_height );
+        int b1 = predict_row_size_sum( h, y, rc->qpm );
+
+        /* Assume that if this slice has become larger than expected,
+         * the other slices will have gotten equally larger. */
+        b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+
+        /* Don't modify the row QPs until a sufficient amount of the bits of the frame have been processed, in case a flat */
+        /* area at the top of the frame was measured inaccurately. */
+        if( row_bits_so_far(h,y) < 0.05 * (rc->frame_size_planned-size_of_other_slices) )
+            return;
+
+        if( h->sh.i_type != SLICE_TYPE_I )
+            rc_tol /= 2;
+
+        if( !rc->b_vbv_min_rate )
+            i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp );
+
+        while( rc->qpm < i_qp_max
+               && ((b1 > rc->frame_size_planned + rc_tol) ||
+                   (rc->buffer_fill - b1 < buffer_left_planned * 0.5) ||
+                   (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv)) )
         {
-            int prev_row_qp = h->fdec->i_row_qp[y];
-            int b0 = predict_row_size_sum( h, y, rc->qpm );
-            int b1 = b0;
-            int i_qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, h->param.rc.i_qp_max );
-            int i_qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min );
-            float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned;
-            float rc_tol = 1;
-            float headroom = 0;
-
-            /* Don't modify the row QPs until a sufficent amount of the bits of the frame have been processed, in case a flat */
-            /* area at the top of the frame was measured inaccurately. */
-            if(row_bits_so_far(h,y) < 0.05 * rc->frame_size_planned)
-                return;
-
-            headroom = buffer_left_planned/rc->buffer_size;
-            if(h->sh.i_type != SLICE_TYPE_I)
-                headroom /= 2;
-            rc_tol += headroom;
-
-            if( !rc->b_vbv_min_rate )
-                i_qp_min = X264_MAX( i_qp_min, h->sh.i_qp );
-
-            while( rc->qpm < i_qp_max
-                   && (b1 > rc->frame_size_planned * rc_tol
-                    || (rc->buffer_fill - b1 < buffer_left_planned * 0.5)))
-            {
-                rc->qpm ++;
-                b1 = predict_row_size_sum( h, y, rc->qpm );
-            }
+            rc->qpm ++;
+            b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+        }
 
-            /* avoid VBV underflow */
-            while( (rc->qpm < h->param.rc.i_qp_max)
-                   && (rc->buffer_fill - b1 < rc->buffer_size * 0.005))
-            {
-                rc->qpm ++;
-                b1 = predict_row_size_sum( h, y, rc->qpm );
-            }
+        while( rc->qpm > i_qp_min
+               && (rc->qpm > h->fdec->i_row_qp[0] || rc->single_frame_vbv)
+               && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
+               || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
+        {
+            rc->qpm --;
+            b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
+        }
 
-            while( rc->qpm > i_qp_min
-                   && rc->qpm > h->fdec->i_row_qp[0]
-                   && ((b1 < rc->frame_size_planned * 0.8 && rc->qpm <= prev_row_qp)
-                     || b1 < (rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 1.1) )
-            {
-                rc->qpm --;
-                b1 = predict_row_size_sum( h, y, rc->qpm );
-            }
-            x264_ratecontrol_set_estimated_size(h, b1);
+        /* avoid VBV underflow */
+        while( (rc->qpm < h->param.rc.i_qp_max)
+               && (rc->buffer_fill - b1 < rc->buffer_rate * max_frame_error) )
+        {
+            rc->qpm ++;
+            b1 = predict_row_size_sum( h, y, rc->qpm );
+            b1 += X264_MAX( size_of_other_slices * b1 / slice_size_planned, size_of_other_slices );
         }
+
+        x264_ratecontrol_set_estimated_size(h, b1);
     }
+
     /* loses the fractional part of the frame-wise qp */
     rc->f_qpm = rc->qpm;
 }
@@ -1018,8 +1274,8 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
              * So just calculate the average QP used so far. */
             int i;
 
-            h->param.rc.i_qp_constant = (h->stat.i_slice_count[SLICE_TYPE_P] == 0) ? 24
-                                      : 1 + h->stat.f_slice_qp[SLICE_TYPE_P] / h->stat.i_slice_count[SLICE_TYPE_P];
+            h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24
+                                      : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
             rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
             rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, 51 );
             rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, 51 );
@@ -1042,27 +1298,23 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
             }
             return X264_TYPE_AUTO;
         }
-        switch( rc->entry[frame_num].pict_type )
-        {
-            case SLICE_TYPE_I:
-                return rc->entry[frame_num].kept_as_ref ? X264_TYPE_IDR : X264_TYPE_I;
-
-            case SLICE_TYPE_B:
-                return rc->entry[frame_num].kept_as_ref ? X264_TYPE_BREF : X264_TYPE_B;
-
-            case SLICE_TYPE_P:
-            default:
-                return X264_TYPE_P;
-        }
+        return rc->entry[frame_num].frame_type;
     }
     else
-    {
         return X264_TYPE_AUTO;
-    }
+}
+
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm )
+{
+    ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame];
+    if( h->param.analyse.i_weighted_pred <= 0 )
+        return;
+    if( rce->i_weight_denom >= 0 )
+        SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0], rce->i_weight_denom, rce->weight[1] );
 }
 
 /* After encoding one frame, save stats and update ratecontrol state */
-void x264_ratecontrol_end( x264_t *h, int bits )
+int x264_ratecontrol_end( x264_t *h, int bits )
 {
     x264_ratecontrol_t *rc = h->rc;
     const int *mbs = h->stat.frame.i_mb_count;
@@ -1090,8 +1342,8 @@ void x264_ratecontrol_end( x264_t *h, int bits )
                         ( dir_frame>0 ? 's' : dir_frame<0 ? 't' :
                           dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' )
                         : '-';
-        fprintf( rc->p_stat_file_out,
-                 "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c;\n",
+        if( fprintf( rc->p_stat_file_out,
+                 "in:%d out:%d type:%c q:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:",
                  h->fenc->i_frame, h->i_frame,
                  c_type, rc->qpa_rc,
                  h->stat.frame.i_tex_bits,
@@ -1100,7 +1352,43 @@ void x264_ratecontrol_end( x264_t *h, int bits )
                  h->stat.frame.i_mb_count_i,
                  h->stat.frame.i_mb_count_p,
                  h->stat.frame.i_mb_count_skip,
-                 c_direct);
+                 c_direct) < 0 )
+            goto fail;
+
+        /* Only write information for reference reordering once. */
+        int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1;
+        for( i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref0); i++ )
+        {
+            int refcount = use_old_stats         ? rc->rce->refcount[i]
+                         : h->param.b_interlaced ? h->stat.frame.i_mb_count_ref[0][i*2]
+                                                 + h->stat.frame.i_mb_count_ref[0][i*2+1]
+                         :                         h->stat.frame.i_mb_count_ref[0][i];
+            if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 )
+                goto fail;
+        }
+
+        if( h->sh.weight[0][0].weightfn )
+        {
+            if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32, h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 )
+                goto fail;
+        }
+
+        if( fprintf( rc->p_stat_file_out, ";\n") < 0 )
+            goto fail;
+
+        /* Don't re-write the data in multi-pass mode. */
+        if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
+        {
+            uint8_t i_type = h->sh.i_type;
+            int i;
+            /* Values are stored as big-endian FIX8.8 */
+            for( i = 0; i < h->mb.i_mb_count; i++ )
+                rc->qp_buffer[0][i] = endian_fix16( h->fenc->f_qp_offset[i]*256.0 );
+            if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
+                goto fail;
+            if( fwrite( rc->qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
+                goto fail;
+        }
     }
 
     if( rc->b_abr )
@@ -1116,9 +1404,6 @@ void x264_ratecontrol_end( x264_t *h, int bits )
         rc->cplxr_sum *= rc->cbr_decay;
         rc->wanted_bits_window += rc->bitrate / rc->fps;
         rc->wanted_bits_window *= rc->cbr_decay;
-
-        if( h->param.i_threads == 1 )
-            accum_p_qp_update( h, rc->qpa_rc );
     }
 
     if( rc->b_2pass )
@@ -1131,20 +1416,20 @@ void x264_ratecontrol_end( x264_t *h, int bits )
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             rc->bframe_bits += bits;
-            if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) )
+            if( h->fenc->b_last_minigop_bframe )
             {
                 update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc),
                                   h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes );
-                /* In some cases, such as completely blank scenes, pred_b_from_p can go nuts */
-                /* Hackily cap the predictor coeff in case this happens. */
-                /* FIXME FIXME FIXME */
-                rc->pred_b_from_p->coeff = X264_MIN( rc->pred_b_from_p->coeff, 10. );
                 rc->bframe_bits = 0;
             }
         }
     }
 
     update_vbv( h, bits );
+    return 0;
+fail:
+    x264_log(h, X264_LOG_ERROR, "ratecontrol_end: stats file could not be written to\n");
+    return -1;
 }
 
 /****************************************************************************
@@ -1160,11 +1445,11 @@ static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor
     double q;
     x264_zone_t *zone = get_zone( h, frame_num );
 
-    q = pow( rce->blurred_complexity, 1 - h->param.rc.f_qcompress );
+    q = pow( rce->blurred_complexity, 1 - rcc->qcompress );
 
     // avoid NaN's in the rc_eq
     if(!isfinite(q) || rce->tex_bits + rce->mv_bits == 0)
-        q = rcc->last_qscale;
+        q = rcc->last_qscale_for[rce->pict_type];
     else
     {
         rcc->last_rceq = q;
@@ -1252,17 +1537,28 @@ static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q)
 
 static double predict_size( predictor_t *p, double q, double var )
 {
-     return p->coeff*var / (q*p->count);
+     return (p->coeff*var + p->offset) / (q*p->count);
 }
 
 static void update_predictor( predictor_t *p, double q, double var, double bits )
 {
+    const double range = 1.5;
     if( var < 10 )
         return;
-    p->count *= p->decay;
-    p->coeff *= p->decay;
-    p->count ++;
-    p->coeff += bits*q / var;
+    double old_coeff = p->coeff / p->count;
+    double new_coeff = bits*q / var;
+    double new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range );
+    double new_offset = bits*q - new_coeff_clipped * var;
+    if( new_offset >= 0 )
+        new_coeff = new_coeff_clipped;
+    else
+        new_offset = 0;
+    p->count  *= p->decay;
+    p->coeff  *= p->decay;
+    p->offset *= p->decay;
+    p->count  ++;
+    p->coeff  += new_coeff;
+    p->offset += new_offset;
 }
 
 // update VBV after encoding a frame
@@ -1277,30 +1573,34 @@ static void update_vbv( x264_t *h, int bits )
     if( !rcc->b_vbv )
         return;
 
-    rct->buffer_fill_final += rct->buffer_rate - bits;
+    rct->buffer_fill_final -= bits;
     if( rct->buffer_fill_final < 0 )
-        x264_log( h, X264_LOG_WARNING, "VBV underflow (%.0f bits)\n", rct->buffer_fill_final );
-    rct->buffer_fill_final = x264_clip3f( rct->buffer_fill_final, 0, rct->buffer_size );
+        x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, rct->buffer_fill_final );
+    rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 );
+    rct->buffer_fill_final += rct->buffer_rate;
+    rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, rct->buffer_size );
 }
 
 // provisionally update VBV according to the planned size of all frames currently in progress
-static void update_vbv_plan( x264_t *h )
+static void update_vbv_plan( x264_t *h, int overhead )
 {
     x264_ratecontrol_t *rcc = h->rc;
-    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final;
-    if( h->param.i_threads > 1 )
+    rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final - overhead;
+    if( h->i_thread_frames > 1 )
     {
         int j = h->rc - h->thread[0]->rc;
         int i;
-        for( i=1; i<h->param.i_threads; i++ )
+        for( i=1; i<h->i_thread_frames; i++ )
         {
-            x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+            x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
             double bits = t->rc->frame_size_planned;
             if( !t->b_thread_active )
                 continue;
             bits  = X264_MAX(bits, x264_ratecontrol_get_estimated_size(t));
-            rcc->buffer_fill += rcc->buffer_rate - bits;
-            rcc->buffer_fill = x264_clip3( rcc->buffer_fill, 0, rcc->buffer_size );
+            rcc->buffer_fill -= bits;
+            rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 );
+            rcc->buffer_fill += rcc->buffer_rate;
+            rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size );
         }
     }
 }
@@ -1314,49 +1614,104 @@ static double clip_qscale( x264_t *h, int pict_type, double q )
     double q0 = q;
 
     /* B-frames are not directly subject to VBV,
-     * since they are controlled by the P-frames' QPs.
-     * FIXME: in 2pass we could modify previous frames' QP too,
-     *        instead of waiting for the buffer to fill */
-    if( rcc->b_vbv &&
-        ( pict_type == SLICE_TYPE_P ||
-          ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) )
-    {
-        if( rcc->buffer_fill/rcc->buffer_size < 0.5 )
-            q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
-    }
+     * since they are controlled by the P-frames' QPs. */
 
     if( rcc->b_vbv && rcc->last_satd > 0 )
     {
-        /* Now a hard threshold to make sure the frame fits in VBV.
-         * This one is mostly for I-frames. */
-        double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
-        double qf = 1.0;
-        if( bits > rcc->buffer_fill/2 )
-            qf = x264_clip3f( rcc->buffer_fill/(2*bits), 0.2, 1.0 );
-        q /= qf;
-        bits *= qf;
-        if( bits < rcc->buffer_rate/2 )
-            q *= bits*2/rcc->buffer_rate;
-        q = X264_MAX( q0, q );
+        /* Lookahead VBV: raise the quantizer as necessary such that no frames in
+         * the lookahead overflow and such that the buffer is in a reasonable state
+         * by the end of the lookahead. */
+        if( h->param.rc.i_lookahead )
+        {
+            int j, iterations, terminate = 0;
+
+            /* Avoid an infinite loop. */
+            for( iterations = 0; iterations < 1000 && terminate != 3; iterations++ )
+            {
+                double frame_q[3];
+                double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+                double buffer_fill_cur = rcc->buffer_fill - cur_bits;
+                double target_fill;
+                frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q;
+                frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor;
+                frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor;
+
+                /* Loop over the planned future frames. */
+                for( j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ )
+                {
+                    buffer_fill_cur += rcc->buffer_rate;
+                    int i_type = h->fenc->i_planned_type[j];
+                    int i_satd = h->fenc->i_planned_satd[j];
+                    if( i_type == X264_TYPE_AUTO )
+                        break;
+                    i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? SLICE_TYPE_B : SLICE_TYPE_P;
+                    cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd );
+                    buffer_fill_cur -= cur_bits;
+                }
+                /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
+                target_fill = X264_MIN( rcc->buffer_fill + j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.5 );
+                if( buffer_fill_cur < target_fill )
+                {
+                    q *= 1.01;
+                    terminate |= 1;
+                    continue;
+                }
+                /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */
+                target_fill = x264_clip3f( rcc->buffer_fill - j * rcc->buffer_rate * 0.5, rcc->buffer_size * 0.8, rcc->buffer_size );
+                if( rcc->b_vbv_min_rate && buffer_fill_cur > target_fill )
+                {
+                    q /= 1.01;
+                    terminate |= 2;
+                    continue;
+                }
+                break;
+            }
+        }
+        /* Fallback to old purely-reactive algorithm: no lookahead. */
+        else
+        {
+            if( ( pict_type == SLICE_TYPE_P ||
+                ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) &&
+                rcc->buffer_fill/rcc->buffer_size < 0.5 )
+            {
+                q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 );
+            }
+
+            /* Now a hard threshold to make sure the frame fits in VBV.
+             * This one is mostly for I-frames. */
+            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+            double qf = 1.0;
+            /* For small VBVs, allow the frame to use up the entire VBV. */
+            double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1;
+            /* For single-frame VBVs, request that the frame use up the entire VBV. */
+            double min_fill_factor = rcc->single_frame_vbv ? 1 : 2;
+
+            if( bits > rcc->buffer_fill/max_fill_factor )
+                qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 );
+            q /= qf;
+            bits *= qf;
+            if( bits < rcc->buffer_rate/min_fill_factor )
+                q *= bits*min_fill_factor/rcc->buffer_rate;
+            q = X264_MAX( q0, q );
+        }
 
         /* Check B-frame complexity, and use up any bits that would
          * overflow before the next P-frame. */
-        if( h->sh.i_type == SLICE_TYPE_P )
+        if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv )
         {
             int nb = rcc->bframes;
+            double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
             double pbbits = bits;
             double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd );
             double space;
-
-            if( bbits > rcc->buffer_rate )
+            if( bbits > rcc->buffer_rate  )
                 nb = 0;
             pbbits += nb * bbits;
 
             space = rcc->buffer_fill + (1+nb)*rcc->buffer_rate - rcc->buffer_size;
             if( pbbits < space )
             {
-                q *= X264_MAX( pbbits / space,
-                               bits / (0.5 * rcc->buffer_size) );
+                q *= X264_MAX( pbbits / space, bits / (0.5 * rcc->buffer_size) );
             }
             q = X264_MAX( q0-5, q );
         }
@@ -1389,9 +1744,9 @@ static float rate_estimate_qscale( x264_t *h )
     int pict_type = h->sh.i_type;
     double lmin = rcc->lmin[pict_type];
     double lmax = rcc->lmax[pict_type];
-    int64_t total_bits = 8*(h->stat.i_slice_size[SLICE_TYPE_I]
-                          + h->stat.i_slice_size[SLICE_TYPE_P]
-                          + h->stat.i_slice_size[SLICE_TYPE_B]);
+    int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I]
+                          + h->stat.i_frame_size[SLICE_TYPE_P]
+                          + h->stat.i_frame_size[SLICE_TYPE_B]);
 
     if( rcc->b_2pass )
     {
@@ -1434,14 +1789,20 @@ static float rate_estimate_qscale( x264_t *h )
         else
             q += rcc->pb_offset;
 
-        rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
+        if( rcc->b_2pass && rcc->b_vbv )
+            rcc->frame_size_planned = qscale2bits( &rce, q );
+        else
+            rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref1[h->i_ref1-1]->i_satd );
         x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
-        rcc->last_satd = 0;
+
+        /* For row SATDs */
+        if( rcc->b_vbv )
+            rcc->last_satd = x264_rc_analyse_slice( h );
         return qp2qscale(q);
     }
     else
     {
-        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate;
+        double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate * h->i_thread_frames;
 
         if( rcc->b_2pass )
         {
@@ -1451,13 +1812,13 @@ static float rate_estimate_qscale( x264_t *h )
 
             if( rcc->b_vbv )
             {
-                if( h->param.i_threads > 1 )
+                if( h->i_thread_frames > 1 )
                 {
                     int j = h->rc - h->thread[0]->rc;
                     int i;
-                    for( i=1; i<h->param.i_threads; i++ )
+                    for( i=1; i<h->i_thread_frames; i++ )
                     {
-                        x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+                        x264_t *t = h->thread[ (j+i)%h->i_thread_frames ];
                         double bits = t->rc->frame_size_planned;
                         if( !t->b_thread_active )
                             continue;
@@ -1468,16 +1829,16 @@ static float rate_estimate_qscale( x264_t *h )
             }
             else
             {
-                if( h->fenc->i_frame < h->param.i_threads )
+                if( h->fenc->i_frame < h->i_thread_frames )
                     predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps;
                 else
-                    predicted_bits += (int64_t)(h->param.i_threads - 1) * rcc->bitrate / rcc->fps;
+                    predicted_bits += (int64_t)(h->i_thread_frames - 1) * rcc->bitrate / rcc->fps;
             }
 
             diff = predicted_bits - (int64_t)rce.expected_bits;
             q = rce.new_qscale;
             q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
-            if( ((h->fenc->i_frame + 1 - h->param.i_threads) >= rcc->fps) &&
+            if( ((h->fenc->i_frame + 1 - h->i_thread_frames) >= rcc->fps) &&
                 (rcc->expected_bits_sum > 0))
             {
                 /* Adjust quant based on the difference between
@@ -1505,7 +1866,7 @@ static float rate_estimate_qscale( x264_t *h )
                     expected_size = qscale2bits(&rce, q);
                     expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size;
                 }
-                rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
+                rcc->last_satd = x264_rc_analyse_slice( h );
             }
             q = x264_clip3f( q, lmin, lmax );
         }
@@ -1523,7 +1884,7 @@ static float rate_estimate_qscale( x264_t *h )
 
             double wanted_bits, overflow=1, lmin, lmax;
 
-            rcc->last_satd = x264_stack_align( x264_rc_analyse_slice, h );
+            rcc->last_satd = x264_rc_analyse_slice( h );
             rcc->short_term_cplxsum *= 0.5;
             rcc->short_term_cplxcount *= 0.5;
             rcc->short_term_cplxsum += rcc->last_satd;
@@ -1544,7 +1905,7 @@ static float rate_estimate_qscale( x264_t *h )
             }
             else
             {
-                int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads;
+                int i_frame_done = h->fenc->i_frame + 1 - h->i_thread_frames;
 
                 q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
 
@@ -1578,10 +1939,11 @@ static float rate_estimate_qscale( x264_t *h )
 
                 q = x264_clip3f(q, lmin, lmax);
             }
-            else if( h->param.rc.i_rc_method == X264_RC_CRF )
+            else if( h->param.rc.i_rc_method == X264_RC_CRF && rcc->qcompress != 1 )
             {
                 q = qp2qscale( ABR_INIT_QP ) / fabs( h->param.rc.f_ip_factor );
             }
+            rcc->qp_novbv = qscale2qp(q);
 
             //FIXME use get_diff_limited_q() ?
             q = clip_qscale( h, pict_type, q );
@@ -1591,17 +1953,73 @@ static float rate_estimate_qscale( x264_t *h )
         rcc->last_qscale = q;
 
         if( !(rcc->b_2pass && !rcc->b_vbv) && h->fenc->i_frame == 0 )
-            rcc->last_qscale_for[SLICE_TYPE_P] = q;
+            rcc->last_qscale_for[SLICE_TYPE_P] = q * fabs( h->param.rc.f_ip_factor );
 
-        if( rcc->b_2pass && rcc->b_vbv)
+        if( rcc->b_2pass && rcc->b_vbv )
             rcc->frame_size_planned = qscale2bits(&rce, q);
         else
             rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd );
+
+        /* Always use up the whole VBV in this case. */
+        if( rcc->single_frame_vbv )
+            rcc->frame_size_planned = rcc->buffer_rate;
         x264_ratecontrol_set_estimated_size(h, rcc->frame_size_planned);
         return q;
     }
 }
 
+/* Copy h->rc into each thread's rc and plan how much of the frame each thread's slice gets. */
+void x264_threads_distribute_ratecontrol( x264_t *h )
+{
+    int i, row, totalsize = 0;
+    if( h->rc->b_vbv )
+        for( row = 0; row < h->sps->i_mb_height; row++ )
+            totalsize += h->fdec->i_row_satd[row];
+    for( i = 0; i < h->param.i_threads; i++ )
+    {
+        x264_t *t = h->thread[i];
+        if( t->rc != h->rc ) /* memcpy with identical source and destination is undefined */
+            memcpy( t->rc, h->rc, sizeof(x264_ratecontrol_t) );
+        t->rc->slice_size_planned = 0;
+        /* Split frame_size_planned by row-SATD share; totalsize == 0 would divide by zero. */
+        if( h->rc->b_vbv && h->rc->frame_size_planned && totalsize )
+        {
+            int size = 0;
+            for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ )
+                size += h->fdec->i_row_satd[row];
+            t->rc->slice_size_planned = size * h->rc->frame_size_planned / totalsize;
+        }
+    }
+}
+
+/* Fold the rate-control results of threads 1..i_threads-1 into h->rc. */
+void x264_threads_merge_ratecontrol( x264_t *h )
+{
+    int i, j, k;
+    x264_ratecontrol_t *rc = h->rc;
+    x264_emms();
+    /* QP accumulators: add every other thread's sums on top of h->rc's own. */
+    for( i = 1; i < h->param.i_threads; i++ )
+    {
+        rc->qpa_rc += h->thread[i]->rc->qpa_rc;
+        rc->qpa_aq += h->thread[i]->rc->qpa_aq;
+    }
+    /* Row predictors: sum each field across the threads, then divide by the thread count. */
+    for( j = 0; j < 5; j++ )
+        for( k = 0; k < 2; k++ )
+        {
+            for( i = 1; i < h->param.i_threads; i++ )
+            {
+                rc->row_preds[j][k].coeff += h->thread[i]->rc->row_preds[j][k].coeff;
+                rc->row_preds[j][k].offset += h->thread[i]->rc->row_preds[j][k].offset;
+                rc->row_preds[j][k].count += h->thread[i]->rc->row_preds[j][k].count;
+            }
+            rc->row_preds[j][k].coeff /= h->param.i_threads;
+            rc->row_preds[j][k].offset /= h->param.i_threads;
+            rc->row_preds[j][k].count /= h->param.i_threads;
+        }
+}
+
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
 {
     if( cur != prev )
@@ -1621,6 +2039,7 @@ void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next )
         COPY(short_term_cplxcount);
         COPY(bframes);
         COPY(prev_zone);
+        COPY(qpbuf_pos);
 #undef COPY
     }
     if( cur != next )
@@ -1704,7 +2123,7 @@ static double count_expected_bits( x264_t *h )
     return expected_bits;
 }
 
-static void vbv_pass2( x264_t *h )
+static int vbv_pass2( x264_t *h )
 {
     /* for each interval of buffer_full .. underflow, uniformly increase the qp of all
      * frames in the interval until either buffer is full at some intermediate frame or the
@@ -1712,7 +2131,7 @@ static void vbv_pass2( x264_t *h )
      * Then do the converse to put bits back into overflow areas until target size is met */
 
     x264_ratecontrol_t *rcc = h->rc;
-    double *fills = x264_malloc((rcc->num_entries+1)*sizeof(double));
+    double *fills;
     double all_available_bits = h->param.rc.i_bitrate * 1000. * rcc->num_entries / rcc->fps;
     double expected_bits = 0;
     double adjustment;
@@ -1722,6 +2141,7 @@ static void vbv_pass2( x264_t *h )
     double qscale_max = qp2qscale(h->param.rc.i_qp_max);
     int iterations = 0;
     int adj_min, adj_max;
+    CHECKED_MALLOC( fills, (rcc->num_entries+1)*sizeof(double) );
 
     fills++;
 
@@ -1753,7 +2173,7 @@ static void vbv_pass2( x264_t *h )
             adj_max = fix_underflow(h, t0, t1, 1.001, qscale_min, qscale_max);
 
         expected_bits = count_expected_bits(h);
-    } while((expected_bits < .995*all_available_bits) && ((int)(expected_bits+.5) > (int)(prev_bits+.5)) );
+    } while((expected_bits < .995*all_available_bits) && ((int64_t)(expected_bits+.5) > (int64_t)(prev_bits+.5)) );
 
     if (!adj_max)
         x264_log( h, X264_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n");
@@ -1763,6 +2183,9 @@ static void vbv_pass2( x264_t *h )
         rcc->entry[i].expected_vbv = rcc->buffer_size - fills[i];
 
     x264_free(fills-1);
+    return 0;
+fail:
+    return -1;
 }
 
 static int init_pass2( x264_t *h )
@@ -1830,9 +2253,9 @@ static int init_pass2( x264_t *h )
         rce->blurred_complexity = cplx_sum / weight_sum;
     }
 
-    qscale = x264_malloc(sizeof(double)*rcc->num_entries);
-    if(filter_size > 1)
-        blurred_qscale = x264_malloc(sizeof(double)*rcc->num_entries);
+    CHECKED_MALLOC( qscale, sizeof(double)*rcc->num_entries );
+    if( filter_size > 1 )
+        CHECKED_MALLOC( blurred_qscale, sizeof(double)*rcc->num_entries );
     else
         blurred_qscale = qscale;
 
@@ -1845,7 +2268,11 @@ static int init_pass2( x264_t *h )
 
     expected_bits = 1;
     for(i=0; i<rcc->num_entries; i++)
-        expected_bits += qscale2bits(&rcc->entry[i], get_qscale(h, &rcc->entry[i], 1.0, i));
+    {
+        double q = get_qscale(h, &rcc->entry[i], 1.0, i);
+        expected_bits += qscale2bits(&rcc->entry[i], q);
+        rcc->last_qscale_for[rcc->entry[i].pict_type] = q;
+    }
     step_mult = all_available_bits / expected_bits;
 
     rate_factor = 0;
@@ -1862,6 +2289,7 @@ static int init_pass2( x264_t *h )
         for(i=0; i<rcc->num_entries; i++)
         {
             qscale[i] = get_qscale(h, &rcc->entry[i], rate_factor, i);
+            rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i];
         }
 
         /* fixed I/B qscale relative to P */
@@ -1914,7 +2342,8 @@ static int init_pass2( x264_t *h )
         x264_free(blurred_qscale);
 
     if(rcc->b_vbv)
-        vbv_pass2(h);
+        if( vbv_pass2( h ) )
+            return -1;
     expected_bits = count_expected_bits(h);
 
     if(fabs(expected_bits/all_available_bits - 1.0) > 0.01)
@@ -1949,6 +2378,6 @@ static int init_pass2( x264_t *h )
     }
 
     return 0;
+fail:
+    return -1;
 }
-
-
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 3310d3c..5a8d088 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -29,16 +29,22 @@ void x264_ratecontrol_delete( x264_t * );
 
 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
 void x264_adaptive_quant( x264_t * );
+int  x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame );
+int  x264_reference_build_list_optimal( x264_t *h );
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
-void x264_ratecontrol_start( x264_t *, int i_force_qp );
+void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead );
 int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
+void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm );
 void x264_ratecontrol_mb( x264_t *, int bits );
 int  x264_ratecontrol_qp( x264_t * );
-void x264_ratecontrol_end( x264_t *, int bits );
+int  x264_ratecontrol_end( x264_t *, int bits );
 void x264_ratecontrol_summary( x264_t * );
 void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
 int  x264_ratecontrol_get_estimated_size( x264_t const *);
 int  x264_rc_analyse_slice( x264_t *h );
+int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
+void x264_threads_distribute_ratecontrol( x264_t *h );
+void x264_threads_merge_ratecontrol( x264_t *h );
 
 #endif
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 480d71a..3ed4a47 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -52,10 +52,9 @@ static uint16_t cabac_size_5ones[128];
 #undef  x264_cabac_encode_decision_noup
 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
-#define x264_cabac_encode_terminal(c)     x264_cabac_size_decision_noup(c,276,0)
+#define x264_cabac_encode_terminal(c)     ((c)->f8_bits_encoded += 7)
 #define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
-#define x264_cabac_encode_flush(h,c)
 #define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
 #include "cabac.c"
 
@@ -105,7 +104,7 @@ static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
 
 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 {
-    DECLARE_ALIGNED_16(static uint8_t zero[16]);
+    ALIGNED_16(static uint8_t zero[16]);
     int satd = 0;
     uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
     uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@@ -124,16 +123,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
             satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
         }
-        satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
+        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
     }
     return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
 }
 
 static inline int ssd_mb( x264_t *h )
 {
-    return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
-         + ssd_plane(h, PIXEL_8x8,   1, 0, 0)
-         + ssd_plane(h, PIXEL_8x8,   2, 0, 0);
+    int chromassd = ssd_plane(h, PIXEL_8x8, 1, 0, 0) + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
+    chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+    return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chromassd;
 }
 
 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
@@ -141,6 +140,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
     int b_transform_bak = h->mb.b_transform_8x8;
     int i_ssd;
     int i_bits;
+    int type_bak = h->mb.i_type;
 
     x264_macroblock_encode( h );
 
@@ -159,13 +159,12 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
     }
     else
     {
-        bs_t bs_tmp = h->out.bs;
-        bs_tmp.i_bits_encoded = 0;
-        x264_macroblock_size_cavlc( h, &bs_tmp );
-        i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
+        x264_macroblock_size_cavlc( h );
+        i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8;
     }
 
     h->mb.b_transform_8x8 = b_transform_bak;
+    h->mb.i_type = type_bak;
 
     return i_ssd + i_bits;
 }
@@ -203,12 +202,11 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
 {
     uint64_t i_ssd, i_bits;
     int i8 = i4 >> 2;
+    int chromassd;
 
     if( i_pixel == PIXEL_16x16 )
     {
-        int type_bak = h->mb.i_type;
         int i_cost = x264_rd_cost_mb( h, i_lambda2 );
-        h->mb.i_type = type_bak;
         return i_cost;
     }
 
@@ -223,9 +221,10 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
     if( i_pixel == PIXEL_8x16 )
         x264_macroblock_encode_p8x8( h, i8+2 );
 
-    i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 )
-          + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
-          + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+    chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
+              + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
+    chromassd = (chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
+    i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 ) + chromassd;
 
     if( h->param.b_cabac )
     {
@@ -357,31 +356,6 @@ void x264_rdo_init( void )
     }
 }
 
-// should the intra and inter lambdas be different?
-// I'm just matching the behaviour of deadzone quant.
-static const int lambda2_tab[2][52] = {
-    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    46,      58,      73,      92,     117,     147,
-        185,     233,     294,     370,     466,     587,
-        740,     932,    1174,    1480,    1864,    2349,
-       2959,    3728,    4697,    5918,    7457,    9395,
-      11837,   14914,   18790,   23674,   29828,   37581,
-      47349,   59656,   75163,   94699,  119313,  150326,
-     189399,  238627,  300652,  378798,  477255,  601304,
-     757596,  954511, 1202608, 1515192, 1909022, 2405217,
-    3030384, 3818045, 4810435, 6060769 },
-    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    27,      34,      43,      54,      68,      86,
-        108,     136,     172,     216,     273,     343,
-        433,     545,     687,     865,    1090,    1374,
-       1731,    2180,    2747,    3461,    4361,    5494,
-       6922,    8721,   10988,   13844,   17442,   21976,
-      27688,   34885,   43953,   55377,   69771,   87906,
-     110755,  139543,  175813,  221511,  279087,  351627,
-     443023,  558174,  703255,  886046, 1116348, 1406511,
-    1772093, 2232697, 2813022, 3544186 }
-};
-
 typedef struct {
     int64_t score;
     int level_idx; // index into level_tree[]
@@ -540,7 +514,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
             /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
             if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
             {
-                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
+                int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];
                 int predicted_coef = orig_coef - i_coef * signs[i];
                 int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
                 int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
@@ -584,7 +558,14 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
                     n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
                 }
 
-                n.score += ssd;
+                if( j || i || dc )
+                    n.score += ssd;
+                /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
+                else
+                {
+                    d = i_coef * signs[0] - ((unquant_abs_level * signs[0] + 8)&~15);
+                    n.score += (int64_t)d*d * coef_weight[i];
+                }
 
                 /* save the node if it's better than any existing node with the same cabac ctx */
                 if( n.score < nodes_cur[node_ctx].score )
@@ -610,11 +591,13 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
     }
 
     j = bnode->level_idx;
-    for( i = b_ac; i < i_coefs; i++ )
+    for( i = b_ac; j; i++ )
     {
         dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
         j = level_tree[j].next;
     }
+    for( ; i < i_coefs; i++ )
+        dct[zigzag[i]] = 0;
 
     return 1;
 }
@@ -622,32 +605,32 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 
 int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
-                            int i_qp, int i_ctxBlockCat, int b_intra )
+                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
 {
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
 }
 
-int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
-                             int i_qp, int i_ctxBlockCat, int b_intra, int idx )
+int x264_quant_4x4_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
+                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
+        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
 }
 
-int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
+int x264_quant_8x8_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
                              int i_qp, int b_intra, int idx )
 {
-    return quant_trellis_cabac( h, (int16_t*)dct,
+    return quant_trellis_cabac( h, dct,
         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
         x264_zigzag_scan8[h->mb.b_interlaced],
-        DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );
+        DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
 }
 
diff --git a/encoder/set.c b/encoder/set.c
index 3103fcd..f79919b 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -24,9 +24,6 @@
 #include <math.h>
 
 #include "common/common.h"
-#ifndef _MSC_VER
-#include "config.h"
-#endif
 #include "set.h"
 
 #define bs_write_ue bs_write_ue_big
@@ -83,7 +80,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
         sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
     else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
         sps->i_profile_idc  = PROFILE_HIGH;
-    else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced )
+    else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->analyse.i_weighted_pred > 0 )
         sps->i_profile_idc  = PROFILE_MAIN;
     else
         sps->i_profile_idc  = PROFILE_BASELINE;
@@ -97,11 +94,9 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     sps->b_constraint_set2  = 0;
 
     sps->i_log2_max_frame_num = 4;  /* at least 4 */
-    while( (1 << sps->i_log2_max_frame_num) <= param->i_keyint_max )
-    {
+    while( (1 << sps->i_log2_max_frame_num) <= param->i_keyint_max && sps->i_log2_max_frame_num < 10 )
         sps->i_log2_max_frame_num++;
-    }
-    sps->i_log2_max_frame_num++;    /* just in case */
+    sps->i_log2_max_frame_num++;
 
     sps->i_poc_type = 0;
     if( sps->i_poc_type == 0 )
@@ -185,19 +180,21 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     }
 
     sps->vui.b_timing_info_present = 0;
-    if( param->i_fps_num > 0 && param->i_fps_den > 0)
+    if( param->i_timebase_num > 0 && param->i_timebase_den > 0 )
     {
         sps->vui.b_timing_info_present = 1;
-        sps->vui.i_num_units_in_tick = param->i_fps_den;
-        sps->vui.i_time_scale = param->i_fps_num * 2;
-        sps->vui.b_fixed_frame_rate = 1;
+        sps->vui.i_num_units_in_tick = param->i_timebase_num;
+        sps->vui.i_time_scale = param->i_timebase_den * 2;
+        sps->vui.b_fixed_frame_rate = !param->b_vfr_input;
     }
 
-    sps->vui.i_num_reorder_frames = param->b_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
+    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
     /* extra slot with pyramid so that we don't have to override the
      * order of forgetting old pictures */
     sps->vui.i_max_dec_frame_buffering =
-    sps->i_num_ref_frames = X264_MIN(16, X264_MAX(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames));
+    sps->i_num_ref_frames = X264_MIN(16, X264_MAX3(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
+                                                   param->i_bframe_pyramid ? 4 : 1 ));
+    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
 
     sps->vui.b_bitstream_restriction = 1;
     if( sps->vui.b_bitstream_restriction )
@@ -213,6 +210,7 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
 
 void x264_sps_write( bs_t *s, x264_sps_t *sps )
 {
+    bs_realign( s );
     bs_write( s, 8, sps->i_profile_idc );
     bs_write( s, 1, sps->b_constraint_set0 );
     bs_write( s, 1, sps->b_constraint_set1 );
@@ -362,6 +360,7 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
     }
 
     bs_rbsp_trailing( s );
+    bs_flush( s );
 }
 
 void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps )
@@ -372,13 +371,13 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
     pps->i_sps_id = sps->i_id;
     pps->b_cabac = param->b_cabac;
 
-    pps->b_pic_order = 0;
+    pps->b_pic_order = param->b_interlaced;
     pps->i_num_slice_groups = 1;
 
     pps->i_num_ref_idx_l0_active = 1;
     pps->i_num_ref_idx_l1_active = 1;
 
-    pps->b_weighted_pred = 0;
+    pps->b_weighted_pred = param->analyse.i_weighted_pred > 0;
     pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0;
 
     pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR ? 26 : param->rc.i_qp_constant;
@@ -386,7 +385,7 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
 
     pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset;
     pps->b_deblocking_filter_control = 1;
-    pps->b_constrained_intra_pred = 0;
+    pps->b_constrained_intra_pred = param->b_constrained_intra;
     pps->b_redundant_pic_cnt = 0;
 
     pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 1 : 0;
@@ -426,6 +425,7 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *
 
 void x264_pps_write( bs_t *s, x264_pps_t *pps )
 {
+    bs_realign( s );
     bs_write_ue( s, pps->i_id );
     bs_write_ue( s, pps->i_sps_id );
 
@@ -468,9 +468,29 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
     }
 
     bs_rbsp_trailing( s );
+    bs_flush( s );
 }
 
-void x264_sei_version_write( x264_t *h, bs_t *s )
+/* Write a recovery point SEI message for recovery_frame_cnt to s; h is not used here. */
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt )
+{
+    /* Payload is ue(recovery_frame_cnt) plus 4 flag bits, rounded up to whole bytes. */
+    int payload_bits = bs_size_ue( recovery_frame_cnt ) + 4;
+    int payload_bytes = (payload_bits + 7) / 8;
+
+    bs_realign( s );
+    bs_write( s, 8, 0x06 ); // payload_type = Recovery Point
+    bs_write( s, 8, payload_bytes ); // payload_size
+    bs_write_ue( s, recovery_frame_cnt ); // recovery_frame_cnt
+    bs_write( s, 1, 1 ); // exact_match_flag = 1
+    bs_write( s, 1, 0 ); // broken_link_flag = 0
+    bs_write( s, 2, 0 ); // changing_slice_group_idc = 0
+    bs_align_10( s );
+    bs_rbsp_trailing( s );
+    bs_flush( s );
+}
+
+int x264_sei_version_write( x264_t *h, bs_t *s )
 {
     int i;
     // random ID number generated according to ISO-11578
@@ -479,14 +499,19 @@ void x264_sei_version_write( x264_t *h, bs_t *s )
         0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
     };
     char *opts = x264_param2string( &h->param, 0 );
-    char *version = x264_malloc( 200 + strlen(opts) );
+    char *version;
     int length;
 
+    if( !opts )
+        return -1;
+    CHECKED_MALLOC( version, 200 + strlen( opts ) );
+
     sprintf( version, "x264 - core %d%s - H.264/MPEG-4 AVC codec - "
              "Copyleft 2003-2009 - http://www.videolan.org/x264.html - options: %s",
              X264_BUILD, X264_VERSION, opts );
     length = strlen(version)+1+16;
 
+    bs_realign( s );
     bs_write( s, 8, 0x5 ); // payload_type = user_data_unregistered
     // payload_size
     for( i = 0; i <= length-255; i += 255 )
@@ -499,9 +524,14 @@ void x264_sei_version_write( x264_t *h, bs_t *s )
         bs_write( s, 8, version[i] );
 
     bs_rbsp_trailing( s );
+    bs_flush( s );
 
     x264_free( opts );
     x264_free( version );
+    return 0;
+fail:
+    x264_free( opts );
+    return -1;
 }
 
 const x264_level_t x264_levels[] =
@@ -536,7 +566,7 @@ int x264_validate_levels( x264_t *h, int verbose )
 {
     int ret = 0;
     int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
-    int dpb = mbs * 384 * h->sps->i_num_ref_frames;
+    int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
     int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
 
     const x264_level_t *l = x264_levels;
@@ -550,7 +580,7 @@ int x264_validate_levels( x264_t *h, int verbose )
                h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size );
     if( dpb > l->dpb )
         ERROR( "DPB size (%d frames, %d bytes) > level limit (%d frames, %d bytes)\n",
-                h->sps->i_num_ref_frames, dpb, (int)(l->dpb / (384*mbs)), l->dpb );
+                h->sps->vui.i_max_dec_frame_buffering, dpb, (int)(l->dpb / (384*mbs)), l->dpb );
 
 #define CHECK( name, limit, val ) \
     if( (val) > (limit) ) \
diff --git a/encoder/set.h b/encoder/set.h
index 3611c9a..125f7e1 100644
--- a/encoder/set.h
+++ b/encoder/set.h
@@ -28,7 +28,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param );
 void x264_sps_write( bs_t *s, x264_sps_t *sps );
 void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps );
 void x264_pps_write( bs_t *s, x264_pps_t *pps );
-void x264_sei_version_write( x264_t *h, bs_t *s );
+void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt );
+int  x264_sei_version_write( x264_t *h, bs_t *s );
 int  x264_validate_levels( x264_t *h, int verbose );
 
 #endif
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 2c16429..057f6a6 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -3,8 +3,9 @@
  *****************************************************************************
  * Copyright (C) 2005-2008 x264 project
  *
- * Authors: Loren Merritt <lorenm at u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari at gmail.com>
+ * Authors: Jason Garrett-Glaser <darkshikari at gmail.com>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *          Dylan Yudaken <dyudaken at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,27 +23,227 @@
  *****************************************************************************/
 
 #include <math.h>
-#include <limits.h>
 
 #include "common/common.h"
 #include "common/cpu.h"
 #include "macroblock.h"
 #include "me.h"
 
+static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
+                                      x264_frame_t **frames, int p0, int p1, int b,
+                                      int b_intra_penalty );
 
 static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
 {
-    a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
+    a->i_qp = X264_LOOKAHEAD_QP;
     a->i_lambda = x264_lambda_tab[ a->i_qp ];
     x264_mb_analyse_load_costs( h, a );
-    h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
-    h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
+    if( h->param.analyse.i_subpel_refine > 1 )
+    {
+        h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method );
+        h->mb.i_subpel_refine = 4;
+    }
+    else
+    {
+        h->mb.i_me_method = X264_ME_DIA;
+        h->mb.i_subpel_refine = 2;
+    }
     h->mb.b_chroma_me = 0;
 }
 
+/* Converts a non-h264 weight (fix7, i.e. weight_nonh264/128) into an h264 weight (scale, denom, offset) in *w */
+static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_weight_t *w )
+{
+    w->i_offset = offset;
+    w->i_denom = 7;
+    w->i_scale = weight_nonh264;
+    while( w->i_denom > 0 && (w->i_scale > 127 || !(w->i_scale & 1)) ) /* reduce while scale is above 127 or even */
+    {
+        w->i_denom--;
+        w->i_scale >>= 1;
+    }
+    w->i_scale = X264_MIN( w->i_scale, 127 ); /* clamp if i_denom reached 0 with the scale still above 127 */
+}
+
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
+{ /* Stores the luma statistics read by x264_weights_analyse in frame->i_pixel_sum and frame->i_pixel_ssd. */
+    int x,y;
+    uint32_t sad = 0;
+    uint64_t ssd = 0;
+    uint8_t *p = frame->plane[0];
+    int stride = frame->i_stride[0];
+    int width = frame->i_width[0];
+    int height = frame->i_lines[0];
+    for( y = 0; y < height>>4; y++, p += stride*16 ) /* height>>4 rows of 16x16 blocks */
+        for( x = 0; x < width; x += 16 )
+        {
+            uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
+            sad += (uint32_t)res; /* low 32 bits of the packed result */
+            ssd += res >> 32; /* high 32 bits of the packed result */
+        }
+    frame->i_pixel_sum = sad;
+    frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height); /* ssd - sad*sad/(width*height), division rounded to nearest */
+}
+
+static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
+{ /* Returns the lowres luma to compare against fenc: motion-compensated into dest when MVs to ref exist, else ref->lowres[0]. */
+    int ref0_distance = fenc->i_frame - ref->i_frame - 1;
+    /* Note: this branch never runs during lookahead, as weights_analyse is only called there when no
+     * motion search has been done yet (0x7FFF is the value x264_slicetype_frame_cost tests before searching). */
+    if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
+    {
+        int i_stride = fenc->i_stride_lowres;
+        int i_lines = fenc->i_lines_lowres;
+        int i_width = fenc->i_width_lowres;
+        int i_mb_xy = 0;
+        int x,y;
+        uint8_t *p = dest;
+
+        for( y = 0; y < i_lines; y += 8, p += i_stride*8 ) /* walk the lowres plane in 8x8 blocks, one stored MV per block */
+            for( x = 0; x < i_width; x += 8, i_mb_xy++ )
+            {
+                int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
+                int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
+                h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
+                               mvx+(x<<2), mvy+(y<<2), 8, 8, weight_none ); /* block origin is scaled by 4 and added to the MV */
+            }
+        x264_emms();
+        return dest;
+    }
+    x264_emms();
+    return ref->lowres[0]; /* no MVs available: use the reference plane as-is */
+}
+
+static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w )
+{ /* Sums over 8x8 lowres blocks min( mbcmp of src (weighted by w if non-NULL) vs fenc, fenc->i_intra_cost ). */
+    int x, y;
+    unsigned int cost = 0;
+    int i_stride = fenc->i_stride_lowres;
+    int i_lines = fenc->i_lines_lowres;
+    int i_width = fenc->i_width_lowres;
+    uint8_t *fenc_plane = fenc->lowres[0];
+    ALIGNED_ARRAY_8( uint8_t, buf,[8*8] );
+    int pixoff = 0;
+    int i_mb = 0;
+
+    if( w )
+    {
+        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) /* pixoff restarts at the first pixel of each block row */
+            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
+            {
+                w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 ); /* buf is then compared against fenc below */
+                cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+            }
+        /* Add the cost of the weights in the slice header. */
+        int numslices;
+        if( h->param.i_slice_count )
+            numslices = h->param.i_slice_count;
+        else if( h->param.i_slice_max_mbs )
+            numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
+        else
+            numslices = 1;
+        /* FIXME: find a way to account for --slice-max-size?
+         * Multiply by 2 as there will be a duplicate. 10 bits are added because, if there is a weighted frame, an additional duplicate is used.
+         * Since lowres frames are used, assume lambda = 1. */
+        cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
+    }
+    else
+        for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) /* unweighted: compare src directly, no header cost */
+            for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+                cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
+    x264_emms();
+    return cost;
+}
+
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+{ /* Picks a luma weight (scale, denom, offset) for predicting fenc from ref and stores it in fenc->weight[0]. */
+    float fenc_mean, ref_mean, fenc_var, ref_var;
+    int i_off, offset_search;
+    int minoff, minscale, mindenom;
+    unsigned int minscore, origscore;
+    int i_delta_index = fenc->i_frame - ref->i_frame - 1;
+    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+    const float epsilon = 1.0/128.0;
+    float guess_scale;
+    int found;
+    x264_weight_t *weights = fenc->weight[0];
+
+    fenc_var = round( sqrt( fenc->i_pixel_ssd ) ); /* despite the name, holds the rounded square root of i_pixel_ssd */
+    ref_var  = round( sqrt(  ref->i_pixel_ssd ) );
+    fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+    ref_mean  = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]); /* uses fenc's dimensions; assumes ref has the same size - TODO confirm */
+
+    // early termination: means and spreads are close enough that weighting is not attempted
+    if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon ) /* NOTE(review): ref_var may be 0 here (float division) - confirm this is intended */
+    {
+        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+        return;
+    }
+
+    guess_scale = ref_var ? fenc_var/ref_var : 0;
+    x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[0] ); /* scale passed as fix7 (x128), offset 0 */
+
+    found = 0;
+    mindenom = weights[0].i_denom;
+    minscale = weights[0].i_scale;
+    minoff = 0;
+    offset_search = x264_clip3( floor( fenc_mean - ref_mean * minscale / (1 << mindenom) + 0.5f*b_lookahead ), -128, 126 );
+
+    if( !fenc->b_intra_calculated ) /* x264_weight_cost reads fenc->i_intra_cost; this p0 == p1 pass presumably fills it - TODO confirm */
+    {
+        x264_mb_analysis_t a;
+        x264_lowres_context_init( h, &a );
+        x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
+    }
+    uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
+    origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 ); /* baseline: cost without any weight */
+
+    if( !minscore )
+    {
+        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+        return;
+    }
+
+    // This gives a slight improvement due to rounding errors but only tests
+    // one offset on lookahead.
+    // TODO: currently searches only offset +1. try other offsets/multipliers/combinations thereof?
+    for( i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ )
+    {
+        SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off );
+        unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0] );
+        COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 );
+    }
+    x264_emms();
+
+    /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+    /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+    if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
+    {
+        SET_WEIGHT( weights[0], 0, 1, 0, 0 );
+        return;
+    }
+    else
+        SET_WEIGHT( weights[0], 1, minscale, mindenom, minoff );
+
+    if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn )
+        fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; /* read back by x264_macroblock_tree_finish */
+
+    if( weights[0].weightfn && b_lookahead )
+    {
+        // scale the padded lowres plane of ref in lookahead for slicetype_frame_cost
+        uint8_t *src = ref->buffer_lowres[0];
+        uint8_t *dst = h->mb.p_weight_buf[0];
+        int width = ref->i_width_lowres + PADH*2;
+        int height = ref->i_lines_lowres + PADV*2;
+        x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
+                                 width, height, &weights[0] );
+        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV; /* skip the padding to reach the first picture pixel */
+    }
+}
+
 static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
-                            x264_frame_t **frames, int p0, int p1, int b,
-                            int dist_scale_factor, int do_search[2] )
+                                   x264_frame_t **frames, int p0, int p1, int b,
+                                   int dist_scale_factor, int do_search[2], const x264_weight_t *w )
 {
     x264_frame_t *fref0 = frames[p0];
     x264_frame_t *fref1 = frames[p1];
@@ -53,21 +254,22 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     const int i_mb_stride = h->sps->i_mb_width;
     const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride;
     const int i_stride = fenc->i_stride_lowres;
-    const int i_pel_offset = 8 * ( i_mb_x + i_mb_y * i_stride );
+    const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride);
     const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
     int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
     int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
 
-    DECLARE_ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
+    ALIGNED_ARRAY_8( uint8_t, pix1,[9*FDEC_STRIDE] );
     uint8_t *pix2 = pix1+8;
     x264_me_t m[2];
     int i_bcost = COST_MAX;
     int l, i;
+    int list_used = 0;
 
     h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
     h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 );
 
-    if( !p0 && !p1 && !b )
+    if( p0 == p1 )
         goto lowres_intra_mb;
 
     // no need for h->mb.mv_min[]
@@ -90,6 +292,9 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         (dst)[2] = &(src)[2][i_pel_offset]; \
         (dst)[3] = &(src)[3][i_pel_offset]; \
     }
+#define LOAD_WPELS_LUMA(dst,src) \
+    (dst) = &(src)[i_pel_offset];
+
 #define CLIP_MV( mv ) \
     { \
         mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \
@@ -97,33 +302,54 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     }
 #define TRY_BIDIR( mv0, mv1, penalty ) \
     { \
-        int stride1 = 16, stride2 = 16; \
-        uint8_t *src1, *src2; \
         int i_cost; \
-        src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
-                              (mv0)[0], (mv0)[1], 8, 8 ); \
-        src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
-                              (mv1)[0], (mv1)[1], 8, 8 ); \
-        h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
+        if( h->param.analyse.i_subpel_refine <= 1 ) \
+        { \
+            int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
+            int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
+            uint8_t *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
+            uint8_t *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
+            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
+        } \
+        else \
+        { \
+            int stride1 = 16, stride2 = 16; \
+            uint8_t *src1, *src2; \
+            src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+                                  (mv0)[0], (mv0)[1], 8, 8, w ); \
+            src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
+                                  (mv1)[0], (mv1)[1], 8, 8, w ); \
+            h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
+        } \
         i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
                            m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
-        if( i_bcost > i_cost ) \
-            i_bcost = i_cost; \
+        COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
     }
 
     m[0].i_pixel = PIXEL_8x8;
     m[0].p_cost_mv = a->p_cost_mv;
     m[0].i_stride[0] = i_stride;
     m[0].p_fenc[0] = h->mb.pic.p_fenc[0];
+    m[0].weight = w;
+    m[0].i_ref = 0;
     LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres );
+    m[0].p_fref_w = m[0].p_fref[0];
+    if( w[0].weightfn )
+        LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] );
 
     if( b_bidir )
     {
         int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy];
-        int dmv[2][2];
-
-        h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
+        ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] );
+
+        m[1].i_pixel = PIXEL_8x8;
+        m[1].p_cost_mv = a->p_cost_mv;
+        m[1].i_stride[0] = i_stride;
+        m[1].p_fenc[0] = h->mb.pic.p_fenc[0];
+        m[1].i_ref = 0;
+        m[1].weight = weight_none;
         LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
+        m[1].p_fref_w = m[1].p_fref[0];
 
         dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
         dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8;
@@ -131,15 +357,16 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         dmv[1][1] = dmv[0][1] - mvr[1];
         CLIP_MV( dmv[0] );
         CLIP_MV( dmv[1] );
+        if( h->param.analyse.i_subpel_refine <= 1 )
+            M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */
 
         TRY_BIDIR( dmv[0], dmv[1], 0 );
-        if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
+        if( M64( dmv ) )
         {
             int i_cost;
             h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight );
             i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 );
-            if( i_bcost > i_cost )
-                i_bcost = i_cost;
+            COPY2_IF_LT( i_bcost, i_cost, list_used, 3 );
         }
     }
 
@@ -149,13 +376,13 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         {
             int i_mvc = 0;
             int16_t (*fenc_mv)[2] = fenc_mvs[l];
-            DECLARE_ALIGNED_4( int16_t mvc[4][2] );
+            ALIGNED_4( int16_t mvc[4][2] );
 
             /* Reverse-order MV prediction. */
-            *(uint32_t*)mvc[0] = 0;
-            *(uint32_t*)mvc[1] = 0;
-            *(uint32_t*)mvc[2] = 0;
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
+            M32( mvc[0] ) = 0;
+            M32( mvc[1] ) = 0;
+            M32( mvc[2] ) = 0;
+#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
             if( i_mb_x < h->sps->i_mb_width - 1 )
                 MVC(fenc_mv[1]);
             if( i_mb_y < h->sps->i_mb_height - 1 )
@@ -171,35 +398,39 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             x264_me_search( h, &m[l], mvc, i_mvc );
 
             m[l].cost -= 2; // remove mvcost from skip mbs
-            if( *(uint32_t*)m[l].mv )
+            if( M32( m[l].mv ) )
                 m[l].cost += 5;
-            *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+            CP32( fenc_mvs[l], m[l].mv );
             *fenc_costs[l] = m[l].cost;
         }
         else
         {
-            *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+            CP32( m[l].mv, fenc_mvs[l] );
             m[l].cost = *fenc_costs[l];
         }
-        i_bcost = X264_MIN( i_bcost, m[l].cost );
+        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
     }
 
-    if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
+    if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
+    /* Store to width-2 bitfield. */
+    frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] &= ~(3<<((i_mb_xy&3)*2));
+    frames[b]->lowres_inter_types[b-p0][p1-b][i_mb_xy>>2] |= list_used<<((i_mb_xy&3)*2);
+
 lowres_intra_mb:
     /* forbid intra-mbs in B-frames, because it's rare and not worth checking */
     /* FIXME: Should we still forbid them now that we cache intra scores? */
-    if( !b_bidir )
+    if( !b_bidir || h->param.rc.b_mb_tree )
     {
         int i_icost, b_intra;
         if( !fenc->b_intra_calculated )
         {
-            DECLARE_ALIGNED_16( uint8_t edge[33] );
+            ALIGNED_ARRAY_16( uint8_t, edge,[33] );
             uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
             uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
             const int intra_penalty = 5;
-            int satds[4];
+            int satds[3];
 
             memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
             for( i=0; i<8; i++ )
@@ -207,29 +438,30 @@ lowres_intra_mb:
             pix++;
 
             if( h->pixf.intra_mbcmp_x3_8x8c )
-            {
                 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
-                h->predict_8x8c[I_PRED_CHROMA_P]( pix );
-                satds[I_PRED_CHROMA_P] =
-                    h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
-            }
             else
             {
-                for( i=0; i<4; i++ )
+                for( i=0; i<3; i++ )
                 {
                     h->predict_8x8c[i]( pix );
                     satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
                 }
             }
-            i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] );
+            i_icost = X264_MIN3( satds[0], satds[1], satds[2] );
 
-            h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-            for( i=3; i<9; i++ )
+            if( h->param.analyse.i_subpel_refine > 1 )
             {
-                int satd;
-                h->predict_8x8[i]( pix, edge );
-                satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+                h->predict_8x8c[I_PRED_CHROMA_P]( pix );
+                int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
                 i_icost = X264_MIN( i_icost, satd );
+                h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+                for( i=3; i<9; i++ )
+                {
+                    int satd;
+                    h->predict_8x8[i]( pix, edge );
+                    satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+                    i_icost = X264_MIN( i_icost, satd );
+                }
             }
 
             i_icost += intra_penalty;
@@ -237,18 +469,25 @@ lowres_intra_mb:
         }
         else
             i_icost = fenc->i_intra_cost[i_mb_xy];
-        b_intra = i_icost < i_bcost;
-        if( b_intra )
-            i_bcost = i_icost;
-        if(   (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
-            && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
-            || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+        if( !b_bidir )
         {
-            fenc->i_intra_mbs[b-p0] += b_intra;
-            fenc->i_cost_est[0][0] += i_icost;
+            b_intra = i_icost < i_bcost;
+            if( b_intra )
+                i_bcost = i_icost;
+            if(   (i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
+                && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1)
+                || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+            {
+                fenc->i_intra_mbs[b-p0] += b_intra;
+                fenc->i_cost_est[0][0] += i_icost;
+                if( h->param.rc.i_aq_mode )
+                    fenc->i_cost_est_aq[0][0] += (i_icost * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
+            }
         }
     }
 
+    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost;
+
     return i_bcost;
 }
 #undef TRY_BIDIR
@@ -259,14 +498,15 @@ lowres_intra_mb:
     h->sps->i_mb_width * h->sps->i_mb_height)
 
 static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
-                               x264_frame_t **frames, int p0, int p1, int b,
-                               int b_intra_penalty )
+                                      x264_frame_t **frames, int p0, int p1, int b,
+                                      int b_intra_penalty )
 {
+
     int i_score = 0;
     /* Don't use the AQ'd scores for slicetype decision. */
     int i_score_aq = 0;
     int do_search[2];
-
+    const x264_weight_t *w = weight_none;
     /* Check whether we already evaluated this frame
      * If we have tried this frame as P, then we have also tried
      * the preceding frames as B. (is this still true?) */
@@ -283,13 +523,24 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
         /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
         do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
         do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
-        if( do_search[0] ) frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+        if( do_search[0] )
+        {
+            if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
+                  || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
+            {
+                x264_emms();
+                x264_weights_analyse( h, frames[b], frames[p0], 1 );
+                w = frames[b]->weight[0];
+            }
+            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+        }
         if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
 
         if( b == p1 )
         {
             frames[b]->i_intra_mbs[b-p0] = 0;
             frames[b]->i_cost_est[0][0] = 0;
+            frames[b]->i_cost_est_aq[0][0] = 0;
         }
         if( p1 != p0 )
             dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
@@ -299,14 +550,15 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
 
         /* the edge mbs seem to reduce the predictive quality of the
          * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.i_vbv_buffer_size || h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
+            h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
         {
             for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
             {
                 row_satd[ h->mb.i_mb_y ] = 0;
                 for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
                 {
-                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
                     int i_mb_cost_aq = i_mb_cost;
                     if( h->param.rc.i_aq_mode )
                         i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
@@ -327,7 +579,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
             for( h->mb.i_mb_y = h->sps->i_mb_height - 2; h->mb.i_mb_y > 0; h->mb.i_mb_y-- )
                 for( h->mb.i_mb_x = h->sps->i_mb_width - 2; h->mb.i_mb_x > 0; h->mb.i_mb_x-- )
                 {
-                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search );
+                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
                     int i_mb_cost_aq = i_mb_cost;
                     if( h->param.rc.i_aq_mode )
                         i_mb_cost_aq = (i_mb_cost_aq * frames[b]->i_inv_qscale_factor[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride] + 128) >> 8;
@@ -337,7 +589,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
         }
 
         if( b != p1 )
-            i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
+            i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
         else
             frames[b]->b_intra_calculated = 1;
 
@@ -355,7 +607,250 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
     return i_score;
 }
 
-#define MAX_LENGTH (X264_BFRAME_MAX*4)
+/* If MB-tree changes the quantizers, we need to recalculate the frame cost without
+ * re-running lookahead. Reuses the per-MB costs cached in lowres_costs and refills i_row_satds. */
+static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b )
+{
+    int i_score = 0;
+    int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
+    float *qp_offset = IS_X264_TYPE_B(frames[b]->i_type) ? frames[b]->f_qp_offset_aq : frames[b]->f_qp_offset; /* B-frames take f_qp_offset_aq, other types f_qp_offset */
+    x264_emms();
+    for( h->mb.i_mb_y = h->sps->i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+    {
+        row_satd[ h->mb.i_mb_y ] = 0;
+        for( h->mb.i_mb_x = h->sps->i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
+        {
+            int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride;
+            int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy];
+            float qp_adj = qp_offset[i_mb_xy];
+            i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8; /* presumably 2^qp_adj in 8-bit fixed point; +128 rounds the >>8 */
+            row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+            if( (h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
+                 h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1) ||
+                 h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 ) /* only interior MBs are scored, unless the frame is at most 2 MBs wide or high */
+            {
+                i_score += i_mb_cost;
+            }
+        }
+    }
+    return i_score;
+}
+
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
+{ /* Derives frame->f_qp_offset per MB from f_qp_offset_aq, the intra cost and the propagated cost. */
+    int mb_index;
+    x264_emms();
+    float weightdelta = 0.0;
+    if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 ) /* ref0_distance == 0 leaves weightdelta at 0 */
+        weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
+
+    /* Allow the strength to be adjusted via qcompress, since the two
+     * concepts are very similar. */
+    float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
+    for( mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
+    {
+        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8; /* +128 rounds the >>8 */
+        if( intra_cost ) /* MBs whose scaled intra cost is 0 keep their existing f_qp_offset */
+        {
+            int propagate_cost = frame->i_propagate_cost[mb_index];
+            float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+            frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
+        }
+    }
+}
+
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
+{ /* Adds the propagate amounts of frames[b]'s MBs to i_propagate_cost of frames[p0]/frames[p1], following the lowres MVs. */
+    uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
+    int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); /* NOTE(review): divides by p1-p0; callers presumably never pass p0 == p1 - confirm */
+    int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
+    int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
+    int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
+    int *buf = h->scratch_buffer;
+    uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+
+    /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
+    if( !referenced )
+        memset( frames[b]->i_propagate_cost, 0, h->sps->i_mb_width * sizeof(uint16_t) );
+
+    for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+    {
+        int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
+        h->mc.mbtree_propagate_cost( buf, propagate_cost,
+            frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+            frames[b]->i_inv_qscale_factor+mb_index, h->sps->i_mb_width ); /* buf[] is read below as one propagate amount per MB of this row */
+        if( referenced )
+            propagate_cost += h->sps->i_mb_width; /* non-referenced frames keep re-reading the single zeroed row */
+        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++, mb_index++ )
+        {
+            int propagate_amount = buf[h->mb.i_mb_x];
+            /* Don't propagate for an intra block. */
+            if( propagate_amount > 0 )
+            {
+                /* Access width-2 bitfield. */
+                int lists_used = (frames[b]->lowres_inter_types[b-p0][p1-b][mb_index>>2] >> ((mb_index&3)*2))&3;
+                int list;
+                /* Follow the MVs to the previous frame(s). */
+                for( list = 0; list < 2; list++ )
+                    if( (lists_used >> list)&1 )
+                    {
+                        int x = mvs[list][mb_index][0];
+                        int y = mvs[list][mb_index][1];
+                        int listamount = propagate_amount;
+                        int mbx = (x>>5)+h->mb.i_mb_x; /* x>>5, y>>5: whole-MB part of the MV */
+                        int mby = (y>>5)+h->mb.i_mb_y;
+                        int idx0 = mbx + mby*h->mb.i_mb_stride;
+                        int idx1 = idx0 + 1;
+                        int idx2 = idx0 + h->mb.i_mb_stride;
+                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
+                        x &= 31; /* remaining fraction (0..31) splits the amount across the four covered MBs */
+                        y &= 31;
+                        int idx0weight = (32-y)*(32-x); /* the four weights sum to 32*32 = 1024, hence the >>10 below */
+                        int idx1weight = (32-y)*x;
+                        int idx2weight = y*(32-x);
+                        int idx3weight = y*x;
+
+                        /* Apply bipred weighting. */
+                        if( lists_used == 3 )
+                            listamount = (listamount * bipred_weights[list] + 32) >> 6;
+
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
+
+                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
+                         * be counted. */
+                        if( mbx < h->sps->i_mb_width-1 && mby < h->sps->i_mb_height-1 && mbx >= 0 && mby >= 0 ) /* all four target MBs are inside the frame */
+                        {
+                            CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
+                            CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
+                            CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
+                            CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
+                        }
+                        else /* Check offsets individually */
+                        {
+                            if( mbx < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx >= 0 && mby >= 0 )
+                                CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
+                            if( mbx+1 < h->sps->i_mb_width && mby < h->sps->i_mb_height && mbx+1 >= 0 && mby >= 0 )
+                                CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
+                            if( mbx < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx >= 0 && mby+1 >= 0 )
+                                CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
+                            if( mbx+1 < h->sps->i_mb_width && mby+1 < h->sps->i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
+                                CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
+                        }
+                    }
+            }
+        }
+    }
+
+    if( h->param.rc.i_vbv_buffer_size && referenced )
+        x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 ); /* the distance to p0 is passed only when b == p1, otherwise 0 */
+}
+
+static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
+{   /* Walks frames[] backwards from the last non-B frame, one non-B-to-non-B span at a time, computing frame costs and calling x264_macroblock_tree_propagate for each frame. */
+    int i, idx = !b_intra; /* lowest frame index that is processed: 0 when b_intra is set, otherwise 1 */
+    int last_nonb, cur_nonb = 1;
+    int bframes = 0;
+    if( b_intra )
+        x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 ); /* frame 0 evaluated with p0 == p1 == b */
+
+    i = num_frames-1;
+    while( i > 0 && frames[i]->i_type == X264_TYPE_B ) /* skip the trailing B-frames */
+        i--;
+    last_nonb = i;
+
+    if( last_nonb < idx )
+        return;
+
+    memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+    while( i-- > idx )
+    {
+        cur_nonb = i;
+        while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 ) /* find the previous non-B frame */
+            cur_nonb--;
+        if( cur_nonb < idx )
+            break;
+        x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb, 0 );
+        memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+        bframes = last_nonb - cur_nonb - 1; /* number of frames strictly between cur_nonb and last_nonb */
+        if( h->param.i_bframe_pyramid && bframes > 1 )
+        {
+            int middle = (bframes + 1)/2 + cur_nonb; /* in-between frame that the other in-between frames use as a reference below */
+            x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, middle, 0 );
+            memset( frames[middle]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) );
+            while( i > cur_nonb )
+            {
+                int p0 = i > middle ? middle : cur_nonb; /* frames after middle use (middle, last_nonb) */
+                int p1 = i < middle ? middle : last_nonb; /* frames before middle use (cur_nonb, middle) */
+                if( i != middle )
+                {
+                    x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
+                    x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
+                }
+                i--;
+            }
+            x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 ); /* middle is propagated last, with the final argument set to 1 */
+        }
+        else
+        {
+            while( i > cur_nonb )
+            {
+                x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
+                x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
+                i--;
+            }
+        }
+        x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
+        last_nonb = cur_nonb; /* step one span further back */
+    }
+
+    x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
+    if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
+        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 ); /* bframes holds the value from the last loop iteration that set it */
+}
+
+static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
+{   /* Cost of frame b with references p0/p1; with AQ enabled, the MB-tree recalculated cost or the i_cost_est_aq entry is returned instead. */
+    int cost = x264_slicetype_frame_cost( h, a, frames, p0, p1, b, 0 ); /* always called, even when its return value is superseded below */
+    if( h->param.rc.i_aq_mode )
+    {
+        if( h->param.rc.b_mb_tree )
+            return x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
+        else
+            return frames[b]->i_cost_est_aq[b-p0][p1-b]; /* indexed by (distance to p0, distance to p1) */
+    }
+    return cost;
+}
+
+static void x264_vbv_lookahead( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int keyframe )
+{   /* Fills frames[next_nonb]->i_planned_satd[] / i_planned_type[] with the cost and type of the frames in the window. */
+    int last_nonb = 0, cur_nonb = 1, next_nonb, i, idx = 0;
+    while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) /* first non-B frame after frame 0 */
+        cur_nonb++;
+    next_nonb = keyframe ? last_nonb : cur_nonb; /* the frame whose i_planned_* arrays receive the results */
+
+    while( cur_nonb <= num_frames )
+    {
+        /* P/I cost: This shouldn't include the cost of next_nonb */
+        if( next_nonb != cur_nonb )
+        {
+            int p0 = IS_X264_TYPE_I( frames[cur_nonb]->i_type ) ? cur_nonb : last_nonb; /* I-frames are costed with p0 == p1 == b */
+            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, p0, cur_nonb, cur_nonb );
+            frames[next_nonb]->i_planned_type[idx] = frames[cur_nonb]->i_type;
+            idx++;
+        }
+        /* Handle the B-frames: coded order */
+        for( i = last_nonb+1; i < cur_nonb; i++, idx++ )
+        {
+            frames[next_nonb]->i_planned_satd[idx] = x264_vbv_frame_cost( h, a, frames, last_nonb, cur_nonb, i );
+            frames[next_nonb]->i_planned_type[idx] = X264_TYPE_B;
+        }
+        last_nonb = cur_nonb;
+        cur_nonb++;
+        while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) /* advance to the next non-B frame */
+            cur_nonb++;
+    }
+    frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO; /* written one past the last planned entry; presumably the end-of-list marker -- confirm against the consumer */
+}
 
 static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, int threshold )
 {
@@ -380,8 +875,18 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
         if( cost > threshold )
             break;
 
-        for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
-            cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
+        if( h->param.i_bframe_pyramid && next_p - cur_p > 2 )
+        {
+            int middle = cur_p + (next_p - cur_p)/2;
+            cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 );
+            for( next_b = loc; next_b < middle && cost < threshold; next_b++ )
+                cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 );
+            for( next_b = middle+1; next_b < next_p && cost < threshold; next_b++ )
+                cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 );
+        }
+        else
+            for( next_b = loc; next_b < next_p && cost < threshold; next_b++ )
+                cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 );
 
         loc = next_p + 1;
         cur_p = next_p;
@@ -393,27 +898,24 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram
 /* Uses strings due to the fact that the speed of the control functions is
    negligable compared to the cost of running slicetype_frame_cost, and because
    it makes debugging easier. */
-static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[MAX_LENGTH] )
+static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, char (*best_paths)[X264_LOOKAHEAD_MAX] )
 {
-    char paths[X264_BFRAME_MAX+2][MAX_LENGTH] = {{0}};
-    int num_paths = X264_MIN(max_bframes+1, length);
-    int suffix_size, loc, path;
+    char paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX] = {{0}};
+    int num_paths = X264_MIN( max_bframes+1, length );
+    int path;
     int best_cost = COST_MAX;
     int best_path_index = 0;
-    length = X264_MIN(length,MAX_LENGTH);
-
-    /* Iterate over all currently possible paths and add suffixes to each one */
-    for( suffix_size = 0; suffix_size < num_paths; suffix_size++ )
-    {
-        memcpy( paths[suffix_size], best_paths[length - (suffix_size + 1)], length - (suffix_size + 1) );
-        for( loc = 0; loc < suffix_size; loc++ )
-            strcat( paths[suffix_size], "B" );
-        strcat( paths[suffix_size], "P" );
-    }
 
-    /* Calculate the actual cost of each of the current paths */
+    /* Iterate over all currently possible paths */
     for( path = 0; path < num_paths; path++ )
     {
+        /* Add suffixes to the current path */
+        int len = length - (path + 1);
+        memcpy( paths[path], best_paths[len % (X264_BFRAME_MAX+1)], len );
+        memset( paths[path]+len, 'B', path );
+        strcat( paths[path], "P" );
+
+        /* Calculate the actual cost of the current path */
         int cost = x264_slicetype_path_cost( h, a, frames, paths[path], best_cost );
         if( cost < best_cost )
         {
@@ -423,19 +925,10 @@ static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
     }
 
     /* Store the best path. */
-    memcpy( best_paths[length], paths[best_path_index], length );
-}
-
-static int x264_slicetype_path_search( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int bframes, int buffer )
-{
-    char best_paths[MAX_LENGTH][MAX_LENGTH] = {"","P"};
-    int n;
-    for( n = 2; n < length-1; n++ )
-        x264_slicetype_path( h, a, frames, n, bframes, buffer, best_paths );
-    return strspn( best_paths[length-2], "B" );
+    memcpy( best_paths[length % (X264_BFRAME_MAX+1)], paths[best_path_index], length );
 }
 
-static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1 )
+static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int print )
 {
     x264_frame_t *frame = frames[p1];
     x264_slicetype_frame_cost( h, a, frames, p0, p1, p1, 0 );
@@ -443,7 +936,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     int icost = frame->i_cost_est[0][0];
     int pcost = frame->i_cost_est[p1-p0][0];
     float f_bias;
-    int i_gop_size = frame->i_frame - h->frames.i_last_idr;
+    int i_gop_size = frame->i_frame - h->lookahead->i_last_keyframe;
     float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
     /* magic numbers pulled out of thin air */
     float f_thresh_min = f_thresh_max * h->param.i_keyint_min
@@ -452,7 +945,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
 
     if( h->param.i_keyint_min == h->param.i_keyint_max )
         f_thresh_min= f_thresh_max;
-    if( i_gop_size < h->param.i_keyint_min / 4 )
+    if( i_gop_size < h->param.i_keyint_min / 4 || h->param.b_intra_refresh )
         f_bias = f_thresh_min / 4;
     else if( i_gop_size <= h->param.i_keyint_min )
         f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min;
@@ -465,7 +958,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     }
 
     res = pcost >= (1.0 - f_bias) * icost;
-    if( res )
+    if( res && print )
     {
         int imb = frame->i_intra_mbs[p1-p0];
         int pmb = NUM_MBS - imb;
@@ -477,144 +970,267 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in
     return res;
 }
 
-static void x264_slicetype_analyse( x264_t *h )
+static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut, int num_frames )
+{   /* Flash-aware scenecut test between frames p0 and p1: returns 0 if p1 has had b_scenecut cleared, otherwise scenecut_internal()'s verdict. */
+    int curp0, curp1, i, maxp1 = p0 + 1;
+
+    /* Only do analysis during a normal scenecut check. */
+    if( real_scenecut && h->param.i_bframe )
+    {
+        /* Look ahead to avoid coding short flashes as scenecuts. */
+        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+            /* Don't analyse any more frames than the trellis would have covered. */
+            maxp1 += h->param.i_bframe;
+        else
+            maxp1++;
+        maxp1 = X264_MIN( maxp1, num_frames ); /* keeps every frames[] index used below at or under num_frames */
+
+        /* Where A and B are scenes: AAAAAABBBAAAAAA
+         * If BBB is shorter than (maxp1-p0), it is detected as a flash
+         * and not considered a scenecut. */
+        for( curp1 = p1; curp1 <= maxp1; curp1++ )
+            if( !scenecut_internal( h, a, frames, p0, curp1, 0 ) )
+                /* Any frame in between p0 and curp1 cannot be a real scenecut. */
+                for( i = curp1; i > p0; i-- )
+                    frames[i]->b_scenecut = 0;
+
+        /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
+         * If each of BB ... EE are shorter than (maxp1-p0), they are
+         * detected as flashes and not considered scenecuts.
+         * Instead, the first F frame becomes a scenecut. */
+        for( curp0 = p0; curp0 < maxp1; curp0++ )
+            if( scenecut_internal( h, a, frames, curp0, maxp1, 0 ) )
+                /* If curp0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */
+                    frames[curp0]->b_scenecut = 0;
+    }
+
+    /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */
+    if( !frames[p1]->b_scenecut )
+        return 0;
+    return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); /* real_scenecut is passed as scenecut_internal's `print` argument */
+}
+
+void x264_slicetype_analyse( x264_t *h, int keyframe ) /* Tentatively assigns types to the run of X264_TYPE_AUTO frames at the head of h->lookahead->next, runs MB-tree / VBV lookahead on them, then resets frames from reset_start onward to AUTO. */
 {
     x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_BFRAME_MAX*4+3] = { NULL, };
-    int num_frames;
-    int keyint_limit;
-    int j;
+    x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, };
+    int num_frames, orig_num_frames, keyint_limit, idr_frame_type, i, j;
     int i_mb_count = NUM_MBS;
     int cost1p0, cost2p0, cost1b1, cost2p1;
-    int idr_frame_type;
+    int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); /* upper bound on how many queued frames are examined */
+    if( h->param.b_deterministic )
+        i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe );
 
     assert( h->frames.b_have_lowres );
 
-    if( !h->frames.last_nonb )
+    if( !h->lookahead->last_nonb )
         return;
-    frames[0] = h->frames.last_nonb;
-    for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ )
-        frames[j+1] = h->frames.next[j];
-    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1;
-    num_frames = X264_MIN( j, keyint_limit );
-    if( num_frames == 0 )
+    frames[0] = h->lookahead->last_nonb;
+    for( j = 0; j < i_max_search && h->lookahead->next.list[j]->i_type == X264_TYPE_AUTO; j++ )
+        frames[j+1] = h->lookahead->next.list[j];
+
+    if( !j ) /* no X264_TYPE_AUTO frame at the head of the queue */
         return;
 
-    x264_lowres_context_init( h, &a );
-    idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+    keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_keyframe - 1;
+    orig_num_frames = num_frames = h->param.b_intra_refresh ? j : X264_MIN( j, keyint_limit ); /* capped at keyint_limit unless intra refresh is on */
 
-    if( num_frames == 1 )
+    x264_lowres_context_init( h, &a );
+    idr_frame_type = frames[1]->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I;
+
+    /* This is important psy-wise: if we have a non-scenecut keyframe,
+     * there will be significant visual artifacts if the frames just before
+     * go down in quality due to being referenced less, despite it being
+     * more RD-optimal. */
+    if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || h->param.rc.i_vbv_buffer_size )
+        num_frames = j;
+    else if( num_frames == 1 )
     {
-no_b_frames:
         frames[1]->i_type = X264_TYPE_P;
-        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
+        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames ) )
             frames[1]->i_type = idr_frame_type;
         return;
     }
+    else if( num_frames == 0 )
+    {
+        frames[1]->i_type = idr_frame_type;
+        return;
+    }
 
-    if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
+    int num_bframes = 0;
+    int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+    int num_analysed_frames = num_frames;
+    int reset_start; /* first index in frames[] whose type is reset to X264_TYPE_AUTO before returning */
+    if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames ) )
     {
-        int num_bframes;
-        int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
-        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
-        {
-            frames[1]->i_type = idr_frame_type;
-            return;
-        }
-        num_bframes = x264_slicetype_path_search( h, &a, frames, num_frames, max_bframes, num_frames-max_bframes );
-        assert(num_bframes < num_frames);
+        frames[1]->i_type = idr_frame_type;
+        return;
+    }
 
-        for( j = 1; j < num_bframes+1; j++ )
+    if( h->param.i_bframe )
+    {
+        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
         {
-            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+            char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX] = {"","P"};
+            int n;
+
+            /* Perform the frametype analysis. */
+            for( n = 2; n < num_frames; n++ )
+                x264_slicetype_path( h, &a, frames, n, max_bframes, best_paths );
+            if( num_frames > 1 )
             {
-                frames[j]->i_type = X264_TYPE_P;
-                return;
+                int best_path_index = (num_frames-1) % (X264_BFRAME_MAX+1); /* best_paths is indexed by path length modulo X264_BFRAME_MAX+1 */
+                num_bframes = strspn( best_paths[best_path_index], "B" );
+                /* Load the results of the analysis into the frame types. */
+                for( j = 1; j < num_frames; j++ )
+                    frames[j]->i_type = best_paths[best_path_index][j-1] == 'B' ? X264_TYPE_B : X264_TYPE_P;
             }
-            frames[j]->i_type = X264_TYPE_B;
+            frames[num_frames]->i_type = X264_TYPE_P;
         }
-        frames[num_bframes+1]->i_type = X264_TYPE_P;
-    }
-    else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
-    {
-        cost2p1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 2, 1 );
-        if( frames[2]->i_intra_mbs[2] > i_mb_count / 2 )
-            goto no_b_frames;
+        else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST )
+        {
+            for( i = 0; i <= num_frames-2; )
+            {
+                cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 );
+                if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 )
+                {
+                    frames[i+1]->i_type = X264_TYPE_P;
+                    frames[i+2]->i_type = X264_TYPE_P;
+                    i += 2;
+                    continue;
+                }
 
-        cost1b1 = x264_slicetype_frame_cost( h, &a, frames, 0, 2, 1, 0 );
-        cost1p0 = x264_slicetype_frame_cost( h, &a, frames, 0, 1, 1, 0 );
-        cost2p0 = x264_slicetype_frame_cost( h, &a, frames, 1, 2, 2, 0 );
+                cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
+                cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
+                cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
 
-        if( cost1p0 + cost2p0 < cost1b1 + cost2p1 )
-            goto no_b_frames;
+                if( cost1p0 + cost2p0 < cost1b1 + cost2p1 ) /* coding i+1 and i+2 as two P-frames is cheaper than B followed by P */
+                {
+                    frames[i+1]->i_type = X264_TYPE_P;
+                    i += 1;
+                    continue;
+                }
 
-        // arbitrary and untuned
-        #define INTER_THRESH 300
-        #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
-        frames[1]->i_type = X264_TYPE_B;
+                // arbitrary and untuned
+                #define INTER_THRESH 300
+                #define P_SENS_BIAS (50 - h->param.i_bframe_bias)
+                frames[i+1]->i_type = X264_TYPE_B;
 
-        for( j = 2; j <= X264_MIN( h->param.i_bframe, num_frames-1 ); j++ )
+                for( j = i+2; j <= X264_MIN( i+h->param.i_bframe, num_frames-1 ); j++ )
+                {
+                    int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10);
+                    int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 );
+                    if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 )
+                        break;
+                    frames[j]->i_type = X264_TYPE_B;
+                }
+                frames[j]->i_type = X264_TYPE_P;
+                i = j;
+            }
+            frames[num_frames]->i_type = X264_TYPE_P;
+            num_bframes = 0;
+            while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B ) /* count the leading B-frames */
+                num_bframes++;
+        }
+        else
         {
-            int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-1), INTER_THRESH/10);
-            int pcost = x264_slicetype_frame_cost( h, &a, frames, 0, j+1, j+1, 1 );
+            num_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
+            for( j = 1; j < num_frames; j++ )
+                frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P;
+            frames[num_frames]->i_type = X264_TYPE_P;
+        }
 
-            if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j+1] > i_mb_count/3 )
+        /* Check scenecut on the first minigop. */
+        for( j = 1; j < num_bframes+1; j++ )
+            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames ) )
             {
                 frames[j]->i_type = X264_TYPE_P;
+                num_analysed_frames = j;
                 break;
             }
-            else
-                frames[j]->i_type = X264_TYPE_B;
-        }
+
+        reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 );
     }
     else
     {
-        int max_bframes = X264_MIN(num_frames-1, h->param.i_bframe);
-        if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1 ) )
-        {
-            frames[1]->i_type = idr_frame_type;
-            return;
-        }
+        for( j = 1; j <= num_frames; j++ )
+            frames[j]->i_type = X264_TYPE_P;
+        reset_start = !keyframe + 1;
+        num_bframes = 0;
+    }
+
+    /* Perform the actual macroblock tree analysis.
+     * Don't go farther than the maximum keyframe interval; this helps in short GOPs. */
+    if( h->param.rc.b_mb_tree )
+        x264_macroblock_tree( h, &a, frames, X264_MIN(num_frames, h->param.i_keyint_max), keyframe );
 
-        for( j = 1; j < max_bframes+1; j++ )
+    /* Enforce keyframe limit. */
+    if( !h->param.b_intra_refresh )
+        for( j = 0; j < num_frames; j++ )
         {
-            if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1 ) )
+            if( ((j-keyint_limit) % h->param.i_keyint_max) == 0 ) /* j lies a whole number of i_keyint_max intervals from keyint_limit */
             {
-                frames[j]->i_type = X264_TYPE_P;
-                return;
+                if( j && h->param.i_keyint_max > 1 )
+                    frames[j]->i_type = X264_TYPE_P;
+                frames[j+1]->i_type = X264_TYPE_IDR;
+                reset_start = X264_MIN( reset_start, j+2 );
             }
-            frames[j]->i_type = X264_TYPE_B;
         }
-        frames[max_bframes+1]->i_type = X264_TYPE_P;
-    }
+
+    if( h->param.rc.i_vbv_buffer_size )
+        x264_vbv_lookahead( h, &a, frames, num_frames, keyframe );
+
+    /* Restore frametypes for all frames that haven't actually been decided yet. */
+    for( j = reset_start; j <= num_frames; j++ )
+        frames[j]->i_type = X264_TYPE_AUTO;
 }
 
 void x264_slicetype_decide( x264_t *h )
 {
+    x264_frame_t *frames[X264_BFRAME_MAX+2];
     x264_frame_t *frm;
     int bframes;
+    int brefs;
     int i;
 
-    if( h->frames.next[0] == NULL )
+    if( !h->lookahead->next.i_size )
         return;
 
     if( h->param.rc.b_stat_read )
     {
         /* Use the frame types from the first pass */
-        for( i = 0; h->frames.next[i] != NULL; i++ )
-            h->frames.next[i]->i_type =
-                x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame );
+        for( i = 0; i < h->lookahead->next.i_size; i++ )
+            h->lookahead->next.list[i]->i_type =
+                x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame );
     }
     else if( (h->param.i_bframe && h->param.i_bframe_adaptive)
-             || h->param.i_scenecut_threshold )
-        x264_slicetype_analyse( h );
+             || h->param.i_scenecut_threshold
+             || h->param.rc.b_mb_tree
+             || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) )
+        x264_slicetype_analyse( h, 0 );
 
-    for( bframes = 0;; bframes++ )
+    for( bframes = 0, brefs = 0;; bframes++ )
     {
-        frm = h->frames.next[bframes];
+        frm = h->lookahead->next.list[bframes];
+        if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL &&
+            brefs == h->param.i_bframe_pyramid )
+        {
+            frm->i_type = X264_TYPE_B;
+            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s \n",
+                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid] );
+        }
+        /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
+           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
+        else if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL &&
+            brefs && h->param.i_frame_reference <= (brefs+3) )
+        {
+            frm->i_type = X264_TYPE_B;
+            x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s and %d reference frames\n",
+                      frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid], h->param.i_frame_reference );
+        }
 
         /* Limit GOP size */
-        if( frm->i_frame - h->frames.i_last_idr >= h->param.i_keyint_max )
+        if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max )
         {
             if( frm->i_type == X264_TYPE_AUTO )
                 frm->i_type = X264_TYPE_IDR;
@@ -624,19 +1240,17 @@ void x264_slicetype_decide( x264_t *h )
         if( frm->i_type == X264_TYPE_IDR )
         {
             /* Close GOP */
+            h->lookahead->i_last_keyframe = frm->i_frame;
+            frm->b_keyframe = 1;
             if( bframes > 0 )
             {
                 bframes--;
-                h->frames.next[bframes]->i_type = X264_TYPE_P;
-            }
-            else
-            {
-                h->i_frame_num = 0;
+                h->lookahead->next.list[bframes]->i_type = X264_TYPE_P;
             }
         }
 
-        if( bframes == h->param.i_bframe
-            || h->frames.next[bframes+1] == NULL )
+        if( bframes == h->param.i_bframe ||
+            !h->lookahead->next.list[bframes+1] )
         {
             if( IS_X264_TYPE_B( frm->i_type ) )
                 x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" );
@@ -645,50 +1259,149 @@ void x264_slicetype_decide( x264_t *h )
                 frm->i_type = X264_TYPE_P;
         }
 
-        if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B;
+        if( frm->i_type == X264_TYPE_BREF )
+            brefs++;
+
+        if( frm->i_type == X264_TYPE_AUTO )
+            frm->i_type = X264_TYPE_B;
+
         else if( !IS_X264_TYPE_B( frm->i_type ) ) break;
     }
+
+    if( bframes )
+        h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1;
+    h->lookahead->next.list[bframes]->i_bframes = bframes;
+
+    /* insert a bref into the sequence */
+    if( h->param.i_bframe_pyramid && bframes > 1 && !brefs )
+    {
+        h->lookahead->next.list[bframes/2]->i_type = X264_TYPE_BREF;
+        brefs++;
+    }
+
+    /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
+    if( h->param.rc.i_rc_method != X264_RC_CQP )
+    {
+        x264_mb_analysis_t a;
+        int p0, p1, b;
+        p1 = b = bframes + 1;
+
+        x264_lowres_context_init( h, &a );
+
+        frames[0] = h->lookahead->last_nonb;
+        memcpy( &frames[1], h->lookahead->next.list, (bframes+1) * sizeof(x264_frame_t*) );
+        if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) )
+            p0 = bframes + 1;
+        else // P
+            p0 = 0;
+
+        x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+
+        if( (p0 != p1 || bframes) && h->param.rc.i_vbv_buffer_size )
+        {
+            /* We need the intra costs for row SATDs. */
+            x264_slicetype_frame_cost( h, &a, frames, b, b, b, 0 );
+
+            /* We need B-frame costs for row SATDs. */
+            p0 = 0;
+            for( b = 1; b <= bframes; b++ )
+            {
+                if( frames[b]->i_type == X264_TYPE_B )
+                    for( p1 = b; frames[p1]->i_type == X264_TYPE_B; )
+                        p1++;
+                else
+                    p1 = bframes + 1;
+                x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+                if( frames[b]->i_type == X264_TYPE_BREF )
+                    p0 = b;
+            }
+        }
+    }
+
+    /* Analyse for weighted P frames */
+    if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
+        && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+    {
+        x264_emms();
+        x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
+    }
+
+    /* shift sequence to coded order.
+       use a small temporary list to avoid shifting the entire next buffer around */
+    int i_coded = h->lookahead->next.list[0]->i_frame;
+    if( bframes )
+    {
+        int index[] = { brefs+1, 1 };
+        for( i = 0; i < bframes; i++ )
+        {
+            int idx = index[h->lookahead->next.list[i]->i_type == X264_TYPE_BREF]++;
+            frames[idx] = h->lookahead->next.list[i];
+            frames[idx]->i_reordered_pts = h->lookahead->next.list[idx]->i_pts;
+        }
+        frames[0] = h->lookahead->next.list[bframes];
+        frames[0]->i_reordered_pts = h->lookahead->next.list[0]->i_pts;
+        memcpy( h->lookahead->next.list, frames, (bframes+1) * sizeof(x264_frame_t*) );
+    }
+    for( i = 0; i <= bframes; i++ )
+         h->lookahead->next.list[i]->i_coded = i_coded++;
 }
 
 int x264_rc_analyse_slice( x264_t *h )
 {
-    x264_mb_analysis_t a;
-    x264_frame_t *frames[X264_BFRAME_MAX*4+2] = { NULL, };
     int p0=0, p1, b;
     int cost;
-
-    x264_lowres_context_init( h, &a );
+    x264_emms(); /* This function returns the estimated cost of h->fenc and sets up the i_row_satd pointers on h->fenc / h->fdec. */
 
     if( IS_X264_TYPE_I(h->fenc->i_type) )
-    {
         p1 = b = 0;
-    }
-    else if( X264_TYPE_P == h->fenc->i_type )
-    {
-        p1 = 0;
-        while( h->frames.current[p1] && IS_X264_TYPE_B( h->frames.current[p1]->i_type ) )
-            p1++;
-        p1++;
-        b = p1;
-    }
+    else if( h->fenc->i_type == X264_TYPE_P )
+        p1 = b = h->fenc->i_bframes + 1;
     else //B
     {
         p1 = (h->fref1[0]->i_poc - h->fref0[0]->i_poc)/2;
-        b  = (h->fref1[0]->i_poc - h->fenc->i_poc)/2;
-        frames[p1] = h->fref1[0];
+        b  = (h->fenc->i_poc - h->fref0[0]->i_poc)/2; /* offset of the current frame from its first list-0 reference, in POC/2 steps */
     }
-    frames[p0] = h->fref0[0];
-    frames[b] = h->fenc;
+    /* We don't need to assign p0/p1 since we are not performing any real analysis here. */
+    x264_frame_t **frames = &h->fenc - b; /* offset base pointer chosen so that frames[b] is h->fenc */
 
-    cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+    /* cost should have been already calculated by x264_slicetype_decide */
+    cost = frames[b]->i_cost_est[b-p0][p1-b];
+    assert( cost >= 0 );
 
+    if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read )
+    {
+        cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b );
+        if( b && h->param.rc.i_vbv_buffer_size )
+            x264_slicetype_frame_cost_recalculate( h, frames, b, b, b ); /* return value unused; called with p0 == p1 == b */
+    }
     /* In AQ, use the weighted score instead. */
-    if( h->param.rc.i_aq_mode )
-        cost = frames[b]->i_cost_est[b-p0][p1-b];
+    else if( h->param.rc.i_aq_mode )
+        cost = frames[b]->i_cost_est_aq[b-p0][p1-b];
 
     h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
     h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
     h->fdec->i_satd = cost;
     memcpy( h->fdec->i_row_satd, h->fenc->i_row_satd, h->sps->i_mb_height * sizeof(int) );
+    if( !IS_X264_TYPE_I(h->fenc->i_type) )
+        memcpy( h->fdec->i_row_satds[0][0], h->fenc->i_row_satds[0][0], h->sps->i_mb_height * sizeof(int) ); /* also copy the [0][0] row costs for non-I frames */
+
+    if( h->param.b_intra_refresh && h->param.rc.i_vbv_buffer_size && h->fenc->i_type == X264_TYPE_P )
+    {
+        int x, y;
+        int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */
+        for( y = 0; y < h->sps->i_mb_height; y++ )
+        {
+            int mb_xy = y * h->mb.i_mb_stride + h->fdec->i_pir_start_col; /* start at the first refresh column so mb_xy tracks x; previously started at column 0 of the row */
+            for( x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ )
+            {
+                int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor) >> 8;
+                int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy];
+                int diff = intra_cost - inter_cost; /* replace the inter estimate by the scaled intra estimate for this MB */
+                h->fdec->i_row_satd[y] += diff;
+                cost += diff;
+            }
+        }
+    }
+
     return cost;
 }
diff --git a/extras/avisynth_c.h b/extras/avisynth_c.h
new file mode 100644
index 0000000..27e8270
--- /dev/null
+++ b/extras/avisynth_c.h
@@ -0,0 +1,661 @@
+// Avisynth C Interface Version 0.20
+// Copyright 2003 Kevin Atkinson
+
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
+// http://www.gnu.org/copyleft/gpl.html .
+//
+// As a special exception, I give you permission to link to the
+// Avisynth C interface with independent modules that communicate with
+// the Avisynth C interface solely through the interfaces defined in
+// avisynth_c.h, regardless of the license terms of these independent
+// modules, and to copy and distribute the resulting combined work
+// under terms of your choice, provided that every copy of the
+// combined work is accompanied by a complete copy of the source code
+// of the Avisynth C interface and Avisynth itself (with the version
+// used to produce the combined work), being distributed under the
+// terms of the GNU General Public License plus this exception.  An
+// independent module is a module which is not derived from or based
+// on Avisynth C Interface, such as 3rd-party filters, import and
+// export plugins, or graphical user interfaces.
+
+#ifndef __AVISYNTH_C__
+#define __AVISYNTH_C__
+
+#ifdef __cplusplus
+#  define EXTERN_C extern "C"
+#else
+#  define EXTERN_C
+#endif
+
+#define AVSC_USE_STDCALL 1
+
+#ifndef AVSC_USE_STDCALL
+#  define AVSC_CC __cdecl
+#else
+#  define AVSC_CC __stdcall
+#endif
+
+#define AVSC_EXPORT EXTERN_C __declspec(dllexport)
+#define AVSC_INLINE static __inline
+#ifdef AVISYNTH_C_EXPORTS
+#  define AVSC_API(ret) EXTERN_C __declspec(dllexport) ret AVSC_CC
+#else
+#  define AVSC_API(ret) EXTERN_C __declspec(dllimport) ret AVSC_CC
+#endif
+
+typedef unsigned char BYTE;
+#ifdef __GNUC__
+typedef long long int INT64;
+#else
+typedef __int64 INT64;
+#endif
+
+
+/////////////////////////////////////////////////////////////////////
+//
+// Constants
+//
+
+#ifndef __AVISYNTH_H__
+enum { AVISYNTH_INTERFACE_VERSION = 2 };
+#endif
+
+enum {AVS_SAMPLE_INT8  = 1<<0,
+      AVS_SAMPLE_INT16 = 1<<1, 
+      AVS_SAMPLE_INT24 = 1<<2,
+      AVS_SAMPLE_INT32 = 1<<3,
+      AVS_SAMPLE_FLOAT = 1<<4};
+
+enum {AVS_PLANAR_Y=1<<0,
+      AVS_PLANAR_U=1<<1,
+      AVS_PLANAR_V=1<<2,
+      AVS_PLANAR_ALIGNED=1<<3,
+      AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED,
+      AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED,
+      AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED};
+
+  // Colorspace properties.
+enum {AVS_CS_BGR = 1<<28,  
+      AVS_CS_YUV = 1<<29,
+      AVS_CS_INTERLEAVED = 1<<30,
+      AVS_CS_PLANAR = 1<<31};
+
+  // Specific colorformats
+enum {
+  AVS_CS_UNKNOWN = 0,
+  AVS_CS_BGR24 = 1<<0 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
+  AVS_CS_BGR32 = 1<<1 | AVS_CS_BGR | AVS_CS_INTERLEAVED,
+  AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED,
+  AVS_CS_YV12 = 1<<3 | AVS_CS_YUV | AVS_CS_PLANAR,  // y-v-u, planar
+  AVS_CS_I420 = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR,  // y-u-v, planar
+  AVS_CS_IYUV = 1<<4 | AVS_CS_YUV | AVS_CS_PLANAR  // same as above
+};
+
+enum {
+  AVS_IT_BFF = 1<<0,
+  AVS_IT_TFF = 1<<1,
+  AVS_IT_FIELDBASED = 1<<2};
+
+enum {
+  AVS_FILTER_TYPE=1,
+  AVS_FILTER_INPUT_COLORSPACE=2,
+  AVS_FILTER_OUTPUT_TYPE=9,
+  AVS_FILTER_NAME=4,
+  AVS_FILTER_AUTHOR=5,
+  AVS_FILTER_VERSION=6,
+  AVS_FILTER_ARGS=7,
+  AVS_FILTER_ARGS_INFO=8,
+  AVS_FILTER_ARGS_DESCRIPTION=10,
+  AVS_FILTER_DESCRIPTION=11};
+
+enum {  //SUBTYPES
+  AVS_FILTER_TYPE_AUDIO=1,
+  AVS_FILTER_TYPE_VIDEO=2,
+  AVS_FILTER_OUTPUT_TYPE_SAME=3,
+  AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4};
+
+enum {
+  AVS_CACHE_NOTHING=0,
+  AVS_CACHE_RANGE=1 };
+
+#define AVS_FRAME_ALIGN 16 
+
+typedef struct AVS_Clip AVS_Clip;
+typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment;
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_VideoInfo
+//
+
+// AVS_VideoInfo is laid out identically to VideoInfo
+typedef struct AVS_VideoInfo {
+  int width, height;    // width=0 means no video
+  unsigned fps_numerator, fps_denominator;
+  int num_frames;
+
+  int pixel_type;
+  
+  int audio_samples_per_second;   // 0 means no audio
+  int sample_type;
+  INT64 num_audio_samples;
+  int nchannels;
+
+  // Imagetype properties
+
+  int image_type;
+} AVS_VideoInfo;
+
+// useful functions of the above
+AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) 
+        { return (p->width!=0); }
+
+AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) 
+        { return (p->audio_samples_per_second!=0); }
+
+AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) 
+        { return !!(p->pixel_type&AVS_CS_BGR); }
+
+AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) 
+        { return (p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24; } // Clear out additional properties
+
+AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) 
+        { return (p->pixel_type & AVS_CS_BGR32) == AVS_CS_BGR32 ; }
+
+AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) 
+        { return !!(p->pixel_type&AVS_CS_YUV ); }
+
+AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) 
+        { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; }  
+
+AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) 
+        { return ((p->pixel_type & AVS_CS_YV12) == AVS_CS_YV12)||((p->pixel_type & AVS_CS_I420) == AVS_CS_I420); }
+
+AVSC_INLINE int avs_is_color_space(const AVS_VideoInfo * p, int c_space) 
+        { return ((p->pixel_type & c_space) == c_space); }
+
+AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) 
+        { return ((p->pixel_type & property)==property ); }
+
+AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) 
+        { return !!(p->pixel_type & AVS_CS_PLANAR); }
+        
+AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) 
+        { return !!(p->image_type & AVS_IT_FIELDBASED); }
+
+AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) 
+        { return ((p->image_type & AVS_IT_FIELDBASED)&&(p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); }
+
+AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) 
+        { return !!(p->image_type & AVS_IT_BFF); }
+
+AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p) 
+        { return !!(p->image_type & AVS_IT_TFF); }
+
+AVSC_INLINE int avs_bits_per_pixel(const AVS_VideoInfo * p) 
+{ 
+  switch (p->pixel_type) {
+      case AVS_CS_BGR24: return 24;
+      case AVS_CS_BGR32: return 32;
+      case AVS_CS_YUY2:  return 16;
+      case AVS_CS_YV12:
+      case AVS_CS_I420:  return 12;
+      default:           return 0;
+    }
+}
+AVSC_INLINE int avs_bytes_from_pixels(const AVS_VideoInfo * p, int pixels) 
+        { return pixels * (avs_bits_per_pixel(p)>>3); }   // Will work on planar images, but will return only luma planes
+
+AVSC_INLINE int avs_row_size(const AVS_VideoInfo * p) 
+        { return avs_bytes_from_pixels(p,p->width); }  // Also only returns first plane on planar images
+
+AVSC_INLINE int avs_bmp_size(const AVS_VideoInfo * vi)                
+        { if (avs_is_planar(vi)) {int p = vi->height * ((avs_row_size(vi)+3) & ~3); p+=p>>1; return p;  } return vi->height * ((avs_row_size(vi)+3) & ~3); }
+
+AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) 
+        { return p->audio_samples_per_second; }
+
+
+AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) 
+{
+    switch (p->sample_type) {
+      case AVS_SAMPLE_INT8:  return sizeof(signed char);
+      case AVS_SAMPLE_INT16: return sizeof(signed short);
+      case AVS_SAMPLE_INT24: return 3;
+      case AVS_SAMPLE_INT32: return sizeof(signed int);
+      case AVS_SAMPLE_FLOAT: return sizeof(float);
+      default: return 0;
+    }
+}
+AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p)   
+        { return p->nchannels*avs_bytes_per_channel_sample(p);}
+
+AVSC_INLINE INT64 avs_audio_samples_from_frames(const AVS_VideoInfo * p, INT64 frames) 
+        { return ((INT64)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); }
+
+AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
+        { return (int)(samples * (INT64)p->fps_numerator / (INT64)p->fps_denominator / (INT64)p->audio_samples_per_second); }
+
+AVSC_INLINE INT64 avs_audio_samples_from_bytes(const AVS_VideoInfo * p, INT64 bytes) 
+        { return bytes / avs_bytes_per_audio_sample(p); }
+
+AVSC_INLINE INT64 avs_bytes_from_audio_samples(const AVS_VideoInfo * p, INT64 samples) 
+        { return samples * avs_bytes_per_audio_sample(p); }
+
+AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) 
+        { return p->nchannels; }
+
+AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p)
+        { return p->sample_type;}
+
+// useful mutator
+AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property)  
+        { p->image_type|=property; }
+
+AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property)  
+        { p->image_type&=~property; }
+
+AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased)  
+        { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; }
+
+// Store the frame rate in lowest terms.  0/0 is stored unchanged (its gcd is 0, which must not be used as a divisor).
+AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) 
+{
+    unsigned x=numerator, y=denominator;
+    while (y) { unsigned t = x%y; x = y; y = t; }   // Euclid's algorithm: x ends up as the gcd
+    if (!x) x = 1;   // gcd(0,0) == 0; dividing by it below would be undefined behaviour
+    p->fps_numerator = numerator/x;
+    p->fps_denominator = denominator/x;
+}
+
+AVSC_INLINE int avs_is_same_colorspace(AVS_VideoInfo * x, AVS_VideoInfo * y)
+{
+        return (x->pixel_type == y->pixel_type)
+                || (avs_is_yv12(x) && avs_is_yv12(y));
+}
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_VideoFrame
+//
+
+// VideoFrameBuffer holds information about a memory block which is used
+// for video data.  For efficiency, instances of this class are not deleted
+// when the refcount reaches zero; instead they're stored in a linked list
+// to be reused.  The instances are deleted when the corresponding AVS
+// file is closed.
+
+// AVS_VideoFrameBuffer is laid out identically to VideoFrameBuffer
+// DO NOT USE THIS STRUCTURE DIRECTLY
+typedef struct AVS_VideoFrameBuffer {
+  BYTE * data;
+  int data_size;
+  // sequence_number is incremented every time the buffer is changed, so
+  // that stale views can tell they're no longer valid.
+  long sequence_number;
+
+  long refcount;
+} AVS_VideoFrameBuffer;
+
+// VideoFrame holds a "window" into a VideoFrameBuffer.
+
+// AVS_VideoFrame is laid out identically to IVideoFrame
+// DO NOT USE THIS STRUCTURE DIRECTLY
+typedef struct AVS_VideoFrame {
+  int refcount;
+  AVS_VideoFrameBuffer * vfb;
+  int offset, pitch, row_size, height, offsetU, offsetV, pitchUV;  // U&V offsets are from top of picture.
+} AVS_VideoFrame;
+
+// Access functions for AVS_VideoFrame
+AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {
+        return p->pitch;}
+
+AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { 
+  switch (plane) {
+  case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV;}
+  return p->pitch;}
+
+AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {
+        return p->row_size; }
+
+AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { 
+        int r;
+    switch (plane) {
+    case AVS_PLANAR_U: case AVS_PLANAR_V: 
+                if (p->pitchUV) return p->row_size>>1; 
+                else            return 0;
+    case AVS_PLANAR_U_ALIGNED: case AVS_PLANAR_V_ALIGNED: 
+                if (p->pitchUV) { 
+                        int r = ((p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)) )>>1; // Aligned rowsize
+                        if (r < p->pitchUV) 
+                                return r; 
+                        return p->row_size>>1; 
+                } else return 0;
+    case AVS_PLANAR_Y_ALIGNED:
+                r = (p->row_size+AVS_FRAME_ALIGN-1)&(~(AVS_FRAME_ALIGN-1)); // Aligned rowsize
+                if (r <= p->pitch) 
+                        return r; 
+                return p->row_size;
+    }
+    return p->row_size;
+}
+
+AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {
+        return p->height;}
+
+AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) {
+        switch (plane) {
+                case AVS_PLANAR_U: case AVS_PLANAR_V: 
+                        if (p->pitchUV) return p->height>>1;
+                        return 0;
+        }
+        return p->height;}
+
+AVSC_INLINE const BYTE* avs_get_read_ptr(const AVS_VideoFrame * p) {
+        return p->vfb->data + p->offset;}
+
+AVSC_INLINE const BYTE* avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) 
+{
+        switch (plane) {
+                case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
+                case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
+                default:           return p->vfb->data + p->offset;}
+}
+
+AVSC_INLINE int avs_is_writable(const AVS_VideoFrame * p) {
+        return (p->refcount == 1 && p->vfb->refcount == 1);}
+
+AVSC_INLINE BYTE* avs_get_write_ptr(const AVS_VideoFrame * p) 
+{
+        if (avs_is_writable(p)) {
+                ++p->vfb->sequence_number;
+                return p->vfb->data + p->offset;
+        } else
+                return 0;
+}
+
+AVSC_INLINE BYTE* avs_get_write_ptr_p(const AVS_VideoFrame * p, int plane) 
+{
+        if (plane==AVS_PLANAR_Y && avs_is_writable(p)) {
+                ++p->vfb->sequence_number;
+                return p->vfb->data + p->offset;
+        } else if (plane==AVS_PLANAR_Y) {
+                return 0;
+        } else {
+                switch (plane) {
+                        case AVS_PLANAR_U: return p->vfb->data + p->offsetU;
+                        case AVS_PLANAR_V: return p->vfb->data + p->offsetV;
+                        default:       return p->vfb->data + p->offset;
+                }
+        }
+}
+
+
+AVSC_API(void) avs_release_video_frame(AVS_VideoFrame *);
+// makes a shallow copy of a video frame
+AVSC_API(AVS_VideoFrame *) avs_copy_video_frame(AVS_VideoFrame *);
+
+AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f)
+  {avs_release_video_frame(f);}
+AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f)
+  {return avs_copy_video_frame(f);}
+
+
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_Value
+//
+
+// Treat AVS_Value as a fat pointer.  That is, use avs_copy_value
+// and avs_release_value appropriately, as you would if AVS_Value were
+// a pointer.
+
+// To maintain source code compatibility with future versions of the
+// avisynth_c API don't use the AVS_Value directly.  Use the helper
+// functions below.
+
+// AVS_Value is laid out identically to AVSValue
+typedef struct AVS_Value AVS_Value;
+struct AVS_Value {
+  short type;  // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong
+               // for some function e'rror
+  short array_size;
+  union {
+    void * clip; // do not use directly, use avs_take_clip
+    char boolean;
+    int integer;
+    float floating_pt;
+    const char * string;
+    const AVS_Value * array;
+  } d;
+};
+
+// AVS_Value should be initialized with avs_void.
+// It should also be set to avs_void after the value is released
+// with avs_release_value.  Consider it the equivalent of setting
+// a pointer to NULL.
+static const AVS_Value avs_void = {'v'};
+
+AVSC_API(void) avs_copy_value(AVS_Value * dest, AVS_Value src);
+AVSC_API(void) avs_release_value(AVS_Value);
+
+AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; }
+AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; }
+AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; }
+AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; }
+AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; }
+AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; }
+AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; }
+AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; }
+
+AVSC_API(AVS_Clip *) avs_take_clip(AVS_Value, AVS_ScriptEnvironment *);
+AVSC_API(void) avs_set_to_clip(AVS_Value *, AVS_Clip *);
+
+AVSC_INLINE int avs_as_bool(AVS_Value v) 
+        { return v.d.boolean; }   
+AVSC_INLINE int avs_as_int(AVS_Value v) 
+        { return v.d.integer; }   
+AVSC_INLINE const char * avs_as_string(AVS_Value v) 
+        { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; }
+AVSC_INLINE double avs_as_float(AVS_Value v) 
+        { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; }
+AVSC_INLINE const char * avs_as_error(AVS_Value v) 
+        { return avs_is_error(v) ? v.d.string : 0; }
+AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v)
+        { return v.d.array; }
+AVSC_INLINE int avs_array_size(AVS_Value v) 
+        { return avs_is_array(v) ? v.array_size : 1; }
+AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) 
+        { return avs_is_array(v) ? v.d.array[index] : v; }
+
+// Only use these functions on an AVS_Value that does not already have
+// an active value.  Remember, treat AVS_Value as a fat pointer.
+AVSC_INLINE AVS_Value avs_new_value_bool(int v0) 
+        { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 0 : 1; return v; }   
+AVSC_INLINE AVS_Value avs_new_value_int(int v0) 
+        { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; }   
+AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) 
+        { AVS_Value v; v.type = 's'; v.d.string = v0; return v; }
+AVSC_INLINE AVS_Value avs_new_value_float(float v0) 
+        { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v;}
+AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) 
+        { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; }
+AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0)
+        { AVS_Value v; avs_set_to_clip(&v, v0); return v; }
+AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size)
+        { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = size; return v; }
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_Clip
+//
+
+AVSC_API(void) avs_release_clip(AVS_Clip *);
+AVSC_API(AVS_Clip *) avs_copy_clip(AVS_Clip *);
+
+AVSC_API(const char *) avs_clip_get_error(AVS_Clip *); // return 0 if no error
+
+AVSC_API(const AVS_VideoInfo *) avs_get_video_info(AVS_Clip *);
+
+AVSC_API(int) avs_get_version(AVS_Clip *);
+ 
+AVSC_API(AVS_VideoFrame *) avs_get_frame(AVS_Clip *, int n);
+// The returned video frame must be released with avs_release_video_frame
+
+AVSC_API(int) avs_get_parity(AVS_Clip *, int n); 
+// return field parity if field_based, else parity of first field in frame
+
+AVSC_API(int) avs_get_audio(AVS_Clip *, void * buf, 
+                                  INT64 start, INT64 count); 
+// start and count are in samples
+
+AVSC_API(int) avs_set_cache_hints(AVS_Clip *, 
+                                        int cachehints, int frame_range);
+
+// This is the callback type used by avs_add_function
+typedef AVS_Value (AVSC_CC * AVS_ApplyFunc)
+                        (AVS_ScriptEnvironment *, AVS_Value args, void * user_data);
+
+typedef struct AVS_FilterInfo AVS_FilterInfo;
+struct AVS_FilterInfo
+{
+  // these members should not be modified outside of the AVS_ApplyFunc callback
+  AVS_Clip * child;
+  AVS_VideoInfo vi;
+  AVS_ScriptEnvironment * env;
+  AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n);
+  int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n);
+  int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, 
+				  INT64 start, INT64 count);
+  int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, 
+					int frame_range);
+  void (AVSC_CC * free_filter)(AVS_FilterInfo *);
+  
+  // Should be set whenever there is an error to report.
+  // It is cleared before any of the above methods are called
+  const char * error;
+  // this is to store whatever and may be modified at will
+  void * user_data;
+};
+
+// Create a new filter
+// fi is set to point to the AVS_FilterInfo so that you can
+//   modify it once it is initialized.
+// store_child should generally be set to true.  If it is not
+//    set then ALL methods (the function pointers) must be defined.
+// If it is set then you do not need to worry about freeing the child
+//    clip.
+AVSC_API(AVS_Clip *) avs_new_c_filter(AVS_ScriptEnvironment * e,
+                                            AVS_FilterInfo * * fi,
+                                            AVS_Value child, int store_child);
+
+/////////////////////////////////////////////////////////////////////
+//
+// AVS_ScriptEnvironment
+//
+
+// For GetCPUFlags.  These are backwards-compatible with those in VirtualDub.
+enum {                    
+                                /* slowest CPU to support extension */
+  AVS_CPU_FORCE        = 0x01,   // N/A
+  AVS_CPU_FPU          = 0x02,   // 386/486DX
+  AVS_CPU_MMX          = 0x04,   // P55C, K6, PII
+  AVS_CPU_INTEGER_SSE  = 0x08,   // PIII, Athlon
+  AVS_CPU_SSE          = 0x10,   // PIII, Athlon XP/MP
+  AVS_CPU_SSE2         = 0x20,   // PIV, Hammer
+  AVS_CPU_3DNOW        = 0x40,   // K6-2
+  AVS_CPU_3DNOW_EXT    = 0x80,   // Athlon
+  AVS_CPU_X86_64       = 0xA0,   // Hammer (note: equiv. to 3DNow Ext + SSE2, i.e. 0x80|0x20,
+                                 // which only Hammer will have anyway)
+};
+
+
+AVSC_API(long) avs_get_cpu_flags(AVS_ScriptEnvironment *);
+AVSC_API(int) avs_check_version(AVS_ScriptEnvironment *, int version);
+
+AVSC_API(char *) avs_save_string(AVS_ScriptEnvironment *, const char* s, int length);
+AVSC_API(char *) avs_sprintf(AVS_ScriptEnvironment *, const char * fmt, ...);
+
+AVSC_API(char *) avs_vsprintf(AVS_ScriptEnvironment *, const char * fmt, void* val);
+ // note: val is really a va_list; I hope everyone typedefs va_list to a pointer
+
+AVSC_API(int) avs_add_function(AVS_ScriptEnvironment *, 
+				     const char * name, const char * params, 
+				     AVS_ApplyFunc apply, void * user_data);
+
+AVSC_API(int) avs_function_exists(AVS_ScriptEnvironment *, const char * name);
+
+AVSC_API(AVS_Value) avs_invoke(AVS_ScriptEnvironment *, const char * name, 
+                               AVS_Value args, const char** arg_names);
+// The returned value must be released with avs_release_value
+
+AVSC_API(AVS_Value) avs_get_var(AVS_ScriptEnvironment *, const char* name);
+// The returned value must be released with avs_release_value
+
+AVSC_API(int) avs_set_var(AVS_ScriptEnvironment *, const char* name, AVS_Value val);
+
+AVSC_API(int) avs_set_global_var(AVS_ScriptEnvironment *, const char* name, const AVS_Value val);
+
+//void avs_push_context(AVS_ScriptEnvironment *, int level=0);
+//void avs_pop_context(AVS_ScriptEnvironment *);
+
+AVSC_API(AVS_VideoFrame *) avs_new_video_frame_a(AVS_ScriptEnvironment *, 
+                                          const AVS_VideoInfo * vi, int align);
+// align should be at least 16
+
+AVSC_INLINE 
+AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, 
+                                     const AVS_VideoInfo * vi)
+  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
+
+AVSC_INLINE 
+AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, 
+                               const AVS_VideoInfo * vi)
+  {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);}
+
+
+AVSC_API(int) avs_make_writable(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf);
+
+AVSC_API(void) avs_bit_blt(AVS_ScriptEnvironment *, BYTE* dstp, int dst_pitch, const BYTE* srcp, int src_pitch, int row_size, int height);
+
+typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env);
+AVSC_API(void) avs_at_exit(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data);
+
+AVSC_API(AVS_VideoFrame *) avs_subframe(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height);
+// The returned video frame must be released
+
+AVSC_API(int) avs_set_memory_max(AVS_ScriptEnvironment *, int mem);
+
+AVSC_API(int) avs_set_working_dir(AVS_ScriptEnvironment *, const char * newdir);
+
+// avisynth.dll exports this; it's a way to use it as a library, without
+// writing an AVS script or without going through AVIFile.
+AVSC_API(AVS_ScriptEnvironment *) avs_create_script_environment(int version);
+
+// this symbol is the entry point for the plugin and must
+// be defined
+AVSC_EXPORT
+const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env);
+
+
+AVSC_API(void) avs_delete_script_environment(AVS_ScriptEnvironment *);
+
+
+AVSC_API(AVS_VideoFrame *) avs_subframe_planar(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV);
+// The returned video frame must be released
+
+#endif
diff --git a/extras/getopt.c b/extras/getopt.c
index d2dbd30..434efe7 100644
--- a/extras/getopt.c
+++ b/extras/getopt.c
@@ -202,11 +202,7 @@ static char *posixly_correct;
 # if HAVE_STRING_H
 #  include <string.h>
 # else
-#  ifdef _MSC_VER
-#   include <string.h>
-#  else
-#   include <strings.h>
-#  endif
+#  include <strings.h>
 # endif
 
 /* Avoid depending on library functions or files
@@ -984,10 +980,7 @@ getopt (argc, argv, optstring)
 			   0);
 }
 
-#ifdef _MSC_VER
-
-int
-getopt_long (argc, argv, optstring, long_options, opt_index)
+int getopt_long (argc, argv, optstring, long_options, opt_index)
      int argc;
      char *const *argv;
      const char *optstring;
@@ -997,8 +990,6 @@ getopt_long (argc, argv, optstring, long_options, opt_index)
   return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0);
 }
 
-#endif
-
 #endif	/* Not ELIDE_CODE.  */
 
 #ifdef TEST
diff --git a/input/avs.c b/input/avs.c
new file mode 100644
index 0000000..522f8fe
--- /dev/null
+++ b/input/avs.c
@@ -0,0 +1,316 @@
+/*****************************************************************************
+ * avs.c: x264 avisynth input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <windows.h>
+
+/* the AVS interface currently uses __declspec to link function declarations to their definitions in the dll.
+   this has a side effect of preventing program execution if the avisynth dll is not found,
+   so define __declspec(dllimport) to nothing and work around this */
+#undef __declspec
+#define __declspec(i)
+#undef EXTERN_C
+
+#ifdef HAVE_AVISYNTH_C_H
+#include <avisynth_c.h>
+#else
+#include "extras/avisynth_c.h"
+#endif
+
+/* AVS uses a versioned interface to control backwards compatibility */
+/* YV12 support is required */
+#define AVS_INTERFACE_YV12 2
+/* when AVS supports other planar colorspaces, a workaround is required */
+#define AVS_INTERFACE_OTHER_PLANAR 5
+
+/* maximum size of the sequence of filters to try on non script files */
+#define AVS_MAX_SEQUENCE 5
+
+#define LOAD_AVS_FUNC(name, continue_on_fail) \
+{\
+    h->func.name = (void*)GetProcAddress( h->library, #name );\
+    if( !continue_on_fail && !h->func.name )\
+        goto fail;\
+}
+
+/* per-file state of the avisynth input module */
+typedef struct
+{
+    /* clip that frames and video info are queried from */
+    AVS_Clip *clip;
+    /* script environment created through avs_create_script_environment */
+    AVS_ScriptEnvironment *env;
+    /* handle of the avisynth library as returned by LoadLibrary */
+    HMODULE library;
+    /* frame count taken from the clip's video info when the file is opened */
+    int num_frames;
+    /* declare function pointers for the utilized functions to be loaded without __declspec,
+       as the avisynth header does not compensate for this type of usage */
+    struct
+    {
+        const char *(__stdcall *avs_clip_get_error)( AVS_Clip *clip );
+        AVS_ScriptEnvironment *(__stdcall *avs_create_script_environment)( int version );
+        /* may be NULL: this symbol is loaded as optional */
+        void (__stdcall *avs_delete_script_environment)( AVS_ScriptEnvironment *env );
+        AVS_VideoFrame *(__stdcall *avs_get_frame)( AVS_Clip *clip, int n );
+        int (__stdcall *avs_get_version)( AVS_Clip *clip );
+        const AVS_VideoInfo *(__stdcall *avs_get_video_info)( AVS_Clip *clip );
+        int (__stdcall *avs_function_exists)( AVS_ScriptEnvironment *env, const char *name );
+        AVS_Value (__stdcall *avs_invoke)( AVS_ScriptEnvironment *env, const char *name,
+            AVS_Value args, const char **arg_names );
+        void (__stdcall *avs_release_clip)( AVS_Clip *clip );
+        void (__stdcall *avs_release_value)( AVS_Value value );
+        void (__stdcall *avs_release_video_frame)( AVS_VideoFrame *frame );
+        AVS_Clip *(__stdcall *avs_take_clip)( AVS_Value, AVS_ScriptEnvironment *env );
+    } func;
+} avs_hnd_t;
+
+/* Load the avisynth library and resolve every function this module calls
+ * into h->func.
+ * Returns 0 on success.  Returns -1 when the library cannot be loaded, or
+ * when a required function is missing (the library is then freed again). */
+static int avs_load_library( avs_hnd_t *h )
+{
+    h->library = LoadLibrary( "avisynth" );
+    if( !h->library )
+        return -1;
+    /* a second argument of 0 marks the symbol as required, 1 as optional */
+    LOAD_AVS_FUNC( avs_clip_get_error, 0 );
+    LOAD_AVS_FUNC( avs_create_script_environment, 0 );
+    /* optional: close_file() tests this pointer before calling through it */
+    LOAD_AVS_FUNC( avs_delete_script_environment, 1 );
+    LOAD_AVS_FUNC( avs_get_frame, 0 );
+    LOAD_AVS_FUNC( avs_get_version, 0 );
+    LOAD_AVS_FUNC( avs_get_video_info, 0 );
+    LOAD_AVS_FUNC( avs_function_exists, 0 );
+    LOAD_AVS_FUNC( avs_invoke, 0 );
+    LOAD_AVS_FUNC( avs_release_clip, 0 );
+    LOAD_AVS_FUNC( avs_release_value, 0 );
+    LOAD_AVS_FUNC( avs_release_video_frame, 0 );
+    LOAD_AVS_FUNC( avs_take_clip, 0 );
+    return 0;
+    /* reached through LOAD_AVS_FUNC when a required symbol is missing */
+fail:
+    FreeLibrary( h->library );
+    return -1;
+}
+
+/* Fill `filter' with the source filters to attempt for a file with the given
+ * extension: a filter dedicated to that extension first (if one is known),
+ * followed by the general-purpose filters.  At most AVS_MAX_SEQUENCE entries
+ * are written; entries past the last one written are left untouched. */
+static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] )
+{
+    /* { extension, dedicated source filter } */
+    static const char * const dedicated[][2] =
+    {
+        { "avi", "AVISource" },
+        { "d2v", "MPEG2Source" },
+        { "dga", "AVCSource" },
+    };
+    /* filters that are tried for every file, in this order */
+    static const char * const generic[] = { "FFmpegSource2", "DSS2", "DirectShowSource" };
+    int count = 0;
+    unsigned k;
+
+    for( k = 0; k < sizeof(dedicated) / sizeof(dedicated[0]); k++ )
+        if( !strcasecmp( filename_ext, dedicated[k][0] ) )
+            filter[count++] = dedicated[k][1];
+
+    for( k = 0; k < sizeof(generic) / sizeof(generic[0]) && count < AVS_MAX_SEQUENCE; k++ )
+        filter[count++] = generic[k];
+}
+
+/* Open `psz_filename' through avisynth.
+ * A file with the extension "avs" is imported as a script; any other file is
+ * opened by trying a sequence of source filters until one succeeds.  The
+ * resulting clip is converted to YV12 when needed.
+ * On success the clip properties are stored in `info', the module handle in
+ * `*p_handle', and 0 is returned.  Returns -1 on any failure.
+ * `info->interlaced' is read (not written) here: it is forwarded to
+ * ConvertToYV12.  `opt' is not used by this module. */
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    /* the file is opened only to verify that it exists and is a regular file */
+    FILE *fh = fopen( psz_filename, "r" );
+    if( !fh )
+        return -1;
+    else if( !x264_is_regular_file( fh ) )
+    {
+        fprintf( stderr, "avs [error]: AVS input is incompatible with non-regular file `%s'\n", psz_filename );
+        fclose( fh ); /* do not leak the probe handle on this error path */
+        return -1;
+    }
+    fclose( fh );
+
+    avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
+    if( !h )
+        return -1;
+    if( avs_load_library( h ) )
+    {
+        fprintf( stderr, "avs [error]: failed to load avisynth\n" );
+        free( h ); /* avs_load_library leaves no library loaded on failure */
+        return -1;
+    }
+    h->env = h->func.avs_create_script_environment( AVS_INTERFACE_YV12 );
+    if( !h->env )
+    {
+        fprintf( stderr, "avs [error]: failed to initiate avisynth\n" );
+        FreeLibrary( h->library );
+        free( h );
+        return -1;
+    }
+    /* NOTE(review): the error paths below still return without releasing
+       the environment, library and handle - confirm whether callers rely on
+       process exit or whether full cleanup is wanted here as well */
+    AVS_Value arg = avs_new_value_string( psz_filename );
+    AVS_Value res;
+    char *filename_ext = get_filename_extension( psz_filename );
+
+    if( !strcasecmp( filename_ext, "avs" ) )
+    {
+        res = h->func.avs_invoke( h->env, "Import", arg, NULL );
+        if( avs_is_error( res ) )
+        {
+            fprintf( stderr, "avs [error]: %s\n", avs_as_string( res ) );
+            return -1;
+        }
+        /* check if the user is using a multi-threaded script and apply distributor if necessary.
+           adapted from avisynth's vfw interface */
+        AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL );
+        int mt_mode = avs_is_int( mt_test ) ? avs_as_int( mt_test ) : 0;
+        h->func.avs_release_value( mt_test );
+        if( mt_mode > 0 && mt_mode < 5 )
+        {
+            AVS_Value temp = h->func.avs_invoke( h->env, "Distributor", res, NULL );
+            h->func.avs_release_value( res );
+            res = temp;
+        }
+    }
+    else /* non script file */
+    {
+        /* cycle through known source filters to find one that works */
+        /* zero-initialized so that the sequence is always NULL-terminated */
+        const char *filter[AVS_MAX_SEQUENCE+1] = { 0 };
+        avs_build_filter_sequence( filename_ext, filter );
+        int i;
+        for( i = 0; filter[i]; i++ )
+        {
+            fprintf( stderr, "avs [info]: trying %s... ", filter[i] );
+            if( !h->func.avs_function_exists( h->env, filter[i] ) )
+            {
+                fprintf( stderr, "not found\n" );
+                continue;
+            }
+            if( !strncasecmp( filter[i], "FFmpegSource", 12 ) )
+            {
+                fprintf( stderr, "indexing... " );
+                fflush( stderr );
+            }
+            res = h->func.avs_invoke( h->env, filter[i], arg, NULL );
+            if( !avs_is_error( res ) )
+            {
+                fprintf( stderr, "succeeded\n" );
+                break;
+            }
+            fprintf( stderr, "failed\n" );
+        }
+        /* the loop ran off the end of the list: no filter could open the file */
+        if( !filter[i] )
+        {
+            fprintf( stderr, "avs [error]: unable to find source filter to open `%s'\n", psz_filename );
+            return -1;
+        }
+    }
+    if( !avs_is_clip( res ) )
+    {
+        fprintf( stderr, "avs [error]: `%s' didn't return a video clip\n", psz_filename );
+        return -1;
+    }
+    h->clip = h->func.avs_take_clip( res, h->env );
+    int avs_version = h->func.avs_get_version( h->clip );
+    const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip );
+    if( !avs_has_video( vi ) )
+    {
+        fprintf( stderr, "avs [error]: `%s' has no video data\n", psz_filename );
+        return -1;
+    }
+    if( vi->width&1 || vi->height&1 )
+    {
+        fprintf( stderr, "avs [error]: input clip width or height not divisible by 2 (%dx%d)\n",
+                 vi->width, vi->height );
+        return -1;
+    }
+    /* always call ConvertToYV12 to convert non YV12 planar colorspaces to YV12 when user's AVS supports them,
+       as all planar colorspaces are flagged as YV12. If it is already YV12 in this case, the call does nothing */
+    if( !avs_is_yv12( vi ) || avs_version >= AVS_INTERFACE_OTHER_PLANAR )
+    {
+        h->func.avs_release_clip( h->clip );
+        fprintf( stderr, "avs %s\n", !avs_is_yv12( vi ) ? "[warning]: converting input clip to YV12"
+               : "[info]: avisynth 2.6+ detected, forcing conversion to YV12" );
+        const char *arg_name[2] = { NULL, "interlaced" };
+        AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
+        AVS_Value res2 = h->func.avs_invoke( h->env, "ConvertToYV12", avs_new_value_array( arg_arr, 2 ), arg_name );
+        if( avs_is_error( res2 ) )
+        {
+            fprintf( stderr, "avs [error]: couldn't convert input clip to YV12\n" );
+            return -1;
+        }
+        /* continue with the converted clip and its video info */
+        h->clip = h->func.avs_take_clip( res2, h->env );
+        h->func.avs_release_value( res2 );
+        vi = h->func.avs_get_video_info( h->clip );
+    }
+    h->func.avs_release_value( res );
+
+    info->width = vi->width;
+    info->height = vi->height;
+    info->fps_num = vi->fps_numerator;
+    info->fps_den = vi->fps_denominator;
+    h->num_frames = vi->num_frames;
+    info->csp = X264_CSP_YV12;
+    info->vfr = 0;
+
+    *p_handle = h;
+    return 0;
+}
+
+/* number of frames in the clip, as recorded when the file was opened */
+static int get_frame_total( hnd_t handle )
+{
+    return ((avs_hnd_t*)handle)->num_frames;
+}
+
+/* Prepare `pic' for use with this module.  No image memory is allocated:
+ * read_frame() points the planes at the buffers of the avisynth frame.
+ * i_width and i_height are unused.  Always returns 0. */
+static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+    pic->param = NULL;
+    pic->img.i_plane = 3;
+    pic->img.i_csp = i_csp;
+    return 0;
+}
+
+/* Fetch frame `i_frame' from the clip and point the planes of `p_pic' at it.
+ * The avisynth frame is remembered in p_pic->opaque so that release_frame()
+ * can release it later.
+ * Returns 0 on success, -1 when i_frame is past the end of the clip or when
+ * avisynth reports an error for the clip. */
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    /* avisynth plane id for each x264 plane index */
+    static int plane[3] = { AVS_PLANAR_Y, AVS_PLANAR_V, AVS_PLANAR_U };
+    avs_hnd_t *h = handle;
+    if( i_frame >= h->num_frames )
+        return -1;
+
+    AVS_VideoFrame *frm = h->func.avs_get_frame( h->clip, i_frame );
+    p_pic->opaque = frm;
+
+    const char *err = h->func.avs_clip_get_error( h->clip );
+    if( err )
+    {
+        fprintf( stderr, "avs [error]: %s occurred while reading frame %d\n", err, i_frame );
+        return -1;
+    }
+
+    int p = 0;
+    while( p < 3 )
+    {
+        /* the read pointer is const; the cast drops the qualifier to avoid a warning */
+        p_pic->img.plane[p] = (uint8_t*)avs_get_read_ptr_p( frm, plane[p] );
+        p_pic->img.i_stride[p] = avs_get_pitch_p( frm, plane[p] );
+        p++;
+    }
+    return 0;
+}
+
+/* hand the avisynth frame stored in pic->opaque back to avisynth; always returns 0 */
+static int release_frame( x264_picture_t *pic, hnd_t handle )
+{
+    ((avs_hnd_t*)handle)->func.avs_release_video_frame( pic->opaque );
+    return 0;
+}
+
+/* reset the whole picture structure to zero; nothing is freed here */
+static void picture_clean( x264_picture_t *pic )
+{
+    memset( pic, 0, sizeof(*pic) );
+}
+
+/* release the clip, the script environment (when the delete function was
+ * resolved), the avisynth library and the handle itself; always returns 0 */
+static int close_file( hnd_t handle )
+{
+    avs_hnd_t *h = handle;
+
+    h->func.avs_release_clip( h->clip );
+    /* this function pointer is loaded as optional and may be NULL */
+    if( h->func.avs_delete_script_environment != NULL )
+        h->func.avs_delete_script_environment( h->env );
+    FreeLibrary( h->library );
+
+    free( h );
+    return 0;
+}
+
+/* entry points of the avisynth input module */
+cli_input_t avs_input =
+{
+    .open_file       = open_file,
+    .get_frame_total = get_frame_total,
+    .picture_alloc   = picture_alloc,
+    .read_frame      = read_frame,
+    .release_frame   = release_frame,
+    .picture_clean   = picture_clean,
+    .close_file      = close_file,
+};
diff --git a/input/ffms.c b/input/ffms.c
new file mode 100644
index 0000000..b680967
--- /dev/null
+++ b/input/ffms.c
@@ -0,0 +1,247 @@
+/*****************************************************************************
+ * ffms.c: x264 ffmpegsource input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mike Gurlitz <mike.gurlitz at gmail.com>
+ *          Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <ffms.h>
+#undef DECLARE_ALIGNED
+#include <libavcodec/avcodec.h>
+#include <libswscale/swscale.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define SetConsoleTitle(t)
+#endif
+
+/* per-file state of the ffmpegsource input module */
+typedef struct
+{
+    /* video source frames are read from */
+    FFMS_VideoSource *video_source;
+    /* track of the video source; used to look up per-frame info */
+    FFMS_Track *track;
+    /* frame count reported by the video properties */
+    int total_frames;
+    /* swscale context converting to YUV420P; created on the first read_frame call */
+    struct SwsContext *scaler;
+    /* nonzero once pts_offset has been recorded */
+    int pts_offset_flag;
+    /* PTS of the first frame read; subtracted from every returned PTS */
+    int64_t pts_offset;
+    /* nonzero: returned PTS values are divided by 1000 */
+    int reduce_pts;
+    /* copy of info->vfr at open time; timestamps are only produced when set */
+    int vfr_input;
+
+    /* dimensions of frame 0; the scaler always outputs this size */
+    int init_width;
+    int init_height;
+
+    /* input properties the current scaler was set up for */
+    int cur_width;
+    int cur_height;
+    int cur_pix_fmt;
+} ffms_hnd_t;
+
+/* Indexing progress callback handed to FFMS_MakeIndex.
+ * Prints the percentage done to stderr and to the console title whenever
+ * `current' is a multiple of 10.  `private' is unused.  Always returns 0. */
+static int FFMS_CC update_progress( int64_t current, int64_t total, void *private )
+{
+    if( !(current % 10) )
+    {
+        char title[200];
+        sprintf( title, "ffms [info]: indexing input file [%.1f%%]", 100.0 * current / total );
+        /* skip the leading "ffms " when writing to stderr */
+        fprintf( stderr, "%s  \r", title+5 );
+        SetConsoleTitle( title );
+        fflush( stderr );
+    }
+    return 0;
+}
+
+/* Open `psz_filename' with ffmpegsource.
+ * An existing index file (opt->index) is reused when it is newer than the
+ * input; otherwise the input is indexed, and the index is written to
+ * opt->index when that option is set.  opt->seek selects the seek mode.
+ * On success the properties of the first video track and of frame 0 are
+ * stored in `info', the module handle in `*p_handle', and 0 is returned.
+ * On failure -1 is returned and everything acquired so far is released. */
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) );
+    if( !h )
+        return -1;
+    FFMS_Init( 0 );
+    FFMS_ErrorInfo e;
+    e.BufferSize = 0;
+    int seekmode = opt->seek ? FFMS_SEEK_NORMAL : FFMS_SEEK_LINEAR_NO_RW;
+
+    FFMS_Index *index = NULL;
+    if( opt->index )
+    {
+        /* only trust an index file that is newer than the input file */
+        struct stat index_s, input_s;
+        if( !stat( opt->index, &index_s ) && !stat( psz_filename, &input_s ) &&
+            input_s.st_mtime < index_s.st_mtime )
+            index = FFMS_ReadIndex( opt->index, &e );
+    }
+    if( !index )
+    {
+        index = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, NULL, &e );
+        /* blank out the progress line */
+        fprintf( stderr, "                                            \r" );
+        if( !index )
+        {
+            fprintf( stderr, "ffms [error]: could not create index\n" );
+            free( h );
+            return -1;
+        }
+        if( opt->index && FFMS_WriteIndex( opt->index, index, &e ) )
+            fprintf( stderr, "ffms [warning]: could not write index file\n" );
+    }
+
+    int trackno = FFMS_GetFirstTrackOfType( index, FFMS_TYPE_VIDEO, &e );
+    if( trackno < 0 )
+    {
+        fprintf( stderr, "ffms [error]: could not find video track\n" );
+        FFMS_DestroyIndex( index );
+        free( h );
+        return -1;
+    }
+
+    h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, index, 1, seekmode, &e );
+    if( !h->video_source )
+    {
+        fprintf( stderr, "ffms [error]: could not create video source\n" );
+        FFMS_DestroyIndex( index );
+        free( h );
+        return -1;
+    }
+
+    h->track = FFMS_GetTrackFromVideo( h->video_source );
+    const FFMS_TrackTimeBase *timebase = FFMS_GetTimeBase( h->track );
+
+    FFMS_DestroyIndex( index );
+    const FFMS_VideoProperties *videop = FFMS_GetVideoProperties( h->video_source );
+    h->total_frames    = videop->NumFrames;
+    info->sar_height   = videop->SARDen;
+    info->sar_width    = videop->SARNum;
+    info->fps_den      = videop->FPSDenominator;
+    info->fps_num      = videop->FPSNumerator;
+    info->timebase_num = (int)timebase->Num;
+    h->vfr_input       = info->vfr;
+
+    /* frame 0 supplies the dimensions, pixel format and interlacing flag */
+    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
+    if( !frame )
+    {
+        fprintf( stderr, "ffms [error]: could not read frame 0\n" );
+        /* the index has already been destroyed above */
+        FFMS_DestroyVideoSource( h->video_source );
+        free( h );
+        return -1;
+    }
+
+    h->init_width  = h->cur_width  = info->width  = frame->EncodedWidth;
+    h->init_height = h->cur_height = info->height = frame->EncodedHeight;
+    h->cur_pix_fmt = frame->EncodedPixelFormat;
+    info->interlaced = frame->InterlacedFrame;
+
+    if( h->cur_pix_fmt != PIX_FMT_YUV420P )
+        fprintf( stderr, "ffms [warning]: converting from %s to YV12\n",
+                 avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+
+    /* ffms timestamps are in milliseconds. Increasing timebase denominator could cause integer overflow.
+     * Conversely, reducing PTS may lose too much accuracy */
+    if( h->vfr_input )
+    {
+        int64_t timebase_den = (int64_t)timebase->Den * 1000;
+
+        if( timebase_den > INT_MAX )
+        {
+            /* denominator would overflow: keep it and scale the PTS down instead */
+            info->timebase_den = (int)timebase->Den;
+            h->reduce_pts = 1;
+        }
+        else
+        {
+            info->timebase_den = (int)timebase->Den * 1000;
+            h->reduce_pts = 0;
+        }
+    }
+
+    *p_handle = h;
+    return 0;
+}
+
+/* number of frames reported by the video properties when the file was opened */
+static int get_frame_total( hnd_t handle )
+{
+    ffms_hnd_t *h = handle;
+    return h->total_frames;
+}
+
+/* Make sure h->scaler matches the properties of `frame'.
+ * An existing scaler is kept when width, height and pixel format are
+ * unchanged; otherwise it is replaced (with a warning naming `i_frame').
+ * When no scaler exists yet, one is created from the stored cur_* values.
+ * The output is always init_width x init_height YUV420P.
+ * Returns 0 on success, -1 when the swscale context cannot be created. */
+static int check_swscale( ffms_hnd_t *h, const FFMS_Frame *frame, int i_frame )
+{
+    int unchanged = h->cur_width   == frame->EncodedWidth  &&
+                    h->cur_height  == frame->EncodedHeight &&
+                    h->cur_pix_fmt == frame->EncodedPixelFormat;
+
+    if( h->scaler )
+    {
+        if( unchanged )
+            return 0;
+        /* stream properties changed: drop the old context and track the new ones */
+        sws_freeContext( h->scaler );
+        fprintf( stderr, "ffms [warning]: stream properties changed to %dx%d, %s at frame %d  \n", frame->EncodedWidth,
+                 frame->EncodedHeight, avcodec_get_pix_fmt_name( frame->EncodedPixelFormat ), i_frame );
+        h->cur_width   = frame->EncodedWidth;
+        h->cur_height  = frame->EncodedHeight;
+        h->cur_pix_fmt = frame->EncodedPixelFormat;
+    }
+
+    h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
+                                PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
+    if( h->scaler )
+        return 0;
+
+    fprintf( stderr, "ffms [error]: could not open swscale context\n" );
+    return -1;
+}
+
+/* Read frame `i_frame', convert it into the planes of `p_pic' and, for vfr
+ * input, set p_pic->i_pts.
+ * Timestamps are made relative to the PTS of the first frame read; when
+ * h->reduce_pts is set they are additionally divided by 1000, rounded to
+ * the nearest integer.
+ * Returns 0 on success, -1 when the frame cannot be read, the scaler cannot
+ * be set up, or (vfr input only) the frame has no valid timestamp. */
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    ffms_hnd_t *h = handle;
+    FFMS_ErrorInfo e;
+    e.BufferSize = 0;
+    const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e );
+    if( !frame )
+    {
+        fprintf( stderr, "ffms [error]: could not read frame %d\n", i_frame );
+        return -1;
+    }
+
+    if( check_swscale( h, frame, i_frame ) )
+        return -1;
+    /* FFMS_VideoSource has a single FFMS_Frame buffer for all calls to GetFrame.
+     * With threaded input, copying the pointers would result in the data changing during encoding.
+     * FIXME: don't do redundant sws_scales for singlethreaded input, or fix FFMS to allow
+     * multiple FFMS_Frame buffers. */
+    sws_scale( h->scaler, (uint8_t**)frame->Data, (int*)frame->Linesize, 0,
+               frame->EncodedHeight, p_pic->img.plane, p_pic->img.i_stride );
+
+    const FFMS_FrameInfo *info = FFMS_GetFrameInfo( h->track, i_frame );
+
+    if( h->vfr_input )
+    {
+        if( info->PTS == AV_NOPTS_VALUE )
+        {
+            fprintf( stderr, "ffms [error]: invalid timestamp. "
+                     "Use --force-cfr and specify a framerate with --fps\n" );
+            return -1;
+        }
+
+        /* the first timestamp seen becomes the origin of all later ones */
+        if( !h->pts_offset_flag )
+        {
+            h->pts_offset = info->PTS;
+            h->pts_offset_flag = 1;
+        }
+
+        int64_t pts = info->PTS - h->pts_offset;
+        /* Divide by 1000 rounding to nearest (halves away from zero).  This is
+         * done in integer arithmetic: adding 0.5 after an integer division,
+         * as was done before, has no rounding effect. */
+        if( h->reduce_pts )
+            pts = pts >= 0 ? (pts + 500) / 1000 : -((500 - pts) / 1000);
+        p_pic->i_pts = pts;
+    }
+    return 0;
+}
+
+/* free the scaler, the video source and the handle; always returns 0 */
+static int close_file( hnd_t handle )
+{
+    ffms_hnd_t *h = (ffms_hnd_t*)handle;
+
+    sws_freeContext( h->scaler );
+    FFMS_DestroyVideoSource( h->video_source );
+
+    free( h );
+    return 0;
+}
+
+/* entry points of the ffmpegsource input module; pictures are allocated and
+ * cleaned with the x264 library functions and there is no release_frame hook */
+cli_input_t ffms_input =
+{
+    .open_file       = open_file,
+    .get_frame_total = get_frame_total,
+    .picture_alloc   = x264_picture_alloc,
+    .read_frame      = read_frame,
+    .release_frame   = NULL,
+    .picture_clean   = x264_picture_clean,
+    .close_file      = close_file,
+};
diff --git a/input/input.h b/input/input.h
new file mode 100644
index 0000000..9fb425c
--- /dev/null
+++ b/input/input.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ * input.h: x264 file input modules
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *          Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_INPUT_H
+#define X264_INPUT_H
+
+/* options that are used by only some demuxers */
+typedef struct
+{
+    /* index file name; when set, the ffms demuxer reads its index from this
+       file if it is newer than the input and otherwise writes a new one */
+    char *index;
+    char *resolution; /* resolution string parsed by raw yuv input */
+    /* nonzero: the ffms demuxer uses FFMS_SEEK_NORMAL instead of FFMS_SEEK_LINEAR_NO_RW */
+    int seek;
+} cli_input_opt_t;
+
+/* properties of the source given by the demuxer */
+typedef struct
+{
+    int csp; /* X264_CSP_YV12 or X264_CSP_I420 */
+    /* frame rate as a fraction fps_num / fps_den */
+    int fps_num;
+    int fps_den;
+    int height;
+    /* set by the ffms and lavf demuxers from the first frame; the avs demuxer
+       reads it as an input and passes it on to ConvertToYV12 */
+    int interlaced;
+    /* sample aspect ratio as a fraction sar_width / sar_height */
+    int sar_width;
+    int sar_height;
+    /* timebase of the picture timestamps as a fraction timebase_num / timebase_den */
+    int timebase_num;
+    int timebase_den;
+    /* read by the demuxers to decide whether to produce timestamps;
+       a demuxer may clear it (avs always does, lavf does when no timestamp is available) */
+    int vfr;
+    int width;
+} video_info_t;
+
+/* Table of entry points implemented by each input module.
+ * The int-returning functions of the modules in this file return 0 on success
+ * and -1 on failure (get_frame_total returns a frame count instead).
+ * The modules initialize this structure positionally, so the member order
+ * must not change. */
+typedef struct
+{
+    /* opens the file, fills in *info and stores the module's handle in *p_handle */
+    int (*open_file)( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt );
+    int (*get_frame_total)( hnd_t handle );
+    int (*picture_alloc)( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+    int (*read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
+    /* may be NULL: the ffms and lavf modules do not provide it */
+    int (*release_frame)( x264_picture_t *pic, hnd_t handle );
+    void (*picture_clean)( x264_picture_t *pic );
+    int (*close_file)( hnd_t handle );
+} cli_input_t;
+
+extern cli_input_t yuv_input;
+extern cli_input_t y4m_input;
+extern cli_input_t avs_input;
+extern cli_input_t thread_input;
+extern cli_input_t lavf_input;
+extern cli_input_t ffms_input;
+
+#endif
diff --git a/input/lavf.c b/input/lavf.c
new file mode 100644
index 0000000..180e509
--- /dev/null
+++ b/input/lavf.c
@@ -0,0 +1,272 @@
+/*****************************************************************************
+ * lavf.c: x264 libavformat input module
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Mike Gurlitz <mike.gurlitz at gmail.com>
+ *          Steven Walters <kemuri9 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#undef DECLARE_ALIGNED
+#include <libavformat/avformat.h>
+#include <libswscale/swscale.h>
+
+/* per-file state of the libavformat input module */
+typedef struct
+{
+    /* demuxer context of the opened input */
+    AVFormatContext *lavf;
+    /* index of the first video stream found in the input */
+    int stream_id;
+    /* number of frames decoded so far, i.e. the index of the next frame to decode */
+    int next_frame;
+    /* copy of info->vfr; cleared when the first frame carries no usable timestamp */
+    int vfr_input;
+    /* set to 0 in open_file; not read anywhere in this module */
+    int vertical_flip;
+    /* swscale context converting to YUV420P; created on the first decoded frame */
+    struct SwsContext *scaler;
+    /* nonzero once pts_offset has been recorded */
+    int pts_offset_flag;
+    /* timestamp of the first frame; subtracted from every returned timestamp */
+    int64_t pts_offset;
+    /* frame 0, prefetched by open_file; handed out and freed by the first read_frame call */
+    x264_picture_t *first_pic;
+
+    /* dimensions at open time; the scaler always outputs this size */
+    int init_width;
+    int init_height;
+
+    /* input properties the current scaler was set up for */
+    int cur_width;
+    int cur_height;
+    enum PixelFormat cur_pix_fmt;
+} lavf_hnd_t;
+
+/* per-picture decoding state, stored in x264_picture_t.opaque by picture_alloc */
+typedef struct
+{
+    AVFrame frame;   /* decoded frame, reset after every read */
+    AVPacket packet; /* packet most recently read from the input */
+} lavf_pic_t;
+
+/* Make sure h->scaler matches the current properties of codec context `c'.
+ * An existing scaler is kept when width, height and pixel format are
+ * unchanged; otherwise it is replaced (with a warning naming `i_frame').
+ * When no scaler exists yet, one is created from the stored cur_* values.
+ * The output is always init_width x init_height YUV420P.
+ * Returns 0 on success, -1 when the swscale context cannot be created. */
+static int check_swscale( lavf_hnd_t *h, AVCodecContext *c, int i_frame )
+{
+    int unchanged = h->cur_width   == c->width  &&
+                    h->cur_height  == c->height &&
+                    h->cur_pix_fmt == c->pix_fmt;
+
+    if( h->scaler )
+    {
+        if( unchanged )
+            return 0;
+        /* stream properties changed: drop the old context and track the new ones */
+        sws_freeContext( h->scaler );
+        fprintf( stderr, "lavf [warning]: stream properties changed to %dx%d, %s at frame %d  \n",
+                 c->width, c->height, avcodec_get_pix_fmt_name( c->pix_fmt ), i_frame );
+        h->cur_width   = c->width;
+        h->cur_height  = c->height;
+        h->cur_pix_fmt = c->pix_fmt;
+    }
+
+    h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
+                                PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
+    if( h->scaler )
+        return 0;
+
+    fprintf( stderr, "lavf [error]: could not open swscale context\n" );
+    return -1;
+}
+
+/* Decode frames until frame `i_frame' is available, convert it into the
+ * planes of `p_pic' and, for vfr input, set p_pic->i_pts.
+ * `info' is non-NULL only for the prefetch done by open_file; in that case
+ * the interlaced flag is stored in it and vfr may be switched off when the
+ * frame carries no timestamp.  With `info' NULL (normal reads) the frame
+ * prefetched by open_file is consumed first.
+ * p_pic->opaque must point to a lavf_pic_t (see picture_alloc).
+ * Returns 0 on success, -1 when no further frame can be decoded or the
+ * scaler cannot be set up. */
+static int read_frame_internal( x264_picture_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info )
+{
+    if( h->first_pic && !info )
+    {
+        /* see if the frame we are requesting is the frame we have already read and stored.
+         * if so, retrieve the pts and image data before freeing it. */
+        if( !i_frame )
+        {
+            XCHG( x264_image_t, p_pic->img, h->first_pic->img );
+            p_pic->i_pts = h->first_pic->i_pts;
+        }
+        lavf_input.picture_clean( h->first_pic );
+        free( h->first_pic );
+        h->first_pic = NULL;
+        if( !i_frame )
+            return 0;
+    }
+
+    AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
+    lavf_pic_t *pic_h = p_pic->opaque;
+    AVPacket *pkt = &pic_h->packet;
+    AVFrame *frame = &pic_h->frame;
+
+    /* frames are decoded strictly in sequence up to and including i_frame */
+    while( i_frame >= h->next_frame )
+    {
+        int finished = 0;
+        /* feed packets of the video stream to the decoder until it outputs a frame */
+        /* NOTE(review): packets of other streams, and video packets that do not
+           complete a frame, do not appear to be released here - confirm */
+        while( !finished && av_read_frame( h->lavf, pkt ) >= 0 )
+            if( pkt->stream_index == h->stream_id )
+            {
+                /* carry the packet pts through the decoder's frame reordering */
+                c->reordered_opaque = pkt->pts;
+                if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
+                    fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+            }
+        if( !finished )
+        {
+            /* av_read_frame gave no more packets: one more decode call,
+               presumably to retrieve a frame still buffered in the decoder */
+            /* NOTE(review): pkt still holds the last packet read at this point - confirm this is intended */
+            if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
+                fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+            if( !finished )
+                return -1;
+        }
+        h->next_frame++;
+    }
+
+    if( check_swscale( h, c, i_frame ) )
+        return -1;
+    /* FIXME: avoid sws_scale where possible (no colorspace conversion). */
+    sws_scale( h->scaler, frame->data, frame->linesize, 0, c->height, p_pic->img.plane, p_pic->img.i_stride );
+
+    if( info )
+        info->interlaced = frame->interlaced_frame;
+
+    if( h->vfr_input )
+    {
+        /* timestamp preference: reordered packet pts, then packet dts */
+        p_pic->i_pts = 0;
+        if( frame->reordered_opaque != AV_NOPTS_VALUE )
+            p_pic->i_pts = frame->reordered_opaque;
+        else if( pkt->dts != AV_NOPTS_VALUE )
+            p_pic->i_pts = pkt->dts; // for AVI files
+        else if( info )
+        {
+            /* no timestamp on the prefetched frame: treat the input as constant frame rate */
+            h->vfr_input = info->vfr = 0;
+            goto exit;
+        }
+        /* the first timestamp seen becomes the origin of all later ones */
+        if( !h->pts_offset_flag )
+        {
+            h->pts_offset = p_pic->i_pts;
+            h->pts_offset_flag = 1;
+        }
+        p_pic->i_pts -= h->pts_offset;
+    }
+
+exit:
+    /* release the packet data and reset the frame for the next call */
+    if( pkt->destruct )
+        pkt->destruct( pkt );
+    avcodec_get_frame_defaults( frame );
+    return 0;
+}
+
+/* Open `psz_filename' ("-" selects "pipe:") with libavformat, select its
+ * first video stream, open a decoder for it and prefetch the first frame.
+ * info->vfr and info->csp are read as inputs; the stream properties are
+ * written to `info' (vfr may be cleared by the prefetch) and the module
+ * handle to `*p_handle'.  `opt' is not used by this module.
+ * Returns 0 on success.  On failure -1 is returned and everything acquired
+ * so far is released. */
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    lavf_hnd_t *h = malloc( sizeof(lavf_hnd_t) );
+    if( !h )
+        return -1;
+    /* declared up front so that no goto below jumps over an initialization */
+    int i = 0;
+    AVCodecContext *c = NULL;
+    av_register_all();
+    h->scaler = NULL;
+    h->first_pic = NULL;
+    if( !strcmp( psz_filename, "-" ) )
+        psz_filename = "pipe:";
+
+    if( av_open_input_file( &h->lavf, psz_filename, NULL, 0, NULL ) )
+    {
+        fprintf( stderr, "lavf [error]: could not open input file\n" );
+        goto fail_handle;
+    }
+
+    if( av_find_stream_info( h->lavf ) < 0 )
+    {
+        fprintf( stderr, "lavf [error]: could not find input stream info\n" );
+        goto fail_input;
+    }
+
+    /* pick the first stream that carries video */
+    while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != CODEC_TYPE_VIDEO )
+        i++;
+    if( i == h->lavf->nb_streams )
+    {
+        fprintf( stderr, "lavf [error]: could not find video stream\n" );
+        goto fail_input;
+    }
+    h->stream_id       = i;
+    h->next_frame      = 0;
+    h->pts_offset_flag = 0;
+    h->pts_offset      = 0;
+    c                  = h->lavf->streams[i]->codec;
+    h->init_width      = h->cur_width  = info->width  = c->width;
+    h->init_height     = h->cur_height = info->height = c->height;
+    h->cur_pix_fmt     = c->pix_fmt;
+    info->fps_num      = h->lavf->streams[i]->r_frame_rate.num;
+    info->fps_den      = h->lavf->streams[i]->r_frame_rate.den;
+    info->timebase_num = h->lavf->streams[i]->time_base.num;
+    info->timebase_den = h->lavf->streams[i]->time_base.den;
+    h->vfr_input       = info->vfr;
+    h->vertical_flip   = 0;
+
+    /* avisynth stores rgb data vertically flipped. */
+    if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) &&
+        (h->cur_pix_fmt == PIX_FMT_BGRA || h->cur_pix_fmt == PIX_FMT_BGR24) )
+        info->csp |= X264_CSP_VFLIP;
+
+    if( h->cur_pix_fmt != PIX_FMT_YUV420P )
+        fprintf( stderr, "lavf [warning]: converting from %s to YV12\n",
+                 avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+
+    if( avcodec_open( c, avcodec_find_decoder( c->codec_id ) ) )
+    {
+        fprintf( stderr, "lavf [error]: could not find decoder for video stream\n" );
+        goto fail_input;
+    }
+
+    /* prefetch the first frame and set/confirm flags */
+    h->first_pic = malloc( sizeof(x264_picture_t) );
+    if( !h->first_pic || lavf_input.picture_alloc( h->first_pic, info->csp, info->width, info->height ) )
+    {
+        fprintf( stderr, "lavf [error]: malloc failed\n" );
+        free( h->first_pic );
+        goto fail_codec;
+    }
+    else if( read_frame_internal( h->first_pic, h, 0, info ) )
+    {
+        lavf_input.picture_clean( h->first_pic );
+        free( h->first_pic );
+        goto fail_codec;
+    }
+
+    info->sar_height = c->sample_aspect_ratio.den;
+    info->sar_width  = c->sample_aspect_ratio.num;
+    *p_handle = h;
+
+    return 0;
+
+    /* staged cleanup: each label releases one resource and falls through to the next */
+fail_codec:
+    if( h->scaler )
+        sws_freeContext( h->scaler );
+    avcodec_close( c );
+fail_input:
+    av_close_input_file( h->lavf );
+fail_handle:
+    free( h );
+    return -1;
+}
+
+/* Allocate the image of `pic' through x264_picture_alloc and attach a
+ * lavf_pic_t (decoder frame and packet) to pic->opaque.
+ * Returns 0 on success, -1 when either allocation fails; in that case
+ * nothing is left allocated. */
+static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
+{
+    if( x264_picture_alloc( pic, i_csp, i_width, i_height ) )
+        return -1;
+    lavf_pic_t *pic_h = pic->opaque = malloc( sizeof(lavf_pic_t) );
+    if( !pic_h )
+    {
+        /* give back the image allocated just above instead of leaking it */
+        x264_picture_clean( pic );
+        return -1;
+    }
+    avcodec_get_frame_defaults( &pic_h->frame );
+    av_init_packet( &pic_h->packet );
+    return 0;
+}
+
+/* FIXME */
+/* always reports 0: this module does not determine the number of frames in the input */
+static int get_frame_total( hnd_t handle )
+{
+    return 0;
+}
+
+/* cli_input_t entry point: a regular read, i.e. read_frame_internal without
+ * a video_info_t to update */
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    lavf_hnd_t *h = handle;
+    return read_frame_internal( p_pic, h, i_frame, NULL );
+}
+
+/* counterpart of picture_alloc: frees the lavf_pic_t attached to the picture,
+ * then lets x264_picture_clean release the picture itself */
+static void picture_clean( x264_picture_t *pic )
+{
+    lavf_pic_t *pic_h = pic->opaque;
+    free( pic_h );
+    x264_picture_clean( pic );
+}
+
+/* free the scaler, close the decoder and the input, free the handle; always returns 0 */
+static int close_file( hnd_t handle )
+{
+    lavf_hnd_t *h = handle;
+    AVCodecContext *c = h->lavf->streams[h->stream_id]->codec;
+
+    sws_freeContext( h->scaler );
+    avcodec_close( c );
+    av_close_input_file( h->lavf );
+
+    free( h );
+    return 0;
+}
+
+/* entry points of the libavformat input module; there is no release_frame hook */
+cli_input_t lavf_input =
+{
+    .open_file       = open_file,
+    .get_frame_total = get_frame_total,
+    .picture_alloc   = picture_alloc,
+    .read_frame      = read_frame,
+    .release_frame   = NULL,
+    .picture_clean   = picture_clean,
+    .close_file      = close_file,
+};
diff --git a/input/thread.c b/input/thread.c
new file mode 100644
index 0000000..a88cfae
--- /dev/null
+++ b/input/thread.c
@@ -0,0 +1,136 @@
+/*****************************************************************************
+ * thread.c: x264 threaded input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+extern cli_input_t input;
+
+typedef struct
+{
+    cli_input_t input; /* copy of the wrapped input's function table, taken in open_file */
+    hnd_t p_handle; /* handle of the wrapped input */
+    x264_picture_t pic; /* buffer the prefetch thread reads into; swapped with the caller's picture on a hit */
+    x264_pthread_t tid; /* id of the most recently created prefetch thread */
+    int next_frame; /* index of the frame being prefetched, or -1 when there is none */
+    int frame_total; /* value of the wrapped input's get_frame_total, cached in open_file */
+    int in_progress; /* 1 between creating a prefetch thread and joining it */
+    struct thread_input_arg_t *next_args; /* argument block handed to the prefetch thread */
+} thread_hnd_t;
+
+typedef struct thread_input_arg_t
+{
+    thread_hnd_t *h; /* owning wrapper; gives the thread the wrapped input and its handle */
+    x264_picture_t *pic; /* picture the thread reads the frame into */
+    int i_frame; /* index of the frame to read */
+    int status; /* return value of the wrapped read_frame, written by the thread */
+} thread_input_arg_t;
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    thread_hnd_t *h = malloc( sizeof(thread_hnd_t) ); /* wrapper around the input already open in *p_handle */
+    thread_input_arg_t *args = malloc( sizeof(thread_input_arg_t) );
+    if( !h || !args || input.picture_alloc( &h->pic, info->csp, info->width, info->height ) )
+    {
+        fprintf( stderr, "x264 [error]: malloc failed\n" );
+        free( args ); /* free(NULL) is a no-op, so whichever allocation succeeded is released */
+        free( h );
+        return -1;
+    }
+    h->input = input; /* private copy of the wrapped input's function table */
+    h->p_handle = *p_handle;
+    h->in_progress = 0;
+    h->next_frame = -1; /* nothing prefetched yet */
+    h->next_args = args;
+    args->h = h;
+    args->status = 0;
+    h->frame_total = input.get_frame_total( h->p_handle );
+    thread_input.picture_alloc = h->input.picture_alloc; /* pictures are allocated and cleaned the wrapped input's way */
+    thread_input.picture_clean = h->input.picture_clean;
+    *p_handle = h; /* from here on the caller holds the wrapper */
+    return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+    /* Answered from the value open_file cached; the wrapped input is not asked again. */
+    return ((thread_hnd_t *)handle)->frame_total;
+}
+
+static void read_frame_thread_int( thread_input_arg_t *i )
+{
+    i->status = i->h->input.read_frame( i->pic, i->h->p_handle, i->i_frame ); /* thread body; read_frame() picks up status after joining */
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    thread_hnd_t *h = handle;
+    int ret = 0;
+
+    /* Collect the prefetch started by the previous call, if there was one. */
+    if( h->next_frame >= 0 )
+    {
+        x264_pthread_join( h->tid, NULL );
+        ret |= h->next_args->status;
+        h->in_progress = 0;
+    }
+    /* Prefetch hit: swap buffers. Miss: read synchronously from the wrapped input. */
+    if( h->next_frame == i_frame )
+        XCHG( x264_picture_t, *p_pic, h->pic );
+    else
+        ret |= h->input.read_frame( p_pic, h->p_handle, i_frame );
+
+    h->next_frame = -1;
+    if( !h->frame_total || i_frame+1 < h->frame_total )
+    {
+        h->next_args->i_frame = i_frame+1;
+        h->next_args->pic = &h->pic;
+        /* Record the prefetch only once a thread really exists, so a failed create is never joined. */
+        if( x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args ) )
+            return -1;
+        h->next_frame = i_frame+1;
+        h->in_progress = 1;
+    }
+    return ret;
+}
+
+static int release_frame( x264_picture_t *pic, hnd_t handle )
+{
+    thread_hnd_t *thr = handle;
+    /* Forward to the wrapped input; an input without this hook counts as success. */
+    return thr->input.release_frame ? thr->input.release_frame( pic, thr->p_handle )
+                                    : 0;
+}
+
+static int close_file( hnd_t handle )
+{
+    thread_hnd_t *h = handle;
+    if( h->in_progress ) /* a prefetch thread was created and has not been joined yet */
+        x264_pthread_join( h->tid, NULL );
+    h->input.close_file( h->p_handle ); /* close the wrapped input; its return value is ignored */
+    h->input.picture_clean( &h->pic ); /* prefetch buffer allocated in open_file */
+    free( h->next_args );
+    free( h );
+    return 0;
+}
+
+cli_input_t thread_input = { open_file, get_frame_total, NULL, read_frame, release_frame, NULL, close_file }; /* the NULL slots (picture_alloc, picture_clean) are filled in by open_file */
diff --git a/input/y4m.c b/input/y4m.c
new file mode 100644
index 0000000..1619f74
--- /dev/null
+++ b/input/y4m.c
@@ -0,0 +1,245 @@
+/*****************************************************************************
+ * y4m.c: x264 y4m input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+typedef struct
+{
+    FILE *fh; /* input stream; stdin when the file name is "-" */
+    int width, height; /* frame dimensions parsed from the W and H header tags */
+    int next_frame; /* index of the frame the stream is expected to be positioned on */
+    int seq_header_len, frame_header_len; /* byte lengths, '\n' included, of the stream header line and of the last frame header read */
+    int frame_size; /* not referenced anywhere in this file */
+} y4m_hnd_t;
+
+#define Y4M_MAGIC "YUV4MPEG2" /* the stream header line must start with this */
+#define MAX_YUV4_HEADER 80 /* open_file gives up if no '\n' shows up within this many bytes */
+#define Y4M_FRAME_MAGIC "FRAME" /* every frame header must start with this */
+#define MAX_FRAME_HEADER 80 /* cap on bytes skipped after the frame tag while looking for '\n' */
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    /* Open the stream ("-" means stdin) and parse its YUV4MPEG2 header line into *info. */
+    y4m_hnd_t *h = malloc( sizeof(y4m_hnd_t) );
+    int  i, n, d;
+    char header[MAX_YUV4_HEADER+10];
+    char *tokstart, *tokend, *header_end;
+    if( !h )
+        return -1;
+    h->next_frame = 0;
+    info->vfr = 0;
+    if( !strcmp( psz_filename, "-" ) )
+        h->fh = stdin;
+    else
+        h->fh = fopen(psz_filename, "rb");
+    if( h->fh == NULL )
+        goto fail;
+    h->frame_header_len = strlen( Y4M_FRAME_MAGIC )+1;
+
+    /* Read header */
+    for( i = 0; i < MAX_YUV4_HEADER; i++ )
+    {
+        header[i] = fgetc( h->fh );
+        if( header[i] == '\n' )
+        {
+            /* Add a space after last option. Makes parsing "444" vs
+               "444alpha" easier. */
+            header[i+1] = 0x20;
+            header[i+2] = 0;
+            break;
+        }
+    }
+    if( i == MAX_YUV4_HEADER || strncmp( header, Y4M_MAGIC, strlen( Y4M_MAGIC ) ) )
+        goto fail;
+
+    /* Scan properties */
+    header_end = &header[i+1]; /* Include space */
+    h->seq_header_len = i+1;
+    for( tokstart = &header[strlen( Y4M_MAGIC )+1]; tokstart < header_end; tokstart++ )
+    {
+        if( *tokstart == 0x20 )
+            continue;
+        switch( *tokstart++ )
+        {
+            case 'W': /* Width. Required. */
+                h->width = info->width = strtol( tokstart, &tokend, 10 );
+                tokstart=tokend;
+                break;
+            case 'H': /* Height. Required. */
+                h->height = info->height = strtol( tokstart, &tokend, 10 );
+                tokstart=tokend;
+                break;
+            case 'C': /* Color space */
+                if( strncmp( "420", tokstart, 3 ) )
+                {
+                    fprintf( stderr, "y4m [error]: colorspace unhandled\n" );
+                    goto fail;
+                }
+                tokstart = strchr( tokstart, 0x20 );
+                break;
+            case 'I': /* Interlace type */
+                switch( *tokstart++ )
+                {
+                    case 'p': break;
+                    default: /* '?', 't', 'b', 'm' and anything unrecognised */
+                        info->interlaced = 1;
+                }
+                break;
+            case 'F': /* Frame rate - 0:0 if unknown */
+                if( sscanf( tokstart, "%d:%d", &n, &d ) == 2 && n && d )
+                {
+                    x264_reduce_fraction( &n, &d );
+                    info->fps_num = n;
+                    info->fps_den = d;
+                }
+                tokstart = strchr( tokstart, 0x20 );
+                break;
+            case 'A': /* Pixel aspect - 0:0 if unknown */
+                /* NOTE(review): a SAR given on the command line is not checked here; presumably the caller gives it precedence. */
+                if( sscanf( tokstart, "%d:%d", &n, &d ) == 2 && n && d )
+                {
+                    x264_reduce_fraction( &n, &d );
+                    info->sar_width  = n;
+                    info->sar_height = d;
+                }
+                tokstart = strchr( tokstart, 0x20 );
+                break;
+            case 'X': /* Vendor extensions */
+                if( !strncmp( "YSCSS=", tokstart, 6 ) )
+                {
+                    /* Older nonstandard pixel format representation */
+                    tokstart += 6;
+                    if( strncmp( "420JPEG",tokstart, 7 ) &&
+                        strncmp( "420MPEG2",tokstart, 8 ) &&
+                        strncmp( "420PALDV",tokstart, 8 ) )
+                    {
+                        fprintf( stderr, "y4m [error]: unsupported extended colorspace\n" );
+                        goto fail;
+                    }
+                }
+                tokstart = strchr( tokstart, 0x20 );
+                break;
+        }
+    }
+
+    *p_handle = h;
+    return 0;
+fail:
+    /* h was never handed to the caller, so it and its stream have to be released here. */
+    if( h->fh && h->fh != stdin )
+        fclose( h->fh );
+    free( h );
+    return -1;
+}
+
+/* Estimate the frame count from the file size, assuming every frame header is
+ * as long as frame_header_len (the bare "FRAME" line is the common case). */
+static int get_frame_total( hnd_t handle )
+{
+    y4m_hnd_t *y4m = handle;
+    uint64_t start, total;
+    int bytes_per_frame;
+    if( !x264_is_regular_file( y4m->fh ) )
+        return 0; /* not a regular file: no size to go by */
+
+    /* Measure the file, then put the read position back where it was. */
+    start = ftell( y4m->fh );
+    fseek( y4m->fh, 0, SEEK_END );
+    total = ftell( y4m->fh );
+    fseek( y4m->fh, start, SEEK_SET );
+    bytes_per_frame = 3*(y4m->width*y4m->height)/2+y4m->frame_header_len;
+    return (int)((total - y4m->seq_header_len) / bytes_per_frame);
+}
+
+static int read_frame_internal( x264_picture_t *p_pic, y4m_hnd_t *h )
+{
+    const int magic_len = strlen( Y4M_FRAME_MAGIC );
+    const int luma_size = h->width * h->height;
+    char magic[16];
+    int skipped;
+
+    /* Every frame starts with the "FRAME" tag; read exactly that many bytes. */
+    if( fread( magic, 1, magic_len, h->fh ) != magic_len )
+        return -1;
+    magic[magic_len] = 0;
+    if( strncmp( magic, Y4M_FRAME_MAGIC, magic_len ) )
+    {
+        fprintf( stderr, "y4m [error]: bad header magic (%"PRIx32" <=> %s)\n",
+                 M32(magic), magic );
+        return -1;
+    }
+    /* Discard the rest of the header line, up to and including its '\n'. */
+    for( skipped = 0; skipped < MAX_FRAME_HEADER; skipped++ )
+        if( fgetc( h->fh ) == '\n' )
+            break;
+    if( skipped == MAX_FRAME_HEADER )
+    {
+        fprintf( stderr, "y4m [error]: bad frame header!\n" );
+        return -1;
+    }
+    h->frame_header_len = skipped+magic_len+1;
+
+    /* Three planes back to back: one of width*height bytes, then two of a quarter that size. */
+    if( fread( p_pic->img.plane[0], luma_size, 1, h->fh ) <= 0 )
+        return -1;
+    if( fread( p_pic->img.plane[1], luma_size / 4, 1, h->fh ) <= 0 )
+        return -1;
+    return fread( p_pic->img.plane[2], luma_size / 4, 1, h->fh ) <= 0 ? -1 : 0;
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    y4m_hnd_t *y4m = handle;
+
+    /* A request ahead of the stream is reached by seeking when the file allows it,
+     * and by reading and discarding the frames in between when it does not.
+     * The seek offset assumes every frame header has the current frame_header_len. */
+    if( i_frame > y4m->next_frame && x264_is_regular_file( y4m->fh ) )
+    {
+        int frame_bytes = 3*(y4m->width*y4m->height)/2+y4m->frame_header_len;
+        fseek( y4m->fh, (uint64_t)i_frame*frame_bytes + y4m->seq_header_len, SEEK_SET );
+    }
+    else
+        for( ; i_frame > y4m->next_frame; y4m->next_frame++ )
+            if( read_frame_internal( p_pic, y4m ) )
+                return -1;
+
+    /* Read the frame at the current position into the caller's picture. */
+    if( read_frame_internal( p_pic, y4m ) )
+        return -1;
+
+    y4m->next_frame = i_frame+1;
+    return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+    y4m_hnd_t *h = handle;
+    /* Close the stream if there is one, then always free the handle (free(NULL) is a no-op). */
+    if( h && h->fh )
+        fclose( h->fh );
+    free( h );
+    return 0;
+}
+
+cli_input_t y4m_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file }; /* NULL: no release_frame hook */
diff --git a/input/yuv.c b/input/yuv.c
new file mode 100644
index 0000000..dbd0317
--- /dev/null
+++ b/input/yuv.c
@@ -0,0 +1,128 @@
+/*****************************************************************************
+ * yuv.c: x264 yuv input module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+
+typedef struct
+{
+    FILE *fh; /* input stream; stdin when the file name is "-" */
+    int width, height; /* frame dimensions copied from video_info_t in open_file */
+    int next_frame; /* index of the frame the stream is expected to be positioned on */
+} yuv_hnd_t;
+
+static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{
+    yuv_hnd_t *h;
+
+    /* The resolution comes from the option if given, otherwise from the first WxH in the file name. */
+    if( !opt->resolution )
+    {
+        char *p;
+        for( p = psz_filename; *p; p++ )
+            if( *p >= '0' && *p <= '9' && sscanf( p, "%ux%u", &info->width, &info->height ) == 2 )
+                break;
+    }
+    else
+        sscanf( opt->resolution, "%ux%u", &info->width, &info->height );
+    if( !info->width || !info->height )
+    {
+        fprintf( stderr, "yuv [error]: rawyuv input requires a resolution.\n" );
+        return -1;
+    }
+
+    /* Allocate only after the resolution check, so that error path has nothing to leak. */
+    h = malloc( sizeof(yuv_hnd_t) );
+    if( !h )
+        return -1;
+    h->next_frame = 0;
+    info->vfr     = 0;
+    h->width      = info->width;
+    h->height     = info->height;
+    h->fh = strcmp( psz_filename, "-" ) ? fopen( psz_filename, "rb" ) : stdin;
+    if( h->fh == NULL )
+    {
+        free( h ); /* never handed to the caller, so release it here */
+        return -1;
+    }
+    *p_handle = h;
+    return 0;
+}
+
+static int get_frame_total( hnd_t handle )
+{
+    yuv_hnd_t *h = handle;
+    int i_frame_total = 0;
+    /* Frame count = file size / bytes per frame (width*height*3/2); 0 for anything but a regular file. */
+    if( x264_is_regular_file( h->fh ) )
+    {
+        uint64_t init_pos = ftell( h->fh ); /* put back afterwards rather than rewinding to 0 */
+        fseek( h->fh, 0, SEEK_END );
+        uint64_t i_size = ftell( h->fh );
+        fseek( h->fh, init_pos, SEEK_SET );
+        i_frame_total = (int)(i_size / ( (uint64_t)h->width * h->height * 3 / 2 )); /* 64-bit so large frames cannot overflow int */
+    }
+    return i_frame_total;
+}
+
+static int read_frame_internal( x264_picture_t *p_pic, yuv_hnd_t *h )
+{
+    return !( fread( p_pic->img.plane[0], h->width * h->height, 1, h->fh ) > 0 /* yields 0 only if all three planes were read */
+           && fread( p_pic->img.plane[1], h->width * h->height / 4, 1, h->fh ) > 0
+           && fread( p_pic->img.plane[2], h->width * h->height / 4, 1, h->fh ) > 0 ); /* otherwise 1, stopping at the first short read */
+}
+
+static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
+{
+    yuv_hnd_t *yuv = handle;
+
+    /* A request ahead of the stream is reached by seeking straight to the
+     * frame's byte offset when the file allows it, and by reading and
+     * discarding the frames in between when it does not. */
+    if( i_frame > yuv->next_frame && x264_is_regular_file( yuv->fh ) )
+        fseek( yuv->fh, (uint64_t)i_frame * yuv->width * yuv->height * 3 / 2, SEEK_SET );
+    else
+        for( ; i_frame > yuv->next_frame; yuv->next_frame++ )
+            if( read_frame_internal( p_pic, yuv ) )
+                return -1;
+
+    /* Read the frame at the current position into the caller's picture. */
+    if( read_frame_internal( p_pic, yuv ) )
+        return -1;
+
+    /* Remember where the stream stands so a sequential follow-up needs no skipping. */
+    yuv->next_frame = i_frame+1;
+
+    return 0;
+}
+
+static int close_file( hnd_t handle )
+{
+    yuv_hnd_t *h = handle;
+    /* Close the stream if there is one, then always free the handle (free(NULL) is a no-op). */
+    if( h && h->fh )
+        fclose( h->fh );
+    free( h );
+    return 0;
+}
+
+cli_input_t yuv_input = { open_file, get_frame_total, x264_picture_alloc, read_frame, NULL, x264_picture_clean, close_file }; /* NULL: no release_frame hook */
diff --git a/matroska.c b/matroska.c
deleted file mode 100644
index 35ae4cd..0000000
--- a/matroska.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/*****************************************************************************
- * matroska.c:
- *****************************************************************************
- * Copyright (C) 2005 Mike Matsnev
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *****************************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "common/osdep.h"
-#include "matroska.h"
-
-#define	CLSIZE	  1048576
-#define	CHECK(x)  do { if ((x) < 0) return -1; } while (0)
-
-struct mk_Context {
-  struct mk_Context *next, **prev, *parent;
-  struct mk_Writer  *owner;
-  unsigned	    id;
-
-  void		    *data;
-  unsigned	    d_cur, d_max;
-};
-
-typedef struct mk_Context mk_Context;
-
-struct mk_Writer {
-  FILE		      *fp;
-
-  unsigned	      duration_ptr;
-
-  mk_Context	      *root, *cluster, *frame;
-  mk_Context	      *freelist;
-  mk_Context	      *actlist;
-
-  int64_t	      def_duration;
-  int64_t	      timescale;
-  int64_t	      cluster_tc_scaled;
-  int64_t	      frame_tc, prev_frame_tc_scaled, max_frame_tc;
-
-  char		      wrote_header, in_frame, keyframe;
-};
-
-static mk_Context *mk_createContext(mk_Writer *w, mk_Context *parent, unsigned id) {
-  mk_Context  *c;
-
-  if (w->freelist) {
-    c = w->freelist;
-    w->freelist = w->freelist->next;
-  } else {
-    c = malloc(sizeof(*c));
-    memset(c, 0, sizeof(*c));
-  }
-
-  if (c == NULL)
-    return NULL;
-
-  c->parent = parent;
-  c->owner = w;
-  c->id = id;
-
-  if (c->owner->actlist)
-    c->owner->actlist->prev = &c->next;
-  c->next = c->owner->actlist;
-  c->prev = &c->owner->actlist;
-  c->owner->actlist = c;
-
-  return c;
-}
-
-static int	  mk_appendContextData(mk_Context *c, const void *data, unsigned size) {
-  unsigned  ns = c->d_cur + size;
-
-  if (ns > c->d_max) {
-    void      *dp;
-    unsigned  dn = c->d_max ? c->d_max << 1 : 16;
-    while (ns > dn)
-      dn <<= 1;
-
-    dp = realloc(c->data, dn);
-    if (dp == NULL)
-      return -1;
-
-    c->data = dp;
-    c->d_max = dn;
-  }
-
-  memcpy((char*)c->data + c->d_cur, data, size);
-
-  c->d_cur = ns;
-
-  return 0;
-}
-
-static int	  mk_writeID(mk_Context *c, unsigned id) {
-  unsigned char	  c_id[4] = { id >> 24, id >> 16, id >> 8, id };
-
-  if (c_id[0])
-    return mk_appendContextData(c, c_id, 4);
-  if (c_id[1])
-    return mk_appendContextData(c, c_id+1, 3);
-  if (c_id[2])
-    return mk_appendContextData(c, c_id+2, 2);
-  return mk_appendContextData(c, c_id+3, 1);
-}
-
-static int	  mk_writeSize(mk_Context *c, unsigned size) {
-  unsigned char	  c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size };
-
-  if (size < 0x7f) {
-    c_size[4] |= 0x80;
-    return mk_appendContextData(c, c_size+4, 1);
-  }
-  if (size < 0x3fff) {
-    c_size[3] |= 0x40;
-    return mk_appendContextData(c, c_size+3, 2);
-  }
-  if (size < 0x1fffff) {
-    c_size[2] |= 0x20;
-    return mk_appendContextData(c, c_size+2, 3);
-  }
-  if (size < 0x0fffffff) {
-    c_size[1] |= 0x10;
-    return mk_appendContextData(c, c_size+1, 4);
-  }
-  return mk_appendContextData(c, c_size, 5);
-}
-
-static int	  mk_flushContextID(mk_Context *c) {
-  unsigned char	ff = 0xff;
-
-  if (c->id == 0)
-    return 0;
-
-  CHECK(mk_writeID(c->parent, c->id));
-  CHECK(mk_appendContextData(c->parent, &ff, 1));
-
-  c->id = 0;
-
-  return 0;
-}
-
-static int	  mk_flushContextData(mk_Context *c) {
-  if (c->d_cur == 0)
-    return 0;
-
-  if (c->parent)
-    CHECK(mk_appendContextData(c->parent, c->data, c->d_cur));
-  else
-    if (fwrite(c->data, c->d_cur, 1, c->owner->fp) != 1)
-      return -1;
-
-  c->d_cur = 0;
-
-  return 0;
-}
-
-static int	  mk_closeContext(mk_Context *c, unsigned *off) {
-  if (c->id) {
-    CHECK(mk_writeID(c->parent, c->id));
-    CHECK(mk_writeSize(c->parent, c->d_cur));
-  }
-
-  if (c->parent && off != NULL)
-    *off += c->parent->d_cur;
-
-  CHECK(mk_flushContextData(c));
-
-  if (c->next)
-    c->next->prev = c->prev;
-  *(c->prev) = c->next;
-  c->next = c->owner->freelist;
-  c->owner->freelist = c;
-
-  return 0;
-}
-
-static void	  mk_destroyContexts(mk_Writer *w) {
-  mk_Context  *cur, *next;
-
-  for (cur = w->freelist; cur; cur = next) {
-    next = cur->next;
-    free(cur->data);
-    free(cur);
-  }
-
-  for (cur = w->actlist; cur; cur = next) {
-    next = cur->next;
-    free(cur->data);
-    free(cur);
-  }
-
-  w->freelist = w->actlist = w->root = NULL;
-}
-
-static int	  mk_writeStr(mk_Context *c, unsigned id, const char *str) {
-  size_t  len = strlen(str);
-
-  CHECK(mk_writeID(c, id));
-  CHECK(mk_writeSize(c, len));
-  CHECK(mk_appendContextData(c, str, len));
-  return 0;
-}
-
-static int	  mk_writeBin(mk_Context *c, unsigned id, const void *data, unsigned size) {
-  CHECK(mk_writeID(c, id));
-  CHECK(mk_writeSize(c, size));
-  CHECK(mk_appendContextData(c, data, size));
-  return 0;
-}
-
-static int	  mk_writeUInt(mk_Context *c, unsigned id, int64_t ui) {
-  unsigned char	  c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui };
-  unsigned	  i = 0;
-
-  CHECK(mk_writeID(c, id));
-  while (i < 7 && c_ui[i] == 0)
-    ++i;
-  CHECK(mk_writeSize(c, 8 - i));
-  CHECK(mk_appendContextData(c, c_ui+i, 8 - i));
-  return 0;
-}
-
-static int  	  mk_writeSInt(mk_Context *c, unsigned id, int64_t si) {
-  unsigned char	  c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
-  unsigned	  i = 0;
-
-  CHECK(mk_writeID(c, id));
-  if (si < 0)
-    while (i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80)
-      ++i;
-  else
-    while (i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80))
-      ++i;
-  CHECK(mk_writeSize(c, 8 - i));
-  CHECK(mk_appendContextData(c, c_si+i, 8 - i));
-  return 0;
-}
-
-static int	  mk_writeFloatRaw(mk_Context *c, float f) {
-  union {
-    float f;
-    unsigned u;
-  } u;
-  unsigned char	c_f[4];
-
-  u.f = f;
-  c_f[0] = u.u >> 24;
-  c_f[1] = u.u >> 16;
-  c_f[2] = u.u >> 8;
-  c_f[3] = u.u;
-
-  return mk_appendContextData(c, c_f, 4);
-}
-
-static int	  mk_writeFloat(mk_Context *c, unsigned id, float f) {
-  CHECK(mk_writeID(c, id));
-  CHECK(mk_writeSize(c, 4));
-  CHECK(mk_writeFloatRaw(c, f));
-  return 0;
-}
-
-static unsigned	  mk_ebmlSizeSize(unsigned s) {
-  if (s < 0x7f)
-    return 1;
-  if (s < 0x3fff)
-    return 2;
-  if (s < 0x1fffff)
-    return 3;
-  if (s < 0x0fffffff)
-    return 4;
-  return 5;
-}
-
-static unsigned	  mk_ebmlSIntSize(int64_t si) {
-  unsigned char	  c_si[8] = { si >> 56, si >> 48, si >> 40, si >> 32, si >> 24, si >> 16, si >> 8, si };
-  unsigned	  i = 0;
-
-  if (si < 0)
-    while (i < 7 && c_si[i] == 0xff && c_si[i+1] & 0x80)
-      ++i;
-  else
-    while (i < 7 && c_si[i] == 0 && !(c_si[i+1] & 0x80))
-      ++i;
-
-  return 8 - i;
-}
-
-mk_Writer *mk_createWriter(const char *filename) {
-  mk_Writer *w = malloc(sizeof(*w));
-  if (w == NULL)
-    return NULL;
-
-  memset(w, 0, sizeof(*w));
-
-  w->root = mk_createContext(w, NULL, 0);
-  if (w->root == NULL) {
-    free(w);
-    return NULL;
-  }
-
-  w->fp = fopen(filename, "wb");
-  if (w->fp == NULL) {
-    mk_destroyContexts(w);
-    free(w);
-    return NULL;
-  }
-
-  w->timescale = 1000000;
-
-  return w;
-}
-
-int	  mk_writeHeader(mk_Writer *w, const char *writingApp,
-			 const char *codecID,
-			 const void *codecPrivate, unsigned codecPrivateSize,
-			 int64_t default_frame_duration,
-			 int64_t timescale,
-			 unsigned width, unsigned height,
-			 unsigned d_width, unsigned d_height)
-{
-  mk_Context  *c, *ti, *v;
-
-  if (w->wrote_header)
-    return -1;
-
-  w->timescale = timescale;
-  w->def_duration = default_frame_duration;
-
-  if ((c = mk_createContext(w, w->root, 0x1a45dfa3)) == NULL) // EBML
-    return -1;
-  CHECK(mk_writeUInt(c, 0x4286, 1)); // EBMLVersion
-  CHECK(mk_writeUInt(c, 0x42f7, 1)); // EBMLReadVersion
-  CHECK(mk_writeUInt(c, 0x42f2, 4)); // EBMLMaxIDLength
-  CHECK(mk_writeUInt(c, 0x42f3, 8)); // EBMLMaxSizeLength
-  CHECK(mk_writeStr(c, 0x4282, "matroska")); // DocType
-  CHECK(mk_writeUInt(c, 0x4287, 1)); // DocTypeVersion
-  CHECK(mk_writeUInt(c, 0x4285, 1)); // DocTypeReadversion
-  CHECK(mk_closeContext(c, 0));
-
-  if ((c = mk_createContext(w, w->root, 0x18538067)) == NULL) // Segment
-    return -1;
-  CHECK(mk_flushContextID(c));
-  CHECK(mk_closeContext(c, 0));
-
-  if ((c = mk_createContext(w, w->root, 0x1549a966)) == NULL) // SegmentInfo
-    return -1;
-  CHECK(mk_writeStr(c, 0x4d80, "Haali Matroska Writer b0"));
-  CHECK(mk_writeStr(c, 0x5741, writingApp));
-  CHECK(mk_writeUInt(c, 0x2ad7b1, w->timescale));
-  CHECK(mk_writeFloat(c, 0x4489, 0));
-  w->duration_ptr = c->d_cur - 4;
-  CHECK(mk_closeContext(c, &w->duration_ptr));
-
-  if ((c = mk_createContext(w, w->root, 0x1654ae6b)) == NULL) // tracks
-    return -1;
-  if ((ti = mk_createContext(w, c, 0xae)) == NULL) // TrackEntry
-    return -1;
-  CHECK(mk_writeUInt(ti, 0xd7, 1)); // TrackNumber
-  CHECK(mk_writeUInt(ti, 0x73c5, 1)); // TrackUID
-  CHECK(mk_writeUInt(ti, 0x83, 1)); // TrackType
-  CHECK(mk_writeUInt(ti, 0x9c, 0)); // FlagLacing
-  CHECK(mk_writeStr(ti, 0x86, codecID)); // CodecID
-  if (codecPrivateSize)
-    CHECK(mk_writeBin(ti, 0x63a2, codecPrivate, codecPrivateSize)); // CodecPrivate
-  if (default_frame_duration)
-    CHECK(mk_writeUInt(ti, 0x23e383, default_frame_duration)); // DefaultDuration
-
-  if ((v = mk_createContext(w, ti, 0xe0)) == NULL) // Video
-    return -1;
-  CHECK(mk_writeUInt(v, 0xb0, width));
-  CHECK(mk_writeUInt(v, 0xba, height));
-  CHECK(mk_writeUInt(v, 0x54b0, d_width));
-  CHECK(mk_writeUInt(v, 0x54ba, d_height));
-  CHECK(mk_closeContext(v, 0));
-
-  CHECK(mk_closeContext(ti, 0));
-
-  CHECK(mk_closeContext(c, 0));
-
-  CHECK(mk_flushContextData(w->root));
-
-  w->wrote_header = 1;
-
-  return 0;
-}
-
-static int mk_closeCluster(mk_Writer *w) {
-  if (w->cluster == NULL)
-    return 0;
-  CHECK(mk_closeContext(w->cluster, 0));
-  w->cluster = NULL;
-  CHECK(mk_flushContextData(w->root));
-  return 0;
-}
-
-static int mk_flushFrame(mk_Writer *w) {
-  int64_t	delta, ref = 0;
-  unsigned	fsize, bgsize;
-  unsigned char	c_delta_flags[3];
-
-  if (!w->in_frame)
-    return 0;
-
-  delta = w->frame_tc/w->timescale - w->cluster_tc_scaled;
-  if (delta > 32767ll || delta < -32768ll)
-    CHECK(mk_closeCluster(w));
-
-  if (w->cluster == NULL) {
-    w->cluster_tc_scaled = w->frame_tc / w->timescale;
-    w->cluster = mk_createContext(w, w->root, 0x1f43b675); // Cluster
-    if (w->cluster == NULL)
-      return -1;
-
-    CHECK(mk_writeUInt(w->cluster, 0xe7, w->cluster_tc_scaled)); // Timecode
-
-    delta = 0;
-  }
-
-  fsize = w->frame ? w->frame->d_cur : 0;
-  bgsize = fsize + 4 + mk_ebmlSizeSize(fsize + 4) + 1;
-  if (!w->keyframe) {
-    ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta;
-    bgsize += 1 + 1 + mk_ebmlSIntSize(ref);
-  }
-
-  CHECK(mk_writeID(w->cluster, 0xa0)); // BlockGroup
-  CHECK(mk_writeSize(w->cluster, bgsize));
-  CHECK(mk_writeID(w->cluster, 0xa1)); // Block
-  CHECK(mk_writeSize(w->cluster, fsize + 4));
-  CHECK(mk_writeSize(w->cluster, 1)); // track number
-
-  c_delta_flags[0] = delta >> 8;
-  c_delta_flags[1] = delta;
-  c_delta_flags[2] = 0;
-  CHECK(mk_appendContextData(w->cluster, c_delta_flags, 3));
-  if (w->frame) {
-    CHECK(mk_appendContextData(w->cluster, w->frame->data, w->frame->d_cur));
-    w->frame->d_cur = 0;
-  }
-  if (!w->keyframe)
-    CHECK(mk_writeSInt(w->cluster, 0xfb, ref)); // ReferenceBlock
-
-  w->in_frame = 0;
-  w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta;
-
-  if (w->cluster->d_cur > CLSIZE)
-    CHECK(mk_closeCluster(w));
-
-  return 0;
-}
-
-int	  mk_startFrame(mk_Writer *w) {
-  if (mk_flushFrame(w) < 0)
-    return -1;
-
-  w->in_frame = 1;
-  w->keyframe = 0;
-
-  return 0;
-}
-
-int	  mk_setFrameFlags(mk_Writer *w,int64_t timestamp, int keyframe) {
-  if (!w->in_frame)
-    return -1;
-
-  w->frame_tc = timestamp;
-  w->keyframe = keyframe != 0;
-
-  if (w->max_frame_tc < timestamp)
-    w->max_frame_tc = timestamp;
-
-  return 0;
-}
-
-int	  mk_addFrameData(mk_Writer *w, const void *data, unsigned size) {
-  if (!w->in_frame)
-    return -1;
-
-  if (w->frame == NULL)
-    if ((w->frame = mk_createContext(w, NULL, 0)) == NULL)
-      return -1;
-
-  return mk_appendContextData(w->frame, data, size);
-}
-
-int	  mk_close(mk_Writer *w) {
-  int	ret = 0;
-  if (mk_flushFrame(w) < 0 || mk_closeCluster(w) < 0)
-    ret = -1;
-  if (w->wrote_header) {
-    fseek(w->fp, w->duration_ptr, SEEK_SET);
-    if (mk_writeFloatRaw(w->root, (float)((double)(w->max_frame_tc+w->def_duration) / w->timescale)) < 0 ||
-	mk_flushContextData(w->root) < 0)
-      ret = -1;
-  }
-  mk_destroyContexts(w);
-  fclose(w->fp);
-  free(w);
-  return ret;
-}
-
diff --git a/muxers.c b/muxers.c
deleted file mode 100644
index d62be5c..0000000
--- a/muxers.c
+++ /dev/null
@@ -1,1019 +0,0 @@
-/*****************************************************************************
- * muxers.c: h264 file i/o plugins
- *****************************************************************************
- * Copyright (C) 2003-2008 x264 project
- *
- * Authors: Laurent Aimar <fenrir at via.ecp.fr>
- *          Loren Merritt <lorenm at u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *****************************************************************************/
-
-#include "common/common.h"
-#include "x264.h"
-#include "matroska.h"
-#include "muxers.h"
-
-#ifndef _MSC_VER
-#include "config.h"
-#endif
-
-#include <sys/types.h>
-
-#ifdef AVIS_INPUT
-#include <windows.h>
-#include <vfw.h>
-#endif
-
-#ifdef MP4_OUTPUT
-#include <gpac/isomedia.h>
-#endif
-
-static int64_t gcd( int64_t a, int64_t b )
-{
-    while (1)
-    {
-        int64_t c = a % b;
-        if( !c )
-            return b;
-        a = b;
-        b = c;
-    }
-}
-
-typedef struct {
-    FILE *fh;
-    int width, height;
-    int next_frame;
-} yuv_input_t;
-
-/* raw 420 yuv file operation */
-int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
-    yuv_input_t *h = malloc(sizeof(yuv_input_t));
-    h->width = p_param->i_width;
-    h->height = p_param->i_height;
-    h->next_frame = 0;
-
-    if( !strcmp(psz_filename, "-") )
-        h->fh = stdin;
-    else
-        h->fh = fopen(psz_filename, "rb");
-    if( h->fh == NULL )
-        return -1;
-
-    *p_handle = (hnd_t)h;
-    return 0;
-}
-
-int get_frame_total_yuv( hnd_t handle )
-{
-    yuv_input_t *h = handle;
-    int i_frame_total = 0;
-
-    if( !fseek( h->fh, 0, SEEK_END ) )
-    {
-        uint64_t i_size = ftell( h->fh );
-        fseek( h->fh, 0, SEEK_SET );
-        i_frame_total = (int)(i_size / ( h->width * h->height * 3 / 2 ));
-    }
-
-    return i_frame_total;
-}
-
-int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
-    yuv_input_t *h = handle;
-
-    if( i_frame != h->next_frame )
-        if( fseek( h->fh, (uint64_t)i_frame * h->width * h->height * 3 / 2, SEEK_SET ) )
-            return -1;
-
-    if( fread( p_pic->img.plane[0], 1, h->width * h->height, h->fh ) <= 0
-            || fread( p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh ) <= 0
-            || fread( p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh ) <= 0 )
-        return -1;
-
-    h->next_frame = i_frame+1;
-
-    return 0;
-}
-
-int close_file_yuv(hnd_t handle)
-{
-    yuv_input_t *h = handle;
-    if( !h || !h->fh )
-        return 0;
-    fclose( h->fh );
-    free( h );
-    return 0;
-}
-
-/* YUV4MPEG2 raw 420 yuv file operation */
-typedef struct {
-    FILE *fh;
-    int width, height;
-    int next_frame;
-    int seq_header_len, frame_header_len;
-    int frame_size;
-} y4m_input_t;
-
-#define Y4M_MAGIC "YUV4MPEG2"
-#define MAX_YUV4_HEADER 80
-#define Y4M_FRAME_MAGIC "FRAME"
-#define MAX_FRAME_HEADER 80
-
-int open_file_y4m( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
-    int  i, n, d;
-    char header[MAX_YUV4_HEADER+10];
-    char *tokstart, *tokend, *header_end;
-    y4m_input_t *h = malloc(sizeof(y4m_input_t));
-
-    h->next_frame = 0;
-
-    if( !strcmp(psz_filename, "-") )
-        h->fh = stdin;
-    else
-        h->fh = fopen(psz_filename, "rb");
-    if( h->fh == NULL )
-        return -1;
-
-    h->frame_header_len = strlen(Y4M_FRAME_MAGIC)+1;
-
-    /* Read header */
-    for( i=0; i<MAX_YUV4_HEADER; i++ )
-    {
-        header[i] = fgetc(h->fh);
-        if( header[i] == '\n' )
-        {
-            /* Add a space after last option. Makes parsing "444" vs
-               "444alpha" easier. */
-            header[i+1] = 0x20;
-            header[i+2] = 0;
-            break;
-        }
-    }
-    if( i == MAX_YUV4_HEADER || strncmp(header, Y4M_MAGIC, strlen(Y4M_MAGIC)) )
-        return -1;
-
-    /* Scan properties */
-    header_end = &header[i+1]; /* Include space */
-    h->seq_header_len = i+1;
-    for( tokstart = &header[strlen(Y4M_MAGIC)+1]; tokstart < header_end; tokstart++ )
-    {
-        if(*tokstart==0x20) continue;
-        switch(*tokstart++)
-        {
-        case 'W': /* Width. Required. */
-            h->width = p_param->i_width = strtol(tokstart, &tokend, 10);
-            tokstart=tokend;
-            break;
-        case 'H': /* Height. Required. */
-            h->height = p_param->i_height = strtol(tokstart, &tokend, 10);
-            tokstart=tokend;
-            break;
-        case 'C': /* Color space */
-            if( strncmp("420", tokstart, 3) )
-            {
-                fprintf(stderr, "Colorspace unhandled\n");
-                return -1;
-            }
-            tokstart = strchr(tokstart, 0x20);
-            break;
-        case 'I': /* Interlace type */
-            switch(*tokstart++)
-            {
-            case 'p': break;
-            case '?':
-            case 't':
-            case 'b':
-            case 'm':
-            default:
-                fprintf(stderr, "Warning, this sequence might be interlaced\n");
-            }
-            break;
-        case 'F': /* Frame rate - 0:0 if unknown */
-            if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d )
-            {
-                x264_reduce_fraction( &n, &d );
-                p_param->i_fps_num = n;
-                p_param->i_fps_den = d;
-            }
-            tokstart = strchr(tokstart, 0x20);
-            break;
-        case 'A': /* Pixel aspect - 0:0 if unknown */
-            /* Don't override the aspect ratio if sar has been explicitly set on the commandline. */
-            if( sscanf(tokstart, "%d:%d", &n, &d) == 2 && n && d && !p_param->vui.i_sar_width && !p_param->vui.i_sar_height )
-            {
-                x264_reduce_fraction( &n, &d );
-                p_param->vui.i_sar_width = n;
-                p_param->vui.i_sar_height = d;
-            }
-            tokstart = strchr(tokstart, 0x20);
-            break;
-        case 'X': /* Vendor extensions */
-            if( !strncmp("YSCSS=",tokstart,6) )
-            {
-                /* Older nonstandard pixel format representation */
-                tokstart += 6;
-                if( strncmp("420JPEG",tokstart,7) &&
-                    strncmp("420MPEG2",tokstart,8) &&
-                    strncmp("420PALDV",tokstart,8) )
-                {
-                    fprintf(stderr, "Unsupported extended colorspace\n");
-                    return -1;
-                }
-            }
-            tokstart = strchr(tokstart, 0x20);
-            break;
-        }
-    }
-
-    fprintf(stderr, "yuv4mpeg: %ix%i@%i/%ifps, %i:%i\n",
-            h->width, h->height, p_param->i_fps_num, p_param->i_fps_den,
-            p_param->vui.i_sar_width, p_param->vui.i_sar_height);
-
-    *p_handle = (hnd_t)h;
-    return 0;
-}
-
-/* Most common case: frame_header = "FRAME" */
-int get_frame_total_y4m( hnd_t handle )
-{
-    y4m_input_t *h             = handle;
-    int          i_frame_total = 0;
-    uint64_t     init_pos      = ftell(h->fh);
-
-    if( !fseek( h->fh, 0, SEEK_END ) )
-    {
-        uint64_t i_size = ftell( h->fh );
-        fseek( h->fh, init_pos, SEEK_SET );
-        i_frame_total = (int)((i_size - h->seq_header_len) /
-                              (3*(h->width*h->height)/2+h->frame_header_len));
-    }
-
-    return i_frame_total;
-}
-
-int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
-    int          slen = strlen(Y4M_FRAME_MAGIC);
-    int          i    = 0;
-    char         header[16];
-    y4m_input_t *h    = handle;
-
-    if( i_frame != h->next_frame )
-    {
-        if (fseek(h->fh, (uint64_t)i_frame*(3*(h->width*h->height)/2+h->frame_header_len)
-                  + h->seq_header_len, SEEK_SET))
-            return -1;
-    }
-
-    /* Read frame header - without terminating '\n' */
-    if (fread(header, 1, slen, h->fh) != slen)
-        return -1;
-
-    header[slen] = 0;
-    if (strncmp(header, Y4M_FRAME_MAGIC, slen))
-    {
-        fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
-                *((uint32_t*)header), header);
-        return -1;
-    }
-
-    /* Skip most of it */
-    while (i<MAX_FRAME_HEADER && fgetc(h->fh) != '\n')
-        i++;
-    if (i == MAX_FRAME_HEADER)
-    {
-        fprintf(stderr, "Bad frame header!\n");
-        return -1;
-    }
-    h->frame_header_len = i+slen+1;
-
-    if( fread(p_pic->img.plane[0], 1, h->width*h->height, h->fh) <= 0
-        || fread(p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh) <= 0
-        || fread(p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh) <= 0)
-        return -1;
-
-    h->next_frame = i_frame+1;
-
-    return 0;
-}
-
-int close_file_y4m(hnd_t handle)
-{
-    y4m_input_t *h = handle;
-    if( !h || !h->fh )
-        return 0;
-    fclose( h->fh );
-    free( h );
-    return 0;
-}
-
-/* avs/avi input file support under cygwin */
-
-#ifdef AVIS_INPUT
-typedef struct {
-    PAVISTREAM p_avi;
-    int width, height;
-} avis_input_t;
-
-int open_file_avis( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
-    avis_input_t *h = malloc(sizeof(avis_input_t));
-    AVISTREAMINFO info;
-    int i;
-
-    *p_handle = (hnd_t)h;
-
-    AVIFileInit();
-    if( AVIStreamOpenFromFile( &h->p_avi, psz_filename, streamtypeVIDEO, 0, OF_READ, NULL ) )
-    {
-        AVIFileExit();
-        return -1;
-    }
-
-    if( AVIStreamInfo(h->p_avi, &info, sizeof(AVISTREAMINFO)) )
-    {
-        AVIStreamRelease(h->p_avi);
-        AVIFileExit();
-        return -1;
-    }
-
-    // check input format
-    if (info.fccHandler != MAKEFOURCC('Y', 'V', '1', '2'))
-    {
-        fprintf( stderr, "avis [error]: unsupported input format (%c%c%c%c)\n",
-            (char)(info.fccHandler & 0xff), (char)((info.fccHandler >> 8) & 0xff),
-            (char)((info.fccHandler >> 16) & 0xff), (char)((info.fccHandler >> 24)) );
-
-        AVIStreamRelease(h->p_avi);
-        AVIFileExit();
-
-        return -1;
-    }
-
-    h->width =
-    p_param->i_width = info.rcFrame.right - info.rcFrame.left;
-    h->height =
-    p_param->i_height = info.rcFrame.bottom - info.rcFrame.top;
-    i = gcd(info.dwRate, info.dwScale);
-    p_param->i_fps_den = info.dwScale / i;
-    p_param->i_fps_num = info.dwRate / i;
-
-    fprintf( stderr, "avis [info]: %dx%d @ %.2f fps (%d frames)\n",
-        p_param->i_width, p_param->i_height,
-        (double)p_param->i_fps_num / (double)p_param->i_fps_den,
-        (int)info.dwLength );
-
-    return 0;
-}
-
-int get_frame_total_avis( hnd_t handle )
-{
-    avis_input_t *h = handle;
-    AVISTREAMINFO info;
-
-    if( AVIStreamInfo(h->p_avi, &info, sizeof(AVISTREAMINFO)) )
-        return -1;
-
-    return info.dwLength;
-}
-
-int read_frame_avis( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
-    avis_input_t *h = handle;
-
-    p_pic->img.i_csp = X264_CSP_YV12;
-
-    if( AVIStreamRead(h->p_avi, i_frame, 1, p_pic->img.plane[0], h->width * h->height * 3 / 2, NULL, NULL ) )
-        return -1;
-
-    return 0;
-}
-
-int close_file_avis( hnd_t handle )
-{
-    avis_input_t *h = handle;
-    AVIStreamRelease(h->p_avi);
-    AVIFileExit();
-    free(h);
-    return 0;
-}
-#endif
-
-
-#ifdef HAVE_PTHREAD
-typedef struct {
-    int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-    int (*p_close_infile)( hnd_t handle );
-    hnd_t p_handle;
-    x264_picture_t pic;
-    x264_pthread_t tid;
-    int next_frame;
-    int frame_total;
-    int in_progress;
-    struct thread_input_arg_t *next_args;
-} thread_input_t;
-
-typedef struct thread_input_arg_t {
-    thread_input_t *h;
-    x264_picture_t *pic;
-    int i_frame;
-    int status;
-} thread_input_arg_t;
-
-int open_file_thread( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param )
-{
-    thread_input_t *h = malloc(sizeof(thread_input_t));
-    x264_picture_alloc( &h->pic, X264_CSP_I420, p_param->i_width, p_param->i_height );
-    h->p_read_frame = p_read_frame;
-    h->p_close_infile = p_close_infile;
-    h->p_handle = *p_handle;
-    h->in_progress = 0;
-    h->next_frame = -1;
-    h->next_args = malloc(sizeof(thread_input_arg_t));
-    h->next_args->h = h;
-    h->next_args->status = 0;
-    h->frame_total = p_get_frame_total( h->p_handle );
-
-    *p_handle = (hnd_t)h;
-    return 0;
-}
-
-int get_frame_total_thread( hnd_t handle )
-{
-    thread_input_t *h = handle;
-    return h->frame_total;
-}
-
-static void read_frame_thread_int( thread_input_arg_t *i )
-{
-    i->status = i->h->p_read_frame( i->pic, i->h->p_handle, i->i_frame );
-}
-
-int read_frame_thread( x264_picture_t *p_pic, hnd_t handle, int i_frame )
-{
-    thread_input_t *h = handle;
-    UNUSED void *stuff;
-    int ret = 0;
-
-    if( h->next_frame >= 0 )
-    {
-        x264_pthread_join( h->tid, &stuff );
-        ret |= h->next_args->status;
-        h->in_progress = 0;
-    }
-
-    if( h->next_frame == i_frame )
-    {
-        XCHG( x264_picture_t, *p_pic, h->pic );
-    }
-    else
-    {
-        ret |= h->p_read_frame( p_pic, h->p_handle, i_frame );
-    }
-
-    if( !h->frame_total || i_frame+1 < h->frame_total )
-    {
-        h->next_frame =
-        h->next_args->i_frame = i_frame+1;
-        h->next_args->pic = &h->pic;
-        x264_pthread_create( &h->tid, NULL, (void*)read_frame_thread_int, h->next_args );
-        h->in_progress = 1;
-    }
-    else
-        h->next_frame = -1;
-
-    return ret;
-}
-
-int close_file_thread( hnd_t handle )
-{
-    thread_input_t *h = handle;
-    if( h->in_progress )
-        x264_pthread_join( h->tid, NULL );
-    h->p_close_infile( h->p_handle );
-    x264_picture_clean( &h->pic );
-    free( h->next_args );
-    free( h );
-    return 0;
-}
-#endif
-
-
-int open_file_bsf( char *psz_filename, hnd_t *p_handle )
-{
-    if ((*p_handle = fopen(psz_filename, "w+b")) == NULL)
-        return -1;
-
-    return 0;
-}
-
-int set_param_bsf( hnd_t handle, x264_param_t *p_param )
-{
-    return 0;
-}
-
-int write_nalu_bsf( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
-    if (fwrite(p_nalu, i_size, 1, (FILE *)handle) > 0)
-        return i_size;
-    return -1;
-}
-
-int set_eop_bsf( hnd_t handle,  x264_picture_t *p_picture )
-{
-    return 0;
-}
-
-int close_file_bsf( hnd_t handle )
-{
-    if ((handle == NULL) || (handle == stdout))
-        return 0;
-
-    return fclose((FILE *)handle);
-}
-
-/* -- mp4 muxing support ------------------------------------------------- */
-#ifdef MP4_OUTPUT
-
-typedef struct
-{
-    GF_ISOFile *p_file;
-    GF_AVCConfig *p_config;
-    GF_ISOSample *p_sample;
-    int i_track;
-    uint32_t i_descidx;
-    int i_time_inc;
-    int i_time_res;
-    int i_numframe;
-    int i_init_delay;
-    uint8_t b_sps;
-    uint8_t b_pps;
-} mp4_t;
-
-
-static void recompute_bitrate_mp4(GF_ISOFile *p_file, int i_track)
-{
-    u32 i, count, di, timescale, time_wnd, rate;
-    u64 offset;
-    Double br;
-    GF_ESD *esd;
-
-    esd = gf_isom_get_esd(p_file, i_track, 1);
-    if (!esd) return;
-
-    esd->decoderConfig->avgBitrate = 0;
-    esd->decoderConfig->maxBitrate = 0;
-    rate = time_wnd = 0;
-
-    timescale = gf_isom_get_media_timescale(p_file, i_track);
-    count = gf_isom_get_sample_count(p_file, i_track);
-    for (i=0; i<count; i++) {
-        GF_ISOSample *samp = gf_isom_get_sample_info(p_file, i_track, i+1, &di, &offset);
-
-        if (samp->dataLength>esd->decoderConfig->bufferSizeDB) esd->decoderConfig->bufferSizeDB = samp->dataLength;
-
-        if (esd->decoderConfig->bufferSizeDB < samp->dataLength) esd->decoderConfig->bufferSizeDB = samp->dataLength;
-        esd->decoderConfig->avgBitrate += samp->dataLength;
-        rate += samp->dataLength;
-        if (samp->DTS > time_wnd + timescale) {
-            if (rate > esd->decoderConfig->maxBitrate) esd->decoderConfig->maxBitrate = rate;
-            time_wnd = samp->DTS;
-            rate = 0;
-        }
-
-        gf_isom_sample_del(&samp);
-    }
-
-    br = (Double) (s64) gf_isom_get_media_duration(p_file, i_track);
-    br /= timescale;
-    esd->decoderConfig->avgBitrate = (u32) (esd->decoderConfig->avgBitrate / br);
-    /*move to bps*/
-    esd->decoderConfig->avgBitrate *= 8;
-    esd->decoderConfig->maxBitrate *= 8;
-
-    gf_isom_change_mpeg4_description(p_file, i_track, 1, esd);
-    gf_odf_desc_del((GF_Descriptor *) esd);
-}
-
-
-int close_file_mp4( hnd_t handle )
-{
-    mp4_t *p_mp4 = (mp4_t *)handle;
-
-    if (p_mp4 == NULL)
-        return 0;
-
-    if (p_mp4->p_config)
-        gf_odf_avc_cfg_del(p_mp4->p_config);
-
-    if (p_mp4->p_sample)
-    {
-        if (p_mp4->p_sample->data)
-            free(p_mp4->p_sample->data);
-
-        gf_isom_sample_del(&p_mp4->p_sample);
-    }
-
-    if (p_mp4->p_file)
-    {
-        recompute_bitrate_mp4(p_mp4->p_file, p_mp4->i_track);
-        gf_isom_set_pl_indication(p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15);
-        gf_isom_set_storage_mode(p_mp4->p_file, GF_ISOM_STORE_FLAT);
-        gf_isom_close(p_mp4->p_file);
-    }
-
-    free(p_mp4);
-
-    return 0;
-}
-
-int open_file_mp4( char *psz_filename, hnd_t *p_handle )
-{
-    mp4_t *p_mp4;
-
-    *p_handle = NULL;
-
-    if ((p_mp4 = (mp4_t *)malloc(sizeof(mp4_t))) == NULL)
-        return -1;
-
-    memset(p_mp4, 0, sizeof(mp4_t));
-    p_mp4->p_file = gf_isom_open(psz_filename, GF_ISOM_OPEN_WRITE, NULL);
-
-    if ((p_mp4->p_sample = gf_isom_sample_new()) == NULL)
-    {
-        close_file_mp4( p_mp4 );
-        return -1;
-    }
-
-    gf_isom_set_brand_info(p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0);
-
-    *p_handle = p_mp4;
-
-    return 0;
-}
-
-
-int set_param_mp4( hnd_t handle, x264_param_t *p_param )
-{
-    mp4_t *p_mp4 = (mp4_t *)handle;
-
-    p_mp4->i_track = gf_isom_new_track(p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
-        p_param->i_fps_num);
-
-    p_mp4->p_config = gf_odf_avc_cfg_new();
-    gf_isom_avc_config_new(p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
-        NULL, NULL, &p_mp4->i_descidx);
-
-    gf_isom_set_track_enabled(p_mp4->p_file, p_mp4->i_track, 1);
-
-    gf_isom_set_visual_info(p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
-        p_param->i_width, p_param->i_height);
-
-    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
-    {
-        uint64_t dw = p_param->i_width << 16;
-        uint64_t dh = p_param->i_height << 16;
-        double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
-        if( sar > 1.0 )
-            dw *= sar ;
-        else
-            dh /= sar;
-        gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
-    }
-
-    p_mp4->p_sample->data = (char *)malloc(p_param->i_width * p_param->i_height * 3 / 2);
-    if (p_mp4->p_sample->data == NULL)
-        return -1;
-
-    p_mp4->i_time_res = p_param->i_fps_num;
-    p_mp4->i_time_inc = p_param->i_fps_den;
-    p_mp4->i_init_delay = p_param->i_bframe ? (p_param->b_bframe_pyramid ? 2 : 1) : 0;
-    p_mp4->i_init_delay *= p_mp4->i_time_inc;
-    fprintf(stderr, "mp4 [info]: initial delay %d (scale %d)\n",
-        p_mp4->i_init_delay, p_mp4->i_time_res);
-
-    return 0;
-}
-
-
-int write_nalu_mp4( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
-    mp4_t *p_mp4 = (mp4_t *)handle;
-    GF_AVCConfigSlot *p_slot;
-    uint8_t type = p_nalu[4] & 0x1f;
-    int psize;
-
-    switch(type)
-    {
-    // sps
-    case 0x07:
-        if (!p_mp4->b_sps)
-        {
-            p_mp4->p_config->configurationVersion = 1;
-            p_mp4->p_config->AVCProfileIndication = p_nalu[5];
-            p_mp4->p_config->profile_compatibility = p_nalu[6];
-            p_mp4->p_config->AVCLevelIndication = p_nalu[7];
-            p_slot = (GF_AVCConfigSlot *)malloc(sizeof(GF_AVCConfigSlot));
-            p_slot->size = i_size - 4;
-            p_slot->data = (char *)malloc(p_slot->size);
-            memcpy(p_slot->data, p_nalu + 4, i_size - 4);
-            gf_list_add(p_mp4->p_config->sequenceParameterSets, p_slot);
-            p_slot = NULL;
-            p_mp4->b_sps = 1;
-        }
-        break;
-
-    // pps
-    case 0x08:
-        if (!p_mp4->b_pps)
-        {
-            p_slot = (GF_AVCConfigSlot *)malloc(sizeof(GF_AVCConfigSlot));
-            p_slot->size = i_size - 4;
-            p_slot->data = (char *)malloc(p_slot->size);
-            memcpy(p_slot->data, p_nalu + 4, i_size - 4);
-            gf_list_add(p_mp4->p_config->pictureParameterSets, p_slot);
-            p_slot = NULL;
-            p_mp4->b_pps = 1;
-            if (p_mp4->b_sps)
-                gf_isom_avc_config_update(p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config);
-        }
-        break;
-
-    // slice, sei
-    case 0x1:
-    case 0x5:
-    case 0x6:
-        psize = i_size - 4 ;
-        memcpy(p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size);
-        p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 0] = (psize >> 24) & 0xff;
-        p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 1] = (psize >> 16) & 0xff;
-        p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 2] = (psize >> 8) & 0xff;
-        p_mp4->p_sample->data[p_mp4->p_sample->dataLength + 3] = (psize >> 0) & 0xff;
-        p_mp4->p_sample->dataLength += i_size;
-        break;
-    }
-
-    return i_size;
-}
-
-int set_eop_mp4( hnd_t handle, x264_picture_t *p_picture )
-{
-    mp4_t *p_mp4 = (mp4_t *)handle;
-    uint64_t dts = (uint64_t)p_mp4->i_numframe * p_mp4->i_time_inc;
-    uint64_t pts = (uint64_t)p_picture->i_pts;
-    int32_t offset = p_mp4->i_init_delay + pts - dts;
-
-    p_mp4->p_sample->IsRAP = p_picture->i_type == X264_TYPE_IDR ? 1 : 0;
-    p_mp4->p_sample->DTS = dts;
-    p_mp4->p_sample->CTS_Offset = offset;
-    gf_isom_add_sample(p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample);
-
-    p_mp4->p_sample->dataLength = 0;
-    p_mp4->i_numframe++;
-
-    return 0;
-}
-
-#endif
-
-
-/* -- mkv muxing support ------------------------------------------------- */
-typedef struct
-{
-    mk_Writer *w;
-
-    uint8_t   *sps, *pps;
-    int       sps_len, pps_len;
-
-    int       width, height, d_width, d_height;
-
-    int64_t   frame_duration;
-    int       fps_num;
-
-    int       b_header_written;
-    char      b_writing_frame;
-} mkv_t;
-
-static int write_header_mkv( mkv_t *p_mkv )
-{
-    int       ret;
-    uint8_t   *avcC;
-    int  avcC_len;
-
-    if( p_mkv->sps == NULL || p_mkv->pps == NULL ||
-        p_mkv->width == 0 || p_mkv->height == 0 ||
-        p_mkv->d_width == 0 || p_mkv->d_height == 0)
-        return -1;
-
-    avcC_len = 5 + 1 + 2 + p_mkv->sps_len + 1 + 2 + p_mkv->pps_len;
-    avcC = malloc(avcC_len);
-    if (avcC == NULL)
-        return -1;
-
-    avcC[0] = 1;
-    avcC[1] = p_mkv->sps[1];
-    avcC[2] = p_mkv->sps[2];
-    avcC[3] = p_mkv->sps[3];
-    avcC[4] = 0xff; // nalu size length is four bytes
-    avcC[5] = 0xe1; // one sps
-
-    avcC[6] = p_mkv->sps_len >> 8;
-    avcC[7] = p_mkv->sps_len;
-
-    memcpy(avcC+8, p_mkv->sps, p_mkv->sps_len);
-
-    avcC[8+p_mkv->sps_len] = 1; // one pps
-    avcC[9+p_mkv->sps_len] = p_mkv->pps_len >> 8;
-    avcC[10+p_mkv->sps_len] = p_mkv->pps_len;
-
-    memcpy( avcC+11+p_mkv->sps_len, p_mkv->pps, p_mkv->pps_len );
-
-    ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
-                          avcC, avcC_len, p_mkv->frame_duration, 50000,
-                          p_mkv->width, p_mkv->height,
-                          p_mkv->d_width, p_mkv->d_height );
-
-    free( avcC );
-
-    p_mkv->b_header_written = 1;
-
-    return ret;
-}
-
-int open_file_mkv( char *psz_filename, hnd_t *p_handle )
-{
-    mkv_t *p_mkv;
-
-    *p_handle = NULL;
-
-    p_mkv  = malloc(sizeof(*p_mkv));
-    if (p_mkv == NULL)
-        return -1;
-
-    memset(p_mkv, 0, sizeof(*p_mkv));
-
-    p_mkv->w = mk_createWriter(psz_filename);
-    if (p_mkv->w == NULL)
-    {
-        free(p_mkv);
-        return -1;
-    }
-
-    *p_handle = p_mkv;
-
-    return 0;
-}
-
-int set_param_mkv( hnd_t handle, x264_param_t *p_param )
-{
-    mkv_t   *p_mkv = handle;
-    int64_t dw, dh;
-
-    if( p_param->i_fps_num > 0 )
-    {
-        p_mkv->frame_duration = (int64_t)p_param->i_fps_den *
-                                (int64_t)1000000000 / p_param->i_fps_num;
-        p_mkv->fps_num = p_param->i_fps_num;
-    }
-    else
-    {
-        p_mkv->frame_duration = 0;
-        p_mkv->fps_num = 1;
-    }
-
-    p_mkv->width = p_param->i_width;
-    p_mkv->height = p_param->i_height;
-
-    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
-    {
-        dw = (int64_t)p_param->i_width  * p_param->vui.i_sar_width;
-        dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height;
-    }
-    else
-    {
-        dw = p_param->i_width;
-        dh = p_param->i_height;
-    }
-
-    if( dw > 0 && dh > 0 )
-    {
-        int64_t x = gcd( dw, dh );
-        dw /= x;
-        dh /= x;
-    }
-
-    p_mkv->d_width = (int)dw;
-    p_mkv->d_height = (int)dh;
-
-    return 0;
-}
-
-int write_nalu_mkv( hnd_t handle, uint8_t *p_nalu, int i_size )
-{
-    mkv_t *p_mkv = handle;
-    uint8_t type = p_nalu[4] & 0x1f;
-    uint8_t dsize[4];
-    int psize;
-
-    switch( type )
-    {
-    // sps
-    case 0x07:
-        if( !p_mkv->sps )
-        {
-            p_mkv->sps = malloc(i_size - 4);
-            if (p_mkv->sps == NULL)
-                return -1;
-            p_mkv->sps_len = i_size - 4;
-            memcpy(p_mkv->sps, p_nalu + 4, i_size - 4);
-        }
-        break;
-
-    // pps
-    case 0x08:
-        if( !p_mkv->pps )
-        {
-            p_mkv->pps = malloc(i_size - 4);
-            if (p_mkv->pps == NULL)
-                return -1;
-            p_mkv->pps_len = i_size - 4;
-            memcpy(p_mkv->pps, p_nalu + 4, i_size - 4);
-        }
-        break;
-
-    // slice, sei
-    case 0x1:
-    case 0x5:
-    case 0x6:
-        if( !p_mkv->b_writing_frame )
-        {
-            if( mk_startFrame(p_mkv->w) < 0 )
-                return -1;
-            p_mkv->b_writing_frame = 1;
-        }
-        psize = i_size - 4 ;
-        dsize[0] = psize >> 24;
-        dsize[1] = psize >> 16;
-        dsize[2] = psize >> 8;
-        dsize[3] = psize;
-        if( mk_addFrameData(p_mkv->w, dsize, 4) < 0 ||
-            mk_addFrameData(p_mkv->w, p_nalu + 4, i_size - 4) < 0 )
-            return -1;
-        break;
-
-    default:
-        break;
-    }
-
-    if( !p_mkv->b_header_written && p_mkv->pps && p_mkv->sps &&
-        write_header_mkv(p_mkv) < 0 )
-        return -1;
-
-    return i_size;
-}
-
-int set_eop_mkv( hnd_t handle, x264_picture_t *p_picture )
-{
-    mkv_t *p_mkv = handle;
-    int64_t i_stamp = (int64_t)(p_picture->i_pts * 1e9 / p_mkv->fps_num);
-
-    p_mkv->b_writing_frame = 0;
-
-    return mk_setFrameFlags( p_mkv->w, i_stamp,
-                             p_picture->i_type == X264_TYPE_IDR );
-}
-
-int close_file_mkv( hnd_t handle )
-{
-    mkv_t *p_mkv = handle;
-    int   ret;
-
-    if( p_mkv->sps )
-        free( p_mkv->sps );
-    if( p_mkv->pps )
-        free( p_mkv->pps );
-
-    ret = mk_close(p_mkv->w);
-
-    free( p_mkv );
-
-    return ret;
-}
-
diff --git a/muxers.h b/muxers.h
index aaede1c..041dbbc 100644
--- a/muxers.h
+++ b/muxers.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
- * muxers.c: h264 file i/o plugins
+ * muxers.h: h264 file i/o modules
  *****************************************************************************
- * Copyright (C) 2003-2008 x264 project
+ * Copyright (C) 2003-2009 x264 project
  *
  * Authors: Laurent Aimar <fenrir at via.ecp.fr>
  *          Loren Merritt <lorenm at u.washington.edu>
@@ -24,49 +24,33 @@
 #ifndef X264_MUXERS_H
 #define X264_MUXERS_H
 
-typedef void *hnd_t;
-
-int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_yuv( hnd_t handle );
-int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_yuv( hnd_t handle );
-
-int open_file_y4m( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_y4m( hnd_t handle );
-int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_y4m( hnd_t handle );
-
-int open_file_avis( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_avis( hnd_t handle );
-int read_frame_avis( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_avis( hnd_t handle );
+#include "common/common.h"
+#include "x264.h"
 
-int open_file_thread( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int get_frame_total_thread( hnd_t handle );
-int read_frame_thread( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int close_file_thread( hnd_t handle );
-
-int open_file_bsf( char *psz_filename, hnd_t *p_handle );
-int set_param_bsf( hnd_t handle, x264_param_t *p_param );
-int write_nalu_bsf( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_bsf( hnd_t handle,  x264_picture_t *p_picture );
-int close_file_bsf( hnd_t handle );
-
-int open_file_mp4( char *psz_filename, hnd_t *p_handle );
-int set_param_mp4( hnd_t handle, x264_param_t *p_param );
-int write_nalu_mp4( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_mp4( hnd_t handle, x264_picture_t *p_picture );
-int close_file_mp4( hnd_t handle );
-
-int open_file_mkv( char *psz_filename, hnd_t *p_handle );
-int set_param_mkv( hnd_t handle, x264_param_t *p_param );
-int write_nalu_mkv( hnd_t handle, uint8_t *p_nal, int i_size );
-int set_eop_mkv( hnd_t handle, x264_picture_t *p_picture );
-int close_file_mkv( hnd_t handle );
+typedef void *hnd_t;
 
-extern int (*p_open_infile)( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-extern int (*p_get_frame_total)( hnd_t handle );
-extern int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-extern int (*p_close_infile)( hnd_t handle );
+static inline int64_t gcd( int64_t a, int64_t b )
+{
+    while( 1 )
+    {
+        int64_t c = a % b;
+        if( !c )
+            return b;
+        a = b;
+        b = c;
+    }
+}
+
+static inline char *get_filename_extension( char *filename )
+{
+    char *ext = filename + strlen( filename );
+    while( *ext != '.' && ext > filename )
+        ext--;
+    ext += *ext == '.';
+    return ext;
+}
+
+#include "input/input.h"
+#include "output/output.h"
 
 #endif
diff --git a/output/flv.c b/output/flv.c
new file mode 100644
index 0000000..b3e5d16
--- /dev/null
+++ b/output/flv.c
@@ -0,0 +1,308 @@
+/*****************************************************************************
+ * flv.c:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya <kieran at kunhya.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "flv_bytestream.h"
+
+/* Make the enclosing function return -1 when expression x evaluates to a
+ * negative value.  Wrapped in do/while(0) so it behaves as one statement. */
+#define CHECK(x)\
+do {\
+    if( (x) < 0 )\
+        return -1;\
+} while( 0 )
+
+/* Per-file state of the FLV muxer; allocated by open_file() and released by
+ * close_file(). */
+typedef struct
+{
+    flv_buffer *c;            // buffered writer returned by flv_create_writer()
+
+    uint8_t *sei;             // copy of the SEI NAL saved by write_headers(); emitted and freed with the first frame
+    int sei_len;              // size of sei in bytes
+
+    int64_t i_fps_num;        // copied from x264_param_t in set_param()
+    int64_t i_fps_den;        // copied from x264_param_t in set_param()
+    int64_t i_framenum;       // number of frames written so far (incremented by write_frame())
+
+    /* File offsets of the metadata doubles that close_file() patches.
+     * i_framerate_pos is only set (non-zero) for VFR input. */
+    uint64_t i_framerate_pos;
+    uint64_t i_duration_pos;
+    uint64_t i_filesize_pos;
+    uint64_t i_bitrate_pos;
+
+    uint8_t b_write_length;   // NOTE(review): not referenced by any function in this file
+    int64_t i_prev_dts;       // DTS of the previous frame, in timebase units
+    int64_t i_prev_pts;       // PTS of the previous frame, in timebase units
+
+    int i_timebase_num;       // copied from x264_param_t in set_param()
+    int i_timebase_den;       // copied from x264_param_t in set_param()
+    int b_vfr_input;          // copied from x264_param_t in set_param()
+
+    unsigned start;           // buffer offset just past the header of the tag being built
+} flv_hnd_t;
+
+/* Queue the FLV file header plus the initial PreviousTagSize0 field and flush
+ * them to the output.  Returns the result of flv_flush_data(). */
+static int write_header( flv_buffer *c )
+{
+    uint8_t header[] =
+    {
+        'F', 'L', 'V',  // Signature
+        1,              // Version
+        1,              // Video Only
+        0, 0, 0, 9,     // DataOffset
+        0, 0, 0, 0,     // PreviousTagSize0
+    };
+
+    flv_append_data( c, header, sizeof(header) );
+
+    return flv_flush_data( c );
+}
+
+/* Create the FLV muxer state for psz_filename and write the file header.
+ * On success stores the new handle in *p_handle and returns 0.
+ * On failure returns -1, leaves *p_handle NULL and releases everything that
+ * was allocated along the way (handle, writer, buffer and output stream). */
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+    flv_hnd_t *p_flv;
+
+    *p_handle = NULL;
+
+    p_flv = malloc( sizeof(*p_flv) );
+    if( !p_flv )
+        return -1;
+    memset( p_flv, 0, sizeof(*p_flv) );
+
+    p_flv->c = flv_create_writer( psz_filename );
+    if( !p_flv->c )
+    {
+        free( p_flv );
+        return -1;
+    }
+
+    if( write_header( p_flv->c ) < 0 )
+    {
+        /* undo flv_create_writer(): close the stream and drop the buffer */
+        fclose( p_flv->c->fp );
+        free( p_flv->c->data );
+        free( p_flv->c );
+        free( p_flv );
+        return -1;
+    }
+
+    *p_handle = p_flv;
+
+    return 0;
+}
+
+/* Queue the "onMetaData" script-data tag describing the stream and remember
+ * the encoder parameters needed later.
+ * Values that are only known at the end of the encode (duration, filesize,
+ * videodatarate and, for VFR input, framerate) are written as 0 and their
+ * file offsets are recorded so that close_file() can patch them.
+ * Nothing is flushed here; the tag stays in the buffer.  Always returns 0. */
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+    flv_hnd_t *p_flv = handle;
+    flv_buffer *c = p_flv->c;
+
+    x264_put_byte( c, FLV_TAG_TYPE_META ); // Tag Type "script data"
+
+    // buffer offset of the data-length field, patched once the tag is complete
+    int start = c->d_cur;
+    x264_put_be24( c, 0 ); // data length
+    x264_put_be24( c, 0 ); // timestamp
+    x264_put_be32( c, 0 ); // reserved
+
+    x264_put_byte( c, AMF_DATA_TYPE_STRING );
+    x264_put_amf_string( c, "onMetaData" );
+
+    x264_put_byte( c, AMF_DATA_TYPE_MIXEDARRAY );
+    x264_put_be32( c, 7 ); // number of key/value pairs written below
+
+    x264_put_amf_string( c, "width" );
+    x264_put_amf_double( c, p_param->i_width );
+
+    x264_put_amf_string( c, "height" );
+    x264_put_amf_double( c, p_param->i_height );
+
+    x264_put_amf_string( c, "framerate" );
+
+    if( !p_param->b_vfr_input )
+        x264_put_amf_double( c, (double)p_param->i_fps_num / p_param->i_fps_den );
+    else
+    {
+        // d_cur + d_total is the absolute output offset; +1 skips the AMF
+        // type byte that x264_put_amf_double() writes before the 8 value bytes
+        p_flv->i_framerate_pos = c->d_cur + c->d_total + 1;
+        x264_put_amf_double( c, 0 ); // written at end of encoding
+    }
+
+    x264_put_amf_string( c, "videocodecid" );
+    x264_put_amf_double( c, FLV_CODECID_H264 );
+
+    x264_put_amf_string( c, "duration" );
+    p_flv->i_duration_pos = c->d_cur + c->d_total + 1;
+    x264_put_amf_double( c, 0 ); // written at end of encoding
+
+    x264_put_amf_string( c, "filesize" );
+    p_flv->i_filesize_pos = c->d_cur + c->d_total + 1;
+    x264_put_amf_double( c, 0 ); // written at end of encoding
+
+    x264_put_amf_string( c, "videodatarate" );
+    p_flv->i_bitrate_pos = c->d_cur + c->d_total + 1;
+    x264_put_amf_double( c, 0 ); // written at end of encoding
+
+    x264_put_amf_string( c, "" );
+    x264_put_byte( c, AMF_END_OF_OBJECT );
+
+    // length counts from the data-length field onwards; the 10 bytes of
+    // length/timestamp/reserved fields are not part of the tag payload
+    unsigned length = c->d_cur - start;
+    rewrite_amf_be24( c, length - 10, start );
+
+    // +1 accounts for the tag type byte written before start
+    x264_put_be32( c, length + 1 ); // tag length
+
+    p_flv->i_fps_num = p_param->i_fps_num;
+    p_flv->i_fps_den = p_param->i_fps_den;
+    p_flv->i_timebase_num = p_param->i_timebase_num;
+    p_flv->i_timebase_den = p_param->i_timebase_den;
+    p_flv->b_vfr_input = p_param->b_vfr_input;
+
+    return 0;
+}
+
+/* Write the AVC sequence-header video tag built from the SPS and PPS, and
+ * keep a private copy of the SEI for write_frame() to emit later.
+ * p_nal[0], p_nal[1] and p_nal[2] are read as SEI, SPS and PPS respectively.
+ * Returns the summed payload sizes of the three NALs, or -1 when the SEI copy
+ * cannot be allocated or the flush fails. */
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+    flv_hnd_t *p_flv = handle;
+    flv_buffer *c = p_flv->c;
+
+    int sei_size = p_nal[0].i_payload;
+    int sps_size = p_nal[1].i_payload;
+    int pps_size = p_nal[2].i_payload;
+
+    // SEI
+    /* It is within the spec to write this as-is but for
+     * mplayer/ffmpeg playback this is deferred until before the first frame */
+
+    p_flv->sei = malloc( sei_size );
+    if( !p_flv->sei )
+        return -1;
+    p_flv->sei_len = sei_size;
+
+    memcpy( p_flv->sei, p_nal[0].p_payload, sei_size );
+
+    // SPS
+    // skips the first 4 payload bytes (presumably the NAL size/start-code
+    // prefix - TODO confirm against the encoder's output format)
+    uint8_t *sps = p_nal[1].p_payload + 4;
+
+    x264_put_byte( c, FLV_TAG_TYPE_VIDEO );
+    x264_put_be24( c, 0 ); // rewrite later
+    x264_put_be24( c, 0 ); // timestamp
+    x264_put_byte( c, 0 ); // timestamp extended
+    x264_put_be24( c, 0 ); // StreamID - Always 0
+    p_flv->start = c->d_cur; // needed for overwriting length
+
+    x264_put_byte( c, 7 | FLV_FRAME_KEY ); // Frametype and CodecID
+    x264_put_byte( c, 0 ); // AVC sequence header
+    x264_put_be24( c, 0 ); // composition time
+
+    x264_put_byte( c, 1 );      // version
+    x264_put_byte( c, sps[1] ); // profile
+    x264_put_byte( c, sps[2] ); // profile compatibility (constraint flags)
+    x264_put_byte( c, sps[3] ); // level
+    x264_put_byte( c, 0xff );   // 6 bits reserved (111111) + 2 bits nal size length - 1 (11)
+    x264_put_byte( c, 0xe1 );   // 3 bits reserved (111) + 5 bits number of sps (00001)
+
+    x264_put_be16( c, sps_size - 4 );
+    flv_append_data( c, sps, sps_size - 4 );
+
+    // PPS
+    x264_put_byte( c, 1 ); // number of pps
+    x264_put_be16( c, pps_size - 4 );
+    flv_append_data( c, p_nal[2].p_payload + 4, pps_size - 4 );
+
+    // rewrite data length info
+    // the 3-byte data-size field sits 10 bytes before start (right after the
+    // tag type byte of the 11-byte tag header written above)
+    unsigned length = c->d_cur - p_flv->start;
+    rewrite_amf_be24( c, length, p_flv->start - 10 );
+    x264_put_be32( c, length + 11 ); // Last tag size
+    CHECK( flv_flush_data( c ) );
+
+    return sei_size + sps_size + pps_size;
+}
+
+/* Write one video tag holding the i_size bytes at p_nalu and flush it.
+ * DTS and PTS are converted from timebase units to milliseconds (rounded to
+ * nearest); the tag carries the DTS plus the PTS-DTS offset as composition
+ * time.  A warning is printed to stderr when rounding makes two consecutive
+ * frames share a timestamp.  The SEI saved by write_headers() is prepended to
+ * the first frame and then freed.
+ * Returns i_size, or -1 if the flush fails. */
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+    flv_hnd_t *p_flv = handle;
+    flv_buffer *c = p_flv->c;
+
+    // timebase units -> milliseconds, +0.5 to round to nearest
+    int64_t dts = (int64_t)( (p_picture->i_dts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+    int64_t cts = (int64_t)( (p_picture->i_pts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+    int64_t offset = cts - dts;
+
+    // from the second frame on, detect timestamps that collapsed onto the
+    // previous frame's value after rounding to milliseconds
+    if( p_flv->i_framenum )
+    {
+        int64_t prev_dts = (int64_t)( (p_flv->i_prev_dts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+        int64_t prev_cts = (int64_t)( (p_flv->i_prev_pts * 1000 * ((double)p_flv->i_timebase_num / p_flv->i_timebase_den)) + 0.5 );
+        if( prev_dts == dts )
+        {
+            double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_dts - p_flv->i_prev_dts);
+            fprintf( stderr, "flv [warning]: duplicate DTS %"PRId64" generated by rounding\n"
+                             "               current internal decoding framerate: %.6f fps\n", dts, fps );
+        }
+        if( prev_cts == cts )
+        {
+            double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_pts - p_flv->i_prev_pts);
+            fprintf( stderr, "flv [warning]: duplicate CTS %"PRId64" generated by rounding\n"
+                             "               current internal composition framerate: %.6f fps\n", cts, fps );
+        }
+    }
+    p_flv->i_prev_dts = p_picture->i_dts;
+    p_flv->i_prev_pts = p_picture->i_pts;
+
+    // A new frame - write packet header
+    x264_put_byte( c, FLV_TAG_TYPE_VIDEO );
+    x264_put_be24( c, 0 ); // calculated later
+    x264_put_be24( c, dts );       // low 24 bits of the timestamp
+    x264_put_byte( c, dts >> 24 ); // next 8 bits of the timestamp
+    x264_put_be24( c, 0 );
+
+    p_flv->start = c->d_cur;
+    x264_put_byte( c, p_picture->b_keyframe ? FLV_FRAME_KEY : FLV_FRAME_INTER );
+    x264_put_byte( c, 1 ); // AVC NALU
+    x264_put_be24( c, offset );
+
+    // the SEI held back by write_headers() goes out in front of the first frame
+    if( p_flv->sei )
+    {
+        flv_append_data( c, p_flv->sei, p_flv->sei_len );
+        free( p_flv->sei );
+        p_flv->sei = NULL;
+    }
+    flv_append_data( c, p_nalu, i_size );
+
+    // patch the data-size field, located 10 bytes before start in the tag header
+    unsigned length = c->d_cur - p_flv->start;
+    rewrite_amf_be24( c, length, p_flv->start - 10 );
+    x264_put_be32( c, 11 + length ); // Last tag size
+    CHECK( flv_flush_data( c ) );
+
+    p_flv->i_framenum++;
+
+    return i_size;
+}
+
+/* Seek to byte offset position in fp and overwrite 8 bytes with the bit
+ * pattern of value after it has been passed through endian_fix64()
+ * (presumably a host-to-big-endian swap - confirm against its definition).
+ * Seek and write results are not checked. */
+static void rewrite_amf_double( FILE *fp, uint64_t position, double value )
+{
+    uint64_t bits = dbl2int( value );
+    uint64_t encoded = endian_fix64( bits );
+
+    fseek( fp, position, SEEK_SET );
+    fwrite( &encoded, sizeof(encoded), 1, fp );
+}
+
+/* Finish the FLV file: flush pending data, patch the metadata placeholders
+ * recorded by set_param() (regular files only), then close the stream and
+ * free every resource owned by the handle.
+ * The total duration is estimated as largest_pts plus the distance between
+ * the two largest PTS values, converted from timebase units to seconds.
+ * Cleanup always runs, even when the final flush fails.
+ * Returns 0 on success, -1 if the final flush failed. */
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+    flv_hnd_t *p_flv = handle;
+    flv_buffer *c = p_flv->c;
+    int ret = flv_flush_data( c ) < 0 ? -1 : 0;
+
+    /* only patch the header when the stream is intact and seekable */
+    if( !ret && x264_is_regular_file( c->fp ) )
+    {
+        double total_duration = (double)(2 * largest_pts - second_largest_pts) * p_flv->i_timebase_num / p_flv->i_timebase_den;
+        uint64_t filesize = ftell( c->fp );
+        /* rates are meaningless without a positive duration: store 0 rather than inf/NaN */
+        int b_has_duration = total_duration > 0;
+
+        if( p_flv->i_framerate_pos )
+        {
+            double framerate = b_has_duration ? (double)p_flv->i_framenum / total_duration : 0;
+            rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate );
+        }
+
+        rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration );
+        rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize );
+        rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, b_has_duration ? filesize * 8 / ( total_duration * 1000 ) : 0 );
+    }
+
+    fclose( c->fp );
+    free( p_flv->sei ); /* still set if write_headers() ran but no frame followed */
+    free( c->data );
+    free( c );
+    free( p_flv );
+
+    return ret;
+}
+
+cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/flv_bytestream.c b/output/flv_bytestream.c
new file mode 100644
index 0000000..316114c
--- /dev/null
+++ b/output/flv_bytestream.c
@@ -0,0 +1,150 @@
+/*****************************************************************************
+ * flv_bytestream.c:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "flv_bytestream.h"
+
+/* Return the raw 64-bit representation of value (no numeric conversion). */
+uint64_t dbl2int( double value )
+{
+    union
+    {
+        double f;
+        uint64_t i;
+    } pun;
+
+    pun.f = value;
+    return pun.i;
+}
+
+/* Put functions  */
+
+/* Append the single byte b to the buffer; the append result is ignored. */
+void x264_put_byte( flv_buffer *c, uint8_t b )
+{
+    uint8_t octet[1] = { b };
+    flv_append_data( c, octet, sizeof(octet) );
+}
+
+/* Append val as 4 bytes, most significant byte first. */
+void x264_put_be32( flv_buffer *c, uint32_t val )
+{
+    int shift;
+    for( shift = 24; shift >= 0; shift -= 8 )
+        x264_put_byte( c, val >> shift );
+}
+
+/* Append val as 8 bytes, most significant byte first. */
+void x264_put_be64( flv_buffer *c, uint64_t val )
+{
+    uint32_t high = val >> 32;
+    uint32_t low  = val;
+
+    x264_put_be32( c, high );
+    x264_put_be32( c, low );
+}
+
+/* Append val as 2 bytes, most significant byte first. */
+void x264_put_be16( flv_buffer *c, uint16_t val )
+{
+    uint8_t high = val >> 8;
+    uint8_t low  = val;
+
+    x264_put_byte( c, high );
+    x264_put_byte( c, low );
+}
+
+/* Append the low 24 bits of val as 3 bytes, most significant byte first. */
+void x264_put_be24( flv_buffer *c, uint32_t val )
+{
+    int shift;
+    for( shift = 16; shift >= 0; shift -= 8 )
+        x264_put_byte( c, val >> shift );
+}
+
+/* Append the characters of the NUL-terminated string tag, without the NUL. */
+void x264_put_tag( flv_buffer *c, const char *tag )
+{
+    const char *p;
+    for( p = tag; *p; p++ )
+        x264_put_byte( c, *p );
+}
+
+/* Append str as a 16-bit big-endian length followed by the string bytes
+ * (no terminator).  The length is stored in a uint16_t, so it is truncated
+ * for strings longer than 65535 bytes. */
+void x264_put_amf_string( flv_buffer *c, const char *str )
+{
+    uint16_t n_bytes = strlen( str );
+
+    x264_put_be16( c, n_bytes );
+    flv_append_data( c, (uint8_t*)str, n_bytes );
+}
+
+/* Append an AMF number: the AMF_DATA_TYPE_NUMBER marker byte followed by the
+ * 8-byte bit pattern of d, most significant byte first. */
+void x264_put_amf_double( flv_buffer *c, double d )
+{
+    uint64_t bits = dbl2int( d );
+
+    x264_put_byte( c, AMF_DATA_TYPE_NUMBER );
+    x264_put_be64( c, bits );
+}
+
+/* flv writing functions */
+
+/* Allocate a zero-initialised flv_buffer and attach its output stream:
+ * stdout when filename is "-", otherwise filename opened in "wb" mode.
+ * Returns NULL if the allocation or the open fails. */
+flv_buffer *flv_create_writer( const char *filename )
+{
+    flv_buffer *writer = calloc( 1, sizeof(*writer) );
+    if( !writer )
+        return NULL;
+
+    writer->fp = strcmp( filename, "-" ) ? fopen( filename, "wb" ) : stdout;
+    if( writer->fp )
+        return writer;
+
+    free( writer );
+    return NULL;
+}
+
+/* Append size bytes from data to the in-memory buffer, growing it as needed.
+ * Capacity grows by doubling (minimum 16 bytes).
+ * Returns 0 on success, or -1 when the new fill level does not fit in an
+ * unsigned or the reallocation fails; the buffer is left untouched then. */
+int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size )
+{
+    unsigned ns = c->d_cur + size;
+
+    /* unsigned wrap-around: the request cannot be represented */
+    if( ns < c->d_cur )
+        return -1;
+
+    if( ns > c->d_max )
+    {
+        void *dp;
+        unsigned dn = c->d_max ? c->d_max : 16;
+        while( ns > dn )
+        {
+            unsigned doubled = dn << 1;
+            /* doubling wrapped past UINT_MAX: settle for the exact size
+             * instead of looping forever on a zero capacity */
+            if( doubled <= dn )
+            {
+                dn = ns;
+                break;
+            }
+            dn = doubled;
+        }
+
+        dp = realloc( c->data, dn );
+        if( !dp )
+            return -1;
+
+        c->data = dp;
+        c->d_max = dn;
+    }
+
+    /* memcpy must not be handed NULL pointers, even for a zero-length copy */
+    if( size )
+        memcpy( c->data + c->d_cur, data, size );
+
+    c->d_cur = ns;
+
+    return 0;
+}
+
+/* Overwrite the 3 bytes at buffer offset start with the low 24 bits of
+ * length, most significant byte first.  The buffer is not grown; the bytes
+ * must already have been appended. */
+void rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start )
+{
+    uint8_t *field = c->data + start;
+
+    field[0] = length >> 16;
+    field[1] = length >> 8;
+    field[2] = length >> 0;
+}
+
+/* Write the buffered bytes to c->fp, add them to d_total and empty the
+ * buffer.  An empty buffer is a no-op.
+ * Returns 0 on success, -1 if fwrite() fails (buffer contents are kept). */
+int flv_flush_data( flv_buffer *c )
+{
+    if( c->d_cur )
+    {
+        if( fwrite( c->data, c->d_cur, 1, c->fp ) != 1 )
+            return -1;
+
+        c->d_total += c->d_cur;
+        c->d_cur = 0;
+    }
+
+    return 0;
+}
diff --git a/output/flv_bytestream.h b/output/flv_bytestream.h
new file mode 100644
index 0000000..00f37fe
--- /dev/null
+++ b/output/flv_bytestream.h
@@ -0,0 +1,135 @@
+/*****************************************************************************
+ * flv_bytestream.h:
+ *****************************************************************************
+ * Copyright (C) 2009 Kieran Kunhya
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_FLV_BYTESTREAM_H
+#define X264_FLV_BYTESTREAM_H
+
+/* offsets for packed values */
+#define FLV_AUDIO_SAMPLESSIZE_OFFSET 1
+#define FLV_AUDIO_SAMPLERATE_OFFSET  2
+#define FLV_AUDIO_CODECID_OFFSET     4
+
+#define FLV_VIDEO_FRAMETYPE_OFFSET   4
+
+/* bitmasks to isolate specific values */
+#define FLV_AUDIO_CHANNEL_MASK    0x01
+#define FLV_AUDIO_SAMPLESIZE_MASK 0x02
+#define FLV_AUDIO_SAMPLERATE_MASK 0x0c
+#define FLV_AUDIO_CODECID_MASK    0xf0
+
+#define FLV_VIDEO_CODECID_MASK    0x0f
+#define FLV_VIDEO_FRAMETYPE_MASK  0xf0
+
+#define AMF_END_OF_OBJECT         0x09
+
+enum
+{
+    FLV_HEADER_FLAG_HASVIDEO = 1,
+    FLV_HEADER_FLAG_HASAUDIO = 4,
+};
+
+enum
+{
+    FLV_TAG_TYPE_AUDIO = 0x08,
+    FLV_TAG_TYPE_VIDEO = 0x09,
+    FLV_TAG_TYPE_META  = 0x12,
+};
+
+enum
+{
+    FLV_MONO   = 0,
+    FLV_STEREO = 1,
+};
+
+enum
+{
+    FLV_SAMPLESSIZE_8BIT  = 0,
+    FLV_SAMPLESSIZE_16BIT = 1 << FLV_AUDIO_SAMPLESSIZE_OFFSET,
+};
+
+enum
+{
+    FLV_SAMPLERATE_SPECIAL = 0, /**< signifies 5512Hz and 8000Hz in the case of NELLYMOSER */
+    FLV_SAMPLERATE_11025HZ = 1 << FLV_AUDIO_SAMPLERATE_OFFSET,
+    FLV_SAMPLERATE_22050HZ = 2 << FLV_AUDIO_SAMPLERATE_OFFSET,
+    FLV_SAMPLERATE_44100HZ = 3 << FLV_AUDIO_SAMPLERATE_OFFSET,
+};
+
+enum
+{
+    FLV_CODECID_MP3 = 2 << FLV_AUDIO_CODECID_OFFSET,
+    FLV_CODECID_AAC = 10<< FLV_AUDIO_CODECID_OFFSET,
+};
+
+enum
+{
+    FLV_CODECID_H264 = 7,
+};
+
+enum
+{
+    FLV_FRAME_KEY   = 1 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
+    FLV_FRAME_INTER = 2 << FLV_VIDEO_FRAMETYPE_OFFSET | 7,
+};
+
+typedef enum
+{
+    AMF_DATA_TYPE_NUMBER      = 0x00,
+    AMF_DATA_TYPE_BOOL        = 0x01,
+    AMF_DATA_TYPE_STRING      = 0x02,
+    AMF_DATA_TYPE_OBJECT      = 0x03,
+    AMF_DATA_TYPE_NULL        = 0x05,
+    AMF_DATA_TYPE_UNDEFINED   = 0x06,
+    AMF_DATA_TYPE_REFERENCE   = 0x07,
+    AMF_DATA_TYPE_MIXEDARRAY  = 0x08,
+    AMF_DATA_TYPE_OBJECT_END  = 0x09,
+    AMF_DATA_TYPE_ARRAY       = 0x0a,
+    AMF_DATA_TYPE_DATE        = 0x0b,
+    AMF_DATA_TYPE_LONG_STRING = 0x0c,
+    AMF_DATA_TYPE_UNSUPPORTED = 0x0d,
+} AMFDataType;
+
+/* Growable in-memory buffer in front of an output stream: data is collected
+ * with flv_append_data() and written out by flv_flush_data(). */
+typedef struct flv_buffer
+{
+    uint8_t *data;    // heap buffer, (re)allocated by flv_append_data()
+    unsigned d_cur;   // number of bytes currently buffered
+    unsigned d_max;   // allocated size of data in bytes
+    FILE *fp;         // output stream (stdout or a file opened by flv_create_writer())
+    uint64_t d_total; // bytes already flushed to fp
+} flv_buffer;
+
+flv_buffer *flv_create_writer( const char *filename );
+int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size );
+int flv_write_byte( flv_buffer *c, uint8_t *byte );
+int flv_flush_data( flv_buffer *c );
+void rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start );
+
+uint64_t dbl2int( double value );
+uint64_t get_amf_double( double value );
+void x264_put_byte( flv_buffer *c, uint8_t b );
+void x264_put_be32( flv_buffer *c, uint32_t val );
+void x264_put_be64( flv_buffer *c, uint64_t val );
+void x264_put_be16( flv_buffer *c, uint16_t val );
+void x264_put_be24( flv_buffer *c, uint32_t val );
+void x264_put_tag( flv_buffer *c, const char *tag );
+void x264_put_amf_string( flv_buffer *c, const char *str );
+void x264_put_amf_double( flv_buffer *c, double d );
+
+#endif
diff --git a/output/matroska.c b/output/matroska.c
new file mode 100644
index 0000000..8e84f52
--- /dev/null
+++ b/output/matroska.c
@@ -0,0 +1,209 @@
+/*****************************************************************************
+ * matroska.c: x264 matroska output module
+ *****************************************************************************
+ * Copyright (C) 2005 Mike Matsnev
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "matroska_ebml.h"
+
+/* Per-file state of the Matroska muxer; allocated by open_file() and freed
+ * by close_file(). */
+typedef struct
+{
+    mk_writer *w;           // EBML writer returned by mk_create_writer()
+
+    // coded size, and display size reduced by their gcd (see set_param())
+    int width, height, d_width, d_height;
+
+    int64_t frame_duration; // fps_den * 1e9 / fps_num for CFR input, 0 otherwise
+
+    char b_writing_frame;   // set while a frame started with mk_start_frame() is still open
+    int i_timebase_num;     // copied from x264_param_t in set_param()
+    int i_timebase_den;     // copied from x264_param_t in set_param()
+
+} mkv_hnd_t;
+
+/* Allocate the Matroska muxer state and create its writer for psz_filename.
+ * Stores the handle in *p_handle and returns 0 on success; on failure
+ * *p_handle is NULL and -1 is returned. */
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+    mkv_hnd_t *state = calloc( 1, sizeof(*state) );
+
+    *p_handle = NULL;
+    if( !state )
+        return -1;
+
+    state->w = mk_create_writer( psz_filename );
+    if( state->w )
+    {
+        *p_handle = state;
+        return 0;
+    }
+
+    free( state );
+    return -1;
+}
+
+/* Record the stream geometry and timing taken from p_param.
+ * - frame_duration: fps_den * 1e9 / fps_num for constant-framerate input
+ *   with a positive fps_num, 0 otherwise.
+ * - display size: coded size scaled by the sample aspect ratio when both SAR
+ *   terms are non-zero, then reduced by the gcd of the two dimensions.
+ * Always returns 0. */
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+    mkv_hnd_t *p_mkv = handle;
+    int b_cfr = p_param->i_fps_num > 0 && !p_param->b_vfr_input;
+    int b_sar = p_param->vui.i_sar_width && p_param->vui.i_sar_height;
+    int64_t disp_w = p_param->i_width;
+    int64_t disp_h = p_param->i_height;
+
+    p_mkv->frame_duration = b_cfr ? (int64_t)p_param->i_fps_den * (int64_t)1000000000 / p_param->i_fps_num
+                                  : 0;
+
+    p_mkv->width  = p_param->i_width;
+    p_mkv->height = p_param->i_height;
+
+    if( b_sar )
+    {
+        disp_w *= p_param->vui.i_sar_width;
+        disp_h *= p_param->vui.i_sar_height;
+    }
+
+    /* reduce to lowest terms; skipped for non-positive sizes */
+    if( disp_w > 0 && disp_h > 0 )
+    {
+        int64_t divisor = gcd( disp_w, disp_h );
+        disp_w /= divisor;
+        disp_h /= divisor;
+    }
+
+    p_mkv->d_width  = (int)disp_w;
+    p_mkv->d_height = (int)disp_h;
+    p_mkv->i_timebase_num = p_param->i_timebase_num;
+    p_mkv->i_timebase_den = p_param->i_timebase_den;
+
+    return 0;
+}
+
+/* Build the avcC codec-private record from the SPS and PPS, write the
+ * Matroska header with it, and queue the SEI as the first data of the first
+ * frame.  p_nal[0], p_nal[1] and p_nal[2] are read as SEI, SPS and PPS; the
+ * first 4 bytes of the SPS and PPS payloads are skipped.
+ * Returns sei_size + sps_size + pps_size (SPS/PPS sizes without those 4
+ * bytes), or -1 on invalid parameters, allocation failure or any writer
+ * error, including a failed mk_writeHeader(). */
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+    mkv_hnd_t *p_mkv = handle;
+
+    int sei_size = p_nal[0].i_payload;
+    int sps_size = p_nal[1].i_payload - 4;
+    int pps_size = p_nal[2].i_payload - 4;
+
+    uint8_t *sei = p_nal[0].p_payload;
+    uint8_t *sps = p_nal[1].p_payload + 4;
+    uint8_t *pps = p_nal[2].p_payload + 4;
+
+    int ret;
+    uint8_t *avcC;
+    int avcC_len;
+
+    if( !p_mkv->width || !p_mkv->height ||
+        !p_mkv->d_width || !p_mkv->d_height )
+        return -1;
+
+    /* sps[1..3] are read below, and a negative size would corrupt the
+     * allocation and copy lengths */
+    if( sps_size < 4 || pps_size < 0 )
+        return -1;
+
+    avcC_len = 5 + 1 + 2 + sps_size + 1 + 2 + pps_size;
+    avcC = malloc( avcC_len );
+    if( !avcC )
+        return -1;
+
+    avcC[0] = 1;      // version
+    avcC[1] = sps[1]; // profile
+    avcC[2] = sps[2]; // profile compatibility
+    avcC[3] = sps[3]; // level
+    avcC[4] = 0xff; // nalu size length is four bytes
+    avcC[5] = 0xe1; // one sps
+
+    avcC[6] = sps_size >> 8;
+    avcC[7] = sps_size;
+
+    memcpy( avcC+8, sps, sps_size );
+
+    avcC[8+sps_size] = 1; // one pps
+    avcC[9+sps_size] = pps_size >> 8;
+    avcC[10+sps_size] = pps_size;
+
+    memcpy( avcC+11+sps_size, pps, pps_size );
+
+    ret = mk_writeHeader( p_mkv->w, "x264", "V_MPEG4/ISO/AVC",
+                          avcC, avcC_len, p_mkv->frame_duration, 50000,
+                          p_mkv->width, p_mkv->height,
+                          p_mkv->d_width, p_mkv->d_height );
+
+    free( avcC );
+
+    /* do not go on writing frame data behind a header that failed */
+    if( ret < 0 )
+        return -1;
+
+    // SEI
+
+    if( !p_mkv->b_writing_frame )
+    {
+        if( mk_start_frame( p_mkv->w ) < 0 )
+            return -1;
+        p_mkv->b_writing_frame = 1;
+    }
+    if( mk_add_frame_data( p_mkv->w, sei, sei_size ) < 0 )
+        return -1;
+
+    return sei_size + sps_size + pps_size;
+}
+
+/* Add the i_size bytes at p_nalu to the current frame (starting one if none
+ * is open), then close the frame by setting its timestamp and keyframe flag.
+ * The timestamp is the PTS converted from timebase units with a factor of
+ * 1e9 and rounded to nearest.
+ * Returns i_size, or -1 if any writer call fails. */
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+    mkv_hnd_t *p_mkv = handle;
+    int64_t timestamp;
+
+    /* a frame may already be open if write_headers() queued the SEI */
+    if( !p_mkv->b_writing_frame && mk_start_frame( p_mkv->w ) < 0 )
+        return -1;
+    p_mkv->b_writing_frame = 1;
+
+    if( mk_add_frame_data( p_mkv->w, p_nalu, i_size ) < 0 )
+        return -1;
+
+    timestamp = (int64_t)((p_picture->i_pts * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);
+
+    p_mkv->b_writing_frame = 0;
+
+    return mk_set_frame_flags( p_mkv->w, timestamp, p_picture->b_keyframe ) < 0 ? -1 : i_size;
+}
+
+/* Close the Matroska writer and free the muxer state.
+ * The distance between the two largest PTS values is passed to mk_close() as
+ * the duration of the last frame.  It is converted with the same 1e9 scale
+ * and floating-point rounding that write_frame() applies to timestamps, so
+ * the two stay in one unit; a zero timebase denominator yields a delta of 0
+ * instead of a division by zero.
+ * Returns the result of mk_close(). */
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+    mkv_hnd_t *p_mkv = handle;
+    int ret;
+    int64_t i_last_delta = 0;
+
+    if( p_mkv->i_timebase_den )
+        i_last_delta = (int64_t)(((largest_pts - second_largest_pts) * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5);
+
+    ret = mk_close( p_mkv->w, i_last_delta );
+
+    free( p_mkv );
+
+    return ret;
+}
+
+cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
new file mode 100644
index 0000000..d1c6e13
--- /dev/null
+++ b/output/matroska_ebml.c
@@ -0,0 +1,562 @@
+/*****************************************************************************
+ * matroska_ebml.c:
+ *****************************************************************************
+ * Copyright (C) 2005 Mike Matsnev
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include "matroska_ebml.h"
+
+#define CLSIZE 1048576
+/* Make the enclosing function return -1 when expression x evaluates to a
+ * negative value.  Wrapped in do/while(0) so it behaves as one statement. */
+#define CHECK(x)\
+do {\
+    if( (x) < 0 )\
+        return -1;\
+} while( 0 )
+
+/* One EBML element under construction: a growable byte buffer plus its place
+ * in the element tree and in the writer's bookkeeping lists. */
+struct mk_context
+{
+    /* next: following node in the writer's active or free list.
+     * prev: address of the pointer that refers to this node (the list head
+     *       or the previous node's next), which allows unlinking in place.
+     * parent: element that receives this element's data; NULL for a context
+     *         whose data is written straight to the file. */
+    struct mk_context *next, **prev, *parent;
+    mk_writer *owner;       // writer this context belongs to
+    unsigned id;            // EBML element ID; 0 means no ID is (or remains to be) written
+
+    void *data;             // buffered element payload
+    unsigned d_cur, d_max;  // bytes used / bytes allocated in data
+};
+
+typedef struct mk_context mk_context;
+
+/* State of one Matroska output file. */
+struct mk_writer
+{
+    FILE *fp;               // output stream: stdout or a file opened by mk_create_writer()
+
+    unsigned duration_ptr;  // NOTE(review): presumably the file offset of the duration field - confirm in mk_writeHeader()/mk_close()
+
+    // root is the parentless context created by mk_create_writer();
+    // cluster and frame are not touched by the functions shown up to here
+    mk_context *root, *cluster, *frame;
+    mk_context *freelist;   // closed contexts kept for reuse by mk_create_context()
+    mk_context *actlist;    // contexts currently in use
+
+    int64_t def_duration;   // default frame duration passed to mk_writeHeader()
+    int64_t timescale;      // 1000000 after creation, replaced by mk_writeHeader()
+    int64_t cluster_tc_scaled;
+    int64_t frame_tc, prev_frame_tc_scaled, max_frame_tc;
+
+    // wrote_header makes a second mk_writeHeader() call fail
+    char wrote_header, in_frame, keyframe;
+};
+
+/* Obtain a context for element id under parent: recycle one from the
+ * writer's free list if available (keeping its buffer), otherwise allocate a
+ * zeroed one.  The context is pushed onto the head of the active list.
+ * Returns NULL if allocation fails. */
+static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id )
+{
+    mk_context *ctx = w->freelist;
+
+    if( ctx )
+        w->freelist = ctx->next;
+    else
+    {
+        ctx = calloc( 1, sizeof(*ctx) );
+        if( !ctx )
+            return NULL;
+    }
+
+    ctx->parent = parent;
+    ctx->owner  = w;
+    ctx->id     = id;
+
+    /* link in front of the current head; the old head's back-pointer now
+     * refers to our next field */
+    ctx->next = w->actlist;
+    ctx->prev = &w->actlist;
+    if( w->actlist )
+        w->actlist->prev = &ctx->next;
+    w->actlist = ctx;
+
+    return ctx;
+}
+
+/* Append size bytes from data to the context's buffer, growing it as needed.
+ * Capacity grows by doubling (minimum 16 bytes).
+ * Returns 0 on success, or -1 when the new fill level does not fit in an
+ * unsigned or the reallocation fails; the context is left untouched then. */
+static int mk_append_context_data( mk_context *c, const void *data, unsigned size )
+{
+    unsigned ns = c->d_cur + size;
+
+    /* unsigned wrap-around: the request cannot be represented */
+    if( ns < c->d_cur )
+        return -1;
+
+    if( ns > c->d_max )
+    {
+        void *dp;
+        unsigned dn = c->d_max ? c->d_max : 16;
+        while( ns > dn )
+        {
+            unsigned doubled = dn << 1;
+            /* doubling wrapped past UINT_MAX: settle for the exact size
+             * instead of looping forever on a zero capacity */
+            if( doubled <= dn )
+            {
+                dn = ns;
+                break;
+            }
+            dn = doubled;
+        }
+
+        dp = realloc( c->data, dn );
+        if( !dp )
+            return -1;
+
+        c->data = dp;
+        c->d_max = dn;
+    }
+
+    /* memcpy must not be handed NULL pointers, even for a zero-length copy */
+    if( size )
+        memcpy( (char*)c->data + c->d_cur, data, size );
+
+    c->d_cur = ns;
+
+    return 0;
+}
+
+/* Append element ID id in big-endian order using as few bytes as possible:
+ * leading zero bytes are dropped, but at least one byte is always written.
+ * Returns the result of mk_append_context_data(). */
+static int mk_write_id( mk_context *c, unsigned id )
+{
+    unsigned char bytes[4] = { id >> 24, id >> 16, id >> 8, id };
+    unsigned skip = 0;
+
+    while( skip < 3 && !bytes[skip] )
+        skip++;
+
+    return mk_append_context_data( c, bytes + skip, 4 - skip );
+}
+
+/* Append size as a variable-length size field of 1 to 5 bytes.
+ * The length is chosen by the thresholds 0x7f / 0x3fff / 0x1fffff /
+ * 0x0fffffff, and a marker bit (0x80, 0x40, 0x20, 0x10 or 0x08) is OR'ed
+ * into the first byte written.
+ * Returns the result of mk_append_context_data(). */
+static int mk_write_size( mk_context *c, unsigned size )
+{
+    unsigned char enc[5] = { 0, size >> 24, size >> 16, size >> 8, size };
+    unsigned n_bytes = size < 0x7f       ? 1
+                     : size < 0x3fff     ? 2
+                     : size < 0x1fffff   ? 3
+                     : size < 0x0fffffff ? 4
+                     :                     5;
+    unsigned first = 5 - n_bytes;
+
+    /* length marker: one set bit whose position encodes the byte count */
+    enc[first] |= 0x80 >> (n_bytes - 1);
+
+    return mk_append_context_data( c, enc + first, n_bytes );
+}
+
+/* If the context still has an ID, write that ID followed by a single 0xff
+ * size byte into the parent and clear the ID so it is not written again.
+ * Returns 0 on success (or when there is no ID), -1 on failure. */
+static int mk_flush_context_id( mk_context *c )
+{
+    static const unsigned char size_byte = 0xff;
+
+    if( c->id )
+    {
+        CHECK( mk_write_id( c->parent, c->id ) );
+        CHECK( mk_append_context_data( c->parent, &size_byte, 1 ) );
+        c->id = 0;
+    }
+
+    return 0;
+}
+
+/* Move the context's buffered bytes one level up: into the parent's buffer,
+ * or to the output file when there is no parent, then mark the buffer empty.
+ * An empty buffer is a no-op.  Returns 0 on success, -1 on failure. */
+static int mk_flush_context_data( mk_context *c )
+{
+    if( c->d_cur )
+    {
+        if( c->parent )
+        {
+            if( mk_append_context_data( c->parent, c->data, c->d_cur ) < 0 )
+                return -1;
+        }
+        else if( fwrite( c->data, c->d_cur, 1, c->owner->fp ) != 1 )
+            return -1;
+
+        c->d_cur = 0;
+    }
+
+    return 0;
+}
+
+/* Finish context c: write its element ID and data size into the parent
+ * (only when c still has a non-zero id), hand its buffered data to the
+ * parent (or to the file for a parentless context), and move c from the
+ * active list to the free list for reuse.
+ * If off is non-NULL and c has a parent, the parent's fill level at the
+ * point where c's data is about to be appended is added to *off.
+ * Returns 0 on success, -1 on failure. */
+static int mk_close_context( mk_context *c, unsigned *off )
+{
+    if( c->id )
+    {
+        CHECK( mk_write_id( c->parent, c->id ) );
+        CHECK( mk_write_size( c->parent, c->d_cur ) );
+    }
+
+    /* taken after the ID/size header has been written, so the offset
+     * accounts for it */
+    if( c->parent && off )
+        *off += c->parent->d_cur;
+
+    CHECK( mk_flush_context_data( c ) );
+
+    /* unlink from the active list: prev is the address of whichever pointer
+     * currently refers to c */
+    if( c->next )
+        c->next->prev = c->prev;
+    *(c->prev) = c->next;
+    /* push onto the free list; the data buffer is kept for reuse */
+    c->next = c->owner->freelist;
+    c->owner->freelist = c;
+
+    return 0;
+}
+
+/* Free every context on the writer's free and active lists, including their
+ * data buffers, and reset the list heads and the root pointer to NULL. */
+static void mk_destroy_contexts( mk_writer *w )
+{
+    mk_context *heads[2] = { w->freelist, w->actlist };
+    int i;
+
+    for( i = 0; i < 2; i++ )
+    {
+        mk_context *node = heads[i];
+        while( node )
+        {
+            mk_context *following = node->next;
+            free( node->data );
+            free( node );
+            node = following;
+        }
+    }
+
+    w->freelist = w->actlist = w->root = NULL;
+}
+
+/* Append a string element: ID, size and the bytes of str without the
+ * terminating NUL.  Returns 0 on success, -1 on failure. */
+static int mk_write_string( mk_context *c, unsigned id, const char *str )
+{
+    size_t n_bytes = strlen( str );
+
+    if( mk_write_id( c, id ) < 0 )
+        return -1;
+    if( mk_write_size( c, n_bytes ) < 0 )
+        return -1;
+    if( mk_append_context_data( c, str, n_bytes ) < 0 )
+        return -1;
+
+    return 0;
+}
+
+/* Append a binary element: ID, size and size bytes copied from data.
+ * Returns 0 on success, -1 on failure. */
+static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned size )
+{
+    if( mk_write_id( c, id ) < 0 )
+        return -1;
+    if( mk_write_size( c, size ) < 0 )
+        return -1;
+    if( mk_append_context_data( c, data, size ) < 0 )
+        return -1;
+
+    return 0;
+}
+
+/* Append an unsigned-integer element: ID, size and the big-endian bytes of
+ * ui with leading zero bytes removed (at least one byte is kept).
+ * Returns 0 on success, -1 on failure. */
+static int mk_write_uint( mk_context *c, unsigned id, int64_t ui )
+{
+    unsigned char be[8];
+    unsigned n, skip = 0;
+
+    for( n = 0; n < 8; n++ )
+        be[n] = ui >> (56 - 8 * n);
+
+    CHECK( mk_write_id( c, id ) );
+
+    while( skip < 7 && !be[skip] )
+        ++skip;
+
+    CHECK( mk_write_size( c, 8 - skip ) );
+    CHECK( mk_append_context_data( c, be + skip, 8 - skip ) );
+    return 0;
+}
+
+/* Append a signed-integer element: ID, size and the big-endian bytes of si,
+ * trimmed to the shortest form that keeps the sign: a leading 0xff (negative)
+ * or 0x00 (non-negative) byte is dropped only while the following byte's top
+ * bit still shows the same sign.
+ * Returns 0 on success, -1 on failure. */
+static int mk_write_sint( mk_context *c, unsigned id, int64_t si )
+{
+    unsigned char be[8];
+    /* byte value, and its top bit, that pure sign extension looks like */
+    unsigned char fill = si < 0 ? 0xff : 0x00;
+    unsigned n, skip = 0;
+
+    for( n = 0; n < 8; n++ )
+        be[n] = si >> (56 - 8 * n);
+
+    CHECK( mk_write_id( c, id ) );
+
+    while( skip < 7 && be[skip] == fill && (be[skip+1] & 0x80) == (fill & 0x80) )
+        ++skip;
+
+    CHECK( mk_write_size( c, 8 - skip ) );
+    CHECK( mk_append_context_data( c, be + skip, 8 - skip ) );
+    return 0;
+}
+
+/* Append the 4 bytes of f's bit pattern, most significant byte first, with
+ * no ID or size in front.  The pattern is read through an unsigned, so this
+ * relies on float and unsigned having the same size.
+ * Returns the result of mk_append_context_data(). */
+static int mk_write_float_raw( mk_context *c, float f )
+{
+    union
+    {
+        float f;
+        unsigned u;
+    } pun;
+    unsigned char be[4];
+    int n;
+
+    pun.f = f;
+    for( n = 0; n < 4; n++ )
+        be[n] = pun.u >> (24 - 8 * n);
+
+    return mk_append_context_data( c, be, 4 );
+}
+
+/* Append a 4-byte float element: ID, a size of 4 and the raw float bytes.
+ * Returns 0 on success, -1 on failure. */
+static int mk_write_float( mk_context *c, unsigned id, float f )
+{
+    if( mk_write_id( c, id ) < 0 )
+        return -1;
+    if( mk_write_size( c, 4 ) < 0 )
+        return -1;
+    if( mk_write_float_raw( c, f ) < 0 )
+        return -1;
+
+    return 0;
+}
+
+/* Number of bytes (1 to 5) the variable-length size field for s occupies,
+ * using the thresholds 0x7f, 0x3fff, 0x1fffff and 0x0fffffff. */
+static unsigned mk_ebml_size_size( unsigned s )
+{
+    static const unsigned limit[4] = { 0x7f, 0x3fff, 0x1fffff, 0x0fffffff };
+    unsigned n_bytes = 1;
+
+    /* each threshold reached costs one more byte */
+    while( n_bytes < 5 && s >= limit[n_bytes - 1] )
+        n_bytes++;
+
+    return n_bytes;
+}
+
+/* Number of bytes (1 to 8) needed to store si in big-endian two's complement
+ * once redundant sign-extension bytes are removed: a leading 0xff (negative)
+ * or 0x00 (non-negative) byte is dropped only while the following byte's top
+ * bit still shows the same sign. */
+static unsigned mk_ebml_sint_size( int64_t si )
+{
+    unsigned char be[8];
+    unsigned char fill = si < 0 ? 0xff : 0x00;
+    unsigned n, skip = 0;
+
+    for( n = 0; n < 8; n++ )
+        be[n] = si >> (56 - 8 * n);
+
+    while( skip < 7 && be[skip] == fill && (be[skip+1] & 0x80) == (fill & 0x80) )
+        ++skip;
+
+    return 8 - skip;
+}
+
+/* Create a Matroska writer for filename; "-" selects stdout, anything else is opened with fopen "wb".
+ * The struct is zero-filled, gets a root context and a default timescale of 1000000.
+ * Returns NULL when the allocation, the root context or the file cannot be obtained. */
+mk_writer *mk_create_writer( const char *filename )
+{
+    mk_writer *w = malloc( sizeof(*w) );
+    if( !w )
+        return NULL;
+
+    memset( w, 0, sizeof(*w) );
+
+    if( !(w->root = mk_create_context( w, NULL, 0 )) )
+    {
+        free( w );
+        return NULL;
+    }
+
+    // strcmp() is non-zero for every name other than "-"
+    w->fp = strcmp( filename, "-" ) ? fopen( filename, "wb" ) : stdout;
+    if( !w->fp )
+    {
+        mk_destroy_contexts( w ); // a root context exists by now
+        free( w );
+        return NULL;
+    }
+
+    w->timescale = 1000000;
+
+    return w;
+}
+/* Write the file header once: EBML header, Segment, SegmentInfo and one video TrackEntry, then flush the root context. Returns -1 if a header was already written, 0 on success. */
+int mk_writeHeader( mk_writer *w, const char *writing_app,
+                    const char *codec_id,
+                    const void *codec_private, unsigned codec_private_size,
+                    int64_t default_frame_duration,
+                    int64_t timescale,
+                    unsigned width, unsigned height,
+                    unsigned d_width, unsigned d_height )
+{
+    mk_context  *c, *ti, *v;
+
+    if( w->wrote_header )
+        return -1;
+
+    w->timescale = timescale; // replaces the default set by mk_create_writer
+    w->def_duration = default_frame_duration; // mk_close prefers this over its last_delta argument
+
+    if( !(c = mk_create_context( w, w->root, 0x1a45dfa3 )) ) // EBML
+        return -1;
+    CHECK( mk_write_uint( c, 0x4286, 1 ) ); // EBMLVersion
+    CHECK( mk_write_uint( c, 0x42f7, 1 ) ); // EBMLReadVersion
+    CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
+    CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
+    CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
+    CHECK( mk_write_uint( c, 0x4287, 1 ) ); // DocTypeVersion
+    CHECK( mk_write_uint( c, 0x4285, 1 ) ); // DocTypeReadversion
+    CHECK( mk_close_context( c, 0 ) );
+
+    if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment
+        return -1;
+    CHECK( mk_flush_context_id( c ) ); // NOTE(review): presumably emits only the Segment ID, leaving its size open - confirm in mk_flush_context_id
+    CHECK( mk_close_context( c, 0 ) );
+
+    if( !(c = mk_create_context( w, w->root, 0x1549a966 )) ) // SegmentInfo
+        return -1;
+    CHECK( mk_write_string( c, 0x4d80, "Haali Matroska Writer b0" ) ); // MuxingApp
+    CHECK( mk_write_string( c, 0x5741, writing_app ) ); // WritingApp
+    CHECK( mk_write_uint( c, 0x2ad7b1, w->timescale ) ); // TimecodeScale
+    CHECK( mk_write_float( c, 0x4489, 0) ); // Duration, written as 0 here and patched by mk_close
+    w->duration_ptr = c->d_cur - 4; // start of the 4-byte float payload that was just appended
+    CHECK( mk_close_context( c, &w->duration_ptr ) ); // NOTE(review): presumably rebases duration_ptr; mk_close later uses it as a file offset
+
+    if( !(c = mk_create_context( w, w->root, 0x1654ae6b )) ) // tracks
+        return -1;
+    if( !(ti = mk_create_context( w, c, 0xae )) ) // TrackEntry
+        return -1;
+    CHECK( mk_write_uint( ti, 0xd7, 1 ) ); // TrackNumber
+    CHECK( mk_write_uint( ti, 0x73c5, 1 ) ); // TrackUID
+    CHECK( mk_write_uint( ti, 0x83, 1 ) ); // TrackType
+    CHECK( mk_write_uint( ti, 0x9c, 0 ) ); // FlagLacing
+    CHECK( mk_write_string( ti, 0x86, codec_id ) ); // codec_id
+    if( codec_private_size )
+        CHECK( mk_write_bin( ti, 0x63a2, codec_private, codec_private_size ) ); // codec_private
+    if( default_frame_duration )
+        CHECK( mk_write_uint( ti, 0x23e383, default_frame_duration ) ); // DefaultDuration
+
+    if( !(v = mk_create_context( w, ti, 0xe0 ) ) ) // Video
+        return -1;
+    CHECK( mk_write_uint( v, 0xb0, width ) ); // PixelWidth
+    CHECK( mk_write_uint( v, 0xba, height ) ); // PixelHeight
+    CHECK( mk_write_uint( v, 0x54b0, d_width ) ); // DisplayWidth
+    CHECK( mk_write_uint( v, 0x54ba, d_height ) ); // DisplayHeight
+    CHECK( mk_close_context( v, 0 ) );
+
+    CHECK( mk_close_context( ti, 0 ) );
+
+    CHECK( mk_close_context( c, 0 ) );
+
+    CHECK( mk_flush_context_data( w->root ) );
+
+    w->wrote_header = 1; // makes a second call fail and enables the duration patch in mk_close
+
+    return 0;
+}
+/* Close the currently open cluster, if there is one, and flush the root context's data. Returns 0 when there is nothing to close. */
+static int mk_close_cluster( mk_writer *w )
+{
+    if( w->cluster == NULL )
+        return 0;
+    CHECK( mk_close_context( w->cluster, 0 ) );
+    w->cluster = NULL; // mk_flush_frame creates a new cluster when it finds NULL here
+    CHECK( mk_flush_context_data( w->root ) );
+    return 0;
+}
+/* Emit the frame collected since mk_start_frame as a BlockGroup in the current cluster, opening or rotating clusters as needed. No-op (returns 0) when no frame is in progress. */
+static int mk_flush_frame( mk_writer *w )
+{
+    int64_t delta, ref = 0;
+    unsigned fsize, bgsize;
+    unsigned char c_delta_flags[3];
+
+    if( !w->in_frame )
+        return 0;
+
+    delta = w->frame_tc/w->timescale - w->cluster_tc_scaled; // scaled frame timecode relative to the cluster's
+    if( delta > 32767ll || delta < -32768ll ) // would not fit the two bytes stored in the Block below
+        CHECK( mk_close_cluster( w ) );
+
+    if( !w->cluster )
+    {
+        w->cluster_tc_scaled = w->frame_tc / w->timescale;
+        w->cluster = mk_create_context( w, w->root, 0x1f43b675 ); // Cluster
+        if( !w->cluster )
+            return -1;
+
+        CHECK( mk_write_uint( w->cluster, 0xe7, w->cluster_tc_scaled ) ); // Timecode
+
+        delta = 0; // this frame's timecode is the new cluster's timecode
+    }
+
+    fsize = w->frame ? w->frame->d_cur : 0; // bytes buffered by mk_add_frame_data, 0 if none were added
+    bgsize = fsize + 4 + mk_ebml_size_size( fsize + 4 ) + 1; // Block payload (fsize + 4, see its size below) + its size field + 1 (presumably the Block ID byte)
+    if( !w->keyframe )
+    {
+        ref = w->prev_frame_tc_scaled - w->cluster_tc_scaled - delta; // previous frame's scaled timecode relative to this frame's
+        bgsize += 1 + 1 + mk_ebml_sint_size( ref ); // ReferenceBlock payload + 2 (presumably its ID byte and size byte)
+    }
+
+    CHECK( mk_write_id( w->cluster, 0xa0 ) ); // BlockGroup
+    CHECK( mk_write_size( w->cluster, bgsize ) );
+    CHECK( mk_write_id( w->cluster, 0xa1 ) ); // Block
+    CHECK( mk_write_size( w->cluster, fsize + 4 ) );
+    CHECK( mk_write_size( w->cluster, 1 ) ); // track number
+
+    c_delta_flags[0] = delta >> 8; // relative timecode, high byte first
+    c_delta_flags[1] = delta;
+    c_delta_flags[2] = 0; // third byte is always written as zero
+    CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) );
+    if( w->frame )
+    {
+        CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) );
+        w->frame->d_cur = 0; // empty the frame buffer; the context itself is kept for the next frame
+    }
+    if( !w->keyframe )
+        CHECK( mk_write_sint( w->cluster, 0xfb, ref ) ); // ReferenceBlock
+
+    w->in_frame = 0;
+    w->prev_frame_tc_scaled = w->cluster_tc_scaled + delta; // this frame's scaled timecode, used for the next frame's ref
+
+    if( w->cluster->d_cur > CLSIZE ) // CLSIZE is defined outside this chunk
+        CHECK( mk_close_cluster( w ) );
+
+    return 0;
+}
+/* Begin a new frame after flushing the one in progress, if any. Returns -1 when that flush fails, else 0. */
+int mk_start_frame( mk_writer *w )
+{
+    const int failed = mk_flush_frame( w ) < 0; // first emit the frame started earlier
+    if( !failed )
+    {
+        w->in_frame = 1;
+        w->keyframe = 0; // default until mk_set_frame_flags is called
+    }
+    return failed ? -1 : 0;
+}
+
+/* Record the timestamp and keyframe flag of the frame in progress and track the largest timestamp seen; -1 if no frame was started. */
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe )
+{
+    if( w->in_frame )
+    {
+        w->frame_tc = timestamp;
+        w->keyframe = !!keyframe; // normalise any non-zero value to 1
+        if( timestamp > w->max_frame_tc )
+            w->max_frame_tc = timestamp; // mk_close derives the total duration from this
+        return 0;
+    }
+    return -1;
+}
+
+/* Append size bytes to the frame in progress. Returns -1 if no frame was started or the frame buffer cannot be created. */
+int mk_add_frame_data( mk_writer *w, const void *data, unsigned size )
+{
+    if( !w->in_frame )
+        return -1;
+
+    if( !w->frame && !(w->frame = mk_create_context( w, NULL, 0 )) ) // buffer is created on first use, then reused
+        return -1;
+
+    return mk_append_context_data( w->frame, data, size );
+}
+/* Flush pending data, patch the Duration float in a seekable file, then destroy the contexts, close the file and free w. Returns -1 if any step failed. */
+int mk_close( mk_writer *w, int64_t last_delta )
+{
+    int ret = 0;
+    if( mk_flush_frame( w ) < 0 || mk_close_cluster( w ) < 0 )
+        ret = -1;
+    if( w->wrote_header && x264_is_regular_file( w->fp ) )
+    {
+        int64_t total_duration = w->max_frame_tc + (w->def_duration ? w->def_duration : last_delta);
+        if( fseek( w->fp, w->duration_ptr, SEEK_SET ) || // after a failed seek the 4 bytes would land at the current position
+            mk_write_float_raw( w->root, (float)((double)total_duration / w->timescale) ) < 0 ||
+            mk_flush_context_data( w->root ) < 0 )
+            ret = -1;
+    }
+    mk_destroy_contexts( w );
+    if( fclose( w->fp ) ) // buffered data that cannot be written out is only reported here
+        ret = -1;
+    free( w );
+    return ret;
+}
diff --git a/matroska.h b/output/matroska_ebml.h
similarity index 56%
rename from matroska.h
rename to output/matroska_ebml.h
index be6f530..252e781 100644
--- a/matroska.h
+++ b/output/matroska_ebml.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
- * matroska.h:
+ * matroska_ebml.h:
  *****************************************************************************
  * Copyright (C) 2005 Mike Matsnev
  *
@@ -18,24 +18,24 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_MATROSKA_H
-#define X264_MATROSKA_H
+#ifndef X264_MATROSKA_EBML_H
+#define X264_MATROSKA_EBML_H
 
-typedef struct mk_Writer mk_Writer;
+typedef struct mk_writer mk_writer;
 
-mk_Writer *mk_createWriter( const char *filename );
+mk_writer *mk_create_writer( const char *filename );
 
-int  mk_writeHeader( mk_Writer *w, const char *writingApp,
-                     const char *codecID,
-                     const void *codecPrivate, unsigned codecPrivateSize,
-                     int64_t default_frame_duration,
-                     int64_t timescale,
-                     unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height );
+int mk_writeHeader( mk_writer *w, const char *writing_app,
+                    const char *codec_id,
+                    const void *codec_private, unsigned codec_private_size,
+                    int64_t default_frame_duration,
+                    int64_t timescale,
+                    unsigned width, unsigned height,
+                    unsigned d_width, unsigned d_height );
 
-int  mk_startFrame( mk_Writer *w );
-int  mk_addFrameData( mk_Writer *w, const void *data, unsigned size );
-int  mk_setFrameFlags( mk_Writer *w, int64_t timestamp, int keyframe );
-int  mk_close( mk_Writer *w );
+int mk_start_frame( mk_writer *w );
+int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
+int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe );
+int mk_close( mk_writer *w, int64_t last_delta );
 
 #endif
diff --git a/output/mp4.c b/output/mp4.c
new file mode 100644
index 0000000..e3ad9c6
--- /dev/null
+++ b/output/mp4.c
@@ -0,0 +1,300 @@
+/*****************************************************************************
+ * mp4.c: x264 mp4 output module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+#include <gpac/isomedia.h>
+/* Per-file state of the MP4 muxer; open_file allocates it zero-filled and close_file frees it. */
+typedef struct
+{
+    GF_ISOFile *p_file; // result of gf_isom_open in open_file
+    GF_AVCConfig *p_config; // from gf_odf_avc_cfg_new in set_param; SPS/PPS are added in write_headers
+    GF_ISOSample *p_sample; // single sample object reused for every frame
+    int i_track; // result of gf_isom_new_track
+    uint32_t i_descidx; // filled in by gf_isom_avc_config_new
+    int i_time_res; // x264 i_timebase_den, passed to gf_isom_new_track
+    int64_t i_time_inc; // x264 i_timebase_num, multiplies DTS/CTS in write_frame
+    int i_numframe; // frames handed to write_frame so far
+    int i_delay_time; // negated DTS of the first frame, added to every timestamp
+} mp4_hnd_t;
+/* Recompute bufferSizeDB, avgBitrate and maxBitrate in the track's decoder config from the samples actually written. Does nothing if the track has no ESD. */
+static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )
+{
+    u32 i, count, di, timescale, time_wnd, rate;
+    u64 offset;
+    Double br;
+    GF_ESD *esd;
+
+    esd = gf_isom_get_esd( p_file, i_track, 1 );
+    if( !esd )
+        return;
+
+    esd->decoderConfig->avgBitrate = 0; // accumulates the total byte count until it is scaled below
+    esd->decoderConfig->maxBitrate = 0;
+    rate = time_wnd = 0;
+
+    timescale = gf_isom_get_media_timescale( p_file, i_track );
+    count = gf_isom_get_sample_count( p_file, i_track );
+    for( i = 0; i < count; i++ )
+    {
+        GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset ); // sample numbers start at 1
+        if( !samp )
+        {
+            fprintf( stderr, "mp4 [error]: failure reading back frame %u\n", i );
+            break;
+        }
+
+        if( esd->decoderConfig->bufferSizeDB < samp->dataLength ) // keep the largest sample size
+            esd->decoderConfig->bufferSizeDB = samp->dataLength;
+
+        esd->decoderConfig->avgBitrate += samp->dataLength;
+        rate += samp->dataLength;
+        if( samp->DTS > time_wnd + timescale ) // more than one timescale unit of DTS since the window started
+        {
+            if( rate > esd->decoderConfig->maxBitrate )
+                esd->decoderConfig->maxBitrate = rate;
+            time_wnd = samp->DTS;
+            rate = 0;
+        }
+
+        gf_isom_sample_del( &samp );
+    }
+
+    br = (Double)(s64)gf_isom_get_media_duration( p_file, i_track ) / timescale; // duration in timescale units -> seconds
+    /* an empty track has duration 0: dividing by it and casting the result to u32 is undefined, so report 0 */
+    esd->decoderConfig->avgBitrate = br > 0 ? (u32)(esd->decoderConfig->avgBitrate / br) : 0;
+    /*move to bps*/
+    esd->decoderConfig->avgBitrate *= 8;
+    esd->decoderConfig->maxBitrate *= 8;
+
+    gf_isom_change_mpeg4_description( p_file, i_track, 1, esd );
+    gf_odf_desc_del( (GF_Descriptor*)esd );
+}
+/* Finalise the MP4 file (last sample duration, optional edit box, bitrate info), close it and free the handle. Accepts a NULL or partially set-up handle; always returns 0. */
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+    mp4_hnd_t *p_mp4 = handle;
+    uint64_t total_duration = 0;
+
+    if( !p_mp4 )
+        return 0;
+
+    if( p_mp4->p_config )
+        gf_odf_avc_cfg_del( p_mp4->p_config );
+
+    if( p_mp4->p_sample )
+    {
+        free( p_mp4->p_sample->data );   // malloc'd in set_param; free(NULL) is a no-op
+        p_mp4->p_sample->data = NULL;    // hand gf_isom_sample_del an empty sample so that it
+        p_mp4->p_sample->dataLength = 0; // cannot release the same buffer a second time
+        gf_isom_sample_del( &p_mp4->p_sample );
+    }
+
+    if( p_mp4->p_file )
+    {
+        /* The mdhd duration is defined as CTS[final] - CTS[0] + duration of last frame.
+         * The mdhd duration (in seconds) should be able to be longer than the tkhd duration since the track is managed by edts.
+         * So, if mdhd duration is equal to the last DTS or less, we give the last composition time delta to the last sample duration.
+         * And then, the mdhd duration is updated, but it time-wise doesn't give the actual duration.
+         * The tkhd duration is the actual track duration. */
+        uint64_t mdhd_duration = (2 * largest_pts - second_largest_pts - p_mp4->i_delay_time) * p_mp4->i_time_inc;
+        total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
+        if( mdhd_duration != total_duration )
+        {
+            uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe );
+            uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc  );
+            gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration );
+            total_duration = gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track );
+        }
+
+        /* Write an Edit Box if the first CTS offset is positive.
+         * A media_time is given by not the mvhd timescale but rather the mdhd timescale.
+         * The reason is that an Edit Box maps the presentation time-line to the media time-line.
+         * Any demuxers should follow the Edit Box if it exists. */
+        GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL );
+        if( sample && sample->CTS_Offset > 0 ) // sample is NULL when no frame was ever written
+        {
+            uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file );
+            uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) );
+            gf_isom_append_edit_segment( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL );
+        }
+        gf_isom_sample_del( &sample );
+
+        recompute_bitrate_mp4( p_mp4->p_file, p_mp4->i_track );
+        gf_isom_set_pl_indication( p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15 );
+        gf_isom_set_storage_mode( p_mp4->p_file, GF_ISOM_STORE_FLAT );
+        gf_isom_close( p_mp4->p_file );
+    }
+
+    free( p_mp4 );
+
+    return 0;
+}
+/* Check that psz_filename can be created as a regular file, then open it with gpac and allocate the muxer handle. Returns 0 and sets *p_handle on success, -1 (with *p_handle NULL) otherwise. */
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+    mp4_hnd_t *p_mp4;
+
+    *p_handle = NULL;
+    FILE *fh = fopen( psz_filename, "w" );
+    if( !fh )
+        return -1;
+    int b_regular = x264_is_regular_file( fh );
+    fclose( fh ); // only a probe: close it on every path, gpac opens the file by name below
+    if( !b_regular )
+    {
+        fprintf( stderr, "mp4 [error]: MP4 output is incompatible with non-regular file `%s'\n", psz_filename );
+        return -1;
+    }
+
+    if( !(p_mp4 = malloc( sizeof(mp4_hnd_t) )) )
+        return -1;
+
+    memset( p_mp4, 0, sizeof(mp4_hnd_t) );
+    p_mp4->p_file = gf_isom_open( psz_filename, GF_ISOM_OPEN_WRITE, NULL );
+
+    /* close_file copes with whichever of the two is still NULL and frees p_mp4 */
+    if( !p_mp4->p_file || !(p_mp4->p_sample = gf_isom_sample_new()) )
+    {
+        close_file( p_mp4, 0, 0 );
+        return -1;
+    }
+
+    gf_isom_set_brand_info( p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0 );
+    *p_handle = p_mp4;
+    return 0;
+}
+/* Create the video track and its AVC configuration from the encoder parameters and allocate the sample buffer. Returns 0 on success, -1 when an allocation fails. */
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+    mp4_hnd_t *p_mp4 = handle;
+
+    p_mp4->i_time_res = p_param->i_timebase_den;
+    p_mp4->i_time_inc = p_param->i_timebase_num;
+
+    p_mp4->i_track = gf_isom_new_track( p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL,
+                                        p_mp4->i_time_res );
+
+    p_mp4->p_config = gf_odf_avc_cfg_new();
+    if( !p_mp4->p_config ) // write_headers dereferences p_config unconditionally
+        return -1;
+    gf_isom_avc_config_new( p_mp4->p_file, p_mp4->i_track, p_mp4->p_config,
+                            NULL, NULL, &p_mp4->i_descidx );
+    gf_isom_set_track_enabled( p_mp4->p_file, p_mp4->i_track, 1 );
+
+    gf_isom_set_visual_info( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx,
+                             p_param->i_width, p_param->i_height );
+
+    if( p_param->vui.i_sar_width && p_param->vui.i_sar_height )
+    {
+        uint64_t dw = (uint64_t)p_param->i_width << 16; // widen first: an int shifted by 16 overflows from 32768 up
+        uint64_t dh = (uint64_t)p_param->i_height << 16;
+        double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height;
+        if( sar > 1.0 ) // only ever enlarge: stretch the width for wide pixels, the height otherwise
+            dw *= sar ;
+        else
+            dh /= sar;
+        gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 );
+    }
+
+    /* NOTE(review): write_headers and write_frame memcpy into this buffer without checking against this size */
+    p_mp4->p_sample->data = malloc( (size_t)p_param->i_width * p_param->i_height * 3 / 2 );
+    if( !p_mp4->p_sample->data )
+        return -1;
+    return 0;
+}
+/* Store the SPS and PPS (p_nal[1], p_nal[2]) in the AVC configuration and queue the SEI (p_nal[0]) in front of the first frame. Returns the number of header bytes consumed, -1 on allocation failure. */
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+    mp4_hnd_t *p_mp4 = handle;
+    GF_AVCConfigSlot *p_slot;
+    int sei_size = p_nal[0].i_payload;
+    int sps_size = p_nal[1].i_payload - 4; // SPS and PPS are stored without their first 4 bytes
+    int pps_size = p_nal[2].i_payload - 4; // (presumably the start code / size prefix - confirm)
+    uint8_t *sei = p_nal[0].p_payload;
+    uint8_t *sps = p_nal[1].p_payload + 4;
+    uint8_t *pps = p_nal[2].p_payload + 4;
+
+    // SPS
+
+    p_mp4->p_config->configurationVersion = 1;
+    p_mp4->p_config->AVCProfileIndication = sps[1];
+    p_mp4->p_config->profile_compatibility = sps[2];
+    p_mp4->p_config->AVCLevelIndication = sps[3];
+    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
+    if( !p_slot )
+        return -1;
+    p_slot->size = sps_size;
+    p_slot->data = malloc( p_slot->size );
+    if( !p_slot->data )
+    {
+        free( p_slot ); // not in the config's list yet, so nobody else would release it
+        return -1;
+    }
+    memcpy( p_slot->data, sps, sps_size );
+    gf_list_add( p_mp4->p_config->sequenceParameterSets, p_slot );
+
+    // PPS
+
+    p_slot = malloc( sizeof(GF_AVCConfigSlot) );
+    if( !p_slot )
+        return -1;
+    p_slot->size = pps_size;
+    p_slot->data = malloc( p_slot->size );
+    if( !p_slot->data )
+    {
+        free( p_slot );
+        return -1;
+    }
+    memcpy( p_slot->data, pps, pps_size );
+    gf_list_add( p_mp4->p_config->pictureParameterSets, p_slot );
+    gf_isom_avc_config_update( p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config );
+    // SEI: kept in the sample buffer so that write_frame appends the first frame after it
+    memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size );
+    p_mp4->p_sample->dataLength += sei_size;
+    return sei_size + sps_size + pps_size;
+}
+/* Add one encoded frame to the track as a sample, with timestamps shifted so that the first DTS is 0. Returns i_size. */
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+    mp4_hnd_t *p_mp4 = handle;
+    GF_ISOSample *p_sample = p_mp4->p_sample;
+
+    // append after anything already queued in the sample buffer (write_headers puts the SEI there)
+    memcpy( p_sample->data + p_sample->dataLength, p_nalu, i_size );
+    p_sample->dataLength += i_size;
+
+    if( p_mp4->i_numframe == 0 )
+        p_mp4->i_delay_time = -p_picture->i_dts; // offset that maps the first DTS to 0
+
+    const int64_t dts = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
+    const int64_t cts = (p_picture->i_pts + p_mp4->i_delay_time) * p_mp4->i_time_inc;
+    p_sample->IsRAP = p_picture->b_keyframe;
+    p_sample->DTS = dts;
+    p_sample->CTS_Offset = (uint32_t)(cts - dts);
+    gf_isom_add_sample( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_sample );
+
+    p_sample->dataLength = 0; // the buffer is reused for the next frame
+    p_mp4->i_numframe++;
+
+    return i_size;
+}
+/* MP4 muxer entry points, in cli_output_t order: open_file, set_param, write_headers, write_frame, close_file. */
+cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file };
diff --git a/encoder/analyse.h b/output/output.h
similarity index 62%
copy from encoder/analyse.h
copy to output/output.h
index b8c828f..851b819 100644
--- a/encoder/analyse.h
+++ b/output/output.h
@@ -1,7 +1,7 @@
 /*****************************************************************************
- * analyse.h: h264 encoder library
+ * output.h: x264 file output modules
  *****************************************************************************
- * Copyright (C) 2003-2008 x264 project
+ * Copyright (C) 2003-2009 x264 project
  *
  * Authors: Laurent Aimar <fenrir at via.ecp.fr>
  *          Loren Merritt <lorenm at u.washington.edu>
@@ -21,10 +21,21 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_ANALYSE_H
-#define X264_ANALYSE_H
+#ifndef X264_OUTPUT_H
+#define X264_OUTPUT_H
 
-void x264_macroblock_analyse( x264_t *h );
-void x264_slicetype_decide( x264_t *h );
+typedef struct
+{
+    int (*open_file)( char *psz_filename, hnd_t *p_handle );
+    int (*set_param)( hnd_t handle, x264_param_t *p_param );
+    int (*write_headers)( hnd_t handle, x264_nal_t *p_nal );
+    int (*write_frame)( hnd_t handle, uint8_t *p_nal, int i_size, x264_picture_t *p_picture );
+    int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts );
+} cli_output_t;
+
+extern cli_output_t raw_output;
+extern cli_output_t mkv_output;
+extern cli_output_t mp4_output;
+extern cli_output_t flv_output;
 
 #endif
diff --git a/output/raw.c b/output/raw.c
new file mode 100644
index 0000000..a4d1175
--- /dev/null
+++ b/output/raw.c
@@ -0,0 +1,66 @@
+/*****************************************************************************
+ * raw.c: x264 raw bitstream output module
+ *****************************************************************************
+ * Copyright (C) 2003-2009 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#include "muxers.h"
+/* Open the raw bitstream output: "-" selects stdout, any other name is opened with fopen "w+b".
+ * Stores the FILE* in *p_handle and returns 0, or returns -1 when fopen fails. */
+static int open_file( char *psz_filename, hnd_t *p_handle )
+{
+    const int b_stdout = strcmp( psz_filename, "-" ) == 0;
+    FILE *fh = b_stdout ? stdout : fopen( psz_filename, "w+b" );
+
+    *p_handle = fh; // NULL after a failed fopen
+    return fh ? 0 : -1;
+}
+/* Raw output ignores the encoder parameters; both arguments are unused. Always returns 0. */
+static int set_param( hnd_t handle, x264_param_t *p_param )
+{
+    return 0;
+}
+/* Write the three header NALs with a single fwrite. Returns the number of bytes written, or -1 on failure. */
+static int write_headers( hnd_t handle, x264_nal_t *p_nal )
+{
+    int i, total = 0;
+    for( i = 0; i < 3; i++ )
+        total += p_nal[i].i_payload;
+    // NOTE(review): one write from p_nal[0].p_payload relies on the three payloads being contiguous - confirm with the encoder
+    return fwrite( p_nal[0].p_payload, total, 1, (FILE*)handle ) ? total : -1;
+}
+/* Write one frame's NAL data to the output. p_picture is unused.
+ * Returns i_size, or -1 when fwrite did not write the whole item. */
+static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture )
+{
+    size_t written = fwrite( p_nalu, i_size, 1, (FILE*)handle ); // item count: 1 or 0
+    return written ? i_size : -1;
+}
+/* Close the output file. The two pts arguments are unused. Returns 0 for a NULL handle or stdout, else fclose's result. */
+static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts )
+{
+    FILE *fh = (FILE*)handle;
+    // nothing to close after a failed open, and stdout is not closed here
+    int b_owned = fh && fh != stdout;
+    return b_owned ? fclose( fh ) : 0;
+}
+/* Raw bitstream entry points, in cli_output_t order: open_file, set_param, write_headers, write_frame, close_file. */
+cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file };
+
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4f34309..0bedc5b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -30,6 +30,12 @@
 #include "common/common.h"
 #include "common/cpu.h"
 
+// GCC doesn't align stack variables on ARM, so use .bss
+#ifdef ARCH_ARM
+#undef ALIGNED_16
+#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#endif
+
 /* buf1, buf2: initialised to random data and shouldn't write into them */
 uint8_t * buf1, * buf2;
 /* buf3, buf4: used to store output */
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
 
 static inline uint32_t read_time(void)
 {
+    uint32_t a = 0;
 #if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
-    uint32_t a;
     asm volatile( "rdtsc" :"=a"(a) ::"edx" );
-    return a;
 #elif defined(ARCH_PPC)
-    uint32_t a;
     asm volatile( "mftb %0" : "=r" (a) );
-    return a;
-#else
-    return 0;
+#elif defined(ARCH_ARM)     // ARMv7 only
+    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
 #endif
+    return a;
 }
 
 static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
                     b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
                     b->cpu&X264_CPU_MMX ? "mmx" :
-                    b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
+                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
+                    b->cpu&X264_CPU_NEON ? "neon" :
+                    b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                     b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
-                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
+                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
+                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -229,7 +236,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     x264_predict_t predict_4x4[9+3];
     x264_predict8x8_t predict_8x8[9+3];
     x264_predict_8x8_filter_t predict_8x8_filter;
-    DECLARE_ALIGNED_16( uint8_t edge[33] );
+    ALIGNED_16( uint8_t edge[33] );
     uint16_t cost_mv[32];
     int ret = 0, ok, used_asm;
     int i, j;
@@ -337,16 +344,20 @@ static int check_pixel( int cpu_ref, int cpu_new )
 #define TEST_PIXEL_VAR( i ) \
     if( pixel_asm.var[i] != pixel_ref.var[i] ) \
     { \
-        int res_c, res_asm; \
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
-        res_c   = call_c( pixel_c.var[i], buf1, 16 ); \
-        res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+        /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+        call_c1( pixel_c.var[i], buf1, 16 ); \
+        call_a1( pixel_asm.var[i], buf1, 16 ); \
+        uint64_t res_c   = pixel_c.var[i]( buf1, 16 ); \
+        uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
         if( res_c != res_asm ) \
         { \
             ok = 0; \
-            fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+            fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
         } \
+        call_c2( pixel_c.var[i], buf1, 16 ); \
+        call_a2( pixel_asm.var[i], buf1, 16 ); \
     }
 
     ok = 1; used_asm = 0;
@@ -354,6 +365,23 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL_VAR( PIXEL_8x8 );
     report( "pixel var :" );
 
+    ok = 1; used_asm = 0;
+    if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
+    {
+        int res_c, res_asm, ssd_c, ssd_asm;
+        set_func_name( "var2_8x8" );
+        used_asm = 1;
+        res_c   = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
+        res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
+        if( res_c != res_asm || ssd_c != ssd_asm )
+        {
+            ok = 0;
+            fprintf( stderr, "var[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm );
+        }
+    }
+
+    report( "pixel var2 :" );
+
     for( i=0, ok=1, used_asm=0; i<4; i++ )
         if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
         {
@@ -362,6 +390,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             for( j=0; j<32; j++ )
             {
                 uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
+                call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
+                call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
                 uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
                 uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
                 if( rc != ra )
@@ -414,7 +444,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
         pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
     {
         float res_c, res_a;
-        DECLARE_ALIGNED_16( int sums[5][4] ) = {{0}};
+        ALIGNED_16( int sums[5][4] ) = {{0}};
         used_asm = ok = 1;
         x264_emms();
         res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
@@ -439,8 +469,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
     for( i=0; i<100 && ok; i++ )
         if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
         {
-            DECLARE_ALIGNED_16( uint16_t sums[72] );
-            DECLARE_ALIGNED_16( int dc[4] );
+            ALIGNED_16( uint16_t sums[72] );
+            ALIGNED_16( int dc[4] );
             int16_t mvs_a[32], mvs_c[32];
             int mvn_a, mvn_c;
             int thresh = rand() & 0x3fff;
@@ -476,10 +506,11 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_dct_function_t dct_asm;
     x264_quant_function_t qf;
     int ret = 0, ok, used_asm, i, j, interlace;
-    DECLARE_ALIGNED_16( int16_t dct1[16][4][4] );
-    DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
-    DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
-    DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
+    ALIGNED_16( int16_t dct1[16][16] );
+    ALIGNED_16( int16_t dct2[16][16] );
+    ALIGNED_16( int16_t dct4[16][16] );
+    ALIGNED_16( int16_t dct8[4][64] );
+    ALIGNED_8( int16_t dctdc[2][4] );
     x264_t h_buf;
     x264_t *h = &h_buf;
 
@@ -490,6 +521,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     memset( h, 0, sizeof(*h) );
     h->pps = h->pps_array;
     x264_param_default( &h->param );
+    h->chroma_qp_table = i_chroma_qp_table + 12;
     h->param.analyse.i_luma_deadzone[0] = 0;
     h->param.analyse.i_luma_deadzone[1] = 0;
     h->param.analyse.b_transform_8x8 = 1;
@@ -514,6 +546,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     ok = 1; used_asm = 0;
     TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
     TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
+    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
     TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
     report( "sub_dct4 :" );
 
@@ -581,9 +614,9 @@ static int check_dct( int cpu_ref, int cpu_new )
         for( i=0; i<16 && ok; i++ )\
         {\
             for( j=0; j<16; j++ )\
-                dct1[0][0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
-                              : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
-                              : ((*p++)&0x1fff)-0x1000; /* general case */\
+                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
+                           : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+                           : ((*p++)&0x1fff)-0x1000; /* general case */\
             memcpy( dct2, dct1, 32 );\
             call_c1( dct_c.name, dct1[0] );\
             call_a1( dct_asm.name, dct2[0] );\
@@ -603,8 +636,8 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_zigzag_function_t zigzag_ref;
     x264_zigzag_function_t zigzag_asm;
 
-    DECLARE_ALIGNED_16( int16_t level1[64] );
-    DECLARE_ALIGNED_16( int16_t level2[64] );
+    ALIGNED_16( int16_t level1[64] );
+    ALIGNED_16( int16_t level2[64] );
 
 #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size )   \
     if( zigzag_asm.name != zigzag_ref.name ) \
@@ -624,13 +657,14 @@ static int check_dct( int cpu_ref, int cpu_new )
 #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
+        int nz_a, nz_c; \
         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
         used_asm = 1; \
         memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
         memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
-        call_c1( zigzag_c.name, t1, buf2, buf3 );  \
-        call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
-        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) )  \
+        nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 );  \
+        nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
+        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a )  \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
@@ -639,6 +673,35 @@ static int check_dct( int cpu_ref, int cpu_new )
         call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
     }
 
+#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
+    if( zigzag_asm.name != zigzag_ref.name ) \
+    { \
+        int nz_a, nz_c; \
+        int16_t dc_a, dc_c; \
+        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+        used_asm = 1; \
+        for( i = 0; i < 2; i++ ) \
+        { \
+            memcpy( buf3, buf2, 16*FDEC_STRIDE ); \
+            memcpy( buf4, buf2, 16*FDEC_STRIDE ); \
+            for( j = 0; j < 4; j++ ) \
+            { \
+                memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
+                memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
+            } \
+            nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+            nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
+            if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a )  \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name " [FAILED]\n" ); \
+                break; \
+            } \
+        } \
+        call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c );  \
+        call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
+    }
+
 #define TEST_INTERLEAVE( name, t1, t2, dct, size )   \
     if( zigzag_asm.name != zigzag_ref.name ) \
     { \
@@ -668,6 +731,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
     TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
     TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+    TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
     report( "zigzag_frame :" );
 
     interlace = 1;
@@ -679,10 +743,11 @@ static int check_dct( int cpu_ref, int cpu_new )
     TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
     TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
     TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
+    TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
     report( "zigzag_field :" );
 
     ok = 1; used_asm = 0;
-    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0], 64 );
     report( "zigzag_interleave :" );
 #undef TEST_ZIGZAG_SCAN
 #undef TEST_ZIGZAG_SUB
@@ -697,7 +762,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     x264_mc_functions_t mc_a;
     x264_pixel_function_t pixel;
 
-    uint8_t *src     = &buf1[2*32+2];
+    uint8_t *src     = &buf1[2*64+2];
     uint8_t *src2[4] = { &buf1[3*64+2], &buf1[5*64+2],
                          &buf1[7*64+2], &buf1[9*64+2] };
     uint8_t *dst1    = buf3;
@@ -714,12 +779,13 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define MC_TEST_LUMA( w, h ) \
         if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
         { \
+            const x264_weight_t *weight = weight_none; \
             set_func_name( "mc_luma_%dx%d", w, h );\
             used_asm = 1; \
             memset(buf3, 0xCD, 1024); \
             memset(buf4, 0xCD, 1024); \
-            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
-            call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h ); \
+            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+            call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
             if( memcmp( buf3, buf4, 1024 ) ) \
             { \
                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
@@ -730,12 +796,13 @@ static int check_mc( int cpu_ref, int cpu_new )
         { \
             uint8_t *ref = dst2; \
             int ref_stride = 32; \
+            const x264_weight_t *weight = weight_none; \
             set_func_name( "get_ref_%dx%d", w, h );\
             used_asm = 1; \
             memset(buf3, 0xCD, 1024); \
             memset(buf4, 0xCD, 1024); \
-            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
-            ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h ); \
+            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
+            ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
             for( i=0; i<h; i++ ) \
                 if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
                 { \
@@ -752,8 +819,8 @@ static int check_mc( int cpu_ref, int cpu_new )
             used_asm = 1; \
             memset(buf3, 0xCD, 1024); \
             memset(buf4, 0xCD, 1024); \
-            call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \
-            call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \
+            call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
+            call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
             /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
             for( j=0; j<h; j++ ) \
                 for( i=w; i<4; i++ ) \
@@ -783,8 +850,9 @@ static int check_mc( int cpu_ref, int cpu_new )
 
     ok = 1; used_asm = 0;
     for( dy = -1; dy < 9; dy++ )
-        for( dx = -1; dx < 9; dx++ )
+        for( dx = -128; dx < 128; dx++ )
         {
+            if( rand()&15 ) continue;
             MC_TEST_CHROMA( 8, 8 );
             MC_TEST_CHROMA( 8, 4 );
             MC_TEST_CHROMA( 4, 8 );
@@ -822,6 +890,79 @@ static int check_mc( int cpu_ref, int cpu_new )
         MC_TEST_AVG( avg, w );
     report( "mc wpredb :" );
 
+#define MC_TEST_WEIGHT( name, weight, aligned ) \
+    int align_off = (aligned ? 0 : rand()%16); \
+    for( i = 1, ok = 1, used_asm = 0; i <= 5; i++ ) \
+    { \
+        ALIGNED_16( uint8_t buffC[640] ); \
+        ALIGNED_16( uint8_t buffA[640] ); \
+        j = X264_MAX( i*4, 2 ); \
+        memset( buffC, 0, 640 ); \
+        memset( buffA, 0, 640 ); \
+        x264_t ha; \
+        ha.mc = mc_a; \
+        /* w12 is the same as w16 in some cases */ \
+        if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
+            continue; \
+        if( mc_a.name[i] != mc_ref.name[i] ) \
+        { \
+            int k; \
+            set_func_name( "%s_w%d", #name, j ); \
+            used_asm = 1; \
+            call_c1( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+            mc_a.weight_cache(&ha, &weight); \
+            call_a1( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+            for( k = 0; k < 16; k++ ) \
+                if( memcmp( &buffC[k*32], &buffA[k*32], j ) ) \
+                { \
+                    ok = 0; \
+                    fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
+                    break; \
+                } \
+            call_c2( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+            call_a2( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+        } \
+    }
+
+    ok = 1; used_asm = 0;
+
+    int s,o,d;
+    int align_cnt = 0;
+    for( s = 0; s <= 127 && ok; s++ )
+    {
+        for( o = -128; o <= 127 && ok; o++ )
+        {
+            if( rand() & 2047 ) continue;
+            for( d = 0; d <= 7 && ok; d++ )
+            {
+                if( s == 1<<d )
+                    continue;
+                x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
+                MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
+            }
+        }
+
+    }
+    report( "mc weight :" );
+
+    ok = 1; used_asm = 0;
+    s = 1; d = 0;
+    for( o = 0; o <= 127 && ok; o++ )
+    {
+        if( rand() & 15 ) continue;
+        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+        MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
+    }
+    report( "mc offsetadd :" );
+    ok = 1; used_asm = 0;
+    for( o = -128; o < 0 && ok; o++ )
+    {
+        if( rand() & 15 ) continue;
+        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
+        MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
+    }
+    report( "mc offsetsub :" );
+
     if( mc_a.hpel_filter != mc_ref.hpel_filter )
     {
         uint8_t *src = buf1+8+2*64;
@@ -855,7 +996,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
     {
         uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
-        uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
+        uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
         set_func_name( "lowres_init" );
         ok = 1; used_asm = 1;
         for( w=40; w<=48; w+=8 )
@@ -908,6 +1049,34 @@ static int check_mc( int cpu_ref, int cpu_new )
     INTEGRAL_INIT( integral_init8v, 9, sum, stride );
     report( "integral init :" );
 
+    if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
+    {
+        ok = 1; used_asm = 1;
+        set_func_name( "mbtree_propagate" );
+        int *dsta = (int*)buf3;
+        int *dstc = dsta+400;
+        uint16_t *prop = (uint16_t*)buf1;
+        uint16_t *intra = (uint16_t*)buf4;
+        uint16_t *inter = intra+400;
+        uint16_t *qscale = inter+400;
+        uint16_t *rand = (uint16_t*)buf2;
+        x264_emms();
+        for( i=0; i<400; i++ )
+        {
+            intra[i]  = *rand++ & 0x7fff;
+            intra[i] += !intra[i];
+            inter[i]  = *rand++ & 0x7fff;
+            qscale[i] = *rand++ & 0x7fff;
+        }
+        call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
+        call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
+        // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+        x264_emms();
+        for( i=0; i<400; i++ )
+            ok &= abs(dstc[i]-dsta[i]) <= (abs(dstc[i])>512) || fabs((double)dstc[i]/dsta[i]-1) < 1e-6;
+        report( "mbtree propagate :" );
+    }
+
     return ret;
 }
 
@@ -981,9 +1150,9 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    DECLARE_ALIGNED_16( int16_t dct1[64] );
-    DECLARE_ALIGNED_16( int16_t dct2[64] );
-    DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
+    ALIGNED_16( int16_t dct1[64] );
+    ALIGNED_16( int16_t dct2[64] );
+    ALIGNED_16( uint8_t cqm_buf[64] );
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
     int i, j, i_cqm, qp;
@@ -992,6 +1161,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     memset( h, 0, sizeof(*h) );
     h->pps = h->pps_array;
     x264_param_default( &h->param );
+    h->chroma_qp_table = i_chroma_qp_table + 12;
     h->param.rc.i_qp_min = 26;
     h->param.analyse.b_transform_8x8 = 1;
 
@@ -1030,25 +1200,21 @@ static int check_quant( int cpu_ref, int cpu_new )
 #define INIT_QUANT8() \
         { \
             static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
-            int x, y; \
-            for( y = 0; y < 8; y++ ) \
-                for( x = 0; x < 8; x++ ) \
-                { \
-                    unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
-                    dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \
-                } \
+            for( i = 0; i < 64; i++ ) \
+            { \
+                unsigned int scale = (255*scale1d[i>>3]*scale1d[i&7])/16; \
+                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+            } \
         }
 
 #define INIT_QUANT4() \
         { \
             static const int scale1d[4] = {4,6,4,6}; \
-            int x, y; \
-            for( y = 0; y < 4; y++ ) \
-                for( x = 0; x < 4; x++ ) \
-                { \
-                    unsigned int scale = 255*scale1d[y]*scale1d[x]; \
-                    dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \
-                } \
+            for( i = 0; i < 16; i++ ) \
+            { \
+                unsigned int scale = 255*scale1d[i>>2]*scale1d[i&3]; \
+                dct1[i] = dct2[i] = j ? (rand()%(2*scale+1))-scale : 0; \
+            } \
         }
 
 #define TEST_QUANT_DC( name, cqm ) \
@@ -1063,16 +1229,16 @@ static int check_quant( int cpu_ref, int cpu_new )
                     int result_c, result_a; \
                     for( i = 0; i < 16; i++ ) \
                         dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
-                    result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                     if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a )       \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
                         break; \
                     } \
-                    call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
+                    call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                 } \
             } \
         }
@@ -1088,16 +1254,16 @@ static int check_quant( int cpu_ref, int cpu_new )
                 { \
                     int result_c, result_a; \
                     INIT_QUANT##w() \
-                    result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                     if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                         break; \
                     } \
-                    call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    call_c2( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                    call_a2( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                 } \
             } \
         }
@@ -1118,18 +1284,18 @@ static int check_quant( int cpu_ref, int cpu_new )
             for( qp = 51; qp > 0; qp-- ) \
             { \
                 INIT_QUANT##w() \
-                call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+                call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                 memcpy( dct2, dct1, w*w*2 ); \
-                call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-                call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                 if( memcmp( dct1, dct2, w*w*2 ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                     break; \
                 } \
-                call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-                call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
             } \
         }
 
@@ -1147,17 +1313,17 @@ static int check_quant( int cpu_ref, int cpu_new )
             { \
                 for( i = 0; i < 16; i++ ) \
                     dct1[i] = rand(); \
-                call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
+                call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
                 memcpy( dct2, dct1, w*w*2 ); \
-                call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-                call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                 if( memcmp( dct1, dct2, w*w*2 ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                 } \
-                call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
-                call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
+                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
+                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
             } \
         }
 
@@ -1205,8 +1371,8 @@ static int check_quant( int cpu_ref, int cpu_new )
                 dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
             if( ac ) \
                 dct1[0] = 0; \
-            result_c = call_c( qf_c.decname, (void*)dct1 ); \
-            result_a = call_a( qf_a.decname, (void*)dct1 ); \
+            result_c = call_c( qf_c.decname, dct1 ); \
+            result_a = call_a( qf_a.decname, dct1 ); \
             if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
             { \
                 ok = 0; \
@@ -1236,8 +1402,8 @@ static int check_quant( int cpu_ref, int cpu_new )
                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
             if( !nnz ) \
                 dct1[ac] = 1; \
-            result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
-            result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
+            result_c = call_c( qf_c.last, dct1+ac ); \
+            result_a = call_a( qf_a.last, dct1+ac ); \
             if( result_c != result_a ) \
             { \
                 ok = 0; \
@@ -1271,8 +1437,8 @@ static int check_quant( int cpu_ref, int cpu_new )
                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
             if( !nnz ) \
                 dct1[ac] = 1; \
-            result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
-            result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
+            result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
+            result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
                 memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
                 memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
@@ -1297,8 +1463,8 @@ static int check_intra( int cpu_ref, int cpu_new )
 {
     int ret = 0, ok = 1, used_asm = 0;
     int i;
-    DECLARE_ALIGNED_16( uint8_t edge[33] );
-    DECLARE_ALIGNED_16( uint8_t edge2[33] );
+    ALIGNED_16( uint8_t edge[33] );
+    ALIGNED_16( uint8_t edge2[33] );
     struct
     {
         x264_predict_t      predict_16x16[4+3];
@@ -1502,6 +1668,13 @@ static int check_all_flags( void )
         fprintf( stderr, "x264: ALTIVEC against C\n" );
         ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
     }
+#elif ARCH_ARM
+    if( x264_cpu_detect() & X264_CPU_ARMV6 )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
+    if( x264_cpu_detect() & X264_CPU_NEON )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
+    if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
 #endif
     return ret;
 }
@@ -1513,7 +1686,7 @@ int main(int argc, char *argv[])
 
     if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
     {
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
         fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
         return 1;
 #endif
@@ -1532,6 +1705,11 @@ int main(int argc, char *argv[])
     srand( i );
 
     buf1 = x264_malloc( 0x3e00 + 16*BENCH_ALIGNS );
+    if( !buf1 )
+    {
+        fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
+        return -1;
+    }
     buf2 = buf1 + 0xf00;
     buf3 = buf2 + 0xf00;
     buf4 = buf3 + 0x1000;
diff --git a/x264.c b/x264.c
index 9aafc35..58bc1f4 100644
--- a/x264.c
+++ b/x264.c
@@ -5,6 +5,8 @@
  *
  * Authors: Loren Merritt <lorenm at u.washington.edu>
  *          Laurent Aimar <fenrir at via.ecp.fr>
+ *          Steven Walters <kemuri9 at gmail.com>
+ *          Kieran Kunhya <kieran at kunhya.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,19 +35,12 @@
 #include "x264.h"
 #include "muxers.h"
 
-#ifndef _MSC_VER
-#include "config.h"
-#endif
-
 #ifdef _WIN32
 #include <windows.h>
 #else
 #define SetConsoleTitle(t)
 #endif
 
-uint8_t *mux_buffer = NULL;
-int mux_buffer_size = 0;
-
 /* Ctrl-C handler */
 static int     b_ctrl_c = 0;
 static int     b_exit_on_ctrl_c = 0;
@@ -64,24 +59,43 @@ typedef struct {
     FILE *qpfile;
 } cli_opt_t;
 
-/* input file operation function pointers */
-int (*p_open_infile)( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param );
-int (*p_get_frame_total)( hnd_t handle );
-int (*p_read_frame)( x264_picture_t *p_pic, hnd_t handle, int i_frame );
-int (*p_close_infile)( hnd_t handle );
+/* i/o file operation function pointer structs */
+cli_input_t input;
+static cli_output_t output;
 
-/* output file operation function pointers */
-static int (*p_open_outfile)( char *psz_filename, hnd_t *p_handle );
-static int (*p_set_outfile_param)( hnd_t handle, x264_param_t *p_param );
-static int (*p_write_nalu)( hnd_t handle, uint8_t *p_nal, int i_size );
-static int (*p_set_eop)( hnd_t handle, x264_picture_t *p_picture );
-static int (*p_close_outfile)( hnd_t handle );
+static const char * const demuxer_names[] =
+{
+    "auto",
+    "yuv",
+    "y4m",
+#ifdef AVS_INPUT
+    "avs",
+#endif
+#ifdef LAVF_INPUT
+    "lavf",
+#endif
+#ifdef FFMS_INPUT
+    "ffms",
+#endif
+    0
+};
 
-static void Help( x264_param_t *defaults, int b_longhelp );
+static const char * const muxer_names[] =
+{
+    "auto",
+    "raw",
+    "mkv",
+    "flv",
+#ifdef MP4_OUTPUT
+    "mp4",
+#endif
+    0
+};
+
+static void Help( x264_param_t *defaults, int longhelp );
 static int  Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
 static int  Encode( x264_param_t *param, cli_opt_t *opt );
 
-
 /****************************************************************************
  * main:
  ****************************************************************************/
@@ -126,31 +140,59 @@ static char const *strtable_lookup( const char * const table[], int index )
     return ( ( index >= 0 && index < i ) ? table[ index ] : "???" );
 }
 
+static char *stringify_names( char *buf, const char * const names[] )
+{
+    int i = 0;
+    char *p = buf;
+    for( p[0] = 0; names[i]; i++ )
+    {
+        p += sprintf( p, "%s", names[i] );
+        if( names[i+1] )
+            p += sprintf( p, ", " );
+    }
+    return buf;
+}
+
 /*****************************************************************************
  * Help:
  *****************************************************************************/
-static void Help( x264_param_t *defaults, int b_longhelp )
+static void Help( x264_param_t *defaults, int longhelp ) /* prints usage; longhelp picks verbosity: H0 lines always print, H1 lines need >= 1, H2 lines need == 2 */
 {
+    char buf[50]; /* scratch space for stringify_names() in the --muxer / --demuxer lines below */
 #define H0 printf
-#define H1 if(b_longhelp) printf
+#define H1 if(longhelp>=1) printf /* expands to a bare if, so a following "else" pairs with it */
+#define H2 if(longhelp==2) printf /* same bare-if trick; used below as H2( ... ); else H0( ... ); */
     H0( "x264 core:%d%s\n"
         "Syntax: x264 [options] -o outfile infile [widthxheight]\n"
         "\n"
         "Infile can be raw YUV 4:2:0 (in which case resolution is required),\n"
         "  or YUV4MPEG 4:2:0 (*.y4m),\n"
-        "  or AVI or Avisynth if compiled with AVIS support (%s).\n"
+        "  or Avisynth if compiled with support (%s).\n"
+        "  or libav* formats if compiled with lavf support (%s) or ffms support (%s).\n"
         "Outfile type is selected by filename:\n"
         " .264 -> Raw bytestream\n"
         " .mkv -> Matroska\n"
+        " .flv -> Flash Video\n"
         " .mp4 -> MP4 if compiled with GPAC support (%s)\n"
         "\n"
         "Options:\n"
         "\n"
-        "  -h, --help                  List the more commonly used options\n"
-        "      --longhelp              List all options\n"
+        "  -h, --help                  List basic options\n"
+        "      --longhelp              List more options\n"
+        "      --fullhelp              List all options\n"
         "\n",
         X264_BUILD, X264_VERSION,
-#ifdef AVIS_INPUT
+#ifdef AVS_INPUT
+        "yes",
+#else
+        "no",
+#endif
+#ifdef LAVF_INPUT
+        "yes",
+#else
+        "no",
+#endif
+#ifdef FFMS_INPUT
         "yes",
 #else
         "no",
@@ -161,333 +203,844 @@ static void Help( x264_param_t *defaults, int b_longhelp )
         "no"
 #endif
       );
+    H0( "Example usage:\n" );
+    H0( "\n" );
+    H0( "      Constant quality mode:\n" );
+    H0( "            x264 --crf 24 -o <output> <input>\n" );
+    H0( "\n" );
+    H0( "      Two-pass with a bitrate of 1000kbps:\n" );
+    H0( "            x264 --pass 1 --bitrate 1000 -o <output> <input>\n" );
+    H0( "            x264 --pass 2 --bitrate 1000 -o <output> <input>\n" );
+    H0( "\n" );
+    H0( "      Lossless:\n" );
+    H0( "            x264 --crf 0 -o <output> <input>\n" );
+    H0( "\n" );
+    H0( "      Maximum PSNR at the cost of speed and visual quality:\n" );
+    H0( "            x264 --preset placebo --tune psnr -o <output> <input>\n" );
+    H0( "\n" );
+    H0( "      Constant bitrate at 1000kbps with a 2 second-buffer:\n");
+    H0( "            x264 --vbv-bufsize 2000 --bitrate 1000 -o <output> <input>\n" );
+    H0( "\n" );
+    H0( "Presets:\n" );
+    H0( "\n" );
+    H0( "      --profile               Force the limits of an H.264 profile [high]\n"
+        "                                  Overrides all settings.\n" );
+    H2( "                                  - baseline:\n"
+        "                                    --no-8x8dct --bframes 0 --no-cabac\n"
+        "                                    --cqm flat --weightp 0 No interlaced\n"
+        "                                    No lossless\n"
+        "                                  - main:\n"
+        "                                    --no-8x8dct --cqm flat No lossless\n"
+        "                                  - high:\n"
+        "                                    No lossless\n" );
+        else H0( "                                  - baseline,main,high\n" ); /* binds to the if inside the preceding H2: short list whenever longhelp != 2 */
+    H0( "      --preset                Use a preset to select encoding settings [medium]\n"
+        "                                  Overridden by user settings.\n" );
+    H2( "                                  - ultrafast:\n"
+        "                                    --no-8x8dct --aq-mode 0 --b-adapt 0\n"
+        "                                    --bframes 0 --no-cabac --no-deblock\n"
+        "                                    --no-mbtree --me dia --no-mixed-refs\n"
+        "                                    --partitions none --ref 1 --scenecut 0\n"
+        "                                    --subme 0 --trellis 0 --no-weightb\n"
+        "                                    --weightp 0\n"
+        "                                  - veryfast:\n"
+        "                                    --no-mbtree --me dia --no-mixed-refs\n"
+        "                                    --partitions i8x8,i4x4 --ref 1\n"
+        "                                    --subme 1 --trellis 0 --weightp 0\n"
+        "                                  - faster:\n"
+        "                                    --no-mbtree --no-mixed-refs --ref 2\n"
+        "                                    --subme 4 --weightp 1\n"
+        "                                  - fast\n"
+        "                                    --rc-lookahead 30 --ref 2 --subme 6\n"
+        "                                  - medium\n"
+        "                                    Default settings apply.\n"
+        "                                  - slow\n"
+        "                                    --b-adapt 2 --direct auto --me umh\n"
+        "                                    --rc-lookahead 50 --ref 5 --subme 8\n"
+        "                                  - slower\n"
+        "                                    --b-adapt 2 --direct auto --me umh\n"
+        "                                    --partitions all --rc-lookahead 60\n"
+        "                                    --ref 8 --subme 9 --trellis 2\n"
+        "                                  - veryslow\n"
+        "                                    --b-adapt 2 --bframes 8 --direct auto\n"
+        "                                    --me umh --merange 24 --partitions all\n"
+        "                                    --ref 16 --subme 10 --trellis 2\n"
+        "                                    --rc-lookahead 60\n"
+        "                                  - placebo\n"
+        "                                    --bframes 16 --b-adapt 2 --direct auto\n"
+        "                                    --slow-firstpass --no-fast-pskip\n"
+        "                                    --me tesa --merange 24 --partitions all\n"
+        "                                    --rc-lookahead 60 --ref 16 --subme 10\n"
+        "                                    --trellis 2\n" );
+    else H0( "                                  - ultrafast,veryfast,faster,fast,medium\n"
+             "                                  - slow,slower,veryslow,placebo\n" );
+    H0( "      --tune                  Tune the settings for a particular type of source\n"
+        "                              or situation\n"
+        "                                  Overridden by user settings.\n"
+        "                                  Multiple tunings are separated by commas.\n"
+        "                                  Only one psy tuning can be used at a time.\n" );
+    H2( "                                  - film (psy tuning):\n"
+        "                                    --deblock -1:-1 --psy-rd <unset>:0.15\n"
+        "                                  - animation(psy tuning):\n"
+        "                                    --bframes {+2} --deblock 1:1\n"
+        "                                    --psy-rd 0.4:<unset> --aq-strength 0.6\n"
+        "                                    --ref {Double if >1 else 1}\n"
+        "                                  - grain(psy tuning):\n"
+        "                                    --aq-strength 0.5 --no-dct-decimate\n"
+        "                                    --deadzone-inter 6 --deadzone-intra 6\n"
+        "                                    --deblock -2:-2 --ipratio 1.1 \n"
+        "                                    --pbratio 1.1 --psy-rd <unset>:0.25\n"
+        "                                    --qcomp 0.8\n"
+        "                                  - psnr(psy tuning):\n"
+        "                                    --aq-mode 0 --no-psy\n"
+        "                                  - ssim(psy tuning):\n"
+        "                                    --aq-mode 2 --no-psy\n"
+        "                                  - fastdecode:\n"
+        "                                    --no-cabac --no-deblock --no-weightb\n"
+        "                                    --weightp 0\n"
+        "                                  - zerolatency:\n"
+        "                                    --bframes 0 --rc-lookahead 0\n"
+        "                                    --sync-lookahead 0 --sliced-threads\n"
+        "                                  - touhou(psy tuning):\n"
+        "                                    --aq-strength 1.3 --deblock -1:-1\n"
+        "                                    --partitions {p4x4 if p8x8 set}\n"
+        "                                    --psy-rd <unset>:0.2\n"
+        "                                    --ref {Double if >1 else 1}\n" );
+    else H0( "                                  - psy tunings: film,animation,grain,psnr,ssim\n"
+             "                                  - other tunings: fastdecode,zerolatency\n" );
+    H1( "      --slow-firstpass        Don't use faster settings with --pass 1\n" );
+    H0( "\n" );
     H0( "Frame-type options:\n" );
     H0( "\n" );
     H0( "  -I, --keyint <integer>      Maximum GOP size [%d]\n", defaults->i_keyint_max );
-    H1( "  -i, --min-keyint <integer>  Minimum GOP size [%d]\n", defaults->i_keyint_min );
-    H1( "      --no-scenecut           Disable adaptive I-frame decision\n" );
-    H1( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
-    H0( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
-    H1( "      --b-adapt               Adaptive B-frame decision method [%d]\n"
+    H2( "  -i, --min-keyint <integer>  Minimum GOP size [%d]\n", defaults->i_keyint_min );
+    H2( "      --no-scenecut           Disable adaptive I-frame decision\n" );
+    H2( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
+    H2( "      --intra-refresh         Use Periodic Intra Refresh instead of IDR frames\n" );
+    H1( "  -b, --bframes <integer>     Number of B-frames between I and P [%d]\n", defaults->i_bframe );
+    H1( "      --b-adapt <integer>     Adaptive B-frame decision method [%d]\n"
         "                                  Higher values may lower threading efficiency.\n"
         "                                  - 0: Disabled\n"
         "                                  - 1: Fast\n"
         "                                  - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive );
-    H1( "      --b-bias <integer>      Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
-    H0( "      --b-pyramid             Keep some B-frames as references\n" );
-    H0( "      --no-cabac              Disable CABAC\n" );
-    H0( "  -r, --ref <integer>         Number of reference frames [%d]\n", defaults->i_frame_reference );
+    H2( "      --b-bias <integer>      Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias );
+    H1( "      --b-pyramid <string>    Keep some B-frames as references [%s]\n"
+        "                                  - none: Disabled\n"
+        "                                  - strict: Strictly hierarchical pyramid\n"
+        "                                  - normal: Non-strict (not Blu-ray compatible)\n",
+        strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
+    H1( "      --no-cabac              Disable CABAC\n" );
+    H1( "  -r, --ref <integer>         Number of reference frames [%d]\n", defaults->i_frame_reference );
     H1( "      --no-deblock            Disable loop filter\n" );
-    H0( "  -f, --deblock <alpha:beta>  Loop filter AlphaC0 and Beta parameters [%d:%d]\n",
+    H1( "  -f, --deblock <alpha:beta>  Loop filter parameters [%d:%d]\n",
                                        defaults->i_deblocking_filter_alphac0, defaults->i_deblocking_filter_beta );
+    H2( "      --slices <integer>      Number of slices per frame; forces rectangular\n"
+        "                              slices and is overridden by other slicing options\n" );
+    else H1( "      --slices <integer>      Number of slices per frame\n" ); /* becomes else-if: short form prints only when longhelp == 1 */
+    H2( "      --slice-max-size <integer> Limit the size of each slice in bytes\n");
+    H2( "      --slice-max-mbs <integer> Limit the size of each slice in macroblocks\n");
     H0( "      --interlaced            Enable pure-interlaced mode\n" );
+    H2( "      --constrained-intra     Enable constrained intra prediction.\n" );
     H0( "\n" );
     H0( "Ratecontrol:\n" );
     H0( "\n" );
-    H0( "  -q, --qp <integer>          Set QP (0=lossless) [%d]\n", defaults->rc.i_qp_constant );
+    H1( "  -q, --qp <integer>          Force constant QP (0-51, 0=lossless)\n" );
     H0( "  -B, --bitrate <integer>     Set bitrate (kbit/s)\n" );
-    H0( "      --crf <float>           Quality-based VBR (nominal QP)\n" );
-    H1( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
-    H0( "      --vbv-bufsize <integer> Enable CBR and set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
-    H1( "      --vbv-init <float>      Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
-    H1( "      --qpmin <integer>       Set min QP [%d]\n", defaults->rc.i_qp_min );
-    H1( "      --qpmax <integer>       Set max QP [%d]\n", defaults->rc.i_qp_max );
-    H1( "      --qpstep <integer>      Set max QP step [%d]\n", defaults->rc.i_qp_step );
-    H0( "      --ratetol <float>       Allowed variance of average bitrate [%.1f]\n", defaults->rc.f_rate_tolerance );
-    H0( "      --ipratio <float>       QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
-    H0( "      --pbratio <float>       QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
-    H1( "      --chroma-qp-offset <integer>  QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
-    H1( "      --aq-mode <integer>     AQ method [%d]\n"
+    H0( "      --crf <float>           Quality-based VBR (0-51, 0=lossless) [%.1f]\n", defaults->rc.f_rf_constant );
+    H1( "      --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
+    H0( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
+    H0( "      --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
+    H2( "      --vbv-init <float>      Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init );
+    H2( "      --qpmin <integer>       Set min QP [%d]\n", defaults->rc.i_qp_min );
+    H2( "      --qpmax <integer>       Set max QP [%d]\n", defaults->rc.i_qp_max );
+    H2( "      --qpstep <integer>      Set max QP step [%d]\n", defaults->rc.i_qp_step );
+    H2( "      --ratetol <float>       Tolerance of ABR ratecontrol and VBV [%.1f]\n", defaults->rc.f_rate_tolerance );
+    H2( "      --ipratio <float>       QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
+    H2( "      --pbratio <float>       QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
+    H2( "      --chroma-qp-offset <integer>  QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
+    H2( "      --aq-mode <integer>     AQ method [%d]\n"
         "                                  - 0: Disabled\n"
-        "                                  - 1: Variance AQ (complexity mask)\n", defaults->rc.i_aq_mode );
-    H0( "      --aq-strength <float>   Reduces blocking and blurring in flat and\n"
-        "                              textured areas. [%.1f]\n"
-        "                                  - 0.5: weak AQ\n"
-        "                                  - 1.5: strong AQ\n", defaults->rc.f_aq_strength );
-    H0( "\n" );
-    H0( "  -p, --pass <1|2|3>          Enable multipass ratecontrol\n"
+        "                                  - 1: Variance AQ (complexity mask)\n"
+        "                                  - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode );
+    H1( "      --aq-strength <float>   Reduces blocking and blurring in flat and\n"
+        "                              textured areas. [%.1f]\n", defaults->rc.f_aq_strength );
+    H1( "\n" );
+    H0( "  -p, --pass <integer>        Enable multipass ratecontrol\n"
         "                                  - 1: First pass, creates stats file\n"
-        "                                  - 2: Last pass, does not overwrite stats file\n"
-        "                                  - 3: Nth pass, overwrites stats file\n" );
-    H0( "      --stats <string>        Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
-    H0( "      --qcomp <float>         QP curve compression: 0.0 => CBR, 1.0 => CQP [%.2f]\n", defaults->rc.f_qcompress );
-    H1( "      --cplxblur <float>      Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
-    H1( "      --qblur <float>         Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
-    H0( "      --zones <zone0>/<zone1>/...  Tweak the bitrate of some regions of the video\n" );
-    H1( "                              Each zone is of the form\n"
+        "                                  - 2: Last pass, does not overwrite stats file\n" );
+    H2( "                                  - 3: Nth pass, overwrites stats file\n" );
+    H1( "      --stats <string>        Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out );
+    H2( "      --no-mbtree             Disable mb-tree ratecontrol.\n");
+    H2( "      --qcomp <float>         QP curve compression [%.2f]\n", defaults->rc.f_qcompress );
+    H2( "      --cplxblur <float>      Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur );
+    H2( "      --qblur <float>         Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur );
+    H2( "      --zones <zone0>/<zone1>/...  Tweak the bitrate of regions of the video\n" );
+    H2( "                              Each zone is of the form\n"
         "                                  <start frame>,<end frame>,<option>\n"
         "                                  where <option> is either\n"
         "                                      q=<integer> (force QP)\n"
         "                                  or  b=<float> (bitrate multiplier)\n" );
-    H1( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
+    H2( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
         "                              Format of each line: framenumber frametype QP\n"
-        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
-    H0( "\n" );
-    H0( "Analysis:\n" );
-    H0( "\n" );
-    H0( "  -A, --partitions <string>   Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
+        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
+        "                              QPs are restricted by qpmin/qpmax.\n" );
+    H1( "\n" );
+    H1( "Analysis:\n" );
+    H1( "\n" );
+    H1( "  -A, --partitions <string>   Partitions to consider [\"p8x8,b8x8,i8x8,i4x4\"]\n"
         "                                  - p8x8, p4x4, b8x8, i8x8, i4x4\n"
         "                                  - none, all\n"
         "                                  (p4x4 requires p8x8. i8x8 requires --8x8dct.)\n" );
-    H0( "      --direct <string>       Direct MV prediction mode [\"%s\"]\n"
+    H1( "      --direct <string>       Direct MV prediction mode [\"%s\"]\n"
         "                                  - none, spatial, temporal, auto\n",
                                        strtable_lookup( x264_direct_pred_names, defaults->analyse.i_direct_mv_pred ) );
-    H0( "  -w, --weightb               Weighted prediction for B-frames\n" );
-    H0( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
+    H2( "      --no-weightb            Disable weighted prediction for B-frames\n" );
+    H1( "      --weightp <integer>     Weighted prediction for P-frames [%d]\n"
+        "                              - 0: Disabled\n"
+        "                              - 1: Blind offset\n"
+        "                              - 2: Smart analysis\n", defaults->analyse.i_weighted_pred );
+    H1( "      --me <string>           Integer pixel motion estimation method [\"%s\"]\n",
                                        strtable_lookup( x264_motion_est_names, defaults->analyse.i_me_method ) );
-    H1( "                                  - dia: diamond search, radius 1 (fast)\n"
+    H2( "                                  - dia: diamond search, radius 1 (fast)\n"
         "                                  - hex: hexagonal search, radius 2\n"
         "                                  - umh: uneven multi-hexagon search\n"
         "                                  - esa: exhaustive search\n"
         "                                  - tesa: hadamard exhaustive search (slow)\n" );
-    else H0( "                                  - dia, hex, umh\n" );
-    H0( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
-    H1( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
-    H1( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
-    H0( "  -m, --subme <integer>       Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
-    H1( "                                  - 0: fullpel only (not recommended)\n"
+    else H1( "                                  - dia, hex, umh\n" );
+    H2( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
+    H2( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
+    H2( "      --mvrange-thread <int>  Minimum buffer between threads [-1 (auto)]\n" );
+    H1( "  -m, --subme <integer>       Subpixel motion estimation and mode decision [%d]\n", defaults->analyse.i_subpel_refine );
+    H2( "                                  - 0: fullpel only (not recommended)\n"
         "                                  - 1: SAD mode decision, one qpel iteration\n"
         "                                  - 2: SATD mode decision\n"
         "                                  - 3-5: Progressively more qpel\n"
         "                                  - 6: RD mode decision for I/P-frames\n"
         "                                  - 7: RD mode decision for all frames\n"
         "                                  - 8: RD refinement for I/P-frames\n"
-        "                                  - 9: RD refinement for all frames\n" );
-    else H0( "                                  decision quality: 1=fast, 9=best.\n"  );
-    H0( "      --psy-rd                Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
+        "                                  - 9: RD refinement for all frames\n"
+        "                                  - 10: QP-RD - requires trellis=2, aq-mode>0\n" );
+    else H1( "                                  decision quality: 1=fast, 10=best.\n"  );
+    H1( "      --psy-rd                Strength of psychovisual optimization [\"%.1f:%.1f\"]\n"
         "                                  #1: RD (requires subme>=6)\n"
         "                                  #2: Trellis (requires trellis, experimental)\n",
                                        defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis );
-    H0( "      --mixed-refs            Decide references on a per partition basis\n" );
-    H1( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
-    H0( "  -8, --8x8dct                Adaptive spatial transform size\n" );
-    H0( "  -t, --trellis <integer>     Trellis RD quantization. Requires CABAC. [%d]\n"
+    H2( "      --no-psy                Disable all visual optimizations that worsen\n"
+        "                              both PSNR and SSIM.\n" );
+    H2( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
+    H2( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
+    H1( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
+    H1( "  -t, --trellis <integer>     Trellis RD quantization. Requires CABAC. [%d]\n"
         "                                  - 0: disabled\n"
         "                                  - 1: enabled only on the final encode of a MB\n"
         "                                  - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
-    H0( "      --no-fast-pskip         Disables early SKIP detection on P-frames\n" );
-    H0( "      --no-dct-decimate       Disables coefficient thresholding on P-frames\n" );
-    H0( "      --nr <integer>          Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
-    H1( "\n" );
-    H1( "      --deadzone-inter <int>  Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
-    H1( "      --deadzone-intra <int>  Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
-    H1( "                                  Deadzones should be in the range 0 - 32.\n" );
-    H1( "      --cqm <string>          Preset quant matrices [\"flat\"]\n"
+    H2( "      --no-fast-pskip         Disables early SKIP detection on P-frames\n" );
+    H2( "      --no-dct-decimate       Disables coefficient thresholding on P-frames\n" );
+    H1( "      --nr <integer>          Noise reduction [%d]\n", defaults->analyse.i_noise_reduction );
+    H2( "\n" );
+    H2( "      --deadzone-inter <int>  Set the size of the inter luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[0] );
+    H2( "      --deadzone-intra <int>  Set the size of the intra luma quantization deadzone [%d]\n", defaults->analyse.i_luma_deadzone[1] );
+    H2( "                                  Deadzones should be in the range 0 - 32.\n" );
+    H2( "      --cqm <string>          Preset quant matrices [\"flat\"]\n"
         "                                  - jvt, flat\n" );
-    H0( "      --cqmfile <string>      Read custom quant matrices from a JM-compatible file\n" );
-    H1( "                                  Overrides any other --cqm* options.\n" );
-    H1( "      --cqm4 <list>           Set all 4x4 quant matrices\n"
+    H1( "      --cqmfile <string>      Read custom quant matrices from a JM-compatible file\n" );
+    H2( "                                  Overrides any other --cqm* options.\n" );
+    H2( "      --cqm4 <list>           Set all 4x4 quant matrices\n"
         "                                  Takes a comma-separated list of 16 integers.\n" );
-    H1( "      --cqm8 <list>           Set all 8x8 quant matrices\n"
+    H2( "      --cqm8 <list>           Set all 8x8 quant matrices\n"
         "                                  Takes a comma-separated list of 64 integers.\n" );
-    H1( "      --cqm4i, --cqm4p, --cqm8i, --cqm8p\n"
+    H2( "      --cqm4i, --cqm4p, --cqm8i, --cqm8p\n"
         "                              Set both luma and chroma quant matrices\n" );
-    H1( "      --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc\n"
+    H2( "      --cqm4iy, --cqm4ic, --cqm4py, --cqm4pc\n"
         "                              Set individual quant matrices\n" );
-    H1( "\n" );
-    H1( "Video Usability Info (Annex E):\n" );
-    H1( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
-    H1( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
-    H1( "\n" );
-    H1( "      --overscan <string>     Specify crop overscan setting [\"%s\"]\n"
+    H2( "\n" );
+    H2( "Video Usability Info (Annex E):\n" );
+    H2( "The VUI settings are not used by the encoder but are merely suggestions to\n" );
+    H2( "the playback equipment. See doc/vui.txt for details. Use at your own risk.\n" );
+    H2( "\n" );
+    H2( "      --overscan <string>     Specify crop overscan setting [\"%s\"]\n"
         "                                  - undef, show, crop\n",
                                        strtable_lookup( x264_overscan_names, defaults->vui.i_overscan ) );
-    H1( "      --videoformat <string>  Specify video format [\"%s\"]\n"
+    H2( "      --videoformat <string>  Specify video format [\"%s\"]\n"
         "                                  - component, pal, ntsc, secam, mac, undef\n",
                                        strtable_lookup( x264_vidformat_names, defaults->vui.i_vidformat ) );
-    H1( "      --fullrange <string>    Specify full range samples setting [\"%s\"]\n"
+    H2( "      --fullrange <string>    Specify full range samples setting [\"%s\"]\n"
         "                                  - off, on\n",
                                        strtable_lookup( x264_fullrange_names, defaults->vui.b_fullrange ) );
-    H1( "      --colorprim <string>    Specify color primaries [\"%s\"]\n"
+    H2( "      --colorprim <string>    Specify color primaries [\"%s\"]\n"
         "                                  - undef, bt709, bt470m, bt470bg\n"
         "                                    smpte170m, smpte240m, film\n",
                                        strtable_lookup( x264_colorprim_names, defaults->vui.i_colorprim ) );
-    H1( "      --transfer <string>     Specify transfer characteristics [\"%s\"]\n"
+    H2( "      --transfer <string>     Specify transfer characteristics [\"%s\"]\n"
         "                                  - undef, bt709, bt470m, bt470bg, linear,\n"
         "                                    log100, log316, smpte170m, smpte240m\n",
                                        strtable_lookup( x264_transfer_names, defaults->vui.i_transfer ) );
-    H1( "      --colormatrix <string>  Specify color matrix setting [\"%s\"]\n"
+    H2( "      --colormatrix <string>  Specify color matrix setting [\"%s\"]\n"
         "                                  - undef, bt709, fcc, bt470bg\n"
         "                                    smpte170m, smpte240m, GBR, YCgCo\n",
                                        strtable_lookup( x264_colmatrix_names, defaults->vui.i_colmatrix ) );
-    H1( "      --chromaloc <integer>   Specify chroma sample location (0 to 5) [%d]\n",
+    H2( "      --chromaloc <integer>   Specify chroma sample location (0 to 5) [%d]\n",
                                        defaults->vui.i_chroma_loc );
     H0( "\n" );
     H0( "Input/Output:\n" );
     H0( "\n" );
     H0( "  -o, --output                Specify output file\n" );
+    H1( "      --muxer <string>        Specify output container format [\"%s\"]\n"
+        "                                  - %s\n", muxer_names[0], stringify_names( buf, muxer_names ) ); /* buf is fully printed by this call before the --demuxer line below overwrites it */
+    H1( "      --demuxer <string>      Specify input container format [\"%s\"]\n"
+        "                                  - %s\n", demuxer_names[0], stringify_names( buf, demuxer_names ) ); /* reuses the same 50-byte buf; the joined demuxer list must fit in it */
+    H1( "      --index <string>        Filename for input index file\n" );
     H0( "      --sar width:height      Specify Sample Aspect Ratio\n" );
     H0( "      --fps <float|rational>  Specify framerate\n" );
     H0( "      --seek <integer>        First frame to encode\n" );
     H0( "      --frames <integer>      Maximum number of frames to encode\n" );
     H0( "      --level <string>        Specify level (as defined by Annex A)\n" );
-    H0( "\n" );
-    H0( "  -v, --verbose               Print stats for each frame\n" );
-    H0( "      --progress              Show a progress indicator while encoding\n" );
+    H1( "\n" );
+    H1( "  -v, --verbose               Print stats for each frame\n" );
+    H1( "      --no-progress           Don't show the progress indicator while encoding\n" );
     H0( "      --quiet                 Quiet Mode\n" );
-    H0( "      --no-psnr               Disable PSNR computation\n" );
-    H0( "      --no-ssim               Disable SSIM computation\n" );
-    H0( "      --threads <integer>     Parallel encoding\n" );
-    H0( "      --thread-input          Run Avisynth in its own thread\n" );
-    H1( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
-    H1( "      --asm <integer>         Override CPU detection\n" );
-    H1( "      --no-asm                Disable all CPU optimizations\n" );
-    H1( "      --visualize             Show MB types overlayed on the encoded video\n" );
-    H1( "      --dump-yuv <string>     Save reconstructed frames\n" );
-    H1( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
-    H1( "      --aud                   Use access unit delimiters\n" );
+    H1( "      --psnr                  Enable PSNR computation\n" );
+    H1( "      --ssim                  Enable SSIM computation\n" );
+    H1( "      --threads <integer>     Force a specific number of threads\n" );
+    H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
+    H2( "      --thread-input          Run Avisynth in its own thread\n" );
+    H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
+    H2( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
+    H2( "      --asm <integer>         Override CPU detection\n" );
+    H2( "      --no-asm                Disable all CPU optimizations\n" );
+    H2( "      --visualize             Show MB types overlayed on the encoded video\n" );
+    H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
+    H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
+    H2( "      --aud                   Use access unit delimiters\n" );
+    H2( "      --force-cfr             Force constant framerate timestamp generation\n" );
     H0( "\n" );
 }
 
+#define OPT_FRAMES 256 /* OPT_*: getopt_long() return values for long options without a short letter; numbered from 256 up */
+#define OPT_SEEK 257
+#define OPT_QPFILE 258
+#define OPT_THREAD_INPUT 259
+#define OPT_QUIET 260
+#define OPT_NOPROGRESS 261 /* Parse() clears opt->b_progress, which it sets to 1 by default */
+#define OPT_VISUALIZE 262
+#define OPT_LONGHELP 263 /* Help( &defaults, 1 ) */
+#define OPT_PROFILE 264
+#define OPT_PRESET 265 /* applied in Parse()'s first pass over argv; ignored in the main pass */
+#define OPT_TUNE 266 /* applied in Parse()'s second pass over argv; ignored in the main pass */
+#define OPT_SLOWFIRSTPASS 267 /* Parse() sets b_turbo = 0 */
+#define OPT_FULLHELP 268 /* Help( &defaults, 2 ) */
+#define OPT_FPS 269
+#define OPT_MUXER 270 /* shared by --muxer and --stdout */
+#define OPT_DEMUXER 271 /* shared by --demuxer and --stdin */
+#define OPT_INDEX 272
+#define OPT_INTERLACED 273 /* shared by --interlaced and --no-interlaced */
+
+static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw"; /* ':' after a letter = takes an argument. NOTE(review): '8' is listed but no long_options entry has val '8' - confirm -8 is still handled */
+static struct option long_options[] = /* getopt_long() table: { name, has_arg, flag, val }; flag is NULL throughout, so getopt_long() returns val */
+{
+    { "help",              no_argument, NULL, 'h' },
+    { "longhelp",          no_argument, NULL, OPT_LONGHELP },
+    { "fullhelp",          no_argument, NULL, OPT_FULLHELP },
+    { "version",           no_argument, NULL, 'V' },
+    { "profile",     required_argument, NULL, OPT_PROFILE },
+    { "preset",      required_argument, NULL, OPT_PRESET },
+    { "tune",        required_argument, NULL, OPT_TUNE },
+    { "slow-firstpass",    no_argument, NULL, OPT_SLOWFIRSTPASS },
+    { "bitrate",     required_argument, NULL, 'B' },
+    { "bframes",     required_argument, NULL, 'b' },
+    { "b-adapt",     required_argument, NULL, 0 }, /* val 0 (here and below): no dedicated case; presumably handled by the default: branch of Parse()'s switch */
+    { "no-b-adapt",        no_argument, NULL, 0 },
+    { "b-bias",      required_argument, NULL, 0 },
+    { "b-pyramid",   required_argument, NULL, 0 },
+    { "min-keyint",  required_argument, NULL, 'i' },
+    { "keyint",      required_argument, NULL, 'I' },
+    { "intra-refresh",     no_argument, NULL, 0 },
+    { "scenecut",    required_argument, NULL, 0 },
+    { "no-scenecut",       no_argument, NULL, 0 },
+    { "nf",                no_argument, NULL, 0 },
+    { "no-deblock",        no_argument, NULL, 0 },
+    { "filter",      required_argument, NULL, 0 },
+    { "deblock",     required_argument, NULL, 'f' },
+    { "interlaced",        no_argument, NULL, OPT_INTERLACED },
+    { "no-interlaced",     no_argument, NULL, OPT_INTERLACED },
+    { "constrained-intra", no_argument, NULL, 0 },
+    { "cabac",             no_argument, NULL, 0 },
+    { "no-cabac",          no_argument, NULL, 0 },
+    { "qp",          required_argument, NULL, 'q' },
+    { "qpmin",       required_argument, NULL, 0 },
+    { "qpmax",       required_argument, NULL, 0 },
+    { "qpstep",      required_argument, NULL, 0 },
+    { "crf",         required_argument, NULL, 0 },
+    { "rc-lookahead",required_argument, NULL, 0 },
+    { "ref",         required_argument, NULL, 'r' },
+    { "asm",         required_argument, NULL, 0 },
+    { "no-asm",            no_argument, NULL, 0 },
+    { "sar",         required_argument, NULL, 0 },
+    { "fps",         required_argument, NULL, OPT_FPS },
+    { "frames",      required_argument, NULL, OPT_FRAMES },
+    { "seek",        required_argument, NULL, OPT_SEEK },
+    { "output",      required_argument, NULL, 'o' },
+    { "muxer",       required_argument, NULL, OPT_MUXER },
+    { "demuxer",     required_argument, NULL, OPT_DEMUXER },
+    { "stdout",      required_argument, NULL, OPT_MUXER }, /* same val as --muxer, so Parse() treats the two identically */
+    { "stdin",       required_argument, NULL, OPT_DEMUXER }, /* same val as --demuxer, so Parse() treats the two identically */
+    { "index",       required_argument, NULL, OPT_INDEX },
+    { "analyse",     required_argument, NULL, 0 },
+    { "partitions",  required_argument, NULL, 'A' },
+    { "direct",      required_argument, NULL, 0 },
+    { "weightb",           no_argument, NULL, 'w' },
+    { "no-weightb",        no_argument, NULL, 0 },
+    { "weightp",     required_argument, NULL, 0 },
+    { "me",          required_argument, NULL, 0 },
+    { "merange",     required_argument, NULL, 0 },
+    { "mvrange",     required_argument, NULL, 0 },
+    { "mvrange-thread", required_argument, NULL, 0 },
+    { "subme",       required_argument, NULL, 'm' },
+    { "psy-rd",      required_argument, NULL, 0 },
+    { "no-psy",            no_argument, NULL, 0 },
+    { "psy",               no_argument, NULL, 0 },
+    { "mixed-refs",        no_argument, NULL, 0 },
+    { "no-mixed-refs",     no_argument, NULL, 0 },
+    { "no-chroma-me",      no_argument, NULL, 0 },
+    { "8x8dct",            no_argument, NULL, 0 },
+    { "no-8x8dct",         no_argument, NULL, 0 },
+    { "trellis",     required_argument, NULL, 't' },
+    { "fast-pskip",        no_argument, NULL, 0 },
+    { "no-fast-pskip",     no_argument, NULL, 0 },
+    { "no-dct-decimate",   no_argument, NULL, 0 },
+    { "aq-strength", required_argument, NULL, 0 },
+    { "aq-mode",     required_argument, NULL, 0 },
+    { "deadzone-inter", required_argument, NULL, '0' }, /* NOTE(review): val is the character '0', not 0 as on neighbouring entries - confirm intended */
+    { "deadzone-intra", required_argument, NULL, '0' }, /* NOTE(review): same '0' character value as deadzone-inter */
+    { "level",       required_argument, NULL, 0 },
+    { "ratetol",     required_argument, NULL, 0 },
+    { "vbv-maxrate", required_argument, NULL, 0 },
+    { "vbv-bufsize", required_argument, NULL, 0 },
+    { "vbv-init",    required_argument, NULL,  0 },
+    { "ipratio",     required_argument, NULL, 0 },
+    { "pbratio",     required_argument, NULL, 0 },
+    { "chroma-qp-offset", required_argument, NULL, 0 },
+    { "pass",        required_argument, NULL, 'p' },
+    { "stats",       required_argument, NULL, 0 },
+    { "qcomp",       required_argument, NULL, 0 },
+    { "mbtree",            no_argument, NULL, 0 },
+    { "no-mbtree",         no_argument, NULL, 0 },
+    { "qblur",       required_argument, NULL, 0 },
+    { "cplxblur",    required_argument, NULL, 0 },
+    { "zones",       required_argument, NULL, 0 },
+    { "qpfile",      required_argument, NULL, OPT_QPFILE },
+    { "threads",     required_argument, NULL, 0 },
+    { "sliced-threads",    no_argument, NULL, 0 },
+    { "no-sliced-threads", no_argument, NULL, 0 },
+    { "slice-max-size",    required_argument, NULL, 0 },
+    { "slice-max-mbs",     required_argument, NULL, 0 },
+    { "slices",            required_argument, NULL, 0 },
+    { "thread-input",      no_argument, NULL, OPT_THREAD_INPUT },
+    { "sync-lookahead",    required_argument, NULL, 0 },
+    { "non-deterministic", no_argument, NULL, 0 },
+    { "psnr",              no_argument, NULL, 0 },
+    { "ssim",              no_argument, NULL, 0 },
+    { "quiet",             no_argument, NULL, OPT_QUIET },
+    { "verbose",           no_argument, NULL, 'v' },
+    { "no-progress",       no_argument, NULL, OPT_NOPROGRESS },
+    { "visualize",         no_argument, NULL, OPT_VISUALIZE },
+    { "dump-yuv",    required_argument, NULL, 0 },
+    { "sps-id",      required_argument, NULL, 0 },
+    { "aud",               no_argument, NULL, 0 },
+    { "nr",          required_argument, NULL, 0 },
+    { "cqm",         required_argument, NULL, 0 },
+    { "cqmfile",     required_argument, NULL, 0 },
+    { "cqm4",        required_argument, NULL, 0 },
+    { "cqm4i",       required_argument, NULL, 0 },
+    { "cqm4iy",      required_argument, NULL, 0 },
+    { "cqm4ic",      required_argument, NULL, 0 },
+    { "cqm4p",       required_argument, NULL, 0 },
+    { "cqm4py",      required_argument, NULL, 0 },
+    { "cqm4pc",      required_argument, NULL, 0 },
+    { "cqm8",        required_argument, NULL, 0 },
+    { "cqm8i",       required_argument, NULL, 0 },
+    { "cqm8p",       required_argument, NULL, 0 },
+    { "overscan",    required_argument, NULL, 0 },
+    { "videoformat", required_argument, NULL, 0 },
+    { "fullrange",   required_argument, NULL, 0 },
+    { "colorprim",   required_argument, NULL, 0 },
+    { "transfer",    required_argument, NULL, 0 },
+    { "colormatrix", required_argument, NULL, 0 },
+    { "chromaloc",   required_argument, NULL, 0 },
+    { "force-cfr",         no_argument, NULL, 0 },
+    {0, 0, 0, 0} /* all-zero entry terminates the table for getopt_long() */
+};
+
+static int select_output( const char *muxer, char *filename, x264_param_t *param )
+{ /* Chooses the muxer stored in `output` (declared outside this function) and sets the matching param flags; returns 0, or -1 when mp4 is requested without MP4_OUTPUT. */
+    const char *ext = get_filename_extension( filename );
+    if( !strcmp( filename, "-" ) || strcasecmp( muxer, "auto" ) ) /* output is "-", or a muxer other than "auto" was named: */
+        ext = muxer; /* select by muxer name instead of by file extension */
+
+    if( !strcasecmp( ext, "mp4" ) )
+    {
+#ifdef MP4_OUTPUT
+        output = mp4_output;
+        param->b_annexb = 0; /* mp4, mkv and flv all clear b_annexb, b_aud and b_repeat_headers */
+        param->b_aud = 0;
+        param->b_dts_compress = 0;
+        param->b_repeat_headers = 0;
+#else
+        fprintf( stderr, "x264 [error]: not compiled with MP4 output support\n" );
+        return -1;
+#endif
+    }
+    else if( !strcasecmp( ext, "mkv" ) )
+    {
+        output = mkv_output;
+        param->b_annexb = 0;
+        param->b_aud = 0;
+        param->b_dts_compress = 0;
+        param->b_repeat_headers = 0;
+    }
+    else if( !strcasecmp( ext, "flv" ) )
+    {
+        output = flv_output;
+        param->b_annexb = 0;
+        param->b_aud = 0;
+        param->b_dts_compress = 1; /* flv is the only muxer here that turns b_dts_compress on */
+        param->b_repeat_headers = 0;
+    }
+    else
+        output = raw_output; /* anything else (including "auto" with output "-") falls back to raw; param is left untouched */
+    return 0;
+}
+
+static int select_input( const char *demuxer, char *used_demuxer, char *filename,
+                         hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
+{ /* Chooses the demuxer stored in `input` (declared outside this function) and copies its name to used_demuxer; returns 0, or -1 on failure. */
+    const char *ext = get_filename_extension( filename );
+    int b_regular = strcmp( filename, "-" ); /* zero only when the input name is "-" */
+    int b_auto = !strcasecmp( demuxer, "auto" );
+    if( !b_regular && b_auto )
+        ext = "yuv"; /* input "-" with automatic selection is treated as yuv */
+    if( b_regular )
+    {
+        FILE *f = fopen( filename, "r" );
+        if( f ) /* if the file cannot be opened, b_regular keeps its nonzero strcmp() value */
+        {
+            b_regular = x264_is_regular_file( f );
+            fclose( f );
+        }
+    }
+    const char *module = b_auto ? ext : demuxer; /* name to match: the extension in auto mode, else the requested demuxer */
+
+    if( !strcasecmp( module, "avs" ) || !strcasecmp( ext, "d2v" ) || !strcasecmp( ext, "dga" ) ) /* NOTE(review): the d2v/dga tests use ext, so they apply even when a demuxer was named explicitly - confirm intended */
+    {
+#ifdef AVS_INPUT
+        input = avs_input;
+        module = "avs";
+#else
+        fprintf( stderr, "x264 [error]: not compiled with AVS input support\n" );
+        return -1;
+#endif
+    }
+    else if( !strcasecmp( module, "y4m" ) )
+        input = y4m_input; /* the avs/y4m/yuv name matches only set `input`; no open_file() call is made in these branches */
+    else if( !strcasecmp( module, "yuv" ) )
+        input = yuv_input;
+    else
+    { /* No name match: try ffms, lavf, avs, then yuv, in that order; an open_file() return of 0 counts as success. */
+#ifdef FFMS_INPUT
+        if( b_regular && (b_auto || !strcasecmp( demuxer, "ffms" )) && /* ffms and avs probes also require b_regular; lavf and yuv do not */
+            !ffms_input.open_file( filename, p_handle, info, opt ) )
+        {
+            module = "ffms";
+            b_auto = 0; /* clearing b_auto stops the remaining auto-mode probes from running */
+            input = ffms_input;
+        }
+#endif
+#ifdef LAVF_INPUT
+        if( (b_auto || !strcasecmp( demuxer, "lavf" )) &&
+            !lavf_input.open_file( filename, p_handle, info, opt ) )
+        {
+            module = "lavf";
+            b_auto = 0;
+            input = lavf_input;
+        }
+#endif
+#ifdef AVS_INPUT
+        if( b_regular && (b_auto || !strcasecmp( demuxer, "avs" )) &&
+            !avs_input.open_file( filename, p_handle, info, opt ) )
+        {
+            module = "avs";
+            b_auto = 0;
+            input = avs_input;
+        }
+#endif
+        if( b_auto && !yuv_input.open_file( filename, p_handle, info, opt ) ) /* yuv fallback is tried only in auto mode */
+        {
+            module = "yuv";
+            b_auto = 0;
+            input = yuv_input;
+        }
+
+        if( !(*p_handle) ) /* presumably *p_handle is NULL on entry and set by a successful open_file() - verify against caller */
+        {
+            fprintf( stderr, "x264 [error]: could not open input file `%s' via any method!\n", filename );
+            return -1;
+        }
+    }
+    strcpy( used_demuxer, module ); /* unbounded copy: the caller must size used_demuxer for the module name */
+
+    return 0;
+}
+
 /*****************************************************************************
  * Parse:
  *****************************************************************************/
-static int  Parse( int argc, char **argv,
-                   x264_param_t *param, cli_opt_t *opt )
+static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
 {
-    char *psz_filename = NULL;
+    char *input_filename = NULL;
+    const char *demuxer = demuxer_names[0];
+    char *output_filename = NULL;
+    const char *muxer = muxer_names[0];
     x264_param_t defaults = *param;
-    char *psz;
-    int b_avis = 0;
-    int b_y4m = 0;
+    char *profile = NULL;
     int b_thread_input = 0;
+    int b_turbo = 1;
+    int b_pass1 = 0;
+    int b_user_ref = 0;
+    int b_user_fps = 0;
+    int b_user_interlaced = 0;
+    int i;
+    cli_input_opt_t input_opt;
 
     memset( opt, 0, sizeof(cli_opt_t) );
+    memset( &input_opt, 0, sizeof(cli_input_opt_t) );
+    opt->b_progress = 1;
+
+    /* Presets are applied before all other options. */
+    for( optind = 0;; )
+    {
+        int c = getopt_long( argc, argv, short_options, long_options, NULL );
+        if( c == -1 )
+            break;
 
-    /* Default input file driver */
-    p_open_infile = open_file_yuv;
-    p_get_frame_total = get_frame_total_yuv;
-    p_read_frame = read_frame_yuv;
-    p_close_infile = close_file_yuv;
+        if( c == OPT_PRESET )
+        {
+            if( !strcasecmp( optarg, "ultrafast" ) )
+            {
+                param->i_frame_reference = 1;
+                param->i_scenecut_threshold = 0;
+                param->b_deblocking_filter = 0;
+                param->b_cabac = 0;
+                param->i_bframe = 0;
+                param->analyse.intra = 0;
+                param->analyse.inter = 0;
+                param->analyse.b_transform_8x8 = 0;
+                param->analyse.i_me_method = X264_ME_DIA;
+                param->analyse.i_subpel_refine = 0;
+                param->rc.i_aq_mode = 0;
+                param->analyse.b_mixed_references = 0;
+                param->analyse.i_trellis = 0;
+                param->i_bframe_adaptive = X264_B_ADAPT_NONE;
+                param->rc.b_mb_tree = 0;
+                param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+            }
+            else if( !strcasecmp( optarg, "veryfast" ) )
+            {
+                param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4;
+                param->analyse.i_me_method = X264_ME_DIA;
+                param->analyse.i_subpel_refine = 1;
+                param->i_frame_reference = 1;
+                param->analyse.b_mixed_references = 0;
+                param->analyse.i_trellis = 0;
+                param->rc.b_mb_tree = 0;
+                param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+            }
+            else if( !strcasecmp( optarg, "faster" ) )
+            {
+                param->analyse.b_mixed_references = 0;
+                param->i_frame_reference = 2;
+                param->analyse.i_subpel_refine = 4;
+                param->rc.b_mb_tree = 0;
+                param->analyse.i_weighted_pred = X264_WEIGHTP_BLIND;
+            }
+            else if( !strcasecmp( optarg, "fast" ) )
+            {
+                param->i_frame_reference = 2;
+                param->analyse.i_subpel_refine = 6;
+                param->rc.i_lookahead = 30;
+            }
+            else if( !strcasecmp( optarg, "medium" ) )
+            {
+                /* Default is medium */
+            }
+            else if( !strcasecmp( optarg, "slow" ) )
+            {
+                param->analyse.i_me_method = X264_ME_UMH;
+                param->analyse.i_subpel_refine = 8;
+                param->i_frame_reference = 5;
+                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+                param->rc.i_lookahead = 50;
+            }
+            else if( !strcasecmp( optarg, "slower" ) )
+            {
+                param->analyse.i_me_method = X264_ME_UMH;
+                param->analyse.i_subpel_refine = 9;
+                param->i_frame_reference = 8;
+                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                param->analyse.i_trellis = 2;
+                param->rc.i_lookahead = 60;
+            }
+            else if( !strcasecmp( optarg, "veryslow" ) )
+            {
+                param->analyse.i_me_method = X264_ME_UMH;
+                param->analyse.i_subpel_refine = 10;
+                param->analyse.i_me_range = 24;
+                param->i_frame_reference = 16;
+                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                param->analyse.i_trellis = 2;
+                param->i_bframe = 8;
+                param->rc.i_lookahead = 60;
+            }
+            else if( !strcasecmp( optarg, "placebo" ) )
+            {
+                param->analyse.i_me_method = X264_ME_TESA;
+                param->analyse.i_subpel_refine = 10;
+                param->analyse.i_me_range = 24;
+                param->i_frame_reference = 16;
+                param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS;
+                param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO;
+                param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                param->analyse.b_fast_pskip = 0;
+                param->analyse.i_trellis = 2;
+                param->i_bframe = 16;
+                param->rc.i_lookahead = 60;
+                b_turbo = 0;
+            }
+            else
+            {
+                fprintf( stderr, "x264 [error]: invalid preset '%s'\n", optarg );
+                return -1;
+            }
+        }
+        else if( c == '?' )
+            return -1;
+    }
 
-    /* Default output file driver */
-    p_open_outfile = open_file_bsf;
-    p_set_outfile_param = set_param_bsf;
-    p_write_nalu = write_nalu_bsf;
-    p_set_eop = set_eop_bsf;
-    p_close_outfile = close_file_bsf;
+    /* Tunings are applied next. */
+    for( optind = 0;; )
+    {
+        int c = getopt_long( argc, argv, short_options, long_options, NULL );
+        if( c == -1 )
+            break;
+
+        if( c == OPT_TUNE )
+        {
+            char *s = strtok( optarg, ",./-+" );
+            int psy_tuning_used = 0;
+            while( s )
+            {
+                if( !strncasecmp( s, "film", 4 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->i_deblocking_filter_alphac0 = -1;
+                    param->i_deblocking_filter_beta = -1;
+                    param->analyse.f_psy_trellis = 0.15;
+                    psy_tuning_used = 1;
+                }
+                else if( !strncasecmp( s, "animation", 9 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+                    param->i_deblocking_filter_alphac0 = 1;
+                    param->i_deblocking_filter_beta = 1;
+                    param->analyse.f_psy_rd = 0.4;
+                    param->rc.f_aq_strength = 0.6;
+                    param->i_bframe += 2;
+                    psy_tuning_used = 1;
+                }
+                else if( !strncasecmp( s, "grain", 5 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->i_deblocking_filter_alphac0 = -2;
+                    param->i_deblocking_filter_beta = -2;
+                    param->analyse.f_psy_trellis = 0.25;
+                    param->analyse.b_dct_decimate = 0;
+                    param->rc.f_pb_factor = 1.1;
+                    param->rc.f_ip_factor = 1.1;
+                    param->rc.f_aq_strength = 0.5;
+                    param->analyse.i_luma_deadzone[0] = 6;
+                    param->analyse.i_luma_deadzone[1] = 6;
+                    param->rc.f_qcompress = 0.8;
+                    psy_tuning_used = 1;
+                }
+                else if( !strncasecmp( s, "psnr", 4 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->rc.i_aq_mode = X264_AQ_NONE;
+                    param->analyse.b_psy = 0;
+                    psy_tuning_used = 1;
+                }
+                else if( !strncasecmp( s, "ssim", 4 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE;
+                    param->analyse.b_psy = 0;
+                    psy_tuning_used = 1;
+                }
+                else if( !strncasecmp( s, "fastdecode", 10 ) )
+                {
+                    param->b_deblocking_filter = 0;
+                    param->b_cabac = 0;
+                    param->analyse.b_weighted_bipred = 0;
+                    param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+                }
+                else if( !strncasecmp( s, "zerolatency", 11 ) )
+                {
+                    param->rc.i_lookahead = 0;
+                    param->i_sync_lookahead = 0;
+                    param->i_bframe = 0;
+                    param->b_sliced_threads = 1;
+                }
+                else if( !strncasecmp( s, "touhou", 6 ) )
+                {
+                    if( psy_tuning_used ) goto psy_failure;
+                    param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1;
+                    param->i_deblocking_filter_alphac0 = -1;
+                    param->i_deblocking_filter_beta = -1;
+                    param->analyse.f_psy_trellis = 0.2;
+                    param->rc.f_aq_strength = 1.3;
+                    if( param->analyse.inter & X264_ANALYSE_PSUB16x16 )
+                        param->analyse.inter |= X264_ANALYSE_PSUB8x8;
+                    psy_tuning_used = 1;
+                }
+                else
+                {
+                    fprintf( stderr, "x264 [error]: invalid tune '%s'\n", s );
+                    return -1;
+                }
+                if( 0 )
+                {
+psy_failure:
+                    fprintf( stderr, "x264 [warning]: only 1 psy tuning can be used: ignoring tune %s\n", s );
+                }
+                s = strtok( NULL, ",./-+" );
+            }
+        }
+        else if( c == '?' )
+            return -1;
+    }
 
     /* Parse command line options */
-    for( ;; )
+    for( optind = 0;; )
     {
         int b_error = 0;
         int long_options_index = -1;
 
-#define OPT_FRAMES 256
-#define OPT_SEEK 257
-#define OPT_QPFILE 258
-#define OPT_THREAD_INPUT 259
-#define OPT_QUIET 260
-#define OPT_PROGRESS 261
-#define OPT_VISUALIZE 262
-#define OPT_LONGHELP 263
-
-        static struct option long_options[] =
-        {
-            { "help",    no_argument,       NULL, 'h' },
-            { "longhelp",no_argument,       NULL, OPT_LONGHELP },
-            { "version", no_argument,       NULL, 'V' },
-            { "bitrate", required_argument, NULL, 'B' },
-            { "bframes", required_argument, NULL, 'b' },
-            { "b-adapt", required_argument, NULL, 0 },
-            { "no-b-adapt", no_argument,    NULL, 0 },
-            { "b-bias",  required_argument, NULL, 0 },
-            { "b-pyramid", no_argument,     NULL, 0 },
-            { "min-keyint",required_argument,NULL,'i' },
-            { "keyint",  required_argument, NULL, 'I' },
-            { "scenecut",required_argument, NULL, 0 },
-            { "no-scenecut",no_argument,    NULL, 0 },
-            { "nf",      no_argument,       NULL, 0 },
-            { "no-deblock", no_argument,    NULL, 0 },
-            { "filter",  required_argument, NULL, 0 },
-            { "deblock", required_argument, NULL, 'f' },
-            { "interlaced", no_argument,    NULL, 0 },
-            { "no-cabac",no_argument,       NULL, 0 },
-            { "qp",      required_argument, NULL, 'q' },
-            { "qpmin",   required_argument, NULL, 0 },
-            { "qpmax",   required_argument, NULL, 0 },
-            { "qpstep",  required_argument, NULL, 0 },
-            { "crf",     required_argument, NULL, 0 },
-            { "ref",     required_argument, NULL, 'r' },
-            { "asm",     required_argument, NULL, 0 },
-            { "no-asm",  no_argument,       NULL, 0 },
-            { "sar",     required_argument, NULL, 0 },
-            { "fps",     required_argument, NULL, 0 },
-            { "frames",  required_argument, NULL, OPT_FRAMES },
-            { "seek",    required_argument, NULL, OPT_SEEK },
-            { "output",  required_argument, NULL, 'o' },
-            { "analyse", required_argument, NULL, 0 },
-            { "partitions", required_argument, NULL, 'A' },
-            { "direct",  required_argument, NULL, 0 },
-            { "weightb", no_argument,       NULL, 'w' },
-            { "me",      required_argument, NULL, 0 },
-            { "merange", required_argument, NULL, 0 },
-            { "mvrange", required_argument, NULL, 0 },
-            { "mvrange-thread", required_argument, NULL, 0 },
-            { "subme",   required_argument, NULL, 'm' },
-            { "psy-rd",  required_argument, NULL, 0 },
-            { "mixed-refs", no_argument,    NULL, 0 },
-            { "no-chroma-me", no_argument,  NULL, 0 },
-            { "8x8dct",  no_argument,       NULL, '8' },
-            { "trellis", required_argument, NULL, 't' },
-            { "no-fast-pskip", no_argument, NULL, 0 },
-            { "no-dct-decimate", no_argument, NULL, 0 },
-            { "aq-strength", required_argument, NULL, 0 },
-            { "aq-mode", required_argument, NULL, 0 },
-            { "deadzone-inter", required_argument, NULL, '0' },
-            { "deadzone-intra", required_argument, NULL, '0' },
-            { "level",   required_argument, NULL, 0 },
-            { "ratetol", required_argument, NULL, 0 },
-            { "vbv-maxrate", required_argument, NULL, 0 },
-            { "vbv-bufsize", required_argument, NULL, 0 },
-            { "vbv-init", required_argument,NULL,  0 },
-            { "ipratio", required_argument, NULL, 0 },
-            { "pbratio", required_argument, NULL, 0 },
-            { "chroma-qp-offset", required_argument, NULL, 0 },
-            { "pass",    required_argument, NULL, 'p' },
-            { "stats",   required_argument, NULL, 0 },
-            { "qcomp",   required_argument, NULL, 0 },
-            { "qblur",   required_argument, NULL, 0 },
-            { "cplxblur",required_argument, NULL, 0 },
-            { "zones",   required_argument, NULL, 0 },
-            { "qpfile",  required_argument, NULL, OPT_QPFILE },
-            { "threads", required_argument, NULL, 0 },
-            { "thread-input", no_argument,  NULL, OPT_THREAD_INPUT },
-            { "non-deterministic", no_argument, NULL, 0 },
-            { "no-psnr", no_argument,       NULL, 0 },
-            { "no-ssim", no_argument,       NULL, 0 },
-            { "quiet",   no_argument,       NULL, OPT_QUIET },
-            { "verbose", no_argument,       NULL, 'v' },
-            { "progress",no_argument,       NULL, OPT_PROGRESS },
-            { "visualize",no_argument,      NULL, OPT_VISUALIZE },
-            { "dump-yuv",required_argument, NULL, 0 },
-            { "sps-id",  required_argument, NULL, 0 },
-            { "aud",     no_argument,       NULL, 0 },
-            { "nr",      required_argument, NULL, 0 },
-            { "cqm",     required_argument, NULL, 0 },
-            { "cqmfile", required_argument, NULL, 0 },
-            { "cqm4",    required_argument, NULL, 0 },
-            { "cqm4i",   required_argument, NULL, 0 },
-            { "cqm4iy",  required_argument, NULL, 0 },
-            { "cqm4ic",  required_argument, NULL, 0 },
-            { "cqm4p",   required_argument, NULL, 0 },
-            { "cqm4py",  required_argument, NULL, 0 },
-            { "cqm4pc",  required_argument, NULL, 0 },
-            { "cqm8",    required_argument, NULL, 0 },
-            { "cqm8i",   required_argument, NULL, 0 },
-            { "cqm8p",   required_argument, NULL, 0 },
-            { "overscan", required_argument, NULL, 0 },
-            { "videoformat", required_argument, NULL, 0 },
-            { "fullrange", required_argument, NULL, 0 },
-            { "colorprim", required_argument, NULL, 0 },
-            { "transfer", required_argument, NULL, 0 },
-            { "colormatrix", required_argument, NULL, 0 },
-            { "chromaloc", required_argument, NULL, 0 },
-            {0, 0, 0, 0}
-        };
-
-        int c = getopt_long( argc, argv, "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw",
-                             long_options, &long_options_index);
+        int c = getopt_long( argc, argv, short_options, long_options, &long_options_index );
 
         if( c == -1 )
         {
@@ -502,6 +1055,9 @@ static int  Parse( int argc, char **argv,
             case OPT_LONGHELP:
                 Help( &defaults, 1 );
                 exit(0);
+            case OPT_FULLHELP:
+                Help( &defaults, 2 );
+                exit(0);
             case 'V':
 #ifdef X264_POINTVER
                 printf( "x264 "X264_POINTVER"\n" );
@@ -516,46 +1072,48 @@ static int  Parse( int argc, char **argv,
 #endif
                 exit(0);
             case OPT_FRAMES:
-                param->i_frame_total = atoi( optarg );
+                param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
                 break;
             case OPT_SEEK:
-                opt->i_seek = atoi( optarg );
+                opt->i_seek = input_opt.seek = X264_MAX( atoi( optarg ), 0 );
                 break;
             case 'o':
-                if( !strncasecmp(optarg + strlen(optarg) - 4, ".mp4", 4) )
+                output_filename = optarg;
+                break;
+            case OPT_MUXER:
+                for( i = 0; muxer_names[i] && strcasecmp( muxer_names[i], optarg ); )
+                    i++;
+                if( !muxer_names[i] )
                 {
-#ifdef MP4_OUTPUT
-                    p_open_outfile = open_file_mp4;
-                    p_write_nalu = write_nalu_mp4;
-                    p_set_outfile_param = set_param_mp4;
-                    p_set_eop = set_eop_mp4;
-                    p_close_outfile = close_file_mp4;
-#else
-                    fprintf( stderr, "x264 [error]: not compiled with MP4 output support\n" );
+                    fprintf( stderr, "x264 [error]: invalid muxer '%s'\n", optarg );
                     return -1;
-#endif
-                }
-                else if( !strncasecmp(optarg + strlen(optarg) - 4, ".mkv", 4) )
-                {
-                    p_open_outfile = open_file_mkv;
-                    p_write_nalu = write_nalu_mkv;
-                    p_set_outfile_param = set_param_mkv;
-                    p_set_eop = set_eop_mkv;
-                    p_close_outfile = close_file_mkv;
                 }
-                if( !strcmp(optarg, "-") )
-                    opt->hout = stdout;
-                else if( p_open_outfile( optarg, &opt->hout ) )
+                muxer = optarg;
+                break;
+            case OPT_DEMUXER:
+                for( i = 0; demuxer_names[i] && strcasecmp( demuxer_names[i], optarg ); )
+                    i++;
+                if( !demuxer_names[i] )
                 {
-                    fprintf( stderr, "x264 [error]: can't open output file `%s'\n", optarg );
+                    fprintf( stderr, "x264 [error]: invalid demuxer '%s'\n", optarg );
                     return -1;
                 }
+                demuxer = optarg;
+                break;
+            case OPT_INDEX:
+                input_opt.index = optarg;
                 break;
             case OPT_QPFILE:
-                opt->qpfile = fopen( optarg, "r" );
+                opt->qpfile = fopen( optarg, "rb" );
                 if( !opt->qpfile )
                 {
-                    fprintf( stderr, "x264 [error]: can't open `%s'\n", optarg );
+                    fprintf( stderr, "x264 [error]: can't open qpfile `%s'\n", optarg );
+                    return -1;
+                }
+                else if( !x264_is_regular_file( opt->qpfile ) )
+                {
+                    fprintf( stderr, "x264 [error]: qpfile incompatible with non-regular file `%s'\n", optarg );
+                    fclose( opt->qpfile );
                     return -1;
                 }
                 break;
@@ -568,8 +1126,8 @@ static int  Parse( int argc, char **argv,
             case 'v':
                 param->i_log_level = X264_LOG_DEBUG;
                 break;
-            case OPT_PROGRESS:
-                opt->b_progress = 1;
+            case OPT_NOPROGRESS:
+                opt->b_progress = 0;
                 break;
             case OPT_VISUALIZE:
 #ifdef VISUALIZE
@@ -579,7 +1137,30 @@ static int  Parse( int argc, char **argv,
                 fprintf( stderr, "x264 [warning]: not compiled with visualization support\n" );
 #endif
                 break;
+            case OPT_TUNE:
+            case OPT_PRESET:
+                break;
+            case OPT_PROFILE:
+                profile = optarg;
+                break;
+            case OPT_SLOWFIRSTPASS:
+                b_turbo = 0;
+                break;
+            case 'r':
+                b_user_ref = 1;
+                goto generic_option;
+            case 'p':
+                b_pass1 = atoi( optarg ) == 1;
+                goto generic_option;
+            case OPT_FPS:
+                b_user_fps = 1;
+                param->b_vfr_input = 0;
+                goto generic_option;
+            case OPT_INTERLACED:
+                b_user_interlaced = 1;
+                goto generic_option;
             default:
+generic_option:
             {
                 int i;
                 if( long_options_index < 0 )
@@ -609,101 +1190,167 @@ static int  Parse( int argc, char **argv,
         }
     }
 
-    /* Get the file name */
-    if( optind > argc - 1 || !opt->hout )
+    /* Set faster options in case of turbo firstpass. */
+    if( b_turbo && b_pass1 )
     {
-        fprintf( stderr, "x264 [error]: No %s file. Run x264 --help for a list of options.\n",
-                 optind > argc - 1 ? "input" : "output" );
-        return -1;
+        param->i_frame_reference = 1;
+        param->analyse.b_transform_8x8 = 0;
+        param->analyse.inter = 0;
+        param->analyse.i_me_method = X264_ME_DIA;
+        param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine );
+        param->analyse.i_trellis = 0;
     }
-    psz_filename = argv[optind++];
-
-    /* check demuxer type */
-    psz = psz_filename + strlen(psz_filename) - 1;
-    while( psz > psz_filename && *psz != '.' )
-        psz--;
-    if( !strncasecmp( psz, ".avi", 4 ) || !strncasecmp( psz, ".avs", 4 ) )
-        b_avis = 1;
-    if( !strncasecmp( psz, ".y4m", 4 ) )
-        b_y4m = 1;
 
-    if( !(b_avis || b_y4m) ) // raw yuv
+    /* Apply profile restrictions. */
+    if( profile )
     {
-        if( optind > argc - 1 )
+        if( !strcasecmp( profile, "baseline" ) )
         {
-            /* try to parse the file name */
-            for( psz = psz_filename; *psz; psz++ )
+            param->analyse.b_transform_8x8 = 0;
+            param->b_cabac = 0;
+            param->i_cqm_preset = X264_CQM_FLAT;
+            param->i_bframe = 0;
+            param->analyse.i_weighted_pred = X264_WEIGHTP_NONE;
+            if( param->b_interlaced )
             {
-                if( *psz >= '0' && *psz <= '9'
-                    && sscanf( psz, "%ux%u", &param->i_width, &param->i_height ) == 2 )
-                {
-                    if( param->i_log_level >= X264_LOG_INFO )
-                        fprintf( stderr, "x264 [info]: %dx%d (given by file name) @ %.2f fps\n", param->i_width, param->i_height, (double)param->i_fps_num / (double)param->i_fps_den);
-                    break;
-                }
+                fprintf( stderr, "x264 [error]: baseline profile doesn't support interlacing\n" );
+                return -1;
             }
         }
+        else if( !strcasecmp( profile, "main" ) )
+        {
+            param->analyse.b_transform_8x8 = 0;
+            param->i_cqm_preset = X264_CQM_FLAT;
+        }
+        else if( !strcasecmp( profile, "high" ) )
+        {
+            /* Default */
+        }
         else
         {
-            sscanf( argv[optind++], "%ux%u", &param->i_width, &param->i_height );
-            if( param->i_log_level >= X264_LOG_INFO )
-                fprintf( stderr, "x264 [info]: %dx%d @ %.2f fps\n", param->i_width, param->i_height, (double)param->i_fps_num / (double)param->i_fps_den);
+            fprintf( stderr, "x264 [error]: invalid profile: %s\n", profile );
+            return -1;
+        }
+        if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0) ||
+            (param->rc.i_rc_method == X264_RC_CRF && param->rc.f_rf_constant == 0) )
+        {
+            fprintf( stderr, "x264 [error]: %s profile doesn't support lossless\n", profile );
+            return -1;
         }
     }
 
-    if( !(b_avis || b_y4m) && ( !param->i_width || !param->i_height ) )
+    /* Get the file name */
+    if( optind > argc - 1 || !output_filename )
     {
-        fprintf( stderr, "x264 [error]: Rawyuv input requires a resolution.\n" );
+        fprintf( stderr, "x264 [error]: No %s file. Run x264 --help for a list of options.\n",
+                 optind > argc - 1 ? "input" : "output" );
         return -1;
     }
 
-    /* open the input */
+    if( select_output( muxer, output_filename, param ) )
+        return -1;
+    if( output.open_file( output_filename, &opt->hout ) )
     {
-        if( b_avis )
-        {
-#ifdef AVIS_INPUT
-            p_open_infile = open_file_avis;
-            p_get_frame_total = get_frame_total_avis;
-            p_read_frame = read_frame_avis;
-            p_close_infile = close_file_avis;
-#else
-            fprintf( stderr, "x264 [error]: not compiled with AVIS input support\n" );
-            return -1;
-#endif
-        }
-        if ( b_y4m )
-        {
-            p_open_infile = open_file_y4m;
-            p_get_frame_total = get_frame_total_y4m;
-            p_read_frame = read_frame_y4m;
-            p_close_infile = close_file_y4m;
-        }
+        fprintf( stderr, "x264 [error]: could not open output file `%s'\n", output_filename );
+        return -1;
+    }
 
-        if( p_open_infile( psz_filename, &opt->hin, param ) )
-        {
-            fprintf( stderr, "x264 [error]: could not open input file '%s'\n", psz_filename );
-            return -1;
-        }
+    input_filename = argv[optind++];
+    input_opt.resolution = optind < argc ? argv[optind++] : NULL;
+    video_info_t info = {0};
+    char demuxername[5];
+
+    /* set info flags to param flags to be overwritten by demuxer as necessary. */
+    info.csp        = param->i_csp;
+    info.fps_num    = param->i_fps_num;
+    info.fps_den    = param->i_fps_den;
+    info.interlaced = param->b_interlaced;
+    info.sar_width  = param->vui.i_sar_width;
+    info.sar_height = param->vui.i_sar_height;
+    info.vfr        = param->b_vfr_input;
+
+    if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
+        return -1;
+
+    if( !opt->hin && input.open_file( input_filename, &opt->hin, &info, &input_opt ) )
+    {
+        fprintf( stderr, "x264 [error]: could not open input file `%s'\n", input_filename );
+        return -1;
+    }
+
+    x264_reduce_fraction( &info.sar_width, &info.sar_height );
+    x264_reduce_fraction( &info.fps_num, &info.fps_den );
+    if( param->i_log_level >= X264_LOG_INFO )
+        fprintf( stderr, "%s [info]: %dx%d%c %d:%d @ %d/%d fps (%cfr)\n", demuxername, info.width,
+                 info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
+                 info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );
+
+    /* set param flags from the info flags as necessary */
+    param->i_csp       = info.csp;
+    param->i_height    = info.height;
+    param->b_vfr_input = info.vfr;
+    param->i_width     = info.width;
+    if( !b_user_interlaced && info.interlaced )
+    {
+        fprintf( stderr, "x264 [warning]: input appears to be interlaced, enabling interlaced mode.\n"
+                         "                If you want otherwise, use --no-interlaced\n" );
+        param->b_interlaced = 1;
+    }
+    if( !b_user_fps )
+    {
+        param->i_fps_num = info.fps_num;
+        param->i_fps_den = info.fps_den;
+    }
+    if( param->b_vfr_input )
+    {
+        param->i_timebase_num = info.timebase_num;
+        param->i_timebase_den = info.timebase_den;
+    }
+    else
+    {
+        param->i_timebase_den = param->i_fps_num;
+        param->i_timebase_num = param->i_fps_den;
+    }
+    if( !param->vui.i_sar_width || !param->vui.i_sar_height )
+    {
+        param->vui.i_sar_width  = info.sar_width;
+        param->vui.i_sar_height = info.sar_height;
     }
 
 #ifdef HAVE_PTHREAD
     if( b_thread_input || param->i_threads > 1
-        || (param->i_threads == 0 && x264_cpu_num_processors() > 1) )
+        || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) )
     {
-        if( open_file_thread( NULL, &opt->hin, param ) )
+        if( thread_input.open_file( NULL, &opt->hin, &info, NULL ) )
         {
-            fprintf( stderr, "x264 [warning]: threaded input failed\n" );
+            fprintf( stderr, "x264 [error]: threaded input failed\n" );
+            return -1;
         }
         else
-        {
-            p_open_infile = open_file_thread;
-            p_get_frame_total = get_frame_total_thread;
-            p_read_frame = read_frame_thread;
-            p_close_infile = close_file_thread;
-        }
+            input = thread_input;
     }
 #endif
 
+
+    /* Automatically reduce reference frame count to match the user's target level
+     * if the user didn't explicitly set a reference frame count. */
+    if( !b_user_ref )
+    {
+        int mbs = (((param->i_width)+15)>>4) * (((param->i_height)+15)>>4);
+        int i;
+        for( i = 0; x264_levels[i].level_idc != 0; i++ )
+            if( param->i_level_idc == x264_levels[i].level_idc )
+            {
+                while( mbs * 384 * param->i_frame_reference > x264_levels[i].dpb
+                       && param->i_frame_reference > 1 )
+                {
+                    param->i_frame_reference--;
+                }
+                break;
+            }
+    }
+
+
     return 0;
 }
 
@@ -716,14 +1363,14 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
     {
         file_pos = ftell( opt->qpfile );
         ret = fscanf( opt->qpfile, "%d %c %d\n", &num, &type, &qp );
-		if( num > i_frame || ret == EOF )
-		{
-			pic->i_type = X264_TYPE_AUTO;
-			pic->i_qpplus1 = 0;
-			fseek( opt->qpfile , file_pos , SEEK_SET );
-			break;
-		}
-        if( num < i_frame )
+        if( num > i_frame || ret == EOF )
+        {
+            pic->i_type = X264_TYPE_AUTO;
+            pic->i_qpplus1 = 0;
+            fseek( opt->qpfile, file_pos, SEEK_SET );
+            break;
+        }
+        if( num < i_frame && ret == 3 )
             continue;
         pic->i_qpplus1 = qp+1;
         if     ( type == 'I' ) pic->i_type = X264_TYPE_IDR;
@@ -748,37 +1395,50 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
  * Encode:
  *****************************************************************************/
 
-static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic )
+static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *last_pts )
 {
     x264_picture_t pic_out;
     x264_nal_t *nal;
-    int i_nal, i;
-    int i_file = 0;
+    int i_nal;
+    int i_frame_size = 0;
+
+    i_frame_size = x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );
 
-    if( x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out ) < 0 )
+    if( i_frame_size < 0 )
     {
         fprintf( stderr, "x264 [error]: x264_encoder_encode failed\n" );
+        return -1;
     }
 
-    for( i = 0; i < i_nal; i++ )
+    if( i_frame_size )
     {
-        int i_size;
+        i_frame_size = output.write_frame( hout, nal[0].p_payload, i_frame_size, &pic_out );
+        *last_pts = pic_out.i_pts;
+    }
 
-        if( mux_buffer_size < nal[i].i_payload * 3/2 + 4 )
-        {
-            mux_buffer_size = nal[i].i_payload * 2 + 4;
-            x264_free( mux_buffer );
-            mux_buffer = x264_malloc( mux_buffer_size );
-        }
+    return i_frame_size;
+}
 
-        i_size = mux_buffer_size;
-        x264_nal_encode( mux_buffer, &i_size, 1, &nal[i] );
-        i_file += p_write_nalu( hout, mux_buffer, i_size );
+static void Print_status( int64_t i_start, int i_frame, int i_frame_total, int64_t i_file, x264_param_t *param, int64_t last_pts )
+{
+    char    buf[200];
+    int64_t i_elapsed = x264_mdate() - i_start;
+    double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
+    double bitrate = (double) i_file * 8 / ( (double) last_pts * 1000 * param->i_timebase_num / param->i_timebase_den );
+    if( i_frame_total )
+    {
+        int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
+        sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
+                 100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
+                 eta/3600, (eta/60)%60, eta%60 );
     }
-    if (i_nal)
-        p_set_eop( hout, &pic_out );
-
-    return i_file;
+    else
+    {
+        sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
+    }
+    fprintf( stderr, "%s  \r", buf+5 );
+    SetConsoleTitle( buf );
+    fflush( stderr ); // needed in windows
 }
 
 static int  Encode( x264_param_t *param, cli_opt_t *opt )
@@ -786,16 +1446,24 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
     x264_t *h;
     x264_picture_t pic;
 
-    int     i_frame, i_frame_total;
+    int     i_frame, i_frame_total, i_frame_output;
     int64_t i_start, i_end;
-    int64_t i_file;
+    int64_t i_file = 0;
     int     i_frame_size;
     int     i_update_interval;
-    char    buf[200];
+    int64_t last_pts = 0;
+#   define  MAX_PTS_WARNING 3 /* arbitrary */
+    int     pts_warning_cnt = 0;
+    int64_t largest_pts = -1;
+    int64_t second_largest_pts = -1;
+    int64_t ticks_per_frame;
+    double  duration;
+    int     prev_timebase_den = param->i_timebase_den / gcd( param->i_timebase_num, param->i_timebase_den );
+    int     dts_compress_multiplier;
 
     opt->b_progress &= param->i_log_level < X264_LOG_DEBUG;
-    i_frame_total = p_get_frame_total( opt->hin );
-    i_frame_total -= opt->i_seek;
+    i_frame_total = input.get_frame_total( opt->hin );
+    i_frame_total = X264_MAX( i_frame_total - opt->i_seek, 0 );
     if( ( i_frame_total == 0 || param->i_frame_total < i_frame_total )
         && param->i_frame_total > 0 )
         i_frame_total = param->i_frame_total;
@@ -805,30 +1473,77 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
     if( ( h = x264_encoder_open( param ) ) == NULL )
     {
         fprintf( stderr, "x264 [error]: x264_encoder_open failed\n" );
-        p_close_infile( opt->hin );
+        input.close_file( opt->hin );
         return -1;
     }
 
-    if( p_set_outfile_param( opt->hout, param ) )
+    x264_encoder_parameters( h, param );
+
+    dts_compress_multiplier = param->i_timebase_den / prev_timebase_den;
+
+    if( output.set_param( opt->hout, param ) )
     {
         fprintf( stderr, "x264 [error]: can't set outfile param\n" );
-        p_close_infile( opt->hin );
-        p_close_outfile( opt->hout );
+        input.close_file( opt->hin );
+        output.close_file( opt->hout, largest_pts, second_largest_pts );
         return -1;
     }
 
     /* Create a new pic */
-    x264_picture_alloc( &pic, X264_CSP_I420, param->i_width, param->i_height );
+    if( input.picture_alloc( &pic, param->i_csp, param->i_width, param->i_height ) )
+    {
+        fprintf( stderr, "x264 [error]: malloc failed\n" );
+        return -1;
+    }
 
     i_start = x264_mdate();
+    /* ticks/frame = ticks/second / frames/second */
+    ticks_per_frame = (int64_t)param->i_timebase_den * param->i_fps_den / param->i_timebase_num / param->i_fps_num;
+    if( ticks_per_frame < 1 )
+    {
+        fprintf( stderr, "x264 [error]: ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame );
+        return -1;
+    }
+
+    if( !param->b_repeat_headers )
+    {
+        // Write SPS/PPS/SEI
+        x264_nal_t *headers;
+        int i_nal;
+
+        if( x264_encoder_headers( h, &headers, &i_nal ) < 0 )
+        {
+            fprintf( stderr, "x264 [error]: x264_encoder_headers failed\n" );
+            return -1;
+        }
+
+        if( (i_file = output.write_headers( opt->hout, headers )) < 0 )
+            return -1;
+    }
 
     /* Encode frames */
-    for( i_frame = 0, i_file = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
+    for( i_frame = 0, i_frame_output = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); )
     {
-        if( p_read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
+        if( input.read_frame( &pic, opt->hin, i_frame + opt->i_seek ) )
             break;
 
-        pic.i_pts = (int64_t)i_frame * param->i_fps_den;
+        if( !param->b_vfr_input )
+            pic.i_pts = i_frame;
+        if( pic.i_pts <= largest_pts )
+        {
+            if( param->i_log_level >= X264_LOG_WARNING )
+            {
+                if( param->i_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
+                    fprintf( stderr, "x264 [warning]: non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
+                             i_frame, pic.i_pts * dts_compress_multiplier, largest_pts * dts_compress_multiplier );
+                else if( pts_warning_cnt == MAX_PTS_WARNING )
+                    fprintf( stderr, "x264 [warning]: too many nonmonotonic pts warnings, suppressing further ones\n" );
+                pts_warning_cnt++;
+            }
+            pic.i_pts = largest_pts + ticks_per_frame;
+        }
+        second_largest_pts = largest_pts;
+        largest_pts = pic.i_pts;
 
         if( opt->qpfile )
             parse_qpfile( opt, &pic, i_frame + opt->i_seek );
@@ -839,61 +1554,65 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
             pic.i_qpplus1 = 0;
         }
 
-        i_file += Encode_frame( h, opt->hout, &pic );
+        i_frame_size = Encode_frame( h, opt->hout, &pic, &last_pts );
+        if( i_frame_size < 0 )
+            return -1;
+        i_file += i_frame_size;
+        if( i_frame_size )
+            i_frame_output++;
 
         i_frame++;
 
+        if( input.release_frame && input.release_frame( &pic, opt->hin ) )
+            break;
+
         /* update status line (up to 1000 times per input file) */
-        if( opt->b_progress && i_frame % i_update_interval == 0 )
-        {
-            int64_t i_elapsed = x264_mdate() - i_start;
-            double fps = i_elapsed > 0 ? i_frame * 1000000. / i_elapsed : 0;
-            double bitrate = (double) i_file * 8 * param->i_fps_num / ( (double) param->i_fps_den * i_frame * 1000 );
-            if( i_frame_total )
-            {
-                int eta = i_elapsed * (i_frame_total - i_frame) / ((int64_t)i_frame * 1000000);
-                sprintf( buf, "x264 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
-                         100. * i_frame / i_frame_total, i_frame, i_frame_total, fps, bitrate,
-                         eta/3600, (eta/60)%60, eta%60 );
-            }
-            else
-            {
-                sprintf( buf, "x264 %d frames: %.2f fps, %.2f kb/s", i_frame, fps, bitrate );
-            }
-            fprintf( stderr, "%s  \r", buf+5 );
-            SetConsoleTitle( buf );
-            fflush( stderr ); // needed in windows
-        }
+        if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
     }
-    /* Flush delayed B-frames */
-    do {
-        i_file +=
-        i_frame_size = Encode_frame( h, opt->hout, NULL );
-    } while( i_frame_size );
+    /* Flush delayed frames */
+    while( !b_ctrl_c && x264_encoder_delayed_frames( h ) )
+    {
+        i_frame_size = Encode_frame( h, opt->hout, NULL, &last_pts );
+        if( i_frame_size < 0 )
+            return -1;
+        i_file += i_frame_size;
+        if( i_frame_size )
+            i_frame_output++;
+        if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
+            Print_status( i_start, i_frame_output, i_frame_total, i_file, param, last_pts );
+    }
+    if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
+        fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
+
+    /* duration algorithm fails when only 1 frame is output */
+    if( i_frame_output == 1 )
+        duration = (double)param->i_fps_den / param->i_fps_num;
+    else
+        duration = (double)(2 * largest_pts - second_largest_pts) * param->i_timebase_num / param->i_timebase_den;
+    duration *= dts_compress_multiplier;
 
     i_end = x264_mdate();
-    x264_picture_clean( &pic );
+    input.picture_clean( &pic );
     /* Erase progress indicator before printing encoding stats. */
     if( opt->b_progress )
         fprintf( stderr, "                                                                               \r" );
     x264_encoder_close( h );
-    x264_free( mux_buffer );
     fprintf( stderr, "\n" );
 
     if( b_ctrl_c )
-        fprintf( stderr, "aborted at input frame %d\n", opt->i_seek + i_frame );
+        fprintf( stderr, "aborted at input frame %d, output frame %d\n", opt->i_seek + i_frame, i_frame_output );
 
-    p_close_infile( opt->hin );
-    p_close_outfile( opt->hout );
+    input.close_file( opt->hin );
+    output.close_file( opt->hout, largest_pts, second_largest_pts );
 
-    if( i_frame > 0 )
+    if( i_frame_output > 0 )
     {
-        double fps = (double)i_frame * (double)1000000 /
+        double fps = (double)i_frame_output * (double)1000000 /
                      (double)( i_end - i_start );
 
-        fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame, fps,
-                 (double) i_file * 8 * param->i_fps_num /
-                 ( (double) param->i_fps_den * i_frame * 1000 ) );
+        fprintf( stderr, "encoded %d frames, %.2f fps, %.2f kb/s\n", i_frame_output, fps,
+                 (double) i_file * 8 / ( 1000 * duration ) );
     }
 
     return 0;
diff --git a/x264.h b/x264.h
index 26ac421..2550864 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 67
+#define X264_BUILD 84
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 #define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
 #define X264_CPU_LZCNT          0x010000  /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6          0x020000
+#define X264_CPU_NEON           0x040000  /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC  0x080000  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
 
 /* Analyse flags
  */
@@ -83,18 +86,25 @@ typedef struct x264_t x264_t;
 #define X264_CQM_FLAT                0
 #define X264_CQM_JVT                 1
 #define X264_CQM_CUSTOM              2
-#define X264_RC_NONE                 -1
 #define X264_RC_CQP                  0
 #define X264_RC_CRF                  1
 #define X264_RC_ABR                  2
 #define X264_AQ_NONE                 0
 #define X264_AQ_VARIANCE             1
+#define X264_AQ_AUTOVARIANCE         2
 #define X264_B_ADAPT_NONE            0
 #define X264_B_ADAPT_FAST            1
 #define X264_B_ADAPT_TRELLIS         2
+#define X264_WEIGHTP_NONE            0
+#define X264_WEIGHTP_BLIND           1
+#define X264_WEIGHTP_SMART           2
+#define X264_B_PYRAMID_NONE          0
+#define X264_B_PYRAMID_STRICT        1
+#define X264_B_PYRAMID_NORMAL        2
 
 static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
 static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
+static const char * const x264_b_pyramid_names[] = { "none", "strict", "normal", 0 };
 static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
 static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
 static const char * const x264_fullrange_names[] = { "off", "on", 0 };
@@ -103,8 +113,7 @@ static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "b
 static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 };
 
 /* Colorspace type
- * legacy only; nothing other than I420 is really supported.
- */
+ * legacy only; nothing other than I420 is really supported. */
 #define X264_CSP_MASK           0x00ff  /* */
 #define X264_CSP_NONE           0x0000  /* Invalid mode     */
 #define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
@@ -118,8 +127,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
 #define X264_CSP_MAX            0x0009  /* end of list */
 #define X264_CSP_VFLIP          0x1000  /* */
 
-/* Slice type
- */
+/* Slice type */
 #define X264_TYPE_AUTO          0x0000  /* Let x264 choose the right type */
 #define X264_TYPE_IDR           0x0001
 #define X264_TYPE_I             0x0002
@@ -129,14 +137,17 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
 #define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR)
 #define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF)
 
-/* Log level
- */
+/* Log level */
 #define X264_LOG_NONE          (-1)
 #define X264_LOG_ERROR          0
 #define X264_LOG_WARNING        1
 #define X264_LOG_INFO           2
 #define X264_LOG_DEBUG          3
 
+/* Threading */
+#define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */
+#define X264_SYNC_LOOKAHEAD_AUTO (-1) /* Automatically select optimal lookahead thread buffer size */
+
 /* Zones: override ratecontrol or other options for specific sections of the video.
  * See x264_encoder_reconfig() for which options can be changed.
  * If zones overlap, whichever comes later in the list takes precedence. */
@@ -154,7 +165,9 @@ typedef struct x264_param_t
     /* CPU flags */
     unsigned int cpu;
     int         i_threads;       /* encode multiple frames in parallel */
+    int         b_sliced_threads;  /* Whether to use slice-based threading. */
     int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
+    int         i_sync_lookahead; /* threaded lookahead buffer */
 
     /* Video Properties */
     int         i_width;
@@ -188,10 +201,12 @@ typedef struct x264_param_t
     int         i_keyint_max;       /* Force an IDR keyframe at this interval */
     int         i_keyint_min;       /* Scenecuts closer together than this are coded as I, not IDR. */
     int         i_scenecut_threshold; /* how aggressively to insert extra I frames */
+    int         b_intra_refresh;    /* Whether or not to use periodic intra refresh instead of IDR frames. */
+
     int         i_bframe;   /* how many b-frame between 2 references pictures */
     int         i_bframe_adaptive;
     int         i_bframe_bias;
-    int         b_bframe_pyramid;   /* Keep some B-frames as references */
+    int         i_bframe_pyramid;   /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
 
     int         b_deblocking_filter;
     int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
@@ -201,6 +216,7 @@ typedef struct x264_param_t
     int         i_cabac_init_idc;
 
     int         b_interlaced;
+    int         b_constrained_intra;
 
     int         i_cqm_preset;
     char        *psz_cqm_file;      /* JM format */
@@ -225,6 +241,7 @@ typedef struct x264_param_t
         unsigned int inter;     /* inter partitions */
 
         int          b_transform_8x8;
+        int          i_weighted_pred; /* weighting for P-frames */
         int          b_weighted_bipred; /* implicit weighting for B-frames */
         int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
         int          i_chroma_qp_offset;
@@ -242,6 +259,7 @@ typedef struct x264_param_t
         int          i_noise_reduction; /* adaptive pseudo-deadzone */
         float        f_psy_rd; /* Psy RD strength */
         float        f_psy_trellis; /* Psy trellis strength */
+        int          b_psy; /* Toggle all psy optimizations */
 
         /* the deadzone size that will be used in luma quantization */
         int          i_luma_deadzone[2]; /* {inter, intra} */
@@ -271,6 +289,8 @@ typedef struct x264_param_t
 
         int         i_aq_mode;      /* psy adaptive QP. (X264_AQ_*) */
         float       f_aq_strength;
+        int         b_mb_tree;      /* Macroblock-tree ratecontrol. */
+        int         i_lookahead;
 
         /* 2pass */
         int         b_stat_write;   /* Enable stat writing in psz_stat_out */
@@ -290,7 +310,26 @@ typedef struct x264_param_t
     /* Muxing parameters */
     int b_aud;                  /* generate access unit delimiters */
     int b_repeat_headers;       /* put SPS/PPS before each keyframe */
+    int b_annexb;               /* if set, place start codes (4 bytes) before NAL units,
+                                 * otherwise place size (4 bytes) before NAL units. */
     int i_sps_id;               /* SPS and PPS id number */
+    int b_vfr_input;            /* VFR input */
+    int i_timebase_num;         /* Timebase numerator */
+    int i_timebase_den;         /* Timebase denominator */
+    int b_dts_compress;         /* DTS compression: this algorithm eliminates negative DTS
+                                 * by compressing them to be less than the second PTS.
+                                 * Warning: this will change the timebase! */
+
+    /* Slicing parameters */
+    int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
+    int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
+    int i_slice_count;       /* Number of slices per frame: forces rectangular slices. */
+
+    /* Optional callback for freeing this x264_param_t when it is done being used.
+     * Only used when the x264_param_t sits in memory for an indefinite period of time,
+     * i.e. when an x264_param_t is passed to x264_t in an x264_picture_t or in zones.
+     * Not used when x264_encoder_reconfig is called directly. */
+    void (*param_free)( void* );
 } x264_param_t;
 
 typedef struct {
@@ -349,16 +388,32 @@ typedef struct
     int     i_type;
     /* In: force quantizer for > 0 */
     int     i_qpplus1;
+    /* Out: whether this frame is a keyframe.  Important when using modes that result in
+     * SEI recovery points being used instead of IDR frames. */
+    int     b_keyframe;
     /* In: user pts, Out: pts of encoded picture (user)*/
     int64_t i_pts;
-
+    /* Out: frame dts. Since the pts of the first frame is always zero,
+     *      initial frames may have a negative dts which must be dealt with by any muxer */
+    int64_t i_dts;
+    /* In: custom encoding parameters to be set from this frame forwards
+           (in coded order, not display order). If NULL, continue using
+           parameters from the previous frame.  Some parameters, such as
+           aspect ratio, can only be changed per-GOP due to the limitations
+           of H.264 itself; in this case, the caller must force an IDR frame
+           if it needs the changed parameter to apply immediately. */
+    x264_param_t *param;
     /* In: raw data */
     x264_image_t img;
+    /* private user data. libx264 doesn't touch this,
+       not even copy it from input to output frames. */
+    void *opaque;
 } x264_picture_t;
 
 /* x264_picture_alloc:
- *  alloc data for a picture. You must call x264_picture_clean on it. */
-void x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
+ *  alloc data for a picture. You must call x264_picture_clean on it.
+ *  returns 0 on success, or -1 on malloc failure. */
+int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
 
 /* x264_picture_clean:
  *  free associated resource for a x264_picture_t allocated with
@@ -391,41 +446,72 @@ enum nal_priority_e
     NAL_PRIORITY_HIGHEST    = 3,
 };
 
+/* The data within the payload is already NAL-encapsulated; the ref_idc and type
+ * are merely in the struct for easy access by the calling application.
+ * All data returned in an x264_nal_t, including the data in p_payload, is no longer
+ * valid after the next call to x264_encoder_encode.  Thus it must be used or copied
+ * before calling x264_encoder_encode or x264_encoder_headers again. */
 typedef struct
 {
     int i_ref_idc;  /* nal_priority_e */
     int i_type;     /* nal_unit_type_e */
 
-    /* This data are raw payload */
+    /* Size of payload in bytes. */
     int     i_payload;
+    /* If param->b_annexb is set, Annex-B bytestream with 4-byte startcode.
+     * Otherwise, startcode is replaced with a 4-byte size.
+     * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
     uint8_t *p_payload;
 } x264_nal_t;
 
-/* x264_nal_encode:
- *      encode a nal into a buffer, setting the size.
- *      if b_annexeb then a long synch work is added
- *      XXX: it currently doesn't check for overflow */
-int x264_nal_encode( void *, int *, int b_annexeb, x264_nal_t *nal );
-
 /****************************************************************************
  * Encoder functions:
  ****************************************************************************/
 
+/* Force a link error in the case of linking against an incompatible API version.
+ * Glue #defines exist to force correct macro expansion; the final output of the macro
+ * is x264_encoder_open_##X264_BUILD (for purposes of dlopen). */
+#define x264_encoder_glue1(x,y) x##y
+#define x264_encoder_glue2(x,y) x264_encoder_glue1(x,y)
+#define x264_encoder_open x264_encoder_glue2(x264_encoder_open_,X264_BUILD)
+
 /* x264_encoder_open:
  *      create a new encoder handler, all parameters from x264_param_t are copied */
-x264_t *x264_encoder_open   ( x264_param_t * );
+x264_t *x264_encoder_open( x264_param_t * );
+
 /* x264_encoder_reconfig:
- *      change encoder options while encoding,
- *      analysis-related parameters from x264_param_t are copied */
+ *      analysis-related parameters from x264_param_t are copied.
+ *      this takes effect immediately, on whichever frame is encoded next;
+ *      due to delay, this may not be the next frame passed to encoder_encode.
+ *      if the change should apply to some particular frame, use x264_picture_t->param instead.
+ *      returns 0 on success, negative on parameter validation error. */
 int     x264_encoder_reconfig( x264_t *, x264_param_t * );
+/* x264_encoder_parameters:
+ *      copies the current internal set of parameters to the pointer provided
+ *      by the caller.  useful when the calling application needs to know
+ *      how x264_encoder_open has changed the parameters, or the current state
+ *      of the encoder after multiple x264_encoder_reconfig calls.
+ *      note that the data accessible through pointers in the returned param struct
+ *      (e.g. filenames) should not be modified by the calling application. */
+void    x264_encoder_parameters( x264_t *, x264_param_t * );
 /* x264_encoder_headers:
- *      return the SPS and PPS that will be used for the whole stream */
+ *      return the SPS and PPS that will be used for the whole stream.
+ *      if i_nal > 0, returns the total size of all NAL payloads.
+ *      returns negative on error.
+ *      the payloads of all output NALs are guaranteed to be sequential in memory. */
 int     x264_encoder_headers( x264_t *, x264_nal_t **, int * );
 /* x264_encoder_encode:
- *      encode one picture */
+ *      encode one picture.
+ *      if i_nal > 0, returns the total size of all NAL payloads.
+ *      returns negative on error, zero if no NAL units returned.
+ *      the payloads of all output NALs are guaranteed to be sequential in memory. */
 int     x264_encoder_encode ( x264_t *, x264_nal_t **, int *, x264_picture_t *, x264_picture_t * );
 /* x264_encoder_close:
  *      close an encoder handler */
 void    x264_encoder_close  ( x264_t * );
+/* x264_encoder_delayed_frames:
+ *      return the number of currently delayed (buffered) frames.
+ *      this should be used at the end of the stream, to know when you have all the encoded frames. */
+int     x264_encoder_delayed_frames( x264_t * );
 
 #endif
diff --git a/common/visualize.h b/x264dll.c
similarity index 56%
copy from common/visualize.h
copy to x264dll.c
index b611f6c..2b6524d 100644
--- a/common/visualize.h
+++ b/x264dll.c
@@ -1,7 +1,9 @@
 /*****************************************************************************
- * x264: h264 encoder
+ * x264dll: x264 DLLMain for win32
  *****************************************************************************
- * Copyright (C) 2005 Tuukka Toivonen <tuukkat at ee.oulu.fi>
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: Anton Mitrofanov <BugMaster at narod.ru>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,14 +20,32 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/
 
-#ifndef X264_VISUALIZE_H
-#define X264_VISUALIZE_H
-
 #include "common/common.h"
+#include <windows.h>
+
+/* Callback for our DLL so we can initialize pthread; only acts when PTW32_STATIC_LIB is defined, always returns TRUE */
+BOOL WINAPI DllMain( HANDLE hinstDLL, DWORD fdwReason, LPVOID lpvReserved )
+{
+#ifdef PTW32_STATIC_LIB
+    switch( fdwReason )
+    {
+        case DLL_PROCESS_ATTACH:
+            pthread_win32_process_attach_np(); /* no break: falls through so the thread attach below also runs */
 
-void x264_visualize_init( x264_t *h );
-void x264_visualize_mb( x264_t *h );
-void x264_visualize_show( x264_t *h );
-void x264_visualize_close( x264_t *h );
+        case DLL_THREAD_ATTACH:
+            pthread_win32_thread_attach_np();
+            break;
 
+        case DLL_THREAD_DETACH:
+            pthread_win32_thread_detach_np();
+            break;
+
+        case DLL_PROCESS_DETACH:
+            pthread_win32_thread_detach_np(); /* thread detach is called first, then process detach */
+            pthread_win32_process_detach_np();
+            break;
+    }
 #endif
+
+    return TRUE;
+}

-- 
x264 packaging