[SCM] libvpx/upstream: Imported Upstream version 1.6.0

Wed Aug 10 21:33:17 UTC 2016

The following commit has been merged in the upstream branch:
commit 7bfa49d306a8547fae6b54434686e6d3335b4e0d
Author: Ondřej Nový <onovy at debian.org>
Date:   Wed Aug 10 23:27:45 2016 +0200

    Imported Upstream version 1.6.0

diff --git a/.mailmap b/.mailmap
index 42f3617..94cb1ec 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,27 +1,28 @@
 Adrian Grange <agrange at google.com>
-Adrian Grange <agrange at google.com> <agrange at agrange-macbookpro.roam.corp.google.com>
 Aℓex Converse <aconverse at google.com>
 Aℓex Converse <aconverse at google.com> <alex.converse at gmail.com>
 Alexis Ballier <aballier at gentoo.org> <alexis.ballier at gmail.com>
 Alpha Lam <hclam at google.com> <hclam at chromium.org>
+Daniele Castagna <dcastagna at chromium.org> <dcastagna at google.com>
 Deb Mukherjee <debargha at google.com>
 Erik Niemeyer <erik.a.niemeyer at intel.com> <erik.a.niemeyer at gmail.com>
 Guillaume Martres <gmartres at google.com> <smarter3 at gmail.com>
 Hangyu Kuang <hkuang at google.com>
-Hangyu Kuang <hkuang at google.com> <hkuang at hkuang-macbookpro.roam.corp.google.com>
 Hui Su <huisu at google.com>
 Jacky Chen <jackychen at google.com>
 Jim Bankoski <jimbankoski at google.com>
 Johann Koenig <johannkoenig at google.com>
 Johann Koenig <johannkoenig at google.com> <johann.koenig at duck.com>
-Johann Koenig <johannkoenig at google.com> <johannkoenig at dhcp-172-19-7-52.mtv.corp.google.com>
 Johann Koenig <johannkoenig at google.com> <johann.koenig at gmail.com>
+Johann Koenig <johannkoenig at google.com> <johannkoenig at chromium.org>
 John Koleszar <jkoleszar at google.com>
 Joshua Litt <joshualitt at google.com> <joshualitt at chromium.org>
 Marco Paniconi <marpan at google.com>
 Marco Paniconi <marpan at google.com> <marpan at chromium.org>
 Pascal Massimino <pascal.massimino at gmail.com>
 Paul Wilkins <paulwilkins at google.com>
+Peter de Rivaz <peter.derivaz at gmail.com>
+Peter de Rivaz <peter.derivaz at gmail.com> <peter.derivaz at argondesign.com>
 Ralph Giles <giles at xiph.org> <giles at entropywave.com>
 Ralph Giles <giles at xiph.org> <giles at mozilla.com>
 Ronald S. Bultje <rsbultje at gmail.com> <rbultje at google.com>
@@ -29,8 +30,8 @@ Sami Pietilä <samipietila at google.com>
 Tamar Levy <tamar.levy at intel.com>
 Tamar Levy <tamar.levy at intel.com> <levytamar82 at gmail.com>
 Tero Rintaluoma <teror at google.com> <tero.rintaluoma at on2.com>
-Timothy B. Terriberry <tterribe at xiph.org> Tim Terriberry <tterriberry at mozilla.com>
+Timothy B. Terriberry <tterribe at xiph.org> <tterriberry at mozilla.com>
 Tom Finegan <tomfinegan at google.com>
 Tom Finegan <tomfinegan at google.com> <tomfinegan at chromium.org>
 Yaowu Xu <yaowu at google.com> <yaowu at xuyaowu.com>
-Yaowu Xu <yaowu at google.com> <yaowu at YAOWU2-W.ad.corp.google.com>
+Yaowu Xu <yaowu at google.com> <Yaowu Xu>
diff --git a/AUTHORS b/AUTHORS
index f89b677..fcd5c53 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -24,6 +24,7 @@ changjun.yang <changjun.yang at intel.com>
 Charles 'Buck' Krasic <ckrasic at google.com>
 chm <chm at rock-chips.com>
 Christian Duvivier <cduvivier at google.com>
+Daniele Castagna <dcastagna at chromium.org>
 Daniel Kang <ddkang at google.com>
 Deb Mukherjee <debargha at google.com>
 Dim Temp <dimtemp0 at gmail.com>
@@ -56,10 +57,12 @@ James Zern <jzern at google.com>
 Jan Gerber <j at mailb.org>
 Jan Kratochvil <jan.kratochvil at redhat.com>
 Janne Salonen <jsalonen at google.com>
+Jean-Yves Avenard <jyavenard at mozilla.com>
 Jeff Faust <jfaust at google.com>
 Jeff Muizelaar <jmuizelaar at mozilla.com>
 Jeff Petkau <jpet at chromium.org>
 Jia Jia <jia.jia at linaro.org>
+Jian Zhou <zhoujian at google.com>
 Jim Bankoski <jimbankoski at google.com>
 Jingning Han <jingning at google.com>
 Joey Parrish <joeyparrish at google.com>
@@ -74,6 +77,7 @@ Justin Clift <justin at salasaga.org>
 Justin Lebar <justin.lebar at gmail.com>
 KO Myung-Hun <komh at chollian.net>
 Lawrence Velázquez <larryv at macports.org>
+Linfeng Zhang <linfengz at google.com>
 Lou Quillio <louquillio at google.com>
 Luca Barbato <lu_zero at gentoo.org>
 Makoto Kato <makoto.kt at gmail.com>
@@ -107,9 +111,11 @@ Rob Bradford <rob at linux.intel.com>
 Ronald S. Bultje <rsbultje at gmail.com>
 Rui Ueyama <ruiu at google.com>
 Sami Pietilä <samipietila at google.com>
+Sasi Inguva <isasi at google.com>
 Scott Graham <scottmg at chromium.org>
 Scott LaVarnway <slavarnway at google.com>
 Sean McGovern <gseanmcg at gmail.com>
+Sergey Kolomenkin <kolomenkin at gmail.com>
 Sergey Ulanov <sergeyu at chromium.org>
 Shimon Doodkin <helpmepro1 at gmail.com>
 Shunyao Li <shunyaoli at google.com>
@@ -126,8 +132,10 @@ Timothy B. Terriberry <tterribe at xiph.org>
 Tom Finegan <tomfinegan at google.com>
 Vignesh Venkatasubramanian <vigneshv at google.com>
 Yaowu Xu <yaowu at google.com>
+Yi Luo <luoyi at google.com>
 Yongzhe Wang <yongzhe at google.com>
 Yunqing Wang <yunqingwang at google.com>
+Yury Gitman <yuryg at google.com>
 Zoe Liu <zoeliu at google.com>
 Google Inc.
 The Mozilla Foundation
diff --git a/CHANGELOG b/CHANGELOG
index 7746cc6..795d395 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,33 @@
+2016-07-20 v1.6.0 "Khaki Campbell Duck"
+  This release improves upon the VP9 encoder and speeds up the encoding and
+  decoding processes.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.5.0 due to a new 'color_range' enum
+    in vpx_image and some minor changes to the VP8_COMP structure.
+
+    The default key frame interval for VP9 has changed from 128 to 9999.
+
+  - Enhancement:
+    A core focus has been performance for low end Intel processors. SSSE3
+    instructions such as 'pshufb' have been avoided and instructions have been
+    reordered to better accommodate the more constrained pipelines.
+
+    As a result, devices based on Celeron processors have seen substantial
+    decoding improvements. From Indian Runner Duck to Javan Whistling Duck,
+    decoding speed improved between 10 and 30%. Between Javan Whistling Duck
+    and Khaki Campbell Duck, it improved another 10 to 15%.
+
+    While Celeron benefited most, Core-i5 also improved 5% and 10% between the
+    respective releases.
+
+    Realtime performance for WebRTC for both speed and quality has received a
+    lot of attention.
+
+  - Bug Fixes:
+    A number of fuzzing issues, found variously by Mozilla, Chromium and others,
+    have been fixed and we strongly recommend updating.
+
 2015-11-09 v1.5.0 "Javan Whistling Duck"
   This release improves upon the VP9 encoder and speeds up the encoding and
   decoding processes.
diff --git a/README b/README
index 979440e..a8e6aeb 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README - 23 March 2015
+README - 20 July 2016
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -47,7 +47,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
   --help output of the configure script. As of this writing, the list of
   available targets is:
 
-    armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-none-rvct
diff --git a/build/make/Android.mk b/build/make/Android.mk
index df01dec..9eb6dd2 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -174,9 +174,6 @@ endif
 ifeq ($(CONFIG_VP9), yes)
 $$(rtcd_dep_template_SRCS): vp9_rtcd.h
 endif
-ifeq ($(CONFIG_VP10), yes)
-$$(rtcd_dep_template_SRCS): vp10_rtcd.h
-endif
 $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
 
diff --git a/build/make/Makefile b/build/make/Makefile
index 3081a92..3e8c024 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -119,29 +119,25 @@ utiltest:
 test-no-data-check::
 exampletest-no-data-check utiltest-no-data-check:
 
-# Add compiler flags for intrinsic files
+# Force to realign stack always on OS/2
 ifeq ($(TOOLCHAIN), x86-os2-gcc)
-STACKREALIGN=-mstackrealign
-else
-STACKREALIGN=
+CFLAGS += -mstackrealign
 endif
 
 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.d: CFLAGS += $(STACKREALIGN)
-$(BUILD_PFX)%vp9_reconintra.c.o: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet), at echo "    [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c592b63..4f0071b 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -185,6 +185,7 @@ add_extralibs() {
 #
 # Boolean Manipulation Functions
 #
+
 enable_feature(){
   set_all yes $*
 }
@@ -201,6 +202,20 @@ disabled(){
   eval test "x\$$1" = "xno"
 }
 
+enable_codec(){
+  enabled "${1}" || echo "  enabling ${1}"
+  enable_feature "${1}"
+
+  is_in "${1}" vp8 vp9 && enable_feature "${1}_encoder" "${1}_decoder"
+}
+
+disable_codec(){
+  disabled "${1}" || echo "  disabling ${1}"
+  disable_feature "${1}"
+
+  is_in "${1}" vp8 vp9 && disable_feature "${1}_encoder" "${1}_decoder"
+}
+
 # Iterates through positional parameters, checks to confirm the parameter has
 # not been explicitly (force) disabled, and enables the setting controlled by
 # the parameter when the setting is not disabled.
@@ -521,22 +536,20 @@ process_common_cmdline() {
         ;;
       --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
           [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
         elif [ $action = "disable" ] && ! disabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  disabling $option"
         elif [ $action = "enable" ] && ! enabled $option ; then
-          echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
-            die_unknown $opt
+          is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
           log_echo "  enabling $option"
         fi
         ${action}_feature $option
         ;;
       --require-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${ARCH_EXT_LIST}; then
             RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
         else
             die_unknown $opt
@@ -638,16 +651,39 @@ show_darwin_sdk_major_version() {
   xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
 }
 
+# Print the Xcode version.
+show_xcode_version() {
+  xcodebuild -version | head -n1 | cut -d' ' -f2
+}
+
+# Fails when Xcode version is less than 6.3.
+check_xcode_minimum_version() {
+  xcode_major=$(show_xcode_version | cut -f1 -d.)
+  xcode_minor=$(show_xcode_version | cut -f2 -d.)
+  xcode_min_major=6
+  xcode_min_minor=3
+  if [ ${xcode_major} -lt ${xcode_min_major} ]; then
+    return 1
+  fi
+  if [ ${xcode_major} -eq ${xcode_min_major} ] \
+    && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then
+    return 1
+  fi
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
 
     # detect tgt_isa
     case "$gcctarget" in
+      aarch64*)
+        tgt_isa=arm64
+        ;;
       armv6*)
         tgt_isa=armv6
         ;;
-      armv7*-hardfloat*)
+      armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
         tgt_isa=armv7
         float_abi=hard
         ;;
@@ -688,6 +724,10 @@ process_common_toolchain() {
         tgt_isa=x86_64
         tgt_os=darwin14
         ;;
+      *darwin15*)
+        tgt_isa=x86_64
+        tgt_os=darwin15
+        ;;
       x86_64*mingw32*)
         tgt_os=win64
         ;;
@@ -744,7 +784,14 @@ process_common_toolchain() {
   enabled shared && soft_enable pic
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
-  IOS_VERSION_MIN="6.0"
+  # Shared library framework builds are only possible on iOS 8 and later.
+  if enabled shared; then
+    IOS_VERSION_OPTIONS="--enable-shared"
+    IOS_VERSION_MIN="8.0"
+  else
+    IOS_VERSION_OPTIONS=""
+    IOS_VERSION_MIN="6.0"
+  fi
 
   # Handle darwin variants. Newer SDKs allow targeting older
   # platforms, so use the newest one available.
@@ -795,6 +842,10 @@ process_common_toolchain() {
       add_cflags  "-mmacosx-version-min=10.10"
       add_ldflags "-mmacosx-version-min=10.10"
       ;;
+    *-darwin15-*)
+      add_cflags  "-mmacosx-version-min=10.11"
+      add_ldflags "-mmacosx-version-min=10.11"
+      ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -869,7 +920,6 @@ process_common_toolchain() {
 
       case ${tgt_cc} in
         gcc)
-          CROSS=${CROSS:-arm-none-linux-gnueabi-}
           link_with_cc=gcc
           setup_gnu_toolchain
           arch_int=${tgt_isa##armv}
@@ -891,6 +941,9 @@ EOF
               check_add_cflags -mfpu=neon #-ftree-vectorize
               check_add_asflags -mfpu=neon
             fi
+          elif [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+            check_add_cflags -march=armv8-a
+            check_add_asflags -march=armv8-a
           else
             check_add_cflags -march=${tgt_isa}
             check_add_asflags -march=${tgt_isa}
@@ -958,6 +1011,10 @@ EOF
           ;;
 
         android*)
+          if [ -z "${sdk_path}" ]; then
+            die "Must specify --sdk-path for Android builds."
+          fi
+
           SDK_PATH=${sdk_path}
           COMPILER_LOCATION=`find "${SDK_PATH}" \
                              -name "arm-linux-androideabi-gcc*" -print -quit`
@@ -979,8 +1036,10 @@ EOF
                 awk '{ print $1 }' | tail -1`
           fi
 
-          add_cflags "--sysroot=${alt_libc}"
-          add_ldflags "--sysroot=${alt_libc}"
+          if [ -d "${alt_libc}" ]; then
+            add_cflags "--sysroot=${alt_libc}"
+            add_ldflags "--sysroot=${alt_libc}"
+          fi
 
           # linker flag that routes around a CPU bug in some
           # Cortex-A8 implementations (NDK Dev Guide)
@@ -1006,18 +1065,7 @@ EOF
           NM="$(${XCRUN_FIND} nm)"
           RANLIB="$(${XCRUN_FIND} ranlib)"
           AS_SFX=.s
-
-          # Special handling of ld for armv6 because libclang_rt.ios.a does
-          # not contain armv6 support in Apple's clang package:
-          #   Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
-          # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
-          # renders support for armv6 unnecessary because the 3GS and up
-          # support neon.
-          if [ "${tgt_isa}" = "armv6" ]; then
-            LD="$(${XCRUN_FIND} ld)"
-          else
-            LD="${CXX:-$(${XCRUN_FIND} ld)}"
-          fi
+          LD="${CXX:-$(${XCRUN_FIND} ld)}"
 
           # ASFLAGS is written here instead of using check_add_asflags
           # because we need to overwrite all of ASFLAGS and purge the
@@ -1043,6 +1091,19 @@ EOF
             [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
           done
 
+          case ${tgt_isa} in
+            armv7|armv7s|armv8|arm64)
+              if enabled neon && ! check_xcode_minimum_version; then
+                soft_disable neon
+                log_echo "  neon disabled: upgrade Xcode (need v6.3+)."
+                if enabled neon_asm; then
+                  soft_disable neon_asm
+                  log_echo "  neon_asm disabled: upgrade Xcode (need v6.3+)."
+                fi
+              fi
+              ;;
+          esac
+
           asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
 
           if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
@@ -1057,7 +1118,7 @@ EOF
           if enabled rvct; then
             # Check if we have CodeSourcery GCC in PATH. Needed for
             # libraries
-            hash arm-none-linux-gnueabi-gcc 2>&- || \
+            which arm-none-linux-gnueabi-gcc 2>&- || \
               die "Couldn't find CodeSourcery GCC from PATH"
 
             # Use armcc as a linker to enable translation of
@@ -1098,7 +1159,7 @@ EOF
             check_add_ldflags -mfp64
             ;;
           i6400)
-            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight 
+            check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
             check_add_cflags  -mload-store-pairs -mhard-float -mfp64
             check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
             check_add_ldflags -mips64r6 -mabi=64 -mfp64
@@ -1125,7 +1186,7 @@ EOF
           CC=${CC:-${CROSS}gcc}
           CXX=${CXX:-${CROSS}g++}
           LD=${LD:-${CROSS}gcc}
-          CROSS=${CROSS:-g}
+          CROSS=${CROSS-g}
           ;;
         os2)
           disable_feature pic
@@ -1178,6 +1239,12 @@ EOF
               soft_disable avx2
               ;;
           esac
+          case $vc_version in
+            7|8|9)
+              echo "${tgt_cc} omits stdint.h, disabling webm-io..."
+              soft_disable webm_io
+              ;;
+          esac
           ;;
       esac
 
@@ -1198,33 +1265,43 @@ EOF
       soft_enable runtime_cpu_detect
       # We can't use 'check_cflags' until the compiler is configured and CC is
       # populated.
-      check_gcc_machine_option mmx
-      check_gcc_machine_option sse
-      check_gcc_machine_option sse2
-      check_gcc_machine_option sse3
-      check_gcc_machine_option ssse3
-      check_gcc_machine_option sse4 sse4_1
-      check_gcc_machine_option avx
-      check_gcc_machine_option avx2
-
-      case "${AS}" in
-        auto|"")
-          which nasm >/dev/null 2>&1 && AS=nasm
-          which yasm >/dev/null 2>&1 && AS=yasm
-          if [ "${AS}" = nasm ] ; then
-            # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
-            # this check if they start shipping a compatible version.
-            apple=`nasm -v | grep "Apple"`
-            [ -n "${apple}" ] \
-              && echo "Unsupported version of nasm: ${apple}" \
-              && AS=""
+      for ext in ${ARCH_EXT_LIST_X86}; do
+        # disable higher order extensions to simplify asm dependencies
+        if [ "$disable_exts" = "yes" ]; then
+          if ! disabled $ext; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+            disable_feature $ext
           fi
-          [ "${AS}" = auto ] || [ -z "${AS}" ] \
-            && die "Neither yasm nor nasm have been found." \
-                   "See the prerequisites section in the README for more info."
-          ;;
-      esac
-      log_echo "  using $AS"
+        elif disabled $ext; then
+          disable_exts="yes"
+        else
+          # use the shortened version for the flag: sse4_1 -> sse4
+          check_gcc_machine_option ${ext%_*} $ext
+        fi
+      done
+
+      if enabled external_build; then
+        log_echo "  skipping assembler detection"
+      else
+        case "${AS}" in
+          auto|"")
+            which nasm >/dev/null 2>&1 && AS=nasm
+            which yasm >/dev/null 2>&1 && AS=yasm
+            if [ "${AS}" = nasm ] ; then
+              # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
+              # this check if they start shipping a compatible version.
+              apple=`nasm -v | grep "Apple"`
+              [ -n "${apple}" ] \
+                && echo "Unsupported version of nasm: ${apple}" \
+                && AS=""
+            fi
+            [ "${AS}" = auto ] || [ -z "${AS}" ] \
+              && die "Neither yasm nor nasm have been found." \
+                     "See the prerequisites section in the README for more info."
+            ;;
+        esac
+        log_echo "  using $AS"
+      fi
       [ "${AS##*/}" = nasm ] && add_asflags -Ox
       AS_SFX=.asm
       case  ${tgt_os} in
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 0cf335b..2b91fbf 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -193,7 +193,7 @@ for opt in "$@"; do
 done
 
 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 182ea28..e98611d 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -211,7 +211,7 @@ for opt in "$@"; do
 done
 
 # Make one call to fix_path for file_list to improve performance.
-fix_file_list
+fix_file_list file_list
 
 outfile=${outfile:-/dev/stdout}
 guid=${guid:-`generate_uuid`}
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist
new file mode 100644
index 0000000..d157b11
--- /dev/null
+++ b/build/make/ios-Info.plist
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>VPX</string>
+	<key>CFBundleIdentifier</key>
+	<string>org.webmproject.VPX</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>VPX</string>
+	<key>CFBundlePackageType</key>
+	<string>FMWK</string>
+	<key>CFBundleShortVersionString</key>
+	<string>${VERSION}</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleSupportedPlatforms</key>
+	<array>
+		<string>iPhoneOS</string>
+	</array>
+	<key>CFBundleVersion</key>
+	<string>${VERSION}</string>
+	<key>MinimumOSVersion</key>
+	<string>${IOS_VERSION_MIN}</string>
+	<key>UIDeviceFamily</key>
+	<array>
+		<integer>1</integer>
+		<integer>2</integer>
+	</array>
+	<key>VPXFullVersion</key>
+	<string>${FULLVERSION}</string>
+</dict>
+</plist>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d..c703f22 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -24,16 +24,20 @@ CONFIGURE_ARGS="--disable-docs
                 --disable-unit-tests"
 DIST_DIR="_dist"
 FRAMEWORK_DIR="VPX.framework"
+FRAMEWORK_LIB="VPX.framework/VPX"
 HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
 SCRIPT_DIR=$(dirname "$0")
 LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
 LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
-         armv7-darwin-gcc
-         armv7s-darwin-gcc
-         x86-iphonesimulator-gcc
-         x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+             armv7-darwin-gcc
+             armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+             x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+             x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
 
 # Configures for the target specified by $1, and invokes make with the dist
 # target using $DIST_DIR as the distribution output directory.
@@ -134,6 +138,44 @@ create_vpx_framework_config_shim() {
   printf "#endif  // ${include_guard}" >> "${config_file}"
 }
 
+# Verifies that $FRAMEWORK_LIB fat library contains requested builds.
+verify_framework_targets() {
+  local requested_cpus=""
+  local cpu=""
+
+  # Extract CPU from full target name.
+  for target; do
+    cpu="${target%%-*}"
+    if [ "${cpu}" = "x86" ]; then
+      # lipo -info outputs i386 for libvpx x86 targets.
+      cpu="i386"
+    fi
+    requested_cpus="${requested_cpus}${cpu} "
+  done
+
+  # Get target CPUs present in framework library.
+  local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
+
+  # $LIPO -info outputs a string like the following:
+  #   Architectures in the fat file: $FRAMEWORK_LIB <architectures>
+  # Capture only the architecture strings.
+  targets_built=${targets_built##*: }
+
+  # Sort CPU strings to make the next step a simple string compare.
+  local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
+  local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
+
+  vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
+  vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
+
+  if [ "${requested}" != "${actual}" ]; then
+    elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
+    elog "  Requested target CPUs: ${requested}"
+    elog "  Actual target CPUs: ${actual}"
+    return 1
+  fi
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
 build_framework() {
@@ -154,7 +196,12 @@ build_framework() {
   for target in ${targets}; do
     build_target "${target}"
     target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
-    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.a"
+    if [ "${ENABLE_SHARED}" = "yes" ]; then
+      local suffix="dylib"
+    else
+      local suffix="a"
+    fi
+    lib_list="${lib_list} ${target_dist_dir}/lib/libvpx.${suffix}"
   done
 
   cd "${ORIG_PWD}"
@@ -173,13 +220,25 @@ build_framework() {
   # Copy in vpx_version.h.
   cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
 
-  vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
+  if [ "${ENABLE_SHARED}" = "yes" ]; then
+    # Adjust the dylib's name so dynamic linking in apps works as expected.
+    install_name_tool -id '@rpath/VPX.framework/VPX' ${FRAMEWORK_DIR}/VPX
+
+    # Copy in Info.plist.
+    cat "${SCRIPT_DIR}/ios-Info.plist" \
+      | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
+      | sed "s/\${VERSION}/${VERSION}/g" \
+      | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
+      > "${FRAMEWORK_DIR}/Info.plist"
+  fi
+
+  # Confirm VPX.framework/VPX contains the targets requested.
+  verify_framework_targets ${targets}
+
+  vlog "Created fat library ${FRAMEWORK_LIB} containing:"
   for lib in ${lib_list}; do
     vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
   done
-
-  # TODO(tomfinegan): Verify that expected targets are included within
-  # VPX.framework/VPX via lipo -info.
 }
 
 # Trap function. Cleans up the subtree used to build all targets contained in
@@ -197,15 +256,28 @@ cleanup() {
   fi
 }
 
+print_list() {
+  local indent="$1"
+  shift
+  local list="$@"
+  for entry in ${list}; do
+    echo "${indent}${entry}"
+  done
+}
+
 iosbuild_usage() {
 cat << EOF
   Usage: ${0##*/} [arguments]
     --help: Display this message and exit.
+    --enable-shared: Build a dynamic framework for use on iOS 8 or later.
     --extra-configure-args <args>: Extra args to pass when configuring libvpx.
+    --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+              and x86_64. Allows linking to framework when builds target MacOSX
+              instead of iOS.
     --preserve-build-output: Do not delete the build directory.
     --show-build-output: Show output from each library build.
     --targets <targets>: Override default target list. Defaults:
-         ${TARGETS}
+$(print_list "        " ${TARGETS})
     --test-link: Confirms all targets can be linked. Functionally identical to
                  passing --enable-examples via --extra-configure-args.
     --verbose: Output information about the environment and each stage of the
@@ -236,6 +308,9 @@ while [ -n "$1" ]; do
       iosbuild_usage
       exit
       ;;
+    --enable-shared)
+      ENABLE_SHARED=yes
+      ;;
     --preserve-build-output)
       PRESERVE_BUILD_OUTPUT=yes
       ;;
@@ -249,6 +324,9 @@ while [ -n "$1" ]; do
       TARGETS="$2"
       shift
       ;;
+    --macosx)
+      TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+      ;;
     --verbose)
       VERBOSE=yes
       ;;
@@ -260,6 +338,21 @@ while [ -n "$1" ]; do
   shift
 done
 
+if [ "${ENABLE_SHARED}" = "yes" ]; then
+  CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
+fi
+
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
+if [ "$ENABLE_SHARED" = "yes" ]; then
+  IOS_VERSION_OPTIONS="--enable-shared"
+  IOS_VERSION_MIN="8.0"
+else
+  IOS_VERSION_OPTIONS=""
+  IOS_VERSION_MIN="6.0"
+fi
+
 if [ "${VERBOSE}" = "yes" ]; then
 cat << EOF
   BUILD_ROOT=${BUILD_ROOT}
@@ -267,16 +360,24 @@ cat << EOF
   CONFIGURE_ARGS=${CONFIGURE_ARGS}
   EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
   FRAMEWORK_DIR=${FRAMEWORK_DIR}
+  FRAMEWORK_LIB=${FRAMEWORK_LIB}
   HEADER_DIR=${HEADER_DIR}
   LIBVPX_SOURCE_DIR=${LIBVPX_SOURCE_DIR}
   LIPO=${LIPO}
   MAKEFLAGS=${MAKEFLAGS}
   ORIG_PWD=${ORIG_PWD}
   PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
-  TARGETS="${TARGETS}"
+  TARGETS="$(print_list "" ${TARGETS})"
+  ENABLE_SHARED=${ENABLE_SHARED}
+  OSX_TARGETS="${OSX_TARGETS}"
+  SIM_TARGETS="${SIM_TARGETS}"
+  SCRIPT_DIR="${SCRIPT_DIR}"
+  FULLVERSION="${FULLVERSION}"
+  VERSION="${VERSION}"
+  IOS_VERSION_MIN="${IOS_VERSION_MIN}"
 EOF
 fi
 
 build_framework "${TARGETS}"
 echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo "         ${TARGETS}"
+print_list "" ${TARGETS}
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
index 90c1488..88f1cf9 100644
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -39,11 +39,12 @@ fix_path() {
 }
 
 # Corrects the paths in file_list in one pass for efficiency.
+# $1 is the name of the array to be modified.
 fix_file_list() {
-    # TODO(jzern): this could be more generic and take the array as a param.
-    files=$(fix_path "${file_list[@]}")
+    declare -n array_ref=$1
+    files=$(fix_path "${array_ref[@]}")
     local IFS=$'\n'
-    file_list=($files)
+    array_ref=($files)
 }
 
 generate_uuid() {
diff --git a/build/make/version.sh b/build/make/version.sh
index b340142..6967527 100755
--- a/build/make/version.sh
+++ b/build/make/version.sh
@@ -24,8 +24,9 @@ out_file=${2}
 id=${3:-VERSION_STRING}
 
 git_version_id=""
-if [ -d "${source_path}/.git" ]; then
+if [ -e "${source_path}/.git" ]; then
     # Source Path is a git working copy. Check for local modifications.
+    # Note that git submodules may have a file as .git, not a directory.
     export GIT_DIR="${source_path}/.git"
     git_version_id=`git describe --match=v[0-9]* 2>/dev/null`
 fi
diff --git a/configure b/configure
index a40f3ab..f82ee04 100755
--- a/configure
+++ b/configure
@@ -35,9 +35,11 @@ Advanced options:
   ${toggle_debug_libs}            in/exclude debug version of libraries
   ${toggle_static_msvcrt}         use static MSVCRT (VS builds only)
   ${toggle_vp9_highbitdepth}      use VP9 high bit depth (10/12) profiles
+  ${toggle_better_hw_compatibility}
+                                  enable encoder to produce streams with better
+                                  hardware decoder compatibility
   ${toggle_vp8}                   VP8 codec support
   ${toggle_vp9}                   VP9 codec support
-  ${toggle_vp10}                  VP10 codec support
   ${toggle_internal_stats}        output of encoder internal stats for debug, if supported (encoders)
   ${toggle_postproc}              postprocessing
   ${toggle_vp9_postproc}          vp9 specific postprocessing
@@ -95,11 +97,11 @@ EOF
 
 # all_platforms is a list of all supported target platforms. Maintain
 # alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv6-darwin-gcc"
+all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
@@ -109,6 +111,7 @@ all_platforms="${all_platforms} armv7-win32-vs11"
 all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} armv7-win32-vs14"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
+all_platforms="${all_platforms} armv8-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} sparc-solaris-gcc"
@@ -122,6 +125,7 @@ all_platforms="${all_platforms} x86-darwin11-gcc"
 all_platforms="${all_platforms} x86-darwin12-gcc"
 all_platforms="${all_platforms} x86-darwin13-gcc"
 all_platforms="${all_platforms} x86-darwin14-gcc"
+all_platforms="${all_platforms} x86-darwin15-gcc"
 all_platforms="${all_platforms} x86-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86-linux-gcc"
 all_platforms="${all_platforms} x86-linux-icc"
@@ -142,6 +146,7 @@ all_platforms="${all_platforms} x86_64-darwin11-gcc"
 all_platforms="${all_platforms} x86_64-darwin12-gcc"
 all_platforms="${all_platforms} x86_64-darwin13-gcc"
 all_platforms="${all_platforms} x86_64-darwin14-gcc"
+all_platforms="${all_platforms} x86_64-darwin15-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@@ -190,12 +195,8 @@ if [ ${doxy_major:-0} -ge 1 ]; then
 fi
 
 # disable codecs when their source directory does not exist
-[ -d "${source_path}/vp8" ] || disable_feature vp8
-[ -d "${source_path}/vp9" ] || disable_feature vp9
-[ -d "${source_path}/vp10" ] || disable_feature vp10
-
-# disable vp10 codec by default
-disable_feature vp10
+[ -d "${source_path}/vp8" ] || disable_codec vp8
+[ -d "${source_path}/vp9" ] || disable_codec vp9
 
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
@@ -217,13 +218,10 @@ CODECS="
     vp8_decoder
     vp9_encoder
     vp9_decoder
-    vp10_encoder
-    vp10_decoder
 "
 CODEC_FAMILIES="
     vp8
     vp9
-    vp10
 "
 
 ARCH_LIST="
@@ -232,6 +230,16 @@ ARCH_LIST="
     x86
     x86_64
 "
+ARCH_EXT_LIST_X86="
+    mmx
+    sse
+    sse2
+    sse3
+    ssse3
+    sse4_1
+    avx
+    avx2
+"
 ARCH_EXT_LIST="
     edsp
     media
@@ -243,21 +251,12 @@ ARCH_EXT_LIST="
     msa
     mips64
 
-    mmx
-    sse
-    sse2
-    sse3
-    ssse3
-    sse4_1
-    avx
-    avx2
+    ${ARCH_EXT_LIST_X86}
 "
 HAVE_LIST="
     ${ARCH_EXT_LIST}
     vpx_ports
-    stdint_h
     pthread_h
-    sys_mman_h
     unistd_h
 "
 EXPERIMENT_LIST="
@@ -317,6 +316,7 @@ CONFIG_LIST="
     vp9_temporal_denoising
     coefficient_range_checking
     vp9_highbitdepth
+    better_hw_compatibility
     experimental
     size_limit
     ${EXPERIMENT_LIST}
@@ -375,6 +375,7 @@ CMDLINE_SELECT="
     temporal_denoising
     vp9_temporal_denoising
     coefficient_range_checking
+    better_hw_compatibility
     vp9_highbitdepth
     experimental
 "
@@ -383,15 +384,19 @@ process_cmdline() {
     for opt do
         optval="${opt#*=}"
         case "$opt" in
-        --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+        --disable-codecs)
+          for c in ${CODEC_FAMILIES}; do disable_codec $c; done
+          ;;
         --enable-?*|--disable-?*)
         eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
-        if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+        if is_in ${option} ${EXPERIMENT_LIST}; then
             if enabled experimental; then
                 ${action}_feature $option
             else
                 log_echo "Ignoring $opt -- not in experimental mode."
             fi
+        elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
+            ${action}_codec ${option}
         else
             process_common_cmdline $opt
         fi
@@ -405,14 +410,6 @@ process_cmdline() {
 post_process_cmdline() {
     c=""
 
-    # If the codec family is disabled, disable all components of that family.
-    # If the codec family is enabled, enable all components of that family.
-    log_echo "Configuring selected codecs"
-    for c in ${CODECS}; do
-        disabled ${c%%_*} && disable_feature ${c}
-        enabled ${c%%_*} && enable_feature ${c}
-    done
-
     # Enable all detected codecs, if they haven't been disabled
     for c in ${CODECS}; do soft_enable $c; done
 
@@ -507,13 +504,18 @@ process_detect() {
         # Can only build shared libs on a subset of platforms. Doing this check
         # here rather than at option parse time because the target auto-detect
         # magic happens after the command line has been parsed.
-        if ! enabled linux && ! enabled os2; then
+        case "${tgt_os}" in
+        linux|os2|darwin*|iphonesimulator*)
+            # Supported platforms
+            ;;
+        *)
             if enabled gnu; then
                 echo "--enable-shared is only supported on ELF; assuming this is OK"
             else
-                die "--enable-shared only supported on ELF and OS/2 for now"
+                die "--enable-shared only supported on ELF, OS/2, and Darwin for now"
             fi
-        fi
+            ;;
+        esac
     fi
     if [ -z "$CC" ] || enabled external_build; then
         echo "Bypassing toolchain for environment detection."
@@ -540,16 +542,12 @@ process_detect() {
             # Specialize windows and POSIX environments.
             case $toolchain in
                 *-win*-*)
-                    case $header-$toolchain in
-                        stdint*-gcc) true;;
-                        *) false;;
-                    esac && enable_feature $var
-                    ;;
+                    # Don't check for any headers in Windows builds.
+                    false
+                ;;
                 *)
                     case $header in
-                        stdint.h) true;;
                         pthread.h) true;;
-                        sys/mman.h) true;;
                         unistd.h) true;;
                         *) false;;
                     esac && enable_feature $var
@@ -565,9 +563,7 @@ process_detect() {
 int main(void) {return 0;}
 EOF
     # check system headers
-    check_header stdint.h
     check_header pthread.h
-    check_header sys/mman.h
     check_header unistd.h # for sysconf(3) and friends.
 
     check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
@@ -598,7 +594,11 @@ process_toolchain() {
           ;;
           *) check_add_cflags -Wunused-but-set-variable ;;
         esac
-        enabled extra_warnings || check_add_cflags -Wno-unused-function
+        if enabled mips || [ -z "${INLINE}" ]; then
+          enabled extra_warnings || check_add_cflags -Wno-unused-function
+        else
+          check_add_cflags -Wunused-function
+        fi
     fi
 
     if enabled icc; then
diff --git a/examples.mk b/examples.mk
index f10bec6..c891a54 100644
--- a/examples.mk
+++ b/examples.mk
@@ -36,21 +36,30 @@ LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
                 third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_win.cc \
 
-LIBWEBM_COMMON_SRCS += third_party/libwebm/webmids.hpp
+LIBWEBM_COMMON_SRCS += third_party/libwebm/common/hdr_util.cc \
+                       third_party/libwebm/common/hdr_util.h \
+                       third_party/libwebm/common/webmids.h
 
-LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
-                      third_party/libwebm/mkvmuxerutil.cpp \
-                      third_party/libwebm/mkvwriter.cpp \
-                      third_party/libwebm/mkvmuxer.hpp \
-                      third_party/libwebm/mkvmuxertypes.hpp \
-                      third_party/libwebm/mkvmuxerutil.hpp \
-                      third_party/libwebm/mkvparser.hpp \
-                      third_party/libwebm/mkvwriter.hpp
+LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer/mkvmuxer.cc \
+                      third_party/libwebm/mkvmuxer/mkvmuxerutil.cc \
+                      third_party/libwebm/mkvmuxer/mkvwriter.cc \
+                      third_party/libwebm/mkvmuxer/mkvmuxer.h \
+                      third_party/libwebm/mkvmuxer/mkvmuxertypes.h \
+                      third_party/libwebm/mkvmuxer/mkvmuxerutil.h \
+                      third_party/libwebm/mkvparser/mkvparser.h \
+                      third_party/libwebm/mkvmuxer/mkvwriter.h
+
+LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
+                      third_party/libwebm/mkvparser/mkvreader.cc \
+                      third_party/libwebm/mkvparser/mkvparser.h \
+                      third_party/libwebm/mkvparser/mkvreader.h
+
+# Add compile flags and include path for libwebm sources.
+ifeq ($(CONFIG_WEBM_IO),yes)
+  CXXFLAGS     += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
+endif
 
-LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser.cpp \
-                      third_party/libwebm/mkvreader.cpp \
-                      third_party/libwebm/mkvparser.hpp \
-                      third_party/libwebm/mkvreader.hpp
 
 # List of examples to build. UTILS are tools meant for distribution
 # while EXAMPLES demonstrate specific portions of the API.
@@ -70,6 +79,7 @@ ifeq ($(CONFIG_LIBYUV),yes)
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
   vpxdec.SRCS                 += $(LIBWEBM_COMMON_SRCS)
+  vpxdec.SRCS                 += $(LIBWEBM_MUXER_SRCS)
   vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS                 += webmdec.cc webmdec.h
 endif
@@ -93,6 +103,7 @@ endif
 ifeq ($(CONFIG_WEBM_IO),yes)
   vpxenc.SRCS                 += $(LIBWEBM_COMMON_SRCS)
   vpxenc.SRCS                 += $(LIBWEBM_MUXER_SRCS)
+  vpxenc.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxenc.SRCS                 += webmenc.cc webmenc.h
 endif
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index a307729..64f0a01 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -109,8 +109,8 @@ static const char *exec_name;
 void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <codec> <width> <height> <infile> <outfile> "
-              "<keyframe-interval> [<error-resilient>]\nSee comments in "
-              "simple_encoder.c for more information.\n",
+              "<keyframe-interval> <error-resilient> <frames to encode>\n"
+              "See comments in simple_encoder.c for more information.\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -147,6 +147,7 @@ static int encode_frame(vpx_codec_ctx_t *codec,
   return got_pkts;
 }
 
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
 int main(int argc, char **argv) {
   FILE *infile = NULL;
   vpx_codec_ctx_t codec;
@@ -157,12 +158,11 @@ int main(int argc, char **argv) {
   VpxVideoInfo info = {0};
   VpxVideoWriter *writer = NULL;
   const VpxInterface *encoder = NULL;
-  const int fps = 30;        // TODO(dkovalev) add command line argument
-  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  const int fps = 30;
+  const int bitrate = 200;
   int keyframe_interval = 0;
-
-  // TODO(dkovalev): Add some simple command line parsing code to make the
-  // command line more flexible.
+  int max_frames = 0;
+  int frames_encoded = 0;
   const char *codec_arg = NULL;
   const char *width_arg = NULL;
   const char *height_arg = NULL;
@@ -172,7 +172,7 @@ int main(int argc, char **argv) {
 
   exec_name = argv[0];
 
-  if (argc < 7)
+  if (argc != 9)
     die("Invalid number of arguments");
 
   codec_arg = argv[1];
@@ -181,6 +181,7 @@ int main(int argc, char **argv) {
   infile_arg = argv[4];
   outfile_arg = argv[5];
   keyframe_interval_arg = argv[6];
+  max_frames = strtol(argv[8], NULL, 0);
 
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
@@ -219,7 +220,7 @@ int main(int argc, char **argv) {
   cfg.g_timebase.num = info.time_base.numerator;
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+  cfg.g_error_resilient = strtol(argv[7], NULL, 0);
 
   writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer)
@@ -237,6 +238,9 @@ int main(int argc, char **argv) {
     if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
       flags |= VPX_EFLAG_FORCE_KF;
     encode_frame(&codec, &raw, frame_count++, flags, writer);
+    frames_encoded++;
+    if (max_frames > 0 && frames_encoded >= max_frames)
+      break;
   }
 
   // Flush encoder.
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index aecc11d..15a6617 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -59,7 +59,9 @@
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "<frame limit>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -129,7 +131,8 @@ static int encode_frame(vpx_codec_ctx_t *ctx,
 static vpx_fixed_buf_t pass0(vpx_image_t *raw,
                              FILE *infile,
                              const VpxInterface *encoder,
-                             const vpx_codec_enc_cfg_t *cfg) {
+                             const vpx_codec_enc_cfg_t *cfg,
+                             int max_frames) {
   vpx_codec_ctx_t codec;
   int frame_count = 0;
   vpx_fixed_buf_t stats = {NULL, 0};
@@ -142,6 +145,8 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw,
     ++frame_count;
     get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
                     &stats);
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -159,7 +164,8 @@ static void pass1(vpx_image_t *raw,
                   FILE *infile,
                   const char *outfile_name,
                   const VpxInterface *encoder,
-                  const vpx_codec_enc_cfg_t *cfg) {
+                  const vpx_codec_enc_cfg_t *cfg,
+                  int max_frames) {
   VpxVideoInfo info = {
     encoder->fourcc,
     cfg->g_w,
@@ -181,6 +187,9 @@ static void pass1(vpx_image_t *raw,
   while (vpx_img_read(raw, infile)) {
     ++frame_count;
     encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -213,11 +222,14 @@ int main(int argc, char **argv) {
   const char *const height_arg = argv[3];
   const char *const infile_arg = argv[4];
   const char *const outfile_arg = argv[5];
+  int max_frames = 0;
   exec_name = argv[0];
 
-  if (argc != 6)
+  if (argc != 7)
     die("Invalid number of arguments.");
 
+  max_frames = strtol(argv[6], NULL, 0);
+
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
     die("Unsupported codec.");
@@ -249,13 +261,13 @@ int main(int argc, char **argv) {
 
   // Pass 0
   cfg.g_pass = VPX_RC_FIRST_PASS;
-  stats = pass0(&raw, infile, encoder, &cfg);
+  stats = pass0(&raw, infile, encoder, &cfg, max_frames);
 
   // Pass 1
   rewind(infile);
   cfg.g_pass = VPX_RC_LAST_PASS;
   cfg.rc_twopass_stats_in = stats;
-  pass1(&raw, infile, outfile_arg, encoder, &cfg);
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames);
   free(stats.buf);
 
   vpx_img_free(&raw);
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 2b03204..fc775ef 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -29,13 +29,6 @@
 #include <math.h>
 #include <assert.h>
 #include <sys/time.h>
-#if USE_POSIX_MMAP
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <unistd.h>
-#endif
 #include "vpx_ports/vpx_timer.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
@@ -354,8 +347,7 @@ int main(int argc, char **argv)
     double               psnr_totals[NUM_ENCODERS][4] = {{0,0}};
     int                  psnr_count[NUM_ENCODERS] = {0};
 
-    double               cx_time = 0;
-    struct  timeval      tv1, tv2, difftv;
+    int64_t              cx_time = 0;
 
     /* Set the required target bitrates for each resolution level.
      * If target bitrate for highest-resolution level is set to 0,
@@ -589,6 +581,7 @@ int main(int argc, char **argv)
 
     while(frame_avail || got_data)
     {
+        struct vpx_usec_timer timer;
         vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
         const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
 
@@ -643,18 +636,18 @@ int main(int argc, char **argv)
             vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
         }
 
-        gettimeofday(&tv1, NULL);
         /* Encode each frame at multi-levels */
         /* Note the flags must be set to 0 in the encode call if they are set
            for each frame with the vpx_codec_control(), as done above. */
+        vpx_usec_timer_start(&timer);
         if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
             frame_cnt, 1, 0, arg_deadline))
         {
             die_codec(&codec[0], "Failed to encode frame");
         }
-        gettimeofday(&tv2, NULL);
-        timersub(&tv2, &tv1, &difftv);
-        cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+        vpx_usec_timer_mark(&timer);
+        cx_time += vpx_usec_timer_elapsed(&timer);
+
         for (i=NUM_ENCODERS-1; i>=0 ; i--)
         {
             got_data = 0;
@@ -693,8 +686,10 @@ int main(int argc, char **argv)
         frame_cnt++;
     }
     printf("\n");
-    printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
-           1000000 * (double)frame_cnt / (double)cx_time);
+    printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+            frame_cnt,
+            1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+            1000000 * (double)frame_cnt / (double)cx_time);
 
     fclose(infile);
 
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index b26e987..271ab70 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -30,6 +30,7 @@
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 #include "../vpxstats.h"
+#include "vp9/encoder/vp9_encoder.h"
 #define OUTPUT_RC_STATS 1
 
 static const arg_def_t skip_frames_arg =
@@ -408,7 +409,10 @@ static void set_rate_control_stats(struct RateControlStats *rc,
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
       const int tlayer0 = sl * cfg->ts_number_layers;
-      rc->layer_framerate[layer] =
+      if (cfg->ts_number_layers == 1)
+        rc->layer_framerate[layer] = framerate;
+      else
+        rc->layer_framerate[layer] =
           framerate / cfg->ts_rate_decimator[tl];
       if (tl > 0) {
         rc->layer_pfb[layer] = 1000.0 *
@@ -714,6 +718,7 @@ int main(int argc, const char **argv) {
     // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS"
     // mode to "VP9E_LAYERING_MODE_BYPASS".
     if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      layer_id.spatial_layer_id = 0;
       // Example for 2 temporal layers.
       if (frame_cnt % 2 == 0)
         layer_id.temporal_layer_id = 0;
@@ -729,6 +734,12 @@ int main(int argc, const char **argv) {
                                   &ref_frame_config);
       vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG,
                         &ref_frame_config);
+      // Keep track of input frames, to account for frame drops in rate control
+      // stats/metrics.
+      for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+        ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                layer_id.temporal_layer_id];
+      }
     }
 
     vpx_usec_timer_start(&timer);
@@ -739,6 +750,7 @@ int main(int argc, const char **argv) {
     cx_time += vpx_usec_timer_elapsed(&timer);
 
     printf("%s", vpx_svc_get_message(&svc_ctx));
+    fflush(stdout);
     if (res != VPX_CODEC_OK) {
       die_codec(&codec, "Failed to encode frame");
     }
@@ -746,6 +758,7 @@ int main(int argc, const char **argv) {
     while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
       switch (cx_pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT: {
+          SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal;
           if (cx_pkt->data.frame.sz > 0) {
 #if OUTPUT_RC_STATS
             uint32_t sizes[8];
@@ -761,9 +774,16 @@ int main(int argc, const char **argv) {
               vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id);
               parse_superframe_index(cx_pkt->data.frame.buf,
                                      cx_pkt->data.frame.sz, sizes, &count);
-              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
-                ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
-                                        layer_id.temporal_layer_id];
+              // Note computing input_layer_frames here won't account for frame
+              // drops in rate control stats.
+              // TODO(marpan): Fix this for non-bypass mode so we can get stats
+              // for dropped frames.
+              if (svc_ctx.temporal_layering_mode !=
+                  VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+                for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+                  ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
+                                         layer_id.temporal_layer_id];
+                }
               }
               for (tl = layer_id.temporal_layer_id;
                   tl < enc_cfg.ts_number_layers; ++tl) {
@@ -834,6 +854,8 @@ int main(int argc, const char **argv) {
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
+            si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
           ++frames_received;
           break;
         }
@@ -854,6 +876,16 @@ int main(int argc, const char **argv) {
       pts += frame_duration;
     }
   }
+
+  // Compensate for the extra frame count for the bypass mode.
+  if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+      const int layer = sl * enc_cfg.ts_number_layers +
+          layer_id.temporal_layer_id;
+      --rc.layer_input_frames[layer];
+    }
+  }
+
   printf("Processed %d frames\n", frame_cnt);
   fclose(infile);
 #if OUTPUT_RC_STATS
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 5adda9e..e6c09fb 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -41,7 +41,7 @@ enum denoiserState {
   kDenoiserOnAdaptive
 };
 
-static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
+static int mode_to_num_layers[13] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3};
 
 // For rate control encoding stats.
 struct RateControlMetrics {
@@ -432,7 +432,32 @@ static void set_temporal_layer_pattern(int layering_mode,
       layer_flags[7] = layer_flags[3];
       break;
     }
-    case 11:
+    case 11: {
+      // 3-layers structure with one reference frame.
+      // This works same as temporal_layering_mode 3.
+      // This was added to compare with vp9_spatial_svc_encoder.
+
+      // 3-layers, 4-frame period.
+      int ids[4] = {0, 2, 1, 2};
+      cfg->ts_periodicity = 4;
+      *flag_periodicity = 4;
+      cfg->ts_number_layers = 3;
+      cfg->ts_rate_decimator[0] = 4;
+      cfg->ts_rate_decimator[1] = 2;
+      cfg->ts_rate_decimator[2] = 1;
+      memcpy(cfg->ts_layer_id, ids, sizeof(ids));
+      // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled.
+      layer_flags[0] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      layer_flags[2] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
+      layer_flags[1] = VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      layer_flags[3] = VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF |
+          VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF;
+      break;
+    }
+    case 12:
     default: {
       // 3-layers structure as in case 10, but no sync/refresh points for
       // layer 1 and 2.
@@ -530,7 +555,7 @@ int main(int argc, char **argv) {
   }
 
   layering_mode = strtol(argv[10], NULL, 0);
-  if (layering_mode < 0 || layering_mode > 12) {
+  if (layering_mode < 0 || layering_mode > 13) {
     die("Invalid layering mode (0..12) %s", argv[10]);
   }
 
@@ -690,7 +715,7 @@ int main(int argc, char **argv) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
     vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
-    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
+    vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff);
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
diff --git a/ivfdec.c b/ivfdec.c
index 6dcd66f..7fc25a0 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -23,7 +23,7 @@ static void fix_framerate(int *num, int *den) {
   // we can guess the framerate using only the timebase in this
   // case. Other files would require reading ahead to guess the
   // timebase, like we do for webm.
-  if (*num < 1000) {
+  if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) {
     // Correct for the factor of 2 applied to the timebase in the encoder.
     if (*num & 1)
       *den *= 2;
diff --git a/libs.mk b/libs.mk
index f28d84a..9a6092a 100644
--- a/libs.mk
+++ b/libs.mk
@@ -109,40 +109,6 @@ endif
 VP9_PREFIX=vp9/
 $(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
 
-#  VP10 make file
-ifeq ($(CONFIG_VP10),yes)
-  VP10_PREFIX=vp10/
-  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10_common.mk
-endif
-
-ifeq ($(CONFIG_VP10_ENCODER),yes)
-  VP10_PREFIX=vp10/
-  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10cx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_CX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_CX_EXPORTS))
-  CODEC_SRCS-yes += $(VP10_PREFIX)vp10cx.mk vpx/vp8.h vpx/vp8cx.h
-  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
-  INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
-  CODEC_DOC_SECTIONS += vp9 vp9_encoder
-endif
-
-ifeq ($(CONFIG_VP10_DECODER),yes)
-  VP10_PREFIX=vp10/
-  include $(SRC_PATH_BARE)/$(VP10_PREFIX)vp10dx.mk
-  CODEC_SRCS-yes += $(addprefix $(VP10_PREFIX),$(call enabled,VP10_DX_SRCS))
-  CODEC_EXPORTS-yes += $(addprefix $(VP10_PREFIX),$(VP10_DX_EXPORTS))
-  CODEC_SRCS-yes += $(VP10_PREFIX)vp10dx.mk vpx/vp8.h vpx/vp8dx.h
-  INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
-  INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP10_PREFIX)/%
-  CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
-  CODEC_DOC_SECTIONS += vp9 vp9_decoder
-endif
-
-VP10_PREFIX=vp10/
-$(BUILD_PFX)$(VP10_PREFIX)%.c.o: CFLAGS += -Wextra
-
 ifeq ($(CONFIG_ENCODERS),yes)
   CODEC_DOC_SECTIONS += encoder
 endif
@@ -183,6 +149,9 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
+ifeq ($(CONFIG_SPATIAL_SVC),yes)
+CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc
+endif
 CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
 
 INSTALL-LIBS-yes += include/vpx/vpx_codec.h
@@ -260,7 +229,7 @@ OBJS-yes += $(LIBVPX_OBJS)
 LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
 $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 
-SO_VERSION_MAJOR := 3
+SO_VERSION_MAJOR := 4
 SO_VERSION_MINOR := 0
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
@@ -270,6 +239,12 @@ EXPORT_FILE             := libvpx.syms
 LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                              libvpx.dylib  )
 else
+ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS))
+LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF          := .dylib
+EXPORT_FILE             := libvpx.syms
+LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, libvpx.dylib)
+else
 ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx$(SO_VERSION_MAJOR).dll
 SHARED_LIB_SUF          := _dll.a
@@ -285,6 +260,7 @@ LIBVPX_SO_SYMLINKS      := $(addprefix $(LIBSUBDIR)/, \
                              libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
 endif
 endif
+endif
 
 LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBVPX_SO)\
                            $(notdir $(LIBVPX_SO_SYMLINKS)) \
@@ -394,6 +370,12 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
 
+#
+# Add include path for libwebm sources.
+#
+ifeq ($(CONFIG_WEBM_IO),yes)
+  CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/libwebm
+endif
 
 ##
 ## libvpx test directives
@@ -429,12 +411,10 @@ testdata:: $(LIBVPX_TEST_DATA)
           if [ -n "$${sha1sum}" ]; then\
             set -e;\
             echo "Checking test data:";\
-            if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
-                for f in $(call enabled,LIBVPX_TEST_DATA); do\
-                    grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-                        (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
-                done; \
-            fi; \
+            for f in $(call enabled,LIBVPX_TEST_DATA); do\
+                grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+                    (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+            done; \
         else\
             echo "Skipping test data integrity check, sha1sum not found.";\
         fi
@@ -471,6 +451,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
             --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
+            $(if $(CONFIG_WEBM_IO),-I"$(SRC_PATH_BARE)/third_party/libwebm") \
             -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
 
 PROJECTS-$(CONFIG_MSVS) += test_libvpx.$(VCPROJ_SFX)
diff --git a/md5_utils.c b/md5_utils.c
index f4f893a..a9b979a 100644
--- a/md5_utils.c
+++ b/md5_utils.c
@@ -150,12 +150,23 @@ MD5Final(md5byte digest[16], struct MD5Context *ctx) {
 #define MD5STEP(f,w,x,y,z,in,s) \
   (w += f(x,y,z) + in, w = (w<<s | w>>(32-s)) + x)
 
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
+  __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
 /*
  * The core of the MD5 algorithm, this alters an existing MD5 hash to
  * reflect the addition of 16 longwords of new data.  MD5Update blocks
  * the data and converts bytes into longwords for this routine.
  */
-void
+VPX_NO_UNSIGNED_OVERFLOW_CHECK void
 MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
   register UWORD32 a, b, c, d;
 
@@ -238,4 +249,6 @@ MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) {
   buf[3] += d;
 }
 
+#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK
+
 #endif
diff --git a/test/acm_random.h b/test/acm_random.h
index ff5c93e..b94b6e1 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -32,6 +32,12 @@ class ACMRandom {
     return (value >> 15) & 0xffff;
   }
 
+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
   uint8_t Rand8(void) {
     const uint32_t value =
         random_.Generate(testing::internal::Random::kMaxRange);
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 0221995..dc3de72 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -85,5 +85,5 @@ TEST_P(ActiveMapTest, Test) {
 
 VP9_INSTANTIATE_TEST_CASE(ActiveMapTest,
                           ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(0, 6));
+                          ::testing::Range(0, 9));
 }  // namespace
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
new file mode 100644
index 0000000..e9945c4
--- /dev/null
+++ b/test/add_noise_test.cc
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <math.h>
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+
+namespace {
+
+// TODO(jimbankoski): make width and height integers not unsigned.
+typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], unsigned int width,
+                             unsigned int height, int pitch);
+
+class AddNoiseTest
+    : public ::testing::TestWithParam<AddNoiseFunc> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+  virtual ~AddNoiseTest() {}
+};
+
+double stddev6(char a, char b, char c, char d, char e, char f) {
+  const double n = (a + b + c + d + e + f) / 6.0;
+  const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) +
+                    (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) /
+                   6.0;
+  return sqrt(v);
+}
+
+// TODO(jimbankoski): The following 2 functions are duplicated in each codec.
+// For now the vp9 one has been copied into the test as is. We should normalize
+// these in vpx_dsp and not have 3 copies of these unless there is different
+// noise we add for each codec.
+
+double gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int setup_noise(int size_noise, char *noise) {
+  char char_dist[300];
+  const int ai = 4;
+  const int qi = 24;
+  const double sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a gaussian distribution with sigma determined by q.
+   */
+  int next = 0;
+
+  for (int i = -32; i < 32; i++) {
+    int a_i = (int) (0.5 + 256 * gaussian(sigma, 0, i));
+
+    if (a_i) {
+      for (int j = 0; j < a_i; j++) {
+        char_dist[next + j] = (char)(i);
+      }
+
+      next = next + a_i;
+    }
+  }
+
+  for (; next < 256; next++)
+    char_dist[next] = 0;
+
+  for (int i = 0; i < size_noise; i++) {
+    noise[i] = char_dist[rand() & 0xff];  // NOLINT
+  }
+
+  // Returns the most negative value in distribution.
+  return char_dist[0];
+}
+
+TEST_P(AddNoiseTest, CheckNoiseAdded) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  memset(s, 99, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure we don't end up having either the same or no added
+  // noise either vertically or horizontally.
+  for (int i = 0; i < image_size - 6 * width - 6; ++i) {
+    const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
+                              s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);
+    const double vd = stddev6(s[i] - 99, s[i + width] - 99,
+                              s[i + 2 * width] - 99, s[i + 3 * width] - 99,
+                              s[i + 4 * width] - 99, s[i + 5 * width] - 99);
+
+    EXPECT_NE(hd, 0);
+    EXPECT_NE(vd, 0);
+  }
+
+  // Initialize pixels in the image to 255 and check for roll over.
+  memset(s, 255, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll over.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_GT((int)s[i], 10) << "i = " << i;
+  }
+
+  // Initialize pixels in the image to 0 and check for roll under.
+  memset(s, 0, image_size);
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+
+  // Check to make sure don't roll under.
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_LT((int)s[i], 245) << "i = " << i;
+  }
+
+  vpx_free(s);
+}
+
+TEST_P(AddNoiseTest, CheckCvsAssembly) {
+  DECLARE_ALIGNED(16, char, blackclamp[16]);
+  DECLARE_ALIGNED(16, char, whiteclamp[16]);
+  DECLARE_ALIGNED(16, char, bothclamp[16]);
+  const int width  = 64;
+  const int height = 64;
+  const int image_size = width * height;
+  char noise[3072];
+
+  const int clamp = setup_noise(3072, noise);
+  for (int i = 0; i < 16; i++) {
+    blackclamp[i] = -clamp;
+    whiteclamp[i] = -clamp;
+    bothclamp[i] = -2 * clamp;
+  }
+
+  uint8_t *const s = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+  uint8_t *const d = reinterpret_cast<uint8_t *>(vpx_calloc(image_size, 1));
+
+  memset(s, 99, image_size);
+  memset(d, 99, image_size);
+
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+                                      bothclamp, width, height, width));
+  srand(0);
+  ASM_REGISTER_STATE_CHECK(vpx_plane_add_noise_c(d, noise, blackclamp,
+                                                 whiteclamp, bothclamp,
+                                                 width, height, width));
+
+  for (int i = 0; i < image_size; ++i) {
+    EXPECT_EQ((int)s[i], (int)d[i]) << "i = " << i;
+  }
+
+  vpx_free(d);
+  vpx_free(s);
+}
+
+INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_sse2));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
+                        ::testing::Values(vpx_plane_add_noise_msa));
+#endif
+}  // namespace
diff --git a/test/altref_test.cc b/test/altref_test.cc
index af25b72..d9f83d8 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -14,6 +14,8 @@
 #include "test/util.h"
 namespace {
 
+#if CONFIG_VP8_ENCODER
+
 // lookahead range: [kLookAheadMin, kLookAheadMax).
 const int kLookAheadMin = 5;
 const int kLookAheadMax = 26;
@@ -63,7 +65,95 @@ TEST_P(AltRefTest, MonotonicTimestamps) {
   EXPECT_GE(altref_count(), 1);
 }
 
-
 VP8_INSTANTIATE_TEST_CASE(AltRefTest,
                           ::testing::Range(kLookAheadMin, kLookAheadMax));
+
+#endif  // CONFIG_VP8_ENCODER
+
+class AltRefForcedKeyTestLarge
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  AltRefForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)),
+        forced_kf_frame_num_(1),
+        frame_num_(0) {}
+  virtual ~AltRefForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.g_threads = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      // override test default for tile columns if necessary.
+#if CONFIG_VP9_ENCODER
+      if (GET_PARAM(0) == &libvpx_test::kVP9) {
+        encoder->Control(VP9E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (frame_num_ == forced_kf_frame_num_) {
+      ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
+          << "Frame #" << frame_num_ << " isn't a keyframe!";
+    }
+    ++frame_num_;
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  unsigned int forced_kf_frame_num_;
+  unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+  const vpx_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+  const vpx_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+VP8_INSTANTIATE_TEST_CASE(
+    AltRefForcedKeyTestLarge,
+    ::testing::Values(::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 9));
+
+VP9_INSTANTIATE_TEST_CASE(
+    AltRefForcedKeyTestLarge,
+    ::testing::Values(::libvpx_test::kOnePassGood),
+    ::testing::Range(0, 9));
 }  // namespace
diff --git a/test/vp9_avg_test.cc b/test/avg_test.cc
similarity index 61%
rename from test/vp9_avg_test.cc
rename to test/avg_test.cc
index d383131..44d8dd7 100644
--- a/test/vp9_avg_test.cc
+++ b/test/avg_test.cc
@@ -15,9 +15,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
+#include "./vpx_dsp_rtcd.h"
 
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
@@ -57,19 +55,19 @@ class AverageTestBase : public ::testing::Test {
   }
 
   // Sum Pixels
-  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 8; ++h)
       for (int w = 0; w < 8; ++w)
-        average += source[h * source_stride_ + w];
+        average += source[h * pitch + w];
     return ((average + 32) >> 6);
   }
 
-  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 4; ++h)
       for (int w = 0; w < 4; ++w)
-        average += source[h * source_stride_ + w];
+        average += source[h * pitch + w];
     return ((average + 8) >> 4);
   }
 
@@ -194,6 +192,48 @@ class IntProColTest
   int16_t sum_c_;
 };
 
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(16, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(src_);
+  }
+
+  void FillConstant(const int16_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+  }
+
+  void Check(const int expected) {
+    int total;
+    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  int satd_size_;
+
+ private:
+  int16_t *src_;
+  SatdFunc satd_func_;
+  ACMRandom rnd_;
+};
 
 uint8_t* AverageTestBase::source_data_ = NULL;
 
@@ -246,69 +286,126 @@ TEST_P(IntProColTest, Random) {
   RunComparison();
 }
 
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
     C, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
-        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+        make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    C, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_c),
+        make_tuple(64, &vpx_satd_c),
+        make_tuple(256, &vpx_satd_c),
+        make_tuple(1024, &vpx_satd_c)));
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
+        make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+        make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_sse2),
+        make_tuple(64, &vpx_satd_sse2),
+        make_tuple(256, &vpx_satd_sse2),
+        make_tuple(1024, &vpx_satd_sse2)));
 #endif
 
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
     NEON, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
+        make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));
 
 INSTANTIATE_TEST_CASE_P(
     NEON, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+        make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_neon),
+        make_tuple(64, &vpx_satd_neon),
+        make_tuple(256, &vpx_satd_neon),
+        make_tuple(1024, &vpx_satd_neon)));
 #endif
 
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
     MSA, AverageTest,
     ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
 #endif
 
 }  // namespace
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 6592375..bd3ac39 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -52,7 +52,7 @@ TEST_P(BordersTest, TestEncodeHighBitrate) {
   // extend into the border and test the border condition.
   cfg_.g_lag_in_frames = 25;
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 2000;
   cfg_.rc_max_quantizer = 10;
 
@@ -80,7 +80,4 @@ TEST_P(BordersTest, TestLowBitrate) {
 
 VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
     ::libvpx_test::kTwoPassGood));
-
-VP10_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
-    ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc
index aa4b78b..3a808b0 100644
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@@ -21,14 +21,14 @@
 
 namespace {
 
+#if CONFIG_WEBM_IO
+
 const int kLegacyByteAlignment = 0;
 const int kLegacyYPlaneByteAlignment = 32;
 const int kNumPlanesToCheck = 3;
 const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
 const char kVP9Md5File[] = "vp90-2-02-size-lf-1920x1080.webm.md5";
 
-#if CONFIG_WEBM_IO
-
 struct ByteAlignmentTestParam {
   int byte_alignment;
   vpx_codec_err_t expected_value;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 09c9cf9..429d40d 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -13,10 +13,10 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vpx_encoder.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -233,8 +233,6 @@ class VP9CodecFactory : public CodecFactory {
                                                int usage) const {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
-#elif CONFIG_VP10_ENCODER
-    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
 #else
     return VPX_CODEC_INCAPABLE;
 #endif
@@ -253,96 +251,5 @@ const libvpx_test::VP9CodecFactory kVP9;
 #define VP9_INSTANTIATE_TEST_CASE(test, ...)
 #endif  // CONFIG_VP9
 
-/*
- * VP10 Codec Definitions
- */
-#if CONFIG_VP10
-class VP10Decoder : public Decoder {
- public:
-  VP10Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
-      : Decoder(cfg, deadline) {}
-
-  VP10Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
-              unsigned long deadline)  // NOLINT
-      : Decoder(cfg, flag, deadline) {}
-
- protected:
-  virtual vpx_codec_iface_t* CodecInterface() const {
-#if CONFIG_VP10_DECODER
-    return &vpx_codec_vp10_dx_algo;
-#else
-    return NULL;
-#endif
-  }
-};
-
-class VP10Encoder : public Encoder {
- public:
-  VP10Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
-              const unsigned long init_flags, TwopassStatsStore *stats)
-      : Encoder(cfg, deadline, init_flags, stats) {}
-
- protected:
-  virtual vpx_codec_iface_t* CodecInterface() const {
-#if CONFIG_VP10_ENCODER
-    return &vpx_codec_vp10_cx_algo;
-#else
-    return NULL;
-#endif
-  }
-};
-
-class VP10CodecFactory : public CodecFactory {
- public:
-  VP10CodecFactory() : CodecFactory() {}
-
-  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 unsigned long deadline) const {
-    return CreateDecoder(cfg, 0, deadline);
-  }
-
-  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags,
-                                 unsigned long deadline) const {  // NOLINT
-#if CONFIG_VP10_DECODER
-    return new VP10Decoder(cfg, flags, deadline);
-#else
-    return NULL;
-#endif
-  }
-
-  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
-#if CONFIG_VP10_ENCODER
-    return new VP10Encoder(cfg, deadline, init_flags, stats);
-#else
-    return NULL;
-#endif
-  }
-
-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
-#if CONFIG_VP10_ENCODER
-    return vpx_codec_enc_config_default(&vpx_codec_vp10_cx_algo, cfg, usage);
-#else
-    return VPX_CODEC_INCAPABLE;
-#endif
-  }
-};
-
-const libvpx_test::VP10CodecFactory kVP10;
-
-#define VP10_INSTANTIATE_TEST_CASE(test, ...)\
-  INSTANTIATE_TEST_CASE_P(VP10, test, \
-      ::testing::Combine( \
-          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \
-               &libvpx_test::kVP10)), \
-          __VA_ARGS__))
-#else
-#define VP10_INSTANTIATE_TEST_CASE(test, ...)
-#endif  // CONFIG_VP10
-
 }  // namespace libvpx_test
 #endif  // TEST_CODEC_FACTORY_H_
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 0826788..73b0edb 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -69,6 +69,21 @@ struct ConvolveFunctions {
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
+#define ALL_SIZES(convolve_fn) \
+    make_tuple(4, 4, &convolve_fn),     \
+    make_tuple(8, 4, &convolve_fn),     \
+    make_tuple(4, 8, &convolve_fn),     \
+    make_tuple(8, 8, &convolve_fn),     \
+    make_tuple(16, 8, &convolve_fn),    \
+    make_tuple(8, 16, &convolve_fn),    \
+    make_tuple(16, 16, &convolve_fn),   \
+    make_tuple(32, 16, &convolve_fn),   \
+    make_tuple(16, 32, &convolve_fn),   \
+    make_tuple(32, 32, &convolve_fn),   \
+    make_tuple(64, 32, &convolve_fn),   \
+    make_tuple(32, 64, &convolve_fn),   \
+    make_tuple(64, 64, &convolve_fn)
+
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
@@ -103,7 +118,8 @@ void filter_block2d_8_c(const uint8_t *src_ptr,
   // and filter_max_width          = 16
   //
   uint8_t intermediate_buffer[71 * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
 
   // Horizontal pass (src -> transposed intermediate).
   uint8_t *output_ptr = intermediate_buffer;
@@ -215,7 +231,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
    * and filter_max_width = 16
    */
   uint16_t intermediate_buffer[71 * kMaxDimension];
-  const int intermediate_next_stride = 1 - intermediate_height * output_width;
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
 
   // Horizontal pass (src -> transposed intermediate).
   {
@@ -279,8 +296,7 @@ void highbd_block2d_average_c(uint16_t *src,
                               uint16_t *output_ptr,
                               unsigned int output_stride,
                               unsigned int output_width,
-                              unsigned int output_height,
-                              int bd) {
+                              unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -306,7 +322,7 @@ void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
   highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
                             output_width, output_height, bd);
   highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
-                           output_width, output_height, bd);
+                           output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -1035,20 +1051,6 @@ const ConvolveFunctions convolve8_c(
     wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
     wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
     wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
-INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_c),
-    make_tuple(8, 4, &convolve8_c),
-    make_tuple(4, 8, &convolve8_c),
-    make_tuple(8, 8, &convolve8_c),
-    make_tuple(16, 8, &convolve8_c),
-    make_tuple(8, 16, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c),
-    make_tuple(32, 16, &convolve8_c),
-    make_tuple(16, 32, &convolve8_c),
-    make_tuple(32, 32, &convolve8_c),
-    make_tuple(64, 32, &convolve8_c),
-    make_tuple(32, 64, &convolve8_c),
-    make_tuple(64, 64, &convolve8_c)));
 const ConvolveFunctions convolve10_c(
     wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
     wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
@@ -1057,20 +1059,6 @@ const ConvolveFunctions convolve10_c(
     wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
     wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
     wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
-INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve10_c),
-    make_tuple(8, 4, &convolve10_c),
-    make_tuple(4, 8, &convolve10_c),
-    make_tuple(8, 8, &convolve10_c),
-    make_tuple(16, 8, &convolve10_c),
-    make_tuple(8, 16, &convolve10_c),
-    make_tuple(16, 16, &convolve10_c),
-    make_tuple(32, 16, &convolve10_c),
-    make_tuple(16, 32, &convolve10_c),
-    make_tuple(32, 32, &convolve10_c),
-    make_tuple(64, 32, &convolve10_c),
-    make_tuple(32, 64, &convolve10_c),
-    make_tuple(64, 64, &convolve10_c)));
 const ConvolveFunctions convolve12_c(
     wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
@@ -1079,23 +1067,13 @@ const ConvolveFunctions convolve12_c(
     wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
     wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
-INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve12_c),
-    make_tuple(8, 4, &convolve12_c),
-    make_tuple(4, 8, &convolve12_c),
-    make_tuple(8, 8, &convolve12_c),
-    make_tuple(16, 8, &convolve12_c),
-    make_tuple(8, 16, &convolve12_c),
-    make_tuple(16, 16, &convolve12_c),
-    make_tuple(32, 16, &convolve12_c),
-    make_tuple(16, 32, &convolve12_c),
-    make_tuple(32, 32, &convolve12_c),
-    make_tuple(64, 32, &convolve12_c),
-    make_tuple(32, 64, &convolve12_c),
-    make_tuple(64, 64, &convolve12_c)));
+const ConvolveParam kArrayConvolve_c[] = {
+    ALL_SIZES(convolve8_c),
+    ALL_SIZES(convolve10_c),
+    ALL_SIZES(convolve12_c)
+};
 
 #else
-
 const ConvolveFunctions convolve8_c(
     vpx_convolve_copy_c, vpx_convolve_avg_c,
     vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
@@ -1104,22 +1082,10 @@ const ConvolveFunctions convolve8_c(
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
-
-INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_c),
-    make_tuple(8, 4, &convolve8_c),
-    make_tuple(4, 8, &convolve8_c),
-    make_tuple(8, 8, &convolve8_c),
-    make_tuple(16, 8, &convolve8_c),
-    make_tuple(8, 16, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c),
-    make_tuple(32, 16, &convolve8_c),
-    make_tuple(16, 32, &convolve8_c),
-    make_tuple(32, 32, &convolve8_c),
-    make_tuple(64, 32, &convolve8_c),
-    make_tuple(32, 64, &convolve8_c),
-    make_tuple(64, 64, &convolve8_c)));
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
 #endif
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_c));
 
 #if HAVE_SSE2 && ARCH_X86_64
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1159,46 +1125,11 @@ const ConvolveFunctions convolve12_sse2(
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_sse2),
-    make_tuple(8, 4, &convolve8_sse2),
-    make_tuple(4, 8, &convolve8_sse2),
-    make_tuple(8, 8, &convolve8_sse2),
-    make_tuple(16, 8, &convolve8_sse2),
-    make_tuple(8, 16, &convolve8_sse2),
-    make_tuple(16, 16, &convolve8_sse2),
-    make_tuple(32, 16, &convolve8_sse2),
-    make_tuple(16, 32, &convolve8_sse2),
-    make_tuple(32, 32, &convolve8_sse2),
-    make_tuple(64, 32, &convolve8_sse2),
-    make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2),
-    make_tuple(4, 4, &convolve10_sse2),
-    make_tuple(8, 4, &convolve10_sse2),
-    make_tuple(4, 8, &convolve10_sse2),
-    make_tuple(8, 8, &convolve10_sse2),
-    make_tuple(16, 8, &convolve10_sse2),
-    make_tuple(8, 16, &convolve10_sse2),
-    make_tuple(16, 16, &convolve10_sse2),
-    make_tuple(32, 16, &convolve10_sse2),
-    make_tuple(16, 32, &convolve10_sse2),
-    make_tuple(32, 32, &convolve10_sse2),
-    make_tuple(64, 32, &convolve10_sse2),
-    make_tuple(32, 64, &convolve10_sse2),
-    make_tuple(64, 64, &convolve10_sse2),
-    make_tuple(4, 4, &convolve12_sse2),
-    make_tuple(8, 4, &convolve12_sse2),
-    make_tuple(4, 8, &convolve12_sse2),
-    make_tuple(8, 8, &convolve12_sse2),
-    make_tuple(16, 8, &convolve12_sse2),
-    make_tuple(8, 16, &convolve12_sse2),
-    make_tuple(16, 16, &convolve12_sse2),
-    make_tuple(32, 16, &convolve12_sse2),
-    make_tuple(16, 32, &convolve12_sse2),
-    make_tuple(32, 32, &convolve12_sse2),
-    make_tuple(64, 32, &convolve12_sse2),
-    make_tuple(32, 64, &convolve12_sse2),
-    make_tuple(64, 64, &convolve12_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = {
+    ALL_SIZES(convolve8_sse2),
+    ALL_SIZES(convolve10_sse2),
+    ALL_SIZES(convolve12_sse2)
+};
 #else
 const ConvolveFunctions convolve8_sse2(
 #if CONFIG_USE_X86INC
@@ -1213,21 +1144,10 @@ const ConvolveFunctions convolve8_sse2(
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_sse2),
-    make_tuple(8, 4, &convolve8_sse2),
-    make_tuple(4, 8, &convolve8_sse2),
-    make_tuple(8, 8, &convolve8_sse2),
-    make_tuple(16, 8, &convolve8_sse2),
-    make_tuple(8, 16, &convolve8_sse2),
-    make_tuple(16, 16, &convolve8_sse2),
-    make_tuple(32, 16, &convolve8_sse2),
-    make_tuple(16, 32, &convolve8_sse2),
-    make_tuple(32, 32, &convolve8_sse2),
-    make_tuple(64, 32, &convolve8_sse2),
-    make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve_sse2));
 #endif
 
 #if HAVE_SSSE3
@@ -1238,22 +1158,11 @@ const ConvolveFunctions convolve8_ssse3(
     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
-    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_ssse3),
-    make_tuple(8, 4, &convolve8_ssse3),
-    make_tuple(4, 8, &convolve8_ssse3),
-    make_tuple(8, 8, &convolve8_ssse3),
-    make_tuple(16, 8, &convolve8_ssse3),
-    make_tuple(8, 16, &convolve8_ssse3),
-    make_tuple(16, 16, &convolve8_ssse3),
-    make_tuple(32, 16, &convolve8_ssse3),
-    make_tuple(16, 32, &convolve8_ssse3),
-    make_tuple(32, 32, &convolve8_ssse3),
-    make_tuple(64, 32, &convolve8_ssse3),
-    make_tuple(32, 64, &convolve8_ssse3),
-    make_tuple(64, 64, &convolve8_ssse3)));
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_ssse3));
 #endif
 
 #if HAVE_AVX2 && HAVE_SSSE3
@@ -1266,20 +1175,9 @@ const ConvolveFunctions convolve8_avx2(
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_avx2),
-    make_tuple(8, 4, &convolve8_avx2),
-    make_tuple(4, 8, &convolve8_avx2),
-    make_tuple(8, 8, &convolve8_avx2),
-    make_tuple(8, 16, &convolve8_avx2),
-    make_tuple(16, 8, &convolve8_avx2),
-    make_tuple(16, 16, &convolve8_avx2),
-    make_tuple(32, 16, &convolve8_avx2),
-    make_tuple(16, 32, &convolve8_avx2),
-    make_tuple(32, 32, &convolve8_avx2),
-    make_tuple(64, 32, &convolve8_avx2),
-    make_tuple(32, 64, &convolve8_avx2),
-    make_tuple(64, 64, &convolve8_avx2)));
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_avx2));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
 #if HAVE_NEON
@@ -1303,20 +1201,9 @@ const ConvolveFunctions convolve8_neon(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 #endif  // HAVE_NEON_ASM
 
-INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_neon),
-    make_tuple(8, 4, &convolve8_neon),
-    make_tuple(4, 8, &convolve8_neon),
-    make_tuple(8, 8, &convolve8_neon),
-    make_tuple(16, 8, &convolve8_neon),
-    make_tuple(8, 16, &convolve8_neon),
-    make_tuple(16, 16, &convolve8_neon),
-    make_tuple(32, 16, &convolve8_neon),
-    make_tuple(16, 32, &convolve8_neon),
-    make_tuple(32, 32, &convolve8_neon),
-    make_tuple(64, 32, &convolve8_neon),
-    make_tuple(32, 64, &convolve8_neon),
-    make_tuple(64, 64, &convolve8_neon)));
+const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) };
+INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_neon));
 #endif  // HAVE_NEON
 
 #if HAVE_DSPR2
@@ -1329,21 +1216,10 @@ const ConvolveFunctions convolve8_dspr2(
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_dspr2),
-    make_tuple(8, 4, &convolve8_dspr2),
-    make_tuple(4, 8, &convolve8_dspr2),
-    make_tuple(8, 8, &convolve8_dspr2),
-    make_tuple(16, 8, &convolve8_dspr2),
-    make_tuple(8, 16, &convolve8_dspr2),
-    make_tuple(16, 16, &convolve8_dspr2),
-    make_tuple(32, 16, &convolve8_dspr2),
-    make_tuple(16, 32, &convolve8_dspr2),
-    make_tuple(32, 32, &convolve8_dspr2),
-    make_tuple(64, 32, &convolve8_dspr2),
-    make_tuple(32, 64, &convolve8_dspr2),
-    make_tuple(64, 64, &convolve8_dspr2)));
-#endif
+const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) };
+INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_dspr2));
+#endif  // HAVE_DSPR2
 
 #if HAVE_MSA
 const ConvolveFunctions convolve8_msa(
@@ -1355,19 +1231,8 @@ const ConvolveFunctions convolve8_msa(
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
-INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
-    make_tuple(4, 4, &convolve8_msa),
-    make_tuple(8, 4, &convolve8_msa),
-    make_tuple(4, 8, &convolve8_msa),
-    make_tuple(8, 8, &convolve8_msa),
-    make_tuple(16, 8, &convolve8_msa),
-    make_tuple(8, 16, &convolve8_msa),
-    make_tuple(16, 16, &convolve8_msa),
-    make_tuple(32, 16, &convolve8_msa),
-    make_tuple(16, 32, &convolve8_msa),
-    make_tuple(32, 32, &convolve8_msa),
-    make_tuple(64, 32, &convolve8_msa),
-    make_tuple(32, 64, &convolve8_msa),
-    make_tuple(64, 64, &convolve8_msa)));
+const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
+                        ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
 }  // namespace
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 8baa2f9..2cad30f 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,7 +26,8 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)),
         encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)),
-        min_psnr_(kMaxPSNR) {}
+        min_psnr_(kMaxPSNR),
+        tune_content_(VP9E_CONTENT_DEFAULT) {}
   virtual ~CpuSpeedTest() {}
 
   virtual void SetUp() {
@@ -49,6 +50,7 @@ class CpuSpeedTest
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -66,6 +68,7 @@ class CpuSpeedTest
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
   double min_psnr_;
+  int tune_content_;
 };
 
 TEST_P(CpuSpeedTest, TestQ0) {
@@ -74,7 +77,7 @@ TEST_P(CpuSpeedTest, TestQ0) {
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 400;
   cfg_.rc_max_quantizer = 0;
   cfg_.rc_min_quantizer = 0;
@@ -92,7 +95,7 @@ TEST_P(CpuSpeedTest, TestScreencastQ0) {
   ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
   cfg_.g_timebase = video.timebase();
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 400;
   cfg_.rc_max_quantizer = 0;
   cfg_.rc_min_quantizer = 0;
@@ -103,13 +106,28 @@ TEST_P(CpuSpeedTest, TestScreencastQ0) {
   EXPECT_GE(min_psnr_, kMaxPSNR);
 }
 
+TEST_P(CpuSpeedTest, TestTuneScreen) {
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  tune_content_ = VP9E_CONTENT_SCREEN;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
 TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
   // Validate that this non multiple of 64 wide clip encodes and decodes
   // without a mismatch when passing in a very low max q.  This pushes
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 12000;
   cfg_.rc_max_quantizer = 10;
   cfg_.rc_min_quantizer = 0;
@@ -125,7 +143,7 @@ TEST_P(CpuSpeedTest, TestLowBitrate) {
   // when passing in a very high min q.  This pushes the encoder to producing
   // lots of small partitions which might will test the other condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 200;
   cfg_.rc_min_quantizer = 40;
 
@@ -140,9 +158,4 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
                       ::libvpx_test::kRealTime),
     ::testing::Range(0, 9));
-
-VP10_INSTANTIATE_TEST_CASE(
-    CpuSpeedTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
-    ::testing::Range(0, 3));
 }  // namespace
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index b6cae79..220cbf3 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -90,7 +90,7 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
           << pkt->data.frame.pts;
     }
 
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    const int64_t frame_size_in_bits = pkt->data.frame.sz * 8;
 
     // Subtract from the buffer the bits associated with a played back frame.
     bits_in_buffer_model_ -= frame_size_in_bits;
@@ -135,7 +135,7 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest,
   double duration_;
   double file_datarate_;
   double effective_datarate_;
-  size_t bits_in_last_frame_;
+  int64_t bits_in_last_frame_;
   int denoiser_on_;
   int denoiser_offon_test_;
   int denoiser_offon_period_;
@@ -450,7 +450,28 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest,
   int denoiser_offon_period_;
 };
 
-// Check basic rate targeting,
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) {
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = VPX_VBR;
+  cfg_.g_lag_in_frames = 0;
+
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 300);
+  for (int i = 400; i <= 800; i += 400) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+        << " The datarate for the file is greater than target by too much!";
+  }
+}
+
+// Check basic rate targeting for CBR,
 TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
@@ -474,7 +495,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
   }
 }
 
-// Check basic rate targeting,
+// Check basic rate targeting for CBR.
 TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
   ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
 
@@ -519,6 +540,9 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
   cfg_.rc_end_usage = VPX_CBR;
   cfg_.rc_target_bitrate = 200;
   cfg_.g_lag_in_frames = 0;
+  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+  // interval (128).
+  cfg_.kf_max_dist = 9999;
 
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 140);
@@ -538,7 +562,7 @@ TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
         << " The first dropped frame for drop_thresh " << i
         << " > first dropped frame for drop_thresh "
         << i - kDropFrameThreshTestStep;
-    ASSERT_GE(num_drops_, last_num_drops * 0.90)
+    ASSERT_GE(num_drops_, last_num_drops * 0.85)
         << " The number of dropped frames for drop_thresh " << i
         << " < number of dropped frames for drop_thresh "
         << i - kDropFrameThreshTestStep;
@@ -770,14 +794,10 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 0) {
       int i;
-      for (i = 0; i < 2; ++i) {
+      for (i = 0; i < VPX_MAX_LAYERS; ++i) {
         svc_params_.max_quantizers[i] = 63;
         svc_params_.min_quantizers[i] = 0;
       }
-      svc_params_.scaling_factor_num[0] = 144;
-      svc_params_.scaling_factor_den[0] = 288;
-      svc_params_.scaling_factor_num[1] = 288;
-      svc_params_.scaling_factor_den[1] = 288;
       encoder->Control(VP9E_SET_SVC, 1);
       encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
       encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
@@ -814,8 +834,6 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
     if (bits_total_) {
       const double file_size_in_kb = bits_total_ / 1000.;  // bits per kilobit
       duration_ = (last_pts_ + 1) * timebase_;
-      effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
-          / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
       file_datarate_ = file_size_in_kb / duration_;
     }
   }
@@ -839,7 +857,6 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest,
   int64_t bits_total_;
   double duration_;
   double file_datarate_;
-  double effective_datarate_;
   size_t bits_in_last_frame_;
   vpx_svc_extra_cfg_t svc_params_;
   int speed_setting_;
@@ -850,8 +867,7 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
     const vpx_svc_extra_cfg_t *svc_params,
     int spatial_layers,
     int temporal_layers,
-    int temporal_layering_mode,
-    unsigned int total_rate) {
+    int temporal_layering_mode) {
   int sl, spatial_layer_target;
   float total = 0;
   float alloc_ratio[VPX_MAX_LAYERS] = {0};
@@ -885,7 +901,7 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg,
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -905,31 +921,71 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
   svc_params_.scaling_factor_den[0] = 288;
   svc_params_.scaling_factor_num[1] = 288;
   svc_params_.scaling_factor_den[1] = 288;
-  // TODO(wonkap/marpan): No frame drop for now, we need to implement correct
-  // frame dropping for SVC.
-  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 200);
   // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
-  // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
-  for (int i = 400; i <= 800; i += 200) {
+  // layer target_bitrate.
+  for (int i = 200; i <= 800; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-        cfg_.rc_target_bitrate);
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
             << " The datarate for the file exceeds the target by too much!";
     ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
         << " The datarate for the file is lower than the target by too much!";
-    EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 2;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 144;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 288;
+  svc_params_.scaling_factor_den[1] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 200);
+  cfg_.rc_target_bitrate = 400;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+  for (int j = 64; j <= 67; j++) {
+    cfg_.kf_max_dist = j;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
   }
 }
 
 // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
 // 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
   cfg_.rc_buf_sz = 1000;
@@ -949,30 +1005,152 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
   svc_params_.scaling_factor_den[0] = 288;
   svc_params_.scaling_factor_num[1] = 288;
   svc_params_.scaling_factor_den[1] = 288;
-  // TODO(wonkap/marpan): No frame drop for now, we need to implement correct
-  // frame dropping for SVC.
-  cfg_.rc_dropframe_thresh = 0;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
   ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
                                        30, 1, 0, 300);
   cfg_.rc_target_bitrate = 800;
   ResetModel();
   assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-      cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-      cfg_.rc_target_bitrate);
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
   ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
       << " The datarate for the file is lower than the target by too much!";
-  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+     cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 1;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+  // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+  for (int j = 32; j <= 35; j++) {
+    cfg_.kf_max_dist = j;
+    ResetModel();
+    assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+       cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+            << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
+        << " The datarate for the file is lower than the target by too much!";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.ss_number_layers = 3;
+  cfg_.ts_number_layers = 3;
+  cfg_.ts_rate_decimator[0] = 4;
+  cfg_.ts_rate_decimator[1] = 2;
+  cfg_.ts_rate_decimator[2] = 1;
+  cfg_.g_error_resilient = 1;
+  cfg_.g_threads = 4;
+  cfg_.temporal_layering_mode = 3;
+  svc_params_.scaling_factor_num[0] = 72;
+  svc_params_.scaling_factor_den[0] = 288;
+  svc_params_.scaling_factor_num[1] = 144;
+  svc_params_.scaling_factor_den[1] = 288;
+  svc_params_.scaling_factor_num[2] = 288;
+  svc_params_.scaling_factor_den[2] = 288;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.kf_max_dist = 9999;
+  ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+                                       30, 1, 0, 300);
+  cfg_.rc_target_bitrate = 800;
+  ResetModel();
+  assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+          << " The datarate for the file exceeds the target by too much!";
+  ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+      << " The datarate for the file is lower than the target by too much!";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
 }
 
 VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
                           ::testing::Values(::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(2, 7));
+                          ::testing::Range(2, 9));
 VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
                           ::testing::Values(::libvpx_test::kRealTime),
-                          ::testing::Range(5, 8));
+                          ::testing::Range(5, 9));
 }  // namespace
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 332210d..ddaf939 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -276,12 +276,12 @@ void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_10(in, out, stride);
 }
 
 void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_12(in, out, stride);
 }
 
@@ -293,6 +293,7 @@ void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
   vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
 }
 
+#if HAVE_SSE2
 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
 }
@@ -301,7 +302,6 @@ void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
 }
 
-#if HAVE_SSE2
 void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
 }
@@ -373,10 +373,10 @@ class Trans16x16TestBase {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int32_t diff =
             bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int32_t diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         if (max_error < error)
@@ -778,7 +778,7 @@ class InvTrans16x16DCT
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
@@ -792,6 +792,67 @@ TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
 
+class PartialTrans16x16Test
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
+ public:
+  virtual ~PartialTrans16x16Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    bit_depth_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  FdctFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans16x16Test, Extremes) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  const int minval = -maxval;
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
+}
+
+TEST_P(PartialTrans16x16Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  int sum = 0;
+  for (int i = 0; i < kNumCoeffs; ++i) {
+    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+    input[i] = val;
+    sum += val;
+  }
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+  EXPECT_EQ(sum >> 1, output[0]);
+}
+
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -824,6 +885,11 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    C, PartialTrans16x16Test,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
@@ -832,6 +898,9 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
+                                                     VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -859,6 +928,9 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_8),
         make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
                    VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -896,6 +968,9 @@ INSTANTIATE_TEST_CASE_P(
                    &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
         make_tuple(&idct16x16_12,
                    &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -912,5 +987,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
                    VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
+                        ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f7327b1..16d8825 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -81,10 +81,6 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void idct32x32_8(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct32x32_1024_add_c(in, out, stride, 8);
-}
-
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
 }
@@ -158,10 +154,10 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
 
     for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-      const uint32_t diff =
+      const int32_t diff =
           bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-      const uint32_t diff = dst[j] - src[j];
+      const int32_t diff = dst[j] - src[j];
 #endif
       const uint32_t error = diff * diff;
       if (max_error < error)
@@ -309,6 +305,67 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }
 
+class PartialTrans32x32Test
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > {
+ public:
+  virtual ~PartialTrans32x32Test() {}
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    bit_depth_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  vpx_bit_depth_t bit_depth_;
+  FwdTxfmFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans32x32Test, Extremes) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  const int minval = -maxval;
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+  EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
+
+  for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+  EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
+}
+
+TEST_P(PartialTrans32x32Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16_t maxval =
+      static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+  const int16_t maxval = 255;
+#endif
+  DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+  DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  int sum = 0;
+  for (int i = 0; i < kNumCoeffs; ++i) {
+    const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+    input[i] = val;
+    sum += val;
+  }
+  output[0] = 0;
+  ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+  EXPECT_EQ(sum >> 3, output[0]);
+}
+
 using std::tr1::make_tuple;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -327,6 +384,11 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_c,
                    &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+    C, PartialTrans32x32Test,
+    ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8),
+                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10),
+                      make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans32x32Test,
@@ -335,9 +397,12 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_c,
                    &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_c,
+                                                     VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans32x32Test,
     ::testing::Values(
@@ -345,7 +410,7 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_c,
                    &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
@@ -355,6 +420,9 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_sse2,
                    &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -371,6 +439,9 @@ INSTANTIATE_TEST_CASE_P(
                    VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1,
                    VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -391,5 +462,8 @@ INSTANTIATE_TEST_CASE_P(
                    &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8),
         make_tuple(&vpx_fdct32x32_rd_msa,
                    &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa,
+                                                     VPX_BITS_8)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc
index 318351b..99b4db1 100644
--- a/test/decode_api_test.cc
+++ b/test/decode_api_test.cc
@@ -27,9 +27,6 @@ TEST(DecodeAPI, InvalidParams) {
 #if CONFIG_VP9_DECODER
     &vpx_codec_vp9_dx_algo,
 #endif
-#if CONFIG_VP10_DECODER
-    &vpx_codec_vp10_dx_algo,
-#endif
   };
   uint8_t buf[1] = {0};
   vpx_codec_ctx_t dec;
@@ -146,6 +143,40 @@ TEST(DecodeAPI, Vp9InvalidDecode) {
   TestVp9Controls(&dec);
   EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
 }
+
+TEST(DecodeAPI, Vp9PeekSI) {
+  const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo;
+  // The first 9 bytes are valid and the rest of the bytes are made up. Until
+  // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it
+  // should return VPX_CODEC_CORRUPT_FRAME.
+  const uint8_t data[32] = {
+    0x85, 0xa4, 0xc1, 0xa1, 0x38, 0x81, 0xa3, 0x49,
+    0x83, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+  };
+
+  for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) {
+    // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get
+    // to decoder_peek_si_internal on frames of size < 8.
+    if (data_sz >= 8) {
+      vpx_codec_ctx_t dec;
+      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0));
+      EXPECT_EQ((data_sz < 10) ?
+                    VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME,
+                vpx_codec_decode(&dec, data, data_sz, NULL, 0));
+      vpx_codec_iter_t iter = NULL;
+      EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter));
+      EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec));
+    }
+
+    // Verify behavior of vpx_codec_peek_stream_info.
+    vpx_codec_stream_info_t si;
+    si.sz = sizeof(si);
+    EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK,
+              vpx_codec_peek_stream_info(codec, data, data_sz, &si));
+  }
+}
 #endif  // CONFIG_VP9_DECODER
 
 }  // namespace
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
new file mode 100644
index 0000000..94afdde
--- /dev/null
+++ b/test/encode_api_test.cc
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+namespace {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+TEST(EncodeAPI, InvalidParams) {
+  static const vpx_codec_iface_t *kCodecs[] = {
+#if CONFIG_VP8_ENCODER
+    &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+    &vpx_codec_vp9_cx_algo,
+#endif
+  };
+  uint8_t buf[1] = {0};
+  vpx_image_t img;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+
+  EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf));
+
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(NULL, NULL, 0));
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+            vpx_codec_enc_config_default(NULL, &cfg, 0));
+  EXPECT_TRUE(vpx_codec_error(NULL) != NULL);
+
+  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+    EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
+
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0));
+
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
+  }
+}
+
+}  // namespace
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 128436e..b8c7371 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -43,15 +43,6 @@ void Encoder::InitEncoder(VideoSource *video) {
       ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
     } else
 #endif
-#if CONFIG_VP10_ENCODER
-    if (CodecInterface() == &vpx_codec_vp10_cx_algo) {
-      // Default to 1 tile column for VP10.
-      const int log2_tile_columns = 0;
-      res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
-                               log2_tile_columns);
-      ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
-    } else
-#endif
     {
 #if CONFIG_VP8_ENCODER
       ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 6d0a72f..d14ddc7 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -16,7 +16,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
 #include "vpx/vpx_encoder.h"
@@ -143,7 +143,7 @@ class Encoder {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
   }
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
   void Control(int ctrl_id, vpx_active_map_t *arg) {
     const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 9a2ad2f..00a095c 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -100,7 +100,7 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest,
   }
 
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+                                  ::libvpx_test::Encoder * /*encoder*/) {
     frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |
                       VP8_EFLAG_NO_UPD_GF |
                       VP8_EFLAG_NO_UPD_ARF);
@@ -596,7 +596,4 @@ VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls,
                           ONE_PASS_TEST_MODES);
 VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
                           ::testing::Values(true));
-// SVC-related tests don't run for VP10 since SVC is not supported.
-VP10_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
-                           ::testing::Values(false));
 }  // namespace
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index d02dca2..2570f44 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -24,7 +24,6 @@
 namespace {
 
 const int kVideoNameParam = 1;
-const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
 
 struct ExternalFrameBuffer {
   uint8_t *data;
@@ -155,6 +154,8 @@ class ExternalFrameBufferList {
   ExternalFrameBuffer *ext_fb_list_;
 };
 
+#if CONFIG_WEBM_IO
+
 // Callback used by libvpx to request the application to return a frame
 // buffer of at least |min_size| in bytes.
 int get_vp9_frame_buffer(void *user_priv, size_t min_size,
@@ -197,6 +198,8 @@ int do_not_release_vp9_frame_buffer(void *user_priv,
   return 0;
 }
 
+#endif  // CONFIG_WEBM_IO
+
 // Class for testing passing in external frame buffers to libvpx.
 class ExternalFrameBufferMD5Test
     : public ::libvpx_test::DecoderTest,
@@ -278,6 +281,8 @@ class ExternalFrameBufferMD5Test
 };
 
 #if CONFIG_WEBM_IO
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+
 // Class for testing passing in external frame buffers to libvpx.
 class ExternalFrameBufferTest : public ::testing::Test {
  protected:
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 3f6b738..735cccf 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -40,7 +40,7 @@ typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
 
 void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int tx_type) {
+                 int /*tx_type*/) {
   vpx_fdct4x4_c(in, out, stride);
 }
 
@@ -49,7 +49,7 @@ void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
 }
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int tx_type) {
+                 int /*tx_type*/) {
   vp9_fwht4x4_c(in, out, stride);
 }
 
@@ -141,11 +141,11 @@ class Trans4x4TestBase {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
         ASSERT_EQ(VPX_BITS_8, bit_depth_);
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         if (max_error < error)
@@ -258,10 +258,10 @@ class Trans4x4TestBase {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_GE(static_cast<uint32_t>(limit), error)
@@ -487,19 +487,11 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
-    MMX, Trans4x4WHT,
-    ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#endif
-
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
+        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
         make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
 #endif
 
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index c0deaf4..29f2158 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -47,7 +47,7 @@ typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
 typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
 
-void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+void reference_8x8_dct_1d(const double in[8], double out[8]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 8; k++) {
     out[k] = 0.0;
@@ -65,7 +65,7 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
     double temp_in[8], temp_out[8];
     for (int j = 0; j < 8; ++j)
       temp_in[j] = input[j*8 + i];
-    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    reference_8x8_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 8; ++j)
       output[j * 8 + i] = temp_out[j];
   }
@@ -74,7 +74,7 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
     double temp_in[8], temp_out[8];
     for (int j = 0; j < 8; ++j)
       temp_in[j] = output[j + i*8];
-    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    reference_8x8_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 8; ++j)
       output[j + i * 8] = temp_out[j] * 2;
@@ -82,7 +82,8 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
 }
 
 
-void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int /*tx_type*/) {
   vpx_fdct8x8_c(in, out, stride);
 }
 
@@ -107,6 +108,8 @@ void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
   vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
 }
 
+#if HAVE_SSE2
+
 void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct8x8_10_add_c(in, out, stride, 10);
 }
@@ -115,7 +118,6 @@ void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct8x8_10_add_c(in, out, stride, 12);
 }
 
-#if HAVE_SSE2
 void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
 }
@@ -423,10 +425,10 @@ class FwdTrans8x8TestBase {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
@@ -456,7 +458,7 @@ class FwdTrans8x8TestBase {
         coeff_r[j] = static_cast<tran_low_t>(round(out_r[j]));
 
       for (int j = 0; j < kNumCoeffs; ++j) {
-        const uint32_t diff = coeff[j] - coeff_r[j];
+        const int32_t diff = coeff[j] - coeff_r[j];
         const uint32_t error = diff * diff;
         EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
             << "Error: 8x8 DCT has error " << error
@@ -509,10 +511,10 @@ void CompareInvReference(IdctFunc ref_txfm, int thresh) {
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_VP9_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
 #else
-        const uint32_t diff = dst[j] - ref[j];
+        const int diff = dst[j] - ref[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_EQ(0u, error)
@@ -641,7 +643,7 @@ class InvTrans8x8DCT
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
-  void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
+  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
 
   IdctFunc ref_txfm_;
   IdctFunc inv_txfm_;
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
new file mode 100644
index 0000000..7a5bd5b
--- /dev/null
+++ b/test/hadamard_test.cc
@@ -0,0 +1,220 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+
+void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
+  int16_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
+    b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+  }
+  int16_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
+  int16_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(a + i, a_stride, buf + i * 8);
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    hadamard_loop(buf + i, 8, b + i * 8);
+  }
+}
+
+void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const int16_t a0 = b[0];
+    const int16_t a1 = b[64];
+    const int16_t a2 = b[128];
+    const int16_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
+    const int16_t b0 = (a0 + a1) >> 1;
+    const int16_t b1 = (a0 - a1) >> 1;
+    const int16_t b2 = (a2 + a3) >> 1;
+    const int16_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[  0] = b0 + b2;
+    b[ 64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  HadamardFunc h_func_;
+  ACMRandom rnd_;
+};
+
+class Hadamard8x8Test : public HadamardTestBase {};
+
+TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[64]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard8x8(a, 8, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 64);
+  std::sort(b_ref, b_ref + 64);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard8x8Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[64]);
+  int16_t b_ref[64];
+  for (int i = 0; i < 64 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard8x8(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 64);
+    std::sort(b_ref, b_ref + 64);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_ssse3));
+#endif  // HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_neon));
+#endif  // HAVE_NEON
+
+class Hadamard16x16Test : public HadamardTestBase {};
+
+TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+  memset(b, 0, sizeof(b));
+  memset(b_ref, 0, sizeof(b_ref));
+
+  reference_hadamard16x16(a, 16, b_ref);
+  ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+
+  // The order of the output is not important. Sort before checking.
+  std::sort(b, b + 16 * 16);
+  std::sort(b_ref, b_ref + 16 * 16);
+  EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard16x16Test, VaryStride) {
+  DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
+  DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+  int16_t b_ref[16 * 16];
+  for (int i = 0; i < 16 * 16 * 8; ++i) {
+    a[i] = rnd_.Rand9Signed();
+  }
+
+  for (int i = 8; i < 64; i += 8) {
+    memset(b, 0, sizeof(b));
+    memset(b_ref, 0, sizeof(b_ref));
+
+    reference_hadamard16x16(a, i, b_ref);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + 16 * 16);
+    std::sort(b_ref, b_ref + 16 * 16);
+    EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_neon));
+#endif  // HAVE_NEON
+}  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
new file mode 100644
index 0000000..62d0247
--- /dev/null
+++ b/test/level_test.cc
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+class LevelTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  LevelTest()
+     : EncoderTest(GET_PARAM(0)),
+       encoding_mode_(GET_PARAM(1)),
+       cpu_used_(GET_PARAM(2)),
+       min_gf_internal_(24),
+       target_level_(0),
+       level_(0) {}
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+    }
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_gf_internal_);
+      if (encoding_mode_ != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+    encoder->Control(VP9E_GET_LEVEL, &level_);
+    ASSERT_LE(level_, 51);
+    ASSERT_GE(level_, 0);
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int min_gf_internal_;
+  int target_level_;
+  int level_;
+};
+
+// Test for keeping level stats only
+TEST_P(LevelTest, TestTargetLevel0) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+  target_level_ = 0;
+  min_gf_internal_ = 4;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(11, level_);
+
+  cfg_.rc_target_bitrate = 1600;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_EQ(20, level_);
+}
+
+// Test for level control being turned off
+TEST_P(LevelTest, TestTargetLevel255) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       30);
+  target_level_ = 255;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
+  static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo;
+  vpx_codec_ctx_t enc;
+  vpx_codec_enc_cfg_t cfg;
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int level = 0; level <= 256; ++level) {
+    if (level == 10 || level == 11 || level == 20 || level == 21 ||
+        level == 30 || level == 31 || level == 40 || level == 41 ||
+        level == 50 || level == 51 || level == 52 || level == 60 ||
+        level == 61 || level == 62 || level == 0 || level == 255)
+      EXPECT_EQ(VPX_CODEC_OK,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+    else
+      EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+                vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level));
+  }
+  EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
+}
+
+VP9_INSTANTIATE_TEST_CASE(LevelTest,
+                          ::testing::Values(::libvpx_test::kTwoPassGood,
+                                            ::libvpx_test::kOnePassGood),
+                          ::testing::Range(0, 9));
+}  // namespace
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 0bf6b0c..94646e4 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -37,120 +37,23 @@ const int number_of_iterations = 10000;
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*loop_op_t)(uint16_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh,
-                          int count, int bd);
+                          int bd);
 typedef void (*dual_loop_op_t)(uint16_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1, int bd);
 #else
 typedef void (*loop_op_t)(uint8_t *s, int p, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count);
+                          const uint8_t *limit, const uint8_t *thresh);
 typedef void (*dual_loop_op_t)(uint8_t *s, int p, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
 typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
 
-#if HAVE_SSE2
-#if CONFIG_VP9_HIGHBITDEPTH
-void wrapper_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count, int bd) {
-  vpx_highbd_lpf_vertical_16_sse2(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count, int bd) {
-  vpx_highbd_lpf_vertical_16_c(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_dual_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count, int bd) {
-  vpx_highbd_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh, bd);
-}
-
-void wrapper_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count, int bd) {
-  vpx_highbd_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh, bd);
-}
-#else
-void wrapper_vertical_16_sse2(uint8_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count) {
-  vpx_lpf_vertical_16_sse2(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_sse2(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  vpx_lpf_vertical_16_dual_sse2(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count) {
-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_SSE2
-
-#if HAVE_NEON_ASM
-#if CONFIG_VP9_HIGHBITDEPTH
-// No neon high bitdepth functions.
-#else
-void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh,
-                              int count) {
-  vpx_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  vpx_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh,
-                                int count) {
-  vpx_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_NEON_ASM
-
-#if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
-void wrapper_vertical_16_msa(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int count) {
-  vpx_lpf_vertical_16_msa(s, p, blimit, limit, thresh);
-}
-
-void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh,
-                           int count) {
-  vpx_lpf_vertical_16_c(s, p, blimit, limit, thresh);
-}
-#endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
-
 class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
  public:
   virtual ~Loop8Test6Param() {}
@@ -158,7 +61,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
     loopfilter_op_ = GET_PARAM(0);
     ref_loopfilter_op_ = GET_PARAM(1);
     bit_depth_ = GET_PARAM(2);
-    count_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
   }
 
@@ -166,7 +68,6 @@ class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
 
  protected:
   int bit_depth_;
-  int count_;
   int mask_;
   loop_op_t loopfilter_op_;
   loop_op_t ref_loopfilter_op_;
@@ -253,13 +154,13 @@ TEST_P(Loop8Test6Param, OperationCheck) {
       ref_s[j] = s[j];
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     for (int j = 0; j < kNumCoeffs; ++j) {
@@ -325,13 +226,13 @@ TEST_P(Loop8Test6Param, ValueCheck) {
       ref_s[j] = s[j];
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
+    ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, bd);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, bd));
 #else
-    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
+    ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh);
     ASM_REGISTER_STATE_CHECK(
-        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
+        loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     for (int j = 0; j < kNumCoeffs; ++j) {
       err_count += ref_s[j] != s[j];
@@ -535,64 +436,73 @@ INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 8, 1),
+                   &vpx_highbd_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 8, 1),
+                   &vpx_highbd_lpf_vertical_4_c, 8),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 8, 2),
+                   &vpx_highbd_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 8),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 8, 1),
+                   &vpx_highbd_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 8),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 10, 1),
+                   &vpx_highbd_lpf_horizontal_4_c, 10),
         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 10, 1),
+                   &vpx_highbd_lpf_vertical_4_c, 10),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 10, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 10, 2),
+                   &vpx_highbd_lpf_horizontal_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 10),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 10, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 10, 1),
+                   &vpx_highbd_lpf_vertical_8_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 10),
         make_tuple(&vpx_highbd_lpf_horizontal_4_sse2,
-                   &vpx_highbd_lpf_horizontal_4_c, 12, 1),
+                   &vpx_highbd_lpf_horizontal_4_c, 12),
         make_tuple(&vpx_highbd_lpf_vertical_4_sse2,
-                   &vpx_highbd_lpf_vertical_4_c, 12, 1),
+                   &vpx_highbd_lpf_vertical_4_c, 12),
         make_tuple(&vpx_highbd_lpf_horizontal_8_sse2,
-                   &vpx_highbd_lpf_horizontal_8_c, 12, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 12, 1),
-        make_tuple(&vpx_highbd_lpf_horizontal_16_sse2,
-                   &vpx_highbd_lpf_horizontal_16_c, 12, 2),
+                   &vpx_highbd_lpf_horizontal_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_8_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_horizontal_edge_16_sse2,
+                   &vpx_highbd_lpf_horizontal_edge_16_c, 12),
         make_tuple(&vpx_highbd_lpf_vertical_8_sse2,
-                   &vpx_highbd_lpf_vertical_8_c, 12, 1),
-        make_tuple(&wrapper_vertical_16_sse2,
-                   &wrapper_vertical_16_c, 12, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 10, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 12, 1)));
+                   &vpx_highbd_lpf_vertical_8_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_16_sse2,
+                   &vpx_highbd_lpf_vertical_16_c, 12),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 8),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 10),
+        make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2,
+                   &vpx_highbd_lpf_vertical_16_dual_c, 12)));
 #else
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_sse2, &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_sse2,
-                   &wrapper_vertical_16_dual_c, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_4_sse2,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_sse2,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_sse2,
+                   &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_sse2,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_sse2,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_sse2,
+                   &vpx_lpf_vertical_16_dual_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif
 
@@ -600,9 +510,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     AVX2, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8,
-                   2)));
+        make_tuple(&vpx_lpf_horizontal_edge_8_avx2,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_avx2,
+                   &vpx_lpf_horizontal_edge_16_c, 8)));
 #endif
 
 #if HAVE_SSE2
@@ -659,23 +570,23 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_NEON_ASM
 // Using #if inside the macro is unsupported on MSVS but the tests are not
 // currently built for MSVS with ARM and NEON.
-        make_tuple(&vpx_lpf_horizontal_16_neon,
-                   &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_neon,
-                   &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&wrapper_vertical_16_neon,
-                   &wrapper_vertical_16_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_dual_neon,
-                   &wrapper_vertical_16_dual_c, 8, 1),
+        make_tuple(&vpx_lpf_horizontal_edge_8_neon,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_neon,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_neon,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_neon,
+                   &vpx_lpf_vertical_16_dual_c, 8),
 #endif  // HAVE_NEON_ASM
         make_tuple(&vpx_lpf_horizontal_8_neon,
-                   &vpx_lpf_horizontal_8_c, 8, 1),
+                   &vpx_lpf_horizontal_8_c, 8),
         make_tuple(&vpx_lpf_vertical_8_neon,
-                   &vpx_lpf_vertical_8_c, 8, 1),
+                   &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_4_neon,
-                   &vpx_lpf_horizontal_4_c, 8, 1),
+                   &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_vertical_4_neon,
-                   &vpx_lpf_vertical_4_c, 8, 1)));
+                   &vpx_lpf_vertical_4_c, 8)));
 INSTANTIATE_TEST_CASE_P(
     NEON, Loop8Test9Param,
     ::testing::Values(
@@ -692,15 +603,58 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    DSPR2, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dspr2,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dspr2,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8,
+                   &vpx_lpf_horizontal_edge_8, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16,
+                   &vpx_lpf_horizontal_edge_16, 8),
+        make_tuple(&vpx_lpf_vertical_4_dspr2,
+                   &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dspr2,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dspr2,
+                   &vpx_lpf_vertical_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_dspr2,
+                   &vpx_lpf_vertical_16_dual_c, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    DSPR2, Loop8Test9Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_dual_dspr2,
+                   &vpx_lpf_horizontal_4_dual_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_dual_dspr2,
+                   &vpx_lpf_horizontal_8_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_dual_dspr2,
+                   &vpx_lpf_vertical_4_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_dual_dspr2,
+                   &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
+
 #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test6Param,
     ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_8_msa, &vpx_lpf_horizontal_8_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 1),
-        make_tuple(&vpx_lpf_horizontal_16_msa, &vpx_lpf_horizontal_16_c, 8, 2),
-        make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8, 1),
-        make_tuple(&wrapper_vertical_16_msa, &wrapper_vertical_16_c, 8, 1)));
+        make_tuple(&vpx_lpf_horizontal_4_msa,
+                   &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_msa,
+                   &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_8_msa,
+                   &vpx_lpf_horizontal_edge_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_edge_16_msa,
+                   &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_msa,
+                   &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_msa,
+                   &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_msa,
+                   &vpx_lpf_vertical_16_c, 8)));
 
 INSTANTIATE_TEST_CASE_P(
     MSA, Loop8Test9Param,
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
new file mode 100644
index 0000000..dbe4342
--- /dev/null
+++ b/test/minmax_test.cc
@@ -0,0 +1,132 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride,
+                           const uint8_t *b, int b_stride,
+                           int *min, int *max);
+
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+  virtual void SetUp() {
+    mm_func_ = GetParam();
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+ protected:
+  MinMaxFunc mm_func_;
+  ACMRandom rnd_;
+};
+
+void reference_minmax(const uint8_t *a, int a_stride,
+                      const uint8_t *b, int b_stride,
+                      int *min_ret, int *max_ret) {
+  int min = 255;
+  int max = 0;
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
+      if (min > diff) min = diff;
+      if (max < diff) max = diff;
+    }
+  }
+
+  *min_ret = min;
+  *max_ret = max;
+}
+
+TEST_P(MinMaxTest, MinValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 255, sizeof(b));
+    b[i] = i;  // Set a minimum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(255, max);
+    EXPECT_EQ(i, min);
+  }
+}
+
+TEST_P(MinMaxTest, MaxValue) {
+  for (int i = 0; i < 64; i++) {
+    uint8_t a[64], b[64];
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+    b[i] = i;  // Set a maximum difference of i.
+
+    int min, max;
+    ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+    EXPECT_EQ(i, max);
+    EXPECT_EQ(0, min);
+  }
+}
+
+TEST_P(MinMaxTest, CompareReference) {
+  uint8_t a[64], b[64];
+  for (int j = 0; j < 64; j++) {
+    a[j] = rnd_.Rand8();
+    b[j] = rnd_.Rand8();
+  }
+
+  int min_ref, max_ref, min, max;
+  reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+  ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+  EXPECT_EQ(max_ref, max);
+  EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+  uint8_t a[8 * 64], b[8 * 64];
+  for (int i = 0; i < 8 * 64; i++) {
+    a[i] = rnd_.Rand8();
+    b[i] = rnd_.Rand8();
+  }
+  for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+    for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+      int min_ref, max_ref, min, max;
+      reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+      ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+      EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+      EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
+                              << " and b_stride = " << b_stride;;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
+                        ::testing::Values(&vpx_minmax_8x8_neon));
+#endif
+
+}  // namespace
diff --git a/test/realtime_test.cc b/test/realtime_test.cc
new file mode 100644
index 0000000..24749e4
--- /dev/null
+++ b/test/realtime_test.cc
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 2;
+
+class RealtimeTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  RealtimeTest()
+      : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
+  virtual ~RealtimeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    cfg_.g_lag_in_frames = 0;
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    // TODO(tomfinegan): We're changing the pass value here to make sure
+    // we get frames when real time mode is combined with |g_pass| set to
+    // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
+    // the pass value based on the mode passed into EncoderTest::SetMode(),
+    // which overrides the one specified in SetUp() above.
+    cfg_.g_pass = VPX_RC_FIRST_PASS;
+  }
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+    frame_packets_++;
+  }
+
+  int frame_packets_;
+};
+
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+  video.set_limit(kFramesToEncode);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_EQ(kFramesToEncode, frame_packets_);
+}
+
+VP8_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_CASE(RealtimeTest,
+                          ::testing::Values(::libvpx_test::kRealTime));
+
+}  // namespace
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 489c419..5336f2f 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -36,16 +36,10 @@
 #include <windows.h>
 #include <winnt.h>
 
-namespace testing {
-namespace internal {
-
 inline bool operator==(const M128A& lhs, const M128A& rhs) {
   return (lhs.Low == rhs.Low && lhs.High == rhs.High);
 }
 
-}  // namespace internal
-}  // namespace testing
-
 namespace libvpx_test {
 
 // Compares the state of xmm[6-15] at construction with their state at
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 98b6f87..90f5452 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -7,6 +7,8 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdio.h>
+
 #include <climits>
 #include <vector>
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -90,34 +92,178 @@ struct FrameInfo {
   unsigned int h;
 };
 
-unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
-  if (frame < 10)
-    return val;
-  if (frame < 20)
-    return val / 2;
-  if (frame < 30)
-    return val * 2 / 3;
-  if (frame < 40)
-    return val / 4;
-  if (frame < 50)
-    return val * 7 / 8;
-  return val;
+void ScaleForFrameNumber(unsigned int frame,
+                         unsigned int initial_w,
+                         unsigned int initial_h,
+                         unsigned int *w,
+                         unsigned int *h,
+                         int flag_codec) {
+  if (frame < 10) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 20) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 30) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 40) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 50) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 60) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 70) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 80) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 90) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 100) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 110) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 120) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 130) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 140) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 150) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 160) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 170) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 180) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 190) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 200) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 210) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 220) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 230) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 240) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 250) {
+    *w = initial_w  / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 260) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  // Go down very low.
+  if (frame < 270) {
+    *w = initial_w / 4;
+    *h = initial_h / 4;
+    return;
+  }
+  if (flag_codec == 1) {
+    // Cases that only works for VP9.
+    // For VP9: Swap width and height of original.
+    if (frame < 320) {
+      *w = initial_h;
+      *h = initial_w;
+      return;
+    }
+  }
+  *w = initial_w;
+  *h = initial_h;
 }
 
 class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
  public:
   ResizingVideoSource() {
     SetSize(kInitialWidth, kInitialHeight);
-    limit_ = 60;
+    limit_ = 350;
   }
-
+  int flag_codec_;
   virtual ~ResizingVideoSource() {}
 
  protected:
   virtual void Next() {
     ++frame_;
-    SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
-            ScaleForFrameNumber(frame_, kInitialHeight));
+    unsigned int width;
+    unsigned int height;
+    ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
+                        flag_codec_);
+    SetSize(width, height);
     FillFrame();
   }
 };
@@ -144,15 +290,17 @@ class ResizeTest : public ::libvpx_test::EncoderTest,
 
 TEST_P(ResizeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
+  video.flag_codec_ = 0;
   cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     const unsigned int frame = static_cast<unsigned>(info->pts);
-    const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
-    const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
-
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 0);
     EXPECT_EQ(expected_w, info->w)
         << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
@@ -286,11 +434,11 @@ TEST_P(ResizeInternalTest, TestInternalResizeChangeConfig) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
+class ResizeRealtimeTest : public ::libvpx_test::EncoderTest,
   public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
-  ResizeInternalRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ResizeInternalRealtimeTest() {}
+  ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ResizeRealtimeTest() {}
 
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                   libvpx_test::Encoder *encoder) {
@@ -317,9 +465,18 @@ class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
   }
 
+  virtual void MismatchHook(const vpx_image_t *img1,
+                             const vpx_image_t *img2) {
+    double mismatch_psnr = compute_psnr(img1, img2);
+    mismatch_psnr_ += mismatch_psnr;
+    ++mismatch_nframes_;
+  }
+
+  unsigned int GetMismatchFrames() {
+      return mismatch_nframes_;
+  }
+
   void DefaultConfig() {
-    cfg_.g_w = 352;
-    cfg_.g_h = 288;
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 600;
     cfg_.rc_buf_sz = 1000;
@@ -344,16 +501,48 @@ class ResizeInternalRealtimeTest : public ::libvpx_test::EncoderTest,
   std::vector< FrameInfo > frame_info_list_;
   int set_cpu_used_;
   bool change_bitrate_;
+  double mismatch_psnr_;
+  int mismatch_nframes_;
 };
 
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
+  ResizingVideoSource video;
+  video.flag_codec_ = 1;
+  DefaultConfig();
+  // Disable internal resize for this test.
+  cfg_.rc_resize_allowed = 0;
+  change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+       info != frame_info_list_.end(); ++info) {
+    const unsigned int frame = static_cast<unsigned>(info->pts);
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 1);
+    EXPECT_EQ(expected_w, info->w)
+        << "Frame " << frame << " had unexpected width";
+    EXPECT_EQ(expected_h, info->h)
+        << "Frame " << frame << " had unexpected height";
+    EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+  }
+}
+
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Run at low bitrate, with resize_allowed = 1, and verify that we get
 // one resize down event.
-TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
+TEST_P(ResizeRealtimeTest, TestInternalResizeDown) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 299);
   DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
   change_bitrate_ = false;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
@@ -371,22 +560,31 @@ TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDown) {
     }
   }
 
+#if CONFIG_VP9_DECODER
   // Verify that we get 1 resize down event in this test.
   ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }
 
 // Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
 // Start at low target bitrate, raise the bitrate in the middle of the clip,
 // scaling-up should occur after bitrate changed.
-TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
+TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 299);
+                                       30, 1, 0, 359);
   DefaultConfig();
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
   change_bitrate_ = true;
+  mismatch_psnr_ = 0.0;
+  mismatch_nframes_ = 0;
   // Disable dropped frames.
   cfg_.rc_dropframe_thresh = 0;
   // Starting bitrate low.
-  cfg_.rc_target_bitrate = 100;
+  cfg_.rc_target_bitrate = 80;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   unsigned int last_w = cfg_.g_w;
@@ -410,8 +608,13 @@ TEST_P(ResizeInternalRealtimeTest, TestInternalResizeDownUpChangeBitRate) {
     }
   }
 
+#if CONFIG_VP9_DECODER
   // Verify that we get 2 resize events in this test.
-  ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+  ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+  EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+  printf("Warning: VP9 decoder unavailable, unable to check resize count!\n");
+#endif
 }
 
 vpx_img_fmt_t CspForFrameNumber(int frame) {
@@ -524,7 +727,7 @@ VP9_INSTANTIATE_TEST_CASE(ResizeTest,
                           ::testing::Values(::libvpx_test::kRealTime));
 VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,
                           ::testing::Values(::libvpx_test::kOnePassBest));
-VP9_INSTANTIATE_TEST_CASE(ResizeInternalRealtimeTest,
+VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
                           ::testing::Values(::libvpx_test::kRealTime),
                           ::testing::Range(5, 9));
 VP9_INSTANTIATE_TEST_CASE(ResizeCspTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e6a5e0b..e6bd0d7 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -484,260 +484,176 @@ using std::tr1::make_tuple;
 
 //------------------------------------------------------------------------------
 // C functions
-const SadMxNFunc sad64x64_c = vpx_sad64x64_c;
-const SadMxNFunc sad64x32_c = vpx_sad64x32_c;
-const SadMxNFunc sad32x64_c = vpx_sad32x64_c;
-const SadMxNFunc sad32x32_c = vpx_sad32x32_c;
-const SadMxNFunc sad32x16_c = vpx_sad32x16_c;
-const SadMxNFunc sad16x32_c = vpx_sad16x32_c;
-const SadMxNFunc sad16x16_c = vpx_sad16x16_c;
-const SadMxNFunc sad16x8_c = vpx_sad16x8_c;
-const SadMxNFunc sad8x16_c = vpx_sad8x16_c;
-const SadMxNFunc sad8x8_c = vpx_sad8x8_c;
-const SadMxNFunc sad8x4_c = vpx_sad8x4_c;
-const SadMxNFunc sad4x8_c = vpx_sad4x8_c;
-const SadMxNFunc sad4x4_c = vpx_sad4x4_c;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNFunc highbd_sad64x64_c = vpx_highbd_sad64x64_c;
-const SadMxNFunc highbd_sad64x32_c = vpx_highbd_sad64x32_c;
-const SadMxNFunc highbd_sad32x64_c = vpx_highbd_sad32x64_c;
-const SadMxNFunc highbd_sad32x32_c = vpx_highbd_sad32x32_c;
-const SadMxNFunc highbd_sad32x16_c = vpx_highbd_sad32x16_c;
-const SadMxNFunc highbd_sad16x32_c = vpx_highbd_sad16x32_c;
-const SadMxNFunc highbd_sad16x16_c = vpx_highbd_sad16x16_c;
-const SadMxNFunc highbd_sad16x8_c = vpx_highbd_sad16x8_c;
-const SadMxNFunc highbd_sad8x16_c = vpx_highbd_sad8x16_c;
-const SadMxNFunc highbd_sad8x8_c = vpx_highbd_sad8x8_c;
-const SadMxNFunc highbd_sad8x4_c = vpx_highbd_sad8x4_c;
-const SadMxNFunc highbd_sad4x8_c = vpx_highbd_sad4x8_c;
-const SadMxNFunc highbd_sad4x4_c = vpx_highbd_sad4x4_c;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNParam c_tests[] = {
-  make_tuple(64, 64, sad64x64_c, -1),
-  make_tuple(64, 32, sad64x32_c, -1),
-  make_tuple(32, 64, sad32x64_c, -1),
-  make_tuple(32, 32, sad32x32_c, -1),
-  make_tuple(32, 16, sad32x16_c, -1),
-  make_tuple(16, 32, sad16x32_c, -1),
-  make_tuple(16, 16, sad16x16_c, -1),
-  make_tuple(16, 8, sad16x8_c, -1),
-  make_tuple(8, 16, sad8x16_c, -1),
-  make_tuple(8, 8, sad8x8_c, -1),
-  make_tuple(8, 4, sad8x4_c, -1),
-  make_tuple(4, 8, sad4x8_c, -1),
-  make_tuple(4, 4, sad4x4_c, -1),
+  make_tuple(64, 64, &vpx_sad64x64_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64_c, 8),
-  make_tuple(64, 32, highbd_sad64x32_c, 8),
-  make_tuple(32, 64, highbd_sad32x64_c, 8),
-  make_tuple(32, 32, highbd_sad32x32_c, 8),
-  make_tuple(32, 16, highbd_sad32x16_c, 8),
-  make_tuple(16, 32, highbd_sad16x32_c, 8),
-  make_tuple(16, 16, highbd_sad16x16_c, 8),
-  make_tuple(16, 8, highbd_sad16x8_c, 8),
-  make_tuple(8, 16, highbd_sad8x16_c, 8),
-  make_tuple(8, 8, highbd_sad8x8_c, 8),
-  make_tuple(8, 4, highbd_sad8x4_c, 8),
-  make_tuple(4, 8, highbd_sad4x8_c, 8),
-  make_tuple(4, 4, highbd_sad4x4_c, 8),
-  make_tuple(64, 64, highbd_sad64x64_c, 10),
-  make_tuple(64, 32, highbd_sad64x32_c, 10),
-  make_tuple(32, 64, highbd_sad32x64_c, 10),
-  make_tuple(32, 32, highbd_sad32x32_c, 10),
-  make_tuple(32, 16, highbd_sad32x16_c, 10),
-  make_tuple(16, 32, highbd_sad16x32_c, 10),
-  make_tuple(16, 16, highbd_sad16x16_c, 10),
-  make_tuple(16, 8, highbd_sad16x8_c, 10),
-  make_tuple(8, 16, highbd_sad8x16_c, 10),
-  make_tuple(8, 8, highbd_sad8x8_c, 10),
-  make_tuple(8, 4, highbd_sad8x4_c, 10),
-  make_tuple(4, 8, highbd_sad4x8_c, 10),
-  make_tuple(4, 4, highbd_sad4x4_c, 10),
-  make_tuple(64, 64, highbd_sad64x64_c, 12),
-  make_tuple(64, 32, highbd_sad64x32_c, 12),
-  make_tuple(32, 64, highbd_sad32x64_c, 12),
-  make_tuple(32, 32, highbd_sad32x32_c, 12),
-  make_tuple(32, 16, highbd_sad32x16_c, 12),
-  make_tuple(16, 32, highbd_sad16x32_c, 12),
-  make_tuple(16, 16, highbd_sad16x16_c, 12),
-  make_tuple(16, 8, highbd_sad16x8_c, 12),
-  make_tuple(8, 16, highbd_sad8x16_c, 12),
-  make_tuple(8, 8, highbd_sad8x8_c, 12),
-  make_tuple(8, 4, highbd_sad8x4_c, 12),
-  make_tuple(4, 8, highbd_sad4x8_c, 12),
-  make_tuple(4, 4, highbd_sad4x4_c, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
-const SadMxNAvgFunc sad64x64_avg_c = vpx_sad64x64_avg_c;
-const SadMxNAvgFunc sad64x32_avg_c = vpx_sad64x32_avg_c;
-const SadMxNAvgFunc sad32x64_avg_c = vpx_sad32x64_avg_c;
-const SadMxNAvgFunc sad32x32_avg_c = vpx_sad32x32_avg_c;
-const SadMxNAvgFunc sad32x16_avg_c = vpx_sad32x16_avg_c;
-const SadMxNAvgFunc sad16x32_avg_c = vpx_sad16x32_avg_c;
-const SadMxNAvgFunc sad16x16_avg_c = vpx_sad16x16_avg_c;
-const SadMxNAvgFunc sad16x8_avg_c = vpx_sad16x8_avg_c;
-const SadMxNAvgFunc sad8x16_avg_c = vpx_sad8x16_avg_c;
-const SadMxNAvgFunc sad8x8_avg_c = vpx_sad8x8_avg_c;
-const SadMxNAvgFunc sad8x4_avg_c = vpx_sad8x4_avg_c;
-const SadMxNAvgFunc sad4x8_avg_c = vpx_sad4x8_avg_c;
-const SadMxNAvgFunc sad4x4_avg_c = vpx_sad4x4_avg_c;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNAvgFunc highbd_sad64x64_avg_c = vpx_highbd_sad64x64_avg_c;
-const SadMxNAvgFunc highbd_sad64x32_avg_c = vpx_highbd_sad64x32_avg_c;
-const SadMxNAvgFunc highbd_sad32x64_avg_c = vpx_highbd_sad32x64_avg_c;
-const SadMxNAvgFunc highbd_sad32x32_avg_c = vpx_highbd_sad32x32_avg_c;
-const SadMxNAvgFunc highbd_sad32x16_avg_c = vpx_highbd_sad32x16_avg_c;
-const SadMxNAvgFunc highbd_sad16x32_avg_c = vpx_highbd_sad16x32_avg_c;
-const SadMxNAvgFunc highbd_sad16x16_avg_c = vpx_highbd_sad16x16_avg_c;
-const SadMxNAvgFunc highbd_sad16x8_avg_c = vpx_highbd_sad16x8_avg_c;
-const SadMxNAvgFunc highbd_sad8x16_avg_c = vpx_highbd_sad8x16_avg_c;
-const SadMxNAvgFunc highbd_sad8x8_avg_c = vpx_highbd_sad8x8_avg_c;
-const SadMxNAvgFunc highbd_sad8x4_avg_c = vpx_highbd_sad8x4_avg_c;
-const SadMxNAvgFunc highbd_sad4x8_avg_c = vpx_highbd_sad4x8_avg_c;
-const SadMxNAvgFunc highbd_sad4x4_avg_c = vpx_highbd_sad4x4_avg_c;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNAvgParam avg_c_tests[] = {
-  make_tuple(64, 64, sad64x64_avg_c, -1),
-  make_tuple(64, 32, sad64x32_avg_c, -1),
-  make_tuple(32, 64, sad32x64_avg_c, -1),
-  make_tuple(32, 32, sad32x32_avg_c, -1),
-  make_tuple(32, 16, sad32x16_avg_c, -1),
-  make_tuple(16, 32, sad16x32_avg_c, -1),
-  make_tuple(16, 16, sad16x16_avg_c, -1),
-  make_tuple(16, 8, sad16x8_avg_c, -1),
-  make_tuple(8, 16, sad8x16_avg_c, -1),
-  make_tuple(8, 8, sad8x8_avg_c, -1),
-  make_tuple(8, 4, sad8x4_avg_c, -1),
-  make_tuple(4, 8, sad4x8_avg_c, -1),
-  make_tuple(4, 4, sad4x4_avg_c, -1),
+  make_tuple(64, 64, &vpx_sad64x64_avg_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64_avg_c, 8),
-  make_tuple(64, 32, highbd_sad64x32_avg_c, 8),
-  make_tuple(32, 64, highbd_sad32x64_avg_c, 8),
-  make_tuple(32, 32, highbd_sad32x32_avg_c, 8),
-  make_tuple(32, 16, highbd_sad32x16_avg_c, 8),
-  make_tuple(16, 32, highbd_sad16x32_avg_c, 8),
-  make_tuple(16, 16, highbd_sad16x16_avg_c, 8),
-  make_tuple(16, 8, highbd_sad16x8_avg_c, 8),
-  make_tuple(8, 16, highbd_sad8x16_avg_c, 8),
-  make_tuple(8, 8, highbd_sad8x8_avg_c, 8),
-  make_tuple(8, 4, highbd_sad8x4_avg_c, 8),
-  make_tuple(4, 8, highbd_sad4x8_avg_c, 8),
-  make_tuple(4, 4, highbd_sad4x4_avg_c, 8),
-  make_tuple(64, 64, highbd_sad64x64_avg_c, 10),
-  make_tuple(64, 32, highbd_sad64x32_avg_c, 10),
-  make_tuple(32, 64, highbd_sad32x64_avg_c, 10),
-  make_tuple(32, 32, highbd_sad32x32_avg_c, 10),
-  make_tuple(32, 16, highbd_sad32x16_avg_c, 10),
-  make_tuple(16, 32, highbd_sad16x32_avg_c, 10),
-  make_tuple(16, 16, highbd_sad16x16_avg_c, 10),
-  make_tuple(16, 8, highbd_sad16x8_avg_c, 10),
-  make_tuple(8, 16, highbd_sad8x16_avg_c, 10),
-  make_tuple(8, 8, highbd_sad8x8_avg_c, 10),
-  make_tuple(8, 4, highbd_sad8x4_avg_c, 10),
-  make_tuple(4, 8, highbd_sad4x8_avg_c, 10),
-  make_tuple(4, 4, highbd_sad4x4_avg_c, 10),
-  make_tuple(64, 64, highbd_sad64x64_avg_c, 12),
-  make_tuple(64, 32, highbd_sad64x32_avg_c, 12),
-  make_tuple(32, 64, highbd_sad32x64_avg_c, 12),
-  make_tuple(32, 32, highbd_sad32x32_avg_c, 12),
-  make_tuple(32, 16, highbd_sad32x16_avg_c, 12),
-  make_tuple(16, 32, highbd_sad16x32_avg_c, 12),
-  make_tuple(16, 16, highbd_sad16x16_avg_c, 12),
-  make_tuple(16, 8, highbd_sad16x8_avg_c, 12),
-  make_tuple(8, 16, highbd_sad8x16_avg_c, 12),
-  make_tuple(8, 8, highbd_sad8x8_avg_c, 12),
-  make_tuple(8, 4, highbd_sad8x4_avg_c, 12),
-  make_tuple(4, 8, highbd_sad4x8_avg_c, 12),
-  make_tuple(4, 4, highbd_sad4x4_avg_c, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
-const SadMxNx4Func sad64x64x4d_c = vpx_sad64x64x4d_c;
-const SadMxNx4Func sad64x32x4d_c = vpx_sad64x32x4d_c;
-const SadMxNx4Func sad32x64x4d_c = vpx_sad32x64x4d_c;
-const SadMxNx4Func sad32x32x4d_c = vpx_sad32x32x4d_c;
-const SadMxNx4Func sad32x16x4d_c = vpx_sad32x16x4d_c;
-const SadMxNx4Func sad16x32x4d_c = vpx_sad16x32x4d_c;
-const SadMxNx4Func sad16x16x4d_c = vpx_sad16x16x4d_c;
-const SadMxNx4Func sad16x8x4d_c = vpx_sad16x8x4d_c;
-const SadMxNx4Func sad8x16x4d_c = vpx_sad8x16x4d_c;
-const SadMxNx4Func sad8x8x4d_c = vpx_sad8x8x4d_c;
-const SadMxNx4Func sad8x4x4d_c = vpx_sad8x4x4d_c;
-const SadMxNx4Func sad4x8x4d_c = vpx_sad4x8x4d_c;
-const SadMxNx4Func sad4x4x4d_c = vpx_sad4x4x4d_c;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNx4Func highbd_sad64x64x4d_c = vpx_highbd_sad64x64x4d_c;
-const SadMxNx4Func highbd_sad64x32x4d_c = vpx_highbd_sad64x32x4d_c;
-const SadMxNx4Func highbd_sad32x64x4d_c = vpx_highbd_sad32x64x4d_c;
-const SadMxNx4Func highbd_sad32x32x4d_c = vpx_highbd_sad32x32x4d_c;
-const SadMxNx4Func highbd_sad32x16x4d_c = vpx_highbd_sad32x16x4d_c;
-const SadMxNx4Func highbd_sad16x32x4d_c = vpx_highbd_sad16x32x4d_c;
-const SadMxNx4Func highbd_sad16x16x4d_c = vpx_highbd_sad16x16x4d_c;
-const SadMxNx4Func highbd_sad16x8x4d_c = vpx_highbd_sad16x8x4d_c;
-const SadMxNx4Func highbd_sad8x16x4d_c = vpx_highbd_sad8x16x4d_c;
-const SadMxNx4Func highbd_sad8x8x4d_c = vpx_highbd_sad8x8x4d_c;
-const SadMxNx4Func highbd_sad8x4x4d_c = vpx_highbd_sad8x4x4d_c;
-const SadMxNx4Func highbd_sad4x8x4d_c = vpx_highbd_sad4x8x4d_c;
-const SadMxNx4Func highbd_sad4x4x4d_c = vpx_highbd_sad4x4x4d_c;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNx4Param x4d_c_tests[] = {
-  make_tuple(64, 64, sad64x64x4d_c, -1),
-  make_tuple(64, 32, sad64x32x4d_c, -1),
-  make_tuple(32, 64, sad32x64x4d_c, -1),
-  make_tuple(32, 32, sad32x32x4d_c, -1),
-  make_tuple(32, 16, sad32x16x4d_c, -1),
-  make_tuple(16, 32, sad16x32x4d_c, -1),
-  make_tuple(16, 16, sad16x16x4d_c, -1),
-  make_tuple(16, 8, sad16x8x4d_c, -1),
-  make_tuple(8, 16, sad8x16x4d_c, -1),
-  make_tuple(8, 8, sad8x8x4d_c, -1),
-  make_tuple(8, 4, sad8x4x4d_c, -1),
-  make_tuple(4, 8, sad4x8x4d_c, -1),
-  make_tuple(4, 4, sad4x4x4d_c, -1),
+  make_tuple(64, 64, &vpx_sad64x64x4d_c, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_c, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_c, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_c, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_c, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_c, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_c, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_c, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_c, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_c, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_c, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_c, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64x4d_c, 8),
-  make_tuple(64, 32, highbd_sad64x32x4d_c, 8),
-  make_tuple(32, 64, highbd_sad32x64x4d_c, 8),
-  make_tuple(32, 32, highbd_sad32x32x4d_c, 8),
-  make_tuple(32, 16, highbd_sad32x16x4d_c, 8),
-  make_tuple(16, 32, highbd_sad16x32x4d_c, 8),
-  make_tuple(16, 16, highbd_sad16x16x4d_c, 8),
-  make_tuple(16, 8, highbd_sad16x8x4d_c, 8),
-  make_tuple(8, 16, highbd_sad8x16x4d_c, 8),
-  make_tuple(8, 8, highbd_sad8x8x4d_c, 8),
-  make_tuple(8, 4, highbd_sad8x4x4d_c, 8),
-  make_tuple(4, 8, highbd_sad4x8x4d_c, 8),
-  make_tuple(4, 4, highbd_sad4x4x4d_c, 8),
-  make_tuple(64, 64, highbd_sad64x64x4d_c, 10),
-  make_tuple(64, 32, highbd_sad64x32x4d_c, 10),
-  make_tuple(32, 64, highbd_sad32x64x4d_c, 10),
-  make_tuple(32, 32, highbd_sad32x32x4d_c, 10),
-  make_tuple(32, 16, highbd_sad32x16x4d_c, 10),
-  make_tuple(16, 32, highbd_sad16x32x4d_c, 10),
-  make_tuple(16, 16, highbd_sad16x16x4d_c, 10),
-  make_tuple(16, 8, highbd_sad16x8x4d_c, 10),
-  make_tuple(8, 16, highbd_sad8x16x4d_c, 10),
-  make_tuple(8, 8, highbd_sad8x8x4d_c, 10),
-  make_tuple(8, 4, highbd_sad8x4x4d_c, 10),
-  make_tuple(4, 8, highbd_sad4x8x4d_c, 10),
-  make_tuple(4, 4, highbd_sad4x4x4d_c, 10),
-  make_tuple(64, 64, highbd_sad64x64x4d_c, 12),
-  make_tuple(64, 32, highbd_sad64x32x4d_c, 12),
-  make_tuple(32, 64, highbd_sad32x64x4d_c, 12),
-  make_tuple(32, 32, highbd_sad32x32x4d_c, 12),
-  make_tuple(32, 16, highbd_sad32x16x4d_c, 12),
-  make_tuple(16, 32, highbd_sad16x32x4d_c, 12),
-  make_tuple(16, 16, highbd_sad16x16x4d_c, 12),
-  make_tuple(16, 8, highbd_sad16x8x4d_c, 12),
-  make_tuple(8, 16, highbd_sad8x16x4d_c, 12),
-  make_tuple(8, 8, highbd_sad8x8x4d_c, 12),
-  make_tuple(8, 4, highbd_sad8x4x4d_c, 12),
-  make_tuple(4, 8, highbd_sad4x8x4d_c, 12),
-  make_tuple(4, 4, highbd_sad4x4x4d_c, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_c, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_c, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_c, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_c, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_c, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_c, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_c, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
@@ -745,318 +661,194 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_MEDIA
-const SadMxNFunc sad16x16_media = vpx_sad16x16_media;
 const SadMxNParam media_tests[] = {
-  make_tuple(16, 16, sad16x16_media, -1),
+  make_tuple(16, 16, &vpx_sad16x16_media, -1),
 };
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests));
 #endif  // HAVE_MEDIA
 
 #if HAVE_NEON
-const SadMxNFunc sad64x64_neon = vpx_sad64x64_neon;
-const SadMxNFunc sad32x32_neon = vpx_sad32x32_neon;
-const SadMxNFunc sad16x16_neon = vpx_sad16x16_neon;
-const SadMxNFunc sad16x8_neon = vpx_sad16x8_neon;
-const SadMxNFunc sad8x16_neon = vpx_sad8x16_neon;
-const SadMxNFunc sad8x8_neon = vpx_sad8x8_neon;
-const SadMxNFunc sad4x4_neon = vpx_sad4x4_neon;
-
 const SadMxNParam neon_tests[] = {
-  make_tuple(64, 64, sad64x64_neon, -1),
-  make_tuple(32, 32, sad32x32_neon, -1),
-  make_tuple(16, 16, sad16x16_neon, -1),
-  make_tuple(16, 8, sad16x8_neon, -1),
-  make_tuple(8, 16, sad8x16_neon, -1),
-  make_tuple(8, 8, sad8x8_neon, -1),
-  make_tuple(4, 4, sad4x4_neon, -1),
+  make_tuple(64, 64, &vpx_sad64x64_neon, -1),
+  make_tuple(32, 32, &vpx_sad32x32_neon, -1),
+  make_tuple(16, 16, &vpx_sad16x16_neon, -1),
+  make_tuple(16, 8, &vpx_sad16x8_neon, -1),
+  make_tuple(8, 16, &vpx_sad8x16_neon, -1),
+  make_tuple(8, 8, &vpx_sad8x8_neon, -1),
+  make_tuple(4, 4, &vpx_sad4x4_neon, -1),
 };
 INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
 
-const SadMxNx4Func sad64x64x4d_neon = vpx_sad64x64x4d_neon;
-const SadMxNx4Func sad32x32x4d_neon = vpx_sad32x32x4d_neon;
-const SadMxNx4Func sad16x16x4d_neon = vpx_sad16x16x4d_neon;
 const SadMxNx4Param x4d_neon_tests[] = {
-  make_tuple(64, 64, sad64x64x4d_neon, -1),
-  make_tuple(32, 32, sad32x32x4d_neon, -1),
-  make_tuple(16, 16, sad16x16x4d_neon, -1),
+  make_tuple(64, 64, &vpx_sad64x64x4d_neon, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_neon, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_neon, -1),
 };
 INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
 #endif  // HAVE_NEON
 
 //------------------------------------------------------------------------------
 // x86 functions
-#if HAVE_MMX
-const SadMxNFunc sad16x16_mmx = vpx_sad16x16_mmx;
-const SadMxNFunc sad16x8_mmx = vpx_sad16x8_mmx;
-const SadMxNFunc sad8x16_mmx = vpx_sad8x16_mmx;
-const SadMxNFunc sad8x8_mmx = vpx_sad8x8_mmx;
-const SadMxNFunc sad4x4_mmx = vpx_sad4x4_mmx;
-const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, sad16x16_mmx, -1),
-  make_tuple(16, 8, sad16x8_mmx, -1),
-  make_tuple(8, 16, sad8x16_mmx, -1),
-  make_tuple(8, 8, sad8x8_mmx, -1),
-  make_tuple(4, 4, sad4x4_mmx, -1),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // HAVE_MMX
-
-#if HAVE_SSE
-#if CONFIG_USE_X86INC
-const SadMxNFunc sad4x8_sse = vpx_sad4x8_sse;
-const SadMxNFunc sad4x4_sse = vpx_sad4x4_sse;
-const SadMxNParam sse_tests[] = {
-  make_tuple(4, 8, sad4x8_sse, -1),
-  make_tuple(4, 4, sad4x4_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::ValuesIn(sse_tests));
-
-const SadMxNAvgFunc sad4x8_avg_sse = vpx_sad4x8_avg_sse;
-const SadMxNAvgFunc sad4x4_avg_sse = vpx_sad4x4_avg_sse;
-const SadMxNAvgParam avg_sse_tests[] = {
-  make_tuple(4, 8, sad4x8_avg_sse, -1),
-  make_tuple(4, 4, sad4x4_avg_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADavgTest, ::testing::ValuesIn(avg_sse_tests));
-
-const SadMxNx4Func sad4x8x4d_sse = vpx_sad4x8x4d_sse;
-const SadMxNx4Func sad4x4x4d_sse = vpx_sad4x4x4d_sse;
-const SadMxNx4Param x4d_sse_tests[] = {
-  make_tuple(4, 8, sad4x8x4d_sse, -1),
-  make_tuple(4, 4, sad4x4x4d_sse, -1),
-};
-INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::ValuesIn(x4d_sse_tests));
-#endif  // CONFIG_USE_X86INC
-#endif  // HAVE_SSE
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
-const SadMxNFunc sad64x64_sse2 = vpx_sad64x64_sse2;
-const SadMxNFunc sad64x32_sse2 = vpx_sad64x32_sse2;
-const SadMxNFunc sad32x64_sse2 = vpx_sad32x64_sse2;
-const SadMxNFunc sad32x32_sse2 = vpx_sad32x32_sse2;
-const SadMxNFunc sad32x16_sse2 = vpx_sad32x16_sse2;
-const SadMxNFunc sad16x32_sse2 = vpx_sad16x32_sse2;
-const SadMxNFunc sad16x16_sse2 = vpx_sad16x16_sse2;
-const SadMxNFunc sad16x8_sse2 = vpx_sad16x8_sse2;
-const SadMxNFunc sad8x16_sse2 = vpx_sad8x16_sse2;
-const SadMxNFunc sad8x8_sse2 = vpx_sad8x8_sse2;
-const SadMxNFunc sad8x4_sse2 = vpx_sad8x4_sse2;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNFunc highbd_sad64x64_sse2 = vpx_highbd_sad64x64_sse2;
-const SadMxNFunc highbd_sad64x32_sse2 = vpx_highbd_sad64x32_sse2;
-const SadMxNFunc highbd_sad32x64_sse2 = vpx_highbd_sad32x64_sse2;
-const SadMxNFunc highbd_sad32x32_sse2 = vpx_highbd_sad32x32_sse2;
-const SadMxNFunc highbd_sad32x16_sse2 = vpx_highbd_sad32x16_sse2;
-const SadMxNFunc highbd_sad16x32_sse2 = vpx_highbd_sad16x32_sse2;
-const SadMxNFunc highbd_sad16x16_sse2 = vpx_highbd_sad16x16_sse2;
-const SadMxNFunc highbd_sad16x8_sse2 = vpx_highbd_sad16x8_sse2;
-const SadMxNFunc highbd_sad8x16_sse2 = vpx_highbd_sad8x16_sse2;
-const SadMxNFunc highbd_sad8x8_sse2 = vpx_highbd_sad8x8_sse2;
-const SadMxNFunc highbd_sad8x4_sse2 = vpx_highbd_sad8x4_sse2;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNParam sse2_tests[] = {
-  make_tuple(64, 64, sad64x64_sse2, -1),
-  make_tuple(64, 32, sad64x32_sse2, -1),
-  make_tuple(32, 64, sad32x64_sse2, -1),
-  make_tuple(32, 32, sad32x32_sse2, -1),
-  make_tuple(32, 16, sad32x16_sse2, -1),
-  make_tuple(16, 32, sad16x32_sse2, -1),
-  make_tuple(16, 16, sad16x16_sse2, -1),
-  make_tuple(16, 8, sad16x8_sse2, -1),
-  make_tuple(8, 16, sad8x16_sse2, -1),
-  make_tuple(8, 8, sad8x8_sse2, -1),
-  make_tuple(8, 4, sad8x4_sse2, -1),
+  make_tuple(64, 64, &vpx_sad64x64_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64_sse2, 8),
-  make_tuple(64, 32, highbd_sad64x32_sse2, 8),
-  make_tuple(32, 64, highbd_sad32x64_sse2, 8),
-  make_tuple(32, 32, highbd_sad32x32_sse2, 8),
-  make_tuple(32, 16, highbd_sad32x16_sse2, 8),
-  make_tuple(16, 32, highbd_sad16x32_sse2, 8),
-  make_tuple(16, 16, highbd_sad16x16_sse2, 8),
-  make_tuple(16, 8, highbd_sad16x8_sse2, 8),
-  make_tuple(8, 16, highbd_sad8x16_sse2, 8),
-  make_tuple(8, 8, highbd_sad8x8_sse2, 8),
-  make_tuple(8, 4, highbd_sad8x4_sse2, 8),
-  make_tuple(64, 64, highbd_sad64x64_sse2, 10),
-  make_tuple(64, 32, highbd_sad64x32_sse2, 10),
-  make_tuple(32, 64, highbd_sad32x64_sse2, 10),
-  make_tuple(32, 32, highbd_sad32x32_sse2, 10),
-  make_tuple(32, 16, highbd_sad32x16_sse2, 10),
-  make_tuple(16, 32, highbd_sad16x32_sse2, 10),
-  make_tuple(16, 16, highbd_sad16x16_sse2, 10),
-  make_tuple(16, 8, highbd_sad16x8_sse2, 10),
-  make_tuple(8, 16, highbd_sad8x16_sse2, 10),
-  make_tuple(8, 8, highbd_sad8x8_sse2, 10),
-  make_tuple(8, 4, highbd_sad8x4_sse2, 10),
-  make_tuple(64, 64, highbd_sad64x64_sse2, 12),
-  make_tuple(64, 32, highbd_sad64x32_sse2, 12),
-  make_tuple(32, 64, highbd_sad32x64_sse2, 12),
-  make_tuple(32, 32, highbd_sad32x32_sse2, 12),
-  make_tuple(32, 16, highbd_sad32x16_sse2, 12),
-  make_tuple(16, 32, highbd_sad16x32_sse2, 12),
-  make_tuple(16, 16, highbd_sad16x16_sse2, 12),
-  make_tuple(16, 8, highbd_sad16x8_sse2, 12),
-  make_tuple(8, 16, highbd_sad8x16_sse2, 12),
-  make_tuple(8, 8, highbd_sad8x8_sse2, 12),
-  make_tuple(8, 4, highbd_sad8x4_sse2, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
-const SadMxNAvgFunc sad64x64_avg_sse2 = vpx_sad64x64_avg_sse2;
-const SadMxNAvgFunc sad64x32_avg_sse2 = vpx_sad64x32_avg_sse2;
-const SadMxNAvgFunc sad32x64_avg_sse2 = vpx_sad32x64_avg_sse2;
-const SadMxNAvgFunc sad32x32_avg_sse2 = vpx_sad32x32_avg_sse2;
-const SadMxNAvgFunc sad32x16_avg_sse2 = vpx_sad32x16_avg_sse2;
-const SadMxNAvgFunc sad16x32_avg_sse2 = vpx_sad16x32_avg_sse2;
-const SadMxNAvgFunc sad16x16_avg_sse2 = vpx_sad16x16_avg_sse2;
-const SadMxNAvgFunc sad16x8_avg_sse2 = vpx_sad16x8_avg_sse2;
-const SadMxNAvgFunc sad8x16_avg_sse2 = vpx_sad8x16_avg_sse2;
-const SadMxNAvgFunc sad8x8_avg_sse2 = vpx_sad8x8_avg_sse2;
-const SadMxNAvgFunc sad8x4_avg_sse2 = vpx_sad8x4_avg_sse2;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNAvgFunc highbd_sad64x64_avg_sse2 = vpx_highbd_sad64x64_avg_sse2;
-const SadMxNAvgFunc highbd_sad64x32_avg_sse2 = vpx_highbd_sad64x32_avg_sse2;
-const SadMxNAvgFunc highbd_sad32x64_avg_sse2 = vpx_highbd_sad32x64_avg_sse2;
-const SadMxNAvgFunc highbd_sad32x32_avg_sse2 = vpx_highbd_sad32x32_avg_sse2;
-const SadMxNAvgFunc highbd_sad32x16_avg_sse2 = vpx_highbd_sad32x16_avg_sse2;
-const SadMxNAvgFunc highbd_sad16x32_avg_sse2 = vpx_highbd_sad16x32_avg_sse2;
-const SadMxNAvgFunc highbd_sad16x16_avg_sse2 = vpx_highbd_sad16x16_avg_sse2;
-const SadMxNAvgFunc highbd_sad16x8_avg_sse2 = vpx_highbd_sad16x8_avg_sse2;
-const SadMxNAvgFunc highbd_sad8x16_avg_sse2 = vpx_highbd_sad8x16_avg_sse2;
-const SadMxNAvgFunc highbd_sad8x8_avg_sse2 = vpx_highbd_sad8x8_avg_sse2;
-const SadMxNAvgFunc highbd_sad8x4_avg_sse2 = vpx_highbd_sad8x4_avg_sse2;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNAvgParam avg_sse2_tests[] = {
-  make_tuple(64, 64, sad64x64_avg_sse2, -1),
-  make_tuple(64, 32, sad64x32_avg_sse2, -1),
-  make_tuple(32, 64, sad32x64_avg_sse2, -1),
-  make_tuple(32, 32, sad32x32_avg_sse2, -1),
-  make_tuple(32, 16, sad32x16_avg_sse2, -1),
-  make_tuple(16, 32, sad16x32_avg_sse2, -1),
-  make_tuple(16, 16, sad16x16_avg_sse2, -1),
-  make_tuple(16, 8, sad16x8_avg_sse2, -1),
-  make_tuple(8, 16, sad8x16_avg_sse2, -1),
-  make_tuple(8, 8, sad8x8_avg_sse2, -1),
-  make_tuple(8, 4, sad8x4_avg_sse2, -1),
+  make_tuple(64, 64, &vpx_sad64x64_avg_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 8),
-  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 8),
-  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 8),
-  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 8),
-  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 8),
-  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 8),
-  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 8),
-  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 8),
-  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 8),
-  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 8),
-  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 8),
-  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 10),
-  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 10),
-  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 10),
-  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 10),
-  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 10),
-  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 10),
-  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 10),
-  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 10),
-  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 10),
-  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 10),
-  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 10),
-  make_tuple(64, 64, highbd_sad64x64_avg_sse2, 12),
-  make_tuple(64, 32, highbd_sad64x32_avg_sse2, 12),
-  make_tuple(32, 64, highbd_sad32x64_avg_sse2, 12),
-  make_tuple(32, 32, highbd_sad32x32_avg_sse2, 12),
-  make_tuple(32, 16, highbd_sad32x16_avg_sse2, 12),
-  make_tuple(16, 32, highbd_sad16x32_avg_sse2, 12),
-  make_tuple(16, 16, highbd_sad16x16_avg_sse2, 12),
-  make_tuple(16, 8, highbd_sad16x8_avg_sse2, 12),
-  make_tuple(8, 16, highbd_sad8x16_avg_sse2, 12),
-  make_tuple(8, 8, highbd_sad8x8_avg_sse2, 12),
-  make_tuple(8, 4, highbd_sad8x4_avg_sse2, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64_avg_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32_avg_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64_avg_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32_avg_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16_avg_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32_avg_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16_avg_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8_avg_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16_avg_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8_avg_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
-const SadMxNx4Func sad64x64x4d_sse2 = vpx_sad64x64x4d_sse2;
-const SadMxNx4Func sad64x32x4d_sse2 = vpx_sad64x32x4d_sse2;
-const SadMxNx4Func sad32x64x4d_sse2 = vpx_sad32x64x4d_sse2;
-const SadMxNx4Func sad32x32x4d_sse2 = vpx_sad32x32x4d_sse2;
-const SadMxNx4Func sad32x16x4d_sse2 = vpx_sad32x16x4d_sse2;
-const SadMxNx4Func sad16x32x4d_sse2 = vpx_sad16x32x4d_sse2;
-const SadMxNx4Func sad16x16x4d_sse2 = vpx_sad16x16x4d_sse2;
-const SadMxNx4Func sad16x8x4d_sse2 = vpx_sad16x8x4d_sse2;
-const SadMxNx4Func sad8x16x4d_sse2 = vpx_sad8x16x4d_sse2;
-const SadMxNx4Func sad8x8x4d_sse2 = vpx_sad8x8x4d_sse2;
-const SadMxNx4Func sad8x4x4d_sse2 = vpx_sad8x4x4d_sse2;
-#if CONFIG_VP9_HIGHBITDEPTH
-const SadMxNx4Func highbd_sad64x64x4d_sse2 = vpx_highbd_sad64x64x4d_sse2;
-const SadMxNx4Func highbd_sad64x32x4d_sse2 = vpx_highbd_sad64x32x4d_sse2;
-const SadMxNx4Func highbd_sad32x64x4d_sse2 = vpx_highbd_sad32x64x4d_sse2;
-const SadMxNx4Func highbd_sad32x32x4d_sse2 = vpx_highbd_sad32x32x4d_sse2;
-const SadMxNx4Func highbd_sad32x16x4d_sse2 = vpx_highbd_sad32x16x4d_sse2;
-const SadMxNx4Func highbd_sad16x32x4d_sse2 = vpx_highbd_sad16x32x4d_sse2;
-const SadMxNx4Func highbd_sad16x16x4d_sse2 = vpx_highbd_sad16x16x4d_sse2;
-const SadMxNx4Func highbd_sad16x8x4d_sse2 = vpx_highbd_sad16x8x4d_sse2;
-const SadMxNx4Func highbd_sad8x16x4d_sse2 = vpx_highbd_sad8x16x4d_sse2;
-const SadMxNx4Func highbd_sad8x8x4d_sse2 = vpx_highbd_sad8x8x4d_sse2;
-const SadMxNx4Func highbd_sad8x4x4d_sse2 = vpx_highbd_sad8x4x4d_sse2;
-const SadMxNx4Func highbd_sad4x8x4d_sse2 = vpx_highbd_sad4x8x4d_sse2;
-const SadMxNx4Func highbd_sad4x4x4d_sse2 = vpx_highbd_sad4x4x4d_sse2;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const SadMxNx4Param x4d_sse2_tests[] = {
-  make_tuple(64, 64, sad64x64x4d_sse2, -1),
-  make_tuple(64, 32, sad64x32x4d_sse2, -1),
-  make_tuple(32, 64, sad32x64x4d_sse2, -1),
-  make_tuple(32, 32, sad32x32x4d_sse2, -1),
-  make_tuple(32, 16, sad32x16x4d_sse2, -1),
-  make_tuple(16, 32, sad16x32x4d_sse2, -1),
-  make_tuple(16, 16, sad16x16x4d_sse2, -1),
-  make_tuple(16, 8, sad16x8x4d_sse2, -1),
-  make_tuple(8, 16, sad8x16x4d_sse2, -1),
-  make_tuple(8, 8, sad8x8x4d_sse2, -1),
-  make_tuple(8, 4, sad8x4x4d_sse2, -1),
+  make_tuple(64, 64, &vpx_sad64x64x4d_sse2, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_sse2, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_sse2, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_sse2, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_sse2, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_sse2, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_sse2, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_sse2, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_sse2, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_sse2, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_sse2, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_sse2, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_sse2, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
-  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 8),
-  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 8),
-  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 8),
-  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 8),
-  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 8),
-  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 8),
-  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 8),
-  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 8),
-  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 8),
-  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 8),
-  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 8),
-  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 8),
-  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 8),
-  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 10),
-  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 10),
-  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 10),
-  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 10),
-  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 10),
-  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 10),
-  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 10),
-  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 10),
-  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 10),
-  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 10),
-  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 10),
-  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 10),
-  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 10),
-  make_tuple(64, 64, highbd_sad64x64x4d_sse2, 12),
-  make_tuple(64, 32, highbd_sad64x32x4d_sse2, 12),
-  make_tuple(32, 64, highbd_sad32x64x4d_sse2, 12),
-  make_tuple(32, 32, highbd_sad32x32x4d_sse2, 12),
-  make_tuple(32, 16, highbd_sad32x16x4d_sse2, 12),
-  make_tuple(16, 32, highbd_sad16x32x4d_sse2, 12),
-  make_tuple(16, 16, highbd_sad16x16x4d_sse2, 12),
-  make_tuple(16, 8, highbd_sad16x8x4d_sse2, 12),
-  make_tuple(8, 16, highbd_sad8x16x4d_sse2, 12),
-  make_tuple(8, 8, highbd_sad8x8x4d_sse2, 12),
-  make_tuple(8, 4, highbd_sad8x4x4d_sse2, 12),
-  make_tuple(4, 8, highbd_sad4x8x4d_sse2, 12),
-  make_tuple(4, 4, highbd_sad4x4x4d_sse2, 12),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 8),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 8),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 8),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 8),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 8),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 8),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 8),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 8),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 8),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 8),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 8),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 8),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 8),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 10),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 10),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 10),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 10),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 10),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 10),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 10),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 10),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 10),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 10),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 10),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 10),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 10),
+  make_tuple(64, 64, &vpx_highbd_sad64x64x4d_sse2, 12),
+  make_tuple(64, 32, &vpx_highbd_sad64x32x4d_sse2, 12),
+  make_tuple(32, 64, &vpx_highbd_sad32x64x4d_sse2, 12),
+  make_tuple(32, 32, &vpx_highbd_sad32x32x4d_sse2, 12),
+  make_tuple(32, 16, &vpx_highbd_sad32x16x4d_sse2, 12),
+  make_tuple(16, 32, &vpx_highbd_sad16x32x4d_sse2, 12),
+  make_tuple(16, 16, &vpx_highbd_sad16x16x4d_sse2, 12),
+  make_tuple(16, 8, &vpx_highbd_sad16x8x4d_sse2, 12),
+  make_tuple(8, 16, &vpx_highbd_sad8x16x4d_sse2, 12),
+  make_tuple(8, 8, &vpx_highbd_sad8x8x4d_sse2, 12),
+  make_tuple(8, 4, &vpx_highbd_sad8x4x4d_sse2, 12),
+  make_tuple(4, 8, &vpx_highbd_sad4x8x4d_sse2, 12),
+  make_tuple(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12),
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
 INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
@@ -1076,39 +868,27 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
 #endif  // HAVE_SSE4_1
 
 #if HAVE_AVX2
-const SadMxNFunc sad64x64_avx2 = vpx_sad64x64_avx2;
-const SadMxNFunc sad64x32_avx2 = vpx_sad64x32_avx2;
-const SadMxNFunc sad32x64_avx2 = vpx_sad32x64_avx2;
-const SadMxNFunc sad32x32_avx2 = vpx_sad32x32_avx2;
-const SadMxNFunc sad32x16_avx2 = vpx_sad32x16_avx2;
 const SadMxNParam avx2_tests[] = {
-  make_tuple(64, 64, sad64x64_avx2, -1),
-  make_tuple(64, 32, sad64x32_avx2, -1),
-  make_tuple(32, 64, sad32x64_avx2, -1),
-  make_tuple(32, 32, sad32x32_avx2, -1),
-  make_tuple(32, 16, sad32x16_avx2, -1),
+  make_tuple(64, 64, &vpx_sad64x64_avx2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avx2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avx2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avx2, -1),
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
 
-const SadMxNAvgFunc sad64x64_avg_avx2 = vpx_sad64x64_avg_avx2;
-const SadMxNAvgFunc sad64x32_avg_avx2 = vpx_sad64x32_avg_avx2;
-const SadMxNAvgFunc sad32x64_avg_avx2 = vpx_sad32x64_avg_avx2;
-const SadMxNAvgFunc sad32x32_avg_avx2 = vpx_sad32x32_avg_avx2;
-const SadMxNAvgFunc sad32x16_avg_avx2 = vpx_sad32x16_avg_avx2;
 const SadMxNAvgParam avg_avx2_tests[] = {
-  make_tuple(64, 64, sad64x64_avg_avx2, -1),
-  make_tuple(64, 32, sad64x32_avg_avx2, -1),
-  make_tuple(32, 64, sad32x64_avg_avx2, -1),
-  make_tuple(32, 32, sad32x32_avg_avx2, -1),
-  make_tuple(32, 16, sad32x16_avg_avx2, -1),
+  make_tuple(64, 64, &vpx_sad64x64_avg_avx2, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_avx2, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_avx2, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_avx2, -1),
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
 
-const SadMxNx4Func sad64x64x4d_avx2 = vpx_sad64x64x4d_avx2;
-const SadMxNx4Func sad32x32x4d_avx2 = vpx_sad32x32x4d_avx2;
 const SadMxNx4Param x4d_avx2_tests[] = {
-  make_tuple(64, 64, sad64x64x4d_avx2, -1),
-  make_tuple(32, 32, sad32x32x4d_avx2, -1),
+  make_tuple(64, 64, &vpx_sad64x64x4d_avx2, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_avx2, -1),
 };
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 #endif  // HAVE_AVX2
@@ -1116,93 +896,54 @@ INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 //------------------------------------------------------------------------------
 // MIPS functions
 #if HAVE_MSA
-const SadMxNFunc sad64x64_msa = vpx_sad64x64_msa;
-const SadMxNFunc sad64x32_msa = vpx_sad64x32_msa;
-const SadMxNFunc sad32x64_msa = vpx_sad32x64_msa;
-const SadMxNFunc sad32x32_msa = vpx_sad32x32_msa;
-const SadMxNFunc sad32x16_msa = vpx_sad32x16_msa;
-const SadMxNFunc sad16x32_msa = vpx_sad16x32_msa;
-const SadMxNFunc sad16x16_msa = vpx_sad16x16_msa;
-const SadMxNFunc sad16x8_msa = vpx_sad16x8_msa;
-const SadMxNFunc sad8x16_msa = vpx_sad8x16_msa;
-const SadMxNFunc sad8x8_msa = vpx_sad8x8_msa;
-const SadMxNFunc sad8x4_msa = vpx_sad8x4_msa;
-const SadMxNFunc sad4x8_msa = vpx_sad4x8_msa;
-const SadMxNFunc sad4x4_msa = vpx_sad4x4_msa;
 const SadMxNParam msa_tests[] = {
-  make_tuple(64, 64, sad64x64_msa, -1),
-  make_tuple(64, 32, sad64x32_msa, -1),
-  make_tuple(32, 64, sad32x64_msa, -1),
-  make_tuple(32, 32, sad32x32_msa, -1),
-  make_tuple(32, 16, sad32x16_msa, -1),
-  make_tuple(16, 32, sad16x32_msa, -1),
-  make_tuple(16, 16, sad16x16_msa, -1),
-  make_tuple(16, 8, sad16x8_msa, -1),
-  make_tuple(8, 16, sad8x16_msa, -1),
-  make_tuple(8, 8, sad8x8_msa, -1),
-  make_tuple(8, 4, sad8x4_msa, -1),
-  make_tuple(4, 8, sad4x8_msa, -1),
-  make_tuple(4, 4, sad4x4_msa, -1),
+  make_tuple(64, 64, &vpx_sad64x64_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4_msa, -1),
 };
 INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
 
-const SadMxNAvgFunc sad64x64_avg_msa = vpx_sad64x64_avg_msa;
-const SadMxNAvgFunc sad64x32_avg_msa = vpx_sad64x32_avg_msa;
-const SadMxNAvgFunc sad32x64_avg_msa = vpx_sad32x64_avg_msa;
-const SadMxNAvgFunc sad32x32_avg_msa = vpx_sad32x32_avg_msa;
-const SadMxNAvgFunc sad32x16_avg_msa = vpx_sad32x16_avg_msa;
-const SadMxNAvgFunc sad16x32_avg_msa = vpx_sad16x32_avg_msa;
-const SadMxNAvgFunc sad16x16_avg_msa = vpx_sad16x16_avg_msa;
-const SadMxNAvgFunc sad16x8_avg_msa = vpx_sad16x8_avg_msa;
-const SadMxNAvgFunc sad8x16_avg_msa = vpx_sad8x16_avg_msa;
-const SadMxNAvgFunc sad8x8_avg_msa = vpx_sad8x8_avg_msa;
-const SadMxNAvgFunc sad8x4_avg_msa = vpx_sad8x4_avg_msa;
-const SadMxNAvgFunc sad4x8_avg_msa = vpx_sad4x8_avg_msa;
-const SadMxNAvgFunc sad4x4_avg_msa = vpx_sad4x4_avg_msa;
 const SadMxNAvgParam avg_msa_tests[] = {
-  make_tuple(64, 64, sad64x64_avg_msa, -1),
-  make_tuple(64, 32, sad64x32_avg_msa, -1),
-  make_tuple(32, 64, sad32x64_avg_msa, -1),
-  make_tuple(32, 32, sad32x32_avg_msa, -1),
-  make_tuple(32, 16, sad32x16_avg_msa, -1),
-  make_tuple(16, 32, sad16x32_avg_msa, -1),
-  make_tuple(16, 16, sad16x16_avg_msa, -1),
-  make_tuple(16, 8, sad16x8_avg_msa, -1),
-  make_tuple(8, 16, sad8x16_avg_msa, -1),
-  make_tuple(8, 8, sad8x8_avg_msa, -1),
-  make_tuple(8, 4, sad8x4_avg_msa, -1),
-  make_tuple(4, 8, sad4x8_avg_msa, -1),
-  make_tuple(4, 4, sad4x4_avg_msa, -1),
+  make_tuple(64, 64, &vpx_sad64x64_avg_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32_avg_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64_avg_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32_avg_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16_avg_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32_avg_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16_avg_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8_avg_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16_avg_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8_avg_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4_avg_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8_avg_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4_avg_msa, -1),
 };
 INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
 
-const SadMxNx4Func sad64x64x4d_msa = vpx_sad64x64x4d_msa;
-const SadMxNx4Func sad64x32x4d_msa = vpx_sad64x32x4d_msa;
-const SadMxNx4Func sad32x64x4d_msa = vpx_sad32x64x4d_msa;
-const SadMxNx4Func sad32x32x4d_msa = vpx_sad32x32x4d_msa;
-const SadMxNx4Func sad32x16x4d_msa = vpx_sad32x16x4d_msa;
-const SadMxNx4Func sad16x32x4d_msa = vpx_sad16x32x4d_msa;
-const SadMxNx4Func sad16x16x4d_msa = vpx_sad16x16x4d_msa;
-const SadMxNx4Func sad16x8x4d_msa = vpx_sad16x8x4d_msa;
-const SadMxNx4Func sad8x16x4d_msa = vpx_sad8x16x4d_msa;
-const SadMxNx4Func sad8x8x4d_msa = vpx_sad8x8x4d_msa;
-const SadMxNx4Func sad8x4x4d_msa = vpx_sad8x4x4d_msa;
-const SadMxNx4Func sad4x8x4d_msa = vpx_sad4x8x4d_msa;
-const SadMxNx4Func sad4x4x4d_msa = vpx_sad4x4x4d_msa;
 const SadMxNx4Param x4d_msa_tests[] = {
-  make_tuple(64, 64, sad64x64x4d_msa, -1),
-  make_tuple(64, 32, sad64x32x4d_msa, -1),
-  make_tuple(32, 64, sad32x64x4d_msa, -1),
-  make_tuple(32, 32, sad32x32x4d_msa, -1),
-  make_tuple(32, 16, sad32x16x4d_msa, -1),
-  make_tuple(16, 32, sad16x32x4d_msa, -1),
-  make_tuple(16, 16, sad16x16x4d_msa, -1),
-  make_tuple(16, 8, sad16x8x4d_msa, -1),
-  make_tuple(8, 16, sad8x16x4d_msa, -1),
-  make_tuple(8, 8, sad8x8x4d_msa, -1),
-  make_tuple(8, 4, sad8x4x4d_msa, -1),
-  make_tuple(4, 8, sad4x8x4d_msa, -1),
-  make_tuple(4, 4, sad4x4x4d_msa, -1),
+  make_tuple(64, 64, &vpx_sad64x64x4d_msa, -1),
+  make_tuple(64, 32, &vpx_sad64x32x4d_msa, -1),
+  make_tuple(32, 64, &vpx_sad32x64x4d_msa, -1),
+  make_tuple(32, 32, &vpx_sad32x32x4d_msa, -1),
+  make_tuple(32, 16, &vpx_sad32x16x4d_msa, -1),
+  make_tuple(16, 32, &vpx_sad16x32x4d_msa, -1),
+  make_tuple(16, 16, &vpx_sad16x16x4d_msa, -1),
+  make_tuple(16, 8, &vpx_sad16x8x4d_msa, -1),
+  make_tuple(8, 16, &vpx_sad8x16x4d_msa, -1),
+  make_tuple(8, 8, &vpx_sad8x8x4d_msa, -1),
+  make_tuple(8, 4, &vpx_sad8x4x4d_msa, -1),
+  make_tuple(4, 8, &vpx_sad4x8x4d_msa, -1),
+  make_tuple(4, 4, &vpx_sad4x4x4d_msa, -1),
 };
 INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
 #endif  // HAVE_MSA
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index c4a6280..ee633ae 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -23,7 +23,7 @@ simple_encoder_verify_environment() {
   fi
 }
 
-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
 simple_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +35,7 @@ simple_encoder() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +47,13 @@ simple_encoder_vp8() {
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
-# machine.
-DISABLED_simple_encoder_vp9() {
+simple_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     simple_encoder vp9 || return 1
   fi
 }
 
 simple_encoder_tests="simple_encoder_vp8
-                      DISABLED_simple_encoder_vp9"
+                      simple_encoder_vp9"
 
 run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 1e682e7..304a148 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -186,70 +186,48 @@ TEST_P(SixtapPredictTest, TestWithRandomData) {
 
 using std::tr1::make_tuple;
 
-const SixtapPredictFunc sixtap_16x16_c = vp8_sixtap_predict16x16_c;
-const SixtapPredictFunc sixtap_8x8_c = vp8_sixtap_predict8x8_c;
-const SixtapPredictFunc sixtap_8x4_c = vp8_sixtap_predict8x4_c;
-const SixtapPredictFunc sixtap_4x4_c = vp8_sixtap_predict4x4_c;
 INSTANTIATE_TEST_CASE_P(
     C, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_c),
-        make_tuple(8, 8, sixtap_8x8_c),
-        make_tuple(8, 4, sixtap_8x4_c),
-        make_tuple(4, 4, sixtap_4x4_c)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_c),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_c),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_c),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_c)));
 #if HAVE_NEON
-const SixtapPredictFunc sixtap_16x16_neon = vp8_sixtap_predict16x16_neon;
-const SixtapPredictFunc sixtap_8x8_neon = vp8_sixtap_predict8x8_neon;
-const SixtapPredictFunc sixtap_8x4_neon = vp8_sixtap_predict8x4_neon;
 INSTANTIATE_TEST_CASE_P(
     NEON, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_neon),
-        make_tuple(8, 8, sixtap_8x8_neon),
-        make_tuple(8, 4, sixtap_8x4_neon)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_neon),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_neon),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_neon)));
 #endif
 #if HAVE_MMX
-const SixtapPredictFunc sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
-const SixtapPredictFunc sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
-const SixtapPredictFunc sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx;
-const SixtapPredictFunc sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx;
 INSTANTIATE_TEST_CASE_P(
     MMX, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_mmx),
-        make_tuple(8, 8, sixtap_8x8_mmx),
-        make_tuple(8, 4, sixtap_8x4_mmx),
-        make_tuple(4, 4, sixtap_4x4_mmx)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_mmx),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_mmx),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_mmx),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx)));
 #endif
 #if HAVE_SSE2
-const SixtapPredictFunc sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2;
-const SixtapPredictFunc sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2;
-const SixtapPredictFunc sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_sse2),
-        make_tuple(8, 8, sixtap_8x8_sse2),
-        make_tuple(8, 4, sixtap_8x4_sse2)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2)));
 #endif
 #if HAVE_SSSE3
-const SixtapPredictFunc sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3;
-const SixtapPredictFunc sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3;
-const SixtapPredictFunc sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3;
-const SixtapPredictFunc sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_ssse3),
-        make_tuple(8, 8, sixtap_8x8_ssse3),
-        make_tuple(8, 4, sixtap_8x4_ssse3),
-        make_tuple(4, 4, sixtap_4x4_ssse3)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_ssse3),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3)));
 #endif
 #if HAVE_MSA
-const SixtapPredictFunc sixtap_16x16_msa = vp8_sixtap_predict16x16_msa;
-const SixtapPredictFunc sixtap_8x8_msa = vp8_sixtap_predict8x8_msa;
-const SixtapPredictFunc sixtap_8x4_msa = vp8_sixtap_predict8x4_msa;
-const SixtapPredictFunc sixtap_4x4_msa = vp8_sixtap_predict4x4_msa;
 INSTANTIATE_TEST_CASE_P(
     MSA, SixtapPredictTest, ::testing::Values(
-        make_tuple(16, 16, sixtap_16x16_msa),
-        make_tuple(8, 8, sixtap_8x8_msa),
-        make_tuple(8, 4, sixtap_8x4_msa),
-        make_tuple(4, 4, sixtap_4x4_msa)));
+        make_tuple(16, 16, &vp8_sixtap_predict16x16_msa),
+        make_tuple(8, 8, &vp8_sixtap_predict8x8_msa),
+        make_tuple(8, 4, &vp8_sixtap_predict8x4_msa),
+        make_tuple(4, 4, &vp8_sixtap_predict4x4_msa)));
 #endif
 }  // namespace
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 90aa75b..b07bcb2 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -17,7 +17,6 @@
 namespace {
 
 const int kTestMode = 0;
-const int kSuperframeSyntax = 1;
 
 typedef std::tr1::tuple<libvpx_test::TestMode,int> SuperframeTestParam;
 
@@ -32,11 +31,9 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
     InitializeConfig();
     const SuperframeTestParam input = GET_PARAM(1);
     const libvpx_test::TestMode mode = std::tr1::get<kTestMode>(input);
-    const int syntax = std::tr1::get<kSuperframeSyntax>(input);
     SetMode(mode);
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
-    is_vp10_style_superframe_ = syntax;
   }
 
   virtual void TearDown() {
@@ -59,8 +56,7 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
     const uint8_t marker = buffer[pkt->data.frame.sz - 1];
     const int frames = (marker & 0x7) + 1;
     const int mag = ((marker >> 3) & 3) + 1;
-    const unsigned int index_sz =
-        2 + mag * (frames - is_vp10_style_superframe_);
+    const unsigned int index_sz = 2 + mag * frames;
     if ((marker & 0xe0) == 0xc0 &&
         pkt->data.frame.sz >= index_sz &&
         buffer[pkt->data.frame.sz - index_sz] == marker) {
@@ -85,7 +81,6 @@ class SuperframeTest : public ::libvpx_test::EncoderTest,
     return pkt;
   }
 
-  int is_vp10_style_superframe_;
   int sf_count_;
   int sf_count_max_;
   vpx_codec_cx_pkt_t modified_pkt_;
@@ -106,8 +101,4 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
 VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
     ::testing::Values(::libvpx_test::kTwoPassGood),
     ::testing::Values(0)));
-
-VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
-    ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Values(CONFIG_MISC_FIXES)));
 }  // namespace
diff --git a/test/test-data.mk b/test/test-data.mk
index 4280b35..05a0885 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -418,6 +418,18 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-130x132.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x130.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-132x132.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-178x180.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x178.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-180x180.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
@@ -550,6 +562,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-352x288.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
@@ -642,6 +656,34 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-fp-tiles-8-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-15-segkey_adpq.webm
@@ -769,3 +811,53 @@ endif  # CONFIG_ENCODE_PERF_TESTS
 
 # sort and remove duplicates
 LIBVPX_TEST_DATA-yes := $(sort $(LIBVPX_TEST_DATA-yes))
+
+# VP9 dynamic resizing test (decoder)
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_640x480_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 4e4ac62..a4ed174 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -550,6 +550,8 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f *vp90-2-03-size-226x202.webm
 83c6d8f2969b759e10e5c6542baca1265c874c29 *vp90-2-03-size-226x224.webm.md5
 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce *vp90-2-03-size-226x226.webm
 94ad19b8b699cea105e2ff18f0df2afd7242bcf7 *vp90-2-03-size-226x226.webm.md5
+52bc1dfd3a97b24d922eb8a31d07527891561f2a *vp90-2-03-size-352x288.webm
+3084d6d0a1eec22e85a394422fbc8faae58930a5 *vp90-2-03-size-352x288.webm.md5
 b6524e4084d15b5d0caaa3d3d1368db30cbee69c *vp90-2-03-deltaq.webm
 65f45ec9a55537aac76104818278e0978f94a678 *vp90-2-03-deltaq.webm.md5
 4dbb87494c7f565ffc266c98d17d0d8c7a5c5aba *vp90-2-05-resize.ivf
@@ -744,3 +746,91 @@ e60d859b0ef2b331b21740cf6cb83fabe469b079 *invalid-vp90-2-03-size-202x210.webm.iv
 0ae808dca4d3c1152a9576e14830b6faa39f1b4a *invalid-vp90-2-03-size-202x210.webm.ivf.s113306_r01-05_b6-.ivf.res
 9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
 5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
+85771f6ab44e4a0226e206c0cde8351dd5918953 *vp90-2-02-size-130x132.webm
+512dad5eabbed37b4bbbc64ce153f1a5484427b8 *vp90-2-02-size-130x132.webm.md5
+01f7127d40360289db63b27f61cb9afcda350e95 *vp90-2-02-size-132x130.webm
+4a94275328ae076cf60f966c097a8721010fbf5a *vp90-2-02-size-132x130.webm.md5
+f41c0400b5716b4b70552c40dd03d44be131e1cc *vp90-2-02-size-132x132.webm
+1a69e989f697e424bfe3e3e8a77bb0c0992c8e47 *vp90-2-02-size-132x132.webm.md5
+94a5cbfacacba100e0c5f7861c72a1b417feca0f *vp90-2-02-size-178x180.webm
+dedfecf1d784bcf70629592fa5e6f01d5441ccc9 *vp90-2-02-size-178x180.webm.md5
+4828b62478c04014bba3095a83106911a71cf387 *vp90-2-02-size-180x178.webm
+423da2b861050c969d78ed8e8f8f14045d1d8199 *vp90-2-02-size-180x178.webm.md5
+338f7c9282f43e29940f5391118aadd17e4f9234 *vp90-2-02-size-180x180.webm
+6c2ef013392310778dca5dd5351160eca66b0a60 *vp90-2-02-size-180x180.webm.md5
+679fa7d6807e936ff937d7b282e7dbd8ac76447e *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm
+fc7267ab8fc2bf5d6c234e34ee6c078a967b4888 *vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm.md5
+9d33a137c819792209c5ce4e4e1ee5da73d574fe *vp90-2-14-resize-10frames-fp-tiles-1-2.webm
+0c78a154956a8605d050bdd75e0dcc4d39c040a6 *vp90-2-14-resize-10frames-fp-tiles-1-2.webm.md5
+d6a8d8c57f66a91d23e8e7df480f9ae841e56c37 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm
+e9b4e8c7b33b5fda745d340c3f47e6623ae40cf2 *vp90-2-14-resize-10frames-fp-tiles-1-4.webm.md5
+aa6fe043a0c4a42b49c87ebbe812d4afd9945bec *vp90-2-14-resize-10frames-fp-tiles-1-8.webm
+028520578994c2d013d4c0129033d4f2ff31bbe0 *vp90-2-14-resize-10frames-fp-tiles-1-8.webm.md5
+d1d5463c9ea7b5cc5f609ddedccddf656f348d1a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm
+92d5872f5bdffbed721703b7e959b4f885e3d77a *vp90-2-14-resize-10frames-fp-tiles-2-1.webm.md5
+677cb29de1215d97346015af5807a9b1faad54cf *vp90-2-14-resize-10frames-fp-tiles-2-4.webm
+a5db19f977094ec3fd60b4f7671b3e6740225e12 *vp90-2-14-resize-10frames-fp-tiles-2-4.webm.md5
+cdd3c52ba21067efdbb2de917fe2a965bf27332e *vp90-2-14-resize-10frames-fp-tiles-2-8.webm
+db17ec5d894ea8b8d0b7f32206d0dd3d46dcfa6d *vp90-2-14-resize-10frames-fp-tiles-2-8.webm.md5
+0f6093c472125d05b764d7d1965c1d56771c0ea2 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm
+bc7c79e1bee07926dd970462ce6f64fc30eec3e1 *vp90-2-14-resize-10frames-fp-tiles-4-1.webm.md5
+c5142e2bff4091338196c8ea8bc9266e64f548bc *vp90-2-14-resize-10frames-fp-tiles-4-2.webm
+22aa3dd430b69fd3d92f6561bac86deeed90486d *vp90-2-14-resize-10frames-fp-tiles-4-2.webm.md5
+ede8b1466d2f26e1b1bd9602addb9cd1017e1d8c *vp90-2-14-resize-10frames-fp-tiles-4-8.webm
+508d5ebb9c0eac2a4100281a3ee052ec2fc19217 *vp90-2-14-resize-10frames-fp-tiles-4-8.webm.md5
+2b292e3392854cd1d76ae597a6f53656cf741cfa *vp90-2-14-resize-10frames-fp-tiles-8-1.webm
+1c24e54fa19e94e1722f24676404444e941c3d31 *vp90-2-14-resize-10frames-fp-tiles-8-1.webm.md5
+61beda21064e09634564caa6697ab90bd53c9af7 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm
+9c0657b4d9e1d0e4c9d28a90e5a8630a65519124 *vp90-2-14-resize-10frames-fp-tiles-8-2.webm.md5
+1758c50a11a7c92522749b4a251664705f1f0d4b *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm
+4f454a06750614314ae15a44087b79016fe2db97 *vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm.md5
+3920c95ba94f1f048a731d9d9b416043b44aa4bd *vp90-2-14-resize-10frames-fp-tiles-8-4.webm
+4eb347a0456d2c49a1e1d8de5aa1c51acc39887e *vp90-2-14-resize-10frames-fp-tiles-8-4.webm.md5
+4b95a74c032a473b6683d7ad5754db1b0ec378e9 *vp90-2-21-resize_inter_1280x720_5_1-2.webm
+a7826dd386bedfe69d02736969bfb47fb6a40a5e *vp90-2-21-resize_inter_1280x720_5_1-2.webm.md5
+5cfff79e82c4d69964ccb8e75b4f0c53b9295167 *vp90-2-21-resize_inter_1280x720_5_3-4.webm
+a18f57db4a25e1f543a99f2ceb182e00db0ee22f *vp90-2-21-resize_inter_1280x720_5_3-4.webm.md5
+d26db0811bf30eb4131d928669713e2485f8e833 *vp90-2-21-resize_inter_1280x720_7_1-2.webm
+fd6f9f332cd5bea4c0f0d57be4297bea493cc5a1 *vp90-2-21-resize_inter_1280x720_7_1-2.webm.md5
+5c7d73d4d268e2ba9593b31cb091fd339505c7fd *vp90-2-21-resize_inter_1280x720_7_3-4.webm
+7bbb949cabc1e70dadcc74582739f63b833034e0 *vp90-2-21-resize_inter_1280x720_7_3-4.webm.md5
+f2d2a41a60eb894aff0c5854afca15931f1445a8 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm
+66d7789992613ac9d678ff905ff1059daa1b89e4 *vp90-2-21-resize_inter_1920x1080_5_1-2.webm.md5
+764edb75fe7dd64e73a1b4f3b4b2b1bf237a4dea *vp90-2-21-resize_inter_1920x1080_5_3-4.webm
+f78bea1075983fd990e7f25d4f31438f9b5efa34 *vp90-2-21-resize_inter_1920x1080_5_3-4.webm.md5
+96496f2ade764a5de9f0c27917c7df1f120fb2ef *vp90-2-21-resize_inter_1920x1080_7_1-2.webm
+2632b635135ed5ecd67fd22dec7990d29c4f4cb5 *vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
+74889ea42001bf41428cb742ca74e65129c886dc *vp90-2-21-resize_inter_1920x1080_7_3-4.webm
+d2cf3b25956415bb579d368e7098097e482dd73a *vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
+4658986a8ce36ebfcc80a1903e446eaab3985336 *vp90-2-21-resize_inter_320x180_5_1-2.webm
+8a3d8cf325109ffa913cc9426c32eea8c202a09a *vp90-2-21-resize_inter_320x180_5_1-2.webm.md5
+16303aa45176520ee42c2c425247aadc1506b881 *vp90-2-21-resize_inter_320x180_5_3-4.webm
+41cab1ddf7715b680a4dbce42faa9bcd72af4e5c *vp90-2-21-resize_inter_320x180_5_3-4.webm.md5
+56648adcee66dd0e5cb6ac947f5ee1b9cc8ba129 *vp90-2-21-resize_inter_320x180_7_1-2.webm
+70047377787003cc03dda7b2394e6d7eaa666d9e *vp90-2-21-resize_inter_320x180_7_1-2.webm.md5
+d2ff99165488499cc55f75929f1ce5ca9c9e359b *vp90-2-21-resize_inter_320x180_7_3-4.webm
+e69019e378114a4643db283b66d1a7e304761a56 *vp90-2-21-resize_inter_320x180_7_3-4.webm.md5
+4834d129bed0f4289d3a88f2ae3a1736f77621b0 *vp90-2-21-resize_inter_320x240_5_1-2.webm
+a75653c53d22b623c1927fc0088da21dafef21f4 *vp90-2-21-resize_inter_320x240_5_1-2.webm.md5
+19818e1b7fd1c1e63d8873c31b0babe29dd33ba6 *vp90-2-21-resize_inter_320x240_5_3-4.webm
+8d89814ff469a186312111651b16601dfbce4336 *vp90-2-21-resize_inter_320x240_5_3-4.webm.md5
+ac8057bae52498f324ce92a074d5f8207cc4a4a7 *vp90-2-21-resize_inter_320x240_7_1-2.webm
+2643440898c83c08cc47bc744245af696b877c24 *vp90-2-21-resize_inter_320x240_7_1-2.webm.md5
+cf4a4cd38ac8b18c42d8c25a3daafdb39132256b *vp90-2-21-resize_inter_320x240_7_3-4.webm
+70ba8ec9120b26e9b0ffa2c79b432f16cbcb50ec *vp90-2-21-resize_inter_320x240_7_3-4.webm.md5
+669f10409fe1c4a054010162ca47773ea1fdbead *vp90-2-21-resize_inter_640x360_5_1-2.webm
+6355a04249004a35fb386dd1024214234f044383 *vp90-2-21-resize_inter_640x360_5_1-2.webm.md5
+c23763b950b8247c1775d1f8158d93716197676c *vp90-2-21-resize_inter_640x360_5_3-4.webm
+59e6fc381e3ec3b7bdaac586334e0bc944d18fb6 *vp90-2-21-resize_inter_640x360_5_3-4.webm.md5
+71b45cbfdd068baa1f679a69e5e6f421d256a85f *vp90-2-21-resize_inter_640x360_7_1-2.webm
+1416fc761b690c54a955c4cf017fa078520e8c18 *vp90-2-21-resize_inter_640x360_7_1-2.webm.md5
+6c409903279448a697e4db63bab1061784bcd8d2 *vp90-2-21-resize_inter_640x360_7_3-4.webm
+60de1299793433a630b71130cf76c9f5965758e2 *vp90-2-21-resize_inter_640x360_7_3-4.webm.md5
+852b597b8af096d90c80bf0ed6ed3b336b851f19 *vp90-2-21-resize_inter_640x480_5_1-2.webm
+f6856f19236ee46ed462bd0a2e7e72b9c3b9cea6 *vp90-2-21-resize_inter_640x480_5_1-2.webm.md5
+792a16c6f60043bd8dceb515f0b95b8891647858 *vp90-2-21-resize_inter_640x480_5_3-4.webm
+68ffe59877e9a7863805e1c0a3ce18ce037d7c9d *vp90-2-21-resize_inter_640x480_5_3-4.webm.md5
+61e044c4759972a35ea3db8c1478a988910a4ef4 *vp90-2-21-resize_inter_640x480_7_1-2.webm
+7739bfca167b1b43fea72f807f01e097b7cb98d8 *vp90-2-21-resize_inter_640x480_7_1-2.webm.md5
+7291af354b4418917eee00e3a7e366086a0b7a10 *vp90-2-21-resize_inter_640x480_7_3-4.webm
+4a18b09ccb36564193f0215f599d745d95bb558c *vp90-2-21-resize_inter_640x480_7_3-4.webm.md5
diff --git a/test/test.mk b/test/test.mk
index 8d66244..2d50ce8 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -18,15 +18,17 @@ LIBVPX_TEST_SRCS-yes += video_source.h
 LIBVPX_TEST_SRCS-yes                   += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += aq_segment_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += encode_api_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += realtime_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += yuv_video_source.h
 
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
@@ -44,6 +46,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
@@ -58,10 +61,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_test.cc ../y4menc.c ../y4menc.h
 
 ## WebM Parsing
 ifeq ($(CONFIG_WEBM_IO), yes)
-LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.cpp
-LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.cpp
-LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvparser.hpp
-LIBWEBM_PARSER_SRCS                    += ../third_party/libwebm/mkvreader.hpp
+LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc
+LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc
+LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h
+LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(LIBWEBM_PARSER_SRCS)
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../tools_common.h
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../webmdec.cc
@@ -92,10 +95,9 @@ endif
 ## shared library builds don't make these functions accessible.
 ##
 ifeq ($(CONFIG_SHARED),)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += lpf_8_test.cc
 
 ## VP8
-ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
+ifeq ($(CONFIG_VP8),yes)
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -103,12 +105,13 @@ LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp8_fragments_test.cc
 endif
 
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += add_noise_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
 
 LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
@@ -121,7 +124,7 @@ endif
 endif # VP8
 
 ## VP9
-ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+ifeq ($(CONFIG_VP9),yes)
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -134,25 +137,27 @@ LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
 endif
 
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
+LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
+LIBVPX_TEST_SRCS-yes                   += lpf_8_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += vp9_intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
-
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -162,14 +167,12 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
 
 endif # VP9
 
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
+## Multi-codec / unconditional whitebox tests.
 
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
 
-## VP10
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
 
 endif # CONFIG_SHARED
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5d59e83..2acf744 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -187,18 +187,20 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
 
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
-                vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
-                vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse)
-#endif  // HAVE_SSE && CONFIG_USE_X86INC
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+                vpx_tm_predictor_4x4_sse2)
+#endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL,
-                NULL, vpx_d153_predictor_4x4_ssse3,
-                vpx_d207_predictor_4x4_ssse3, vpx_d63_predictor_4x4_ssse3, NULL)
+                NULL, NULL, NULL, NULL,
+                vpx_d153_predictor_4x4_ssse3, NULL,
+                vpx_d63_predictor_4x4_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_DSPR2
@@ -235,23 +237,19 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)
 
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred8, vpx_dc_predictor_8x8_sse,
-                vpx_dc_left_predictor_8x8_sse, vpx_dc_top_predictor_8x8_sse,
-                vpx_dc_128_predictor_8x8_sse, vpx_v_predictor_8x8_sse, NULL,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif  // HAVE_SSE && CONFIG_USE_X86INC
-
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE2, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_8x8_ssse3, vpx_d45_predictor_8x8_ssse3, NULL,
-                NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
+                NULL, NULL, NULL, NULL,
+                vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3,
+                vpx_d63_predictor_8x8_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_DSPR2
@@ -293,13 +291,13 @@ INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_16x16_ssse3, vpx_d45_predictor_16x16_ssse3,
+                NULL, vpx_d45_predictor_16x16_ssse3,
                 NULL, NULL, vpx_d153_predictor_16x16_ssse3,
                 vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3,
                 NULL)
@@ -340,28 +338,19 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
                 vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
-#if ARCH_X86_64
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
-#else
 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
                 vpx_dc_left_predictor_32x32_sse2,
                 vpx_dc_top_predictor_32x32_sse2,
                 vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif  // ARCH_X86_64
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2 && CONFIG_USE_X86INC
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC
 INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
-                vpx_h_predictor_32x32_ssse3, vpx_d45_predictor_32x32_ssse3,
-                NULL, NULL, vpx_d153_predictor_32x32_ssse3,
-                vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3,
-                NULL)
+                NULL, vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3 && CONFIG_USE_X86INC
 
 #if HAVE_NEON
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 437ce44..f1aa4d7 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -10,6 +10,7 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <set>
 #include <string>
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "../tools_common.h"
@@ -44,6 +45,12 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
   TestVectorTest()
       : DecoderTest(GET_PARAM(0)),
         md5_file_(NULL) {
+#if CONFIG_VP9_DECODER
+    resize_clips_.insert(
+      ::libvpx_test::kVP9TestVectorsResize,
+      ::libvpx_test::kVP9TestVectorsResize +
+          ::libvpx_test::kNumVP9TestVectorsResize);
+#endif
   }
 
   virtual ~TestVectorTest() {
@@ -77,6 +84,10 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
         << "Md5 checksums don't match: frame number = " << frame_number;
   }
 
+#if CONFIG_VP9_DECODER
+  std::set<std::string> resize_clips_;
+#endif
+
  private:
   FILE *md5_file_;
 };
@@ -92,11 +103,19 @@ TEST_P(TestVectorTest, MD5Match) {
   const int mode = std::tr1::get<kDecodeMode>(input);
   libvpx_test::CompressedVideoSource *video = NULL;
   vpx_codec_flags_t flags = 0;
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   char str[256];
 
   if (mode == kFrameParallelMode) {
     flags |= VPX_CODEC_USE_FRAME_THREADING;
+#if CONFIG_VP9_DECODER
+    // TODO(hkuang): Fix frame parallel decode bug. See issue 1086.
+    if (resize_clips_.find(filename) != resize_clips_.end()) {
+      printf("Skipping the test file: %s, due to frame parallel decode bug.\n",
+             filename.c_str());
+      return;
+    }
+#endif
   }
 
   cfg.threads = threads;
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 434a382..c822479 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -52,6 +52,31 @@ const char *const kVP8TestVectors[] = {
 const int kNumVP8TestVectors = NELEMENTS(kVP8TestVectors);
 #endif  // CONFIG_VP8_DECODER
 #if CONFIG_VP9_DECODER
+#define RESIZE_TEST_VECTORS "vp90-2-21-resize_inter_320x180_5_1-2.webm", \
+  "vp90-2-21-resize_inter_320x180_5_3-4.webm", \
+  "vp90-2-21-resize_inter_320x180_7_1-2.webm", \
+  "vp90-2-21-resize_inter_320x180_7_3-4.webm", \
+  "vp90-2-21-resize_inter_320x240_5_1-2.webm", \
+  "vp90-2-21-resize_inter_320x240_5_3-4.webm", \
+  "vp90-2-21-resize_inter_320x240_7_1-2.webm", \
+  "vp90-2-21-resize_inter_320x240_7_3-4.webm", \
+  "vp90-2-21-resize_inter_640x360_5_1-2.webm", \
+  "vp90-2-21-resize_inter_640x360_5_3-4.webm", \
+  "vp90-2-21-resize_inter_640x360_7_1-2.webm", \
+  "vp90-2-21-resize_inter_640x360_7_3-4.webm", \
+  "vp90-2-21-resize_inter_640x480_5_1-2.webm", \
+  "vp90-2-21-resize_inter_640x480_5_3-4.webm", \
+  "vp90-2-21-resize_inter_640x480_7_1-2.webm", \
+  "vp90-2-21-resize_inter_640x480_7_3-4.webm", \
+  "vp90-2-21-resize_inter_1280x720_5_1-2.webm", \
+  "vp90-2-21-resize_inter_1280x720_5_3-4.webm", \
+  "vp90-2-21-resize_inter_1280x720_7_1-2.webm", \
+  "vp90-2-21-resize_inter_1280x720_7_3-4.webm", \
+  "vp90-2-21-resize_inter_1920x1080_5_1-2.webm", \
+  "vp90-2-21-resize_inter_1920x1080_5_3-4.webm", \
+  "vp90-2-21-resize_inter_1920x1080_7_1-2.webm", \
+  "vp90-2-21-resize_inter_1920x1080_7_3-4.webm",
+
 const char *const kVP9TestVectors[] = {
   "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
   "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
@@ -120,7 +145,10 @@ const char *const kVP9TestVectors[] = {
   "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
   "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
   "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
-  "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm",
+  "vp90-2-02-size-66x66.webm", "vp90-2-02-size-130x132.webm",
+  "vp90-2-02-size-132x130.webm", "vp90-2-02-size-132x132.webm",
+  "vp90-2-02-size-178x180.webm", "vp90-2-02-size-180x178.webm",
+  "vp90-2-02-size-180x180.webm", "vp90-2-03-size-196x196.webm",
   "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
   "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
   "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
@@ -152,7 +180,8 @@ const char *const kVP9TestVectors[] = {
   "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
   "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
   "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
-  "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
+  "vp90-2-03-size-226x226.webm", "vp90-2-03-size-352x288.webm",
+  "vp90-2-03-deltaq.webm",
   "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
   "vp90-2-07-frame_parallel.webm", "vp90-2-08-tile_1x2_frame_parallel.webm",
   "vp90-2-08-tile_1x2.webm", "vp90-2-08-tile_1x4_frame_parallel.webm",
@@ -182,6 +211,20 @@ const char *const kVP9TestVectors[] = {
   "vp90-2-14-resize-fp-tiles-4-2.webm", "vp90-2-14-resize-fp-tiles-4-8.webm",
   "vp90-2-14-resize-fp-tiles-8-16.webm", "vp90-2-14-resize-fp-tiles-8-1.webm",
   "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-2-4-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-1-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-4.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-2-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-4-8.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-2.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-4-2-1.webm",
+  "vp90-2-14-resize-10frames-fp-tiles-8-4.webm",
   "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
   "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
   "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
@@ -193,10 +236,16 @@ const char *const kVP9TestVectors[] = {
   "vp93-2-20-10bit-yuv422.webm", "vp93-2-20-12bit-yuv422.webm",
   "vp93-2-20-10bit-yuv440.webm", "vp93-2-20-12bit-yuv440.webm",
   "vp93-2-20-10bit-yuv444.webm", "vp93-2-20-12bit-yuv444.webm",
-#endif  // CONFIG_VP9_HIGHBITDEPTH`
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm",
+  RESIZE_TEST_VECTORS
 };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
+const char *const kVP9TestVectorsResize[] = {
+  RESIZE_TEST_VECTORS
+};
+const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize);
+#undef RESIZE_TEST_VECTORS
 #endif  // CONFIG_VP9_DECODER
 
 }  // namespace libvpx_test
diff --git a/test/test_vectors.h b/test/test_vectors.h
index 8e1aabb..2c6918a 100644
--- a/test/test_vectors.h
+++ b/test/test_vectors.h
@@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[];
 #if CONFIG_VP9_DECODER
 extern const int kNumVP9TestVectors;
 extern const char *const kVP9TestVectors[];
+extern const int kNumVP9TestVectorsResize;
+extern const char *const kVP9TestVectorsResize[];
 #endif  // CONFIG_VP9_DECODER
 
 }  // namespace libvpx_test
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 193bd45..f15d94a 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -103,6 +103,4 @@ TEST_P(TileIndependenceTest, MD5Match) {
 }
 
 VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
-
-VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
 }  // namespace
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 1189e51..7a223f2 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -23,7 +23,8 @@ twopass_encoder_verify_environment() {
   fi
 }
 
-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
 twopass_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +36,7 @@ twopass_encoder() {
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +48,13 @@ twopass_encoder_vp8() {
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
-# machine.
-DISABLED_twopass_encoder_vp9() {
+twopass_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     twopass_encoder vp9 || return 1
   fi
 }
 
 twopass_encoder_tests="twopass_encoder_vp8
-                       DISABLED_twopass_encoder_vp9"
+                       twopass_encoder_vp9"
 
 run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 7a34db6..cb63390 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -74,6 +74,10 @@ static unsigned int mb_ss_ref(const int16_t *src) {
   return res;
 }
 
+/* Note:
+ *  Our codebase calculates the "diff" value in the variance algorithm by
+ *  (src - ref).
+ */
 static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
                              int l2w, int l2h, int src_stride_coeff,
                              int ref_stride_coeff, uint32_t *sse_ptr,
@@ -87,14 +91,14 @@ static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
     for (int x = 0; x < w; x++) {
       int diff;
       if (!use_high_bit_depth_) {
-        diff = ref[w * y * ref_stride_coeff + x] -
-               src[w * y * src_stride_coeff + x];
+        diff = src[w * y * src_stride_coeff + x] -
+               ref[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
-               CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+        diff = CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x] -
+               CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x];
         se += diff;
         sse += diff * diff;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -309,15 +313,15 @@ template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-    if (!use_high_bit_depth_) {
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
+      if (!use_high_bit_depth_) {
+        src_[j] = rnd_.Rand8();
+        ref_[j] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
-    } else {
-      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_;
-      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_;
+      } else {
+        CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
+      }
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -328,8 +332,10 @@ void VarianceTest<VarianceFunctionType>::RefTest() {
                                            log2height_, stride_coeff,
                                            stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -346,8 +352,8 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() {
         ref_[ref_ind] = rnd_.Rand8();
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_;
-        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_;
+        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
+        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
@@ -361,8 +367,10 @@ void VarianceTest<VarianceFunctionType>::RefStrideTest() {
                                            log2height_, src_stride_coeff,
                                            ref_stride_coeff, &sse2,
                                            use_high_bit_depth_, bit_depth_);
-    EXPECT_EQ(sse1, sse2);
-    EXPECT_EQ(var1, var2);
+    EXPECT_EQ(sse1, sse2)
+        << "Error at test index: " << i;
+    EXPECT_EQ(var1, var2)
+        << "Error at test index: " << i;
   }
 }
 
@@ -747,115 +755,63 @@ TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_c));
 
-const Get4x4SseFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c;
 INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
-                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_c)));
 
-const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c;
-const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c;
-const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c;
-const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c;
 INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_c),
-                                          make_tuple(4, 3, mse16x8_c),
-                                          make_tuple(3, 4, mse8x16_c),
-                                          make_tuple(3, 3, mse8x8_c)));
-
-const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c;
-const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c;
-const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c;
-const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c;
-const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c;
-const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c;
-const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c;
-const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c;
-const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c;
-const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c;
-const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
-const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
-const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_c),
+                                          make_tuple(4, 3, &vpx_mse16x8_c),
+                                          make_tuple(3, 4, &vpx_mse8x16_c),
+                                          make_tuple(3, 3, &vpx_mse8x8_c)));
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
-                      make_tuple(6, 5, variance64x32_c, 0),
-                      make_tuple(5, 6, variance32x64_c, 0),
-                      make_tuple(5, 5, variance32x32_c, 0),
-                      make_tuple(5, 4, variance32x16_c, 0),
-                      make_tuple(4, 5, variance16x32_c, 0),
-                      make_tuple(4, 4, variance16x16_c, 0),
-                      make_tuple(4, 3, variance16x8_c, 0),
-                      make_tuple(3, 4, variance8x16_c, 0),
-                      make_tuple(3, 3, variance8x8_c, 0),
-                      make_tuple(3, 2, variance8x4_c, 0),
-                      make_tuple(2, 3, variance4x8_c, 0),
-                      make_tuple(2, 2, variance4x4_c, 0)));
-
-const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c;
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_c, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0),
-                      make_tuple(6, 5, subpel_var64x32_c, 0),
-                      make_tuple(5, 6, subpel_var32x64_c, 0),
-                      make_tuple(5, 5, subpel_var32x32_c, 0),
-                      make_tuple(5, 4, subpel_var32x16_c, 0),
-                      make_tuple(4, 5, subpel_var16x32_c, 0),
-                      make_tuple(4, 4, subpel_var16x16_c, 0),
-                      make_tuple(4, 3, subpel_var16x8_c, 0),
-                      make_tuple(3, 4, subpel_var8x16_c, 0),
-                      make_tuple(3, 3, subpel_var8x8_c, 0),
-                      make_tuple(3, 2, subpel_var8x4_c, 0),
-                      make_tuple(2, 3, subpel_var4x8_c, 0),
-                      make_tuple(2, 2, subpel_var4x4_c, 0)));
-
-const SubpixAvgVarMxNFunc subpel_avg_var64x64_c =
-    vpx_sub_pixel_avg_variance64x64_c;
-const SubpixAvgVarMxNFunc subpel_avg_var64x32_c =
-    vpx_sub_pixel_avg_variance64x32_c;
-const SubpixAvgVarMxNFunc subpel_avg_var32x64_c =
-    vpx_sub_pixel_avg_variance32x64_c;
-const SubpixAvgVarMxNFunc subpel_avg_var32x32_c =
-    vpx_sub_pixel_avg_variance32x32_c;
-const SubpixAvgVarMxNFunc subpel_avg_var32x16_c =
-    vpx_sub_pixel_avg_variance32x16_c;
-const SubpixAvgVarMxNFunc subpel_avg_var16x32_c =
-    vpx_sub_pixel_avg_variance16x32_c;
-const SubpixAvgVarMxNFunc subpel_avg_var16x16_c =
-    vpx_sub_pixel_avg_variance16x16_c;
-const SubpixAvgVarMxNFunc subpel_avg_var16x8_c =
-    vpx_sub_pixel_avg_variance16x8_c;
-const SubpixAvgVarMxNFunc subpel_avg_var8x16_c =
-    vpx_sub_pixel_avg_variance8x16_c;
-const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c;
-const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c;
-const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c;
-const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c;
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0),
-                      make_tuple(6, 5, subpel_avg_var64x32_c, 0),
-                      make_tuple(5, 6, subpel_avg_var32x64_c, 0),
-                      make_tuple(5, 5, subpel_avg_var32x32_c, 0),
-                      make_tuple(5, 4, subpel_avg_var32x16_c, 0),
-                      make_tuple(4, 5, subpel_avg_var16x32_c, 0),
-                      make_tuple(4, 4, subpel_avg_var16x16_c, 0),
-                      make_tuple(4, 3, subpel_avg_var16x8_c, 0),
-                      make_tuple(3, 4, subpel_avg_var8x16_c, 0),
-                      make_tuple(3, 3, subpel_avg_var8x8_c, 0),
-                      make_tuple(3, 2, subpel_avg_var8x4_c, 0),
-                      make_tuple(2, 3, subpel_avg_var4x8_c, 0),
-                      make_tuple(2, 2, subpel_avg_var4x4_c, 0)));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
@@ -875,1166 +831,507 @@ TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 /* TODO(debargha): This test does not support the highbd version
-const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
-const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c;
-const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c;
-const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c;
-
-const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c;
-const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c;
-const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c;
-const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c;
-
-const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c;
-const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c;
-const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c;
-const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c;
 INSTANTIATE_TEST_CASE_P(
-    C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c),
-                                        make_tuple(4, 4, highbd_12_mse16x8_c),
-                                        make_tuple(4, 4, highbd_12_mse8x16_c),
-                                        make_tuple(4, 4, highbd_12_mse8x8_c),
-                                        make_tuple(4, 4, highbd_10_mse16x16_c),
-                                        make_tuple(4, 4, highbd_10_mse16x8_c),
-                                        make_tuple(4, 4, highbd_10_mse8x16_c),
-                                        make_tuple(4, 4, highbd_10_mse8x8_c),
-                                        make_tuple(4, 4, highbd_8_mse16x16_c),
-                                        make_tuple(4, 4, highbd_8_mse16x8_c),
-                                        make_tuple(4, 4, highbd_8_mse8x16_c),
-                                        make_tuple(4, 4, highbd_8_mse8x8_c)));
+    C, VpxHBDMseTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
+                      make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
 */
 
-const VarianceMxNFunc highbd_12_variance64x64_c = vpx_highbd_12_variance64x64_c;
-const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c;
-const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c;
-const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c;
-const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c;
-const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c;
-const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c;
-const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c;
-const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c;
-const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c;
-const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
-const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
-const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
-const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
-const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
-const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
-const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c;
-const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c;
-const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c;
-const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c;
-const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c;
-const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c;
-const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c;
-const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
-const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
-const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
-const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
-const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
-const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
-const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c;
-const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c;
-const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c;
-const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c;
-const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c;
-const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c;
-const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c;
-const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c;
-const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c;
-const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c;
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDVarianceTest,
-    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12),
-                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
-                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
-                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
-                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
-                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
-                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
-                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
-                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
-                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
-                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
-                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
-                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
-                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
-                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
-                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
-                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
-                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
-                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
-                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
-                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
-                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
-                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
-                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
-                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
-                      make_tuple(2, 2, highbd_10_variance4x4_c, 10),
-                      make_tuple(6, 6, highbd_8_variance64x64_c, 8),
-                      make_tuple(6, 5, highbd_8_variance64x32_c, 8),
-                      make_tuple(5, 6, highbd_8_variance32x64_c, 8),
-                      make_tuple(5, 5, highbd_8_variance32x32_c, 8),
-                      make_tuple(5, 4, highbd_8_variance32x16_c, 8),
-                      make_tuple(4, 5, highbd_8_variance16x32_c, 8),
-                      make_tuple(4, 4, highbd_8_variance16x16_c, 8),
-                      make_tuple(4, 3, highbd_8_variance16x8_c, 8),
-                      make_tuple(3, 4, highbd_8_variance8x16_c, 8),
-                      make_tuple(3, 3, highbd_8_variance8x8_c, 8),
-                      make_tuple(3, 2, highbd_8_variance8x4_c, 8),
-                      make_tuple(2, 3, highbd_8_variance4x8_c, 8),
-                      make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
-
-const SubpixVarMxNFunc highbd_8_subpel_var64x64_c =
-    vpx_highbd_8_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_8_subpel_var64x32_c =
-    vpx_highbd_8_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_8_subpel_var32x64_c =
-    vpx_highbd_8_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_8_subpel_var32x32_c =
-    vpx_highbd_8_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_8_subpel_var32x16_c =
-    vpx_highbd_8_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_8_subpel_var16x32_c =
-    vpx_highbd_8_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_8_subpel_var16x16_c =
-    vpx_highbd_8_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_8_subpel_var16x8_c =
-    vpx_highbd_8_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_8_subpel_var8x16_c =
-    vpx_highbd_8_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_8_subpel_var8x8_c =
-    vpx_highbd_8_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_8_subpel_var8x4_c =
-    vpx_highbd_8_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_8_subpel_var4x8_c =
-    vpx_highbd_8_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_8_subpel_var4x4_c =
-    vpx_highbd_8_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_var64x64_c =
-    vpx_highbd_10_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_10_subpel_var64x32_c =
-    vpx_highbd_10_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_var32x64_c =
-    vpx_highbd_10_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_10_subpel_var32x32_c =
-    vpx_highbd_10_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_var32x16_c =
-    vpx_highbd_10_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_var16x32_c =
-    vpx_highbd_10_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_var16x16_c =
-    vpx_highbd_10_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_var16x8_c =
-    vpx_highbd_10_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_var8x16_c =
-    vpx_highbd_10_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_var8x8_c =
-    vpx_highbd_10_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_var8x4_c =
-    vpx_highbd_10_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_var4x8_c =
-    vpx_highbd_10_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_var4x4_c =
-    vpx_highbd_10_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_var64x64_c =
-    vpx_highbd_12_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_var64x32_c =
-    vpx_highbd_12_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_var32x64_c =
-    vpx_highbd_12_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_var32x32_c =
-    vpx_highbd_12_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_var32x16_c =
-    vpx_highbd_12_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_var16x32_c =
-    vpx_highbd_12_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_var16x16_c =
-    vpx_highbd_12_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_var16x8_c =
-    vpx_highbd_12_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_var8x16_c =
-    vpx_highbd_12_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_var8x8_c =
-    vpx_highbd_12_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_var8x4_c =
-    vpx_highbd_12_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_var4x8_c =
-    vpx_highbd_12_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_var4x4_c =
-    vpx_highbd_12_sub_pixel_variance4x4_c;
+    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
+                      make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+                      make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+                      make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+                      make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+                      make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+                      make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
+                      make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
+                      make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
+                      make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
+                      make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
+                      make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
+                      make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
+                      make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
+                      make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
+                      make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
+                      make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
+                      make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
+                      make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8),
-                      make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8),
-                      make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8),
-                      make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8),
-                      make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8),
-                      make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8),
-                      make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8),
-                      make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8),
-                      make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8),
-                      make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8),
-                      make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8),
-                      make_tuple(2, 3, highbd_8_subpel_var4x8_c, 8),
-                      make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8),
-                      make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10),
-                      make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10),
-                      make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10),
-                      make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10),
-                      make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10),
-                      make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10),
-                      make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10),
-                      make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10),
-                      make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10),
-                      make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10),
-                      make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10),
-                      make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10),
-                      make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10),
-                      make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12),
-                      make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12),
-                      make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12),
-                      make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12),
-                      make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12),
-                      make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12),
-                      make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12),
-                      make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12),
-                      make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12),
-                      make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12),
-                      make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12),
-                      make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12),
-                      make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12)));
-
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c =
-    vpx_highbd_8_sub_pixel_avg_variance64x64_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c =
-    vpx_highbd_8_sub_pixel_avg_variance64x32_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c =
-    vpx_highbd_8_sub_pixel_avg_variance32x64_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c =
-    vpx_highbd_8_sub_pixel_avg_variance32x32_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c =
-    vpx_highbd_8_sub_pixel_avg_variance32x16_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c =
-    vpx_highbd_8_sub_pixel_avg_variance16x32_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c =
-    vpx_highbd_8_sub_pixel_avg_variance16x16_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c =
-    vpx_highbd_8_sub_pixel_avg_variance16x8_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c =
-    vpx_highbd_8_sub_pixel_avg_variance8x16_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c =
-    vpx_highbd_8_sub_pixel_avg_variance8x8_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c =
-    vpx_highbd_8_sub_pixel_avg_variance8x4_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c =
-    vpx_highbd_8_sub_pixel_avg_variance4x8_c;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c =
-    vpx_highbd_8_sub_pixel_avg_variance4x4_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c =
-    vpx_highbd_10_sub_pixel_avg_variance64x64_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c =
-    vpx_highbd_10_sub_pixel_avg_variance64x32_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c =
-    vpx_highbd_10_sub_pixel_avg_variance32x64_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c =
-    vpx_highbd_10_sub_pixel_avg_variance32x32_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c =
-    vpx_highbd_10_sub_pixel_avg_variance32x16_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c =
-    vpx_highbd_10_sub_pixel_avg_variance16x32_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c =
-    vpx_highbd_10_sub_pixel_avg_variance16x16_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c =
-    vpx_highbd_10_sub_pixel_avg_variance16x8_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c =
-    vpx_highbd_10_sub_pixel_avg_variance8x16_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c =
-    vpx_highbd_10_sub_pixel_avg_variance8x8_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c =
-    vpx_highbd_10_sub_pixel_avg_variance8x4_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c =
-    vpx_highbd_10_sub_pixel_avg_variance4x8_c;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c =
-    vpx_highbd_10_sub_pixel_avg_variance4x4_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c =
-    vpx_highbd_12_sub_pixel_avg_variance64x64_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c =
-    vpx_highbd_12_sub_pixel_avg_variance64x32_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c =
-    vpx_highbd_12_sub_pixel_avg_variance32x64_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c =
-    vpx_highbd_12_sub_pixel_avg_variance32x32_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c =
-    vpx_highbd_12_sub_pixel_avg_variance32x16_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c =
-    vpx_highbd_12_sub_pixel_avg_variance16x32_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c =
-    vpx_highbd_12_sub_pixel_avg_variance16x16_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c =
-    vpx_highbd_12_sub_pixel_avg_variance16x8_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c =
-    vpx_highbd_12_sub_pixel_avg_variance8x16_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c =
-    vpx_highbd_12_sub_pixel_avg_variance8x8_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c =
-    vpx_highbd_12_sub_pixel_avg_variance8x4_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c =
-    vpx_highbd_12_sub_pixel_avg_variance4x8_c;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c =
-    vpx_highbd_12_sub_pixel_avg_variance4x4_c;
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8),
-        make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8),
-        make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8),
-        make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8),
-        make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8),
-        make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8),
-        make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8),
-        make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8),
-        make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8),
-        make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8),
-        make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8),
-        make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8),
-        make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8),
-        make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10),
-        make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10),
-        make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10),
-        make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10),
-        make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10),
-        make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10),
-        make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10),
-        make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10),
-        make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10),
-        make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10),
-        make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10),
-        make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10),
-        make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10),
-        make_tuple(6, 6, highbd_12_subpel_avg_var64x64_c, 12),
-        make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12),
-        make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12),
-        make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12),
-        make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12),
-        make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12),
-        make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12),
-        make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12),
-        make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12),
-        make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12),
-        make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12),
-        make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12),
-        make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12)));
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_MMX
-const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx;
-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_mmx)));
-
-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_mmx));
-
-const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx;
-const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx;
-const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx;
-const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx;
-const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0),
-                      make_tuple(4, 3, variance16x8_mmx, 0),
-                      make_tuple(3, 4, variance8x16_mmx, 0),
-                      make_tuple(3, 3, variance8x8_mmx, 0),
-                      make_tuple(2, 2, variance4x4_mmx, 0)));
-
-const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx;
-const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx;
-const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx;
-const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx;
-const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0),
-                      make_tuple(4, 3, subpel_var16x8_mmx, 0),
-                      make_tuple(3, 4, subpel_var8x16_mmx, 0),
-                      make_tuple(3, 3, subpel_var8x8_mmx, 0),
-                      make_tuple(2, 2, subpel_var4x4_mmx, 0)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_sse2));
 
-const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2;
-const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2;
-const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2;
-const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2;
 INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_sse2),
-                                          make_tuple(4, 3, mse16x8_sse2),
-                                          make_tuple(3, 4, mse8x16_sse2),
-                                          make_tuple(3, 3, mse8x8_sse2)));
-
-const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2;
-const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2;
-const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2;
-const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2;
-const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2;
-const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2;
-const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2;
-const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2;
-const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2;
-const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2;
-const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2;
-const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2;
-const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2;
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_sse2),
+                                          make_tuple(4, 3, &vpx_mse16x8_sse2),
+                                          make_tuple(3, 4, &vpx_mse8x16_sse2),
+                                          make_tuple(3, 3, &vpx_mse8x8_sse2)));
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0),
-                      make_tuple(6, 5, variance64x32_sse2, 0),
-                      make_tuple(5, 6, variance32x64_sse2, 0),
-                      make_tuple(5, 5, variance32x32_sse2, 0),
-                      make_tuple(5, 4, variance32x16_sse2, 0),
-                      make_tuple(4, 5, variance16x32_sse2, 0),
-                      make_tuple(4, 4, variance16x16_sse2, 0),
-                      make_tuple(4, 3, variance16x8_sse2, 0),
-                      make_tuple(3, 4, variance8x16_sse2, 0),
-                      make_tuple(3, 3, variance8x8_sse2, 0),
-                      make_tuple(3, 2, variance8x4_sse2, 0),
-                      make_tuple(2, 3, variance4x8_sse2, 0),
-                      make_tuple(2, 2, variance4x4_sse2, 0)));
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_sse2, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_sse2, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_sse2, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_sse2, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_sse2, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_sse2, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_sse2, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_sse2, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_sse2, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_sse2, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_sse2, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_sse2, 0)));
 
 #if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance64x64_sse2 =
-    vpx_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc subpel_variance64x32_sse2 =
-    vpx_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x64_sse2 =
-    vpx_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc subpel_variance32x32_sse2 =
-    vpx_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x16_sse2 =
-    vpx_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x32_sse2 =
-    vpx_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc subpel_variance16x16_sse2 =
-    vpx_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x8_sse2 =
-    vpx_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc subpel_variance8x16_sse2 =
-    vpx_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse;
-const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0),
-                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
-                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
-                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
-                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
-                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
-                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
-                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
-                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
-                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
-                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
-                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
-                      make_tuple(2, 2, subpel_variance4x4_sse, 0)));
-
-const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 =
-    vpx_sub_pixel_avg_variance64x64_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 =
-    vpx_sub_pixel_avg_variance64x32_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 =
-    vpx_sub_pixel_avg_variance32x64_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 =
-    vpx_sub_pixel_avg_variance32x32_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 =
-    vpx_sub_pixel_avg_variance32x16_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 =
-    vpx_sub_pixel_avg_variance16x32_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 =
-    vpx_sub_pixel_avg_variance16x16_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 =
-    vpx_sub_pixel_avg_variance16x8_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 =
-    vpx_sub_pixel_avg_variance8x16_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 =
-    vpx_sub_pixel_avg_variance8x8_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 =
-    vpx_sub_pixel_avg_variance8x4_sse2;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse =
-    vpx_sub_pixel_avg_variance4x8_sse;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse =
-    vpx_sub_pixel_avg_variance4x4_sse;
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxSubpelAvgVarianceTest,
     ::testing::Values(
-                      make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
-                      make_tuple(2, 2, subpel_avg_variance4x4_sse, 0)));
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
-const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
-const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2;
-const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2;
-const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2;
-
-const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2;
-const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2;
-const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2;
-const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2;
-
-const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2;
-const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2;
-const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2;
-const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2),
-                                           make_tuple(4, 3, highbd_12_mse16x8_sse2),
-                                           make_tuple(3, 4, highbd_12_mse8x16_sse2),
-                                           make_tuple(3, 3, highbd_12_mse8x8_sse2),
-                                           make_tuple(4, 4, highbd_10_mse16x16_sse2),
-                                           make_tuple(4, 3, highbd_10_mse16x8_sse2),
-                                           make_tuple(3, 4, highbd_10_mse8x16_sse2),
-                                           make_tuple(3, 3, highbd_10_mse8x8_sse2),
-                                           make_tuple(4, 4, highbd_8_mse16x16_sse2),
-                                           make_tuple(4, 3, highbd_8_mse16x8_sse2),
-                                           make_tuple(3, 4, highbd_8_mse8x16_sse2),
-                                           make_tuple(3, 3, highbd_8_mse8x8_sse2)));
+    SSE2, VpxHBDMseTest,
+    ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_12_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_12_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_12_mse8x8_sse2),
+                      make_tuple(4, 4, &vpx_highbd_10_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_10_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_10_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_10_mse8x8_sse2),
+                      make_tuple(4, 4, &vpx_highbd_8_mse16x16_sse2),
+                      make_tuple(4, 3, &vpx_highbd_8_mse16x8_sse2),
+                      make_tuple(3, 4, &vpx_highbd_8_mse8x16_sse2),
+                      make_tuple(3, 3, &vpx_highbd_8_mse8x8_sse2)));
 */
 
-const VarianceMxNFunc highbd_12_variance64x64_sse2 =
-    vpx_highbd_12_variance64x64_sse2;
-const VarianceMxNFunc highbd_12_variance64x32_sse2 =
-    vpx_highbd_12_variance64x32_sse2;
-const VarianceMxNFunc highbd_12_variance32x64_sse2 =
-    vpx_highbd_12_variance32x64_sse2;
-const VarianceMxNFunc highbd_12_variance32x32_sse2 =
-    vpx_highbd_12_variance32x32_sse2;
-const VarianceMxNFunc highbd_12_variance32x16_sse2 =
-    vpx_highbd_12_variance32x16_sse2;
-const VarianceMxNFunc highbd_12_variance16x32_sse2 =
-    vpx_highbd_12_variance16x32_sse2;
-const VarianceMxNFunc highbd_12_variance16x16_sse2 =
-    vpx_highbd_12_variance16x16_sse2;
-const VarianceMxNFunc highbd_12_variance16x8_sse2 =
-    vpx_highbd_12_variance16x8_sse2;
-const VarianceMxNFunc highbd_12_variance8x16_sse2 =
-    vpx_highbd_12_variance8x16_sse2;
-const VarianceMxNFunc highbd_12_variance8x8_sse2 =
-    vpx_highbd_12_variance8x8_sse2;
-const VarianceMxNFunc highbd_10_variance64x64_sse2 =
-    vpx_highbd_10_variance64x64_sse2;
-const VarianceMxNFunc highbd_10_variance64x32_sse2 =
-    vpx_highbd_10_variance64x32_sse2;
-const VarianceMxNFunc highbd_10_variance32x64_sse2 =
-    vpx_highbd_10_variance32x64_sse2;
-const VarianceMxNFunc highbd_10_variance32x32_sse2 =
-    vpx_highbd_10_variance32x32_sse2;
-const VarianceMxNFunc highbd_10_variance32x16_sse2 =
-    vpx_highbd_10_variance32x16_sse2;
-const VarianceMxNFunc highbd_10_variance16x32_sse2 =
-    vpx_highbd_10_variance16x32_sse2;
-const VarianceMxNFunc highbd_10_variance16x16_sse2 =
-    vpx_highbd_10_variance16x16_sse2;
-const VarianceMxNFunc highbd_10_variance16x8_sse2 =
-    vpx_highbd_10_variance16x8_sse2;
-const VarianceMxNFunc highbd_10_variance8x16_sse2 =
-    vpx_highbd_10_variance8x16_sse2;
-const VarianceMxNFunc highbd_10_variance8x8_sse2 =
-    vpx_highbd_10_variance8x8_sse2;
-const VarianceMxNFunc highbd_8_variance64x64_sse2 =
-    vpx_highbd_8_variance64x64_sse2;
-const VarianceMxNFunc highbd_8_variance64x32_sse2 =
-    vpx_highbd_8_variance64x32_sse2;
-const VarianceMxNFunc highbd_8_variance32x64_sse2 =
-    vpx_highbd_8_variance32x64_sse2;
-const VarianceMxNFunc highbd_8_variance32x32_sse2 =
-    vpx_highbd_8_variance32x32_sse2;
-const VarianceMxNFunc highbd_8_variance32x16_sse2 =
-    vpx_highbd_8_variance32x16_sse2;
-const VarianceMxNFunc highbd_8_variance16x32_sse2 =
-    vpx_highbd_8_variance16x32_sse2;
-const VarianceMxNFunc highbd_8_variance16x16_sse2 =
-    vpx_highbd_8_variance16x16_sse2;
-const VarianceMxNFunc highbd_8_variance16x8_sse2 =
-    vpx_highbd_8_variance16x8_sse2;
-const VarianceMxNFunc highbd_8_variance8x16_sse2 =
-    vpx_highbd_8_variance8x16_sse2;
-const VarianceMxNFunc highbd_8_variance8x8_sse2 =
-    vpx_highbd_8_variance8x8_sse2;
-
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDVarianceTest,
-    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
-                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
-                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
-                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
-                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
-                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
-                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
-                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
-                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
-                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
-                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
-                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
-                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
-                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
-                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
-                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
-                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
-                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
-                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
-                      make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
-                      make_tuple(6, 6, highbd_8_variance64x64_sse2, 8),
-                      make_tuple(6, 5, highbd_8_variance64x32_sse2, 8),
-                      make_tuple(5, 6, highbd_8_variance32x64_sse2, 8),
-                      make_tuple(5, 5, highbd_8_variance32x32_sse2, 8),
-                      make_tuple(5, 4, highbd_8_variance32x16_sse2, 8),
-                      make_tuple(4, 5, highbd_8_variance16x32_sse2, 8),
-                      make_tuple(4, 4, highbd_8_variance16x16_sse2, 8),
-                      make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
-                      make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
-                      make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
+    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_sse2, 12),
+                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
+                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
+                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 4, &vpx_highbd_12_variance32x16_sse2, 12),
+                      make_tuple(4, 5, &vpx_highbd_12_variance16x32_sse2, 12),
+                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
+                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
+                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
+                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
+                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
+                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
+                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
+                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
+                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
+                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
+                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
+                      make_tuple(6, 6, &vpx_highbd_8_variance64x64_sse2, 8),
+                      make_tuple(6, 5, &vpx_highbd_8_variance64x32_sse2, 8),
+                      make_tuple(5, 6, &vpx_highbd_8_variance32x64_sse2, 8),
+                      make_tuple(5, 5, &vpx_highbd_8_variance32x32_sse2, 8),
+                      make_tuple(5, 4, &vpx_highbd_8_variance32x16_sse2, 8),
+                      make_tuple(4, 5, &vpx_highbd_8_variance16x32_sse2, 8),
+                      make_tuple(4, 4, &vpx_highbd_8_variance16x16_sse2, 8),
+                      make_tuple(4, 3, &vpx_highbd_8_variance16x8_sse2, 8),
+                      make_tuple(3, 4, &vpx_highbd_8_variance8x16_sse2, 8),
+                      make_tuple(3, 3, &vpx_highbd_8_variance8x8_sse2, 8)));
 
 #if CONFIG_USE_X86INC
-const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 =
-    vpx_highbd_12_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
-    vpx_highbd_12_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
-    vpx_highbd_12_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
-    vpx_highbd_12_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
-    vpx_highbd_12_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
-    vpx_highbd_12_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
-    vpx_highbd_12_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
-    vpx_highbd_12_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
-    vpx_highbd_12_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
-    vpx_highbd_12_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
-    vpx_highbd_12_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
-    vpx_highbd_10_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
-    vpx_highbd_10_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
-    vpx_highbd_10_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
-    vpx_highbd_10_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
-    vpx_highbd_10_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
-    vpx_highbd_10_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
-    vpx_highbd_10_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
-    vpx_highbd_10_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
-    vpx_highbd_10_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
-    vpx_highbd_10_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
-    vpx_highbd_10_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 =
-    vpx_highbd_8_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 =
-    vpx_highbd_8_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 =
-    vpx_highbd_8_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 =
-    vpx_highbd_8_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 =
-    vpx_highbd_8_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 =
-    vpx_highbd_8_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 =
-    vpx_highbd_8_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 =
-    vpx_highbd_8_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 =
-    vpx_highbd_8_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 =
-    vpx_highbd_8_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 =
-    vpx_highbd_8_sub_pixel_variance8x4_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
-                      make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
-                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
-                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
-                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
-                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
-                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
-                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
-                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
-                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
-                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
-                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
-                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
-                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
-                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
-                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
-                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
-                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
-                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
-                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
-                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
-                      make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
-                      make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8),
-                      make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8),
-                      make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8),
-                      make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8),
-                      make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8),
-                      make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8),
-                      make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8),
-                      make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8),
-                      make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8),
-                      make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8),
-                      make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8)));
-
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance64x64_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance64x32_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance32x64_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance32x32_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance32x16_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance16x32_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance16x16_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance16x8_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance8x16_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance8x8_sse2;
-const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 =
-    vpx_highbd_12_sub_pixel_avg_variance8x4_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance64x64_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance64x32_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance32x64_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance32x32_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance32x16_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance16x32_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance16x16_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance16x8_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance8x16_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance8x8_sse2;
-const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 =
-    vpx_highbd_10_sub_pixel_avg_variance8x4_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance64x64_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance64x32_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance32x64_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance32x32_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance32x16_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance16x32_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance16x16_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance16x8_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance8x16_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance8x8_sse2;
-const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 =
-    vpx_highbd_8_sub_pixel_avg_variance8x4_sse2;
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
-        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
-        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
-        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
-        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
-        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
-        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
-        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
-        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
-        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
-        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
-        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
-        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
-        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
-        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
-        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
-        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
-        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
-        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
-        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
-        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
-        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
-        make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8),
-        make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8),
-        make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8),
-        make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8),
-        make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8),
-        make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8),
-        make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8),
-        make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8),
-        make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8),
-        make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8),
-        make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8)));
+        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
+        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
+        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
+        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
+        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
+        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
+        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
+        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
+        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
+        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
+        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
+        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
+        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
+        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
+        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
+        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
+        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
+        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
+        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
+        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
+        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
+        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
+        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
+        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
+        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
+        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
+        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
+        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
+        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
+        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
+        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
+        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
+        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance64x64_ssse3 =
-    vpx_sub_pixel_variance64x64_ssse3;
-const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
-    vpx_sub_pixel_variance64x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
-    vpx_sub_pixel_variance32x64_ssse3;
-const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
-    vpx_sub_pixel_variance32x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
-    vpx_sub_pixel_variance32x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
-    vpx_sub_pixel_variance16x32_ssse3;
-const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
-    vpx_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
-    vpx_sub_pixel_variance16x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
-    vpx_sub_pixel_variance8x16_ssse3;
-const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
-    vpx_sub_pixel_variance8x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
-    vpx_sub_pixel_variance8x4_ssse3;
-const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
-    vpx_sub_pixel_variance4x8_ssse3;
-const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
-    vpx_sub_pixel_variance4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0),
-                      make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
-                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
-                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
-                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
-                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
-                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
-                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
-                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
-                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
-                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
-                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
-                      make_tuple(2, 2, subpel_variance4x4_ssse3, 0)));
-
-const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 =
-    vpx_sub_pixel_avg_variance64x64_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 =
-    vpx_sub_pixel_avg_variance64x32_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 =
-    vpx_sub_pixel_avg_variance32x64_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 =
-    vpx_sub_pixel_avg_variance32x32_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 =
-    vpx_sub_pixel_avg_variance32x16_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 =
-    vpx_sub_pixel_avg_variance16x32_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 =
-    vpx_sub_pixel_avg_variance16x16_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 =
-    vpx_sub_pixel_avg_variance16x8_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 =
-    vpx_sub_pixel_avg_variance8x16_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 =
-    vpx_sub_pixel_avg_variance8x8_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 =
-    vpx_sub_pixel_avg_variance8x4_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 =
-    vpx_sub_pixel_avg_variance4x8_ssse3;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 =
-    vpx_sub_pixel_avg_variance4x4_ssse3;
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
-                      make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0)));
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
+        make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
+        make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
+        make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
+        make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
+        make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
+        make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+        make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+        make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+        make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+        make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+        make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX2
-const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
 INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_avx2)));
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_avx2)));
 
-const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2;
-const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2;
-const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2;
-const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2;
-const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, variance64x64_avx2, 0),
-                      make_tuple(6, 5, variance64x32_avx2, 0),
-                      make_tuple(5, 5, variance32x32_avx2, 0),
-                      make_tuple(5, 4, variance32x16_avx2, 0),
-                      make_tuple(4, 4, variance16x16_avx2, 0)));
-
-const SubpixVarMxNFunc subpel_variance64x64_avx2 =
-    vpx_sub_pixel_variance64x64_avx2;
-const SubpixVarMxNFunc subpel_variance32x32_avx2 =
-    vpx_sub_pixel_variance32x32_avx2;
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_avx2, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_avx2, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_avx2, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_avx2, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_avx2, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0),
-                      make_tuple(5, 5, subpel_variance32x32_avx2, 0)));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
 
-const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 =
-    vpx_sub_pixel_avg_variance64x64_avx2;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 =
-    vpx_sub_pixel_avg_variance32x32_avx2;
 INSTANTIATE_TEST_CASE_P(
     AVX2, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0)));
+    ::testing::Values(
+        make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+        make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
 #endif  // HAVE_AVX2
 
 #if HAVE_MEDIA
-const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
 INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_media)));
 
-const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
-const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
 INSTANTIATE_TEST_CASE_P(
     MEDIA, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
-                      make_tuple(3, 3, variance8x8_media, 0)));
+    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_media, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_media, 0)));
 
-const SubpixVarMxNFunc subpel_variance16x16_media =
-    vpx_sub_pixel_variance16x16_media;
-const SubpixVarMxNFunc subpel_variance8x8_media =
-    vpx_sub_pixel_variance8x8_media;
 INSTANTIATE_TEST_CASE_P(
     MEDIA, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0),
-                      make_tuple(3, 3, subpel_variance8x8_media, 0)));
+    ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_media, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_media, 0)));
 #endif  // HAVE_MEDIA
 
 #if HAVE_NEON
-const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
 INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
-                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_neon)));
 
-const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon;
 INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
-
-const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon;
-const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon;
-const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon;
-const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon;
-const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon;
-const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon;
-const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon;
-const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon;
+                        ::testing::Values(make_tuple(4, 4,
+                                                     &vpx_mse16x16_neon)));
+
 INSTANTIATE_TEST_CASE_P(
     NEON, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0),
-                      make_tuple(6, 5, variance64x32_neon, 0),
-                      make_tuple(5, 6, variance32x64_neon, 0),
-                      make_tuple(5, 5, variance32x32_neon, 0),
-                      make_tuple(4, 4, variance16x16_neon, 0),
-                      make_tuple(4, 3, variance16x8_neon, 0),
-                      make_tuple(3, 4, variance8x16_neon, 0),
-                      make_tuple(3, 3, variance8x8_neon, 0)));
-
-const SubpixVarMxNFunc subpel_variance64x64_neon =
-    vpx_sub_pixel_variance64x64_neon;
-const SubpixVarMxNFunc subpel_variance32x32_neon =
-    vpx_sub_pixel_variance32x32_neon;
-const SubpixVarMxNFunc subpel_variance16x16_neon =
-    vpx_sub_pixel_variance16x16_neon;
-const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon;
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_neon, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_neon, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_neon, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_neon, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_neon, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_neon, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_neon, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_neon, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     NEON, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0),
-                      make_tuple(5, 5, subpel_variance32x32_neon, 0),
-                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
-                      make_tuple(3, 3, subpel_variance8x8_neon, 0)));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0)));
 #endif  // HAVE_NEON
 
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_msa));
 
-const Get4x4SseFunc get4x4sse_cs_msa = vpx_get4x4sse_cs_msa;
 INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest,
-                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_msa)));
+                        ::testing::Values(make_tuple(2, 2,
+                                                     &vpx_get4x4sse_cs_msa)));
 
-const VarianceMxNFunc mse16x16_msa = vpx_mse16x16_msa;
-const VarianceMxNFunc mse16x8_msa = vpx_mse16x8_msa;
-const VarianceMxNFunc mse8x16_msa = vpx_mse8x16_msa;
-const VarianceMxNFunc mse8x8_msa = vpx_mse8x8_msa;
 INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_msa),
-                                          make_tuple(4, 3, mse16x8_msa),
-                                          make_tuple(3, 4, mse8x16_msa),
-                                          make_tuple(3, 3, mse8x8_msa)));
-
-const VarianceMxNFunc variance64x64_msa = vpx_variance64x64_msa;
-const VarianceMxNFunc variance64x32_msa = vpx_variance64x32_msa;
-const VarianceMxNFunc variance32x64_msa = vpx_variance32x64_msa;
-const VarianceMxNFunc variance32x32_msa = vpx_variance32x32_msa;
-const VarianceMxNFunc variance32x16_msa = vpx_variance32x16_msa;
-const VarianceMxNFunc variance16x32_msa = vpx_variance16x32_msa;
-const VarianceMxNFunc variance16x16_msa = vpx_variance16x16_msa;
-const VarianceMxNFunc variance16x8_msa = vpx_variance16x8_msa;
-const VarianceMxNFunc variance8x16_msa = vpx_variance8x16_msa;
-const VarianceMxNFunc variance8x8_msa = vpx_variance8x8_msa;
-const VarianceMxNFunc variance8x4_msa = vpx_variance8x4_msa;
-const VarianceMxNFunc variance4x8_msa = vpx_variance4x8_msa;
-const VarianceMxNFunc variance4x4_msa = vpx_variance4x4_msa;
+                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_msa),
+                                          make_tuple(4, 3, &vpx_mse16x8_msa),
+                                          make_tuple(3, 4, &vpx_mse8x16_msa),
+                                          make_tuple(3, 3, &vpx_mse8x8_msa)));
+
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, variance64x64_msa, 0),
-                      make_tuple(6, 5, variance64x32_msa, 0),
-                      make_tuple(5, 6, variance32x64_msa, 0),
-                      make_tuple(5, 5, variance32x32_msa, 0),
-                      make_tuple(5, 4, variance32x16_msa, 0),
-                      make_tuple(4, 5, variance16x32_msa, 0),
-                      make_tuple(4, 4, variance16x16_msa, 0),
-                      make_tuple(4, 3, variance16x8_msa, 0),
-                      make_tuple(3, 4, variance8x16_msa, 0),
-                      make_tuple(3, 3, variance8x8_msa, 0),
-                      make_tuple(3, 2, variance8x4_msa, 0),
-                      make_tuple(2, 3, variance4x8_msa, 0),
-                      make_tuple(2, 2, variance4x4_msa, 0)));
-
-const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa;
-const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa;
-const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa;
-const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa;
-const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa;
-const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa;
-const SubpixVarMxNFunc subpel_variance16x16_msa =
-    vpx_sub_pixel_variance16x16_msa;
-const SubpixVarMxNFunc subpel_variance16x32_msa =
-    vpx_sub_pixel_variance16x32_msa;
-const SubpixVarMxNFunc subpel_variance32x16_msa =
-    vpx_sub_pixel_variance32x16_msa;
-const SubpixVarMxNFunc subpel_variance32x32_msa =
-    vpx_sub_pixel_variance32x32_msa;
-const SubpixVarMxNFunc subpel_variance32x64_msa =
-    vpx_sub_pixel_variance32x64_msa;
-const SubpixVarMxNFunc subpel_variance64x32_msa =
-    vpx_sub_pixel_variance64x32_msa;
-const SubpixVarMxNFunc subpel_variance64x64_msa =
-    vpx_sub_pixel_variance64x64_msa;
+    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_msa, 0),
+                      make_tuple(6, 5, &vpx_variance64x32_msa, 0),
+                      make_tuple(5, 6, &vpx_variance32x64_msa, 0),
+                      make_tuple(5, 5, &vpx_variance32x32_msa, 0),
+                      make_tuple(5, 4, &vpx_variance32x16_msa, 0),
+                      make_tuple(4, 5, &vpx_variance16x32_msa, 0),
+                      make_tuple(4, 4, &vpx_variance16x16_msa, 0),
+                      make_tuple(4, 3, &vpx_variance16x8_msa, 0),
+                      make_tuple(3, 4, &vpx_variance8x16_msa, 0),
+                      make_tuple(3, 3, &vpx_variance8x8_msa, 0),
+                      make_tuple(3, 2, &vpx_variance8x4_msa, 0),
+                      make_tuple(2, 3, &vpx_variance4x8_msa, 0),
+                      make_tuple(2, 2, &vpx_variance4x4_msa, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
-                      make_tuple(2, 3, subpel_variance4x8_msa, 0),
-                      make_tuple(3, 2, subpel_variance8x4_msa, 0),
-                      make_tuple(3, 3, subpel_variance8x8_msa, 0),
-                      make_tuple(3, 4, subpel_variance8x16_msa, 0),
-                      make_tuple(4, 3, subpel_variance16x8_msa, 0),
-                      make_tuple(4, 4, subpel_variance16x16_msa, 0),
-                      make_tuple(4, 5, subpel_variance16x32_msa, 0),
-                      make_tuple(5, 4, subpel_variance32x16_msa, 0),
-                      make_tuple(5, 5, subpel_variance32x32_msa, 0),
-                      make_tuple(5, 6, subpel_variance32x64_msa, 0),
-                      make_tuple(6, 5, subpel_variance64x32_msa, 0),
-                      make_tuple(6, 6, subpel_variance64x64_msa, 0)));
-
-const SubpixAvgVarMxNFunc subpel_avg_variance64x64_msa =
-    vpx_sub_pixel_avg_variance64x64_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance64x32_msa =
-    vpx_sub_pixel_avg_variance64x32_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x64_msa =
-    vpx_sub_pixel_avg_variance32x64_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x32_msa =
-    vpx_sub_pixel_avg_variance32x32_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance32x16_msa =
-    vpx_sub_pixel_avg_variance32x16_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x32_msa =
-    vpx_sub_pixel_avg_variance16x32_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x16_msa =
-    vpx_sub_pixel_avg_variance16x16_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance16x8_msa =
-    vpx_sub_pixel_avg_variance16x8_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x16_msa =
-    vpx_sub_pixel_avg_variance8x16_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x8_msa =
-    vpx_sub_pixel_avg_variance8x8_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance8x4_msa =
-    vpx_sub_pixel_avg_variance8x4_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x8_msa =
-    vpx_sub_pixel_avg_variance4x8_msa;
-const SubpixAvgVarMxNFunc subpel_avg_variance4x4_msa =
-    vpx_sub_pixel_avg_variance4x4_msa;
+    ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+                      make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_msa, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_msa, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_msa, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_msa, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_msa, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_msa, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_msa, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_msa, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_msa, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_msa, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_msa, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_msa, 0),
-                      make_tuple(2, 2, subpel_avg_variance4x4_msa, 0)));
+    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
 #endif  // HAVE_MSA
 }  // namespace
diff --git a/test/vp10_dct_test.cc b/test/vp10_dct_test.cc
deleted file mode 100644
index b2c301a..0000000
--- a/test/vp10_dct_test.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <new>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "./vpx_config.h"
-#include "vpx_ports/msvc.h"
-
-#undef CONFIG_COEFFICIENT_RANGE_CHECKING
-#define CONFIG_COEFFICIENT_RANGE_CHECKING 1
-#include "vp10/encoder/dct.c"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-void reference_dct_1d(const double *in, double *out, int size) {
-  const double PI = 3.141592653589793238462643383279502884;
-  const double kInvSqrt2 = 0.707106781186547524400844362104;
-  for (int k = 0; k < size; ++k) {
-    out[k] = 0;
-    for (int n = 0; n < size; ++n) {
-      out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
-    }
-    if (k == 0)
-      out[k] = out[k] * kInvSqrt2;
-  }
-}
-
-typedef void (*FdctFuncRef)(const double *in, double *out, int size);
-typedef void (*IdctFuncRef)(const double *in, double *out, int size);
-typedef void (*FdctFunc)(const tran_low_t *in, tran_low_t *out);
-typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
-
-class TransTestBase {
- public:
-  virtual ~TransTestBase() {}
-
- protected:
-  void RunFwdAccuracyCheck() {
-    tran_low_t *input  = new tran_low_t[txfm_size_];
-    tran_low_t *output = new tran_low_t[txfm_size_];
-    double *ref_input  = new double[txfm_size_];
-    double *ref_output = new double[txfm_size_];
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    for (int ti =  0; ti < count_test_block; ++ti) {
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        input[ni] = rnd.Rand8() - rnd.Rand8();
-        ref_input[ni] = static_cast<double>(input[ni]);
-      }
-
-      fwd_txfm_(input, output);
-      fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
-
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        EXPECT_LE(
-            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
-            max_error_);
-      }
-    }
-
-    delete[] input;
-    delete[] output;
-    delete[] ref_input;
-    delete[] ref_output;
-  }
-
-  double max_error_;
-  int txfm_size_;
-  FdctFunc fwd_txfm_;
-  FdctFuncRef fwd_txfm_ref_;
-};
-
-typedef std::tr1::tuple<FdctFunc, FdctFuncRef, int, int> FdctParam;
-class Vp10FwdTxfm
-    : public TransTestBase,
-      public ::testing::TestWithParam<FdctParam> {
- public:
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = GET_PARAM(1);
-    txfm_size_ = GET_PARAM(2);
-    max_error_ = GET_PARAM(3);
-  }
-  virtual void TearDown() {}
-};
-
-TEST_P(Vp10FwdTxfm, RunFwdAccuracyCheck) {
-  RunFwdAccuracyCheck();
-}
-
-INSTANTIATE_TEST_CASE_P(
-    C, Vp10FwdTxfm,
-    ::testing::Values(
-        FdctParam(&fdct4, &reference_dct_1d, 4, 1),
-        FdctParam(&fdct8, &reference_dct_1d, 8, 1),
-        FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
-}  // namespace
diff --git a/test/vp10_inv_txfm_test.cc b/test/vp10_inv_txfm_test.cc
deleted file mode 100644
index c49081e..0000000
--- a/test/vp10_inv_txfm_test.cc
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp10_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp10/common/blockd.h"
-#include "vp10/common/scan.h"
-#include "vpx/vpx_integer.h"
-#include "vp10/common/vp10_inv_txfm.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-const double PI = 3.141592653589793238462643383279502884;
-const double kInvSqrt2 = 0.707106781186547524400844362104;
-
-void reference_idct_1d(const double *in, double *out, int size) {
-  for (int n = 0; n < size; ++n) {
-    out[n] = 0;
-    for (int k = 0; k < size; ++k) {
-      if (k == 0)
-        out[n] += kInvSqrt2 * in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
-      else
-        out[n] += in[k] * cos(PI * (2 * n + 1) * k / (2 * size));
-    }
-  }
-}
-
-typedef void (*IdctFuncRef)(const double *in, double *out, int size);
-typedef void (*IdctFunc)(const tran_low_t *in, tran_low_t *out);
-
-class TransTestBase {
- public:
-  virtual ~TransTestBase() {}
-
- protected:
-  void RunInvAccuracyCheck() {
-    tran_low_t *input  = new tran_low_t[txfm_size_];
-    tran_low_t *output = new tran_low_t[txfm_size_];
-    double *ref_input  = new double[txfm_size_];
-    double *ref_output = new double[txfm_size_];
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    for (int ti =  0; ti < count_test_block; ++ti) {
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        input[ni] = rnd.Rand8() - rnd.Rand8();
-        ref_input[ni] = static_cast<double>(input[ni]);
-      }
-
-      fwd_txfm_(input, output);
-      fwd_txfm_ref_(ref_input, ref_output, txfm_size_);
-
-      for (int ni = 0; ni < txfm_size_; ++ni) {
-        EXPECT_LE(
-            abs(output[ni] - static_cast<tran_low_t>(round(ref_output[ni]))),
-            max_error_);
-      }
-    }
-
-    delete[] input;
-    delete[] output;
-    delete[] ref_input;
-    delete[] ref_output;
-  }
-
-  double max_error_;
-  int txfm_size_;
-  IdctFunc fwd_txfm_;
-  IdctFuncRef fwd_txfm_ref_;
-};
-
-typedef std::tr1::tuple<IdctFunc, IdctFuncRef, int, int> IdctParam;
-class Vp10InvTxfm
-    : public TransTestBase,
-      public ::testing::TestWithParam<IdctParam> {
- public:
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    fwd_txfm_ref_ = GET_PARAM(1);
-    txfm_size_ = GET_PARAM(2);
-    max_error_ = GET_PARAM(3);
-  }
-  virtual void TearDown() {}
-};
-
-TEST_P(Vp10InvTxfm, RunInvAccuracyCheck) {
-  RunInvAccuracyCheck();
-}
-
-INSTANTIATE_TEST_CASE_P(
-    C, Vp10InvTxfm,
-    ::testing::Values(
-        IdctParam(&vp10_idct4_c, &reference_idct_1d, 4, 1),
-        IdctParam(&vp10_idct8_c, &reference_idct_1d, 8, 2),
-        IdctParam(&vp10_idct16_c, &reference_idct_1d, 16, 4),
-        IdctParam(&vp10_idct32_c, &reference_idct_1d, 32, 6))
-);
-
-typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
-typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<FwdTxfmFunc,
-                        InvTxfmFunc,
-                        InvTxfmFunc,
-                        TX_SIZE, int> PartialInvTxfmParam;
-const int kMaxNumCoeffs = 1024;
-class Vp10PartialIDctTest
-    : public ::testing::TestWithParam<PartialInvTxfmParam> {
- public:
-  virtual ~Vp10PartialIDctTest() {}
-  virtual void SetUp() {
-    ftxfm_ = GET_PARAM(0);
-    full_itxfm_ = GET_PARAM(1);
-    partial_itxfm_ = GET_PARAM(2);
-    tx_size_  = GET_PARAM(3);
-    last_nonzero_ = GET_PARAM(4);
-  }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
-  int last_nonzero_;
-  TX_SIZE tx_size_;
-  FwdTxfmFunc ftxfm_;
-  InvTxfmFunc full_itxfm_;
-  InvTxfmFunc partial_itxfm_;
-};
-
-TEST_P(Vp10PartialIDctTest, RunQuantCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4:
-      size = 4;
-      break;
-    case TX_8X8:
-      size = 8;
-      break;
-    case TX_16X16:
-      size = 16;
-      break;
-    case TX_32X32:
-      size = 32;
-      break;
-    default:
-      FAIL() << "Wrong Size!";
-      break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
-
-  const int count_test_block = 1000;
-  const int block_size = size * size;
-
-  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
-
-  int max_error = 0;
-  for (int i = 0; i < count_test_block; ++i) {
-    // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
-
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
-      if (i == 0) {
-        for (int j = 0; j < block_size; ++j)
-          input_extreme_block[j] = 255;
-      } else if (i == 1) {
-        for (int j = 0; j < block_size; ++j)
-          input_extreme_block[j] = -255;
-      } else {
-        for (int j = 0; j < block_size; ++j) {
-          input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
-        }
-      }
-
-      ftxfm_(input_extreme_block, output_ref_block, size);
-
-      // quantization with maximum allowed step sizes
-      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
-      for (int j = 1; j < last_nonzero_; ++j)
-        test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
-                         = (output_ref_block[j] / 1828) * 1828;
-    }
-
-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
-
-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-    }
-  }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
-}
-
-TEST_P(Vp10PartialIDctTest, ResultsMatch) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4:
-      size = 4;
-      break;
-    case TX_8X8:
-      size = 8;
-      break;
-    case TX_16X16:
-      size = 16;
-      break;
-    case TX_32X32:
-      size = 32;
-      break;
-    default:
-      FAIL() << "Wrong Size!";
-      break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
-  const int count_test_block = 1000;
-  const int max_coeff = 32766 / 4;
-  const int block_size = size * size;
-  int max_error = 0;
-  for (int i = 0; i < count_test_block; ++i) {
-    // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
-    int max_energy_leftover = max_coeff * max_coeff;
-    for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                          (rnd.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coef * coef;
-      if (max_energy_leftover < 0) {
-        max_energy_leftover = 0;
-        coef = 0;
-      }
-      test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
-    }
-
-    memcpy(test_coef_block2, test_coef_block1,
-           sizeof(*test_coef_block2) * block_size);
-
-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
-
-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-    }
-  }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
-}
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, Vp10PartialIDctTest,
-    ::testing::Values(
-        make_tuple(&vpx_fdct32x32_c,
-                   &vp10_idct32x32_1024_add_c,
-                   &vp10_idct32x32_34_add_c,
-                   TX_32X32, 34),
-        make_tuple(&vpx_fdct32x32_c,
-                   &vp10_idct32x32_1024_add_c,
-                   &vp10_idct32x32_1_add_c,
-                   TX_32X32, 1),
-        make_tuple(&vpx_fdct16x16_c,
-                   &vp10_idct16x16_256_add_c,
-                   &vp10_idct16x16_10_add_c,
-                   TX_16X16, 10),
-        make_tuple(&vpx_fdct16x16_c,
-                   &vp10_idct16x16_256_add_c,
-                   &vp10_idct16x16_1_add_c,
-                   TX_16X16, 1),
-        make_tuple(&vpx_fdct8x8_c,
-                   &vp10_idct8x8_64_add_c,
-                   &vp10_idct8x8_12_add_c,
-                   TX_8X8, 12),
-        make_tuple(&vpx_fdct8x8_c,
-                   &vp10_idct8x8_64_add_c,
-                   &vp10_idct8x8_1_add_c,
-                   TX_8X8, 1),
-        make_tuple(&vpx_fdct4x4_c,
-                   &vp10_idct4x4_16_add_c,
-                   &vp10_idct4x4_1_add_c,
-                   TX_4X4, 1)));
-}  // namespace
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 89200d4..aa3e34d 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc
@@ -229,24 +229,4 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
-
-#if CONFIG_VP9_HIGHBITDEPTH
-# if CONFIG_VP10_ENCODER
-// TODO(angiebird): 25-29 fail in high bitdepth mode.
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_VP10, ArfFreqTest,
-    ::testing::Combine(
-        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
-            &libvpx_test::kVP10)),
-        ::testing::ValuesIn(kTestVectors),
-        ::testing::ValuesIn(kEncodeVectors),
-        ::testing::ValuesIn(kMinArfVectors)));
-# endif  // CONFIG_VP10_ENCODER
-#else
-VP10_INSTANTIATE_TEST_CASE(
-    ArfFreqTest,
-    ::testing::ValuesIn(kTestVectors),
-    ::testing::ValuesIn(kEncodeVectors),
-    ::testing::ValuesIn(kMinArfVectors));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/vp9_denoiser_sse2_test.cc b/test/vp9_denoiser_sse2_test.cc
index 17c799d..c84d7ff 100644
--- a/test/vp9_denoiser_sse2_test.cc
+++ b/test/vp9_denoiser_sse2_test.cc
@@ -94,8 +94,7 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
 // Test for all block size.
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9DenoiserTest,
-    ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
-                      BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32,
-                      BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-                      BLOCK_64X64));
+    ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+                      BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
+                      BLOCK_64X32, BLOCK_64X64));
 }  // namespace
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
index 3ef6022..bd84098 100644
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -45,9 +45,9 @@ struct EncodeParameters {
 };
 
 const EncodeParameters kVP9EncodeParameterSet[] = {
-  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601},
-  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709},
-  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020},
+  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601, { 0, 0 }},
+  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709, { 0, 0 }},
+  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020, { 0, 0 }},
   {0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 }},
   // TODO(JBB): Test profiles (requires more work).
 };
@@ -93,7 +93,7 @@ class VpxEncoderParmsGetToDecoder
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource &video,
+                                  const libvpx_test::VideoSource & /*video*/,
                                   libvpx_test::Decoder *decoder) {
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc
index be1fa68..666919f 100644
--- a/test/vp9_end_to_end_test.cc
+++ b/test/vp9_end_to_end_test.cc
@@ -186,24 +186,4 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::ValuesIn(kEncodingModeVectors),
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
-
-#if CONFIG_VP9_HIGHBITDEPTH
-# if CONFIG_VP10_ENCODER
-// TODO(angiebird): many fail in high bitdepth mode.
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_VP10, EndToEndTestLarge,
-    ::testing::Combine(
-        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
-            &libvpx_test::kVP10)),
-        ::testing::ValuesIn(kEncodingModeVectors),
-        ::testing::ValuesIn(kTestVectors),
-        ::testing::ValuesIn(kCpuUsedVectors)));
-# endif  // CONFIG_VP10_ENCODER
-#else
-VP10_INSTANTIATE_TEST_CASE(
-    EndToEndTestLarge,
-    ::testing::ValuesIn(kEncodingModeVectors),
-    ::testing::ValuesIn(kTestVectors),
-    ::testing::ValuesIn(kCpuUsedVectors));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index 77b12ea..23a249e 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -164,7 +164,7 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
                                            int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }
 
@@ -173,7 +173,7 @@ int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
 }
 
@@ -195,7 +195,7 @@ int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
 }
 
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 63f6dfe..62b9109 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -29,16 +29,9 @@ class VPxEncoderThreadTest
         encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-    cfg.w = 1280;
-    cfg.h = 720;
-    decoder_ = codec_->CreateDecoder(cfg, 0);
-
     md5_.clear();
   }
-  virtual ~VPxEncoderThreadTest() {
-    delete decoder_;
-  }
+  virtual ~VPxEncoderThreadTest() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -48,7 +41,7 @@ class VPxEncoderThreadTest
       cfg_.g_lag_in_frames = 3;
       cfg_.rc_end_usage = VPX_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
-      cfg_.rc_2pass_vbr_minsection_pct = 2000;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
     } else {
       cfg_.g_lag_in_frames = 0;
       cfg_.rc_end_usage = VPX_CBR;
@@ -62,7 +55,7 @@ class VPxEncoderThreadTest
     encoder_initialized_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
                                   ::libvpx_test::Encoder *encoder) {
     if (!encoder_initialized_) {
       // Encode 4 column tiles.
@@ -81,27 +74,28 @@ class VPxEncoderThreadTest
     }
   }
 
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    const vpx_codec_err_t res = decoder_->DecodeFrame(
-        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t /*pts*/) {
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    md5_.push_back(md5_res.Get());
+  }
+
+  virtual bool HandleDecodeResult(const vpx_codec_err_t res,
+                                  const libvpx_test::VideoSource& /*video*/,
+                                  libvpx_test::Decoder * /*decoder*/) {
     if (res != VPX_CODEC_OK) {
-      abort_ = true;
-      ASSERT_EQ(VPX_CODEC_OK, res);
+      EXPECT_EQ(VPX_CODEC_OK, res);
+      return false;
     }
-    const vpx_image_t *img = decoder_->GetDxData().Next();
 
-    if (img) {
-      ::libvpx_test::MD5 md5_res;
-      md5_res.Add(img);
-      md5_.push_back(md5_res.Get());
-    }
+    return true;
   }
 
   bool encoder_initialized_;
   int tiles_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
-  ::libvpx_test::Decoder *decoder_;
   std::vector<std::string> md5_;
 };
 
@@ -134,9 +128,4 @@ VP9_INSTANTIATE_TEST_CASE(
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
                       ::libvpx_test::kRealTime),
     ::testing::Range(1, 9));
-
-VP10_INSTANTIATE_TEST_CASE(
-    VPxEncoderThreadTest,
-    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
-    ::testing::Range(1, 3));
 }  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index ad3327e..416f3c3 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -34,7 +34,7 @@ class VP9IntraPredBase {
   virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Predict(PREDICTION_MODE mode) = 0;
+  virtual void Predict() = 0;
 
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
@@ -73,7 +73,7 @@ class VP9IntraPredBase {
           left_col_[y] = rnd.Rand16() & mask_;
         }
       }
-      Predict(DC_PRED);
+      Predict();
       CheckPrediction(i, &error_count);
     }
     ASSERT_EQ(0, error_count);
@@ -106,7 +106,7 @@ class VP9IntraPredTest
     mask_       = (1 << bit_depth_) - 1;
   }
 
-  virtual void Predict(PREDICTION_MODE mode) {
+  virtual void Predict() {
     const uint16_t *const_above_row = above_row_;
     const uint16_t *const_left_col = left_col_;
     ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
@@ -132,7 +132,6 @@ using std::tr1::make_tuple;
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 #if CONFIG_USE_X86INC
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -141,13 +140,13 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_tm_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 8),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 8),
@@ -155,34 +154,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 8),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 8),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -194,14 +170,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 10),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 10),
@@ -211,35 +187,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       10),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 10),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 10),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 10),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
-#endif  // !ARCH_X86_64
 
-#if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                         ::testing::Values(
                             make_tuple(&vpx_highbd_dc_predictor_32x32_sse2,
@@ -251,14 +203,14 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_tm_predictor_32x32_sse2,
                                        &vpx_highbd_tm_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse2,
                                        &vpx_highbd_dc_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
                                        &vpx_highbd_dc_predictor_8x8_c, 8, 12),
                             make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
                                        &vpx_highbd_dc_predictor_16x16_c, 16,
                                        12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_v_predictor_4x4_sse2,
                                        &vpx_highbd_v_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
                                        &vpx_highbd_v_predictor_8x8_c, 8, 12),
@@ -268,33 +220,11 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#else
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
-                        ::testing::Values(
-                            make_tuple(&vpx_highbd_dc_predictor_4x4_sse,
-                                       &vpx_highbd_dc_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_8x8_sse2,
-                                       &vpx_highbd_dc_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_dc_predictor_16x16_sse2,
-                                       &vpx_highbd_dc_predictor_16x16_c, 16,
-                                       12),
-                            make_tuple(&vpx_highbd_v_predictor_4x4_sse,
-                                       &vpx_highbd_v_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_v_predictor_8x8_sse2,
-                                       &vpx_highbd_v_predictor_8x8_c, 8, 12),
-                            make_tuple(&vpx_highbd_v_predictor_16x16_sse2,
-                                       &vpx_highbd_v_predictor_16x16_c, 16, 12),
-                            make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
-                                       &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
-                                       &vpx_highbd_tm_predictor_4x4_c, 4, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
-                                       &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
-#endif  // !ARCH_X86_64
+
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc
index 09c1070..4177393 100644
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -127,8 +127,4 @@ VP9_INSTANTIATE_TEST_CASE(LosslessTest,
                           ::testing::Values(::libvpx_test::kRealTime,
                                             ::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kTwoPassGood));
-
-VP10_INSTANTIATE_TEST_CASE(LosslessTest,
-                           ::testing::Values(::libvpx_test::kOnePassGood,
-                                             ::libvpx_test::kTwoPassGood));
 }  // namespace
diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh
index 6dd5f17..6503107 100755
--- a/test/vp9_spatial_svc_encoder.sh
+++ b/test/vp9_spatial_svc_encoder.sh
@@ -54,7 +54,7 @@ vp9_spatial_svc() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     local readonly test_name="vp9_spatial_svc"
     for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
-      vp9_spatial_svc_encoder "${test_name}" -l ${layers}
+      vp9_spatial_svc_encoder "${test_name}" -sl ${layers}
     done
   fi
 }
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 650bc52..8258756 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -62,7 +62,7 @@ class WebMVideoSource : public CompressedVideoSource {
 
   void FillFrame() {
     ASSERT_TRUE(vpx_ctx_->file != NULL);
-    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
       end_of_file_ = true;
@@ -72,7 +72,7 @@ class WebMVideoSource : public CompressedVideoSource {
   void SeekToNextKeyFrame() {
     ASSERT_TRUE(vpx_ctx_->file != NULL);
     do {
-      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_, &buf_sz_);
+      const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
       ++frame_;
       if (status == 1) {
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index 7201a67..1eca78d 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -12,4 +12,8 @@ failures, various options for running the tests, and XML test report
 generation.
 
 Local Modifications:
-Removed unused declarations of kPathSeparatorString to have warning free build.
\ No newline at end of file
+- Removed unused declarations of kPathSeparatorString to have warning
+  free build.
+- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P
+  and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables
+  under GCC 5.
\ No newline at end of file
diff --git a/third_party/googletest/src/include/gtest/gtest.h b/third_party/googletest/src/include/gtest/gtest.h
index 4f3804f..581a44e 100644
--- a/third_party/googletest/src/include/gtest/gtest.h
+++ b/third_party/googletest/src/include/gtest/gtest.h
@@ -16960,7 +16960,7 @@ internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
                       GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \
       return 0; \
     } \
-    static int gtest_registering_dummy_; \
+    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
     GTEST_DISALLOW_COPY_AND_ASSIGN_(\
         GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
   }; \
@@ -16972,7 +16972,7 @@ internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
 # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \
   ::testing::internal::ParamGenerator<test_case_name::ParamType> \
       gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
-  int gtest_##prefix##test_case_name##_dummy_ = \
+  int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
       ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
           GetTestCasePatternHolder<test_case_name>(\
               #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\
diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk
index be9d77d..8149a08 100644
--- a/third_party/libwebm/Android.mk
+++ b/third_party/libwebm/Android.mk
@@ -2,9 +2,16 @@ LOCAL_PATH:= $(call my-dir)
 
 include $(CLEAR_VARS)
 LOCAL_MODULE:= libwebm
-LOCAL_SRC_FILES:= mkvparser.cpp \
-                  mkvreader.cpp \
-                  mkvmuxer.cpp \
-                  mkvmuxerutil.cpp \
-                  mkvwriter.cpp
+LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat
+LOCAL_C_INCLUDES:= $(LOCAL_PATH)
+LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
+
+LOCAL_SRC_FILES:= common/file_util.cc \
+                  common/hdr_util.cc \
+                  mkvparser/mkvparser.cc \
+                  mkvparser/mkvreader.cc \
+                  mkvmuxer/mkvmuxer.cc \
+                  mkvmuxer/mkvmuxerutil.cc \
+                  mkvmuxer/mkvwriter.cc
 include $(BUILD_STATIC_LIBRARY)
diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx
index 2989d3d..73f8303 100644
--- a/third_party/libwebm/README.libvpx
+++ b/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 476366249e1fda7710a389cd41c57db42305e0d4
+Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d
 License: BSD
 License File: LICENSE.txt
 
diff --git a/third_party/libwebm/RELEASE.TXT b/third_party/libwebm/RELEASE.TXT
deleted file mode 100644
index a7e9f03..0000000
--- a/third_party/libwebm/RELEASE.TXT
+++ /dev/null
@@ -1,34 +0,0 @@
-1.0.0.5
- * Handled case when no duration
- * Handled empty clusters
- * Handled empty clusters when seeking
- * Implemented check lacing bits
-
-1.0.0.4
- * Made Cues member variables mutables
- * Defined against badly-formatted cue points
- * Segment::GetCluster returns CuePoint too
- * Separated cue-based searches
-
-1.0.0.3
- * Added Block::GetOffset() to get a frame's offset in a block
- * Changed cluster count type from size_t to long
- * Parsed SeekHead to find cues
- * Allowed seeking beyond end of cluster cache
- * Added not to attempt to reparse cues element
- * Restructured Segment::LoadCluster
- * Marked position of cues without parsing cues element
- * Allowed cue points to be loaded incrementally
- * Implemented to load lazily cue points as they're searched
- * Merged Cues::LoadCuePoint into Cues::Find
- * Lazy init cues
- * Loaded cue point during find
-
-1.0.0.2
- * added support for Cues element
- * seeking was improved
-
-1.0.0.1
- * fixed item 141
- * added item 142
- * added this file, RELEASE.TXT, to repository
diff --git a/third_party/libwebm/common/file_util.cc b/third_party/libwebm/common/file_util.cc
new file mode 100644
index 0000000..4f91318
--- /dev/null
+++ b/third_party/libwebm/common/file_util.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "common/file_util.h"
+
+#include <sys/stat.h>
+#ifndef _MSC_VER
+#include <unistd.h>  // close()
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <ios>
+
+namespace libwebm {
+
+std::string GetTempFileName() {
+#if !defined _MSC_VER && !defined __MINGW32__
+  char temp_file_name_template[] = "libwebm_temp.XXXXXX";
+  int fd = mkstemp(temp_file_name_template);
+  if (fd != -1) {
+    close(fd);
+    return std::string(temp_file_name_template);
+  }
+  return std::string();
+#else
+  char tmp_file_name[_MAX_PATH];
+  errno_t err = tmpnam_s(tmp_file_name);
+  if (err == 0) {
+    return std::string(tmp_file_name);
+  }
+  return std::string();
+#endif
+}
+
+uint64_t GetFileSize(const std::string& file_name) {
+  uint64_t file_size = 0;
+#ifndef _MSC_VER
+  struct stat st;
+  st.st_size = 0;
+  if (stat(file_name.c_str(), &st) == 0) {
+#else
+  struct _stat st;
+  st.st_size = 0;
+  if (_stat(file_name.c_str(), &st) == 0) {
+#endif
+    file_size = st.st_size;
+  }
+  return file_size;
+}
+
+TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); }
+
+TempFileDeleter::~TempFileDeleter() {
+  std::ifstream file(file_name_.c_str());
+  if (file.good()) {
+    file.close();
+    std::remove(file_name_.c_str());
+  }
+}
+
+}  // namespace libwebm
diff --git a/third_party/libwebm/common/file_util.h b/third_party/libwebm/common/file_util.h
new file mode 100644
index 0000000..0e71eac
--- /dev/null
+++ b/third_party/libwebm/common/file_util.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_FILE_UTIL_H_
+#define LIBWEBM_COMMON_FILE_UTIL_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "mkvmuxer/mkvmuxertypes.h"  // LIBWEBM_DISALLOW_COPY_AND_ASSIGN()
+
+namespace libwebm {
+
+// Returns a temporary file name.
+std::string GetTempFileName();
+
+// Returns size of file specified by |file_name|, or 0 upon failure.
+uint64_t GetFileSize(const std::string& file_name);
+
+// Manages life of temporary file specified at time of construction. Deletes
+// file upon destruction.
+class TempFileDeleter {
+ public:
+  TempFileDeleter();
+  explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
+  ~TempFileDeleter();
+  const std::string& name() const { return file_name_; }
+
+ private:
+  std::string file_name_;
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
+};
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_FILE_UTIL_H_
\ No newline at end of file
diff --git a/third_party/libwebm/common/hdr_util.cc b/third_party/libwebm/common/hdr_util.cc
new file mode 100644
index 0000000..e1a9842
--- /dev/null
+++ b/third_party/libwebm/common/hdr_util.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "hdr_util.h"
+
+#include <cstddef>
+#include <new>
+
+#include "mkvparser/mkvparser.h"
+
+namespace libwebm {
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc) {
+  muxer_pc->reset(new (std::nothrow)
+                      mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y));
+  if (!muxer_pc->get())
+    return false;
+  return true;
+}
+
+bool MasteringMetadataValuePresent(double value) {
+  return value != mkvparser::MasteringMetadata::kValueNotPresent;
+}
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm) {
+  if (MasteringMetadataValuePresent(parser_mm.luminance_max))
+    muxer_mm->luminance_max = parser_mm.luminance_max;
+  if (MasteringMetadataValuePresent(parser_mm.luminance_min))
+    muxer_mm->luminance_min = parser_mm.luminance_min;
+
+  PrimaryChromaticityPtr r_ptr(NULL);
+  PrimaryChromaticityPtr g_ptr(NULL);
+  PrimaryChromaticityPtr b_ptr(NULL);
+  PrimaryChromaticityPtr wp_ptr(NULL);
+
+  if (parser_mm.r) {
+    if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
+      return false;
+  }
+  if (parser_mm.g) {
+    if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
+      return false;
+  }
+  if (parser_mm.b) {
+    if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
+      return false;
+  }
+  if (parser_mm.white_point) {
+    if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
+      return false;
+  }
+
+  if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
+                                 wp_ptr.get())) {
+    return false;
+  }
+
+  return true;
+}
+
+bool ColourValuePresent(long long value) {
+  return value != mkvparser::Colour::kValueNotPresent;
+}
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour) {
+  if (!muxer_colour)
+    return false;
+
+  if (ColourValuePresent(parser_colour.matrix_coefficients))
+    muxer_colour->matrix_coefficients = parser_colour.matrix_coefficients;
+  if (ColourValuePresent(parser_colour.bits_per_channel))
+    muxer_colour->bits_per_channel = parser_colour.bits_per_channel;
+  if (ColourValuePresent(parser_colour.chroma_subsampling_horz))
+    muxer_colour->chroma_subsampling_horz =
+        parser_colour.chroma_subsampling_horz;
+  if (ColourValuePresent(parser_colour.chroma_subsampling_vert))
+    muxer_colour->chroma_subsampling_vert =
+        parser_colour.chroma_subsampling_vert;
+  if (ColourValuePresent(parser_colour.cb_subsampling_horz))
+    muxer_colour->cb_subsampling_horz = parser_colour.cb_subsampling_horz;
+  if (ColourValuePresent(parser_colour.cb_subsampling_vert))
+    muxer_colour->cb_subsampling_vert = parser_colour.cb_subsampling_vert;
+  if (ColourValuePresent(parser_colour.chroma_siting_horz))
+    muxer_colour->chroma_siting_horz = parser_colour.chroma_siting_horz;
+  if (ColourValuePresent(parser_colour.chroma_siting_vert))
+    muxer_colour->chroma_siting_vert = parser_colour.chroma_siting_vert;
+  if (ColourValuePresent(parser_colour.range))
+    muxer_colour->range = parser_colour.range;
+  if (ColourValuePresent(parser_colour.transfer_characteristics))
+    muxer_colour->transfer_characteristics =
+        parser_colour.transfer_characteristics;
+  if (ColourValuePresent(parser_colour.primaries))
+    muxer_colour->primaries = parser_colour.primaries;
+  if (ColourValuePresent(parser_colour.max_cll))
+    muxer_colour->max_cll = parser_colour.max_cll;
+  if (ColourValuePresent(parser_colour.max_fall))
+    muxer_colour->max_fall = parser_colour.max_fall;
+
+  if (parser_colour.mastering_metadata) {
+    mkvmuxer::MasteringMetadata muxer_mm;
+    if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
+      return false;
+    if (!muxer_colour->SetMasteringMetadata(muxer_mm))
+      return false;
+  }
+  return true;
+}
+
+// Format of VPx private data:
+//
+//   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//  |    ID Byte    |             Length            |               |
+//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+               |
+//  |                                                               |
+//  :               Bytes 1..Length of Codec Feature                :
+//  |                                                               |
+//  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
+// ID Byte Format
+// ID byte is an unsigned byte.
+//   0 1 2 3 4 5 6 7
+//  +-+-+-+-+-+-+-+-+
+//  |X|    ID       |
+//  +-+-+-+-+-+-+-+-+
+//
+// The X bit is reserved.
+//
+// Currently only profile level is supported. ID byte must be set to 1, and
+// length must be 1. Supported values are:
+//
+//   10: Level 1
+//   11: Level 1.1
+//   20: Level 2
+//   21: Level 2.1
+//   30: Level 3
+//   31: Level 3.1
+//   40: Level 4
+//   41: Level 4.1
+//   50: Level 5
+//   51: Level 5.1
+//   52: Level 5.2
+//   60: Level 6
+//   61: Level 6.1
+//   62: Level 6.2
+//
+// See the following link for more information:
+// http://www.webmproject.org/vp9/profiles/
+int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) {
+  const int kVpxCodecPrivateLength = 3;
+  if (!private_data || length != kVpxCodecPrivateLength)
+    return 0;
+
+  const uint8_t id_byte = *private_data;
+  if (id_byte != 1)
+    return 0;
+
+  const int kVpxProfileLength = 1;
+  const uint8_t length_byte = private_data[1];
+  if (length_byte != kVpxProfileLength)
+    return 0;
+
+  const int level = static_cast<int>(private_data[2]);
+
+  const int kNumLevels = 14;
+  const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+                                  41, 50, 51, 52, 60, 61, 62};
+
+  for (int i = 0; i < kNumLevels; ++i) {
+    if (level == levels[i])
+      return level;
+  }
+
+  return 0;
+}
+}  // namespace libwebm
diff --git a/third_party/libwebm/common/hdr_util.h b/third_party/libwebm/common/hdr_util.h
new file mode 100644
index 0000000..d30c2b9
--- /dev/null
+++ b/third_party/libwebm/common/hdr_util.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
+#define LIBWEBM_COMMON_HDR_UTIL_H_
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "mkvmuxer/mkvmuxer.h"
+
+namespace mkvparser {
+struct Colour;
+struct MasteringMetadata;
+struct PrimaryChromaticity;
+}  // namespace mkvparser
+
+namespace libwebm {
+// Utility types and functions for working with the Colour element and its
+// children. Copiers return true upon success. Presence functions return true
+// when the specified element is present.
+
+// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
+// required by libwebm.
+
+typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc);
+
+bool MasteringMetadataValuePresent(double value);
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm);
+
+bool ColourValuePresent(long long value);
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour);
+
+// Returns VP9 profile upon success or 0 upon failure.
+int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length);
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_HDR_UTIL_H_
diff --git a/third_party/libwebm/webmids.hpp b/third_party/libwebm/common/webmids.h
similarity index 79%
rename from third_party/libwebm/webmids.hpp
rename to third_party/libwebm/common/webmids.h
index ad4ab57..32a0c5f 100644
--- a/third_party/libwebm/webmids.hpp
+++ b/third_party/libwebm/common/webmids.h
@@ -6,10 +6,10 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#ifndef WEBMIDS_HPP
-#define WEBMIDS_HPP
+#ifndef COMMON_WEBMIDS_H_
+#define COMMON_WEBMIDS_H_
 
-namespace mkvmuxer {
+namespace libwebm {
 
 enum MkvId {
   kMkvEBML = 0x1A45DFA3,
@@ -95,6 +95,35 @@ enum MkvId {
   kMkvAspectRatioType = 0x54B3,
   kMkvFrameRate = 0x2383E3,
   // end video
+  // colour
+  kMkvColour = 0x55B0,
+  kMkvMatrixCoefficients = 0x55B1,
+  kMkvBitsPerChannel = 0x55B2,
+  kMkvChromaSubsamplingHorz = 0x55B3,
+  kMkvChromaSubsamplingVert = 0x55B4,
+  kMkvCbSubsamplingHorz = 0x55B5,
+  kMkvCbSubsamplingVert = 0x55B6,
+  kMkvChromaSitingHorz = 0x55B7,
+  kMkvChromaSitingVert = 0x55B8,
+  kMkvRange = 0x55B9,
+  kMkvTransferCharacteristics = 0x55BA,
+  kMkvPrimaries = 0x55BB,
+  kMkvMaxCLL = 0x55BC,
+  kMkvMaxFALL = 0x55BD,
+  // mastering metadata
+  kMkvMasteringMetadata = 0x55D0,
+  kMkvPrimaryRChromaticityX = 0x55D1,
+  kMkvPrimaryRChromaticityY = 0x55D2,
+  kMkvPrimaryGChromaticityX = 0x55D3,
+  kMkvPrimaryGChromaticityY = 0x55D4,
+  kMkvPrimaryBChromaticityX = 0x55D5,
+  kMkvPrimaryBChromaticityY = 0x55D6,
+  kMkvWhitePointChromaticityX = 0x55D7,
+  kMkvWhitePointChromaticityY = 0x55D8,
+  kMkvLuminanceMax = 0x55D9,
+  kMkvLuminanceMin = 0x55DA,
+  // end mastering metadata
+  // end colour
   // audio
   kMkvAudio = 0xE1,
   kMkvSamplingFrequency = 0xB5,
@@ -150,6 +179,6 @@ enum MkvId {
   kMkvTagString = 0x4487
 };
 
-}  // end namespace mkvmuxer
+}  // namespace libwebm
 
-#endif  // WEBMIDS_HPP
+#endif  // COMMON_WEBMIDS_H_
diff --git a/third_party/libwebm/mkvmuxer.cpp b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
similarity index 56%
rename from third_party/libwebm/mkvmuxer.cpp
rename to third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 9be3119..c79ce24 100644
--- a/third_party/libwebm/mkvmuxer.cpp
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -6,27 +6,28 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#include "mkvmuxer.hpp"
+#include "mkvmuxer/mkvmuxer.h"
 
+#include <cfloat>
 #include <climits>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <ctime>
+#include <memory>
 #include <new>
+#include <vector>
 
-#include "mkvmuxerutil.hpp"
-#include "mkvparser.hpp"
-#include "mkvwriter.hpp"
-#include "webmids.hpp"
-
-#ifdef _MSC_VER
-// Disable MSVC warnings that suggest making code non-portable.
-#pragma warning(disable : 4996)
-#endif
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxerutil.h"
+#include "mkvmuxer/mkvwriter.h"
+#include "mkvparser/mkvparser.h"
 
 namespace mkvmuxer {
 
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const uint64_t Colour::kValueNotPresent = UINT64_MAX;
+
 namespace {
 // Deallocate the string designated by |dst|, and then copy the |src|
 // string to |dst|.  The caller owns both the |src| string and the
@@ -55,6 +56,20 @@ bool StrCpy(const char* src, char** dst_ptr) {
   strcpy(dst, src);  // NOLINT
   return true;
 }
+
+typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+bool CopyChromaticity(const PrimaryChromaticity* src,
+                      PrimaryChromaticityPtr* dst) {
+  if (!dst)
+    return false;
+
+  dst->reset(new (std::nothrow) PrimaryChromaticity(src->x, src->y));
+  if (!dst->get())
+    return false;
+
+  return true;
+}
+
 }  // namespace
 
 ///////////////////////////////////////////////////////////////
@@ -65,31 +80,31 @@ IMkvWriter::IMkvWriter() {}
 
 IMkvWriter::~IMkvWriter() {}
 
-bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version) {
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
   // Level 0
-  uint64 size = EbmlElementSize(kMkvEBMLVersion, 1ULL);
-  size += EbmlElementSize(kMkvEBMLReadVersion, 1ULL);
-  size += EbmlElementSize(kMkvEBMLMaxIDLength, 4ULL);
-  size += EbmlElementSize(kMkvEBMLMaxSizeLength, 8ULL);
-  size += EbmlElementSize(kMkvDocType, "webm");
-  size += EbmlElementSize(kMkvDocTypeVersion, doc_type_version);
-  size += EbmlElementSize(kMkvDocTypeReadVersion, 2ULL);
+  uint64_t size = EbmlElementSize(libwebm::kMkvEBMLVersion, UINT64_C(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, UINT64_C(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, UINT64_C(4));
+  size += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8));
+  size += EbmlElementSize(libwebm::kMkvDocType, "webm");
+  size += EbmlElementSize(libwebm::kMkvDocTypeVersion, doc_type_version);
+  size += EbmlElementSize(libwebm::kMkvDocTypeReadVersion, UINT64_C(2));
 
-  if (!WriteEbmlMasterElement(writer, kMkvEBML, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size))
     return false;
-  if (!WriteEbmlElement(writer, kMkvEBMLVersion, 1ULL))
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, UINT64_C(1)))
     return false;
-  if (!WriteEbmlElement(writer, kMkvEBMLReadVersion, 1ULL))
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, UINT64_C(1)))
     return false;
-  if (!WriteEbmlElement(writer, kMkvEBMLMaxIDLength, 4ULL))
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, UINT64_C(4)))
     return false;
-  if (!WriteEbmlElement(writer, kMkvEBMLMaxSizeLength, 8ULL))
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8)))
     return false;
-  if (!WriteEbmlElement(writer, kMkvDocType, "webm"))
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocType, "webm"))
     return false;
-  if (!WriteEbmlElement(writer, kMkvDocTypeVersion, doc_type_version))
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, doc_type_version))
     return false;
-  if (!WriteEbmlElement(writer, kMkvDocTypeReadVersion, 2ULL))
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, UINT64_C(2)))
     return false;
 
   return true;
@@ -100,16 +115,16 @@ bool WriteEbmlHeader(IMkvWriter* writer) {
 }
 
 bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
-                 mkvmuxer::int64 start, int64 size) {
+                 int64_t start, int64_t size) {
   // TODO(vigneshv): Check if this is a reasonable value.
-  const uint32 kBufSize = 2048;
-  uint8* buf = new uint8[kBufSize];
-  int64 offset = start;
+  const uint32_t kBufSize = 2048;
+  uint8_t* buf = new uint8_t[kBufSize];
+  int64_t offset = start;
   while (size > 0) {
-    const int64 read_len = (size > kBufSize) ? kBufSize : size;
+    const int64_t read_len = (size > kBufSize) ? kBufSize : size;
     if (source->Read(offset, static_cast<long>(read_len), buf))
       return false;
-    dst->Write(buf, static_cast<uint32>(read_len));
+    dst->Write(buf, static_cast<uint32_t>(read_len));
     offset += read_len;
     size -= read_len;
   }
@@ -126,6 +141,7 @@ Frame::Frame()
       additional_(NULL),
       additional_length_(0),
       duration_(0),
+      duration_set_(false),
       frame_(NULL),
       is_key_(false),
       length_(0),
@@ -158,16 +174,19 @@ bool Frame::CopyFrom(const Frame& frame) {
     return false;
   }
   duration_ = frame.duration();
+  duration_set_ = frame.duration_set();
   is_key_ = frame.is_key();
   track_number_ = frame.track_number();
   timestamp_ = frame.timestamp();
   discard_padding_ = frame.discard_padding();
+  reference_block_timestamp_ = frame.reference_block_timestamp();
+  reference_block_timestamp_set_ = frame.reference_block_timestamp_set();
   return true;
 }
 
-bool Frame::Init(const uint8* frame, uint64 length) {
-  uint8* const data =
-      new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
+bool Frame::Init(const uint8_t* frame, uint64_t length) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
   if (!data)
     return false;
 
@@ -179,10 +198,10 @@ bool Frame::Init(const uint8* frame, uint64 length) {
   return true;
 }
 
-bool Frame::AddAdditionalData(const uint8* additional, uint64 length,
-                              uint64 add_id) {
-  uint8* const data =
-      new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
+bool Frame::AddAdditionalData(const uint8_t* additional, uint64_t length,
+                              uint64_t add_id) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
   if (!data)
     return false;
 
@@ -216,7 +235,12 @@ bool Frame::CanBeSimpleBlock() const {
   return additional_ == NULL && discard_padding_ == 0 && duration_ == 0;
 }
 
-void Frame::set_reference_block_timestamp(int64 reference_block_timestamp) {
+void Frame::set_duration(uint64_t duration) {
+  duration_ = duration;
+  duration_set_ = true;
+}
+
+void Frame::set_reference_block_timestamp(int64_t reference_block_timestamp) {
   reference_block_timestamp_ = reference_block_timestamp;
   reference_block_timestamp_set_ = true;
 }
@@ -238,61 +262,64 @@ bool CuePoint::Write(IMkvWriter* writer) const {
   if (!writer || track_ < 1 || cluster_pos_ < 1)
     return false;
 
-  uint64 size = EbmlElementSize(kMkvCueClusterPosition, cluster_pos_);
-  size += EbmlElementSize(kMkvCueTrack, track_);
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_);
+  size += EbmlElementSize(libwebm::kMkvCueTrack, track_);
   if (output_block_number_ && block_number_ > 1)
-    size += EbmlElementSize(kMkvCueBlockNumber, block_number_);
-  const uint64 track_pos_size =
-      EbmlMasterElementSize(kMkvCueTrackPositions, size) + size;
-  const uint64 payload_size =
-      EbmlElementSize(kMkvCueTime, time_) + track_pos_size;
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_);
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size;
 
-  if (!WriteEbmlMasterElement(writer, kMkvCuePoint, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvCueTime, time_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, time_))
     return false;
 
-  if (!WriteEbmlMasterElement(writer, kMkvCueTrackPositions, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size))
     return false;
-  if (!WriteEbmlElement(writer, kMkvCueTrack, track_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, track_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvCueClusterPosition, cluster_pos_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, cluster_pos_))
     return false;
   if (output_block_number_ && block_number_ > 1)
-    if (!WriteEbmlElement(writer, kMkvCueBlockNumber, block_number_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, block_number_))
       return false;
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0)
     return false;
 
-  if (stop_position - payload_position != static_cast<int64>(payload_size))
+  if (stop_position - payload_position != static_cast<int64_t>(payload_size))
     return false;
 
   return true;
 }
 
-uint64 CuePoint::PayloadSize() const {
-  uint64 size = EbmlElementSize(kMkvCueClusterPosition, cluster_pos_);
-  size += EbmlElementSize(kMkvCueTrack, track_);
+uint64_t CuePoint::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_);
+  size += EbmlElementSize(libwebm::kMkvCueTrack, track_);
   if (output_block_number_ && block_number_ > 1)
-    size += EbmlElementSize(kMkvCueBlockNumber, block_number_);
-  const uint64 track_pos_size =
-      EbmlMasterElementSize(kMkvCueTrackPositions, size) + size;
-  const uint64 payload_size =
-      EbmlElementSize(kMkvCueTime, time_) + track_pos_size;
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_);
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size;
 
   return payload_size;
 }
 
-uint64 CuePoint::Size() const {
-  const uint64 payload_size = PayloadSize();
-  return EbmlMasterElementSize(kMkvCuePoint, payload_size) + payload_size;
+uint64_t CuePoint::Size() const {
+  const uint64_t payload_size = PayloadSize();
+  return EbmlMasterElementSize(libwebm::kMkvCuePoint, payload_size) +
+         payload_size;
 }
 
 ///////////////////////////////////////////////////////////////
@@ -307,7 +334,7 @@ Cues::Cues()
 
 Cues::~Cues() {
   if (cue_entries_) {
-    for (int32 i = 0; i < cue_entries_size_; ++i) {
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
       CuePoint* const cue = cue_entries_[i];
       delete cue;
     }
@@ -321,7 +348,7 @@ bool Cues::AddCue(CuePoint* cue) {
 
   if ((cue_entries_size_ + 1) > cue_entries_capacity_) {
     // Add more CuePoints.
-    const int32 new_capacity =
+    const int32_t new_capacity =
         (!cue_entries_capacity_) ? 2 : cue_entries_capacity_ * 2;
 
     if (new_capacity < 1)
@@ -332,7 +359,7 @@ bool Cues::AddCue(CuePoint* cue) {
     if (!cues)
       return false;
 
-    for (int32 i = 0; i < cue_entries_size_; ++i) {
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
       cues[i] = cue_entries_[i];
     }
 
@@ -347,7 +374,7 @@ bool Cues::AddCue(CuePoint* cue) {
   return true;
 }
 
-CuePoint* Cues::GetCueByIndex(int32 index) const {
+CuePoint* Cues::GetCueByIndex(int32_t index) const {
   if (cue_entries_ == NULL)
     return NULL;
 
@@ -357,11 +384,11 @@ CuePoint* Cues::GetCueByIndex(int32 index) const {
   return cue_entries_[index];
 }
 
-uint64 Cues::Size() {
-  uint64 size = 0;
-  for (int32 i = 0; i < cue_entries_size_; ++i)
+uint64_t Cues::Size() {
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i)
     size += GetCueByIndex(i)->Size();
-  size += EbmlMasterElementSize(kMkvCues, size);
+  size += EbmlMasterElementSize(libwebm::kMkvCues, size);
   return size;
 }
 
@@ -369,8 +396,8 @@ bool Cues::Write(IMkvWriter* writer) const {
   if (!writer)
     return false;
 
-  uint64 size = 0;
-  for (int32 i = 0; i < cue_entries_size_; ++i) {
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
     const CuePoint* const cue = GetCueByIndex(i);
 
     if (!cue)
@@ -379,25 +406,25 @@ bool Cues::Write(IMkvWriter* writer) const {
     size += cue->Size();
   }
 
-  if (!WriteEbmlMasterElement(writer, kMkvCues, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCues, size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  for (int32 i = 0; i < cue_entries_size_; ++i) {
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
     const CuePoint* const cue = GetCueByIndex(i);
 
     if (!cue->Write(writer))
       return false;
   }
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0)
     return false;
 
-  if (stop_position - payload_position != static_cast<int64>(size))
+  if (stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
   return true;
@@ -409,36 +436,40 @@ bool Cues::Write(IMkvWriter* writer) const {
 
 ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
 
-uint64 ContentEncAESSettings::Size() const {
-  const uint64 payload = PayloadSize();
-  const uint64 size =
-      EbmlMasterElementSize(kMkvContentEncAESSettings, payload) + payload;
+uint64_t ContentEncAESSettings::Size() const {
+  const uint64_t payload = PayloadSize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncAESSettings, payload) +
+      payload;
   return size;
 }
 
 bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
-  const uint64 payload = PayloadSize();
+  const uint64_t payload = PayloadSize();
 
-  if (!WriteEbmlMasterElement(writer, kMkvContentEncAESSettings, payload))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncAESSettings,
+                              payload))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvAESSettingsCipherMode, cipher_mode_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode,
+                        cipher_mode_))
     return false;
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(payload))
+      stop_position - payload_position != static_cast<int64_t>(payload))
     return false;
 
   return true;
 }
 
-uint64 ContentEncAESSettings::PayloadSize() const {
-  uint64 size = EbmlElementSize(kMkvAESSettingsCipherMode, cipher_mode_);
+uint64_t ContentEncAESSettings::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, cipher_mode_);
   return size;
 }
 
@@ -456,14 +487,14 @@ ContentEncoding::ContentEncoding()
 
 ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
 
-bool ContentEncoding::SetEncryptionID(const uint8* id, uint64 length) {
+bool ContentEncoding::SetEncryptionID(const uint8_t* id, uint64_t length) {
   if (!id || length < 1)
     return false;
 
   delete[] enc_key_id_;
 
   enc_key_id_ =
-      new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
   if (!enc_key_id_)
     return false;
 
@@ -473,79 +504,89 @@ bool ContentEncoding::SetEncryptionID(const uint8* id, uint64 length) {
   return true;
 }
 
-uint64 ContentEncoding::Size() const {
-  const uint64 encryption_size = EncryptionSize();
-  const uint64 encoding_size = EncodingSize(0, encryption_size);
-  const uint64 encodings_size =
-      EbmlMasterElementSize(kMkvContentEncoding, encoding_size) + encoding_size;
+uint64_t ContentEncoding::Size() const {
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t encodings_size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
 
   return encodings_size;
 }
 
 bool ContentEncoding::Write(IMkvWriter* writer) const {
-  const uint64 encryption_size = EncryptionSize();
-  const uint64 encoding_size = EncodingSize(0, encryption_size);
-  const uint64 size =
-      EbmlMasterElementSize(kMkvContentEncoding, encoding_size) + encoding_size;
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlMasterElement(writer, kMkvContentEncoding, encoding_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncoding,
+                              encoding_size))
     return false;
-  if (!WriteEbmlElement(writer, kMkvContentEncodingOrder, encoding_order_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
+                        encoding_order_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvContentEncodingScope, encoding_scope_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
+                        encoding_scope_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvContentEncodingType, encoding_type_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
+                        encoding_type_))
     return false;
 
-  if (!WriteEbmlMasterElement(writer, kMkvContentEncryption, encryption_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
+                              encryption_size))
     return false;
-  if (!WriteEbmlElement(writer, kMkvContentEncAlgo, enc_algo_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, enc_algo_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvContentEncKeyID, enc_key_id_,
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
                         enc_key_id_length_))
     return false;
 
   if (!enc_aes_settings_.Write(writer))
     return false;
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
   return true;
 }
 
-uint64 ContentEncoding::EncodingSize(uint64 compresion_size,
-                                     uint64 encryption_size) const {
+uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+                                       uint64_t encryption_size) const {
   // TODO(fgalligan): Add support for compression settings.
   if (compresion_size != 0)
     return 0;
 
-  uint64 encoding_size = 0;
+  uint64_t encoding_size = 0;
 
   if (encryption_size > 0) {
     encoding_size +=
-        EbmlMasterElementSize(kMkvContentEncryption, encryption_size) +
+        EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) +
         encryption_size;
   }
-  encoding_size += EbmlElementSize(kMkvContentEncodingType, encoding_type_);
-  encoding_size += EbmlElementSize(kMkvContentEncodingScope, encoding_scope_);
-  encoding_size += EbmlElementSize(kMkvContentEncodingOrder, encoding_order_);
+  encoding_size +=
+      EbmlElementSize(libwebm::kMkvContentEncodingType, encoding_type_);
+  encoding_size +=
+      EbmlElementSize(libwebm::kMkvContentEncodingScope, encoding_scope_);
+  encoding_size +=
+      EbmlElementSize(libwebm::kMkvContentEncodingOrder, encoding_order_);
 
   return encoding_size;
 }
 
-uint64 ContentEncoding::EncryptionSize() const {
-  const uint64 aes_size = enc_aes_settings_.Size();
+uint64_t ContentEncoding::EncryptionSize() const {
+  const uint64_t aes_size = enc_aes_settings_.Size();
 
-  uint64 encryption_size =
-      EbmlElementSize(kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_);
-  encryption_size += EbmlElementSize(kMkvContentEncAlgo, enc_algo_);
+  uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID,
+                                             enc_key_id_, enc_key_id_length_);
+  encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, enc_algo_);
 
   return encryption_size + aes_size;
 }
@@ -577,7 +618,7 @@ Track::~Track() {
   delete[] name_;
 
   if (content_encoding_entries_) {
-    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
       ContentEncoding* const encoding = content_encoding_entries_[i];
       delete encoding;
     }
@@ -586,7 +627,7 @@ Track::~Track() {
 }
 
 bool Track::AddContentEncoding() {
-  const uint32 count = content_encoding_entries_size_ + 1;
+  const uint32_t count = content_encoding_entries_size_ + 1;
 
   ContentEncoding** const content_encoding_entries =
       new (std::nothrow) ContentEncoding*[count];  // NOLINT
@@ -600,7 +641,7 @@ bool Track::AddContentEncoding() {
     return false;
   }
 
-  for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+  for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
     content_encoding_entries[i] = content_encoding_entries_[i];
   }
 
@@ -612,7 +653,7 @@ bool Track::AddContentEncoding() {
   return true;
 }
 
-ContentEncoding* Track::GetContentEncodingByIndex(uint32 index) const {
+ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
   if (content_encoding_entries_ == NULL)
     return NULL;
 
@@ -622,46 +663,47 @@ ContentEncoding* Track::GetContentEncodingByIndex(uint32 index) const {
   return content_encoding_entries_[index];
 }
 
-uint64 Track::PayloadSize() const {
-  uint64 size = EbmlElementSize(kMkvTrackNumber, number_);
-  size += EbmlElementSize(kMkvTrackUID, uid_);
-  size += EbmlElementSize(kMkvTrackType, type_);
+uint64_t Track::PayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_);
+  size += EbmlElementSize(libwebm::kMkvTrackUID, uid_);
+  size += EbmlElementSize(libwebm::kMkvTrackType, type_);
   if (codec_id_)
-    size += EbmlElementSize(kMkvCodecID, codec_id_);
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
   if (codec_private_)
-    size += EbmlElementSize(kMkvCodecPrivate, codec_private_,
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
                             codec_private_length_);
   if (language_)
-    size += EbmlElementSize(kMkvLanguage, language_);
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
   if (name_)
-    size += EbmlElementSize(kMkvName, name_);
+    size += EbmlElementSize(libwebm::kMkvName, name_);
   if (max_block_additional_id_)
-    size += EbmlElementSize(kMkvMaxBlockAdditionID, max_block_additional_id_);
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            max_block_additional_id_);
   if (codec_delay_)
-    size += EbmlElementSize(kMkvCodecDelay, codec_delay_);
+    size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_);
   if (seek_pre_roll_)
-    size += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_);
   if (default_duration_)
-    size += EbmlElementSize(kMkvDefaultDuration, default_duration_);
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_);
 
   if (content_encoding_entries_size_ > 0) {
-    uint64 content_encodings_size = 0;
-    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
       ContentEncoding* const encoding = content_encoding_entries_[i];
       content_encodings_size += encoding->Size();
     }
 
-    size +=
-        EbmlMasterElementSize(kMkvContentEncodings, content_encodings_size) +
-        content_encodings_size;
+    size += EbmlMasterElementSize(libwebm::kMkvContentEncodings,
+                                  content_encodings_size) +
+            content_encodings_size;
   }
 
   return size;
 }
 
-uint64 Track::Size() const {
-  uint64 size = PayloadSize();
-  size += EbmlMasterElementSize(kMkvTrackEntry, size);
+uint64_t Track::Size() const {
+  uint64_t size = PayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvTrackEntry, size);
   return size;
 }
 
@@ -675,95 +717,97 @@ bool Track::Write(IMkvWriter* writer) const {
 
   // |size| may be bigger than what is written out in this function because
   // derived classes may write out more data in the Track element.
-  const uint64 payload_size = PayloadSize();
+  const uint64_t payload_size = PayloadSize();
 
-  if (!WriteEbmlMasterElement(writer, kMkvTrackEntry, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
     return false;
 
-  uint64 size = EbmlElementSize(kMkvTrackNumber, number_);
-  size += EbmlElementSize(kMkvTrackUID, uid_);
-  size += EbmlElementSize(kMkvTrackType, type_);
+  uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_);
+  size += EbmlElementSize(libwebm::kMkvTrackUID, uid_);
+  size += EbmlElementSize(libwebm::kMkvTrackType, type_);
   if (codec_id_)
-    size += EbmlElementSize(kMkvCodecID, codec_id_);
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
   if (codec_private_)
-    size += EbmlElementSize(kMkvCodecPrivate, codec_private_,
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
                             codec_private_length_);
   if (language_)
-    size += EbmlElementSize(kMkvLanguage, language_);
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
   if (name_)
-    size += EbmlElementSize(kMkvName, name_);
+    size += EbmlElementSize(libwebm::kMkvName, name_);
   if (max_block_additional_id_)
-    size += EbmlElementSize(kMkvMaxBlockAdditionID, max_block_additional_id_);
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            max_block_additional_id_);
   if (codec_delay_)
-    size += EbmlElementSize(kMkvCodecDelay, codec_delay_);
+    size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_);
   if (seek_pre_roll_)
-    size += EbmlElementSize(kMkvSeekPreRoll, seek_pre_roll_);
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_);
   if (default_duration_)
-    size += EbmlElementSize(kMkvDefaultDuration, default_duration_);
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_);
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvTrackNumber, number_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, number_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvTrackUID, uid_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, uid_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvTrackType, type_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, type_))
     return false;
   if (max_block_additional_id_) {
-    if (!WriteEbmlElement(writer, kMkvMaxBlockAdditionID,
+    if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
                           max_block_additional_id_)) {
       return false;
     }
   }
   if (codec_delay_) {
-    if (!WriteEbmlElement(writer, kMkvCodecDelay, codec_delay_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, codec_delay_))
       return false;
   }
   if (seek_pre_roll_) {
-    if (!WriteEbmlElement(writer, kMkvSeekPreRoll, seek_pre_roll_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, seek_pre_roll_))
       return false;
   }
   if (default_duration_) {
-    if (!WriteEbmlElement(writer, kMkvDefaultDuration, default_duration_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
+                          default_duration_))
       return false;
   }
   if (codec_id_) {
-    if (!WriteEbmlElement(writer, kMkvCodecID, codec_id_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecID, codec_id_))
       return false;
   }
   if (codec_private_) {
-    if (!WriteEbmlElement(writer, kMkvCodecPrivate, codec_private_,
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
                           codec_private_length_))
       return false;
   }
   if (language_) {
-    if (!WriteEbmlElement(writer, kMkvLanguage, language_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvLanguage, language_))
       return false;
   }
   if (name_) {
-    if (!WriteEbmlElement(writer, kMkvName, name_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvName, name_))
       return false;
   }
 
-  int64 stop_position = writer->Position();
+  int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
   if (content_encoding_entries_size_ > 0) {
-    uint64 content_encodings_size = 0;
-    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
       ContentEncoding* const encoding = content_encoding_entries_[i];
       content_encodings_size += encoding->Size();
     }
 
-    if (!WriteEbmlMasterElement(writer, kMkvContentEncodings,
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncodings,
                                 content_encodings_size))
       return false;
 
-    for (uint32 i = 0; i < content_encoding_entries_size_; ++i) {
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
       ContentEncoding* const encoding = content_encoding_entries_[i];
       if (!encoding->Write(writer))
         return false;
@@ -776,14 +820,14 @@ bool Track::Write(IMkvWriter* writer) const {
   return true;
 }
 
-bool Track::SetCodecPrivate(const uint8* codec_private, uint64 length) {
+bool Track::SetCodecPrivate(const uint8_t* codec_private, uint64_t length) {
   if (!codec_private || length < 1)
     return false;
 
   delete[] codec_private_;
 
   codec_private_ =
-      new (std::nothrow) uint8[static_cast<size_t>(length)];  // NOLINT
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
   if (!codec_private_)
     return false;
 
@@ -844,6 +888,279 @@ void Track::set_name(const char* name) {
 
 ///////////////////////////////////////////////////////////////
 //
+// Colour and its child elements
+
+uint64_t PrimaryChromaticity::PrimaryChromaticityPayloadSize(
+    libwebm::MkvId x_id, libwebm::MkvId y_id) const {
+  return EbmlElementSize(x_id, x) + EbmlElementSize(y_id, y);
+}
+
+bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
+                                libwebm::MkvId y_id) const {
+  return WriteEbmlElement(writer, x_id, x) && WriteEbmlElement(writer, y_id, y);
+}
+
+uint64_t MasteringMetadata::MasteringMetadataSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvMasteringMetadata, size);
+
+  return size;
+}
+
+bool MasteringMetadata::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size))
+    return false;
+  if (luminance_max != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max)) {
+    return false;
+  }
+  if (luminance_min != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min)) {
+    return false;
+  }
+  if (r_ &&
+      !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+                 libwebm::kMkvPrimaryRChromaticityY)) {
+    return false;
+  }
+  if (g_ &&
+      !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+                 libwebm::kMkvPrimaryGChromaticityY)) {
+    return false;
+  }
+  if (b_ &&
+      !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+                 libwebm::kMkvPrimaryBChromaticityY)) {
+    return false;
+  }
+  if (white_point_ &&
+      !white_point_->Write(writer, libwebm::kMkvWhitePointChromaticityX,
+                           libwebm::kMkvWhitePointChromaticityY)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool MasteringMetadata::SetChromaticity(
+    const PrimaryChromaticity* r, const PrimaryChromaticity* g,
+    const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) {
+  PrimaryChromaticityPtr r_ptr(NULL);
+  if (r) {
+    if (!CopyChromaticity(r, &r_ptr))
+      return false;
+  }
+  PrimaryChromaticityPtr g_ptr(NULL);
+  if (g) {
+    if (!CopyChromaticity(g, &g_ptr))
+      return false;
+  }
+  PrimaryChromaticityPtr b_ptr(NULL);
+  if (b) {
+    if (!CopyChromaticity(b, &b_ptr))
+      return false;
+  }
+  PrimaryChromaticityPtr wp_ptr(NULL);
+  if (white_point) {
+    if (!CopyChromaticity(white_point, &wp_ptr))
+      return false;
+  }
+
+  r_ = r_ptr.release();
+  g_ = g_ptr.release();
+  b_ = b_ptr.release();
+  white_point_ = wp_ptr.release();
+  return true;
+}
+
+uint64_t MasteringMetadata::PayloadSize() const {
+  uint64_t size = 0;
+
+  if (luminance_max != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max);
+  if (luminance_min != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min);
+
+  if (r_) {
+    size += r_->PrimaryChromaticityPayloadSize(
+        libwebm::kMkvPrimaryRChromaticityX, libwebm::kMkvPrimaryRChromaticityY);
+  }
+  if (g_) {
+    size += g_->PrimaryChromaticityPayloadSize(
+        libwebm::kMkvPrimaryGChromaticityX, libwebm::kMkvPrimaryGChromaticityY);
+  }
+  if (b_) {
+    size += b_->PrimaryChromaticityPayloadSize(
+        libwebm::kMkvPrimaryBChromaticityX, libwebm::kMkvPrimaryBChromaticityY);
+  }
+  if (white_point_) {
+    size += white_point_->PrimaryChromaticityPayloadSize(
+        libwebm::kMkvWhitePointChromaticityX,
+        libwebm::kMkvWhitePointChromaticityY);
+  }
+
+  return size;
+}
+
+uint64_t Colour::ColourSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvColour, size);
+
+  return size;
+}
+
+bool Colour::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size))
+    return false;
+
+  if (matrix_coefficients != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients,
+                        matrix_coefficients)) {
+    return false;
+  }
+  if (bits_per_channel != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel,
+                        bits_per_channel)) {
+    return false;
+  }
+  if (chroma_subsampling_horz != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz,
+                        chroma_subsampling_horz)) {
+    return false;
+  }
+  if (chroma_subsampling_vert != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert,
+                        chroma_subsampling_vert)) {
+    return false;
+  }
+
+  if (cb_subsampling_horz != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz,
+                        cb_subsampling_horz)) {
+    return false;
+  }
+  if (cb_subsampling_vert != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert,
+                        cb_subsampling_vert)) {
+    return false;
+  }
+  if (chroma_siting_horz != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz,
+                        chroma_siting_horz)) {
+    return false;
+  }
+  if (chroma_siting_vert != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert,
+                        chroma_siting_vert)) {
+    return false;
+  }
+  if (range != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvRange, range)) {
+    return false;
+  }
+  if (transfer_characteristics != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics,
+                        transfer_characteristics)) {
+    return false;
+  }
+  if (primaries != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvPrimaries, primaries)) {
+    return false;
+  }
+  if (max_cll != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, max_cll)) {
+    return false;
+  }
+  if (max_fall != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, max_fall)) {
+    return false;
+  }
+
+  if (mastering_metadata_ && !mastering_metadata_->Write(writer))
+    return false;
+
+  return true;
+}
+
+bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
+  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  mm_ptr->luminance_max = mastering_metadata.luminance_max;
+  mm_ptr->luminance_min = mastering_metadata.luminance_min;
+
+  if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(),
+                               mastering_metadata.b(),
+                               mastering_metadata.white_point())) {
+    return false;
+  }
+
+  delete mastering_metadata_;
+  mastering_metadata_ = mm_ptr.release();
+  return true;
+}
+
+uint64_t Colour::PayloadSize() const {
+  uint64_t size = 0;
+
+  if (matrix_coefficients != kValueNotPresent)
+    size +=
+        EbmlElementSize(libwebm::kMkvMatrixCoefficients, matrix_coefficients);
+  if (bits_per_channel != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvBitsPerChannel, bits_per_channel);
+  if (chroma_subsampling_horz != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz,
+                            chroma_subsampling_horz);
+  if (chroma_subsampling_vert != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert,
+                            chroma_subsampling_vert);
+  if (cb_subsampling_horz != kValueNotPresent)
+    size +=
+        EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz);
+  if (cb_subsampling_vert != kValueNotPresent)
+    size +=
+        EbmlElementSize(libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert);
+  if (chroma_siting_horz != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, chroma_siting_horz);
+  if (chroma_siting_vert != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvChromaSitingVert, chroma_siting_vert);
+  if (range != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvRange, range);
+  if (transfer_characteristics != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvTransferCharacteristics,
+                            transfer_characteristics);
+  if (primaries != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvPrimaries, primaries);
+  if (max_cll != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvMaxCLL, max_cll);
+  if (max_fall != kValueNotPresent)
+    size += EbmlElementSize(libwebm::kMkvMaxFALL, max_fall);
+
+  if (mastering_metadata_)
+    size += mastering_metadata_->MasteringMetadataSize();
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
 // VideoTrack Class
 
 VideoTrack::VideoTrack(unsigned int* seed)
@@ -858,11 +1175,12 @@ VideoTrack::VideoTrack(unsigned int* seed)
       height_(0),
       stereo_mode_(0),
       alpha_mode_(0),
-      width_(0) {}
+      width_(0),
+      colour_(NULL) {}
 
-VideoTrack::~VideoTrack() {}
+VideoTrack::~VideoTrack() { delete colour_; }
 
-bool VideoTrack::SetStereoMode(uint64 stereo_mode) {
+bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
   if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
       stereo_mode != kTopBottomRightIsFirst &&
       stereo_mode != kTopBottomLeftIsFirst &&
@@ -873,7 +1191,7 @@ bool VideoTrack::SetStereoMode(uint64 stereo_mode) {
   return true;
 }
 
-bool VideoTrack::SetAlphaMode(uint64 alpha_mode) {
+bool VideoTrack::SetAlphaMode(uint64_t alpha_mode) {
   if (alpha_mode != kNoAlpha && alpha_mode != kAlpha)
     return false;
 
@@ -881,11 +1199,11 @@ bool VideoTrack::SetAlphaMode(uint64 alpha_mode) {
   return true;
 }
 
-uint64 VideoTrack::PayloadSize() const {
-  const uint64 parent_size = Track::PayloadSize();
+uint64_t VideoTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
 
-  uint64 size = VideoPayloadSize();
-  size += EbmlMasterElementSize(kMkvVideo, size);
+  uint64_t size = VideoPayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvVideo, size);
 
   return parent_size + size;
 }
@@ -894,88 +1212,122 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
   if (!Track::Write(writer))
     return false;
 
-  const uint64 size = VideoPayloadSize();
+  const uint64_t size = VideoPayloadSize();
 
-  if (!WriteEbmlMasterElement(writer, kMkvVideo, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvVideo, size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvPixelWidth, width_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, width_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvPixelHeight, height_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, height_))
     return false;
   if (display_width_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvDisplayWidth, display_width_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, display_width_))
       return false;
   }
   if (display_height_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvDisplayHeight, display_height_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, display_height_))
       return false;
   }
   if (crop_left_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvPixelCropLeft, crop_left_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, crop_left_))
       return false;
   }
   if (crop_right_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvPixelCropRight, crop_right_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, crop_right_))
       return false;
   }
   if (crop_top_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvPixelCropTop, crop_top_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, crop_top_))
       return false;
   }
   if (crop_bottom_ > 0) {
-    if (!WriteEbmlElement(writer, kMkvPixelCropBottom, crop_bottom_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, crop_bottom_))
       return false;
   }
   if (stereo_mode_ > kMono) {
-    if (!WriteEbmlElement(writer, kMkvStereoMode, stereo_mode_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, stereo_mode_))
       return false;
   }
   if (alpha_mode_ > kNoAlpha) {
-    if (!WriteEbmlElement(writer, kMkvAlphaMode, alpha_mode_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, alpha_mode_))
       return false;
   }
   if (frame_rate_ > 0.0) {
-    if (!WriteEbmlElement(writer, kMkvFrameRate,
+    if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
                           static_cast<float>(frame_rate_))) {
       return false;
     }
   }
+  if (colour_) {
+    if (!colour_->Write(writer))
+      return false;
+  }
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size)) {
+      stop_position - payload_position != static_cast<int64_t>(size)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool VideoTrack::SetColour(const Colour& colour) {
+  std::auto_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
     return false;
+
+  if (colour.mastering_metadata()) {
+    if (!colour_ptr->SetMasteringMetadata(*colour.mastering_metadata()))
+      return false;
   }
 
+  colour_ptr->matrix_coefficients = colour.matrix_coefficients;
+  colour_ptr->bits_per_channel = colour.bits_per_channel;
+  colour_ptr->chroma_subsampling_horz = colour.chroma_subsampling_horz;
+  colour_ptr->chroma_subsampling_vert = colour.chroma_subsampling_vert;
+  colour_ptr->cb_subsampling_horz = colour.cb_subsampling_horz;
+  colour_ptr->cb_subsampling_vert = colour.cb_subsampling_vert;
+  colour_ptr->chroma_siting_horz = colour.chroma_siting_horz;
+  colour_ptr->chroma_siting_vert = colour.chroma_siting_vert;
+  colour_ptr->range = colour.range;
+  colour_ptr->transfer_characteristics = colour.transfer_characteristics;
+  colour_ptr->primaries = colour.primaries;
+  colour_ptr->max_cll = colour.max_cll;
+  colour_ptr->max_fall = colour.max_fall;
+  colour_ = colour_ptr.release();
   return true;
 }
 
-uint64 VideoTrack::VideoPayloadSize() const {
-  uint64 size = EbmlElementSize(kMkvPixelWidth, width_);
-  size += EbmlElementSize(kMkvPixelHeight, height_);
+uint64_t VideoTrack::VideoPayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvPixelWidth, width_);
+  size += EbmlElementSize(libwebm::kMkvPixelHeight, height_);
   if (display_width_ > 0)
-    size += EbmlElementSize(kMkvDisplayWidth, display_width_);
+    size += EbmlElementSize(libwebm::kMkvDisplayWidth, display_width_);
   if (display_height_ > 0)
-    size += EbmlElementSize(kMkvDisplayHeight, display_height_);
+    size += EbmlElementSize(libwebm::kMkvDisplayHeight, display_height_);
   if (crop_left_ > 0)
-    size += EbmlElementSize(kMkvPixelCropLeft, crop_left_);
+    size += EbmlElementSize(libwebm::kMkvPixelCropLeft, crop_left_);
   if (crop_right_ > 0)
-    size += EbmlElementSize(kMkvPixelCropRight, crop_right_);
+    size += EbmlElementSize(libwebm::kMkvPixelCropRight, crop_right_);
   if (crop_top_ > 0)
-    size += EbmlElementSize(kMkvPixelCropTop, crop_top_);
+    size += EbmlElementSize(libwebm::kMkvPixelCropTop, crop_top_);
   if (crop_bottom_ > 0)
-    size += EbmlElementSize(kMkvPixelCropBottom, crop_bottom_);
+    size += EbmlElementSize(libwebm::kMkvPixelCropBottom, crop_bottom_);
   if (stereo_mode_ > kMono)
-    size += EbmlElementSize(kMkvStereoMode, stereo_mode_);
+    size += EbmlElementSize(libwebm::kMkvStereoMode, stereo_mode_);
   if (alpha_mode_ > kNoAlpha)
-    size += EbmlElementSize(kMkvAlphaMode, alpha_mode_);
+    size += EbmlElementSize(libwebm::kMkvAlphaMode, alpha_mode_);
   if (frame_rate_ > 0.0)
-    size += EbmlElementSize(kMkvFrameRate, static_cast<float>(frame_rate_));
+    size += EbmlElementSize(libwebm::kMkvFrameRate,
+                            static_cast<float>(frame_rate_));
+  if (colour_)
+    size += colour_->ColourSize();
 
   return size;
 }
@@ -989,15 +1341,15 @@ AudioTrack::AudioTrack(unsigned int* seed)
 
 AudioTrack::~AudioTrack() {}
 
-uint64 AudioTrack::PayloadSize() const {
-  const uint64 parent_size = Track::PayloadSize();
+uint64_t AudioTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
 
-  uint64 size =
-      EbmlElementSize(kMkvSamplingFrequency, static_cast<float>(sample_rate_));
-  size += EbmlElementSize(kMkvChannels, channels_);
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size += EbmlElementSize(libwebm::kMkvChannels, channels_);
   if (bit_depth_ > 0)
-    size += EbmlElementSize(kMkvBitDepth, bit_depth_);
-  size += EbmlMasterElementSize(kMkvAudio, size);
+    size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_);
+  size += EbmlMasterElementSize(libwebm::kMkvAudio, size);
 
   return parent_size + size;
 }
@@ -1007,31 +1359,31 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
     return false;
 
   // Calculate AudioSettings size.
-  uint64 size =
-      EbmlElementSize(kMkvSamplingFrequency, static_cast<float>(sample_rate_));
-  size += EbmlElementSize(kMkvChannels, channels_);
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size += EbmlElementSize(libwebm::kMkvChannels, channels_);
   if (bit_depth_ > 0)
-    size += EbmlElementSize(kMkvBitDepth, bit_depth_);
+    size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_);
 
-  if (!WriteEbmlMasterElement(writer, kMkvAudio, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvSamplingFrequency,
+  if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
                         static_cast<float>(sample_rate_)))
     return false;
-  if (!WriteEbmlElement(writer, kMkvChannels, channels_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChannels, channels_))
     return false;
   if (bit_depth_ > 0)
-    if (!WriteEbmlElement(writer, kMkvBitDepth, bit_depth_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, bit_depth_))
       return false;
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
   return true;
@@ -1047,11 +1399,12 @@ const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
 const char Tracks::kVp10CodecId[] = "V_VP10";
 
-Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0) {}
+Tracks::Tracks()
+    : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {}
 
 Tracks::~Tracks() {
   if (track_entries_) {
-    for (uint32 i = 0; i < track_entries_size_; ++i) {
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
       Track* const track = track_entries_[i];
       delete track;
     }
@@ -1059,8 +1412,8 @@ Tracks::~Tracks() {
   }
 }
 
-bool Tracks::AddTrack(Track* track, int32 number) {
-  if (number < 0)
+bool Tracks::AddTrack(Track* track, int32_t number) {
+  if (number < 0 || wrote_tracks_)
     return false;
 
   // This muxer only supports track numbers in the range [1, 126], in
@@ -1071,23 +1424,23 @@ bool Tracks::AddTrack(Track* track, int32 number) {
   if (number > 0x7E)
     return false;
 
-  uint32 track_num = number;
+  uint32_t track_num = number;
 
   if (track_num > 0) {
     // Check to make sure a track does not already have |track_num|.
-    for (uint32 i = 0; i < track_entries_size_; ++i) {
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
       if (track_entries_[i]->number() == track_num)
         return false;
     }
   }
 
-  const uint32 count = track_entries_size_ + 1;
+  const uint32_t count = track_entries_size_ + 1;
 
   Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
   if (!track_entries)
     return false;
 
-  for (uint32 i = 0; i < track_entries_size_; ++i) {
+  for (uint32_t i = 0; i < track_entries_size_; ++i) {
     track_entries[i] = track_entries_[i];
   }
 
@@ -1101,7 +1454,7 @@ bool Tracks::AddTrack(Track* track, int32 number) {
     bool exit = false;
     do {
       exit = true;
-      for (uint32 i = 0; i < track_entries_size_; ++i) {
+      for (uint32_t i = 0; i < track_entries_size_; ++i) {
         if (track_entries[i]->number() == track_num) {
           track_num++;
           exit = false;
@@ -1118,7 +1471,7 @@ bool Tracks::AddTrack(Track* track, int32 number) {
   return true;
 }
 
-const Track* Tracks::GetTrackByIndex(uint32 index) const {
+const Track* Tracks::GetTrackByIndex(uint32_t index) const {
   if (track_entries_ == NULL)
     return NULL;
 
@@ -1128,9 +1481,9 @@ const Track* Tracks::GetTrackByIndex(uint32 index) const {
   return track_entries_[index];
 }
 
-Track* Tracks::GetTrackByNumber(uint64 track_number) const {
-  const int32 count = track_entries_size();
-  for (int32 i = 0; i < count; ++i) {
+Track* Tracks::GetTrackByNumber(uint64_t track_number) const {
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
     if (track_entries_[i]->number() == track_number)
       return track_entries_[i];
   }
@@ -1138,7 +1491,7 @@ Track* Tracks::GetTrackByNumber(uint64 track_number) const {
   return NULL;
 }
 
-bool Tracks::TrackIsAudio(uint64 track_number) const {
+bool Tracks::TrackIsAudio(uint64_t track_number) const {
   const Track* const track = GetTrackByNumber(track_number);
 
   if (track->type() == kAudio)
@@ -1147,7 +1500,7 @@ bool Tracks::TrackIsAudio(uint64 track_number) const {
   return false;
 }
 
-bool Tracks::TrackIsVideo(uint64 track_number) const {
+bool Tracks::TrackIsVideo(uint64_t track_number) const {
   const Track* const track = GetTrackByNumber(track_number);
 
   if (track->type() == kVideo)
@@ -1157,9 +1510,9 @@ bool Tracks::TrackIsVideo(uint64 track_number) const {
 }
 
 bool Tracks::Write(IMkvWriter* writer) const {
-  uint64 size = 0;
-  const int32 count = track_entries_size();
-  for (int32 i = 0; i < count; ++i) {
+  uint64_t size = 0;
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
     const Track* const track = GetTrackByIndex(i);
 
     if (!track)
@@ -1168,24 +1521,25 @@ bool Tracks::Write(IMkvWriter* writer) const {
     size += track->Size();
   }
 
-  if (!WriteEbmlMasterElement(writer, kMkvTracks, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTracks, size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  for (int32 i = 0; i < count; ++i) {
+  for (int32_t i = 0; i < count; ++i) {
     const Track* const track = GetTrackByIndex(i);
     if (!track->Write(writer))
       return false;
   }
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
+  wrote_tracks_ = true;
   return true;
 }
 
@@ -1195,9 +1549,10 @@ bool Tracks::Write(IMkvWriter* writer) const {
 
 bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }
 
-void Chapter::set_time(const Segment& segment, uint64 start_ns, uint64 end_ns) {
+void Chapter::set_time(const Segment& segment, uint64_t start_ns,
+                       uint64_t end_ns) {
   const SegmentInfo* const info = segment.GetSegmentInfo();
-  const uint64 timecode_scale = info->timecode_scale();
+  const uint64_t timecode_scale = info->timecode_scale();
   start_timecode_ = start_ns / timecode_scale;
   end_timecode_ = end_ns / timecode_scale;
 }
@@ -1292,38 +1647,40 @@ bool Chapter::ExpandDisplaysArray() {
   return true;
 }
 
-uint64 Chapter::WriteAtom(IMkvWriter* writer) const {
-  uint64 payload_size = EbmlElementSize(kMkvChapterStringUID, id_) +
-                        EbmlElementSize(kMkvChapterUID, uid_) +
-                        EbmlElementSize(kMkvChapterTimeStart, start_timecode_) +
-                        EbmlElementSize(kMkvChapterTimeEnd, end_timecode_);
+uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
+  uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvChapterStringUID, id_) +
+      EbmlElementSize(libwebm::kMkvChapterUID, uid_) +
+      EbmlElementSize(libwebm::kMkvChapterTimeStart, start_timecode_) +
+      EbmlElementSize(libwebm::kMkvChapterTimeEnd, end_timecode_);
 
   for (int idx = 0; idx < displays_count_; ++idx) {
     const Display& d = displays_[idx];
     payload_size += d.WriteDisplay(NULL);
   }
 
-  const uint64 atom_size =
-      EbmlMasterElementSize(kMkvChapterAtom, payload_size) + payload_size;
+  const uint64_t atom_size =
+      EbmlMasterElementSize(libwebm::kMkvChapterAtom, payload_size) +
+      payload_size;
 
   if (writer == NULL)
     return atom_size;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
-  if (!WriteEbmlMasterElement(writer, kMkvChapterAtom, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterAtom, payload_size))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvChapterStringUID, id_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvChapterUID, uid_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, uid_))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvChapterTimeStart, start_timecode_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, start_timecode_))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvChapterTimeEnd, end_timecode_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, end_timecode_))
     return 0;
 
   for (int idx = 0; idx < displays_count_; ++idx) {
@@ -1333,9 +1690,9 @@ uint64 Chapter::WriteAtom(IMkvWriter* writer) const {
       return 0;
   }
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != atom_size)
+  if (stop >= start && uint64_t(stop - start) != atom_size)
     return 0;
 
   return atom_size;
@@ -1365,42 +1722,44 @@ bool Chapter::Display::set_country(const char* country) {
   return StrCpy(country, &country_);
 }
 
-uint64 Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
-  uint64 payload_size = EbmlElementSize(kMkvChapString, title_);
+uint64_t Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+  uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapString, title_);
 
   if (language_)
-    payload_size += EbmlElementSize(kMkvChapLanguage, language_);
+    payload_size += EbmlElementSize(libwebm::kMkvChapLanguage, language_);
 
   if (country_)
-    payload_size += EbmlElementSize(kMkvChapCountry, country_);
+    payload_size += EbmlElementSize(libwebm::kMkvChapCountry, country_);
 
-  const uint64 display_size =
-      EbmlMasterElementSize(kMkvChapterDisplay, payload_size) + payload_size;
+  const uint64_t display_size =
+      EbmlMasterElementSize(libwebm::kMkvChapterDisplay, payload_size) +
+      payload_size;
 
   if (writer == NULL)
     return display_size;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
-  if (!WriteEbmlMasterElement(writer, kMkvChapterDisplay, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterDisplay,
+                              payload_size))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvChapString, title_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapString, title_))
     return 0;
 
   if (language_) {
-    if (!WriteEbmlElement(writer, kMkvChapLanguage, language_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvChapLanguage, language_))
       return 0;
   }
 
   if (country_) {
-    if (!WriteEbmlElement(writer, kMkvChapCountry, country_))
+    if (!WriteEbmlElement(writer, libwebm::kMkvChapCountry, country_))
       return 0;
   }
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != display_size)
+  if (stop >= start && uint64_t(stop - start) != display_size)
     return 0;
 
   return display_size;
@@ -1438,19 +1797,19 @@ bool Chapters::Write(IMkvWriter* writer) const {
   if (writer == NULL)
     return false;
 
-  const uint64 payload_size = WriteEdition(NULL);  // return size only
+  const uint64_t payload_size = WriteEdition(NULL);  // return size only
 
-  if (!WriteEbmlMasterElement(writer, kMkvChapters, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapters, payload_size))
     return false;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
   if (WriteEdition(writer) == 0)  // error
     return false;
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != payload_size)
+  if (stop >= start && uint64_t(stop - start) != payload_size)
     return false;
 
   return true;
@@ -1480,36 +1839,37 @@ bool Chapters::ExpandChaptersArray() {
   return true;
 }
 
-uint64 Chapters::WriteEdition(IMkvWriter* writer) const {
-  uint64 payload_size = 0;
+uint64_t Chapters::WriteEdition(IMkvWriter* writer) const {
+  uint64_t payload_size = 0;
 
   for (int idx = 0; idx < chapters_count_; ++idx) {
     const Chapter& chapter = chapters_[idx];
     payload_size += chapter.WriteAtom(NULL);
   }
 
-  const uint64 edition_size =
-      EbmlMasterElementSize(kMkvEditionEntry, payload_size) + payload_size;
+  const uint64_t edition_size =
+      EbmlMasterElementSize(libwebm::kMkvEditionEntry, payload_size) +
+      payload_size;
 
   if (writer == NULL)  // return size only
     return edition_size;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
-  if (!WriteEbmlMasterElement(writer, kMkvEditionEntry, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvEditionEntry, payload_size))
     return 0;  // error
 
   for (int idx = 0; idx < chapters_count_; ++idx) {
     const Chapter& chapter = chapters_[idx];
 
-    const uint64 chapter_size = chapter.WriteAtom(writer);
+    const uint64_t chapter_size = chapter.WriteAtom(writer);
     if (chapter_size == 0)  // error
       return 0;
   }
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != edition_size)
+  if (stop >= start && uint64_t(stop - start) != edition_size)
     return 0;
 
   return edition_size;
@@ -1581,23 +1941,23 @@ bool Tag::ExpandSimpleTagsArray() {
   return true;
 }
 
-uint64 Tag::Write(IMkvWriter* writer) const {
-  uint64 payload_size = 0;
+uint64_t Tag::Write(IMkvWriter* writer) const {
+  uint64_t payload_size = 0;
 
   for (int idx = 0; idx < simple_tags_count_; ++idx) {
     const SimpleTag& st = simple_tags_[idx];
     payload_size += st.Write(NULL);
   }
 
-  const uint64 tag_size =
-      EbmlMasterElementSize(kMkvTag, payload_size) + payload_size;
+  const uint64_t tag_size =
+      EbmlMasterElementSize(libwebm::kMkvTag, payload_size) + payload_size;
 
   if (writer == NULL)
     return tag_size;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
-  if (!WriteEbmlMasterElement(writer, kMkvTag, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTag, payload_size))
     return 0;
 
   for (int idx = 0; idx < simple_tags_count_; ++idx) {
@@ -1607,9 +1967,9 @@ uint64 Tag::Write(IMkvWriter* writer) const {
       return 0;
   }
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != tag_size)
+  if (stop >= start && uint64_t(stop - start) != tag_size)
     return 0;
 
   return tag_size;
@@ -1635,31 +1995,32 @@ bool Tag::SimpleTag::set_tag_string(const char* tag_string) {
   return StrCpy(tag_string, &tag_string_);
 }
 
-uint64 Tag::SimpleTag::Write(IMkvWriter* writer) const {
-  uint64 payload_size = EbmlElementSize(kMkvTagName, tag_name_);
+uint64_t Tag::SimpleTag::Write(IMkvWriter* writer) const {
+  uint64_t payload_size = EbmlElementSize(libwebm::kMkvTagName, tag_name_);
 
-  payload_size += EbmlElementSize(kMkvTagString, tag_string_);
+  payload_size += EbmlElementSize(libwebm::kMkvTagString, tag_string_);
 
-  const uint64 simple_tag_size =
-      EbmlMasterElementSize(kMkvSimpleTag, payload_size) + payload_size;
+  const uint64_t simple_tag_size =
+      EbmlMasterElementSize(libwebm::kMkvSimpleTag, payload_size) +
+      payload_size;
 
   if (writer == NULL)
     return simple_tag_size;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
-  if (!WriteEbmlMasterElement(writer, kMkvSimpleTag, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvSimpleTag, payload_size))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvTagName, tag_name_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTagName, tag_name_))
     return 0;
 
-  if (!WriteEbmlElement(writer, kMkvTagString, tag_string_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTagString, tag_string_))
     return 0;
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != simple_tag_size)
+  if (stop >= start && uint64_t(stop - start) != simple_tag_size)
     return 0;
 
   return simple_tag_size;
@@ -1694,29 +2055,29 @@ bool Tags::Write(IMkvWriter* writer) const {
   if (writer == NULL)
     return false;
 
-  uint64 payload_size = 0;
+  uint64_t payload_size = 0;
 
   for (int idx = 0; idx < tags_count_; ++idx) {
     const Tag& tag = tags_[idx];
     payload_size += tag.Write(NULL);
   }
 
-  if (!WriteEbmlMasterElement(writer, kMkvTags, payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTags, payload_size))
     return false;
 
-  const int64 start = writer->Position();
+  const int64_t start = writer->Position();
 
   for (int idx = 0; idx < tags_count_; ++idx) {
     const Tag& tag = tags_[idx];
 
-    const uint64 tag_size = tag.Write(writer);
+    const uint64_t tag_size = tag.Write(writer);
     if (tag_size == 0)  // error
       return 0;
   }
 
-  const int64 stop = writer->Position();
+  const int64_t stop = writer->Position();
 
-  if (stop >= start && uint64(stop - start) != payload_size)
+  if (stop >= start && uint64_t(stop - start) != payload_size)
     return false;
 
   return true;
@@ -1750,15 +2111,18 @@ bool Tags::ExpandTagsArray() {
 //
 // Cluster class
 
-Cluster::Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale)
+Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+                 bool write_last_frame_with_duration, bool fixed_size_timecode)
     : blocks_added_(0),
       finalized_(false),
+      fixed_size_timecode_(fixed_size_timecode),
       header_written_(false),
       payload_size_(0),
       position_for_cues_(cues_pos),
       size_position_(-1),
       timecode_(timecode),
       timecode_scale_(timecode_scale),
+      write_last_frame_with_duration_(write_last_frame_with_duration),
       writer_(NULL) {}
 
 Cluster::~Cluster() {}
@@ -1771,24 +2135,27 @@ bool Cluster::Init(IMkvWriter* ptr_writer) {
   return true;
 }
 
-bool Cluster::AddFrame(const Frame* const frame) { return DoWriteFrame(frame); }
+bool Cluster::AddFrame(const Frame* const frame) {
+  return QueueOrWriteFrame(frame);
+}
 
-bool Cluster::AddFrame(const uint8* data, uint64 length, uint64 track_number,
-                       uint64 abs_timecode, bool is_key) {
+bool Cluster::AddFrame(const uint8_t* data, uint64_t length,
+                       uint64_t track_number, uint64_t abs_timecode,
+                       bool is_key) {
   Frame frame;
   if (!frame.Init(data, length))
     return false;
   frame.set_track_number(track_number);
   frame.set_timestamp(abs_timecode);
   frame.set_is_key(is_key);
-  return DoWriteFrame(&frame);
+  return QueueOrWriteFrame(&frame);
 }
 
-bool Cluster::AddFrameWithAdditional(const uint8* data, uint64 length,
-                                     const uint8* additional,
-                                     uint64 additional_length, uint64 add_id,
-                                     uint64 track_number, uint64 abs_timecode,
-                                     bool is_key) {
+bool Cluster::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                                     const uint8_t* additional,
+                                     uint64_t additional_length,
+                                     uint64_t add_id, uint64_t track_number,
+                                     uint64_t abs_timecode, bool is_key) {
   if (!additional || additional_length == 0) {
     return false;
   }
@@ -1800,13 +2167,13 @@ bool Cluster::AddFrameWithAdditional(const uint8* data, uint64 length,
   frame.set_track_number(track_number);
   frame.set_timestamp(abs_timecode);
   frame.set_is_key(is_key);
-  return DoWriteFrame(&frame);
+  return QueueOrWriteFrame(&frame);
 }
 
-bool Cluster::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
-                                         int64 discard_padding,
-                                         uint64 track_number,
-                                         uint64 abs_timecode, bool is_key) {
+bool Cluster::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                         int64_t discard_padding,
+                                         uint64_t track_number,
+                                         uint64_t abs_timecode, bool is_key) {
   Frame frame;
   if (!frame.Init(data, length))
     return false;
@@ -1814,11 +2181,12 @@ bool Cluster::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
   frame.set_track_number(track_number);
   frame.set_timestamp(abs_timecode);
   frame.set_is_key(is_key);
-  return DoWriteFrame(&frame);
+  return QueueOrWriteFrame(&frame);
 }
 
-bool Cluster::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
-                          uint64 abs_timecode, uint64 duration_timecode) {
+bool Cluster::AddMetadata(const uint8_t* data, uint64_t length,
+                          uint64_t track_number, uint64_t abs_timecode,
+                          uint64_t duration_timecode) {
   Frame frame;
   if (!frame.Init(data, length))
     return false;
@@ -1826,17 +2194,62 @@ bool Cluster::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
   frame.set_timestamp(abs_timecode);
   frame.set_duration(duration_timecode);
   frame.set_is_key(true);  // All metadata blocks are keyframes.
-  return DoWriteFrame(&frame);
+  return QueueOrWriteFrame(&frame);
 }
 
-void Cluster::AddPayloadSize(uint64 size) { payload_size_ += size; }
+void Cluster::AddPayloadSize(uint64_t size) { payload_size_ += size; }
 
 bool Cluster::Finalize() {
-  if (!writer_ || finalized_ || size_position_ == -1)
+  return !write_last_frame_with_duration_ && Finalize(false, 0);
+}
+
+bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) {
+  if (!writer_ || finalized_)
+    return false;
+
+  if (write_last_frame_with_duration_) {
+    // Write out held back Frames. This essentially performs a k-way merge
+    // across all tracks in the increasing order of timestamps.
+    while (!stored_frames_.empty()) {
+      Frame* frame = stored_frames_.begin()->second.front();
+
+      // Get the next frame to write (frame with least timestamp across all
+      // tracks).
+      for (FrameMapIterator frames_iterator = ++stored_frames_.begin();
+           frames_iterator != stored_frames_.end(); ++frames_iterator) {
+        if (frames_iterator->second.front()->timestamp() < frame->timestamp()) {
+          frame = frames_iterator->second.front();
+        }
+      }
+
+      // Set the duration if it's the last frame for the track.
+      if (set_last_frame_duration &&
+          stored_frames_[frame->track_number()].size() == 1 &&
+          !frame->duration_set()) {
+        frame->set_duration(duration - frame->timestamp());
+        if (!frame->is_key() && !frame->reference_block_timestamp_set()) {
+          frame->set_reference_block_timestamp(
+              last_block_timestamp_[frame->track_number()]);
+        }
+      }
+
+      // Write the frame and remove it from |stored_frames_|.
+      const bool wrote_frame = DoWriteFrame(frame);
+      stored_frames_[frame->track_number()].pop_front();
+      if (stored_frames_[frame->track_number()].empty()) {
+        stored_frames_.erase(frame->track_number());
+      }
+      delete frame;
+      if (!wrote_frame)
+        return false;
+    }
+  }
+
+  if (size_position_ == -1)
     return false;
 
   if (writer_->Seekable()) {
-    const int64 pos = writer_->Position();
+    const int64_t pos = writer_->Position();
 
     if (writer_->Position(size_position_))
       return false;
@@ -1853,9 +2266,10 @@ bool Cluster::Finalize() {
   return true;
 }
 
-uint64 Cluster::Size() const {
-  const uint64 element_size =
-      EbmlMasterElementSize(kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + payload_size_;
+uint64_t Cluster::Size() const {
+  const uint64_t element_size =
+      EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) +
+      payload_size_;
   return element_size;
 }
 
@@ -1871,15 +2285,15 @@ bool Cluster::PreWriteBlock() {
   return true;
 }
 
-void Cluster::PostWriteBlock(uint64 element_size) {
+void Cluster::PostWriteBlock(uint64_t element_size) {
   AddPayloadSize(element_size);
   ++blocks_added_;
 }
 
-int64 Cluster::GetRelativeTimecode(int64 abs_timecode) const {
-  const int64 cluster_timecode = this->Cluster::timecode();
-  const int64 rel_timecode =
-      static_cast<int64>(abs_timecode) - cluster_timecode;
+int64_t Cluster::GetRelativeTimecode(int64_t abs_timecode) const {
+  const int64_t cluster_timecode = this->Cluster::timecode();
+  const int64_t rel_timecode =
+      static_cast<int64_t>(abs_timecode) - cluster_timecode;
 
   if (rel_timecode < 0 || rel_timecode > kMaxBlockTimecode)
     return -1;
@@ -1894,11 +2308,67 @@ bool Cluster::DoWriteFrame(const Frame* const frame) {
   if (!PreWriteBlock())
     return false;
 
-  const uint64 element_size = WriteFrame(writer_, frame, this);
+  const uint64_t element_size = WriteFrame(writer_, frame, this);
   if (element_size == 0)
     return false;
 
   PostWriteBlock(element_size);
+  last_block_timestamp_[frame->track_number()] = frame->timestamp();
+  return true;
+}
+
+bool Cluster::QueueOrWriteFrame(const Frame* const frame) {
+  if (!frame || !frame->IsValid())
+    return false;
+
+  // If |write_last_frame_with_duration_| is not set, then write the frame right
+  // away.
+  if (!write_last_frame_with_duration_) {
+    return DoWriteFrame(frame);
+  }
+
+  // Queue the current frame.
+  uint64_t track_number = frame->track_number();
+  Frame* const frame_to_store = new Frame();
+  frame_to_store->CopyFrom(*frame);
+  stored_frames_[track_number].push_back(frame_to_store);
+
+  // Iterate through all queued frames in the current track except the last one
+  // and write it if it is okay to do so (i.e.) no other track has an held back
+  // frame with timestamp <= the timestamp of the frame in question.
+  std::vector<std::list<Frame*>::iterator> frames_to_erase;
+  for (std::list<Frame *>::iterator
+           current_track_iterator = stored_frames_[track_number].begin(),
+           end = --stored_frames_[track_number].end();
+       current_track_iterator != end; ++current_track_iterator) {
+    const Frame* const frame_to_write = *current_track_iterator;
+    bool okay_to_write = true;
+    for (FrameMapIterator track_iterator = stored_frames_.begin();
+         track_iterator != stored_frames_.end(); ++track_iterator) {
+      if (track_iterator->first == track_number) {
+        continue;
+      }
+      if (track_iterator->second.front()->timestamp() <
+          frame_to_write->timestamp()) {
+        okay_to_write = false;
+        break;
+      }
+    }
+    if (okay_to_write) {
+      const bool wrote_frame = DoWriteFrame(frame_to_write);
+      delete frame_to_write;
+      if (!wrote_frame)
+        return false;
+      frames_to_erase.push_back(current_track_iterator);
+    } else {
+      break;
+    }
+  }
+  for (std::vector<std::list<Frame*>::iterator>::iterator iterator =
+           frames_to_erase.begin();
+       iterator != frames_to_erase.end(); ++iterator) {
+    stored_frames_[track_number].erase(*iterator);
+  }
   return true;
 }
 
@@ -1906,7 +2376,7 @@ bool Cluster::WriteClusterHeader() {
   if (finalized_)
     return false;
 
-  if (WriteID(writer_, kMkvCluster))
+  if (WriteID(writer_, libwebm::kMkvCluster))
     return false;
 
   // Save for later.
@@ -1917,9 +2387,12 @@ bool Cluster::WriteClusterHeader() {
   if (SerializeInt(writer_, kEbmlUnknownValue, 8))
     return false;
 
-  if (!WriteEbmlElement(writer_, kMkvTimecode, timecode()))
+  if (!WriteEbmlElement(writer_, libwebm::kMkvTimecode, timecode(),
+                        fixed_size_timecode_ ? 8 : 0)) {
     return false;
-  AddPayloadSize(EbmlElementSize(kMkvTimecode, timecode()));
+  }
+  AddPayloadSize(EbmlElementSize(libwebm::kMkvTimecode, timecode(),
+                                 fixed_size_timecode_ ? 8 : 0));
   header_written_ = true;
 
   return true;
@@ -1930,7 +2403,7 @@ bool Cluster::WriteClusterHeader() {
 // SeekHead Class
 
 SeekHead::SeekHead() : start_pos_(0ULL) {
-  for (int32 i = 0; i < kSeekEntryCount; ++i) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
     seek_entry_id_[i] = 0;
     seek_entry_pos_[i] = 0;
   }
@@ -1943,17 +2416,19 @@ bool SeekHead::Finalize(IMkvWriter* writer) const {
     if (start_pos_ == -1)
       return false;
 
-    uint64 payload_size = 0;
-    uint64 entry_size[kSeekEntryCount];
+    uint64_t payload_size = 0;
+    uint64_t entry_size[kSeekEntryCount];
 
-    for (int32 i = 0; i < kSeekEntryCount; ++i) {
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
       if (seek_entry_id_[i] != 0) {
-        entry_size[i] =
-            EbmlElementSize(kMkvSeekID, static_cast<uint64>(seek_entry_id_[i]));
-        entry_size[i] += EbmlElementSize(kMkvSeekPosition, seek_entry_pos_[i]);
+        entry_size[i] = EbmlElementSize(
+            libwebm::kMkvSeekID, static_cast<uint64_t>(seek_entry_id_[i]));
+        entry_size[i] +=
+            EbmlElementSize(libwebm::kMkvSeekPosition, seek_entry_pos_[i]);
 
         payload_size +=
-            EbmlMasterElementSize(kMkvSeek, entry_size[i]) + entry_size[i];
+            EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) +
+            entry_size[i];
       }
     }
 
@@ -1961,34 +2436,35 @@ bool SeekHead::Finalize(IMkvWriter* writer) const {
     if (payload_size == 0)
       return true;
 
-    const int64 pos = writer->Position();
+    const int64_t pos = writer->Position();
     if (writer->Position(start_pos_))
       return false;
 
-    if (!WriteEbmlMasterElement(writer, kMkvSeekHead, payload_size))
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeekHead, payload_size))
       return false;
 
-    for (int32 i = 0; i < kSeekEntryCount; ++i) {
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
       if (seek_entry_id_[i] != 0) {
-        if (!WriteEbmlMasterElement(writer, kMkvSeek, entry_size[i]))
+        if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeek, entry_size[i]))
           return false;
 
-        if (!WriteEbmlElement(writer, kMkvSeekID,
-                              static_cast<uint64>(seek_entry_id_[i])))
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekID,
+                              static_cast<uint64_t>(seek_entry_id_[i])))
           return false;
 
-        if (!WriteEbmlElement(writer, kMkvSeekPosition, seek_entry_pos_[i]))
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
+                              seek_entry_pos_[i]))
           return false;
       }
     }
 
-    const uint64 total_entry_size = kSeekEntryCount * MaxEntrySize();
-    const uint64 total_size =
-        EbmlMasterElementSize(kMkvSeekHead, total_entry_size) +
+    const uint64_t total_entry_size = kSeekEntryCount * MaxEntrySize();
+    const uint64_t total_size =
+        EbmlMasterElementSize(libwebm::kMkvSeekHead, total_entry_size) +
         total_entry_size;
-    const int64 size_left = total_size - (writer->Position() - start_pos_);
+    const int64_t size_left = total_size - (writer->Position() - start_pos_);
 
-    const uint64 bytes_written = WriteVoidElement(writer, size_left);
+    const uint64_t bytes_written = WriteVoidElement(writer, size_left);
     if (!bytes_written)
       return false;
 
@@ -2000,20 +2476,21 @@ bool SeekHead::Finalize(IMkvWriter* writer) const {
 }
 
 bool SeekHead::Write(IMkvWriter* writer) {
-  const uint64 entry_size = kSeekEntryCount * MaxEntrySize();
-  const uint64 size = EbmlMasterElementSize(kMkvSeekHead, entry_size);
+  const uint64_t entry_size = kSeekEntryCount * MaxEntrySize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvSeekHead, entry_size);
 
   start_pos_ = writer->Position();
 
-  const uint64 bytes_written = WriteVoidElement(writer, size + entry_size);
+  const uint64_t bytes_written = WriteVoidElement(writer, size + entry_size);
   if (!bytes_written)
     return false;
 
   return true;
 }
 
-bool SeekHead::AddSeekEntry(uint32 id, uint64 pos) {
-  for (int32 i = 0; i < kSeekEntryCount; ++i) {
+bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
     if (seek_entry_id_[i] == 0) {
       seek_entry_id_[i] = id;
       seek_entry_pos_[i] = pos;
@@ -2023,19 +2500,19 @@ bool SeekHead::AddSeekEntry(uint32 id, uint64 pos) {
   return false;
 }
 
-uint32 SeekHead::GetId(int index) const {
+uint32_t SeekHead::GetId(int index) const {
   if (index < 0 || index >= kSeekEntryCount)
     return UINT_MAX;
   return seek_entry_id_[index];
 }
 
-uint64 SeekHead::GetPosition(int index) const {
+uint64_t SeekHead::GetPosition(int index) const {
   if (index < 0 || index >= kSeekEntryCount)
     return ULLONG_MAX;
   return seek_entry_pos_[index];
 }
 
-bool SeekHead::SetSeekEntry(int index, uint32 id, uint64 position) {
+bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
   if (index < 0 || index >= kSeekEntryCount)
     return false;
   seek_entry_id_[index] = id;
@@ -2043,12 +2520,12 @@ bool SeekHead::SetSeekEntry(int index, uint32 id, uint64 position) {
   return true;
 }
 
-uint64 SeekHead::MaxEntrySize() const {
-  const uint64 max_entry_payload_size =
-      EbmlElementSize(kMkvSeekID, 0xffffffffULL) +
-      EbmlElementSize(kMkvSeekPosition, 0xffffffffffffffffULL);
-  const uint64 max_entry_size =
-      EbmlMasterElementSize(kMkvSeek, max_entry_payload_size) +
+uint64_t SeekHead::MaxEntrySize() const {
+  const uint64_t max_entry_payload_size =
+      EbmlElementSize(libwebm::kMkvSeekID, UINT64_C(0xffffffff)) +
+      EbmlElementSize(libwebm::kMkvSeekPosition, UINT64_C(0xffffffffffffffff));
+  const uint64_t max_entry_size =
+      EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) +
       max_entry_payload_size;
 
   return max_entry_size;
@@ -2072,10 +2549,10 @@ SegmentInfo::~SegmentInfo() {
 }
 
 bool SegmentInfo::Init() {
-  int32 major;
-  int32 minor;
-  int32 build;
-  int32 revision;
+  int32_t major;
+  int32_t minor;
+  int32_t build;
+  int32_t revision;
   GetVersion(&major, &minor, &build, &revision);
   char temp[256];
 #ifdef _MSC_VER
@@ -2115,12 +2592,12 @@ bool SegmentInfo::Finalize(IMkvWriter* writer) const {
       if (duration_pos_ == -1)
         return false;
 
-      const int64 pos = writer->Position();
+      const int64_t pos = writer->Position();
 
       if (writer->Position(duration_pos_))
         return false;
 
-      if (!WriteEbmlElement(writer, kMkvDuration,
+      if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
                             static_cast<float>(duration_)))
         return false;
 
@@ -2136,43 +2613,45 @@ bool SegmentInfo::Write(IMkvWriter* writer) {
   if (!writer || !muxing_app_ || !writing_app_)
     return false;
 
-  uint64 size = EbmlElementSize(kMkvTimecodeScale, timecode_scale_);
+  uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, timecode_scale_);
   if (duration_ > 0.0)
-    size += EbmlElementSize(kMkvDuration, static_cast<float>(duration_));
+    size +=
+        EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
   if (date_utc_ != LLONG_MIN)
-    size += EbmlDateElementSize(kMkvDateUTC);
-  size += EbmlElementSize(kMkvMuxingApp, muxing_app_);
-  size += EbmlElementSize(kMkvWritingApp, writing_app_);
+    size += EbmlDateElementSize(libwebm::kMkvDateUTC);
+  size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
+  size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
 
-  if (!WriteEbmlMasterElement(writer, kMkvInfo, size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvInfo, size))
     return false;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return false;
 
-  if (!WriteEbmlElement(writer, kMkvTimecodeScale, timecode_scale_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, timecode_scale_))
     return false;
 
   if (duration_ > 0.0) {
     // Save for later
     duration_pos_ = writer->Position();
 
-    if (!WriteEbmlElement(writer, kMkvDuration, static_cast<float>(duration_)))
+    if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                          static_cast<float>(duration_)))
       return false;
   }
 
   if (date_utc_ != LLONG_MIN)
-    WriteEbmlDateElement(writer, kMkvDateUTC, date_utc_);
+    WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_);
 
-  if (!WriteEbmlElement(writer, kMkvMuxingApp, muxing_app_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
     return false;
-  if (!WriteEbmlElement(writer, kMkvWritingApp, writing_app_))
+  if (!WriteEbmlElement(writer, libwebm::kMkvWritingApp, writing_app_))
     return false;
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(size))
+      stop_position - payload_position != static_cast<int64_t>(size))
     return false;
 
   return true;
@@ -2244,6 +2723,8 @@ Segment::Segment()
       mode_(kFile),
       new_cuepoint_(false),
       output_cues_(true),
+      accurate_cluster_duration_(false),
+      fixed_size_cluster_timecode_(false),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -2260,7 +2741,7 @@ Segment::Segment()
 
 Segment::~Segment() {
   if (cluster_list_) {
-    for (int32 i = 0; i < cluster_list_size_; ++i) {
+    for (int32_t i = 0; i < cluster_list_size_; ++i) {
       Cluster* const cluster = cluster_list_[i];
       delete cluster;
     }
@@ -2268,7 +2749,7 @@ Segment::~Segment() {
   }
 
   if (frames_) {
-    for (int32 i = 0; i < frames_size_; ++i) {
+    for (int32_t i = 0; i < frames_size_; ++i) {
       Frame* const frame = frames_[i];
       delete frame;
     }
@@ -2292,13 +2773,13 @@ Segment::~Segment() {
   }
 }
 
-void Segment::MoveCuesBeforeClustersHelper(uint64 diff, int32 index,
-                                           uint64* cues_size) {
+void Segment::MoveCuesBeforeClustersHelper(uint64_t diff, int32_t index,
+                                           uint64_t* cues_size) {
   CuePoint* const cue_point = cues_.GetCueByIndex(index);
   if (cue_point == NULL)
     return;
-  const uint64 old_cue_point_size = cue_point->Size();
-  const uint64 cluster_pos = cue_point->cluster_pos() + diff;
+  const uint64_t old_cue_point_size = cue_point->Size();
+  const uint64_t cluster_pos = cue_point->cluster_pos() + diff;
   cue_point->set_cluster_pos(cluster_pos);  // update the new cluster position
   // New size of the cue is computed as follows
   //    Let a = current sum of size of all CuePoints
@@ -2308,40 +2789,40 @@ void Segment::MoveCuesBeforeClustersHelper(uint64 diff, int32 index,
   //    Let d = b + c. Now d is the |diff| passed to the next recursive call.
   //    Let e = a + b. Now e is the |cues_size| passed to the next recursive
   //                   call.
-  const uint64 cue_point_size_diff = cue_point->Size() - old_cue_point_size;
-  const uint64 cue_size_diff =
+  const uint64_t cue_point_size_diff = cue_point->Size() - old_cue_point_size;
+  const uint64_t cue_size_diff =
       GetCodedUIntSize(*cues_size + cue_point_size_diff) -
       GetCodedUIntSize(*cues_size);
   *cues_size += cue_point_size_diff;
   diff = cue_size_diff + cue_point_size_diff;
   if (diff > 0) {
-    for (int32 i = 0; i < cues_.cue_entries_size(); ++i) {
+    for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) {
       MoveCuesBeforeClustersHelper(diff, i, cues_size);
     }
   }
 }
 
 void Segment::MoveCuesBeforeClusters() {
-  const uint64 current_cue_size = cues_.Size();
-  uint64 cue_size = 0;
-  for (int32 i = 0; i < cues_.cue_entries_size(); ++i)
+  const uint64_t current_cue_size = cues_.Size();
+  uint64_t cue_size = 0;
+  for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
     cue_size += cues_.GetCueByIndex(i)->Size();
-  for (int32 i = 0; i < cues_.cue_entries_size(); ++i)
+  for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
     MoveCuesBeforeClustersHelper(current_cue_size, i, &cue_size);
 
   // Adjust the Seek Entry to reflect the change in position
   // of Cluster and Cues
-  int32 cluster_index = 0;
-  int32 cues_index = 0;
-  for (int32 i = 0; i < SeekHead::kSeekEntryCount; ++i) {
-    if (seek_head_.GetId(i) == kMkvCluster)
+  int32_t cluster_index = 0;
+  int32_t cues_index = 0;
+  for (int32_t i = 0; i < SeekHead::kSeekEntryCount; ++i) {
+    if (seek_head_.GetId(i) == libwebm::kMkvCluster)
       cluster_index = i;
-    if (seek_head_.GetId(i) == kMkvCues)
+    if (seek_head_.GetId(i) == libwebm::kMkvCues)
       cues_index = i;
   }
-  seek_head_.SetSeekEntry(cues_index, kMkvCues,
+  seek_head_.SetSeekEntry(cues_index, libwebm::kMkvCues,
                           seek_head_.GetPosition(cluster_index));
-  seek_head_.SetSeekEntry(cluster_index, kMkvCluster,
+  seek_head_.SetSeekEntry(cluster_index, libwebm::kMkvCluster,
                           cues_.Size() + seek_head_.GetPosition(cues_index));
 }
 
@@ -2359,8 +2840,8 @@ bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
                                             IMkvWriter* writer) {
   if (!writer->Seekable() || chunking_)
     return false;
-  const int64 cluster_offset =
-      cluster_list_[0]->size_position() - GetUIntSize(kMkvCluster);
+  const int64_t cluster_offset =
+      cluster_list_[0]->size_position() - GetUIntSize(libwebm::kMkvCluster);
 
   // Copy the headers.
   if (!ChunkedCopy(reader, writer, 0, cluster_offset))
@@ -2383,8 +2864,8 @@ bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
     return false;
 
   // Update the Segment size in case the Cues size has changed.
-  const int64 pos = writer->Position();
-  const int64 segment_size = writer->Position() - payload_pos_;
+  const int64_t pos = writer->Position();
+  const int64_t segment_size = writer->Position() - payload_pos_;
   if (writer->Position(size_position_) ||
       WriteUIntSize(writer, segment_size, 8) || writer->Position(pos))
     return false;
@@ -2395,15 +2876,17 @@ bool Segment::Finalize() {
   if (WriteFramesAll() < 0)
     return false;
 
-  if (mode_ == kFile) {
-    if (cluster_list_size_ > 0) {
-      // Update last cluster's size
-      Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+  if (cluster_list_size_ > 0) {
+    // Update last cluster's size
+    Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
 
-      if (!old_cluster || !old_cluster->Finalize())
-        return false;
-    }
+    // For the last frame of the last Cluster, we don't write it as a BlockGroup
+    // with Duration unless the frame itself has duration set explicitly.
+    if (!old_cluster || !old_cluster->Finalize(false, 0))
+      return false;
+  }
 
+  if (mode_ == kFile) {
     if (chunking_ && chunk_writer_cluster_) {
       chunk_writer_cluster_->Close();
       chunk_count_++;
@@ -2417,7 +2900,7 @@ bool Segment::Finalize() {
       return false;
 
     if (output_cues_)
-      if (!seek_head_.AddSeekEntry(kMkvCues, MaxOffset()))
+      if (!seek_head_.AddSeekEntry(libwebm::kMkvCues, MaxOffset()))
         return false;
 
     if (chunking_) {
@@ -2448,11 +2931,11 @@ bool Segment::Finalize() {
       if (size_position_ == -1)
         return false;
 
-      const int64 segment_size = MaxOffset();
+      const int64_t segment_size = MaxOffset();
       if (segment_size < 1)
         return false;
 
-      const int64 pos = writer_header_->Position();
+      const int64_t pos = writer_header_->Position();
       UpdateDocTypeVersion();
       if (doc_type_version_ != doc_type_version_written_) {
         if (writer_header_->Position(0))
@@ -2490,7 +2973,7 @@ bool Segment::Finalize() {
   return true;
 }
 
-Track* Segment::AddTrack(int32 number) {
+Track* Segment::AddTrack(int32_t number) {
   Track* const track = new (std::nothrow) Track(&seed_);  // NOLINT
 
   if (!track)
@@ -2508,7 +2991,7 @@ Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); }
 
 Tag* Segment::AddTag() { return tags_.AddTag(); }
 
-uint64 Segment::AddVideoTrack(int32 width, int32 height, int32 number) {
+uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
   VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_);  // NOLINT
   if (!track)
     return 0;
@@ -2524,7 +3007,7 @@ uint64 Segment::AddVideoTrack(int32 width, int32 height, int32 number) {
   return track->number();
 }
 
-bool Segment::AddCuePoint(uint64 timestamp, uint64 track) {
+bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
   if (cluster_list_size_ < 1)
     return false;
 
@@ -2547,7 +3030,8 @@ bool Segment::AddCuePoint(uint64 timestamp, uint64 track) {
   return true;
 }
 
-uint64 Segment::AddAudioTrack(int32 sample_rate, int32 channels, int32 number) {
+uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
+                                int32_t number) {
   AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_);  // NOLINT
   if (!track)
     return 0;
@@ -2562,8 +3046,8 @@ uint64 Segment::AddAudioTrack(int32 sample_rate, int32 channels, int32 number) {
   return track->number();
 }
 
-bool Segment::AddFrame(const uint8* data, uint64 length, uint64 track_number,
-                       uint64 timestamp, bool is_key) {
+bool Segment::AddFrame(const uint8_t* data, uint64_t length,
+                       uint64_t track_number, uint64_t timestamp, bool is_key) {
   if (!data)
     return false;
 
@@ -2576,11 +3060,11 @@ bool Segment::AddFrame(const uint8* data, uint64 length, uint64 track_number,
   return AddGenericFrame(&frame);
 }
 
-bool Segment::AddFrameWithAdditional(const uint8* data, uint64 length,
-                                     const uint8* additional,
-                                     uint64 additional_length, uint64 add_id,
-                                     uint64 track_number, uint64 timestamp,
-                                     bool is_key) {
+bool Segment::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                                     const uint8_t* additional,
+                                     uint64_t additional_length,
+                                     uint64_t add_id, uint64_t track_number,
+                                     uint64_t timestamp, bool is_key) {
   if (!data || !additional)
     return false;
 
@@ -2595,10 +3079,10 @@ bool Segment::AddFrameWithAdditional(const uint8* data, uint64 length,
   return AddGenericFrame(&frame);
 }
 
-bool Segment::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
-                                         int64 discard_padding,
-                                         uint64 track_number, uint64 timestamp,
-                                         bool is_key) {
+bool Segment::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                         int64_t discard_padding,
+                                         uint64_t track_number,
+                                         uint64_t timestamp, bool is_key) {
   if (!data)
     return false;
 
@@ -2612,8 +3096,9 @@ bool Segment::AddFrameWithDiscardPadding(const uint8* data, uint64 length,
   return AddGenericFrame(&frame);
 }
 
-bool Segment::AddMetadata(const uint8* data, uint64 length, uint64 track_number,
-                          uint64 timestamp_ns, uint64 duration_ns) {
+bool Segment::AddMetadata(const uint8_t* data, uint64_t length,
+                          uint64_t track_number, uint64_t timestamp_ns,
+                          uint64_t duration_ns) {
   if (!data)
     return false;
 
@@ -2702,6 +3187,14 @@ bool Segment::AddGenericFrame(const Frame* frame) {
 
 void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; }
 
+void Segment::AccurateClusterDuration(bool accurate_cluster_duration) {
+  accurate_cluster_duration_ = accurate_cluster_duration;
+}
+
+void Segment::UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode) {
+  fixed_size_cluster_timecode_ = fixed_size_cluster_timecode;
+}
+
 bool Segment::SetChunking(bool chunking, const char* filename) {
   if (chunk_count_ > 0)
     return false;
@@ -2781,7 +3274,7 @@ bool Segment::SetChunking(bool chunking, const char* filename) {
   return true;
 }
 
-bool Segment::CuesTrack(uint64 track_number) {
+bool Segment::CuesTrack(uint64_t track_number) {
   const Track* const track = GetTrackByNumber(track_number);
   if (!track)
     return false;
@@ -2792,7 +3285,7 @@ bool Segment::CuesTrack(uint64 track_number) {
 
 void Segment::ForceNewClusterOnNextFrame() { force_new_cluster_ = true; }
 
-Track* Segment::GetTrackByNumber(uint64 track_number) const {
+Track* Segment::GetTrackByNumber(uint64_t track_number) const {
   return tracks_.GetTrackByNumber(track_number);
 }
 
@@ -2803,11 +3296,11 @@ bool Segment::WriteSegmentHeader() {
   if (!WriteEbmlHeader(writer_header_, doc_type_version_))
     return false;
   doc_type_version_written_ = doc_type_version_;
-  ebml_header_size_ = static_cast<int32>(writer_header_->Position());
+  ebml_header_size_ = static_cast<int32_t>(writer_header_->Position());
 
   // Write "unknown" (-1) as segment size value. If mode is kFile, Segment
   // will write over duration when the file is finalized.
-  if (WriteID(writer_header_, kMkvSegment))
+  if (WriteID(writer_header_, libwebm::kMkvSegment))
     return false;
 
   // Save for later.
@@ -2831,25 +3324,25 @@ bool Segment::WriteSegmentHeader() {
       return false;
   }
 
-  if (!seek_head_.AddSeekEntry(kMkvInfo, MaxOffset()))
+  if (!seek_head_.AddSeekEntry(libwebm::kMkvInfo, MaxOffset()))
     return false;
   if (!segment_info_.Write(writer_header_))
     return false;
 
-  if (!seek_head_.AddSeekEntry(kMkvTracks, MaxOffset()))
+  if (!seek_head_.AddSeekEntry(libwebm::kMkvTracks, MaxOffset()))
     return false;
   if (!tracks_.Write(writer_header_))
     return false;
 
   if (chapters_.Count() > 0) {
-    if (!seek_head_.AddSeekEntry(kMkvChapters, MaxOffset()))
+    if (!seek_head_.AddSeekEntry(libwebm::kMkvChapters, MaxOffset()))
       return false;
     if (!chapters_.Write(writer_header_))
       return false;
   }
 
   if (tags_.Count() > 0) {
-    if (!seek_head_.AddSeekEntry(kMkvTags, MaxOffset()))
+    if (!seek_head_.AddSeekEntry(libwebm::kMkvTags, MaxOffset()))
       return false;
     if (!tags_.Write(writer_header_))
       return false;
@@ -2870,7 +3363,7 @@ bool Segment::WriteSegmentHeader() {
 // Here we are testing whether to create a new cluster, given a frame
 // having time frame_timestamp_ns.
 //
-int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
+int Segment::TestFrame(uint64_t track_number, uint64_t frame_timestamp_ns,
                        bool is_key) const {
   if (force_new_cluster_)
     return 1;
@@ -2888,11 +3381,11 @@ int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
   // written to the existing cluster, or that a new cluster should be
   // created.
 
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 frame_timecode = frame_timestamp_ns / timecode_scale;
+  const uint64_t timecode_scale = segment_info_.timecode_scale();
+  const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale;
 
   const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
-  const uint64 last_cluster_timecode = last_cluster->timecode();
+  const uint64_t last_cluster_timecode = last_cluster->timecode();
 
   // For completeness we test for the case when the frame's timecode
   // is less than the cluster's timecode.  Although in principle that
@@ -2907,7 +3400,7 @@ int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
   // using a 16-bit signed integer), then we cannot write this frame
   // to that cluster, and so we must create a new cluster.
 
-  const int64 delta_timecode = frame_timecode - last_cluster_timecode;
+  const int64_t delta_timecode = frame_timecode - last_cluster_timecode;
 
   if (delta_timecode > kMaxBlockTimecode)
     return 2;
@@ -2923,7 +3416,7 @@ int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
   // already, where "too many" is defined as "the total time of frames
   // in the cluster exceeds a threshold".
 
-  const uint64 delta_ns = delta_timecode * timecode_scale;
+  const uint64_t delta_ns = delta_timecode * timecode_scale;
 
   if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_)
     return 1;
@@ -2932,7 +3425,7 @@ int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
   // cluster is created when the size of the current cluster exceeds a
   // threshold.
 
-  const uint64 cluster_size = last_cluster->payload_size();
+  const uint64_t cluster_size = last_cluster->payload_size();
 
   if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_)
     return 1;
@@ -2942,19 +3435,19 @@ int Segment::TestFrame(uint64 track_number, uint64 frame_timestamp_ns,
   return 0;
 }
 
-bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
-  const int32 new_size = cluster_list_size_ + 1;
+bool Segment::MakeNewCluster(uint64_t frame_timestamp_ns) {
+  const int32_t new_size = cluster_list_size_ + 1;
 
   if (new_size > cluster_list_capacity_) {
     // Add more clusters.
-    const int32 new_capacity =
+    const int32_t new_capacity =
         (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
     Cluster** const clusters =
         new (std::nothrow) Cluster*[new_capacity];  // NOLINT
     if (!clusters)
       return false;
 
-    for (int32 i = 0; i < cluster_list_size_; ++i) {
+    for (int32_t i = 0; i < cluster_list_size_; ++i) {
       clusters[i] = cluster_list_[i];
     }
 
@@ -2967,19 +3460,17 @@ bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
   if (!WriteFramesLessThan(frame_timestamp_ns))
     return false;
 
-  if (mode_ == kFile) {
-    if (cluster_list_size_ > 0) {
-      // Update old cluster's size
-      Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+  if (cluster_list_size_ > 0) {
+    // Update old cluster's size
+    Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
 
-      if (!old_cluster || !old_cluster->Finalize())
-        return false;
-    }
-
-    if (output_cues_)
-      new_cuepoint_ = true;
+    if (!old_cluster || !old_cluster->Finalize(true, frame_timestamp_ns))
+      return false;
   }
 
+  if (output_cues_)
+    new_cuepoint_ = true;
+
   if (chunking_ && cluster_list_size_ > 0) {
     chunk_writer_cluster_->Close();
     chunk_count_++;
@@ -2990,24 +3481,25 @@ bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
       return false;
   }
 
-  const uint64 timecode_scale = segment_info_.timecode_scale();
-  const uint64 frame_timecode = frame_timestamp_ns / timecode_scale;
+  const uint64_t timecode_scale = segment_info_.timecode_scale();
+  const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale;
 
-  uint64 cluster_timecode = frame_timecode;
+  uint64_t cluster_timecode = frame_timecode;
 
   if (frames_size_ > 0) {
     const Frame* const f = frames_[0];  // earliest queued frame
-    const uint64 ns = f->timestamp();
-    const uint64 tc = ns / timecode_scale;
+    const uint64_t ns = f->timestamp();
+    const uint64_t tc = ns / timecode_scale;
 
     if (tc < cluster_timecode)
       cluster_timecode = tc;
   }
 
   Cluster*& cluster = cluster_list_[cluster_list_size_];
-  const int64 offset = MaxOffset();
-  cluster = new (std::nothrow) Cluster(cluster_timecode,  // NOLINT
-                                       offset, segment_info_.timecode_scale());
+  const int64_t offset = MaxOffset();
+  cluster = new (std::nothrow)
+      Cluster(cluster_timecode, offset, segment_info_.timecode_scale(),
+              accurate_cluster_duration_, fixed_size_cluster_timecode_);
   if (!cluster)
     return false;
 
@@ -3018,8 +3510,8 @@ bool Segment::MakeNewCluster(uint64 frame_timestamp_ns) {
   return true;
 }
 
-bool Segment::DoNewClusterProcessing(uint64 track_number,
-                                     uint64 frame_timestamp_ns, bool is_key) {
+bool Segment::DoNewClusterProcessing(uint64_t track_number,
+                                     uint64_t frame_timestamp_ns, bool is_key) {
   for (;;) {
     // Based on the characteristics of the current frame and current
     // cluster, decide whether to create a new cluster.
@@ -3055,12 +3547,12 @@ bool Segment::CheckHeaderInfo() {
     if (!WriteSegmentHeader())
       return false;
 
-    if (!seek_head_.AddSeekEntry(kMkvCluster, MaxOffset()))
+    if (!seek_head_.AddSeekEntry(libwebm::kMkvCluster, MaxOffset()))
       return false;
 
     if (output_cues_ && cues_track_ == 0) {
       // Check for a video track
-      for (uint32 i = 0; i < tracks_.track_entries_size(); ++i) {
+      for (uint32_t i = 0; i < tracks_.track_entries_size(); ++i) {
         const Track* const track = tracks_.GetTrackByIndex(i);
         if (!track)
           return false;
@@ -3085,7 +3577,7 @@ bool Segment::CheckHeaderInfo() {
 }
 
 void Segment::UpdateDocTypeVersion() {
-  for (uint32 index = 0; index < tracks_.track_entries_size(); ++index) {
+  for (uint32_t index = 0; index < tracks_.track_entries_size(); ++index) {
     const Track* track = tracks_.GetTrackByIndex(index);
     if (track == NULL)
       break;
@@ -3127,14 +3619,14 @@ bool Segment::UpdateChunkName(const char* ext, char** name) const {
   return true;
 }
 
-int64 Segment::MaxOffset() {
+int64_t Segment::MaxOffset() {
   if (!writer_header_)
     return -1;
 
-  int64 offset = writer_header_->Position() - payload_pos_;
+  int64_t offset = writer_header_->Position() - payload_pos_;
 
   if (chunking_) {
-    for (int32 i = 0; i < cluster_list_size_; ++i) {
+    for (int32_t i = 0; i < cluster_list_size_; ++i) {
       Cluster* const cluster = cluster_list_[i];
       offset += cluster->Size();
     }
@@ -3147,11 +3639,11 @@ int64 Segment::MaxOffset() {
 }
 
 bool Segment::QueueFrame(Frame* frame) {
-  const int32 new_size = frames_size_ + 1;
+  const int32_t new_size = frames_size_ + 1;
 
   if (new_size > frames_capacity_) {
     // Add more frames.
-    const int32 new_capacity = (!frames_capacity_) ? 2 : frames_capacity_ * 2;
+    const int32_t new_capacity = (!frames_capacity_) ? 2 : frames_capacity_ * 2;
 
     if (new_capacity < 1)
       return false;
@@ -3160,7 +3652,7 @@ bool Segment::QueueFrame(Frame* frame) {
     if (!frames)
       return false;
 
-    for (int32 i = 0; i < frames_size_; ++i) {
+    for (int32_t i = 0; i < frames_size_; ++i) {
       frames[i] = frames_[i];
     }
 
@@ -3186,7 +3678,7 @@ int Segment::WriteFramesAll() {
   if (!cluster)
     return -1;
 
-  for (int32 i = 0; i < frames_size_; ++i) {
+  for (int32_t i = 0; i < frames_size_; ++i) {
     Frame*& frame = frames_[i];
     // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the
     // places where |doc_type_version_| needs to be updated.
@@ -3215,7 +3707,7 @@ int Segment::WriteFramesAll() {
   return result;
 }
 
-bool Segment::WriteFramesLessThan(uint64 timestamp) {
+bool Segment::WriteFramesLessThan(uint64_t timestamp) {
   // Check |cluster_list_size_| to see if this is the first cluster. If it is
   // the first cluster the audio frames that are less than the first video
   // timesatmp will be written in a later step.
@@ -3227,11 +3719,11 @@ bool Segment::WriteFramesLessThan(uint64 timestamp) {
     if (!cluster)
       return false;
 
-    int32 shift_left = 0;
+    int32_t shift_left = 0;
 
     // TODO(fgalligan): Change this to use the durations of frames instead of
     // the next frame's start time if the duration is accurate.
-    for (int32 i = 1; i < frames_size_; ++i) {
+    for (int32_t i = 1; i < frames_size_; ++i) {
       const Frame* const frame_curr = frames_[i];
 
       if (frame_curr->timestamp() > timestamp)
@@ -3262,8 +3754,8 @@ bool Segment::WriteFramesLessThan(uint64 timestamp) {
       if (shift_left >= frames_size_)
         return false;
 
-      const int32 new_frames_size = frames_size_ - shift_left;
-      for (int32 i = 0; i < new_frames_size; ++i) {
+      const int32_t new_frames_size = frames_size_ - shift_left;
+      for (int32_t i = 0; i < new_frames_size; ++i) {
         frames_[i] = frames_[i + shift_left];
       }
 
diff --git a/third_party/libwebm/mkvmuxer.hpp b/third_party/libwebm/mkvmuxer/mkvmuxer.h
similarity index 68%
rename from third_party/libwebm/mkvmuxer.hpp
rename to third_party/libwebm/mkvmuxer/mkvmuxer.h
index 03a002c..55ba071 100644
--- a/third_party/libwebm/mkvmuxer.hpp
+++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -6,24 +6,31 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#ifndef MKVMUXER_HPP
-#define MKVMUXER_HPP
+#ifndef MKVMUXER_MKVMUXER_H_
+#define MKVMUXER_MKVMUXER_H_
 
-#include "mkvmuxertypes.hpp"
+#include <stdint.h>
+
+#include <cstddef>
+#include <list>
+#include <map>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxertypes.h"
 
 // For a description of the WebM elements see
 // http://www.webmproject.org/code/specs/container/.
 
 namespace mkvparser {
 class IMkvReader;
-}  // end namespace
+}  // namespace mkvparser
 
 namespace mkvmuxer {
 
 class MkvWriter;
 class Segment;
 
-const uint64 kMaxTrackNumber = 126;
+const uint64_t kMaxTrackNumber = 126;
 
 ///////////////////////////////////////////////////////////////
 // Interface used by the mkvmuxer to write out the Mkv data.
@@ -59,15 +66,15 @@ class IMkvWriter {
 
 // Writes out the EBML header for a WebM file. This function must be called
 // before any other libwebm writing functions are called.
-bool WriteEbmlHeader(IMkvWriter* writer, uint64 doc_type_version);
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version);
 
 // Deprecated. Writes out EBML header with doc_type_version as
 // kDefaultDocTypeVersion. Exists for backward compatibility.
 bool WriteEbmlHeader(IMkvWriter* writer);
 
 // Copies in Chunk from source to destination between the given byte positions
-bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64 start,
-                 int64 size);
+bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64_t start,
+                 int64_t size);
 
 ///////////////////////////////////////////////////////////////
 // Class to hold data the will be written to a block.
@@ -81,10 +88,11 @@ class Frame {
   bool CopyFrom(const Frame& frame);
 
   // Copies |frame| data into |frame_|. Returns true on success.
-  bool Init(const uint8* frame, uint64 length);
+  bool Init(const uint8_t* frame, uint64_t length);
 
   // Copies |additional| data into |additional_|. Returns true on success.
-  bool AddAdditionalData(const uint8* additional, uint64 length, uint64 add_id);
+  bool AddAdditionalData(const uint8_t* additional, uint64_t length,
+                         uint64_t add_id);
 
   // Returns true if the frame has valid parameters.
   bool IsValid() const;
@@ -93,62 +101,70 @@ class Frame {
   // parameters.
   bool CanBeSimpleBlock() const;
 
-  uint64 add_id() const { return add_id_; }
-  const uint8* additional() const { return additional_; }
-  uint64 additional_length() const { return additional_length_; }
-  void set_duration(uint64 duration) { duration_ = duration; }
-  uint64 duration() const { return duration_; }
-  const uint8* frame() const { return frame_; }
+  uint64_t add_id() const { return add_id_; }
+  const uint8_t* additional() const { return additional_; }
+  uint64_t additional_length() const { return additional_length_; }
+  void set_duration(uint64_t duration);
+  uint64_t duration() const { return duration_; }
+  bool duration_set() const { return duration_set_; }
+  const uint8_t* frame() const { return frame_; }
   void set_is_key(bool key) { is_key_ = key; }
   bool is_key() const { return is_key_; }
-  uint64 length() const { return length_; }
-  void set_track_number(uint64 track_number) { track_number_ = track_number; }
-  uint64 track_number() const { return track_number_; }
-  void set_timestamp(uint64 timestamp) { timestamp_ = timestamp; }
-  uint64 timestamp() const { return timestamp_; }
-  void set_discard_padding(int64 discard_padding) {
+  uint64_t length() const { return length_; }
+  void set_track_number(uint64_t track_number) { track_number_ = track_number; }
+  uint64_t track_number() const { return track_number_; }
+  void set_timestamp(uint64_t timestamp) { timestamp_ = timestamp; }
+  uint64_t timestamp() const { return timestamp_; }
+  void set_discard_padding(int64_t discard_padding) {
     discard_padding_ = discard_padding;
   }
-  int64 discard_padding() const { return discard_padding_; }
-  void set_reference_block_timestamp(int64 reference_block_timestamp);
-  int64 reference_block_timestamp() const { return reference_block_timestamp_; }
+  int64_t discard_padding() const { return discard_padding_; }
+  void set_reference_block_timestamp(int64_t reference_block_timestamp);
+  int64_t reference_block_timestamp() const {
+    return reference_block_timestamp_;
+  }
   bool reference_block_timestamp_set() const {
     return reference_block_timestamp_set_;
   }
 
  private:
   // Id of the Additional data.
-  uint64 add_id_;
+  uint64_t add_id_;
 
   // Pointer to additional data. Owned by this class.
-  uint8* additional_;
+  uint8_t* additional_;
 
   // Length of the additional data.
-  uint64 additional_length_;
+  uint64_t additional_length_;
 
   // Duration of the frame in nanoseconds.
-  uint64 duration_;
+  uint64_t duration_;
+
+  // Flag indicating that |duration_| has been set. Setting duration causes the
+  // frame to be written out as a Block with BlockDuration instead of as a
+  // SimpleBlock.
+  bool duration_set_;
 
   // Pointer to the data. Owned by this class.
-  uint8* frame_;
+  uint8_t* frame_;
 
   // Flag telling if the data should set the key flag of a block.
   bool is_key_;
 
   // Length of the data.
-  uint64 length_;
+  uint64_t length_;
 
   // Mkv track number the data is associated with.
-  uint64 track_number_;
+  uint64_t track_number_;
 
   // Timestamp of the data in nanoseconds.
-  uint64 timestamp_;
+  uint64_t timestamp_;
 
   // Discard padding for the frame.
-  int64 discard_padding_;
+  int64_t discard_padding_;
 
   // Reference block timestamp.
-  int64 reference_block_timestamp_;
+  int64_t reference_block_timestamp_;
 
   // Flag indicating if |reference_block_timestamp_| has been set.
   bool reference_block_timestamp_set_;
@@ -164,19 +180,19 @@ class CuePoint {
   ~CuePoint();
 
   // Returns the size in bytes for the entire CuePoint element.
-  uint64 Size() const;
+  uint64_t Size() const;
 
   // Output the CuePoint element to the writer. Returns true on success.
   bool Write(IMkvWriter* writer) const;
 
-  void set_time(uint64 time) { time_ = time; }
-  uint64 time() const { return time_; }
-  void set_track(uint64 track) { track_ = track; }
-  uint64 track() const { return track_; }
-  void set_cluster_pos(uint64 cluster_pos) { cluster_pos_ = cluster_pos; }
-  uint64 cluster_pos() const { return cluster_pos_; }
-  void set_block_number(uint64 block_number) { block_number_ = block_number; }
-  uint64 block_number() const { return block_number_; }
+  void set_time(uint64_t time) { time_ = time; }
+  uint64_t time() const { return time_; }
+  void set_track(uint64_t track) { track_ = track; }
+  uint64_t track() const { return track_; }
+  void set_cluster_pos(uint64_t cluster_pos) { cluster_pos_ = cluster_pos; }
+  uint64_t cluster_pos() const { return cluster_pos_; }
+  void set_block_number(uint64_t block_number) { block_number_ = block_number; }
+  uint64_t block_number() const { return block_number_; }
   void set_output_block_number(bool output_block_number) {
     output_block_number_ = output_block_number;
   }
@@ -184,19 +200,19 @@ class CuePoint {
 
  private:
   // Returns the size in bytes for the payload of the CuePoint element.
-  uint64 PayloadSize() const;
+  uint64_t PayloadSize() const;
 
   // Absolute timecode according to the segment time base.
-  uint64 time_;
+  uint64_t time_;
 
   // The Track element associated with the CuePoint.
-  uint64 track_;
+  uint64_t track_;
 
   // The position of the Cluster containing the Block.
-  uint64 cluster_pos_;
+  uint64_t cluster_pos_;
 
   // Number of the Block within the Cluster, starting from 1.
-  uint64 block_number_;
+  uint64_t block_number_;
 
   // If true the muxer will write out the block number for the cue if the
   // block number is different than the default of 1. Default is set to true.
@@ -217,15 +233,15 @@ class Cues {
 
   // Returns the cue point by index. Returns NULL if there is no cue point
   // match.
-  CuePoint* GetCueByIndex(int32 index) const;
+  CuePoint* GetCueByIndex(int32_t index) const;
 
   // Returns the total size of the Cues element
-  uint64 Size();
+  uint64_t Size();
 
   // Output the Cues element to the writer. Returns true on success.
   bool Write(IMkvWriter* writer) const;
 
-  int32 cue_entries_size() const { return cue_entries_size_; }
+  int32_t cue_entries_size() const { return cue_entries_size_; }
   void set_output_block_number(bool output_block_number) {
     output_block_number_ = output_block_number;
   }
@@ -233,10 +249,10 @@ class Cues {
 
  private:
   // Number of allocated elements in |cue_entries_|.
-  int32 cue_entries_capacity_;
+  int32_t cue_entries_capacity_;
 
   // Number of CuePoints in |cue_entries_|.
-  int32 cue_entries_size_;
+  int32_t cue_entries_size_;
 
   // CuePoint list.
   CuePoint** cue_entries_;
@@ -258,21 +274,21 @@ class ContentEncAESSettings {
   ~ContentEncAESSettings() {}
 
   // Returns the size in bytes for the ContentEncAESSettings element.
-  uint64 Size() const;
+  uint64_t Size() const;
 
   // Writes out the ContentEncAESSettings element to |writer|. Returns true on
   // success.
   bool Write(IMkvWriter* writer) const;
 
-  uint64 cipher_mode() const { return cipher_mode_; }
+  uint64_t cipher_mode() const { return cipher_mode_; }
 
  private:
   // Returns the size in bytes for the payload of the ContentEncAESSettings
   // element.
-  uint64 PayloadSize() const;
+  uint64_t PayloadSize() const;
 
   // Sub elements
-  uint64 cipher_mode_;
+  uint64_t cipher_mode_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings);
 };
@@ -291,45 +307,158 @@ class ContentEncoding {
 
   // Sets the content encryption id. Copies |length| bytes from |id| to
   // |enc_key_id_|. Returns true on success.
-  bool SetEncryptionID(const uint8* id, uint64 length);
+  bool SetEncryptionID(const uint8_t* id, uint64_t length);
 
   // Returns the size in bytes for the ContentEncoding element.
-  uint64 Size() const;
+  uint64_t Size() const;
 
   // Writes out the ContentEncoding element to |writer|. Returns true on
   // success.
   bool Write(IMkvWriter* writer) const;
 
-  uint64 enc_algo() const { return enc_algo_; }
-  uint64 encoding_order() const { return encoding_order_; }
-  uint64 encoding_scope() const { return encoding_scope_; }
-  uint64 encoding_type() const { return encoding_type_; }
+  uint64_t enc_algo() const { return enc_algo_; }
+  uint64_t encoding_order() const { return encoding_order_; }
+  uint64_t encoding_scope() const { return encoding_scope_; }
+  uint64_t encoding_type() const { return encoding_type_; }
   ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; }
 
  private:
   // Returns the size in bytes for the encoding elements.
-  uint64 EncodingSize(uint64 compresion_size, uint64 encryption_size) const;
+  uint64_t EncodingSize(uint64_t compresion_size,
+                        uint64_t encryption_size) const;
 
   // Returns the size in bytes for the encryption elements.
-  uint64 EncryptionSize() const;
+  uint64_t EncryptionSize() const;
 
   // Track element names
-  uint64 enc_algo_;
-  uint8* enc_key_id_;
-  uint64 encoding_order_;
-  uint64 encoding_scope_;
-  uint64 encoding_type_;
+  uint64_t enc_algo_;
+  uint8_t* enc_key_id_;
+  uint64_t encoding_order_;
+  uint64_t encoding_scope_;
+  uint64_t encoding_type_;
 
   // ContentEncAESSettings element.
   ContentEncAESSettings enc_aes_settings_;
 
   // Size of the ContentEncKeyID data in bytes.
-  uint64 enc_key_id_length_;
+  uint64_t enc_key_id_length_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
 };
 
 ///////////////////////////////////////////////////////////////
+// Colour element.
+struct PrimaryChromaticity {
+  PrimaryChromaticity(float x_val, float y_val) : x(x_val), y(y_val) {}
+  PrimaryChromaticity() : x(0), y(0) {}
+  ~PrimaryChromaticity() {}
+  uint64_t PrimaryChromaticityPayloadSize(libwebm::MkvId x_id,
+                                          libwebm::MkvId y_id) const;
+  bool Write(IMkvWriter* writer, libwebm::MkvId x_id,
+             libwebm::MkvId y_id) const;
+
+  float x;
+  float y;
+};
+
+class MasteringMetadata {
+ public:
+  static const float kValueNotPresent;
+
+  MasteringMetadata()
+      : luminance_max(kValueNotPresent),
+        luminance_min(kValueNotPresent),
+        r_(NULL),
+        g_(NULL),
+        b_(NULL),
+        white_point_(NULL) {}
+  ~MasteringMetadata() {
+    delete r_;
+    delete g_;
+    delete b_;
+    delete white_point_;
+  }
+
+  // Returns total size of the MasteringMetadata element.
+  uint64_t MasteringMetadataSize() const;
+  bool Write(IMkvWriter* writer) const;
+
+  // Copies non-null chromaticity.
+  bool SetChromaticity(const PrimaryChromaticity* r,
+                       const PrimaryChromaticity* g,
+                       const PrimaryChromaticity* b,
+                       const PrimaryChromaticity* white_point);
+  const PrimaryChromaticity* r() const { return r_; }
+  const PrimaryChromaticity* g() const { return g_; }
+  const PrimaryChromaticity* b() const { return b_; }
+  const PrimaryChromaticity* white_point() const { return white_point_; }
+
+  float luminance_max;
+  float luminance_min;
+
+ private:
+  // Returns size of MasteringMetadata child elements.
+  uint64_t PayloadSize() const;
+
+  PrimaryChromaticity* r_;
+  PrimaryChromaticity* g_;
+  PrimaryChromaticity* b_;
+  PrimaryChromaticity* white_point_;
+};
+
+class Colour {
+ public:
+  static const uint64_t kValueNotPresent;
+  Colour()
+      : matrix_coefficients(kValueNotPresent),
+        bits_per_channel(kValueNotPresent),
+        chroma_subsampling_horz(kValueNotPresent),
+        chroma_subsampling_vert(kValueNotPresent),
+        cb_subsampling_horz(kValueNotPresent),
+        cb_subsampling_vert(kValueNotPresent),
+        chroma_siting_horz(kValueNotPresent),
+        chroma_siting_vert(kValueNotPresent),
+        range(kValueNotPresent),
+        transfer_characteristics(kValueNotPresent),
+        primaries(kValueNotPresent),
+        max_cll(kValueNotPresent),
+        max_fall(kValueNotPresent),
+        mastering_metadata_(NULL) {}
+  ~Colour() { delete mastering_metadata_; }
+
+  // Returns total size of the Colour element.
+  uint64_t ColourSize() const;
+  bool Write(IMkvWriter* writer) const;
+
+  // Deep copies |mastering_metadata|.
+  bool SetMasteringMetadata(const MasteringMetadata& mastering_metadata);
+
+  const MasteringMetadata* mastering_metadata() const {
+    return mastering_metadata_;
+  }
+
+  uint64_t matrix_coefficients;
+  uint64_t bits_per_channel;
+  uint64_t chroma_subsampling_horz;
+  uint64_t chroma_subsampling_vert;
+  uint64_t cb_subsampling_horz;
+  uint64_t cb_subsampling_vert;
+  uint64_t chroma_siting_horz;
+  uint64_t chroma_siting_vert;
+  uint64_t range;
+  uint64_t transfer_characteristics;
+  uint64_t primaries;
+  uint64_t max_cll;
+  uint64_t max_fall;
+
+ private:
+  // Returns size of Colour child elements.
+  uint64_t PayloadSize() const;
+
+  MasteringMetadata* mastering_metadata_;
+};
+
+///////////////////////////////////////////////////////////////
 // Track element.
 class Track {
  public:
@@ -342,76 +471,76 @@ class Track {
 
   // Returns the ContentEncoding by index. Returns NULL if there is no
   // ContentEncoding match.
-  ContentEncoding* GetContentEncodingByIndex(uint32 index) const;
+  ContentEncoding* GetContentEncodingByIndex(uint32_t index) const;
 
   // Returns the size in bytes for the payload of the Track element.
-  virtual uint64 PayloadSize() const;
+  virtual uint64_t PayloadSize() const;
 
   // Returns the size in bytes of the Track element.
-  virtual uint64 Size() const;
+  virtual uint64_t Size() const;
 
   // Output the Track element to the writer. Returns true on success.
   virtual bool Write(IMkvWriter* writer) const;
 
   // Sets the CodecPrivate element of the Track element. Copies |length|
   // bytes from |codec_private| to |codec_private_|. Returns true on success.
-  bool SetCodecPrivate(const uint8* codec_private, uint64 length);
+  bool SetCodecPrivate(const uint8_t* codec_private, uint64_t length);
 
   void set_codec_id(const char* codec_id);
   const char* codec_id() const { return codec_id_; }
-  const uint8* codec_private() const { return codec_private_; }
+  const uint8_t* codec_private() const { return codec_private_; }
   void set_language(const char* language);
   const char* language() const { return language_; }
-  void set_max_block_additional_id(uint64 max_block_additional_id) {
+  void set_max_block_additional_id(uint64_t max_block_additional_id) {
     max_block_additional_id_ = max_block_additional_id;
   }
-  uint64 max_block_additional_id() const { return max_block_additional_id_; }
+  uint64_t max_block_additional_id() const { return max_block_additional_id_; }
   void set_name(const char* name);
   const char* name() const { return name_; }
-  void set_number(uint64 number) { number_ = number; }
-  uint64 number() const { return number_; }
-  void set_type(uint64 type) { type_ = type; }
-  uint64 type() const { return type_; }
-  void set_uid(uint64 uid) { uid_ = uid; }
-  uint64 uid() const { return uid_; }
-  void set_codec_delay(uint64 codec_delay) { codec_delay_ = codec_delay; }
-  uint64 codec_delay() const { return codec_delay_; }
-  void set_seek_pre_roll(uint64 seek_pre_roll) {
+  void set_number(uint64_t number) { number_ = number; }
+  uint64_t number() const { return number_; }
+  void set_type(uint64_t type) { type_ = type; }
+  uint64_t type() const { return type_; }
+  void set_uid(uint64_t uid) { uid_ = uid; }
+  uint64_t uid() const { return uid_; }
+  void set_codec_delay(uint64_t codec_delay) { codec_delay_ = codec_delay; }
+  uint64_t codec_delay() const { return codec_delay_; }
+  void set_seek_pre_roll(uint64_t seek_pre_roll) {
     seek_pre_roll_ = seek_pre_roll;
   }
-  uint64 seek_pre_roll() const { return seek_pre_roll_; }
-  void set_default_duration(uint64 default_duration) {
+  uint64_t seek_pre_roll() const { return seek_pre_roll_; }
+  void set_default_duration(uint64_t default_duration) {
     default_duration_ = default_duration;
   }
-  uint64 default_duration() const { return default_duration_; }
+  uint64_t default_duration() const { return default_duration_; }
 
-  uint64 codec_private_length() const { return codec_private_length_; }
-  uint32 content_encoding_entries_size() const {
+  uint64_t codec_private_length() const { return codec_private_length_; }
+  uint32_t content_encoding_entries_size() const {
     return content_encoding_entries_size_;
   }
 
  private:
   // Track element names.
   char* codec_id_;
-  uint8* codec_private_;
+  uint8_t* codec_private_;
   char* language_;
-  uint64 max_block_additional_id_;
+  uint64_t max_block_additional_id_;
   char* name_;
-  uint64 number_;
-  uint64 type_;
-  uint64 uid_;
-  uint64 codec_delay_;
-  uint64 seek_pre_roll_;
-  uint64 default_duration_;
+  uint64_t number_;
+  uint64_t type_;
+  uint64_t uid_;
+  uint64_t codec_delay_;
+  uint64_t seek_pre_roll_;
+  uint64_t default_duration_;
 
   // Size of the CodecPrivate data in bytes.
-  uint64 codec_private_length_;
+  uint64_t codec_private_length_;
 
   // ContentEncoding element list.
   ContentEncoding** content_encoding_entries_;
 
   // Number of ContentEncoding elements added.
-  uint32 content_encoding_entries_size_;
+  uint32_t content_encoding_entries_size_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track);
 };
@@ -437,56 +566,63 @@ class VideoTrack : public Track {
 
   // Returns the size in bytes for the payload of the Track element plus the
   // video specific elements.
-  virtual uint64 PayloadSize() const;
+  virtual uint64_t PayloadSize() const;
 
   // Output the VideoTrack element to the writer. Returns true on success.
   virtual bool Write(IMkvWriter* writer) const;
 
   // Sets the video's stereo mode. Returns true on success.
-  bool SetStereoMode(uint64 stereo_mode);
+  bool SetStereoMode(uint64_t stereo_mode);
 
   // Sets the video's alpha mode. Returns true on success.
-  bool SetAlphaMode(uint64 alpha_mode);
-
-  void set_display_height(uint64 height) { display_height_ = height; }
-  uint64 display_height() const { return display_height_; }
-  void set_display_width(uint64 width) { display_width_ = width; }
-  uint64 display_width() const { return display_width_; }
-
-  void set_crop_left(uint64 crop_left) { crop_left_ = crop_left; }
-  uint64 crop_left() const { return crop_left_; }
-  void set_crop_right(uint64 crop_right) { crop_right_ = crop_right; }
-  uint64 crop_right() const { return crop_right_; }
-  void set_crop_top(uint64 crop_top) { crop_top_ = crop_top; }
-  uint64 crop_top() const { return crop_top_; }
-  void set_crop_bottom(uint64 crop_bottom) { crop_bottom_ = crop_bottom; }
-  uint64 crop_bottom() const { return crop_bottom_; }
+  bool SetAlphaMode(uint64_t alpha_mode);
+
+  void set_display_height(uint64_t height) { display_height_ = height; }
+  uint64_t display_height() const { return display_height_; }
+  void set_display_width(uint64_t width) { display_width_ = width; }
+  uint64_t display_width() const { return display_width_; }
+
+  void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; }
+  uint64_t crop_left() const { return crop_left_; }
+  void set_crop_right(uint64_t crop_right) { crop_right_ = crop_right; }
+  uint64_t crop_right() const { return crop_right_; }
+  void set_crop_top(uint64_t crop_top) { crop_top_ = crop_top; }
+  uint64_t crop_top() const { return crop_top_; }
+  void set_crop_bottom(uint64_t crop_bottom) { crop_bottom_ = crop_bottom; }
+  uint64_t crop_bottom() const { return crop_bottom_; }
 
   void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; }
   double frame_rate() const { return frame_rate_; }
-  void set_height(uint64 height) { height_ = height; }
-  uint64 height() const { return height_; }
-  uint64 stereo_mode() { return stereo_mode_; }
-  uint64 alpha_mode() { return alpha_mode_; }
-  void set_width(uint64 width) { width_ = width; }
-  uint64 width() const { return width_; }
+  void set_height(uint64_t height) { height_ = height; }
+  uint64_t height() const { return height_; }
+  uint64_t stereo_mode() { return stereo_mode_; }
+  uint64_t alpha_mode() { return alpha_mode_; }
+  void set_width(uint64_t width) { width_ = width; }
+  uint64_t width() const { return width_; }
+
+  Colour* colour() { return colour_; }
+
+  // Deep copies |colour|.
+  bool SetColour(const Colour& colour);
 
  private:
   // Returns the size in bytes of the Video element.
-  uint64 VideoPayloadSize() const;
+  uint64_t VideoPayloadSize() const;
 
   // Video track element names.
-  uint64 display_height_;
-  uint64 display_width_;
-  uint64 crop_left_;
-  uint64 crop_right_;
-  uint64 crop_top_;
-  uint64 crop_bottom_;
+  uint64_t display_height_;
+  uint64_t display_width_;
+  uint64_t crop_left_;
+  uint64_t crop_right_;
+  uint64_t crop_top_;
+  uint64_t crop_bottom_;
   double frame_rate_;
-  uint64 height_;
-  uint64 stereo_mode_;
-  uint64 alpha_mode_;
-  uint64 width_;
+  uint64_t height_;
+  uint64_t stereo_mode_;
+  uint64_t alpha_mode_;
+  uint64_t width_;
+
+  Colour* colour_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack);
 };
@@ -501,22 +637,22 @@ class AudioTrack : public Track {
 
   // Returns the size in bytes for the payload of the Track element plus the
   // audio specific elements.
-  virtual uint64 PayloadSize() const;
+  virtual uint64_t PayloadSize() const;
 
   // Output the AudioTrack element to the writer. Returns true on success.
   virtual bool Write(IMkvWriter* writer) const;
 
-  void set_bit_depth(uint64 bit_depth) { bit_depth_ = bit_depth; }
-  uint64 bit_depth() const { return bit_depth_; }
-  void set_channels(uint64 channels) { channels_ = channels; }
-  uint64 channels() const { return channels_; }
+  void set_bit_depth(uint64_t bit_depth) { bit_depth_ = bit_depth; }
+  uint64_t bit_depth() const { return bit_depth_; }
+  void set_channels(uint64_t channels) { channels_ = channels; }
+  uint64_t channels() const { return channels_; }
   void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; }
   double sample_rate() const { return sample_rate_; }
 
  private:
   // Audio track element names.
-  uint64 bit_depth_;
-  uint64 channels_;
+  uint64_t bit_depth_;
+  uint64_t channels_;
   double sample_rate_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack);
@@ -542,32 +678,35 @@ class Tracks {
   // deleted by the Tracks object. Returns true on success. |number| is the
   // number to use for the track. |number| must be >= 0. If |number| == 0
   // then the muxer will decide on the track number.
-  bool AddTrack(Track* track, int32 number);
+  bool AddTrack(Track* track, int32_t number);
 
   // Returns the track by index. Returns NULL if there is no track match.
-  const Track* GetTrackByIndex(uint32 idx) const;
+  const Track* GetTrackByIndex(uint32_t idx) const;
 
   // Search the Tracks and return the track that matches |tn|. Returns NULL
   // if there is no track match.
-  Track* GetTrackByNumber(uint64 track_number) const;
+  Track* GetTrackByNumber(uint64_t track_number) const;
 
   // Returns true if the track number is an audio track.
-  bool TrackIsAudio(uint64 track_number) const;
+  bool TrackIsAudio(uint64_t track_number) const;
 
   // Returns true if the track number is a video track.
-  bool TrackIsVideo(uint64 track_number) const;
+  bool TrackIsVideo(uint64_t track_number) const;
 
   // Output the Tracks element to the writer. Returns true on success.
   bool Write(IMkvWriter* writer) const;
 
-  uint32 track_entries_size() const { return track_entries_size_; }
+  uint32_t track_entries_size() const { return track_entries_size_; }
 
  private:
   // Track element list.
   Track** track_entries_;
 
   // Number of Track elements added.
-  uint32 track_entries_size_;
+  uint32_t track_entries_size_;
+
+  // Whether or not Tracks element has already been written via IMkvWriter.
+  mutable bool wrote_tracks_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks);
 };
@@ -585,12 +724,12 @@ class Chapter {
 
   // Converts the nanosecond start and stop times of this chapter to
   // their corresponding timecode values, and stores them that way.
-  void set_time(const Segment& segment, uint64 start_time_ns,
-                uint64 end_time_ns);
+  void set_time(const Segment& segment, uint64_t start_time_ns,
+                uint64_t end_time_ns);
 
   // Sets the uid for this chapter. Primarily used to enable
   // deterministic output from the muxer.
-  void set_uid(const uint64 uid) { uid_ = uid; }
+  void set_uid(const uint64_t uid) { uid_ = uid; }
 
   // Add a title string to this chapter, per the semantics described
   // here:
@@ -637,7 +776,7 @@ class Chapter {
     // If |writer| is non-NULL, serialize the Display sub-element of
     // the Atom into the stream.  Returns the Display element size on
     // success, 0 if error.
-    uint64 WriteDisplay(IMkvWriter* writer) const;
+    uint64_t WriteDisplay(IMkvWriter* writer) const;
 
    private:
     char* title_;
@@ -670,20 +809,20 @@ class Chapter {
   // If |writer| is non-NULL, serialize the Atom sub-element into the
   // stream.  Returns the total size of the element on success, 0 if
   // error.
-  uint64 WriteAtom(IMkvWriter* writer) const;
+  uint64_t WriteAtom(IMkvWriter* writer) const;
 
   // The string identifier for this chapter (corresponds to WebVTT cue
   // identifier).
   char* id_;
 
   // Start timecode of the chapter.
-  uint64 start_timecode_;
+  uint64_t start_timecode_;
 
   // Stop timecode of the chapter.
-  uint64 end_timecode_;
+  uint64_t end_timecode_;
 
   // The binary identifier for this chapter.
-  uint64 uid_;
+  uint64_t uid_;
 
   // The Atom element can contain multiple Display sub-elements, as
   // the same logical title can be rendered in different languages.
@@ -723,7 +862,7 @@ class Chapters {
   // If |writer| is non-NULL, serialize the Edition sub-element of the
   // Chapters element into the stream.  Returns the Edition element
   // size on success, 0 if error.
-  uint64 WriteEdition(IMkvWriter* writer) const;
+  uint64_t WriteEdition(IMkvWriter* writer) const;
 
   // Total length of the chapters_ array.
   int chapters_size_;
@@ -768,7 +907,7 @@ class Tag {
     // If |writer| is non-NULL, serialize the SimpleTag sub-element of
     // the Atom into the stream.  Returns the SimpleTag element size on
     // success, 0 if error.
-    uint64 Write(IMkvWriter* writer) const;
+    uint64_t Write(IMkvWriter* writer) const;
 
    private:
     char* tag_name_;
@@ -795,7 +934,7 @@ class Tag {
   // If |writer| is non-NULL, serialize the Tag sub-element into the
   // stream.  Returns the total size of the element on success, 0 if
   // error.
-  uint64 Write(IMkvWriter* writer) const;
+  uint64_t Write(IMkvWriter* writer) const;
 
   // The Atom element can contain multiple SimpleTag sub-elements
   SimpleTag* simple_tags_;
@@ -853,7 +992,9 @@ class Cluster {
   // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
   // position for the cluster within the segment that should be written in
   // the cues element. |timecode_scale| is the timecode scale of the segment.
-  Cluster(uint64 timecode, int64 cues_pos, uint64 timecode_scale);
+  Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+          bool write_last_frame_with_duration = false,
+          bool fixed_size_timecode = false);
   ~Cluster();
 
   bool Init(IMkvWriter* ptr_writer);
@@ -872,8 +1013,8 @@ class Cluster {
   //   timecode:     Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
-                uint64 timecode,  // timecode units (absolute)
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timecode,  // timecode units (absolute)
                 bool is_key);
 
   // Adds a frame to be output in the file. The frame is written out through
@@ -889,10 +1030,11 @@ class Cluster {
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* data, uint64 length,
-                              const uint8* additional, uint64 additional_length,
-                              uint64 add_id, uint64 track_number,
-                              uint64 abs_timecode, bool is_key);
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t abs_timecode,
+                              bool is_key);
 
   // Adds a frame to be output in the file. The frame is written out through
   // |writer_| if successful. Returns true on success.
@@ -905,9 +1047,10 @@ class Cluster {
   //   abs_timecode: Absolute (not relative to cluster) timestamp of the
   //                 frame, expressed in timecode units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
-                                  int64 discard_padding, uint64 track_number,
-                                  uint64 abs_timecode, bool is_key);
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t abs_timecode,
+                                  bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
@@ -923,31 +1066,53 @@ class Cluster {
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
-                   uint64 timecode, uint64 duration);
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timecode, uint64_t duration);
 
   // Increments the size of the cluster's data in bytes.
-  void AddPayloadSize(uint64 size);
+  void AddPayloadSize(uint64_t size);
 
   // Closes the cluster so no more data can be written to it. Will update the
-  // cluster's size if |writer_| is seekable. Returns true on success.
+  // cluster's size if |writer_| is seekable. Returns true on success. This
+  // variant of Finalize() fails when |write_last_frame_with_duration_| is set
+  // to true.
   bool Finalize();
 
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success.
+  // Inputs:
+  //   set_last_frame_duration: Boolean indicating whether or not the duration
+  //                            of the last frame should be set. If set to
+  //                            false, the |duration| value is ignored and
+  //                            |write_last_frame_with_duration_| will not be
+  //                            honored.
+  //   duration: Duration of the Cluster in timecode scale.
+  bool Finalize(bool set_last_frame_duration, uint64_t duration);
+
   // Returns the size in bytes for the entire Cluster element.
-  uint64 Size() const;
+  uint64_t Size() const;
 
   // Given |abs_timecode|, calculates timecode relative to most recent timecode.
   // Returns -1 on failure, or a relative timecode.
-  int64 GetRelativeTimecode(int64 abs_timecode) const;
-
-  int64 size_position() const { return size_position_; }
-  int32 blocks_added() const { return blocks_added_; }
-  uint64 payload_size() const { return payload_size_; }
-  int64 position_for_cues() const { return position_for_cues_; }
-  uint64 timecode() const { return timecode_; }
-  uint64 timecode_scale() const { return timecode_scale_; }
+  int64_t GetRelativeTimecode(int64_t abs_timecode) const;
+
+  int64_t size_position() const { return size_position_; }
+  int32_t blocks_added() const { return blocks_added_; }
+  uint64_t payload_size() const { return payload_size_; }
+  int64_t position_for_cues() const { return position_for_cues_; }
+  uint64_t timecode() const { return timecode_; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
+  void set_write_last_frame_with_duration(bool write_last_frame_with_duration) {
+    write_last_frame_with_duration_ = write_last_frame_with_duration;
+  }
+  bool write_last_frame_with_duration() const {
+    return write_last_frame_with_duration_;
+  }
 
  private:
+  // Iterator type for the |stored_frames_| map.
+  typedef std::map<uint64_t, std::list<Frame*> >::iterator FrameMapIterator;
+
   // Utility method that confirms that blocks can still be added, and that the
   // cluster header has been written. Used by |DoWriteFrame*|. Returns true
   // when successful.
@@ -955,37 +1120,58 @@ class Cluster {
 
   // Utility method used by the |DoWriteFrame*| methods that handles the book
   // keeping required after each block is written.
-  void PostWriteBlock(uint64 element_size);
+  void PostWriteBlock(uint64_t element_size);
 
   // Does some verification and calls WriteFrame.
   bool DoWriteFrame(const Frame* const frame);
 
+  // Either holds back the given frame, or writes it out depending on whether or
+  // not |write_last_frame_with_duration_| is set.
+  bool QueueOrWriteFrame(const Frame* const frame);
+
   // Outputs the Cluster header to |writer_|. Returns true on success.
   bool WriteClusterHeader();
 
   // Number of blocks added to the cluster.
-  int32 blocks_added_;
+  int32_t blocks_added_;
 
   // Flag telling if the cluster has been closed.
   bool finalized_;
 
+  // Flag indicating whether the cluster's timecode will always be written out
+  // using 8 bytes.
+  bool fixed_size_timecode_;
+
   // Flag telling if the cluster's header has been written.
   bool header_written_;
 
   // The size of the cluster elements in bytes.
-  uint64 payload_size_;
+  uint64_t payload_size_;
 
   // The file position used for cue points.
-  const int64 position_for_cues_;
+  const int64_t position_for_cues_;
 
   // The file position of the cluster's size element.
-  int64 size_position_;
+  int64_t size_position_;
 
   // The absolute timecode of the cluster.
-  const uint64 timecode_;
+  const uint64_t timecode_;
 
   // The timecode scale of the Segment containing the cluster.
-  const uint64 timecode_scale_;
+  const uint64_t timecode_scale_;
+
+  // Flag indicating whether the last frame of the cluster should be written as
+  // a Block with Duration. If set to true, then it will result in holding back
+  // of frames and the parameterized version of Finalize() must be called to
+  // finish writing the Cluster.
+  bool write_last_frame_with_duration_;
+
+  // Map used to hold back frames, if required. Track number is the key.
+  std::map<uint64_t, std::list<Frame*> > stored_frames_;
+
+  // Map from track number to the timestamp of the last block written for that
+  // track.
+  std::map<uint64_t, uint64_t> last_block_timestamp_;
 
   // Pointer to the writer object. Not owned by this class.
   IMkvWriter* writer_;
@@ -1006,42 +1192,42 @@ class SeekHead {
   // Adds a seek entry to be written out when the element is finalized. |id|
   // must be the coded mkv element id. |pos| is the file position of the
   // element. Returns true on success.
-  bool AddSeekEntry(uint32 id, uint64 pos);
+  bool AddSeekEntry(uint32_t id, uint64_t pos);
 
   // Writes out SeekHead and SeekEntry elements. Returns true on success.
   bool Finalize(IMkvWriter* writer) const;
 
   // Returns the id of the Seek Entry at the given index. Returns -1 if index is
   // out of range.
-  uint32 GetId(int index) const;
+  uint32_t GetId(int index) const;
 
   // Returns the position of the Seek Entry at the given index. Returns -1 if
   // index is out of range.
-  uint64 GetPosition(int index) const;
+  uint64_t GetPosition(int index) const;
 
   // Sets the Seek Entry id and position at given index.
   // Returns true on success.
-  bool SetSeekEntry(int index, uint32 id, uint64 position);
+  bool SetSeekEntry(int index, uint32_t id, uint64_t position);
 
   // Reserves space by writing out a Void element which will be updated with
   // a SeekHead element later. Returns true on success.
   bool Write(IMkvWriter* writer);
 
   // We are going to put a cap on the number of Seek Entries.
-  const static int32 kSeekEntryCount = 5;
+  const static int32_t kSeekEntryCount = 5;
 
  private:
   // Returns the maximum size in bytes of one seek entry.
-  uint64 MaxEntrySize() const;
+  uint64_t MaxEntrySize() const;
 
   // Seek entry id element list.
-  uint32 seek_entry_id_[kSeekEntryCount];
+  uint32_t seek_entry_id_[kSeekEntryCount];
 
   // Seek entry pos element list.
-  uint64 seek_entry_pos_[kSeekEntryCount];
+  uint64_t seek_entry_pos_[kSeekEntryCount];
 
   // The file position of SeekHead element.
-  int64 start_pos_;
+  int64_t start_pos_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
 };
@@ -1067,12 +1253,12 @@ class SegmentInfo {
   double duration() const { return duration_; }
   void set_muxing_app(const char* app);
   const char* muxing_app() const { return muxing_app_; }
-  void set_timecode_scale(uint64 scale) { timecode_scale_ = scale; }
-  uint64 timecode_scale() const { return timecode_scale_; }
+  void set_timecode_scale(uint64_t scale) { timecode_scale_ = scale; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
   void set_writing_app(const char* app);
   const char* writing_app() const { return writing_app_; }
-  void set_date_utc(int64 date_utc) { date_utc_ = date_utc; }
-  int64 date_utc() const { return date_utc_; }
+  void set_date_utc(int64_t date_utc) { date_utc_ = date_utc; }
+  int64_t date_utc() const { return date_utc_; }
 
  private:
   // Segment Information element names.
@@ -1081,14 +1267,14 @@ class SegmentInfo {
   double duration_;
   // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
   char* muxing_app_;
-  uint64 timecode_scale_;
+  uint64_t timecode_scale_;
   // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
   char* writing_app_;
   // LLONG_MIN when DateUTC is not set.
-  int64 date_utc_;
+  int64_t date_utc_;
 
   // The file position of the duration element.
-  int64 duration_pos_;
+  int64_t duration_pos_;
 
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
 };
@@ -1108,8 +1294,8 @@ class Segment {
     kBeforeClusters = 0x1  // Position Cues before Clusters
   };
 
-  const static uint32 kDefaultDocTypeVersion = 2;
-  const static uint64 kDefaultMaxClusterDuration = 30000000000ULL;
+  const static uint32_t kDefaultDocTypeVersion = 2;
+  const static uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
 
   Segment();
   ~Segment();
@@ -1123,13 +1309,13 @@ class Segment {
   // error. |number| is the number to use for the track.  |number|
   // must be >= 0. If |number| == 0 then the muxer will decide on the
   // track number.
-  Track* AddTrack(int32 number);
+  Track* AddTrack(int32_t number);
 
   // Adds a Vorbis audio track to the segment. Returns the number of the track
   // on success, 0 on error. |number| is the number to use for the audio track.
   // |number| must be >= 0. If |number| == 0 then the muxer will decide on
   // the track number.
-  uint64 AddAudioTrack(int32 sample_rate, int32 channels, int32 number);
+  uint64_t AddAudioTrack(int32_t sample_rate, int32_t channels, int32_t number);
 
   // Adds an empty chapter to the chapters of this segment.  Returns
   // non-NULL on success.  After adding the chapter, the caller should
@@ -1145,7 +1331,7 @@ class Segment {
   // nanoseconds of the cue's time. |track| is the Track of the Cue. This
   // function must be called after AddFrame to calculate the correct
   // BlockNumber for the CuePoint. Returns true on success.
-  bool AddCuePoint(uint64 timestamp, uint64 track);
+  bool AddCuePoint(uint64_t timestamp, uint64_t track);
 
   // Adds a frame to be output in the file. Returns true on success.
   // Inputs:
@@ -1155,8 +1341,8 @@ class Segment {
   //                 functions.
   //   timestamp:    Timestamp of the frame in nanoseconds from 0.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrame(const uint8* data, uint64 length, uint64 track_number,
-                uint64 timestamp_ns, bool is_key);
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timestamp_ns, bool is_key);
 
   // Writes a frame of metadata to the output medium; returns true on
   // success.
@@ -1172,8 +1358,8 @@ class Segment {
   // The metadata frame is written as a block group, with a duration
   // sub-element but no reference time sub-elements (indicating that
   // it is considered a keyframe, per Matroska semantics).
-  bool AddMetadata(const uint8* data, uint64 length, uint64 track_number,
-                   uint64 timestamp_ns, uint64 duration_ns);
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timestamp_ns, uint64_t duration_ns);
 
   // Writes a frame with additional data to the output medium; returns true on
   // success.
@@ -1188,10 +1374,11 @@ class Segment {
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithAdditional(const uint8* data, uint64 length,
-                              const uint8* additional, uint64 additional_length,
-                              uint64 add_id, uint64 track_number,
-                              uint64 timestamp, bool is_key);
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t timestamp,
+                              bool is_key);
 
   // Writes a frame with DiscardPadding to the output medium; returns true on
   // success.
@@ -1204,9 +1391,10 @@ class Segment {
   //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
   //                 units.
   //   is_key:       Flag telling whether or not this frame is a key frame.
-  bool AddFrameWithDiscardPadding(const uint8* data, uint64 length,
-                                  int64 discard_padding, uint64 track_number,
-                                  uint64 timestamp, bool is_key);
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t timestamp,
+                                  bool is_key);
 
   // Writes a Frame to the output medium. Chooses the correct way of writing
   // the frame (Block vs SimpleBlock) based on the parameters passed.
@@ -1218,7 +1406,7 @@ class Segment {
   // success, 0 on error. |number| is the number to use for the video track.
   // |number| must be >= 0. If |number| == 0 then the muxer will decide on
   // the track number.
-  uint64 AddVideoTrack(int32 width, int32 height, int32 number);
+  uint64_t AddVideoTrack(int32_t width, int32_t height, int32_t number);
 
   // This function must be called after Finalize() if you need a copy of the
   // output with Cues written before the Clusters. It will return false if the
@@ -1237,7 +1425,7 @@ class Segment {
   // Sets which track to use for the Cues element. Must have added the track
   // before calling this function. Returns true on success. |track_number| is
   // returned by the Add track functions.
-  bool CuesTrack(uint64 track_number);
+  bool CuesTrack(uint64_t track_number);
 
   // This will force the muxer to create a new Cluster when the next frame is
   // added.
@@ -1257,11 +1445,17 @@ class Segment {
 
   // Search the Tracks and return the track that matches |track_number|.
   // Returns NULL if there is no track match.
-  Track* GetTrackByNumber(uint64 track_number) const;
+  Track* GetTrackByNumber(uint64_t track_number) const;
 
   // Toggles whether to output a cues element.
   void OutputCues(bool output_cues);
 
+  // Toggles whether to write the last frame in each Cluster with Duration.
+  void AccurateClusterDuration(bool accurate_cluster_duration);
+
+  // Toggles whether to write the Cluster Timecode using exactly 8 bytes.
+  void UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode);
+
   // Sets if the muxer will output files in chunks or not. |chunking| is a
   // flag telling whether or not to turn on chunking. |filename| is the base
   // filename for the chunk files. The header chunk file will be named
@@ -1274,15 +1468,15 @@ class Segment {
   bool SetChunking(bool chunking, const char* filename);
 
   bool chunking() const { return chunking_; }
-  uint64 cues_track() const { return cues_track_; }
-  void set_max_cluster_duration(uint64 max_cluster_duration) {
+  uint64_t cues_track() const { return cues_track_; }
+  void set_max_cluster_duration(uint64_t max_cluster_duration) {
     max_cluster_duration_ = max_cluster_duration;
   }
-  uint64 max_cluster_duration() const { return max_cluster_duration_; }
-  void set_max_cluster_size(uint64 max_cluster_size) {
+  uint64_t max_cluster_duration() const { return max_cluster_duration_; }
+  void set_max_cluster_size(uint64_t max_cluster_size) {
     max_cluster_size_ = max_cluster_size;
   }
-  uint64 max_cluster_size() const { return max_cluster_size_; }
+  uint64_t max_cluster_size() const { return max_cluster_size_; }
   void set_mode(Mode mode) { mode_ = mode; }
   Mode mode() const { return mode_; }
   CuesPosition cues_position() const { return cues_position_; }
@@ -1306,7 +1500,7 @@ class Segment {
   // Returns the maximum offset within the segment's payload. When chunking
   // this function is needed to determine offsets of elements within the
   // chunked files. Returns -1 on error.
-  int64 MaxOffset();
+  int64_t MaxOffset();
 
   // Adds the frame to our frame array.
   bool QueueFrame(Frame* frame);
@@ -1318,7 +1512,7 @@ class Segment {
   // Output all frames that are queued that have an end time that is less
   // then |timestamp|. Returns true on success and if there are no frames
   // queued.
-  bool WriteFramesLessThan(uint64 timestamp);
+  bool WriteFramesLessThan(uint64_t timestamp);
 
   // Outputs the segment header, Segment Information element, SeekHead element,
   // and Tracks element to |writer_|.
@@ -1332,16 +1526,17 @@ class Segment {
   //  0 = do not create a new cluster, and write frame to the existing cluster
   //  1 = create a new cluster, and write frame to that new cluster
   //  2 = create a new cluster, and re-run test
-  int TestFrame(uint64 track_num, uint64 timestamp_ns, bool key) const;
+  int TestFrame(uint64_t track_num, uint64_t timestamp_ns, bool key) const;
 
   // Create a new cluster, using the earlier of the first enqueued
   // frame, or the indicated time. Returns true on success.
-  bool MakeNewCluster(uint64 timestamp_ns);
+  bool MakeNewCluster(uint64_t timestamp_ns);
 
   // Checks whether a new cluster needs to be created, and if so
   // creates a new cluster. Returns false if creation of a new cluster
   // was necessary but creation was not successful.
-  bool DoNewClusterProcessing(uint64 track_num, uint64 timestamp_ns, bool key);
+  bool DoNewClusterProcessing(uint64_t track_num, uint64_t timestamp_ns,
+                              bool key);
 
   // Adjusts Cue Point values (to place Cues before Clusters) so that they
   // reflect the correct offsets.
@@ -1355,7 +1550,8 @@ class Segment {
   //        accounted for.
   // index - index in the list of Cues which is currently being adjusted.
   // cue_size - sum of size of all the CuePoint elements.
-  void MoveCuesBeforeClustersHelper(uint64 diff, int index, uint64* cue_size);
+  void MoveCuesBeforeClustersHelper(uint64_t diff, int index,
+                                    uint64_t* cue_size);
 
   // Seeds the random number generator used to make UIDs.
   unsigned int seed_;
@@ -1394,22 +1590,22 @@ class Segment {
   char* chunking_base_name_;
 
   // File position offset where the Clusters end.
-  int64 cluster_end_offset_;
+  int64_t cluster_end_offset_;
 
   // List of clusters.
   Cluster** cluster_list_;
 
   // Number of cluster pointers allocated in the cluster list.
-  int32 cluster_list_capacity_;
+  int32_t cluster_list_capacity_;
 
   // Number of clusters in the cluster list.
-  int32 cluster_list_size_;
+  int32_t cluster_list_size_;
 
   // Indicates whether Cues should be written before or after Clusters
   CuesPosition cues_position_;
 
   // Track number that is associated with the cues element for this segment.
-  uint64 cues_track_;
+  uint64_t cues_track_;
 
   // Tells the muxer to force a new cluster on the next Block.
   bool force_new_cluster_;
@@ -1421,10 +1617,10 @@ class Segment {
   Frame** frames_;
 
   // Number of frame pointers allocated in the frame list.
-  int32 frames_capacity_;
+  int32_t frames_capacity_;
 
   // Number of frames in the frame list.
-  int32 frames_size_;
+  int32_t frames_size_;
 
   // Flag telling if a video track has been added to the segment.
   bool has_video_;
@@ -1433,23 +1629,23 @@ class Segment {
   bool header_written_;
 
   // Duration of the last block in nanoseconds.
-  uint64 last_block_duration_;
+  uint64_t last_block_duration_;
 
   // Last timestamp in nanoseconds added to a cluster.
-  uint64 last_timestamp_;
+  uint64_t last_timestamp_;
 
   // Last timestamp in nanoseconds by track number added to a cluster.
-  uint64 last_track_timestamp_[kMaxTrackNumber];
+  uint64_t last_track_timestamp_[kMaxTrackNumber];
 
   // Maximum time in nanoseconds for a cluster duration. This variable is a
   // guideline and some clusters may have a longer duration. Default is 30
   // seconds.
-  uint64 max_cluster_duration_;
+  uint64_t max_cluster_duration_;
 
   // Maximum size in bytes for a cluster. This variable is a guideline and
   // some clusters may have a larger size. Default is 0 which signifies that
   // the muxer will decide the size.
-  uint64 max_cluster_size_;
+  uint64_t max_cluster_size_;
 
   // The mode that segment is in. If set to |kLive| the writer must not
   // seek backwards.
@@ -1462,22 +1658,29 @@ class Segment {
   // Flag whether or not the muxer should output a Cues element.
   bool output_cues_;
 
+  // Flag whether or not the last frame in each Cluster will have a Duration
+  // element in it.
+  bool accurate_cluster_duration_;
+
+  // Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
+  bool fixed_size_cluster_timecode_;
+
   // The size of the EBML header, used to validate the header if
   // WriteEbmlHeader() is called more than once.
-  int32 ebml_header_size_;
+  int32_t ebml_header_size_;
 
   // The file position of the segment's payload.
-  int64 payload_pos_;
+  int64_t payload_pos_;
 
   // The file position of the element's size.
-  int64 size_position_;
+  int64_t size_position_;
 
   // Current DocTypeVersion (|doc_type_version_|) and that written in
   // WriteSegmentHeader().
   // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
   // differs from |doc_type_version_written_|.
-  uint32 doc_type_version_;
-  uint32 doc_type_version_written_;
+  uint32_t doc_type_version_;
+  uint32_t doc_type_version_written_;
 
   // Pointer to the writer objects. Not owned by this class.
   IMkvWriter* writer_cluster_;
@@ -1487,6 +1690,6 @@ class Segment {
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
 };
 
-}  // end namespace mkvmuxer
+}  // namespace mkvmuxer
 
-#endif  // MKVMUXER_HPP
+#endif  // MKVMUXER_MKVMUXER_H_
diff --git a/third_party/libwebm/mkvmuxertypes.hpp b/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
similarity index 87%
rename from third_party/libwebm/mkvmuxertypes.hpp
rename to third_party/libwebm/mkvmuxer/mkvmuxertypes.h
index d0fc9fe..e5db121 100644
--- a/third_party/libwebm/mkvmuxertypes.hpp
+++ b/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
@@ -6,25 +6,23 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#ifndef MKVMUXERTYPES_HPP
-#define MKVMUXERTYPES_HPP
-
-// Copied from Chromium basictypes.h
-// A macro to disallow the copy constructor and operator= functions
-// This should be used in the private: declarations for a class
-#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
-  TypeName(const TypeName&);                       \
-  void operator=(const TypeName&)
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
 
 namespace mkvmuxer {
-
 typedef unsigned char uint8;
 typedef short int16;
 typedef int int32;
 typedef unsigned int uint32;
 typedef long long int64;
 typedef unsigned long long uint64;
+}  // namespace mkvmuxer
 
-}  // end namespace mkvmuxer
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                       \
+  void operator=(const TypeName&)
 
-#endif  // MKVMUXERTYPES_HPP
+#endif  // MKVMUXER_MKVMUXERTYPES_HPP_
diff --git a/third_party/libwebm/mkvmuxerutil.cpp b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
similarity index 50%
rename from third_party/libwebm/mkvmuxerutil.cpp
rename to third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 27ab15d..3562b8a 100644
--- a/third_party/libwebm/mkvmuxerutil.cpp
+++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -6,7 +6,7 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#include "mkvmuxerutil.hpp"
+#include "mkvmuxer/mkvmuxerutil.h"
 
 #ifdef __ANDROID__
 #include <fcntl.h>
@@ -20,13 +20,9 @@
 #include <ctime>
 #include <new>
 
-#include "mkvwriter.hpp"
-#include "webmids.hpp"
-
-#ifdef _MSC_VER
-// Disable MSVC warnings that suggest making code non-portable.
-#pragma warning(disable : 4996)
-#endif
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
 
 namespace mkvmuxer {
 
@@ -35,64 +31,68 @@ namespace {
 // Date elements are always 8 octets in size.
 const int kDateElementSize = 8;
 
-uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
-                  uint64 timecode_scale) {
-  uint64 block_additional_elem_size = 0;
-  uint64 block_addid_elem_size = 0;
-  uint64 block_more_payload_size = 0;
-  uint64 block_more_elem_size = 0;
-  uint64 block_additions_payload_size = 0;
-  uint64 block_additions_elem_size = 0;
+uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame,
+                    int64_t timecode, uint64_t timecode_scale) {
+  uint64_t block_additional_elem_size = 0;
+  uint64_t block_addid_elem_size = 0;
+  uint64_t block_more_payload_size = 0;
+  uint64_t block_more_elem_size = 0;
+  uint64_t block_additions_payload_size = 0;
+  uint64_t block_additions_elem_size = 0;
   if (frame->additional()) {
-    block_additional_elem_size = EbmlElementSize(
-        kMkvBlockAdditional, frame->additional(), frame->additional_length());
-    block_addid_elem_size = EbmlElementSize(kMkvBlockAddID, frame->add_id());
+    block_additional_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
+                        frame->additional_length());
+    block_addid_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id());
 
     block_more_payload_size =
         block_addid_elem_size + block_additional_elem_size;
     block_more_elem_size =
-        EbmlMasterElementSize(kMkvBlockMore, block_more_payload_size) +
+        EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
         block_more_payload_size;
     block_additions_payload_size = block_more_elem_size;
     block_additions_elem_size =
-        EbmlMasterElementSize(kMkvBlockAdditions,
+        EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
                               block_additions_payload_size) +
         block_additions_payload_size;
   }
 
-  uint64 discard_padding_elem_size = 0;
+  uint64_t discard_padding_elem_size = 0;
   if (frame->discard_padding() != 0) {
     discard_padding_elem_size =
-        EbmlElementSize(kMkvDiscardPadding, frame->discard_padding());
+        EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding());
   }
 
-  const uint64 reference_block_timestamp =
+  const uint64_t reference_block_timestamp =
       frame->reference_block_timestamp() / timecode_scale;
-  uint64 reference_block_elem_size = 0;
+  uint64_t reference_block_elem_size = 0;
   if (!frame->is_key()) {
     reference_block_elem_size =
-        EbmlElementSize(kMkvReferenceBlock, reference_block_timestamp);
+        EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
   }
 
-  const uint64 duration = frame->duration() / timecode_scale;
-  uint64 block_duration_elem_size = 0;
+  const uint64_t duration = frame->duration() / timecode_scale;
+  uint64_t block_duration_elem_size = 0;
   if (duration > 0)
-    block_duration_elem_size = EbmlElementSize(kMkvBlockDuration, duration);
+    block_duration_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockDuration, duration);
 
-  const uint64 block_payload_size = 4 + frame->length();
-  const uint64 block_elem_size =
-      EbmlMasterElementSize(kMkvBlock, block_payload_size) + block_payload_size;
+  const uint64_t block_payload_size = 4 + frame->length();
+  const uint64_t block_elem_size =
+      EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
+      block_payload_size;
 
-  const uint64 block_group_payload_size =
+  const uint64_t block_group_payload_size =
       block_elem_size + block_additions_elem_size + block_duration_elem_size +
       discard_padding_elem_size + reference_block_elem_size;
 
-  if (!WriteEbmlMasterElement(writer, kMkvBlockGroup,
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
                               block_group_payload_size)) {
     return 0;
   }
 
-  if (!WriteEbmlMasterElement(writer, kMkvBlock, block_payload_size))
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
     return 0;
 
   if (WriteUInt(writer, frame->track_number()))
@@ -105,77 +105,81 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
   if (SerializeInt(writer, 0, 1))
     return 0;
 
-  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+  if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
     return 0;
 
   if (frame->additional()) {
-    if (!WriteEbmlMasterElement(writer, kMkvBlockAdditions,
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
                                 block_additions_payload_size)) {
       return 0;
     }
 
-    if (!WriteEbmlMasterElement(writer, kMkvBlockMore, block_more_payload_size))
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
+                                block_more_payload_size))
       return 0;
 
-    if (!WriteEbmlElement(writer, kMkvBlockAddID, frame->add_id()))
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id()))
       return 0;
 
-    if (!WriteEbmlElement(writer, kMkvBlockAdditional, frame->additional(),
-                          frame->additional_length())) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
+                          frame->additional(), frame->additional_length())) {
       return 0;
     }
   }
 
   if (frame->discard_padding() != 0 &&
-      !WriteEbmlElement(writer, kMkvDiscardPadding, frame->discard_padding())) {
+      !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
+                        frame->discard_padding())) {
     return false;
   }
 
   if (!frame->is_key() &&
-      !WriteEbmlElement(writer, kMkvReferenceBlock,
+      !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
                         reference_block_timestamp)) {
     return false;
   }
 
-  if (duration > 0 && !WriteEbmlElement(writer, kMkvBlockDuration, duration)) {
+  if (duration > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
     return false;
   }
-  return EbmlMasterElementSize(kMkvBlockGroup, block_group_payload_size) +
+  return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
+                               block_group_payload_size) +
          block_group_payload_size;
 }
 
-uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
-                        int64 timecode) {
-  if (WriteID(writer, kMkvSimpleBlock))
+uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                          int64_t timecode) {
+  if (WriteID(writer, libwebm::kMkvSimpleBlock))
     return 0;
 
-  const int32 size = static_cast<int32>(frame->length()) + 4;
+  const int32_t size = static_cast<int32_t>(frame->length()) + 4;
   if (WriteUInt(writer, size))
     return 0;
 
-  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
+  if (WriteUInt(writer, static_cast<uint64_t>(frame->track_number())))
     return 0;
 
   if (SerializeInt(writer, timecode, 2))
     return 0;
 
-  uint64 flags = 0;
+  uint64_t flags = 0;
   if (frame->is_key())
     flags |= 0x80;
 
   if (SerializeInt(writer, flags, 1))
     return 0;
 
-  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+  if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length())))
     return 0;
 
-  return GetUIntSize(kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
-         frame->length();
+  return static_cast<uint64_t>(GetUIntSize(libwebm::kMkvSimpleBlock) +
+                               GetCodedUIntSize(size) + 4 + frame->length());
 }
 
 }  // namespace
 
-int32 GetCodedUIntSize(uint64 value) {
+int32_t GetCodedUIntSize(uint64_t value) {
   if (value < 0x000000000000007FULL)
     return 1;
   else if (value < 0x0000000000003FFFULL)
@@ -193,7 +197,7 @@ int32 GetCodedUIntSize(uint64 value) {
   return 8;
 }
 
-int32 GetUIntSize(uint64 value) {
+int32_t GetUIntSize(uint64_t value) {
   if (value < 0x0000000000000100ULL)
     return 1;
   else if (value < 0x0000000000010000ULL)
@@ -211,26 +215,26 @@ int32 GetUIntSize(uint64 value) {
   return 8;
 }
 
-int32 GetIntSize(int64 value) {
+int32_t GetIntSize(int64_t value) {
   // Doubling the requested value ensures positive values with their high bit
   // set are written with 0-padding to avoid flipping the signedness.
-  const uint64 v = (value < 0) ? value ^ -1LL : value;
+  const uint64_t v = (value < 0) ? value ^ -1LL : value;
   return GetUIntSize(2 * v);
 }
 
-uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) {
   // Size of EBML ID
-  int32 ebml_size = GetUIntSize(type);
+  int32_t ebml_size = GetUIntSize(type);
 
   // Datasize
   ebml_size += GetCodedUIntSize(value);
 
-  return ebml_size;
+  return static_cast<uint64_t>(ebml_size);
 }
 
-uint64 EbmlElementSize(uint64 type, int64 value) {
+uint64_t EbmlElementSize(uint64_t type, int64_t value) {
   // Size of EBML ID
-  int32 ebml_size = GetUIntSize(type);
+  int32_t ebml_size = GetUIntSize(type);
 
   // Datasize
   ebml_size += GetIntSize(value);
@@ -238,15 +242,20 @@ uint64 EbmlElementSize(uint64 type, int64 value) {
   // Size of Datasize
   ebml_size++;
 
-  return ebml_size;
+  return static_cast<uint64_t>(ebml_size);
+}
+
+uint64_t EbmlElementSize(uint64_t type, uint64_t value) {
+  return EbmlElementSize(type, value, 0);
 }
 
-uint64 EbmlElementSize(uint64 type, uint64 value) {
+uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) {
   // Size of EBML ID
-  int32 ebml_size = GetUIntSize(type);
+  uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
 
   // Datasize
-  ebml_size += GetUIntSize(value);
+  ebml_size +=
+      (fixed_size > 0) ? fixed_size : static_cast<uint64_t>(GetUIntSize(value));
 
   // Size of Datasize
   ebml_size++;
@@ -254,9 +263,9 @@ uint64 EbmlElementSize(uint64 type, uint64 value) {
   return ebml_size;
 }
 
-uint64 EbmlElementSize(uint64 type, float /* value */) {
+uint64_t EbmlElementSize(uint64_t type, float /* value */) {
   // Size of EBML ID
-  uint64 ebml_size = GetUIntSize(type);
+  uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
 
   // Datasize
   ebml_size += sizeof(float);
@@ -267,12 +276,12 @@ uint64 EbmlElementSize(uint64 type, float /* value */) {
   return ebml_size;
 }
 
-uint64 EbmlElementSize(uint64 type, const char* value) {
+uint64_t EbmlElementSize(uint64_t type, const char* value) {
   if (!value)
     return 0;
 
   // Size of EBML ID
-  uint64 ebml_size = GetUIntSize(type);
+  uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
 
   // Datasize
   ebml_size += strlen(value);
@@ -283,12 +292,12 @@ uint64 EbmlElementSize(uint64 type, const char* value) {
   return ebml_size;
 }
 
-uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) {
   if (!value)
     return 0;
 
   // Size of EBML ID
-  uint64 ebml_size = GetUIntSize(type);
+  uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
 
   // Datasize
   ebml_size += size;
@@ -299,9 +308,9 @@ uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
   return ebml_size;
 }
 
-uint64 EbmlDateElementSize(uint64 type) {
+uint64_t EbmlDateElementSize(uint64_t type) {
   // Size of EBML ID
-  uint64 ebml_size = GetUIntSize(type);
+  uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type));
 
   // Datasize
   ebml_size += kDateElementSize;
@@ -312,18 +321,18 @@ uint64 EbmlDateElementSize(uint64 type) {
   return ebml_size;
 }
 
-int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) {
   if (!writer || size < 1 || size > 8)
     return -1;
 
-  for (int32 i = 1; i <= size; ++i) {
-    const int32 byte_count = size - i;
-    const int32 bit_count = byte_count * 8;
+  for (int32_t i = 1; i <= size; ++i) {
+    const int32_t byte_count = size - i;
+    const int32_t bit_count = byte_count * 8;
 
-    const int64 bb = value >> bit_count;
-    const uint8 b = static_cast<uint8>(bb);
+    const int64_t bb = value >> bit_count;
+    const uint8_t b = static_cast<uint8_t>(bb);
 
-    const int32 status = writer->Write(&b, 1);
+    const int32_t status = writer->Write(&b, 1);
 
     if (status < 0)
       return status;
@@ -332,26 +341,26 @@ int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
   return 0;
 }
 
-int32 SerializeFloat(IMkvWriter* writer, float f) {
+int32_t SerializeFloat(IMkvWriter* writer, float f) {
   if (!writer)
     return -1;
 
-  assert(sizeof(uint32) == sizeof(float));
+  assert(sizeof(uint32_t) == sizeof(float));
   // This union is merely used to avoid a reinterpret_cast from float& to
   // uint32& which will result in violation of strict aliasing.
   union U32 {
-    uint32 u32;
+    uint32_t u32;
     float f;
   } value;
   value.f = f;
 
-  for (int32 i = 1; i <= 4; ++i) {
-    const int32 byte_count = 4 - i;
-    const int32 bit_count = byte_count * 8;
+  for (int32_t i = 1; i <= 4; ++i) {
+    const int32_t byte_count = 4 - i;
+    const int32_t bit_count = byte_count * 8;
 
-    const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
+    const uint8_t byte = static_cast<uint8_t>(value.u32 >> bit_count);
 
-    const int32 status = writer->Write(&byte, 1);
+    const int32_t status = writer->Write(&byte, 1);
 
     if (status < 0)
       return status;
@@ -360,21 +369,21 @@ int32 SerializeFloat(IMkvWriter* writer, float f) {
   return 0;
 }
 
-int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+int32_t WriteUInt(IMkvWriter* writer, uint64_t value) {
   if (!writer)
     return -1;
 
-  int32 size = GetCodedUIntSize(value);
+  int32_t size = GetCodedUIntSize(value);
 
   return WriteUIntSize(writer, value, size);
 }
 
-int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) {
   if (!writer || size < 0 || size > 8)
     return -1;
 
   if (size > 0) {
-    const uint64 bit = 1LL << (size * 7);
+    const uint64_t bit = 1LL << (size * 7);
 
     if (value > (bit - 2))
       return -1;
@@ -382,11 +391,11 @@ int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
     value |= bit;
   } else {
     size = 1;
-    int64 bit;
+    int64_t bit;
 
     for (;;) {
       bit = 1LL << (size * 7);
-      const uint64 max = bit - 2;
+      const uint64_t max = bit - 2;
 
       if (value <= max)
         break;
@@ -403,18 +412,18 @@ int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
   return SerializeInt(writer, value, size);
 }
 
-int32 WriteID(IMkvWriter* writer, uint64 type) {
+int32_t WriteID(IMkvWriter* writer, uint64_t type) {
   if (!writer)
     return -1;
 
   writer->ElementStartNotify(type, writer->Position());
 
-  const int32 size = GetUIntSize(type);
+  const int32_t size = GetUIntSize(type);
 
   return SerializeInt(writer, type, size);
 }
 
-bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) {
   if (!writer)
     return false;
 
@@ -427,41 +436,51 @@ bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) {
+  return WriteEbmlElement(writer, type, value, 0);
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
+                      uint64_t fixed_size) {
   if (!writer)
     return false;
 
   if (WriteID(writer, type))
     return false;
 
-  const uint64 size = GetUIntSize(value);
+  uint64_t size = static_cast<uint64_t>(GetUIntSize(value));
+  if (fixed_size > 0) {
+    if (size > fixed_size)
+      return false;
+    size = fixed_size;
+  }
   if (WriteUInt(writer, size))
     return false;
 
-  if (SerializeInt(writer, value, static_cast<int32>(size)))
+  if (SerializeInt(writer, value, static_cast<int32_t>(size)))
     return false;
 
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) {
   if (!writer)
     return false;
 
   if (WriteID(writer, type))
     return 0;
 
-  const uint64 size = GetIntSize(value);
+  const uint64_t size = GetIntSize(value);
   if (WriteUInt(writer, size))
     return false;
 
-  if (SerializeInt(writer, value, static_cast<int32>(size)))
+  if (SerializeInt(writer, value, static_cast<int32_t>(size)))
     return false;
 
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) {
   if (!writer)
     return false;
 
@@ -477,25 +496,25 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) {
   if (!writer || !value)
     return false;
 
   if (WriteID(writer, type))
     return false;
 
-  const uint64 length = strlen(value);
+  const uint64_t length = strlen(value);
   if (WriteUInt(writer, length))
     return false;
 
-  if (writer->Write(value, static_cast<const uint32>(length)))
+  if (writer->Write(value, static_cast<const uint32_t>(length)))
     return false;
 
   return true;
 }
 
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
-                      uint64 size) {
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
+                      uint64_t size) {
   if (!writer || !value || size < 1)
     return false;
 
@@ -505,13 +524,13 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
   if (WriteUInt(writer, size))
     return false;
 
-  if (writer->Write(value, static_cast<uint32>(size)))
+  if (writer->Write(value, static_cast<uint32_t>(size)))
     return false;
 
   return true;
 }
 
-bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) {
   if (!writer)
     return false;
 
@@ -527,8 +546,8 @@ bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
   return true;
 }
 
-uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
-                  Cluster* cluster) {
+uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                    Cluster* cluster) {
   if (!writer || !frame || !frame->IsValid() || !cluster ||
       !cluster->timecode_scale())
     return 0;
@@ -537,7 +556,7 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
   //  timecode for the cluster itself (remember that block timecode
   //  is a signed, 16-bit integer).  However, as a simplification we
   //  only permit non-negative cluster-relative timecodes for blocks.
-  const int64 relative_timecode = cluster->GetRelativeTimecode(
+  const int64_t relative_timecode = cluster->GetRelativeTimecode(
       frame->timestamp() / cluster->timecode_scale());
   if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
     return 0;
@@ -548,53 +567,53 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
                         cluster->timecode_scale());
 }
 
-uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) {
   if (!writer)
     return false;
 
   // Subtract one for the void ID and the coded size.
-  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
-  uint64 void_size =
-      EbmlMasterElementSize(kMkvVoid, void_entry_size) + void_entry_size;
+  uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  uint64_t void_size =
+      EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+      void_entry_size;
 
   if (void_size != size)
     return 0;
 
-  const int64 payload_position = writer->Position();
+  const int64_t payload_position = writer->Position();
   if (payload_position < 0)
     return 0;
 
-  if (WriteID(writer, kMkvVoid))
+  if (WriteID(writer, libwebm::kMkvVoid))
     return 0;
 
   if (WriteUInt(writer, void_entry_size))
     return 0;
 
-  const uint8 value = 0;
-  for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
+  const uint8_t value = 0;
+  for (int32_t i = 0; i < static_cast<int32_t>(void_entry_size); ++i) {
     if (writer->Write(&value, 1))
       return 0;
   }
 
-  const int64 stop_position = writer->Position();
+  const int64_t stop_position = writer->Position();
   if (stop_position < 0 ||
-      stop_position - payload_position != static_cast<int64>(void_size))
+      stop_position - payload_position != static_cast<int64_t>(void_size))
     return 0;
 
   return void_size;
 }
 
-void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
+                int32_t* revision) {
   *major = 0;
   *minor = 2;
   *build = 1;
   *revision = 0;
 }
 
-}  // namespace mkvmuxer
-
-mkvmuxer::uint64 mkvmuxer::MakeUID(unsigned int* seed) {
-  uint64 uid = 0;
+uint64_t MakeUID(unsigned int* seed) {
+  uint64_t uid = 0;
 
 #ifdef __MINGW32__
   srand(*seed);
@@ -606,24 +625,26 @@ mkvmuxer::uint64 mkvmuxer::MakeUID(unsigned int* seed) {
 // TODO(fgalligan): Move random number generation to platform specific code.
 #ifdef _MSC_VER
     (void)seed;
-    const int32 nn = rand();
+    const int32_t nn = rand();
 #elif __ANDROID__
-    int32 temp_num = 1;
+    int32_t temp_num = 1;
     int fd = open("/dev/urandom", O_RDONLY);
     if (fd != -1) {
-      read(fd, &temp_num, sizeof(int32));
+      read(fd, &temp_num, sizeof(temp_num));
       close(fd);
     }
-    const int32 nn = temp_num;
+    const int32_t nn = temp_num;
 #elif defined __MINGW32__
-    const int32 nn = rand();
+    const int32_t nn = rand();
 #else
-    const int32 nn = rand_r(seed);
+    const int32_t nn = rand_r(seed);
 #endif
-    const int32 n = 0xFF & (nn >> 4);  // throw away low-order bits
+    const int32_t n = 0xFF & (nn >> 4);  // throw away low-order bits
 
     uid |= n;
   }
 
   return uid;
 }
+
+}  // namespace mkvmuxer
diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
new file mode 100644
index 0000000..0e21a2d
--- /dev/null
+++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS.  All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVMUXER_MKVMUXERUTIL_H_
+#define MKVMUXER_MKVMUXERUTIL_H_
+
+#include <stdint.h>
+
+namespace mkvmuxer {
+class Cluster;
+class Frame;
+class IMkvWriter;
+
+const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64_t kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size);
+
+// Returns the size in bytes of the element.
+int32_t GetUIntSize(uint64_t value);
+int32_t GetIntSize(int64_t value);
+int32_t GetCodedUIntSize(uint64_t value);
+uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value);
+uint64_t EbmlElementSize(uint64_t type, int64_t value);
+uint64_t EbmlElementSize(uint64_t type, uint64_t value);
+uint64_t EbmlElementSize(uint64_t type, float value);
+uint64_t EbmlElementSize(uint64_t type, const char* value);
+uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size);
+uint64_t EbmlDateElementSize(uint64_t type);
+
+// Returns the size in bytes of the element assuming that the element was
+// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
+// computes the necessary number of bytes based on |value|.
+uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32_t WriteUInt(IMkvWriter* writer, uint64_t value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size);
+
+// Output an Mkv master element. Returns true if the element was written.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t value, uint64_t size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32_t WriteID(IMkvWriter* writer, uint64_t type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value,
+                      uint64_t size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value);
+
+// Output an Mkv non-master element using fixed size. The element will be
+// written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero
+// then it computes the necessary number of bytes based on |value|. Returns true
+// if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value,
+                      uint64_t fixed_size);
+
+// Output a Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                    Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32_t* major, int32_t* minor, int32_t* build,
+                int32_t* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64_t MakeUID(unsigned int* seed);
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVMUXERUTIL_H_
diff --git a/third_party/libwebm/mkvwriter.cpp b/third_party/libwebm/mkvmuxer/mkvwriter.cc
similarity index 97%
rename from third_party/libwebm/mkvwriter.cpp
rename to third_party/libwebm/mkvmuxer/mkvwriter.cc
index 75d4350..ca48e14 100644
--- a/third_party/libwebm/mkvwriter.cpp
+++ b/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -6,14 +6,12 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#include "mkvwriter.hpp"
+#include "mkvmuxer/mkvwriter.h"
 
 #ifdef _MSC_VER
 #include <share.h>  // for _SH_DENYWR
 #endif
 
-#include <new>
-
 namespace mkvmuxer {
 
 MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
diff --git a/third_party/libwebm/mkvwriter.hpp b/third_party/libwebm/mkvmuxer/mkvwriter.h
similarity index 87%
rename from third_party/libwebm/mkvwriter.hpp
rename to third_party/libwebm/mkvmuxer/mkvwriter.h
index 684560c..4227c63 100644
--- a/third_party/libwebm/mkvwriter.hpp
+++ b/third_party/libwebm/mkvmuxer/mkvwriter.h
@@ -6,13 +6,13 @@
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 
-#ifndef MKVWRITER_HPP
-#define MKVWRITER_HPP
+#ifndef MKVMUXER_MKVWRITER_H_
+#define MKVMUXER_MKVWRITER_H_
 
 #include <stdio.h>
 
-#include "mkvmuxer.hpp"
-#include "mkvmuxertypes.hpp"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvmuxertypes.h"
 
 namespace mkvmuxer {
 
@@ -46,6 +46,6 @@ class MkvWriter : public IMkvWriter {
   LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
 };
 
-}  // end namespace mkvmuxer
+}  // namespace mkvmuxer
 
-#endif  // MKVWRITER_HPP
+#endif  // MKVMUXER_MKVWRITER_H_
diff --git a/third_party/libwebm/mkvmuxerutil.hpp b/third_party/libwebm/mkvmuxerutil.hpp
deleted file mode 100644
index e318576..0000000
--- a/third_party/libwebm/mkvmuxerutil.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-#ifndef MKVMUXERUTIL_HPP
-#define MKVMUXERUTIL_HPP
-
-#include "mkvmuxer.hpp"
-#include "mkvmuxertypes.hpp"
-
-namespace mkvmuxer {
-
-class IMkvWriter;
-
-const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
-const int64 kMaxBlockTimecode = 0x07FFFLL;
-
-// Writes out |value| in Big Endian order. Returns 0 on success.
-int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
-
-// Returns the size in bytes of the element.
-int32 GetUIntSize(uint64 value);
-int32 GetIntSize(int64 value);
-int32 GetCodedUIntSize(uint64 value);
-uint64 EbmlMasterElementSize(uint64 type, uint64 value);
-uint64 EbmlElementSize(uint64 type, int64 value);
-uint64 EbmlElementSize(uint64 type, uint64 value);
-uint64 EbmlElementSize(uint64 type, float value);
-uint64 EbmlElementSize(uint64 type, const char* value);
-uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
-uint64 EbmlDateElementSize(uint64 type);
-
-// Creates an EBML coded number from |value| and writes it out. The size of
-// the coded number is determined by the value of |value|. |value| must not
-// be in a coded form. Returns 0 on success.
-int32 WriteUInt(IMkvWriter* writer, uint64 value);
-
-// Creates an EBML coded number from |value| and writes it out. The size of
-// the coded number is determined by the value of |size|. |value| must not
-// be in a coded form. Returns 0 on success.
-int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
-
-// Output an Mkv master element. Returns true if the element was written.
-bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
-
-// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
-// ID to |SerializeInt|. Returns 0 on success.
-int32 WriteID(IMkvWriter* writer, uint64 type);
-
-// Output an Mkv non-master element. Returns true if the element was written.
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
-bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
-                      uint64 size);
-bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
-
-// Output a Mkv Frame. It decides the correct element to write (Block vs
-// SimpleBlock) based on the parameters of the Frame.
-uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
-                  Cluster* cluster);
-
-// Output a void element. |size| must be the entire size in bytes that will be
-// void. The function will calculate the size of the void header and subtract
-// it from |size|.
-uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
-
-// Returns the version number of the muxer in |major|, |minor|, |build|,
-// and |revision|.
-void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
-
-// Returns a random number to be used for UID, using |seed| to seed
-// the random-number generator (see POSIX rand_r() for semantics).
-uint64 MakeUID(unsigned int* seed);
-
-}  // end namespace mkvmuxer
-
-#endif  // MKVMUXERUTIL_HPP
diff --git a/third_party/libwebm/mkvparser.cpp b/third_party/libwebm/mkvparser/mkvparser.cc
similarity index 92%
rename from third_party/libwebm/mkvparser.cpp
rename to third_party/libwebm/mkvparser/mkvparser.cc
index f2855d5..2180115 100644
--- a/third_party/libwebm/mkvparser.cpp
+++ b/third_party/libwebm/mkvparser/mkvparser.cc
@@ -5,8 +5,7 @@
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
-
-#include "mkvparser.hpp"
+#include "mkvparser/mkvparser.h"
 
 #if defined(_MSC_VER) && _MSC_VER < 1800
 #include <float.h>  // _isnan() / _finite()
@@ -14,19 +13,18 @@
 #endif
 
 #include <cassert>
+#include <cfloat>
 #include <climits>
 #include <cmath>
 #include <cstring>
+#include <memory>
 #include <new>
 
-#include "webmids.hpp"
-
-#ifdef _MSC_VER
-// Disable MSVC warnings that suggest making code non-portable.
-#pragma warning(disable : 4996)
-#endif
+#include "common/webmids.h"
 
 namespace mkvparser {
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
 
 #ifdef MSC_COMPAT
 inline bool isnan(double val) { return !!_isnan(val); }
@@ -38,8 +36,9 @@ inline bool isinf(double val) { return std::isinf(val); }
 
 IMkvReader::~IMkvReader() {}
 
-template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
-                                             unsigned long long element_size) {
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size) {
   if (num_elements == 0 || element_size == 0)
     return NULL;
 
@@ -350,9 +349,8 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size,
   return 0;
 }
 
-long ParseElementHeader(IMkvReader* pReader, long long& pos,
-                        long long stop, long long& id,
-                        long long& size) {
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+                        long long& id, long long& size) {
   if (stop >= 0 && pos >= stop)
     return E_FILE_FORMAT_INVALID;
 
@@ -386,7 +384,7 @@ long ParseElementHeader(IMkvReader* pReader, long long& pos,
 
   // pos now designates payload
 
-  if (stop >= 0 && pos >= stop)
+  if (stop >= 0 && pos > stop)
     return E_FILE_FORMAT_INVALID;
 
   return 0;  // success
@@ -520,7 +518,6 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
     return status;
 
   pos = 0;
-  long long end = (available >= 1024) ? 1024 : available;
 
   // Scan until we find what looks like the first byte of the EBML header.
   const long long kMaxScanBytes = (available >= 1024) ? 1024 : available;
@@ -544,8 +541,10 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
   long len = 0;
   const long long ebml_id = ReadID(pReader, pos, len);
 
-  // TODO(tomfinegan): Move Matroska ID constants into a common namespace.
-  if (len != 4 || ebml_id != mkvmuxer::kMkvEBML)
+  if (ebml_id == E_BUFFER_NOT_FULL)
+    return E_BUFFER_NOT_FULL;
+
+  if (len != 4 || ebml_id != libwebm::kMkvEBML)
     return E_FILE_FORMAT_INVALID;
 
   // Move read pos forward to the EBML header size field.
@@ -584,7 +583,7 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
   if ((available - pos) < result)
     return pos + result;
 
-  end = pos + result;
+  const long long end = pos + result;
 
   Init();
 
@@ -599,27 +598,27 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
     if (size == 0)
       return E_FILE_FORMAT_INVALID;
 
-    if (id == mkvmuxer::kMkvEBMLVersion) {
+    if (id == libwebm::kMkvEBMLVersion) {
       m_version = UnserializeUInt(pReader, pos, size);
 
       if (m_version <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvEBMLReadVersion) {
+    } else if (id == libwebm::kMkvEBMLReadVersion) {
       m_readVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_readVersion <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvEBMLMaxIDLength) {
+    } else if (id == libwebm::kMkvEBMLMaxIDLength) {
       m_maxIdLength = UnserializeUInt(pReader, pos, size);
 
       if (m_maxIdLength <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvEBMLMaxSizeLength) {
+    } else if (id == libwebm::kMkvEBMLMaxSizeLength) {
       m_maxSizeLength = UnserializeUInt(pReader, pos, size);
 
       if (m_maxSizeLength <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDocType) {
+    } else if (id == libwebm::kMkvDocType) {
       if (m_docType)
         return E_FILE_FORMAT_INVALID;
 
@@ -627,12 +626,12 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
 
       if (status)  // error
         return status;
-    } else if (id == mkvmuxer::kMkvDocTypeVersion) {
+    } else if (id == libwebm::kMkvDocTypeVersion) {
       m_docTypeVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_docTypeVersion <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDocTypeReadVersion) {
+    } else if (id == libwebm::kMkvDocTypeReadVersion) {
       m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
 
       if (m_docTypeReadVersion <= 0)
@@ -650,8 +649,8 @@ long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
     return E_FILE_FORMAT_INVALID;
 
   // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
-  if (m_maxIdLength <= 0 || m_maxIdLength > 4 ||
-      m_maxSizeLength <= 0 || m_maxSizeLength > 8)
+  if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 ||
+      m_maxSizeLength > 8)
     return E_FILE_FORMAT_INVALID;
 
   return 0;
@@ -786,7 +785,7 @@ long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
     // Handle "unknown size" for live streaming of webm files.
     const long long unknown_size = (1LL << (7 * len)) - 1;
 
-    if (id == mkvmuxer::kMkvSegment) {
+    if (id == libwebm::kMkvSegment) {
       if (size == unknown_size)
         size = -1;
 
@@ -878,7 +877,7 @@ long long Segment::ParseHeaders() {
     if (id < 0)
       return E_FILE_FORMAT_INVALID;
 
-    if (id == mkvmuxer::kMkvCluster)
+    if (id == libwebm::kMkvCluster)
       break;
 
     pos += len;  // consume ID
@@ -930,7 +929,7 @@ long long Segment::ParseHeaders() {
     if ((pos + size) > available)
       return pos + size;
 
-    if (id == mkvmuxer::kMkvInfo) {
+    if (id == libwebm::kMkvInfo) {
       if (m_pInfo)
         return E_FILE_FORMAT_INVALID;
 
@@ -944,7 +943,7 @@ long long Segment::ParseHeaders() {
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvTracks) {
+    } else if (id == libwebm::kMkvTracks) {
       if (m_pTracks)
         return E_FILE_FORMAT_INVALID;
 
@@ -958,7 +957,7 @@ long long Segment::ParseHeaders() {
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvCues) {
+    } else if (id == libwebm::kMkvCues) {
       if (m_pCues == NULL) {
         m_pCues = new (std::nothrow)
             Cues(this, pos, size, element_start, element_size);
@@ -966,7 +965,7 @@ long long Segment::ParseHeaders() {
         if (m_pCues == NULL)
           return -1;
       }
-    } else if (id == mkvmuxer::kMkvSeekHead) {
+    } else if (id == libwebm::kMkvSeekHead) {
       if (m_pSeekHead == NULL) {
         m_pSeekHead = new (std::nothrow)
             SeekHead(this, pos, size, element_start, element_size);
@@ -979,7 +978,7 @@ long long Segment::ParseHeaders() {
         if (status)
           return status;
       }
-    } else if (id == mkvmuxer::kMkvChapters) {
+    } else if (id == libwebm::kMkvChapters) {
       if (m_pChapters == NULL) {
         m_pChapters = new (std::nothrow)
             Chapters(this, pos, size, element_start, element_size);
@@ -992,7 +991,7 @@ long long Segment::ParseHeaders() {
         if (status)
           return status;
       }
-    } else if (id == mkvmuxer::kMkvTags) {
+    } else if (id == libwebm::kMkvTags) {
       if (m_pTags == NULL) {
         m_pTags = new (std::nothrow)
             Tags(this, pos, size, element_start, element_size);
@@ -1131,7 +1130,7 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
       return E_FILE_FORMAT_INVALID;
     }
 
-    if (id == mkvmuxer::kMkvCues) {
+    if (id == libwebm::kMkvCues) {
       if (size == unknown_size) {
         // Cues element of unknown size: Not supported.
         return E_FILE_FORMAT_INVALID;
@@ -1149,7 +1148,7 @@ long Segment::DoLoadCluster(long long& pos, long& len) {
       continue;
     }
 
-    if (id != mkvmuxer::kMkvCluster) {
+    if (id != libwebm::kMkvCluster) {
       // Besides the Segment, Libwebm allows only cluster elements of unknown
       // size. Fail the parse upon encountering a non-cluster element reporting
       // unknown size.
@@ -1466,7 +1465,7 @@ long Segment::Load() {
     return E_FILE_FORMAT_INVALID;
 
   for (;;) {
-    const int status = LoadCluster();
+    const long status = LoadCluster();
 
     if (status < 0)  // error
       return status;
@@ -1512,9 +1511,9 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvSeek)
+    if (id == libwebm::kMkvSeek)
       ++entry_count;
-    else if (id == mkvmuxer::kMkvVoid)
+    else if (id == libwebm::kMkvVoid)
       ++void_element_count;
 
     pos += size;  // consume payload
@@ -1553,14 +1552,14 @@ long SeekHead::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvSeek) {
+    if (id == libwebm::kMkvSeek) {
       if (ParseEntry(pReader, pos, size, pEntry)) {
         Entry& e = *pEntry++;
 
         e.element_start = idpos;
         e.element_size = (pos + size) - idpos;
       }
-    } else if (id == mkvmuxer::kMkvVoid) {
+    } else if (id == libwebm::kMkvVoid) {
       VoidElement& e = *pVoidElement++;
 
       e.element_start = idpos;
@@ -1664,7 +1663,7 @@ long Segment::ParseCues(long long off, long long& pos, long& len) {
 
   const long long id = ReadID(m_pReader, idpos, len);
 
-  if (id != mkvmuxer::kMkvCues)
+  if (id != libwebm::kMkvCues)
     return E_FILE_FORMAT_INVALID;
 
   pos += len;  // consume ID
@@ -1746,7 +1745,7 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
   if (seekIdId < 0)
     return false;
 
-  if (seekIdId != mkvmuxer::kMkvSeekID)
+  if (seekIdId != libwebm::kMkvSeekID)
     return false;
 
   if ((pos + len) > stop)
@@ -1790,7 +1789,7 @@ bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
 
   const long long seekPosId = ReadID(pReader, pos, len);
 
-  if (seekPosId != mkvmuxer::kMkvSeekPosition)
+  if (seekPosId != libwebm::kMkvSeekPosition)
     return false;
 
   if ((pos + len) > stop)
@@ -1900,7 +1899,7 @@ bool Cues::Init() const {
       return false;
     }
 
-    if (id == mkvmuxer::kMkvCuePoint) {
+    if (id == libwebm::kMkvCuePoint) {
       if (!PreloadCuePoint(cue_points_size, idpos))
         return false;
     }
@@ -1975,7 +1974,7 @@ bool Cues::LoadCuePoint() const {
     if ((m_pos + size) > stop)
       return false;
 
-    if (id != mkvmuxer::kMkvCuePoint) {
+    if (id != libwebm::kMkvCuePoint) {
       m_pos += size;  // consume payload
       if (m_pos > stop)
         return false;
@@ -2105,8 +2104,8 @@ const CuePoint* Cues::GetLast() const {
 }
 
 const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
-  if (pCurr == NULL || pCurr->GetTimeCode() < 0 ||
-      m_cue_points == NULL || m_count < 1) {
+  if (pCurr == NULL || pCurr->GetTimeCode() < 0 || m_cue_points == NULL ||
+      m_count < 1) {
     return NULL;
   }
 
@@ -2286,7 +2285,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
     long len;
 
     const long long id = ReadID(pReader, pos_, len);
-    if (id != mkvmuxer::kMkvCuePoint)
+    if (id != libwebm::kMkvCuePoint)
       return false;
 
     pos_ += len;  // consume ID
@@ -2326,10 +2325,10 @@ bool CuePoint::Load(IMkvReader* pReader) {
       return false;
     }
 
-    if (id == mkvmuxer::kMkvCueTime)
+    if (id == libwebm::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == mkvmuxer::kMkvCueTrackPositions)
+    else if (id == libwebm::kMkvCueTrackPositions)
       ++m_track_positions_count;
 
     pos += size;  // consume payload
@@ -2368,7 +2367,7 @@ bool CuePoint::Load(IMkvReader* pReader) {
     pos += len;  // consume Size field
     assert((pos + size) <= stop);
 
-    if (id == mkvmuxer::kMkvCueTrackPositions) {
+    if (id == libwebm::kMkvCueTrackPositions) {
       TrackPosition& tp = *p++;
       if (!tp.Parse(pReader, pos, size)) {
         return false;
@@ -2417,11 +2416,11 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
       return false;
     }
 
-    if (id == mkvmuxer::kMkvCueTrack)
+    if (id == libwebm::kMkvCueTrack)
       m_track = UnserializeUInt(pReader, pos, size);
-    else if (id == mkvmuxer::kMkvCueClusterPosition)
+    else if (id == libwebm::kMkvCueClusterPosition)
       m_pos = UnserializeUInt(pReader, pos, size);
-    else if (id == mkvmuxer::kMkvCueBlockNumber)
+    else if (id == libwebm::kMkvCueBlockNumber)
       m_block = UnserializeUInt(pReader, pos, size);
 
     pos += size;  // consume payload
@@ -2555,7 +2554,7 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
       return NULL;
 
     const long long id = ReadID(m_pReader, pos, len);
-    if (id != mkvmuxer::kMkvCluster)
+    if (id != libwebm::kMkvCluster)
       return NULL;
 
     pos += len;  // consume ID
@@ -2612,7 +2611,7 @@ const Cluster* Segment::GetNext(const Cluster* pCurr) {
     if (size == 0)  // weird
       continue;
 
-    if (id == mkvmuxer::kMkvCluster) {
+    if (id == libwebm::kMkvCluster) {
       const long long off_next_ = idpos - m_start;
 
       long long pos_;
@@ -2762,7 +2761,7 @@ long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
 
     const long long id = ReadUInt(m_pReader, pos, len);
 
-    if (id != mkvmuxer::kMkvCluster)
+    if (id != libwebm::kMkvCluster)
       return -1;
 
     pos += len;  // consume ID
@@ -2927,7 +2926,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       return E_FILE_FORMAT_INVALID;
     }
 
-    if (id == mkvmuxer::kMkvCues) {
+    if (id == libwebm::kMkvCues) {
       if (size == unknown_size)
         return E_FILE_FORMAT_INVALID;
 
@@ -2953,7 +2952,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       continue;
     }
 
-    if (id != mkvmuxer::kMkvCluster) {  // not a Cluster ID
+    if (id != libwebm::kMkvCluster) {  // not a Cluster ID
       if (size == unknown_size)
         return E_FILE_FORMAT_INVALID;
 
@@ -3091,7 +3090,7 @@ long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
       // that we have exhausted the sub-element's inside the cluster
       // whose ID we parsed earlier.
 
-      if (id == mkvmuxer::kMkvCluster || id == mkvmuxer::kMkvCues)
+      if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues)
         break;
 
       pos += len;  // consume ID (of sub-element)
@@ -3259,7 +3258,7 @@ long Chapters::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == mkvmuxer::kMkvEditionEntry) {
+    if (id == libwebm::kMkvEditionEntry) {
       status = ParseEdition(pos, size);
 
       if (status < 0)  // error
@@ -3375,7 +3374,7 @@ long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
     if (size == 0)
       continue;
 
-    if (id == mkvmuxer::kMkvChapterAtom) {
+    if (id == libwebm::kMkvChapterAtom) {
       status = ParseAtom(pReader, pos, size);
 
       if (status < 0)  // error
@@ -3508,17 +3507,17 @@ long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
     if (size == 0)  // 0 length payload, skip.
       continue;
 
-    if (id == mkvmuxer::kMkvChapterDisplay) {
+    if (id == libwebm::kMkvChapterDisplay) {
       status = ParseDisplay(pReader, pos, size);
 
       if (status < 0)  // error
         return status;
-    } else if (id == mkvmuxer::kMkvChapterStringUID) {
+    } else if (id == libwebm::kMkvChapterStringUID) {
       status = UnserializeString(pReader, pos, size, m_string_uid);
 
       if (status < 0)  // error
         return status;
-    } else if (id == mkvmuxer::kMkvChapterUID) {
+    } else if (id == libwebm::kMkvChapterUID) {
       long long val;
       status = UnserializeInt(pReader, pos, size, val);
 
@@ -3526,14 +3525,14 @@ long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
         return status;
 
       m_uid = static_cast<unsigned long long>(val);
-    } else if (id == mkvmuxer::kMkvChapterTimeStart) {
+    } else if (id == libwebm::kMkvChapterTimeStart) {
       const long long val = UnserializeUInt(pReader, pos, size);
 
       if (val < 0)  // error
         return static_cast<long>(val);
 
       m_start_timecode = val;
-    } else if (id == mkvmuxer::kMkvChapterTimeEnd) {
+    } else if (id == libwebm::kMkvChapterTimeEnd) {
       const long long val = UnserializeUInt(pReader, pos, size);
 
       if (val < 0)  // error
@@ -3661,17 +3660,17 @@ long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
     if (size == 0)  // No payload.
       continue;
 
-    if (id == mkvmuxer::kMkvChapString) {
+    if (id == libwebm::kMkvChapString) {
       status = UnserializeString(pReader, pos, size, m_string);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvChapLanguage) {
+    } else if (id == libwebm::kMkvChapLanguage) {
       status = UnserializeString(pReader, pos, size, m_language);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvChapCountry) {
+    } else if (id == libwebm::kMkvChapCountry) {
       status = UnserializeString(pReader, pos, size, m_country);
 
       if (status)
@@ -3724,7 +3723,7 @@ long Tags::Parse() {
     if (size == 0)  // 0 length tag, read another
       continue;
 
-    if (id == mkvmuxer::kMkvTag) {
+    if (id == libwebm::kMkvTag) {
       status = ParseTag(pos, size);
 
       if (status < 0)
@@ -3840,7 +3839,7 @@ long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
     if (size == 0)  // 0 length tag, read another
       continue;
 
-    if (id == mkvmuxer::kMkvSimpleTag) {
+    if (id == libwebm::kMkvSimpleTag) {
       status = ParseSimpleTag(pReader, pos, size);
 
       if (status < 0)
@@ -3931,12 +3930,12 @@ long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
     if (size == 0)  // weird
       continue;
 
-    if (id == mkvmuxer::kMkvTagName) {
+    if (id == libwebm::kMkvTagName) {
       status = UnserializeString(pReader, pos, size, m_tag_name);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvTagString) {
+    } else if (id == libwebm::kMkvTagString) {
       status = UnserializeString(pReader, pos, size, m_tag_string);
 
       if (status)
@@ -3996,12 +3995,12 @@ long SegmentInfo::Parse() {
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvTimecodeScale) {
+    if (id == libwebm::kMkvTimecodeScale) {
       m_timecodeScale = UnserializeUInt(pReader, pos, size);
 
       if (m_timecodeScale <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDuration) {
+    } else if (id == libwebm::kMkvDuration) {
       const long status = UnserializeFloat(pReader, pos, size, m_duration);
 
       if (status < 0)
@@ -4009,19 +4008,19 @@ long SegmentInfo::Parse() {
 
       if (m_duration < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvMuxingApp) {
+    } else if (id == libwebm::kMkvMuxingApp) {
       const long status =
           UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvWritingApp) {
+    } else if (id == libwebm::kMkvWritingApp) {
       const long status =
           UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvTitle) {
+    } else if (id == libwebm::kMkvTitle) {
       const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
 
       if (status)
@@ -4176,7 +4175,7 @@ long ContentEncoding::ParseContentEncAESSettingsEntry(
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvAESSettingsCipherMode) {
+    if (id == libwebm::kMkvAESSettingsCipherMode) {
       aes->cipher_mode = UnserializeUInt(pReader, pos, size);
       if (aes->cipher_mode != 1)
         return E_FILE_FORMAT_INVALID;
@@ -4207,10 +4206,10 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvContentCompression)
+    if (id == libwebm::kMkvContentCompression)
       ++compression_count;
 
-    if (id == mkvmuxer::kMkvContentEncryption)
+    if (id == libwebm::kMkvContentEncryption)
       ++encryption_count;
 
     pos += size;  // consume payload
@@ -4246,15 +4245,15 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvContentEncodingOrder) {
+    if (id == libwebm::kMkvContentEncodingOrder) {
       encoding_order_ = UnserializeUInt(pReader, pos, size);
-    } else if (id == mkvmuxer::kMkvContentEncodingScope) {
+    } else if (id == libwebm::kMkvContentEncodingScope) {
       encoding_scope_ = UnserializeUInt(pReader, pos, size);
       if (encoding_scope_ < 1)
         return -1;
-    } else if (id == mkvmuxer::kMkvContentEncodingType) {
+    } else if (id == libwebm::kMkvContentEncodingType) {
       encoding_type_ = UnserializeUInt(pReader, pos, size);
-    } else if (id == mkvmuxer::kMkvContentCompression) {
+    } else if (id == libwebm::kMkvContentCompression) {
       ContentCompression* const compression =
           new (std::nothrow) ContentCompression();
       if (!compression)
@@ -4266,7 +4265,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
         return status;
       }
       *compression_entries_end_++ = compression;
-    } else if (id == mkvmuxer::kMkvContentEncryption) {
+    } else if (id == libwebm::kMkvContentEncryption) {
       ContentEncryption* const encryption =
           new (std::nothrow) ContentEncryption();
       if (!encryption)
@@ -4307,13 +4306,13 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvContentCompAlgo) {
+    if (id == libwebm::kMkvContentCompAlgo) {
       long long algo = UnserializeUInt(pReader, pos, size);
       if (algo < 0)
         return E_FILE_FORMAT_INVALID;
       compression->algo = algo;
       valid = true;
-    } else if (id == mkvmuxer::kMkvContentCompSettings) {
+    } else if (id == libwebm::kMkvContentCompSettings) {
       if (size <= 0)
         return E_FILE_FORMAT_INVALID;
 
@@ -4360,11 +4359,11 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvContentEncAlgo) {
+    if (id == libwebm::kMkvContentEncAlgo) {
       encryption->algo = UnserializeUInt(pReader, pos, size);
       if (encryption->algo != 5)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvContentEncKeyID) {
+    } else if (id == libwebm::kMkvContentEncKeyID) {
       delete[] encryption->key_id;
       encryption->key_id = NULL;
       encryption->key_id_len = 0;
@@ -4386,7 +4385,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->key_id = buf;
       encryption->key_id_len = buflen;
-    } else if (id == mkvmuxer::kMkvContentSignature) {
+    } else if (id == libwebm::kMkvContentSignature) {
       delete[] encryption->signature;
       encryption->signature = NULL;
       encryption->signature_len = 0;
@@ -4408,7 +4407,7 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->signature = buf;
       encryption->signature_len = buflen;
-    } else if (id == mkvmuxer::kMkvContentSigKeyID) {
+    } else if (id == libwebm::kMkvContentSigKeyID) {
       delete[] encryption->sig_key_id;
       encryption->sig_key_id = NULL;
       encryption->sig_key_id_len = 0;
@@ -4430,11 +4429,11 @@ long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
 
       encryption->sig_key_id = buf;
       encryption->sig_key_id_len = buflen;
-    } else if (id == mkvmuxer::kMkvContentSigAlgo) {
+    } else if (id == libwebm::kMkvContentSigAlgo) {
       encryption->sig_algo = UnserializeUInt(pReader, pos, size);
-    } else if (id == mkvmuxer::kMkvContentSigHashAlgo) {
+    } else if (id == libwebm::kMkvContentSigHashAlgo) {
       encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
-    } else if (id == mkvmuxer::kMkvContentEncAESSettings) {
+    } else if (id == libwebm::kMkvContentEncAESSettings) {
       const long status = ParseContentEncAESSettingsEntry(
           pos, size, pReader, &encryption->aes_settings);
       if (status)
@@ -4921,7 +4920,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == mkvmuxer::kMkvContentEncoding)
+    if (id == libwebm::kMkvContentEncoding)
       ++count;
 
     pos += size;  // consume payload
@@ -4946,7 +4945,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) {
       return status;
 
     // pos now designates start of element
-    if (id == mkvmuxer::kMkvContentEncoding) {
+    if (id == libwebm::kMkvContentEncoding) {
       ContentEncoding* const content_encoding =
           new (std::nothrow) ContentEncoding();
       if (!content_encoding)
@@ -4978,9 +4977,222 @@ BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; }
 
 const Block* Track::EOSBlock::GetBlock() const { return NULL; }
 
+bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
+                                long long value_size, bool is_x,
+                                PrimaryChromaticity** chromaticity) {
+  if (!reader)
+    return false;
+
+  std::auto_ptr<PrimaryChromaticity> chromaticity_ptr;
+
+  if (!*chromaticity) {
+    chromaticity_ptr.reset(new PrimaryChromaticity());
+  } else {
+    chromaticity_ptr.reset(*chromaticity);
+  }
+
+  if (!chromaticity_ptr.get())
+    return false;
+
+  float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y;
+
+  double parser_value = 0;
+  const long long value_parse_status =
+      UnserializeFloat(reader, read_pos, value_size, parser_value);
+
+  *value = static_cast<float>(parser_value);
+
+  if (value_parse_status < 0 || *value < 0.0 || *value > 1.0)
+    return false;
+
+  *chromaticity = chromaticity_ptr.release();
+  return true;
+}
+
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+                              long long mm_size, MasteringMetadata** mm) {
+  if (!reader || *mm)
+    return false;
+
+  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  const long long mm_end = mm_start + mm_size;
+  long long read_pos = mm_start;
+
+  while (read_pos < mm_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvLuminanceMax) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      mm_ptr->luminance_max = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
+          mm_ptr->luminance_max > 9999.99) {
+        return false;
+      }
+    } else if (child_id == libwebm::kMkvLuminanceMin) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      mm_ptr->luminance_min = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
+          mm_ptr->luminance_min > 999.9999) {
+        return false;
+      }
+    } else {
+      bool is_x = false;
+      PrimaryChromaticity** chromaticity;
+      switch (child_id) {
+        case libwebm::kMkvPrimaryRChromaticityX:
+        case libwebm::kMkvPrimaryRChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+          chromaticity = &mm_ptr->r;
+          break;
+        case libwebm::kMkvPrimaryGChromaticityX:
+        case libwebm::kMkvPrimaryGChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+          chromaticity = &mm_ptr->g;
+          break;
+        case libwebm::kMkvPrimaryBChromaticityX:
+        case libwebm::kMkvPrimaryBChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+          chromaticity = &mm_ptr->b;
+          break;
+        case libwebm::kMkvWhitePointChromaticityX:
+        case libwebm::kMkvWhitePointChromaticityY:
+          is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+          chromaticity = &mm_ptr->white_point;
+          break;
+        default:
+          return false;
+      }
+      const bool value_parse_status = PrimaryChromaticity::Parse(
+          reader, read_pos, child_size, is_x, chromaticity);
+      if (!value_parse_status)
+        return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > mm_end)
+      return false;
+  }
+
+  *mm = mm_ptr.release();
+  return true;
+}
+
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+                   long long colour_size, Colour** colour) {
+  if (!reader || *colour)
+    return false;
+
+  std::auto_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  const long long colour_end = colour_start + colour_size;
+  long long read_pos = colour_start;
+
+  while (read_pos < colour_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long status =
+        ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvMatrixCoefficients) {
+      colour_ptr->matrix_coefficients =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->matrix_coefficients < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvBitsPerChannel) {
+      colour_ptr->bits_per_channel =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->bits_per_channel < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingHorz) {
+      colour_ptr->chroma_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingVert) {
+      colour_ptr->chroma_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingHorz) {
+      colour_ptr->cb_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingVert) {
+      colour_ptr->cb_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingHorz) {
+      colour_ptr->chroma_siting_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingVert) {
+      colour_ptr->chroma_siting_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvRange) {
+      colour_ptr->range = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->range < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvTransferCharacteristics) {
+      colour_ptr->transfer_characteristics =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->transfer_characteristics < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvPrimaries) {
+      colour_ptr->primaries = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->primaries < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxCLL) {
+      colour_ptr->max_cll = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_cll < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxFALL) {
+      colour_ptr->max_fall = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_fall < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMasteringMetadata) {
+      if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+                                    &colour_ptr->mastering_metadata))
+        return false;
+    } else {
+      return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > colour_end)
+      return false;
+  }
+  *colour = colour_ptr.release();
+  return true;
+}
+
 VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
                        long long element_size)
-    : Track(pSegment, element_start, element_size) {}
+    : Track(pSegment, element_start, element_size), m_colour(NULL) {}
+
+VideoTrack::~VideoTrack() { delete m_colour; }
 
 long VideoTrack::Parse(Segment* pSegment, const Info& info,
                        long long element_start, long long element_size,
@@ -5011,6 +5223,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
 
   const long long stop = pos + s.size;
 
+  Colour* colour = NULL;
+
   while (pos < stop) {
     long long id, size;
 
@@ -5019,37 +5233,37 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvPixelWidth) {
+    if (id == libwebm::kMkvPixelWidth) {
       width = UnserializeUInt(pReader, pos, size);
 
       if (width <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvPixelHeight) {
+    } else if (id == libwebm::kMkvPixelHeight) {
       height = UnserializeUInt(pReader, pos, size);
 
       if (height <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDisplayWidth) {
+    } else if (id == libwebm::kMkvDisplayWidth) {
       display_width = UnserializeUInt(pReader, pos, size);
 
       if (display_width <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDisplayHeight) {
+    } else if (id == libwebm::kMkvDisplayHeight) {
       display_height = UnserializeUInt(pReader, pos, size);
 
       if (display_height <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvDisplayUnit) {
+    } else if (id == libwebm::kMkvDisplayUnit) {
       display_unit = UnserializeUInt(pReader, pos, size);
 
       if (display_unit < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvStereoMode) {
+    } else if (id == libwebm::kMkvStereoMode) {
       stereo_mode = UnserializeUInt(pReader, pos, size);
 
       if (stereo_mode < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvFrameRate) {
+    } else if (id == libwebm::kMkvFrameRate) {
       const long status = UnserializeFloat(pReader, pos, size, rate);
 
       if (status < 0)
@@ -5057,6 +5271,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
 
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvColour) {
+      if (!Colour::Parse(pReader, pos, size, &colour))
+        return E_FILE_FORMAT_INVALID;
     }
 
     pos += size;  // consume payload
@@ -5087,6 +5304,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   pTrack->m_display_unit = display_unit;
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
+  pTrack->m_colour = colour;
 
   pResult = pTrack;
   return 0;  // success
@@ -5185,6 +5403,8 @@ long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
   return 0;
 }
 
+Colour* VideoTrack::GetColour() const { return m_colour; }
+
 long long VideoTrack::GetWidth() const { return m_width; }
 
 long long VideoTrack::GetHeight() const { return m_height; }
@@ -5239,7 +5459,7 @@ long AudioTrack::Parse(Segment* pSegment, const Info& info,
     if (status < 0)  // error
       return status;
 
-    if (id == mkvmuxer::kMkvSamplingFrequency) {
+    if (id == libwebm::kMkvSamplingFrequency) {
       status = UnserializeFloat(pReader, pos, size, rate);
 
       if (status < 0)
@@ -5247,12 +5467,12 @@ long AudioTrack::Parse(Segment* pSegment, const Info& info,
 
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvChannels) {
+    } else if (id == libwebm::kMkvChannels) {
       channels = UnserializeUInt(pReader, pos, size);
 
       if (channels <= 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvBitDepth) {
+    } else if (id == libwebm::kMkvBitDepth) {
       bit_depth = UnserializeUInt(pReader, pos, size);
 
       if (bit_depth <= 0)
@@ -5325,7 +5545,7 @@ long Tracks::Parse() {
     if (size == 0)  // weird
       continue;
 
-    if (id == mkvmuxer::kMkvTrackEntry)
+    if (id == libwebm::kMkvTrackEntry)
       ++count;
 
     pos += size;  // consume payload
@@ -5367,7 +5587,7 @@ long Tracks::Parse() {
 
     const long long element_size = payload_stop - element_start;
 
-    if (id == mkvmuxer::kMkvTrackEntry) {
+    if (id == libwebm::kMkvTrackEntry) {
       Track*& pTrack = *m_trackEntriesEnd;
       pTrack = NULL;
 
@@ -5443,16 +5663,16 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
 
     const long long start = pos;
 
-    if (id == mkvmuxer::kMkvVideo) {
+    if (id == libwebm::kMkvVideo) {
       v.start = start;
       v.size = size;
-    } else if (id == mkvmuxer::kMkvAudio) {
+    } else if (id == libwebm::kMkvAudio) {
       a.start = start;
       a.size = size;
-    } else if (id == mkvmuxer::kMkvContentEncodings) {
+    } else if (id == libwebm::kMkvContentEncodings) {
       e.start = start;
       e.size = size;
-    } else if (id == mkvmuxer::kMkvTrackUID) {
+    } else if (id == libwebm::kMkvTrackUID) {
       if (size > 8)
         return E_FILE_FORMAT_INVALID;
 
@@ -5474,49 +5694,49 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
 
         ++pos_;
       }
-    } else if (id == mkvmuxer::kMkvTrackNumber) {
+    } else if (id == libwebm::kMkvTrackNumber) {
       const long long num = UnserializeUInt(pReader, pos, size);
 
       if ((num <= 0) || (num > 127))
         return E_FILE_FORMAT_INVALID;
 
       info.number = static_cast<long>(num);
-    } else if (id == mkvmuxer::kMkvTrackType) {
+    } else if (id == libwebm::kMkvTrackType) {
       const long long type = UnserializeUInt(pReader, pos, size);
 
       if ((type <= 0) || (type > 254))
         return E_FILE_FORMAT_INVALID;
 
       info.type = static_cast<long>(type);
-    } else if (id == mkvmuxer::kMkvName) {
+    } else if (id == libwebm::kMkvName) {
       const long status =
           UnserializeString(pReader, pos, size, info.nameAsUTF8);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvLanguage) {
+    } else if (id == libwebm::kMkvLanguage) {
       const long status = UnserializeString(pReader, pos, size, info.language);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvDefaultDuration) {
+    } else if (id == libwebm::kMkvDefaultDuration) {
       const long long duration = UnserializeUInt(pReader, pos, size);
 
       if (duration < 0)
         return E_FILE_FORMAT_INVALID;
 
       info.defaultDuration = static_cast<unsigned long long>(duration);
-    } else if (id == mkvmuxer::kMkvCodecID) {
+    } else if (id == libwebm::kMkvCodecID) {
       const long status = UnserializeString(pReader, pos, size, info.codecId);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvFlagLacing) {
+    } else if (id == libwebm::kMkvFlagLacing) {
       lacing = UnserializeUInt(pReader, pos, size);
 
       if ((lacing < 0) || (lacing > 1))
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvCodecPrivate) {
+    } else if (id == libwebm::kMkvCodecPrivate) {
       delete[] info.codecPrivate;
       info.codecPrivate = NULL;
       info.codecPrivateSize = 0;
@@ -5539,15 +5759,15 @@ long Tracks::ParseTrackEntry(long long track_start, long long track_size,
         info.codecPrivate = buf;
         info.codecPrivateSize = buflen;
       }
-    } else if (id == mkvmuxer::kMkvCodecName) {
+    } else if (id == libwebm::kMkvCodecName) {
       const long status =
           UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
 
       if (status)
         return status;
-    } else if (id == mkvmuxer::kMkvCodecDelay) {
+    } else if (id == libwebm::kMkvCodecDelay) {
       info.codecDelay = UnserializeUInt(pReader, pos, size);
-    } else if (id == mkvmuxer::kMkvSeekPreRoll) {
+    } else if (id == libwebm::kMkvSeekPreRoll) {
       info.seekPreRoll = UnserializeUInt(pReader, pos, size);
     }
 
@@ -5730,7 +5950,7 @@ long Cluster::Load(long long& pos, long& len) const {
   if (id_ < 0)  // error
     return static_cast<long>(id_);
 
-  if (id_ != mkvmuxer::kMkvCluster)
+  if (id_ != libwebm::kMkvCluster)
     return E_FILE_FORMAT_INVALID;
 
   pos += len;  // consume id
@@ -5812,10 +6032,10 @@ long Cluster::Load(long long& pos, long& len) const {
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if (id == mkvmuxer::kMkvCluster)
+    if (id == libwebm::kMkvCluster)
       break;
 
-    if (id == mkvmuxer::kMkvCues)
+    if (id == libwebm::kMkvCues)
       break;
 
     pos += len;  // consume ID field
@@ -5864,7 +6084,7 @@ long Cluster::Load(long long& pos, long& len) const {
     if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
       return E_FILE_FORMAT_INVALID;
 
-    if (id == mkvmuxer::kMkvTimecode) {
+    if (id == libwebm::kMkvTimecode) {
       len = static_cast<long>(size);
 
       if ((pos + size) > avail)
@@ -5879,10 +6099,10 @@ long Cluster::Load(long long& pos, long& len) const {
 
       if (bBlock)
         break;
-    } else if (id == mkvmuxer::kMkvBlockGroup) {
+    } else if (id == libwebm::kMkvBlockGroup) {
       bBlock = true;
       break;
-    } else if (id == mkvmuxer::kMkvSimpleBlock) {
+    } else if (id == libwebm::kMkvSimpleBlock) {
       bBlock = true;
       break;
     }
@@ -5980,7 +6200,7 @@ long Cluster::Parse(long long& pos, long& len) const {
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if ((id == mkvmuxer::kMkvCluster) || (id == mkvmuxer::kMkvCues)) {
+    if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) {
       if (m_element_size < 0)
         m_element_size = pos - m_element_start;
 
@@ -6035,8 +6255,7 @@ long Cluster::Parse(long long& pos, long& len) const {
 
     if (cluster_stop >= 0) {
       if (block_stop > cluster_stop) {
-        if (id == mkvmuxer::kMkvBlockGroup ||
-            id == mkvmuxer::kMkvSimpleBlock) {
+        if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) {
           return E_FILE_FORMAT_INVALID;
         }
 
@@ -6054,10 +6273,10 @@ long Cluster::Parse(long long& pos, long& len) const {
 
     Cluster* const this_ = const_cast<Cluster*>(this);
 
-    if (id == mkvmuxer::kMkvBlockGroup)
+    if (id == libwebm::kMkvBlockGroup)
       return this_->ParseBlockGroup(size, pos, len);
 
-    if (id == mkvmuxer::kMkvSimpleBlock)
+    if (id == libwebm::kMkvSimpleBlock)
       return this_->ParseSimpleBlock(size, pos, len);
 
     pos += size;  // consume payload
@@ -6188,8 +6407,7 @@ long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
     return E_BUFFER_NOT_FULL;
   }
 
-  status = CreateBlock(mkvmuxer::kMkvSimpleBlock,
-                       block_start, block_size,
+  status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size,
                        0);  // DiscardPadding
 
   if (status != 0)
@@ -6299,14 +6517,14 @@ long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
     if (size == unknown_size)
       return E_FILE_FORMAT_INVALID;
 
-    if (id == mkvmuxer::kMkvDiscardPadding) {
+    if (id == libwebm::kMkvDiscardPadding) {
       status = UnserializeInt(pReader, pos, size, discard_padding);
 
       if (status < 0)  // error
         return status;
     }
 
-    if (id != mkvmuxer::kMkvBlock) {
+    if (id != libwebm::kMkvBlock) {
       pos += size;  // consume sub-part of block group
 
       if (pos > payload_stop)
@@ -6399,8 +6617,8 @@ long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
   if (pos != payload_stop)
     return E_FILE_FORMAT_INVALID;
 
-  status = CreateBlock(mkvmuxer::kMkvBlockGroup,
-                       payload_start, payload_size, discard_padding);
+  status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size,
+                       discard_padding);
   if (status != 0)
     return status;
 
@@ -6565,7 +6783,7 @@ long Cluster::HasBlockEntries(
     if (id < 0)  // error
       return static_cast<long>(id);
 
-    if (id != mkvmuxer::kMkvCluster)
+    if (id != libwebm::kMkvCluster)
       return E_PARSE_FAILED;
 
     pos += len;  // consume Cluster ID field
@@ -6653,10 +6871,10 @@ long Cluster::HasBlockEntries(
     // that we have exhausted the sub-element's inside the cluster
     // whose ID we parsed earlier.
 
-    if (id == mkvmuxer::kMkvCluster)
+    if (id == libwebm::kMkvCluster)
       return 0;  // no entries found
 
-    if (id == mkvmuxer::kMkvCues)
+    if (id == libwebm::kMkvCues)
       return 0;  // no entries found
 
     pos += len;  // consume id field
@@ -6708,10 +6926,10 @@ long Cluster::HasBlockEntries(
     if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
       return E_FILE_FORMAT_INVALID;
 
-    if (id == mkvmuxer::kMkvBlockGroup)
+    if (id == libwebm::kMkvBlockGroup)
       return 1;  // have at least one entry
 
-    if (id == mkvmuxer::kMkvSimpleBlock)
+    if (id == libwebm::kMkvSimpleBlock)
       return 1;  // have at least one entry
 
     pos += size;  // consume payload
@@ -6786,7 +7004,7 @@ long long Cluster::GetLastTime() const {
 long Cluster::CreateBlock(long long id,
                           long long pos,  // absolute pos of payload
                           long long size, long long discard_padding) {
-  if (id != mkvmuxer::kMkvBlockGroup && id != mkvmuxer::kMkvSimpleBlock)
+  if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock)
     return E_PARSE_FAILED;
 
   if (m_entries_count < 0) {  // haven't parsed anything yet
@@ -6826,7 +7044,7 @@ long Cluster::CreateBlock(long long id,
     }
   }
 
-  if (id == mkvmuxer::kMkvBlockGroup)
+  if (id == libwebm::kMkvBlockGroup)
     return CreateBlockGroup(pos, size, discard_padding);
   else
     return CreateSimpleBlock(pos, size);
@@ -6871,12 +7089,12 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
     pos += len;  // consume size
 
-    if (id == mkvmuxer::kMkvBlock) {
+    if (id == libwebm::kMkvBlock) {
       if (bpos < 0) {  // Block ID
         bpos = pos;
         bsize = size;
       }
-    } else if (id == mkvmuxer::kMkvBlockDuration) {
+    } else if (id == libwebm::kMkvBlockDuration) {
       if (size > 8)
         return E_FILE_FORMAT_INVALID;
 
@@ -6884,7 +7102,7 @@ long Cluster::CreateBlockGroup(long long start_offset, long long size,
 
       if (duration < 0)
         return E_FILE_FORMAT_INVALID;
-    } else if (id == mkvmuxer::kMkvReferenceBlock) {
+    } else if (id == libwebm::kMkvReferenceBlock) {
       if (size > 8 || size <= 0)
         return E_FILE_FORMAT_INVALID;
       const long size_ = static_cast<long>(size);
@@ -7231,7 +7449,6 @@ const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
 
 BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {}
 BlockEntry::~BlockEntry() {}
-bool BlockEntry::EOS() const { return (GetKind() == kBlockEOS); }
 const Cluster* BlockEntry::GetCluster() const { return m_pCluster; }
 long BlockEntry::GetIndex() const { return m_index; }
 
@@ -7555,7 +7772,6 @@ long Block::Parse(const Cluster* pCluster) {
       if (pf >= pf_end)
         return E_FILE_FORMAT_INVALID;
 
-
       const Frame& prev = *pf++;
       assert(prev.len == frame_size);
       if (prev.len != frame_size)
@@ -7581,7 +7797,7 @@ long Block::Parse(const Cluster* pCluster) {
       if (pos > stop)
         return E_FILE_FORMAT_INVALID;
 
-      const int exp = 7 * len - 1;
+      const long exp = 7 * len - 1;
       const long long bias = (1LL << exp) - 1LL;
       const long long delta_size = delta_size_ - bias;
 
@@ -7721,4 +7937,4 @@ long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const {
 
 long long Block::GetDiscardPadding() const { return m_discard_padding; }
 
-}  // end namespace mkvparser
+}  // namespace mkvparser
diff --git a/third_party/libwebm/mkvparser.hpp b/third_party/libwebm/mkvparser/mkvparser.h
similarity index 90%
rename from third_party/libwebm/mkvparser.hpp
rename to third_party/libwebm/mkvparser/mkvparser.h
index 75ef69d..42e6e88 100644
--- a/third_party/libwebm/mkvparser.hpp
+++ b/third_party/libwebm/mkvparser/mkvparser.h
@@ -5,13 +5,10 @@
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
-
-#ifndef MKVPARSER_HPP
-#define MKVPARSER_HPP
+#ifndef MKVPARSER_MKVPARSER_H_
+#define MKVPARSER_MKVPARSER_H_
 
 #include <cstddef>
-#include <cstdio>
-#include <cstdlib>
 
 namespace mkvparser {
 
@@ -28,8 +25,9 @@ class IMkvReader {
   virtual ~IMkvReader();
 };
 
-template<typename Type> Type* SafeArrayAlloc(unsigned long long num_elements,
-                                             unsigned long long element_size);
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size);
 long long GetUIntLength(IMkvReader*, long long, long&);
 long long ReadUInt(IMkvReader*, long long, long&);
 long long ReadID(IMkvReader* pReader, long long pos, long& len);
@@ -128,7 +126,7 @@ class BlockEntry {
  public:
   virtual ~BlockEntry();
 
-  bool EOS() const;
+  bool EOS() const { return (GetKind() == kBlockEOS); }
   const Cluster* GetCluster() const;
   long GetIndex() const;
   virtual const Block* GetBlock() const = 0;
@@ -391,6 +389,90 @@ class Track {
   ContentEncoding** content_encoding_entries_end_;
 };
 
+struct PrimaryChromaticity {
+  PrimaryChromaticity() : x(0), y(0) {}
+  ~PrimaryChromaticity() {}
+  static bool Parse(IMkvReader* reader, long long read_pos,
+                    long long value_size, bool is_x,
+                    PrimaryChromaticity** chromaticity);
+  float x;
+  float y;
+};
+
+struct MasteringMetadata {
+  static const float kValueNotPresent;
+
+  MasteringMetadata()
+      : r(NULL),
+        g(NULL),
+        b(NULL),
+        white_point(NULL),
+        luminance_max(kValueNotPresent),
+        luminance_min(kValueNotPresent) {}
+  ~MasteringMetadata() {
+    delete r;
+    delete g;
+    delete b;
+    delete white_point;
+  }
+
+  static bool Parse(IMkvReader* reader, long long element_start,
+                    long long element_size,
+                    MasteringMetadata** mastering_metadata);
+
+  PrimaryChromaticity* r;
+  PrimaryChromaticity* g;
+  PrimaryChromaticity* b;
+  PrimaryChromaticity* white_point;
+  float luminance_max;
+  float luminance_min;
+};
+
+struct Colour {
+  static const long long kValueNotPresent;
+
+  // Unless otherwise noted all values assigned upon construction are the
+  // equivalent of unspecified/default.
+  Colour()
+      : matrix_coefficients(kValueNotPresent),
+        bits_per_channel(kValueNotPresent),
+        chroma_subsampling_horz(kValueNotPresent),
+        chroma_subsampling_vert(kValueNotPresent),
+        cb_subsampling_horz(kValueNotPresent),
+        cb_subsampling_vert(kValueNotPresent),
+        chroma_siting_horz(kValueNotPresent),
+        chroma_siting_vert(kValueNotPresent),
+        range(kValueNotPresent),
+        transfer_characteristics(kValueNotPresent),
+        primaries(kValueNotPresent),
+        max_cll(kValueNotPresent),
+        max_fall(kValueNotPresent),
+        mastering_metadata(NULL) {}
+  ~Colour() {
+    delete mastering_metadata;
+    mastering_metadata = NULL;
+  }
+
+  static bool Parse(IMkvReader* reader, long long element_start,
+                    long long element_size, Colour** colour);
+
+  long long matrix_coefficients;
+  long long bits_per_channel;
+  long long chroma_subsampling_horz;
+  long long chroma_subsampling_vert;
+  long long cb_subsampling_horz;
+  long long cb_subsampling_vert;
+  long long chroma_siting_horz;
+  long long chroma_siting_vert;
+  long long range;
+  long long transfer_characteristics;
+  long long primaries;
+  long long max_cll;
+  long long max_fall;
+
+  MasteringMetadata* mastering_metadata;
+};
+
 class VideoTrack : public Track {
   VideoTrack(const VideoTrack&);
   VideoTrack& operator=(const VideoTrack&);
@@ -398,6 +480,7 @@ class VideoTrack : public Track {
   VideoTrack(Segment*, long long element_start, long long element_size);
 
  public:
+  virtual ~VideoTrack();
   static long Parse(Segment*, const Info&, long long element_start,
                     long long element_size, VideoTrack*&);
 
@@ -412,6 +495,8 @@ class VideoTrack : public Track {
   bool VetEntry(const BlockEntry*) const;
   long Seek(long long time_ns, const BlockEntry*&) const;
 
+  Colour* GetColour() const;
+
  private:
   long long m_width;
   long long m_height;
@@ -421,6 +506,8 @@ class VideoTrack : public Track {
   long long m_stereo_mode;
 
   double m_rate;
+
+  Colour* m_colour;
 };
 
 class AudioTrack : public Track {
@@ -1013,7 +1100,7 @@ class Segment {
   const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
 };
 
-}  // end namespace mkvparser
+}  // namespace mkvparser
 
 inline long mkvparser::Segment::LoadCluster() {
   long long pos;
@@ -1022,4 +1109,4 @@ inline long mkvparser::Segment::LoadCluster() {
   return LoadCluster(pos, size);
 }
 
-#endif  // MKVPARSER_HPP
+#endif  // MKVPARSER_MKVPARSER_H_
diff --git a/third_party/libwebm/mkvreader.cpp b/third_party/libwebm/mkvparser/mkvreader.cc
similarity index 97%
rename from third_party/libwebm/mkvreader.cpp
rename to third_party/libwebm/mkvparser/mkvreader.cc
index eaf9e0a..9f90d8c 100644
--- a/third_party/libwebm/mkvreader.cpp
+++ b/third_party/libwebm/mkvparser/mkvreader.cc
@@ -5,8 +5,7 @@
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
-
-#include "mkvreader.hpp"
+#include "mkvparser/mkvreader.h"
 
 #include <cassert>
 
@@ -129,4 +128,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
   return 0;  // success
 }
 
-}  // end namespace mkvparser
+}  // namespace mkvparser
\ No newline at end of file
diff --git a/third_party/libwebm/mkvreader.hpp b/third_party/libwebm/mkvparser/mkvreader.h
similarity index 87%
rename from third_party/libwebm/mkvreader.hpp
rename to third_party/libwebm/mkvparser/mkvreader.h
index 82ebad5..9831ecf 100644
--- a/third_party/libwebm/mkvreader.hpp
+++ b/third_party/libwebm/mkvparser/mkvreader.h
@@ -5,13 +5,13 @@
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS.  All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVPARSER_MKVREADER_H_
+#define MKVPARSER_MKVREADER_H_
 
-#ifndef MKVREADER_HPP
-#define MKVREADER_HPP
-
-#include "mkvparser.hpp"
 #include <cstdio>
 
+#include "mkvparser/mkvparser.h"
+
 namespace mkvparser {
 
 class MkvReader : public IMkvReader {
@@ -40,6 +40,6 @@ class MkvReader : public IMkvReader {
   bool reader_owns_file_;
 };
 
-}  // end namespace mkvparser
+}  // namespace mkvparser
 
-#endif  // MKVREADER_HPP
+#endif  // MKVPARSER_MKVREADER_H_
diff --git a/third_party/x86inc/README.libvpx b/third_party/x86inc/README.libvpx
index e91e305..8d3cd96 100644
--- a/third_party/x86inc/README.libvpx
+++ b/third_party/x86inc/README.libvpx
@@ -1,5 +1,5 @@
-URL: http://git.videolan.org/?p=x264.git
-Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
+URL: https://git.videolan.org/git/x264.git
+Version: d23d18655249944c1ca894b451e2c82c7a584c62
 License: ISC
 License File: LICENSE
 
@@ -13,12 +13,8 @@ Prefix functions with vpx by default.
 Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
   exist in libvpx.
 Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
-Catch all elf formats for 'hidden' status and SECTION notes.
-Avoid 'amdnop' when building with nasm.
 Set 'private_extern' visibility for macho targets.
 Copy PIC 'GLOBAL' macros from x86_abi_support.asm
 Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
 Use .text with no alignment for aout
 Only use 'hidden' visibility with Chromium
-Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
-  'ALIGNMODE'.
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..b647dff 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2015 x264 project
+;* Copyright (C) 2005-2016 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm at u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster at narod.ru>
@@ -66,16 +66,35 @@
     %endif
 %endif
 
-%ifidn   __OUTPUT_FORMAT__,elf32
-    %define mangle(x) x
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+    %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+    %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
-    %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,x64
-    %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,win64
-    %define mangle(x) x
+    %define FORMAT_ELF 1
+%endif
+
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,macho32
+     %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+     %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libvpx builds.
+%if FORMAT_ELF
+    %undef PREFIX
+%elif WIN64
+    %undef PREFIX
 %else
+    %define PREFIX
+%endif
+
+%ifdef PREFIX
     %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
 %endif
 
 ; In some instances macho32 tables get misaligned when using .rodata.
@@ -94,14 +113,6 @@
     %endif
 %endmacro
 
-%macro SECTION_TEXT 0-1 16
-    %ifidn __OUTPUT_FORMAT__,aout
-        SECTION .text
-    %else
-        SECTION .text align=%1
-    %endif
-%endmacro
-
 ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
 ; from original code is added in for 64bit.
 %ifidn __OUTPUT_FORMAT__,elf32
@@ -119,7 +130,7 @@
 %if ABI_IS_32BIT
     %if CONFIG_PIC=1
         %ifidn __OUTPUT_FORMAT__,elf32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %define WRT_PLT wrt ..plt
             %macro GET_GOT 1
                 extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +149,7 @@
                 %define RESTORE_GOT pop %1
             %endmacro
         %elifidn __OUTPUT_FORMAT__,macho32
-            %define GET_GOT_SAVE_ARG 1
+            %define GET_GOT_DEFINED 1
             %macro GET_GOT 1
                 push %1
                 call %%get_got
@@ -149,6 +160,8 @@
                 %undef RESTORE_GOT
                 %define RESTORE_GOT pop %1
             %endmacro
+        %else
+            %define GET_GOT_DEFINED 0
         %endif
     %endif
 
@@ -186,8 +199,16 @@
 %ifdef PIC
     default rel
 %endif
+
+%ifndef GET_GOT_DEFINED
+    %define GET_GOT_DEFINED 0
+%endif
 ; Done with PIC macros
 
+%ifdef __NASM_VER__
+    %use smartalign
+%endif
+
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
@@ -235,6 +256,7 @@
     %define r%1w %2w
     %define r%1b %2b
     %define r%1h %2h
+    %define %2q %2
     %if %0 == 2
         %define r%1m  %2d
         %define r%1mp %2
@@ -259,9 +281,9 @@
     %define e%1h %3
     %define r%1b %2
     %define e%1b %2
-%if ARCH_X86_64 == 0
-    %define r%1  e%1
-%endif
+    %if ARCH_X86_64 == 0
+        %define r%1 e%1
+    %endif
 %endmacro
 
 DECLARE_REG_SIZE ax, al, ah
@@ -371,7 +393,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 
 %macro ASSERT 1
     %if (%1) == 0
-        %error assert failed
+        %error assertion ``%1'' failed
     %endif
 %endmacro
 
@@ -462,8 +484,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
                 %assign regs_used (regs_used + 1)
-            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
-                %warning "Stack pointer will overwrite register argument"
+            %endif
+            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+                ; Ensure that we don't clobber any registers containing arguments
+                %assign regs_used 5 + UNIX64 * 3
             %endif
         %endif
     %endif
@@ -577,9 +601,9 @@ DECLARE_REG 14, R15, 120
 %macro RET 0
     WIN64_RESTORE_XMM_INTERNAL rsp
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
@@ -616,17 +640,17 @@ DECLARE_REG 14, R15, 72
 %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
-    mov rsp, rstkm
-%else
-    add rsp, stack_size_padded
-%endif
-%endif
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
     POP_IF_USED 14, 13, 12, 11, 10, 9
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
@@ -672,29 +696,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
 
 %macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
-    mov rsp, rstkm
-%else
-    add rsp, stack_size_padded
-%endif
-%endif
+    %if stack_size_padded > 0
+        %if required_stack_alignment > STACK_ALIGNMENT
+            mov rsp, rstkm
+        %else
+            add rsp, stack_size_padded
+        %endif
+    %endif
     POP_IF_USED 6, 5, 4, 3
-%if mmsize == 32
-    vzeroupper
-%endif
+    %if mmsize == 32
+        vzeroupper
+    %endif
     AUTO_REP_RET
 %endmacro
 
 %endif ;======================================================================
 
 %if WIN64 == 0
-%macro WIN64_SPILL_XMM 1
-%endmacro
-%macro WIN64_RESTORE_XMM 1
-%endmacro
-%macro WIN64_PUSH_XMM 0
-%endmacro
+    %macro WIN64_SPILL_XMM 1
+    %endmacro
+    %macro WIN64_RESTORE_XMM 1
+    %endmacro
+    %macro WIN64_PUSH_XMM 0
+    %endmacro
 %endif
 
 ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
@@ -707,24 +731,26 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
     %else
         rep ret
     %endif
+    annotate_function_size
 %endmacro
 
 %define last_branch_adr $$
 %macro AUTO_REP_RET 0
-    %ifndef cpuflags
-        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
-    %elif notcpuflag(ssse3)
-        times ((last_branch_adr-$)>>31)+1 rep
+    %if notcpuflag(ssse3)
+        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
     %endif
     ret
+    annotate_function_size
 %endmacro
 
 %macro BRANCH_INSTR 0-*
     %rep %0
         %macro %1 1-2 %1
             %2 %1
-            %%branch_instr:
-            %xdefine last_branch_adr %%branch_instr
+            %if notcpuflag(ssse3)
+                %%branch_instr equ $
+                %xdefine last_branch_adr %%branch_instr
+            %endif
         %endmacro
         %rotate 1
     %endrep
@@ -739,6 +765,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %elif %2
         jmp %1
     %endif
+    annotate_function_size
 %endmacro
 
 ;=============================================================================
@@ -760,6 +787,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     cglobal_internal 0, %1 %+ SUFFIX, %2
 %endmacro
 %macro cglobal_internal 2-3+
+    annotate_function_size
     %if %1
         %xdefine %%FUNCTION_PREFIX private_prefix
         ; libvpx explicitly sets visibility in shared object builds. Avoid
@@ -780,17 +808,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
-    %ifidn __OUTPUT_FORMAT__,elf32
-        global %2:function %%VISIBILITY
-    %elifidn __OUTPUT_FORMAT__,elf64
+    %xdefine current_function_section __SECT__
+    %if FORMAT_ELF
         global %2:function %%VISIBILITY
-    %elifidn __OUTPUT_FORMAT__,macho32
-        %ifdef __NASM_VER__
-            global %2
-        %else
-            global %2:private_extern
-        %endif
-    %elifidn __OUTPUT_FORMAT__,macho64
+    %elif FORMAT_MACHO
         %ifdef __NASM_VER__
             global %2
         %else
@@ -820,16 +841,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
 
 ; like cextern, but without the prefix
 %macro cextern_naked 1
-    %xdefine %1 mangle(%1)
+    %ifdef PREFIX
+        %xdefine %1 mangle(%1)
+    %endif
     CAT_XDEFINE cglobaled_, %1, 1
     extern %1
 %endmacro
 
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
-    %ifidn __OUTPUT_FORMAT__,elf32
-        global %1:data hidden
-    %elifidn __OUTPUT_FORMAT__,elf64
+    %if FORMAT_ELF
         global %1:data hidden
     %else
         global %1
@@ -837,14 +858,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %1: %2
 %endmacro
 
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf32
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%elifidn __OUTPUT_FORMAT__,elf64
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
 %endif
 
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+    %ifdef __YASM_VER__
+        %ifdef current_function
+            %if FORMAT_ELF
+                current_function_section
+                %%ecf equ $
+                size current_function %%ecf - current_function
+                __SECT__
+            %endif
+        %endif
+    %endif
+%endmacro
+
 ; cpuflags
 
 %assign cpuflags_mmx      (1<<0)
@@ -873,12 +909,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
 %assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
 %assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1
 
-%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
-
-%ifdef __NASM_VER__
-    %use smartalign
-%endif
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
 
 ; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
@@ -915,12 +948,18 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
         %endif
     %endif
 
-    %ifdef __NASM_VER__
-        ALIGNMODE k7
-    %elif ARCH_X86_64 || cpuflag(sse2)
-        CPU amdnop
+    %if ARCH_X86_64 || cpuflag(sse2)
+        %ifdef __NASM_VER__
+            ALIGNMODE k8
+        %else
+            CPU amdnop
+        %endif
     %else
-        CPU basicnop
+        %ifdef __NASM_VER__
+            ALIGNMODE nop
+        %else
+            CPU basicnop
+        %endif
     %endif
 %endmacro
 
@@ -949,14 +988,14 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %define movnta movntq
     %assign %%i 0
     %rep 8
-    CAT_XDEFINE m, %%i, mm %+ %%i
-    CAT_XDEFINE nnmm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, mm %+ %%i
+        CAT_XDEFINE nnmm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     %rep 8
-    CAT_UNDEF m, %%i
-    CAT_UNDEF nnmm, %%i
-    %assign %%i %%i+1
+        CAT_UNDEF m, %%i
+        CAT_UNDEF nnmm, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -967,7 +1006,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %define mmsize 16
     %define num_mmregs 8
     %if ARCH_X86_64
-    %define num_mmregs 16
+        %define num_mmregs 16
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -975,9 +1014,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
-    CAT_XDEFINE m, %%i, xmm %+ %%i
-    CAT_XDEFINE nnxmm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, xmm %+ %%i
+        CAT_XDEFINE nnxmm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -988,7 +1027,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %define mmsize 32
     %define num_mmregs 8
     %if ARCH_X86_64
-    %define num_mmregs 16
+        %define num_mmregs 16
     %endif
     %define mova movdqa
     %define movu movdqu
@@ -996,9 +1035,9 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
-    CAT_XDEFINE m, %%i, ymm %+ %%i
-    CAT_XDEFINE nnymm, %%i, %%i
-    %assign %%i %%i+1
+        CAT_XDEFINE m, %%i, ymm %+ %%i
+        CAT_XDEFINE nnymm, %%i, %%i
+        %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
 %endmacro
@@ -1022,7 +1061,7 @@ INIT_XMM
 %assign i 0
 %rep 16
     DECLARE_MMCAST i
-%assign i i+1
+    %assign i i+1
 %endrep
 
 ; I often want to use macros that permute their arguments. e.g. there's no
@@ -1040,23 +1079,23 @@ INIT_XMM
 ; doesn't cost any cycles.
 
 %macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
-    %xdefine %%tmp%2 m%2
-    %rotate 2
-%endrep
-%rep %0/2
-    %xdefine m%1 %%tmp%2
-    CAT_XDEFINE nn, m%1, %1
-    %rotate 2
-%endrep
+    %rep %0/2
+        %xdefine %%tmp%2 m%2
+        %rotate 2
+    %endrep
+    %rep %0/2
+        %xdefine m%1 %%tmp%2
+        CAT_XDEFINE nn, m%1, %1
+        %rotate 2
+    %endrep
 %endmacro
 
 %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
-%ifnum %1 ; SWAP 0, 1, ...
-    SWAP_INTERNAL_NUM %1, %2
-%else ; SWAP m0, m1, ...
-    SWAP_INTERNAL_NAME %1, %2
-%endif
+    %ifnum %1 ; SWAP 0, 1, ...
+        SWAP_INTERNAL_NUM %1, %2
+    %else ; SWAP m0, m1, ...
+        SWAP_INTERNAL_NAME %1, %2
+    %endif
 %endmacro
 
 %macro SWAP_INTERNAL_NUM 2-*
@@ -1066,7 +1105,7 @@ INIT_XMM
         %xdefine m%2 %%tmp
         CAT_XDEFINE nn, m%1, %1
         CAT_XDEFINE nn, m%2, %2
-    %rotate 1
+        %rotate 1
     %endrep
 %endmacro
 
@@ -1074,7 +1113,7 @@ INIT_XMM
     %xdefine %%args nn %+ %1
     %rep %0-1
         %xdefine %%args %%args, nn %+ %2
-    %rotate 1
+        %rotate 1
     %endrep
     SWAP_INTERNAL_NUM %%args
 %endmacro
@@ -1091,7 +1130,7 @@ INIT_XMM
     %assign %%i 0
     %rep num_mmregs
         CAT_XDEFINE %%f, %%i, m %+ %%i
-    %assign %%i %%i+1
+        %assign %%i %%i+1
     %endrep
 %endmacro
 
@@ -1101,20 +1140,20 @@ INIT_XMM
         %rep num_mmregs
             CAT_XDEFINE m, %%i, %1_m %+ %%i
             CAT_XDEFINE nn, m %+ %%i, %%i
-        %assign %%i %%i+1
+            %assign %%i %%i+1
         %endrep
     %endif
 %endmacro
 
 ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
 %macro call 1
-    call_internal %1, %1 %+ SUFFIX
+    call_internal %1 %+ SUFFIX, %1
 %endmacro
 %macro call_internal 2
-    %xdefine %%i %1
-    %ifndef cglobaled_%1
-        %ifdef cglobaled_%2
-            %xdefine %%i %2
+    %xdefine %%i %2
+    %ifndef cglobaled_%2
+        %ifdef cglobaled_%1
+            %xdefine %%i %1
         %endif
     %endif
     call %%i
@@ -1157,7 +1196,7 @@ INIT_XMM
     %endif
     CAT_XDEFINE sizeofxmm, i, 16
     CAT_XDEFINE sizeofymm, i, 32
-%assign i i+1
+    %assign i i+1
 %endrep
 %undef i
 
@@ -1534,7 +1573,7 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
     %else
         CAT_XDEFINE q, j, i
     %endif
-%assign i i+1
+    %assign i i+1
 %endrep
 %undef i
 %undef j
@@ -1557,55 +1596,54 @@ FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
 FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
 FMA_INSTR pmadcswd, pmaddwd, paddd
 
-; convert FMA4 to FMA3 if possible
-%macro FMA4_INSTR 4
-    %macro %1 4-8 %1, %2, %3, %4
-        %if cpuflag(fma4)
-            v%5 %1, %2, %3, %4
-        %elifidn %1, %2
-            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
-        %elifidn %1, %3
-            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
-        %elifidn %1, %4
-            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
-        %else
-            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
-        %endif
-    %endmacro
-%endmacro
-
-FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
-FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
-FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
-FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
-
-FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
-FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
-FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
-FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
-
-FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
-FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
-FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
-FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
-
-FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
-FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
-FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
-FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
-
-FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
-FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
-FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
-FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
-
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
-%if ARCH_X86_64 == 0
-%macro vpbroadcastq 2
-%if sizeof%1 == 16
-    movddup %1, %2
-%else
-    vbroadcastsd %1, %2
-%endif
-%endmacro
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+    %push fma4_instr
+    %xdefine %$prefix %1
+    %rep %0 - 1
+        %macro %$prefix%2 4-6 %$prefix, %2
+            %if notcpuflag(fma3) && notcpuflag(fma4)
+                %error use of ``%5%6'' fma instruction in cpuname function: current_function
+            %elif cpuflag(fma4)
+                v%5%6 %1, %2, %3, %4
+            %elifidn %1, %2
+                ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+                %ifid %3
+                    v%{5}213%6 %2, %3, %4
+                %else
+                    v%{5}132%6 %2, %4, %3
+                %endif
+            %elifidn %1, %3
+                v%{5}213%6 %3, %2, %4
+            %elifidn %1, %4
+                v%{5}231%6 %4, %2, %3
+            %else
+                %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+            %endif
+        %endmacro
+        %rotate 1
+    %endrep
+    %pop
+%endmacro
+
+FMA4_INSTR fmadd,    pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub,    pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd,   pd, ps, sd, ss
+FMA4_INSTR fnmsub,   pd, ps, sd, ss
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__
+    %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+        %macro vpbroadcastq 2
+            %if sizeof%1 == 16
+                movddup %1, %2
+            %else
+                vbroadcastsd %1, %2
+            %endif
+        %endmacro
+    %endif
 %endif
diff --git a/tools/gen_authors.sh b/tools/gen_authors.sh
index e1246f0..4cfd81e 100755
--- a/tools/gen_authors.sh
+++ b/tools/gen_authors.sh
@@ -6,7 +6,7 @@ cat <<EOF
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 
-$(git log --pretty=format:"%aN <%aE>" | sort | uniq)
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
diff --git a/tools_common.c b/tools_common.c
index 20b259c..17c0d44 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -16,11 +16,11 @@
 
 #include "./tools_common.h"
 
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
 
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -133,10 +133,6 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
 #if CONFIG_ENCODERS
 
 static const VpxInterface vpx_encoders[] = {
-#if CONFIG_VP10_ENCODER
-  {"vp10", VP10_FOURCC, &vpx_codec_vp10_cx},
-#endif
-
 #if CONFIG_VP8_ENCODER
   {"vp8", VP8_FOURCC, &vpx_codec_vp8_cx},
 #endif
@@ -178,10 +174,6 @@ static const VpxInterface vpx_decoders[] = {
 #if CONFIG_VP9_DECODER
   {"vp9", VP9_FOURCC, &vpx_codec_vp9_dx},
 #endif
-
-#if CONFIG_VP10_DECODER
-  {"vp10", VP10_FOURCC, &vpx_codec_vp10_dx},
-#endif
 };
 
 int get_vpx_decoder_count(void) {
diff --git a/tools_common.h b/tools_common.h
index 98347b6..310b569 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -62,7 +62,6 @@
 
 #define VP8_FOURCC 0x30385056
 #define VP9_FOURCC 0x30395056
-#define VP10_FOURCC 0x303a5056
 
 enum VideoFileType {
   FILE_TYPE_RAW,
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index 9824a31..bb6ea76 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -21,114 +21,6 @@ static const uint8_t bifilter4_coeff[8][2] = {
     { 16, 112}
 };
 
-void vp8_bilinear_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
-    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
-    uint8x16_t q1u8, q2u8;
-    uint16x8_t q1u16, q2u16;
-    uint16x8_t q7u16, q8u16, q9u16;
-    uint64x2_t q4u64, q5u64;
-    uint64x1_t d12u64;
-    uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
-
-    if (xoffset == 0) {  // skip_1stpass_filter
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
-        src_ptr += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
-        src_ptr += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
-        src_ptr += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-    } else {
-        d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
-        d6u8 = vld1_u8(src_ptr);
-
-        q1u8 = vcombine_u8(d2u8, d3u8);
-        q2u8 = vcombine_u8(d4u8, d5u8);
-
-        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
-
-        q4u64  = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
-        q5u64  = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
-        d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
-
-        d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q1u8)));
-        d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
-                           vreinterpret_u32_u8(vget_high_u8(q2u8)));
-        d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q4u64)));
-        d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
-                           vreinterpret_u32_u64(vget_high_u64(q5u64)));
-
-        q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-        q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-        q9u16 = vmull_u8(d6u8, d0u8);
-
-        q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
-        q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
-        q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
-
-        d28u8 = vqrshrn_n_u16(q7u16, 7);
-        d29u8 = vqrshrn_n_u16(q8u16, 7);
-        d30u8 = vqrshrn_n_u16(q9u16, 7);
-    }
-
-    // secondpass_filter
-    if (yoffset == 0) {  // skip_2ndpass_filter
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
-    } else {
-        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
-        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
-
-        q1u16 = vmull_u8(d28u8, d0u8);
-        q2u16 = vmull_u8(d29u8, d0u8);
-
-        d26u8 = vext_u8(d28u8, d29u8, 4);
-        d27u8 = vext_u8(d29u8, d30u8, 4);
-
-        q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
-        q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
-
-        d2u8 = vqrshrn_n_u16(q1u16, 7);
-        d3u8 = vqrshrn_n_u16(q2u16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    }
-    return;
-}
-
 void vp8_bilinear_predict8x4_neon(
         unsigned char *src_ptr,
         int src_pixels_per_line,
diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c
index 4c2efc9..49d8d22 100644
--- a/vp8/common/arm/neon/sixtappredict_neon.c
+++ b/vp8/common/arm/neon/sixtappredict_neon.c
@@ -22,383 +22,6 @@ static const int8_t vp8_sub_pel_filters[8][8] = {
     {0, -1,   12, 123,  -6, 0, 0, 0},
 };
 
-void vp8_sixtap_predict4x4_neon(
-        unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        unsigned char *dst_ptr,
-        int dst_pitch) {
-    unsigned char *src;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8;
-    uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
-    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
-    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
-    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
-    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
-    uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8;
-    uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64;
-    uint32x2x2_t d0u32x2, d1u32x2;
-
-    if (xoffset == 0) {  // secondpass_filter4x4_only
-        uint32x2_t d27u32 = vdup_n_u32(0);
-        uint32x2_t d28u32 = vdup_n_u32(0);
-        uint32x2_t d29u32 = vdup_n_u32(0);
-        uint32x2_t d30u32 = vdup_n_u32(0);
-        uint32x2_t d31u32 = vdup_n_u32(0);
-
-        // load second_pass filter
-        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-        d0s8 = vdup_lane_s8(dtmps8, 0);
-        d1s8 = vdup_lane_s8(dtmps8, 1);
-        d2s8 = vdup_lane_s8(dtmps8, 2);
-        d3s8 = vdup_lane_s8(dtmps8, 3);
-        d4s8 = vdup_lane_s8(dtmps8, 4);
-        d5s8 = vdup_lane_s8(dtmps8, 5);
-        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-        // load src data
-        src = src_ptr - src_pixels_per_line * 2;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0);
-        src += src_pixels_per_line;
-        d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0);
-        src += src_pixels_per_line;
-        d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0);
-        src += src_pixels_per_line;
-        d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0);
-        src += src_pixels_per_line;
-        d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1);
-        src += src_pixels_per_line;
-        d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0);
-
-        d27u8 = vreinterpret_u8_u32(d27u32);
-        d28u8 = vreinterpret_u8_u32(d28u32);
-        d29u8 = vreinterpret_u8_u32(d29u32);
-        d30u8 = vreinterpret_u8_u32(d30u32);
-        d31u8 = vreinterpret_u8_u32(d31u32);
-
-        d23u8 = vext_u8(d27u8, d28u8, 4);
-        d24u8 = vext_u8(d28u8, d29u8, 4);
-        d25u8 = vext_u8(d29u8, d30u8, 4);
-        d26u8 = vext_u8(d30u8, d31u8, 4);
-
-        q3u16 = vmull_u8(d27u8, d0u8);
-        q4u16 = vmull_u8(d28u8, d0u8);
-        q5u16 = vmull_u8(d25u8, d5u8);
-        q6u16 = vmull_u8(d26u8, d5u8);
-
-        q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-        q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-        q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-        q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-        q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-        q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-        q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-        q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-        q3s16 = vreinterpretq_s16_u16(q3u16);
-        q4s16 = vreinterpretq_s16_u16(q4u16);
-        q5s16 = vreinterpretq_s16_u16(q5u16);
-        q6s16 = vreinterpretq_s16_u16(q6u16);
-
-        q5s16 = vqaddq_s16(q5s16, q3s16);
-        q6s16 = vqaddq_s16(q6s16, q4s16);
-
-        d3u8 = vqrshrun_n_s16(q5s16, 7);
-        d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-        return;
-    }
-
-    // load first_pass filter
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    // First pass: output_height lines x output_width columns (9x4)
-
-    if (yoffset == 0)  // firstpass_filter4x4_only
-        src = src_ptr - 2;
-    else
-        src = src_ptr - 2 - (src_pixels_per_line * 2);
-
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-
-    // keep original src data in q4 q6
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-
-    d27u8 = vqrshrun_n_s16(q7s16, 7);
-    d28u8 = vqrshrun_n_s16(q8s16, 7);
-
-    if (yoffset == 0) {  // firstpass_filter4x4_only
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
-        dst_ptr += dst_pitch;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
-        return;
-    }
-
-    // First Pass on rest 5-line data
-    q3u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q4u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q5u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q6u8 = vld1q_u8(src);
-    src += src_pixels_per_line;
-    q11u8 = vld1q_u8(src);
-
-    d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
-    d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
-    d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
-    d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
-
-    // vswp here
-    q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8));
-    q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8));
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8),  // d18 d19
-                       vreinterpret_u32_u8(d19u8));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8),  // d20 d21
-                       vreinterpret_u32_u8(d21u8));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5);
-    q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8);
-    q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8);
-    q12u16 = vmull_u8(d31u8, d5u8);
-
-    q4u64 = vreinterpretq_u64_u8(q3u8);
-    q6u64 = vreinterpretq_u64_u8(q5u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)),  // d6 d7
-                       vreinterpret_u32_u8(vget_high_u8(q3u8)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)),  // d10 d11
-                       vreinterpret_u32_u8(vget_high_u8(q5u8)));
-    q9u64 = vshrq_n_u64(q4u64, 8);
-    q10u64 = vshrq_n_u64(q6u64, 8);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
-    q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 32);
-    q5u64 = vshrq_n_u64(q6u64, 32);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    q9u64 = vshrq_n_u64(q4u64, 16);
-    q10u64 = vshrq_n_u64(q6u64, 16);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4);
-    q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8);
-    q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8);
-    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)),   // d18 d19
-                       vreinterpret_u32_u64(vget_high_u64(q9u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)),  // d20 d211
-                       vreinterpret_u32_u64(vget_high_u64(q10u64)));
-    q3u64 = vshrq_n_u64(q4u64, 24);
-    q5u64 = vshrq_n_u64(q6u64, 24);
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2);
-    q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8);
-    q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8);
-    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
-
-    d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)),  // d6 d7
-                       vreinterpret_u32_u64(vget_high_u64(q3u64)));
-    d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),  // d10 d11
-                       vreinterpret_u32_u64(vget_high_u64(q5u64)));
-    d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3);
-    q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8);
-    q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8);
-    q11u16 = vmull_u8(d31u8, d3u8);
-
-    q7s16 = vreinterpretq_s16_u16(q7u16);
-    q8s16 = vreinterpretq_s16_u16(q8u16);
-    q9s16 = vreinterpretq_s16_u16(q9u16);
-    q10s16 = vreinterpretq_s16_u16(q10u16);
-    q11s16 = vreinterpretq_s16_u16(q11u16);
-    q12s16 = vreinterpretq_s16_u16(q12u16);
-    q7s16 = vqaddq_s16(q7s16, q9s16);
-    q8s16 = vqaddq_s16(q8s16, q10s16);
-    q12s16 = vqaddq_s16(q12s16, q11s16);
-
-    d29u8 = vqrshrun_n_s16(q7s16, 7);
-    d30u8 = vqrshrun_n_s16(q8s16, 7);
-    d31u8 = vqrshrun_n_s16(q12s16, 7);
-
-    // Second pass: 4x4
-    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
-    d0s8 = vdup_lane_s8(dtmps8, 0);
-    d1s8 = vdup_lane_s8(dtmps8, 1);
-    d2s8 = vdup_lane_s8(dtmps8, 2);
-    d3s8 = vdup_lane_s8(dtmps8, 3);
-    d4s8 = vdup_lane_s8(dtmps8, 4);
-    d5s8 = vdup_lane_s8(dtmps8, 5);
-    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
-    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
-    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
-    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
-    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
-    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
-
-    d23u8 = vext_u8(d27u8, d28u8, 4);
-    d24u8 = vext_u8(d28u8, d29u8, 4);
-    d25u8 = vext_u8(d29u8, d30u8, 4);
-    d26u8 = vext_u8(d30u8, d31u8, 4);
-
-    q3u16 = vmull_u8(d27u8, d0u8);
-    q4u16 = vmull_u8(d28u8, d0u8);
-    q5u16 = vmull_u8(d25u8, d5u8);
-    q6u16 = vmull_u8(d26u8, d5u8);
-
-    q3u16 = vmlsl_u8(q3u16, d29u8, d4u8);
-    q4u16 = vmlsl_u8(q4u16, d30u8, d4u8);
-    q5u16 = vmlsl_u8(q5u16, d23u8, d1u8);
-    q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
-
-    q3u16 = vmlal_u8(q3u16, d28u8, d2u8);
-    q4u16 = vmlal_u8(q4u16, d29u8, d2u8);
-    q5u16 = vmlal_u8(q5u16, d24u8, d3u8);
-    q6u16 = vmlal_u8(q6u16, d25u8, d3u8);
-
-    q3s16 = vreinterpretq_s16_u16(q3u16);
-    q4s16 = vreinterpretq_s16_u16(q4u16);
-    q5s16 = vreinterpretq_s16_u16(q5u16);
-    q6s16 = vreinterpretq_s16_u16(q6u16);
-
-    q5s16 = vqaddq_s16(q5s16, q3s16);
-    q6s16 = vqaddq_s16(q6s16, q4s16);
-
-    d3u8 = vqrshrun_n_s16(q5s16, 7);
-    d4u8 = vqrshrun_n_s16(q6s16, 7);
-
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0);
-    dst_ptr += dst_pitch;
-    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1);
-    return;
-}
-
 void vp8_sixtap_predict8x4_neon(
         unsigned char *src_ptr,
         int src_pixels_per_line,
diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index 155847c..472a7b5 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h
@@ -104,7 +104,7 @@ vp8_prob *vp8_mv_ref_probs(
 extern const unsigned char vp8_mbsplit_offset[4][16];
 
 
-static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
+static INLINE uint32_t left_block_mv(const MODE_INFO *cur_mb, int b)
 {
     if (!(b & 3))
     {
@@ -119,7 +119,8 @@ static INLINE int left_block_mv(const MODE_INFO *cur_mb, int b)
     return (cur_mb->bmi + b - 1)->mv.as_int;
 }
 
-static INLINE int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+static INLINE uint32_t above_block_mv(const MODE_INFO *cur_mb, int b,
+                                      int mi_stride)
 {
     if (!(b >> 2))
     {
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 28dc262..6d5f302 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -94,6 +94,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_MULTITHREAD
     ctx->processor_core_count = get_cpu_count();
+#else
+    (void)ctx;
 #endif /* CONFIG_MULTITHREAD */
 
 #if ARCH_ARM
diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c
index c88f302..23dcde2 100644
--- a/vp8/common/mips/msa/postproc_msa.c
+++ b/vp8/common/mips/msa/postproc_msa.c
@@ -10,6 +10,7 @@
 
 #include <stdlib.h>
 #include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/mips/msa/vp8_macros_msa.h"
 
 static const int16_t vp8_rv_msa[] =
@@ -798,54 +799,3 @@ void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
         }
     }
 }
-
-void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16],
-                             uint32_t width, uint32_t height,
-                             int32_t pitch)
-{
-    uint32_t i, j;
-
-    for (i = 0; i < height / 2; ++i)
-    {
-        uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-        int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff));
-        uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-        int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff));
-        for (j = width / 16; j--;)
-        {
-            v16i8 temp00_s, temp01_s;
-            v16u8 temp00, temp01, black_clamp, white_clamp;
-            v16u8 pos0, ref0, pos1, ref1;
-            v16i8 const127 = __msa_ldi_b(127);
-
-            pos0 = LD_UB(pos0_ptr);
-            ref0 = LD_UB(ref0_ptr);
-            pos1 = LD_UB(pos1_ptr);
-            ref1 = LD_UB(ref1_ptr);
-            black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-            white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-            temp00 = (pos0 < black_clamp);
-            pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-            temp01 = (pos1 < black_clamp);
-            pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp00 = (v16u8)(temp00_s < pos0);
-            pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-            temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-            temp01 = (temp01_s < pos1);
-            pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-            XORI_B2_128_UB(pos0, pos1);
-            pos0 += ref0;
-            ST_UB(pos0, pos0_ptr);
-            pos1 += ref1;
-            ST_UB(pos1, pos1_ptr);
-            pos0_ptr += 16;
-            pos1_ptr += 16;
-            ref0_ptr += 16;
-            ref1_ptr += 16;
-        }
-    }
-}
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 322b613..6baf00f 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "vpx_dsp_rtcd.h"
 #include "vp8_rtcd.h"
 #include "vpx_scale_rtcd.h"
 #include "vpx_scale/yv12config.h"
@@ -490,54 +491,6 @@ static void fillrd(struct postproc_state *state, int q, int a)
     state->last_noise = a;
 }
 
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_c
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int Width, unsigned int Height, int Pitch)
-{
-    unsigned int i, j;
-    (void)bothclamp;
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = (char *)(noise + (rand() & 0xff));
-
-        for (j = 0; j < Width; j++)
-        {
-            if (Pos[j] < blackclamp[0])
-                Pos[j] = blackclamp[0];
-
-            if (Pos[j] > 255 + whiteclamp[0])
-                Pos[j] = 255 + whiteclamp[0];
-
-            Pos[j] += Ref[j];
-        }
-    }
-}
-
 /* Blend the macro block with a solid colored square.  Leave the
  * edges unblended to give distinction to macro blocks in areas
  * filled with the same color block.
@@ -828,7 +781,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
             fillrd(&oci->postproc_state, 63 - q, noise_level);
         }
 
-        vp8_plane_add_noise
+        vpx_plane_add_noise
         (oci->post_proc_buffer.y_buffer,
          oci->postproc_state.noise,
          oci->postproc_state.blackclamp,
diff --git a/vp8/common/reconintra4x4.h b/vp8/common/reconintra4x4.h
index 869841e..5dc5d13 100644
--- a/vp8/common/reconintra4x4.h
+++ b/vp8/common/reconintra4x4.h
@@ -17,8 +17,8 @@
 extern "C" {
 #endif
 
-static void intra_prediction_down_copy(MACROBLOCKD *xd,
-                                       unsigned char *above_right_src)
+static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd,
+                                              unsigned char *above_right_src)
 {
     int dst_stride = xd->dst.y_stride;
     unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 6799c27..856ede1 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,10 +167,6 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
     add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
     specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/;
 
-    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
-    specialize qw/vp8_plane_add_noise mmx sse2 msa/;
-    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
-
     add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
     # no asm yet
 
@@ -209,7 +205,6 @@ $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=817
 specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
@@ -227,7 +222,6 @@ specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
 $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
 specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
 
diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index 01c82db..183b49b 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h
@@ -12,6 +12,8 @@
 #ifndef VP8_COMMON_THREADING_H_
 #define VP8_COMMON_THREADING_H_
 
+#include "./vpx_config.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -19,17 +21,15 @@ extern "C" {
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
 /* Thread management macros */
-#ifdef _WIN32
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 /* Win32 */
 #include <process.h>
 #include <windows.h>
-#define THREAD_FUNCTION DWORD WINAPI
+#define THREAD_FUNCTION unsigned int __stdcall
 #define THREAD_FUNCTION_RETURN DWORD
 #define THREAD_SPECIFIC_INDEX DWORD
 #define pthread_t HANDLE
 #define pthread_attr_t DWORD
-#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
-#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
 #define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
 #define thread_sleep(nms) Sleep(nms)
 #define pthread_cancel(thread) terminate_thread(thread,0)
@@ -44,14 +44,11 @@ extern "C" {
 #include <os2.h>
 
 #include <stdlib.h>
-#define THREAD_FUNCTION void
-#define THREAD_FUNCTION_RETURN void
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
 #define THREAD_SPECIFIC_INDEX PULONG
 #define pthread_t TID
 #define pthread_attr_t ULONG
-#define pthread_create(thhandle,attr,thfunc,tharg) \
-    ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1))
-#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0))
 #define pthread_detach(thread) 0
 #define thread_sleep(nms) DosSleep(nms)
 #define pthread_cancel(thread) DosKillThread(thread)
@@ -81,8 +78,8 @@ extern "C" {
 #define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
 #endif
 
-/* Syncrhronization macros: Win32 and Pthreads */
-#ifdef _WIN32
+/* Synchronization macros: Win32 and Pthreads */
+#if defined(_WIN32) && !HAVE_PTHREAD_H
 #define sem_t HANDLE
 #define pause(voidpara) __asm PAUSE
 #define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
@@ -185,6 +182,47 @@ static inline int sem_destroy(sem_t * sem)
 #define x86_pause_hint()
 #endif
 
+#include "vpx_util/vpx_thread.h"
+
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+    const int kMaxTryLocks = 4000;
+    int locked = 0;
+    int i;
+
+    for (i = 0; i < kMaxTryLocks; ++i) {
+        if (!pthread_mutex_trylock(mutex)) {
+            locked = 1;
+            break;
+        }
+    }
+
+    if (!locked)
+        pthread_mutex_lock(mutex);
+}
+
+static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) {
+    int ret;
+    mutex_lock(mutex);
+    ret = *p;
+    pthread_mutex_unlock(mutex);
+    return ret;
+}
+
+static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col,
+                             const int *last_row_current_mb_col,
+                             const int nsync) {
+    while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) {
+        x86_pause_hint();
+        thread_sleep(0);
+    }
+}
+
+static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) {
+    mutex_lock(mutex);
+    *p = v;
+    pthread_mutex_unlock(mutex);
+}
+
 #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
 #ifdef __cplusplus
diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c
index 8b55dff..756ad48 100644
--- a/vp8/common/vp8_loopfilter.c
+++ b/vp8/common/vp8_loopfilter.c
@@ -141,8 +141,8 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
             else  /* Delta Value */
             {
                 lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
-                lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
             }
+            lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
         }
 
         if (!mbd->mode_ref_lf_delta_enabled)
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index a2b1632..1a89e7e 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -241,68 +241,6 @@ sym(vp8_mbpost_proc_down_mmx):
 %undef flimit2
 
 
-;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_mmx) PRIVATE
-sym(vp8_plane_add_noise_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movq        mm1,[rsi+rax]         ; get the source
-
-            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     mm1, [rdx+32] ;bothclamp
-            psubusb     mm1, [rdx+16] ;whiteclamp
-
-            movq        mm2,[rdi+rax]         ; get the noise for this line
-            paddb       mm1,mm2              ; add it in
-            movq        [rsi+rax],mm1         ; store the result
-
-            add         rax,8                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 Blur:
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index fed4ee5..de17afa 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -655,68 +655,6 @@ sym(vp8_mbpost_proc_across_ip_xmm):
 %undef flimit4
 
 
-;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int Width, unsigned int Height, int Pitch)
-global sym(vp8_plane_add_noise_wmt) PRIVATE
-sym(vp8_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 four8s:
diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 8a7e332..5cdd2a2 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c
@@ -44,7 +44,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
     int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
     size_t bytes_left = br->user_buffer_end - bufptr;
     size_t bits_left = bytes_left * CHAR_BIT;
-    int x = (int)(shift + CHAR_BIT - bits_left);
+    int x = shift + CHAR_BIT - (int)bits_left;
     int loop_end = 0;
     unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
 
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index cc9eaaf..1b1bbf8 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -83,7 +83,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
     }
 
     {
-        register unsigned int shift = vp8_norm[range];
+        register int shift = vp8_norm[range];
         range <<= shift;
         value <<= shift;
         count -= shift;
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index f0d7603..51acdbb 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -73,10 +73,9 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
 
         /* Delta Value */
         else
-        {
             QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
-            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
-        }
+
+        QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */
     }
     else
         QIndex = pc->base_qindex;
@@ -145,8 +144,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
              */
             pbi->frame_corrupt_residual = 1;
             memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-            vp8_conceal_corrupt_mb(xd);
-
 
             corruption_detected = 1;
 
@@ -626,8 +623,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
                      */
                     vp8_interpolate_motion(xd,
                                            mb_row, mb_col,
-                                           pc->mb_rows, pc->mb_cols,
-                                           pc->mode_info_stride);
+                                           pc->mb_rows, pc->mb_cols);
                 }
             }
 #endif
@@ -987,7 +983,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
     VP8_COMMON *const pc = &pbi->common;
     MACROBLOCKD *const xd  = &pbi->mb;
     const unsigned char *data = pbi->fragments.ptrs[0];
-    const unsigned char *data_end =  data + pbi->fragments.sizes[0];
+    const unsigned int data_sz = pbi->fragments.sizes[0];
+    const unsigned char *data_end = data + data_sz;
     ptrdiff_t first_partition_length_in_bytes;
 
     int i, j, k, l;
@@ -1023,7 +1020,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         const unsigned char *clear = data;
         if (pbi->decrypt_cb)
         {
-            int n = (int)VPXMIN(sizeof(clear_buffer), data_end - data);
+            int n = (int)VPXMIN(sizeof(clear_buffer), data_sz);
             pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
             clear = clear_buffer;
         }
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 0b846a0..a73813f 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -194,7 +194,7 @@ void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
         return;
     }
 
-    if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+    if (new_row <= -32 || new_col <= -32)
     {
         /* outside the frame */
         return;
@@ -558,8 +558,7 @@ static void interpolate_mvs(MACROBLOCKD *mb,
 
 void vp8_interpolate_motion(MACROBLOCKD *mb,
                         int mb_row, int mb_col,
-                        int mb_rows, int mb_cols,
-                        int mi_stride)
+                        int mb_rows, int mb_cols)
 {
     /* Find relevant neighboring blocks */
     EC_BLOCK neighbors[NUM_NEIGHBORS];
@@ -585,13 +584,3 @@ void vp8_interpolate_motion(MACROBLOCKD *mb,
     mb->mode_info_context->mbmi.partitioning = 3;
     mb->mode_info_context->mbmi.segment_id = 0;
 }
-
-void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
-{
-    /* This macroblock has corrupt residual, use the motion compensated
-       image (predictor) for concealment */
-
-    /* The build predictor functions now output directly into the dst buffer,
-     * so the copies are no longer necessary */
-
-}
diff --git a/vp8/decoder/error_concealment.h b/vp8/decoder/error_concealment.h
index 9a1e024..b6b4972 100644
--- a/vp8/decoder/error_concealment.h
+++ b/vp8/decoder/error_concealment.h
@@ -34,13 +34,7 @@ void vp8_estimate_missing_mvs(VP8D_COMP *pbi);
  * (mb_row, mb_col). */
 void vp8_interpolate_motion(MACROBLOCKD *mb,
                             int mb_row, int mb_col,
-                            int mb_rows, int mb_cols,
-                            int mi_stride);
-
-/* Conceal a macroblock with corrupt residual.
- * Copies the prediction signal to the reconstructed image.
- */
-void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
+                            int mb_rows, int mb_cols);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index aa2cc57..313fe01 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h
@@ -81,7 +81,7 @@ typedef struct VP8D_COMP
 #if CONFIG_MULTITHREAD
     /* variable for threading */
 
-    volatile int b_multithreaded_rd;
+    int b_multithreaded_rd;
     int max_threads;
     int current_mb_col_main;
     unsigned int decoding_thread_count;
@@ -90,6 +90,8 @@ typedef struct VP8D_COMP
     int mt_baseline_filter_level[MAX_MB_SEGMENTS];
     int sync_range;
     int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
+    pthread_mutex_t *pmutex;
+    pthread_mutex_t mt_mutex;                /* mutex for b_multithreaded_rd */
 
     unsigned char **mt_yabove_row;           /* mb_rows x width */
     unsigned char **mt_uabove_row;
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 7c7184c..3c1b838 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -52,9 +52,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
         mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
         mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
 
-        mbd->mode_info_context = pc->mi   + pc->mode_info_stride * (i + 1);
-        mbd->mode_info_stride  = pc->mode_info_stride;
-
         mbd->frame_type = pc->frame_type;
         mbd->pre = xd->pre;
         mbd->dst = xd->dst;
@@ -139,8 +136,6 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
              */
             pbi->frame_corrupt_residual = 1;
             memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-            vp8_conceal_corrupt_mb(xd);
-
 
             corruption_detected = 1;
 
@@ -298,8 +293,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
 
 static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
 {
-    volatile const int *last_row_current_mb_col;
-    volatile int *current_mb_col;
+    const int *last_row_current_mb_col;
+    int *current_mb_col;
     int mb_row;
     VP8_COMMON *pc = &pbi->common;
     const int nsync = pbi->sync_range;
@@ -337,6 +332,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
 
     xd->up_available = (start_mb_row != 0);
 
+    xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row;
+    xd->mode_info_stride = pc->mode_info_stride;
+
     for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
     {
        int recon_yoffset, recon_uvoffset;
@@ -405,17 +403,15 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
                                  xd->dst.uv_stride);
        }
 
-       for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
-       {
-           *current_mb_col = mb_col - 1;
+       for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) {
+           if (((mb_col - 1) % nsync) == 0) {
+               pthread_mutex_t *mutex = &pbi->pmutex[mb_row];
+               protected_write(mutex, current_mb_col, mb_col - 1);
+           }
 
-           if ((mb_col & (nsync - 1)) == 0)
-           {
-               while (mb_col > (*last_row_current_mb_col - nsync))
-               {
-                   x86_pause_hint();
-                   thread_sleep(0);
-               }
+           if (mb_row && !(mb_col & (nsync - 1))) {
+               pthread_mutex_t *mutex = &pbi->pmutex[mb_row-1];
+               sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
            }
 
            /* Distance of MB to the various image edges.
@@ -449,8 +445,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
                     */
                    vp8_interpolate_motion(xd,
                                           mb_row, mb_col,
-                                          pc->mb_rows, pc->mb_cols,
-                                          pc->mode_info_stride);
+                                          pc->mb_rows, pc->mb_cols);
                }
            }
     #endif
@@ -604,7 +599,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
                              xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
        /* last MB of row is ready just after extension is done */
-       *current_mb_col = mb_col + nsync;
+       protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync);
 
        ++xd->mode_info_context;      /* skip prediction column */
        xd->up_available = 1;
@@ -629,12 +624,12 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 
     while (1)
     {
-        if (pbi->b_multithreaded_rd == 0)
+        if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0)
             break;
 
         if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
         {
-            if (pbi->b_multithreaded_rd == 0)
+            if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0)
                 break;
             else
             {
@@ -657,6 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 
     pbi->b_multithreaded_rd = 0;
     pbi->allocated_decoding_thread_count = 0;
+    pthread_mutex_init(&pbi->mt_mutex, NULL);
 
     /* limit decoding threads to the max number of token partitions */
     core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
@@ -699,8 +695,17 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
     int i;
 
-    if (pbi->b_multithreaded_rd)
+    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
     {
+        /* De-allocate mutex */
+        if (pbi->pmutex != NULL) {
+            for (i = 0; i < mb_rows; i++) {
+                pthread_mutex_destroy(&pbi->pmutex[i]);
+            }
+            vpx_free(pbi->pmutex);
+            pbi->pmutex = NULL;
+        }
+
             vpx_free(pbi->mt_current_mb_col);
             pbi->mt_current_mb_col = NULL ;
 
@@ -781,7 +786,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
     int i;
     int uv_width;
 
-    if (pbi->b_multithreaded_rd)
+    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
     {
         vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
 
@@ -796,6 +801,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 
         uv_width = width >>1;
 
+        /* Allocate mutex */
+        CHECK_MEM_ERROR(pbi->pmutex, vpx_malloc(sizeof(*pbi->pmutex) *
+                                                pc->mb_rows));
+        if (pbi->pmutex) {
+            for (i = 0; i < pc->mb_rows; i++) {
+                pthread_mutex_init(&pbi->pmutex[i], NULL);
+            }
+        }
+
         /* Allocate an int for each mb row. */
         CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
 
@@ -831,11 +845,11 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 {
     /* shutdown MB Decoding thread; */
-    if (pbi->b_multithreaded_rd)
+    if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd))
     {
         int i;
 
-        pbi->b_multithreaded_rd = 0;
+        protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0);
 
         /* allow all threads to exit */
         for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
@@ -863,6 +877,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
             vpx_free(pbi->de_thread_data);
             pbi->de_thread_data = NULL;
     }
+    pthread_mutex_destroy(&pbi->mt_mutex);
 }
 
 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index f3d91b5..3196422 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -163,7 +163,7 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
 {
     const TOKENEXTRA *stop = p + xcount;
     unsigned int split;
-    unsigned int shift;
+    int shift;
     int count = w->count;
     unsigned int range = w->range;
     unsigned int lowvalue = w->lowvalue;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 7c012a8..e66a2db 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -65,7 +65,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
     int count = br->count;
     unsigned int range = br->range;
     unsigned int lowvalue = br->lowvalue;
-    register unsigned int shift;
+    register int shift;
 
 #ifdef VP8_ENTROPY_STATS
 #if defined(SECTIONBITS_OUTPUT)
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index d197f8f..26ce120 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,7 +23,7 @@ static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
  */
 static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
-static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 80;
 
 /*
  * The filter function was modified to reduce the computational complexity.
@@ -440,6 +440,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
            denoiser->yv12_last_source.frame_size);
 
     denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
+    if (!denoiser->denoise_state)
+    {
+        vp8_denoiser_free(denoiser);
+        return 1;
+    }
     memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
     vp8_denoiser_set_parameters(denoiser, mode);
     denoiser->nmse_source_diff = 0;
@@ -492,7 +497,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              loop_filter_info_n *lfi_n,
                              int mb_row,
                              int mb_col,
-                             int block_index)
+                             int block_index,
+                             int consec_zero_last)
 
 {
     int mv_row;
@@ -523,7 +529,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         // Bias on zero motion vector sse.
         const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
         zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
-        sse_diff = zero_mv_sse - best_sse;
+        sse_diff = (int)zero_mv_sse - (int)best_sse;
 
         saved_mbmi = *mbmi;
 
@@ -566,59 +572,69 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
             best_sse = zero_mv_sse;
         }
 
-        saved_pre = filter_xd->pre;
-        saved_dst = filter_xd->dst;
-
-        /* Compensate the running average. */
-        filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
-        filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
-        filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
-        /* Write the compensated running average to the destination buffer. */
-        filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
-        filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
-        filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
-
-        if (!x->skip)
-        {
-            vp8_build_inter_predictors_mb(filter_xd);
-        }
-        else
-        {
-            vp8_build_inter16x16_predictors_mb(filter_xd,
-                                               filter_xd->dst.y_buffer,
-                                               filter_xd->dst.u_buffer,
-                                               filter_xd->dst.v_buffer,
-                                               filter_xd->dst.y_stride,
-                                               filter_xd->dst.uv_stride);
+        mv_row = x->best_sse_mv.as_mv.row;
+        mv_col = x->best_sse_mv.as_mv.col;
+        motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
+        motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+            NOISE_MOTION_THRESHOLD;
+
+        if (motion_magnitude2 <
+          denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+          x->increase_denoising = 1;
+
+        sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+        if (x->increase_denoising)
+          sse_thresh =
+              denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+        if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+          decision = COPY_BLOCK;
+
+        // If block is considered skin, don't denoise if the block
+        // (1) is selected as non-zero motion for current frame, or
+        // (2) has not been selected as ZERO_LAST mode at least x past frames
+        // in a row.
+        // TODO(marpan): Parameter "x" should be varied with framerate.
+        // In particualar, should be reduced for layers (base layer/LAST).
+        if (x->is_skin && (consec_zero_last < 2 || motion_magnitude2 > 0))
+          decision = COPY_BLOCK;
+
+        if (decision == FILTER_BLOCK) {
+          saved_pre = filter_xd->pre;
+          saved_dst = filter_xd->dst;
+
+          /* Compensate the running average. */
+          filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
+          filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
+          filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
+          /* Write the compensated running average to the destination buffer. */
+          filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
+          filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
+          filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
+
+          if (!x->skip)
+          {
+              vp8_build_inter_predictors_mb(filter_xd);
+          }
+          else
+          {
+              vp8_build_inter16x16_predictors_mb(filter_xd,
+                                                 filter_xd->dst.y_buffer,
+                                                 filter_xd->dst.u_buffer,
+                                                 filter_xd->dst.v_buffer,
+                                                 filter_xd->dst.y_stride,
+                                                 filter_xd->dst.uv_stride);
+          }
+          filter_xd->pre = saved_pre;
+          filter_xd->dst = saved_dst;
+          *mbmi = saved_mbmi;
         }
-        filter_xd->pre = saved_pre;
-        filter_xd->dst = saved_dst;
-        *mbmi = saved_mbmi;
-
-    }
-
-    mv_row = x->best_sse_mv.as_mv.row;
-    mv_col = x->best_sse_mv.as_mv.col;
-    motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
-    motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
-        NOISE_MOTION_THRESHOLD;
-
-    // If block is considered to be skin area, lower the motion threshold.
-    // In current version set threshold = 1, so only denoise very low
-    // (i.e., zero) mv on skin.
-    if (x->is_skin)
-        motion_threshold = 1;
-
-    if (motion_magnitude2 <
-        denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
-      x->increase_denoising = 1;
-
-    sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
-    if (x->increase_denoising)
-      sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
-
-    if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+    } else {
+      // zero_frame should always be 1 for real-time mode, as the
+      // ZEROMV mode is always checked, so we should never go into this branch.
+      // If case ZEROMV is not checked, then we will force no denoise (COPY).
       decision = COPY_BLOCK;
+    }
 
     if (decision == FILTER_BLOCK)
     {
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 9a379a6..8c126c1 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600)  // ~(16 * 16 * 1.5)
+#define SUM_DIFF_THRESHOLD 512
+#define SUM_DIFF_THRESHOLD_HIGH 600
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 #define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
@@ -108,7 +108,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              loop_filter_info_n *lfi_n,
                              int mb_row,
                              int mb_col,
-                             int block_index);
+                             int block_index,
+                             int consec_zero_last);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index b0aaa2f..9b05cd1 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -386,8 +386,8 @@ void encode_mb_row(VP8_COMP *cpi,
 #if CONFIG_MULTITHREAD
     const int nsync = cpi->mt_sync_range;
     const int rightmost_col = cm->mb_cols + nsync;
-    volatile const int *last_row_current_mb_col;
-    volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+    const int *last_row_current_mb_col;
+    int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
     if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
         last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
@@ -461,17 +461,15 @@ void encode_mb_row(VP8_COMP *cpi,
         vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
 #if CONFIG_MULTITHREAD
-        if (cpi->b_multi_threaded != 0)
-        {
-            *current_mb_col = mb_col - 1; /* set previous MB done */
+        if (cpi->b_multi_threaded != 0) {
+            if (((mb_col - 1) % nsync) == 0) {
+                pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+                protected_write(mutex, current_mb_col, mb_col - 1);
+            }
 
-            if ((mb_col & (nsync - 1)) == 0)
-            {
-                while (mb_col > (*last_row_current_mb_col - nsync))
-                {
-                    x86_pause_hint();
-                    thread_sleep(0);
-                }
+            if (mb_row && !(mb_col & (nsync - 1))) {
+                pthread_mutex_t *mutex = &cpi->pmutex[mb_row-1];
+                sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
             }
         }
 #endif
@@ -616,7 +614,7 @@ void encode_mb_row(VP8_COMP *cpi,
 
 #if CONFIG_MULTITHREAD
     if (cpi->b_multi_threaded != 0)
-        *current_mb_col = rightmost_col;
+        protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col);
 #endif
 
     /* this is to account for the border */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 4e234cc..2a0c298 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -26,12 +26,13 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data)
 
     while (1)
     {
-        if (cpi->b_multi_threaded == 0)
+        if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
             break;
 
         if (sem_wait(&cpi->h_event_start_lpf) == 0)
         {
-            if (cpi->b_multi_threaded == 0) /* we're shutting down */
+            /* we're shutting down */
+            if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
                 break;
 
             vp8_loopfilter_frame(cpi, cm);
@@ -53,7 +54,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
 
     while (1)
     {
-        if (cpi->b_multi_threaded == 0)
+        if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
             break;
 
         if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
@@ -72,9 +73,14 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
             int *segment_counts = mbri->segment_counts;
             int *totalrate = &mbri->totalrate;
 
-            if (cpi->b_multi_threaded == 0) /* we're shutting down */
+            /* we're shutting down */
+            if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0)
                 break;
 
+            xd->mode_info_context = cm->mi + cm->mode_info_stride *
+                (ithread + 1);
+            xd->mode_info_stride = cm->mode_info_stride;
+
             for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
             {
 
@@ -85,8 +91,8 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                 int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                 int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
                 int map_index = (mb_row * cm->mb_cols);
-                volatile const int *last_row_current_mb_col;
-                volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+                const int *last_row_current_mb_col;
+                int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
 
 #if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
                 vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
@@ -113,15 +119,14 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                 /* for each macroblock col in image */
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                 {
-                    *current_mb_col = mb_col - 1;
+                    if (((mb_col - 1) % nsync) == 0) {
+                        pthread_mutex_t *mutex = &cpi->pmutex[mb_row];
+                        protected_write(mutex, current_mb_col, mb_col - 1);
+                    }
 
-                    if ((mb_col & (nsync - 1)) == 0)
-                    {
-                        while (mb_col > (*last_row_current_mb_col - nsync))
-                        {
-                            x86_pause_hint();
-                            thread_sleep(0);
-                        }
+                    if (mb_row && !(mb_col & (nsync - 1))) {
+                      pthread_mutex_t *mutex = &cpi->pmutex[mb_row-1];
+                      sync_read(mutex, mb_col, last_row_current_mb_col, nsync);
                     }
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -296,7 +301,8 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                                     xd->dst.u_buffer + 8,
                                     xd->dst.v_buffer + 8);
 
-                *current_mb_col = mb_col + nsync;
+                protected_write(&cpi->pmutex[mb_row], current_mb_col,
+                                mb_col + nsync);
 
                 /* this is to account for the border */
                 xd->mode_info_context++;
@@ -473,9 +479,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
 
         mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
 
-        mbd->mode_info_context = cm->mi   + x->e_mbd.mode_info_stride * (i + 1);
-        mbd->mode_info_stride  = cm->mode_info_stride;
-
         mbd->frame_type = cm->frame_type;
 
         mb->src = * cpi->Source;
@@ -515,7 +518,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
     cpi->b_multi_threaded = 0;
     cpi->encoding_thread_count = 0;
-    cpi->b_lpf_running = 0;
+
+    pthread_mutex_init(&cpi->mt_mutex, NULL);
 
     if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
     {
@@ -580,7 +584,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
         if(rc)
         {
             /* shutdown other threads */
-            cpi->b_multi_threaded = 0;
+            protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
             for(--ithread; ithread >= 0; ithread--)
             {
                 pthread_join(cpi->h_encoding_thread[ithread], 0);
@@ -594,6 +598,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
             vpx_free(cpi->mb_row_ei);
             vpx_free(cpi->en_thread_data);
 
+            pthread_mutex_destroy(&cpi->mt_mutex);
+
             return -1;
         }
 
@@ -611,7 +617,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
             if(rc)
             {
                 /* shutdown other threads */
-                cpi->b_multi_threaded = 0;
+                protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
                 for(--ithread; ithread >= 0; ithread--)
                 {
                     sem_post(&cpi->h_event_start_encoding[ithread]);
@@ -628,6 +634,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
                 vpx_free(cpi->mb_row_ei);
                 vpx_free(cpi->en_thread_data);
 
+                pthread_mutex_destroy(&cpi->mt_mutex);
+
                 return -2;
             }
         }
@@ -637,10 +645,10 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
 
 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 {
-    if (cpi->b_multi_threaded)
+    if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded))
     {
         /* shutdown other threads */
-        cpi->b_multi_threaded = 0;
+        protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0);
         {
             int i;
 
@@ -666,5 +674,6 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
         vpx_free(cpi->mb_row_ei);
         vpx_free(cpi->en_thread_data);
     }
+    pthread_mutex_destroy(&cpi->mt_mutex);
 }
 #endif
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 4c2acc7..c526a3e 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -18,6 +18,7 @@
 #include "onyx_int.h"
 #include "vpx_dsp/variance.h"
 #include "encodeintra.h"
+#include "vp8/common/common.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
 #include "mcomp.h"
@@ -2417,7 +2418,7 @@ void vp8_second_pass(VP8_COMP *cpi)
     int tmp_q;
     int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
 
-    FIRSTPASS_STATS this_frame = {0};
+    FIRSTPASS_STATS this_frame;
     FIRSTPASS_STATS this_frame_copy;
 
     double this_frame_intra_error;
@@ -2425,6 +2426,8 @@ void vp8_second_pass(VP8_COMP *cpi)
 
     int overhead_bits;
 
+    vp8_zero(this_frame);
+
     if (!cpi->twopass.stats_in)
     {
         return ;
@@ -2808,7 +2811,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
              * static scene.
              */
             if ( detect_transition_to_still( cpi, i,
-                                             (cpi->key_frame_frequency-i),
+                                             ((int)(cpi->key_frame_frequency) -
+                                              (int)i),
                                              loop_decay_rate,
                                              decay_accumulator ) )
             {
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index ce2ce08..6623385 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -181,6 +181,7 @@ vp8_lookahead_pop(struct lookahead_ctx *ctx,
 {
     struct lookahead_entry* buf = NULL;
 
+    assert(ctx != NULL);
     if(ctx->sz && (drain || ctx->sz == ctx->max_sz - 1))
     {
         buf = pop(ctx, &ctx->read_idx);
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 768c764..e20c1ea 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1591,7 +1591,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    // TODO(johannkoenig): check if this alignment is necessary.
     DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
     unsigned int sad_array[3];
 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index df5bcf6..d5a0fff 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -477,6 +477,18 @@ static void dealloc_compressor_data(VP8_COMP *cpi)
     cpi->mb.pip = 0;
 
 #if CONFIG_MULTITHREAD
+    /* De-allocate mutex */
+    if (cpi->pmutex != NULL) {
+        VP8_COMMON *const pc = &cpi->common;
+        int i;
+
+        for (i = 0; i < pc->mb_rows; i++) {
+            pthread_mutex_destroy(&cpi->pmutex[i]);
+        }
+        vpx_free(cpi->pmutex);
+        cpi->pmutex = NULL;
+    }
+
     vpx_free(cpi->mt_current_mb_col);
     cpi->mt_current_mb_col = NULL;
 #endif
@@ -1180,6 +1192,9 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
 
     int width = cm->Width;
     int height = cm->Height;
+#if CONFIG_MULTITHREAD
+    int prev_mb_rows = cm->mb_rows;
+#endif
 
     if (vp8_alloc_frame_buffers(cm, width, height))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1271,6 +1286,25 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
 
     if (cpi->oxcf.multi_threaded > 1)
     {
+        int i;
+
+        /* De-allocate and re-allocate mutex */
+        if (cpi->pmutex != NULL) {
+            for (i = 0; i < prev_mb_rows; i++) {
+                pthread_mutex_destroy(&cpi->pmutex[i]);
+            }
+            vpx_free(cpi->pmutex);
+            cpi->pmutex = NULL;
+        }
+
+        CHECK_MEM_ERROR(cpi->pmutex, vpx_malloc(sizeof(*cpi->pmutex) *
+                                                cm->mb_rows));
+        if (cpi->pmutex) {
+            for (i = 0; i < cm->mb_rows; i++) {
+                pthread_mutex_init(&cpi->pmutex[i], NULL);
+            }
+        }
+
         vpx_free(cpi->mt_current_mb_col);
         CHECK_MEM_ERROR(cpi->mt_current_mb_col,
                     vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
@@ -1284,9 +1318,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity > 0) {
       vp8_denoiser_free(&cpi->denoiser);
-      vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                            cm->mb_rows, cm->mb_cols,
-                            cpi->oxcf.noise_sensitivity);
+      if (vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                                cm->mb_rows, cm->mb_cols,
+                                cpi->oxcf.noise_sensitivity))
+          vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                             "Failed to allocate denoiser");
     }
 #endif
 }
@@ -1487,7 +1523,8 @@ static void update_layer_contexts (VP8_COMP *cpi)
 void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
 {
     VP8_COMMON *cm = &cpi->common;
-    int last_w, last_h, prev_number_of_layers;
+    int last_w, last_h;
+    unsigned int prev_number_of_layers;
 
     if (!cpi)
         return;
@@ -1495,15 +1532,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     if (!oxcf)
         return;
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if (cm->version != oxcf->Version)
     {
         cm->version = oxcf->Version;
@@ -1759,10 +1787,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
     if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
         cpi->force_next_frame_intra = 1;
 
-    if (((cm->Width + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) !=
-          cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & ~15) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
         dealloc_raw_frame_buffers(cpi);
@@ -1798,9 +1824,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
       {
         int width = (cpi->oxcf.Width + 15) & ~15;
         int height = (cpi->oxcf.Height + 15) & ~15;
-        vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                              cm->mb_rows, cm->mb_cols,
-                              cpi->oxcf.noise_sensitivity);
+        if (vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                                  cm->mb_rows, cm->mb_cols,
+                                  cpi->oxcf.noise_sensitivity))
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate denoiser");
       }
     }
 #endif
@@ -2228,6 +2256,8 @@ void vp8_remove_compressor(VP8_COMP **ptr)
             double total_encode_time = (cpi->time_receive_data +
                                             cpi->time_compress_data) / 1000.000;
             double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+            const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+            const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
             if (cpi->b_calculate_psnr)
             {
@@ -2273,12 +2303,14 @@ void vp8_remove_compressor(VP8_COMP **ptr)
                                                       cpi->summed_weights, 8.0);
 
                     fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
-                               "GLPsnrP\tVPXSSIM\t  Time(us)\n");
+                               "GLPsnrP\tVPXSSIM\t  Time(us)  Rc-Err "
+                               "Abs Err\n");
                     fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                               "%7.3f\t%8.0f\n",
+                               "%7.3f\t%8.0f %7.2f %7.2f\n",
                                dr, cpi->total / cpi->count, total_psnr,
                                cpi->totalp / cpi->count, total_psnr2,
-                               total_ssim, total_encode_time);
+                               total_ssim, total_encode_time,
+                               rate_err, fabs(rate_err));
                 }
             }
 
@@ -3600,15 +3632,6 @@ static void encode_frame_to_data_rate
     /* Clear down mmx registers to allow floating point in what follows */
     vp8_clear_system_state();
 
-#if CONFIG_MULTITHREAD
-    /*  wait for the last picture loopfilter thread done */
-    if (cpi->b_lpf_running)
-    {
-        sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
-    }
-#endif
-
     if(cpi->force_next_frame_intra)
     {
         cm->frame_type = KEY_FRAME;  /* delayed intra frame */
@@ -4337,8 +4360,6 @@ static void encode_frame_to_data_rate
             vp8_setup_key_frame(cpi);
         }
 
-
-
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
         {
             if(cpi->oxcf.error_resilient_mode)
@@ -4804,7 +4825,6 @@ static void encode_frame_to_data_rate
     {
         /* start loopfilter in separate thread */
         sem_post(&cpi->h_event_start_lpf);
-        cpi->b_lpf_running = 1;
     }
     else
 #endif
@@ -4836,11 +4856,10 @@ static void encode_frame_to_data_rate
     vp8_pack_bitstream(cpi, dest, dest_end, size);
 
 #if CONFIG_MULTITHREAD
-    /* if PSNR packets are generated we have to wait for the lpf */
-    if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+    /* wait for the lpf thread done */
+    if (cpi->b_multi_threaded)
     {
         sem_wait(&cpi->h_event_end_lpf);
-        cpi->b_lpf_running = 0;
     }
 #endif
 
@@ -5201,7 +5220,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest,
         vp8_second_pass(cpi);
 
     encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
-    cpi->twopass.bits_left -= 8 * *size;
+    cpi->twopass.bits_left -= 8 * (int)(*size);
 
     if (!cpi->common.refresh_alt_ref_frame)
     {
@@ -5800,14 +5819,6 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
     {
         int ret;
 
-#if CONFIG_MULTITHREAD
-        if(cpi->b_lpf_running)
-        {
-            sem_wait(&cpi->h_event_end_lpf);
-            cpi->b_lpf_running = 0;
-        }
-#endif
-
 #if CONFIG_POSTPROC
         cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
@@ -5845,7 +5856,7 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigne
         return -1;
 
     // Check number of rows and columns match
-    if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+    if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols)
         return -1;
 
     // Range check the delta Q values and convert the external Q range values
@@ -5901,7 +5912,7 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigne
 
 int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
 {
-    if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+    if ((int)rows == cpi->common.mb_rows && (int)cols == cpi->common.mb_cols)
     {
         if (map)
         {
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 317e4b9..44fbbd4 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -371,7 +371,7 @@ typedef struct VP8_COMP
     double key_frame_rate_correction_factor;
     double gf_rate_correction_factor;
 
-    unsigned int frames_since_golden;
+    int frames_since_golden;
     /* Count down till next GF */
     int frames_till_gf_update_due;
 
@@ -530,11 +530,12 @@ typedef struct VP8_COMP
 
 #if CONFIG_MULTITHREAD
     /* multithread data */
+    pthread_mutex_t *pmutex;
+    pthread_mutex_t mt_mutex;           /* mutex for b_multi_threaded */
     int * mt_current_mb_col;
     int mt_sync_range;
     int b_multi_threaded;
     int encoding_thread_count;
-    int b_lpf_running;
 
     pthread_t *h_encoding_thread;
     pthread_t h_filter_thread;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index d0fff3f..24b332d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -36,6 +36,8 @@
 extern unsigned int cnt_pm;
 #endif
 
+#define MODEL_MODE 1
+
 extern const int vp8_ref_frame_order[MAX_MODES];
 extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
@@ -45,18 +47,22 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 // skin color classifier is defined.
 
 // Fixed-point skin color model parameters.
-static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_mean[5][2] =
+    {{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold = 1570636;                    // q18
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+    800000};  // q18
 
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr)
-{
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
   const int cb_q6 = cb << 6;
   const int cr_q6 = cr << 6;
-  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
-  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
-  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cb_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+  const int cbcr_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+  const int cr_diff_q12 =
+      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
   const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
   const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
   const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
@@ -67,6 +73,52 @@ static int evaluate_skin_color_difference(int cb, int cr)
   return skin_diff;
 }
 
+// Checks if the input yCbCr values corresponds to skin color.
+static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
+{
+  if (y < 40 || y > 220)
+  {
+    return 0;
+  }
+  else
+  {
+    if (MODEL_MODE == 0)
+    {
+      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+    }
+    else
+    {
+      int i = 0;
+      // No skin if block has been zero motion for long consecutive time.
+      if (consec_zeromv > 60)
+        return 0;
+      // Exit on grey.
+       if (cb == 128 && cr == 128)
+         return 0;
+       // Exit on very strong cb.
+       if (cb > 150 && cr < 110)
+         return 0;
+       for (; i < 5; i++) {
+         int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+         if (skin_color_diff < skin_threshold[i + 1]) {
+            if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+              return 0;
+            else if (consec_zeromv > 25 &&
+                     skin_color_diff > (skin_threshold[i + 1] >> 1))
+              return 0;
+            else
+             return 1;
+         }
+         // Exit if difference is much large than the threshold.
+         if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+           return 0;
+         }
+       }
+      return 0;
+    }
+  }
+}
+
 static int macroblock_corner_grad(unsigned char* signal, int stride,
                                   int offsetx, int offsety, int sgnx, int sgny)
 {
@@ -157,16 +209,6 @@ static int check_dot_artifact_candidate(VP8_COMP *cpi,
   return 0;
 }
 
-// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr)
-{
-  if (y < 40 || y > 220)
-  {
-    return 0;
-  }
-  return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
-}
-
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
                                 int error_per_bit,
@@ -828,8 +870,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         x->src.v_buffer[4 * x->src.uv_stride + 3] +
         x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
     x->is_skin = 0;
-    if (!cpi->oxcf.screen_content_mode)
-      x->is_skin = is_skin_color(y, cb, cr);
+    if (!cpi->oxcf.screen_content_mode) {
+      int block_index = mb_row * cpi->common.mb_cols + mb_col;
+      x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
+    }
     }
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity) {
@@ -1433,7 +1477,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                 recon_yoffset, recon_uvoffset,
                                 &cpi->common.lf_info, mb_row, mb_col,
-                                block_index);
+                                block_index,
+                                cpi->consec_zero_last_mvbias[block_index]);
 
         // Reevaluate ZEROMV after denoising: for large noise content
         // (i.e., cpi->mse_source_denoised is above threshold), do this for all
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index ab0ad15..6507ae9 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1899,7 +1899,8 @@ static int calculate_final_rd_costs(int this_rd,
                     int prob_skip_cost;
 
                     prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
-                    prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    prob_skip_cost -=
+                        (int)vp8_cost_bit(cpi->prob_skip_false, 0);
                     rd->rate2 += prob_skip_cost;
                     *other_cost += prob_skip_cost;
                 }
@@ -2530,7 +2531,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                 recon_yoffset, recon_uvoffset,
                                 &cpi->common.lf_info, mb_row, mb_col,
-                                block_index);
+                                block_index, 0);
 
         /* Reevaluate ZEROMV after denoising. */
         if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c
index ee922c9..0d101ba 100644
--- a/vp8/encoder/vp8_quantize.c
+++ b/vp8/encoder/vp8_quantize.c
@@ -227,12 +227,12 @@ static void invert_quant(int improved_quant, short *quant,
     if(improved_quant)
     {
         unsigned t;
-        int l;
+        int l, m;
         t = d;
         for(l = 0; t > 1; l++)
             t>>=1;
-        t = 1 + (1<<(16+l))/d;
-        *quant = (short)(t - (1<<16));
+        m = 1 + (1<<(16+l))/d;
+        *quant = (short)(m - (1<<16));
         *shift = l;
         /* use multiplication and constant shift by 16 */
         *shift = 1 << (16 - *shift);
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index c125ae8..22a82b7 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -22,6 +22,7 @@
 #include "vpx/vp8cx.h"
 #include "vp8/encoder/firstpass.h"
 #include "vp8/common/onyx.h"
+#include "vp8/common/common.h"
 #include <stdlib.h>
 #include <string.h>
 
@@ -760,7 +761,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
                                     unsigned long          duration,
                                     unsigned long          deadline)
 {
-    unsigned int new_qc;
+    int new_qc;
 
 #if !(CONFIG_REALTIME_ONLY)
     /* Use best quality mode if no deadline is given. */
@@ -782,10 +783,13 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
     }
 
 #else
+    (void)duration;
     new_qc = MODE_REALTIME;
 #endif
 
-    if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+    if (deadline == VPX_DL_REALTIME)
+        new_qc = MODE_REALTIME;
+    else if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
         new_qc = MODE_FIRSTPASS;
     else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
         new_qc = (new_qc == MODE_BESTQUALITY)
@@ -1116,7 +1120,8 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
 {
 
     YV12_BUFFER_CONFIG sd;
-    vp8_ppflags_t flags = {0};
+    vp8_ppflags_t flags;
+    vp8_zero(flags);
 
     if (ctx->preview_ppcfg.post_proc_flag)
     {
@@ -1162,31 +1167,6 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
         return NULL;
 }
 
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
-                                           va_list args)
-{
-    int update = va_arg(args, int);
-    vp8_update_entropy(ctx->cpi, update);
-    return VPX_CODEC_OK;
-
-}
-
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
-                                             va_list args)
-{
-    int update = va_arg(args, int);
-    vp8_update_reference(ctx->cpi, update);
-    return VPX_CODEC_OK;
-}
-
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
-                                          va_list args)
-{
-    int reference_flag = va_arg(args, int);
-    vp8_use_as_reference(ctx->cpi, reference_flag);
-    return VPX_CODEC_OK;
-}
-
 static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
                                             va_list args)
 {
@@ -1330,8 +1310,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         30,                 /* rc_resize_up_thresold */
 
         VPX_VBR,            /* rc_end_usage */
-        {0},                /* rc_twopass_stats_in */
-        {0},                /* rc_firstpass_mb_stats_in */
+        {NULL, 0},          /* rc_twopass_stats_in */
+        {NULL, 0},          /* rc_firstpass_mb_stats_in */
         256,                /* rc_target_bandwidth */
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
@@ -1359,6 +1339,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         {0},                /* ts_rate_decimator */
         0,                  /* ts_periodicity */
         {0},                /* ts_layer_id */
+        {0},                /* layer_target_bitrate */
+        0                   /* temporal_layering_mode */
     }},
 };
 
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index a12a2ad..fc9288d 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -9,6 +9,7 @@
  */
 
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "./vp8_rtcd.h"
@@ -67,10 +68,11 @@ struct vpx_codec_alg_priv
     FRAGMENT_DATA           fragments;
 };
 
-static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
+static int vp8_init_ctx(vpx_codec_ctx_t *ctx)
 {
     vpx_codec_alg_priv_t *priv =
         (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+    if (!priv) return 1;
 
     ctx->priv = (vpx_codec_priv_t *)priv;
     ctx->priv->init_flags = ctx->init_flags;
@@ -85,6 +87,8 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
         priv->cfg = *ctx->config.dec;
         ctx->config.dec = &priv->cfg;
     }
+
+    return 0;
 }
 
 static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
@@ -103,7 +107,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
      * information becomes known.
      */
     if (!ctx->priv) {
-      vp8_init_ctx(ctx);
+      if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR;
       priv = (vpx_codec_alg_priv_t *)ctx->priv;
 
       /* initialize number of fragments to zero */
@@ -151,6 +155,8 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
 {
     vpx_codec_err_t res = VPX_CODEC_OK;
 
+    assert(data != NULL);
+
     if(data + data_sz <= data)
     {
         res = VPX_CODEC_INVALID_PARAM;
@@ -516,7 +522,8 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
     {
         YV12_BUFFER_CONFIG sd;
         int64_t time_stamp = 0, time_end_stamp = 0;
-        vp8_ppflags_t flags = {0};
+        vp8_ppflags_t flags;
+        vp8_zero(flags);
 
         if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
         {
@@ -810,11 +817,12 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
     },
     { /* encoder functions */
         0,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL,
-        NULL
+        NULL,  /* vpx_codec_enc_cfg_map_t */
+        NULL,  /* vpx_codec_encode_fn_t */
+        NULL,  /* vpx_codec_get_cx_data_fn_t */
+        NULL,  /* vpx_codec_enc_config_set_fn_t */
+        NULL,  /* vpx_codec_get_global_headers_fn_t */
+        NULL,  /* vpx_codec_get_preview_frame_fn_t */
+        NULL   /* vpx_codec_enc_mr_get_mem_loc_fn_t */
     }
 };
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 24c6c54..7dd1005 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -119,6 +119,20 @@ void vp9_free_context_buffers(VP9_COMMON *cm) {
   cm->lf.lfm = NULL;
 }
 
+
+int vp9_alloc_loop_filter(VP9_COMMON *cm) {
+  vpx_free(cm->lf.lfm);
+  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
+  // stride and rows are rounded up / truncated to a multiple of 8.
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
+  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
+      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
+      sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm)
+    return 1;
+  return 0;
+}
+
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
   int new_mi_size;
 
@@ -151,15 +165,8 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 
-  vpx_free(cm->lf.lfm);
-
-  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
-  // stride and rows are rounded up / truncated to a multiple of 8.
-  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
-  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
-      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
-      sizeof(*cm->lf.lfm));
-  if (!cm->lf.lfm) goto fail;
+  if (vp9_alloc_loop_filter(cm))
+    goto fail;
 
   return 0;
 
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index c0e51a6..e53955b 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -23,6 +23,7 @@ struct BufferPool;
 
 void vp9_remove_common(struct VP9Common *cm);
 
+int vp9_alloc_loop_filter(struct VP9Common *cm);
 int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
 void vp9_init_context_buffers(struct VP9Common *cm);
 void vp9_free_context_buffers(struct VP9Common *cm);
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index 0e104ee..7bab27d 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -13,7 +13,7 @@
 PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
                                     const MODE_INFO *left_mi, int b) {
   if (b == 0 || b == 2) {
-    if (!left_mi || is_inter_block(&left_mi->mbmi))
+    if (!left_mi || is_inter_block(left_mi))
       return DC_PRED;
 
     return get_y_mode(left_mi, b + 1);
@@ -26,7 +26,7 @@ PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
 PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi,
                                      const MODE_INFO *above_mi, int b) {
   if (b == 0 || b == 1) {
-    if (!above_mi || is_inter_block(&above_mi->mbmi))
+    if (!above_mi || is_inter_block(above_mi))
       return DC_PRED;
 
     return get_y_mode(above_mi, b + 2);
@@ -40,12 +40,12 @@ void vp9_foreach_transformed_block_in_plane(
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
+  const MODE_INFO* mi = xd->mi[0];
   // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
   // transform size varies per plane, look it up in a common way.
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd)
-                                : mbmi->tx_size;
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd)
+                                : mi->tx_size;
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 61eb591..3d26fb2 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -64,7 +64,7 @@ typedef struct {
 typedef int8_t MV_REFERENCE_FRAME;
 
 // This structure now relates to 8x8 block regions.
-typedef struct {
+typedef struct MODE_INFO {
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
@@ -82,24 +82,21 @@ typedef struct {
 
   // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
   int_mv mv[2];
-} MB_MODE_INFO;
 
-typedef struct MODE_INFO {
-  MB_MODE_INFO mbmi;
   b_mode_info bmi[4];
 } MODE_INFO;
 
 static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
-  return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode
-                                      : mi->mbmi.mode;
+  return mi->sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode
+                                 : mi->mode;
 }
 
-static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
-  return mbmi->ref_frame[0] > INTRA_FRAME;
+static INLINE int is_inter_block(const MODE_INFO *mi) {
+  return mi->ref_frame[0] > INTRA_FRAME;
 }
 
-static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
-  return mbmi->ref_frame[1] > INTRA_FRAME;
+static INLINE int has_second_ref(const MODE_INFO *mi) {
+  return mi->ref_frame[1] > INTRA_FRAME;
 }
 
 PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
@@ -160,11 +157,9 @@ typedef struct macroblockd {
   MODE_INFO **mi;
   MODE_INFO *left_mi;
   MODE_INFO *above_mi;
-  MB_MODE_INFO *left_mbmi;
-  MB_MODE_INFO *above_mbmi;
 
-  int up_available;
-  int left_available;
+  unsigned int max_blocks_wide;
+  unsigned int max_blocks_high;
 
   const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
 
@@ -212,19 +207,19 @@ extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
 
 static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
                                   const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MODE_INFO *const mi = xd->mi[0];
 
-  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi))
     return DCT_DCT;
 
-  return intra_mode_to_tx_type_lookup[mbmi->mode];
+  return intra_mode_to_tx_type_lookup[mi->mode];
 }
 
 static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                       const MACROBLOCKD *xd, int ib) {
   const MODE_INFO *const mi = xd->mi[0];
 
-  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mi))
     return DCT_DCT;
 
   return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)];
@@ -242,9 +237,9 @@ static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
   }
 }
 
-static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
+static INLINE TX_SIZE get_uv_tx_size(const MODE_INFO *mi,
                                      const struct macroblockd_plane *pd) {
-  return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
+  return get_uv_tx_size_impl(mi->tx_size, mi->sb_type, pd->subsampling_x,
                              pd->subsampling_y);
 }
 
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 76e7cd4..908fa80 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -67,7 +67,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
 
 #define VP9_FRAME_MARKER 0x2
 
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index a6dae6a..3409d04 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -159,3 +159,18 @@ const struct {
   {0,  8 },  // 64X32 - {0b0000, 0b1000}
   {0,  0 },  // 64X64 - {0b0000, 0b0000}
 };
+
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+const uint8_t need_top_left[INTRA_MODES] = {
+    0,  // DC_PRED
+    0,  // V_PRED
+    0,  // H_PRED
+    0,  // D45_PRED
+    1,  // D135_PRED
+    1,  // D117_PRED
+    1,  // D153_PRED
+    0,  // D207_PRED
+    0,  // D63_PRED
+    1,  // TM_PRED
+};
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 95a1179..0ae24da 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -33,6 +33,9 @@ extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
 extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES];
 extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
 extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+extern const uint8_t need_top_left[INTRA_MODES];
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 3d80103..d9c1fd9 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -35,7 +35,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
     fprintf(file, "%c ", prefix);
     for (mi_col = 0; mi_col < cols; mi_col++) {
       fprintf(file, "%2d ",
-              *((int*) ((char *) (&mi[0]->mbmi) +
+              *((int*) ((char *) (mi[0]) +
                                   member_offset)));
       mi++;
     }
@@ -53,18 +53,18 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
   int rows = cm->mi_rows;
   int cols = cm->mi_cols;
 
-  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
-  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
-  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
-  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
-  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MODE_INFO, uv_mode));
 
   // output skip infomation.
   log_frame_info(cm, "Skips:", mvs);
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "S ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+      fprintf(mvs, "%2d ", mi[0]->skip);
       mi++;
     }
     fprintf(mvs, "\n");
@@ -78,8 +78,8 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
   for (mi_row = 0; mi_row < rows; mi_row++) {
     fprintf(mvs, "V ");
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
-                               mi[0]->mbmi.mv[0].as_mv.col);
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row,
+                               mi[0]->mv[0].as_mv.col);
       mi++;
     }
     fprintf(mvs, "\n");
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 579857b..7b490af 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -36,20 +36,6 @@ const vpx_prob vp9_cat6_prob[] = {
     254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 #if CONFIG_VP9_HIGHBITDEPTH
-const vpx_prob vp9_cat1_prob_high10[] = { 159 };
-const vpx_prob vp9_cat2_prob_high10[] = { 165, 145 };
-const vpx_prob vp9_cat3_prob_high10[] = { 173, 148, 140 };
-const vpx_prob vp9_cat4_prob_high10[] = { 176, 155, 140, 135 };
-const vpx_prob vp9_cat5_prob_high10[] = { 180, 157, 141, 134, 130 };
-const vpx_prob vp9_cat6_prob_high10[] = {
-    255, 255, 254, 254, 254, 252, 249, 243,
-    230, 196, 177, 153, 140, 133, 130, 129
-};
-const vpx_prob vp9_cat1_prob_high12[] = { 159 };
-const vpx_prob vp9_cat2_prob_high12[] = { 165, 145 };
-const vpx_prob vp9_cat3_prob_high12[] = { 173, 148, 140 };
-const vpx_prob vp9_cat4_prob_high12[] = { 176, 155, 140, 135 };
-const vpx_prob vp9_cat5_prob_high12[] = { 180, 157, 141, 134, 130 };
 const vpx_prob vp9_cat6_prob_high12[] = {
     255, 255, 255, 255, 254, 254, 254, 252, 249,
     243, 230, 196, 177, 153, 140, 133, 130, 129
@@ -403,7 +389,6 @@ const vpx_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
   {255, 241, 243, 255, 236, 255, 252, 254},
   {255, 243, 245, 255, 237, 255, 252, 254},
   {255, 246, 247, 255, 239, 255, 253, 255},
-  {255, 246, 247, 255, 239, 255, 253, 255},
 };
 
 static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,8 +728,8 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
 };
 
 static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
-  memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
-         MODEL_NODES * sizeof(vpx_prob));
+  assert(p != 0);
+  memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
 }
 
 void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 21611ed..63b3bff 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -138,7 +138,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
 
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
 
 #define UNCONSTRAINED_NODES         3
 
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 3acfe14..566ae91 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -11,9 +11,6 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymv.h"
 
-// Integer pel reference mv threshold for use of high-precision 1/8 mv
-#define COMPANDED_MVREF_THRESH 8
-
 const vpx_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
@@ -127,11 +124,6 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
   return c;
 }
 
-int vp9_use_mv_hp(const MV *ref) {
-  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
-         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 8c817bf..2f05ad4 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -27,7 +27,14 @@ struct VP9Common;
 void vp9_init_mv_probs(struct VP9Common *cm);
 
 void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_mv_hp(const MV *ref);
+
+// Integer pel reference mv threshold for use of high-precision 1/8 mv
+#define COMPANDED_MVREF_THRESH 8
+
+static INLINE int use_mv_hp(const MV *ref) {
+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
+}
 
 #define MV_UPDATE_PROB 252
 
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index d12cd76..1b42014 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -174,6 +174,9 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
   else if (eob <= 34)
     // non-zero coeff only in upper-left 8x8
     vpx_idct32x32_34_add(input, dest, stride);
+  else if (eob <= 135)
+    // non-zero coeff only in upper-left 16x16
+    vpx_idct32x32_135_add(input, dest, stride);
   else
     vpx_idct32x32_1024_add(input, dest, stride);
 }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index b8a1132..183dec4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -232,9 +232,9 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
 }
 
 static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
-                                const MB_MODE_INFO *mbmi) {
-  return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]]
-                   [mode_lf_lut[mbmi->mode]];
+                                const MODE_INFO *mi) {
+  return lfi_n->lvl[mi->segment_id][mi->ref_frame[0]]
+                   [mode_lf_lut[mi->mode]];
 }
 
 void vp9_loop_filter_init(VP9_COMMON *cm) {
@@ -298,200 +298,168 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
 
 static void filter_selectively_vert_row2(int subsampling_factor,
                                          uint8_t *s, int pitch,
-                                         unsigned int mask_16x16_l,
-                                         unsigned int mask_8x8_l,
-                                         unsigned int mask_4x4_l,
-                                         unsigned int mask_4x4_int_l,
-                                         const loop_filter_info_n *lfi_n,
+                                         unsigned int mask_16x16,
+                                         unsigned int mask_8x8,
+                                         unsigned int mask_4x4,
+                                         unsigned int mask_4x4_int,
+                                         const loop_filter_thresh *lfthr,
                                          const uint8_t *lfl) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-              mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
-
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                   lfi0->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                              lfi0->hev_thr);
+  uint8_t *ss[2];
+  ss[0] = s;
+
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
+
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                   lfis[0]->hev_thr);
         } else {
-          vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                              lfi1->lim, lfi1->hev_thr);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                              lfi->lim, lfi->hev_thr);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_8x8_0 & 1) {
-          vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                             1);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim,
+                                  lfis[0]->hev_thr, lfis[1]->mblim,
+                                  lfis[1]->lim, lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim,
+                             lfi->hev_thr);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                             lfi0->hev_thr, 1);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                  lfis[0]->lim, lfis[0]->hev_thr,
+                                  lfis[1]->mblim, lfis[1]->lim,
+                                  lfis[1]->hev_thr);
         } else {
-          vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr, 1);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim,
+                             lfi->lim, lfi->hev_thr);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_filter_selectively_vert_row2(int subsampling_factor,
                                                 uint16_t *s, int pitch,
-                                                unsigned int mask_16x16_l,
-                                                unsigned int mask_8x8_l,
-                                                unsigned int mask_4x4_l,
-                                                unsigned int mask_4x4_int_l,
-                                                const loop_filter_info_n *lfi_n,
+                                                unsigned int mask_16x16,
+                                                unsigned int mask_8x8,
+                                                unsigned int mask_4x4,
+                                                unsigned int mask_4x4_int,
+                                                const loop_filter_thresh *lfthr,
                                                 const uint8_t *lfl, int bd) {
-  const int mask_shift = subsampling_factor ? 4 : 8;
-  const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+  const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff;
   const int lfl_forward = subsampling_factor ? 4 : 8;
-
-  unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
-  unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
-  unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
-  unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
-  unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
-  unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
+  const unsigned int dual_one = 1 | (1 << lfl_forward);
   unsigned int mask;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
-       mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
-       mask; mask >>= 1) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
-
-    // TODO(yunqingwang): count in loopfilter functions should be removed.
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                          lfi0->hev_thr, bd);
-        } else if (mask_16x16_0 & 1) {
-          vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, bd);
+  uint16_t *ss[2];
+  ss[0] = s;
+
+  for (mask =
+           (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff;
+       mask; mask = (mask & ~dual_one) >> 1) {
+    if (mask & dual_one) {
+      const loop_filter_thresh *lfis[2];
+      lfis[0] = lfthr + *lfl;
+      lfis[1] = lfthr + *(lfl + lfl_forward);
+      ss[1] = ss[0] + 8 * pitch;
+
+      if (mask_16x16 & dual_one) {
+        if ((mask_16x16 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim,
+                                          lfis[0]->lim, lfis[0]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)];
+          vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim,
+                                     lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_8x8_0 & 1) {
-          vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+      if (mask_8x8 & dual_one) {
+        if ((mask_8x8 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)];
+          vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+      if (mask_4x4 & dual_one) {
+        if ((mask_4x4 & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim,
+                                    lfi->lim, lfi->hev_thr, bd);
         }
       }
 
-      if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
-        if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
-          vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_int_0 & 1) {
-          vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, 1, bd);
+      if (mask_4x4_int & dual_one) {
+        if ((mask_4x4_int & dual_one) == dual_one) {
+          vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim,
+                                         lfis[0]->lim, lfis[0]->hev_thr,
+                                         lfis[1]->mblim, lfis[1]->lim,
+                                         lfis[1]->hev_thr, bd);
         } else {
-          vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, 1, bd);
+          const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)];
+          vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, bd);
         }
       }
     }
 
-    s += 8;
+    ss[0] += 8;
     lfl += 1;
-    mask_16x16_0 >>= 1;
-    mask_8x8_0 >>= 1;
-    mask_4x4_0 >>= 1;
-    mask_4x4_int_0 >>= 1;
-    mask_16x16_1 >>= 1;
-    mask_8x8_1 >>= 1;
-    mask_4x4_1 >>= 1;
-    mask_4x4_int_1 >>= 1;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -501,30 +469,30 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
                                      unsigned int mask_8x8,
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
-                                     const loop_filter_info_n *lfi_n,
+                                     const loop_filter_thresh *lfthr,
                                      const uint8_t *lfl) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 2);
+          vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr);
           count = 2;
         } else {
-          vpx_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1);
+          vpx_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                    lfi->hev_thr);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -537,23 +505,23 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -565,22 +533,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1)
               vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                   lfi->hev_thr, 1);
+                                   lfi->hev_thr);
             else if (mask_4x4_int & 2)
               vpx_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                   lfin->lim, lfin->hev_thr, 1);
+                                   lfin->lim, lfin->hev_thr);
           }
           count = 2;
         } else {
-          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+          vpx_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
 
           if (mask_4x4_int & 1)
             vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                 lfi->hev_thr, 1);
+                                 lfi->hev_thr);
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                             lfi->hev_thr, 1);
+                             lfi->hev_thr);
       }
     }
     s += 8 * count;
@@ -598,30 +566,30 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
                                             unsigned int mask_8x8,
                                             unsigned int mask_4x4,
                                             unsigned int mask_4x4_int,
-                                            const loop_filter_info_n *lfi_n,
+                                            const loop_filter_thresh *lfthr,
                                             const uint8_t *lfl, int bd) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-
     count = 1;
     if (mask & 1) {
+      const loop_filter_thresh *lfi = lfthr + *lfl;
+
       if (mask_16x16 & 1) {
         if ((mask_16x16 & 3) == 3) {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2, bd);
+          vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, bd);
           count = 2;
         } else {
-          vpx_highbd_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1, bd);
+          vpx_highbd_lpf_horizontal_edge_8(s, pitch, lfi->mblim, lfi->lim,
+                                           lfi->hev_thr, bd);
         }
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -635,26 +603,26 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds.
-          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+          const loop_filter_thresh *lfin = lfthr + *(lfl + 1);
 
           vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
                                            lfi->hev_thr, lfin->mblim, lfin->lim,
@@ -667,25 +635,25 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
           } else {
             if (mask_4x4_int & 1) {
               vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                          lfi->lim, lfi->hev_thr, 1, bd);
+                                          lfi->lim, lfi->hev_thr, bd);
             } else if (mask_4x4_int & 2) {
               vpx_highbd_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1, bd);
+                                          lfin->lim, lfin->hev_thr, bd);
             }
           }
           count = 2;
         } else {
           vpx_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, 1, bd);
+                                      lfi->hev_thr, bd);
 
           if (mask_4x4_int & 1) {
             vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim,
-                                        lfi->lim, lfi->hev_thr, 1, bd);
+                                        lfi->lim, lfi->hev_thr, bd);
           }
         }
-      } else if (mask_4x4_int & 1) {
+      } else {
         vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, 1, bd);
+                                    lfi->hev_thr, bd);
       }
     }
     s += 8 * count;
@@ -704,16 +672,14 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch,
 // whether there were any coefficients encoded, and the loop filter strength
 // block we are currently looking at. Shift is used to position the
 // 1's we produce.
-// TODO(JBB) Need another function for different resolution color..
 static void build_masks(const loop_filter_info_n *const lfi_n,
                         const MODE_INFO *mi, const int shift_y,
                         const int shift_uv,
                         LOOP_FILTER_MASK *lfm) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const BLOCK_SIZE block_size = mbmi->sb_type;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
   const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
-  const int filter_level = get_filter_level(lfi_n, mbmi);
+  const int filter_level = get_filter_level(lfi_n, mi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
   uint64_t *const int_4x4_y = &lfm->int_4x4_y;
@@ -754,7 +720,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-  if (mbmi->skip && is_inter_block(mbmi))
+  if (mi->skip && is_inter_block(mi))
     return;
 
   // Here we are adding a mask for the transform size. The transform
@@ -788,10 +754,9 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
 static void build_y_mask(const loop_filter_info_n *const lfi_n,
                          const MODE_INFO *mi, const int shift_y,
                          LOOP_FILTER_MASK *lfm) {
-  const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const BLOCK_SIZE block_size = mbmi->sb_type;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
-  const int filter_level = get_filter_level(lfi_n, mbmi);
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
+  const int filter_level = get_filter_level(lfi_n, mi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
   uint64_t *const int_4x4_y = &lfm->int_4x4_y;
@@ -812,7 +777,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-  if (mbmi->skip && is_inter_block(mbmi))
+  if (mi->skip && is_inter_block(mi))
     return;
 
   *above_y |= (size_mask[block_size] &
@@ -941,7 +906,6 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row,
 
 // This function sets up the bit masks for the entire 64x64 region represented
 // by mi_row, mi_col.
-// TODO(JBB): This function only works for yv12.
 void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
                     MODE_INFO **mi, const int mode_info_stride,
                     LOOP_FILTER_MASK *lfm) {
@@ -977,10 +941,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
   vp9_zero(*lfm);
   assert(mip[0] != NULL);
 
-  // TODO(jimbankoski): Try moving most of the following code into decode
-  // loop and storing lfm in the mbmi structure so that we don't have to go
-  // through the recursive loop structure multiple times.
-  switch (mip[0]->mbmi.sb_type) {
+  switch (mip[0]->sb_type) {
     case BLOCK_64X64:
       build_masks(lfi_n, mip[0] , 0, 0, lfm);
       break;
@@ -1006,7 +967,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
         const int mi_32_row_offset = ((idx_32 >> 1) << 2);
         if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
           continue;
-        switch (mip[0]->mbmi.sb_type) {
+        switch (mip[0]->sb_type) {
           case BLOCK_32X32:
             build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
             break;
@@ -1036,7 +997,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
               if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
                 continue;
 
-              switch (mip[0]->mbmi.sb_type) {
+              switch (mip[0]->sb_type) {
                 case BLOCK_16X16:
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   break;
@@ -1083,8 +1044,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
       }
       break;
   }
-
-  vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1092,25 +1051,25 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const loop_filter_info_n *lfi_n,
+                                    const loop_filter_thresh *lfthr,
                                     const uint8_t *lfl) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
         vpx_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_8x8 & 1) {
-        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       } else if (mask_4x4 & 1) {
-        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+        vpx_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
       }
     }
     if (mask_4x4_int & 1)
-      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+      vpx_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -1126,13 +1085,13 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                            unsigned int mask_8x8,
                                            unsigned int mask_4x4,
                                            unsigned int mask_4x4_int,
-                                           const loop_filter_info_n *lfi_n,
+                                           const loop_filter_thresh *lfthr,
                                            const uint8_t *lfl, int bd) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi = lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -1140,15 +1099,15 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
                                    lfi->hev_thr, bd);
       } else if (mask_8x8 & 1) {
         vpx_highbd_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim,
-                                  lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       } else if (mask_4x4 & 1) {
         vpx_highbd_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                  lfi->hev_thr, bd);
       }
     }
     if (mask_4x4_int & 1)
       vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim,
-                                lfi->hev_thr, 1, bd);
+                                lfi->hev_thr, bd);
     s += 8;
     lfl += 1;
     mask_16x16 >>= 1;
@@ -1186,8 +1145,8 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
     // Determine the vertical edges that need filtering
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const MODE_INFO *mi = mi_8x8[c];
-      const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
-      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+      const BLOCK_SIZE sb_type = mi[0].sb_type;
+      const int skip_this = mi[0].skip && is_inter_block(mi);
       // left edge of current unit is block/partition edge -> no skip
       const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
           !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
@@ -1196,13 +1155,13 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
       const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
           !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
       const int skip_this_r = skip_this && !block_edge_above;
-      const TX_SIZE tx_size = get_uv_tx_size(&mi[0].mbmi, plane);
+      const TX_SIZE tx_size = get_uv_tx_size(mi, plane);
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
       // Filter level can vary per MI
       if (!(lfl[(r << 3) + (c >> ss_x)] =
-            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
+            get_filter_level(&cm->lf_info, mi)))
         continue;
 
       // Build masks based on the transform size of each block
@@ -1263,23 +1222,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                      mask_8x8_c & border_mask,
                                      mask_4x4_c & border_mask,
                                      mask_4x4_int[r],
-                                     &cm->lf_info, &lfl[r << 3],
+                                     cm->lf_info.lfthr, &lfl[r << 3],
                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_vert(dst->buf, dst->stride,
                               mask_16x16_c & border_mask,
                               mask_8x8_c & border_mask,
                               mask_4x4_c & border_mask,
                               mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << 3]);
+                              cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert(dst->buf, dst->stride,
-                            mask_16x16_c & border_mask,
-                            mask_8x8_c & border_mask,
-                            mask_4x4_c & border_mask,
-                            mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
@@ -1312,23 +1266,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
                                       mask_8x8_r,
                                       mask_4x4_r,
                                       mask_4x4_int_r,
-                                      &cm->lf_info, &lfl[r << 3],
+                                      cm->lf_info.lfthr, &lfl[r << 3],
                                       (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << 3]);
+                               cm->lf_info.lfthr, &lfl[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride,
-                             mask_16x16_r,
-                             mask_8x8_r,
-                             mask_4x4_r,
-                             mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 8 * dst->stride;
   }
@@ -1350,27 +1299,29 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
 
   // Vertical pass: do 2 rows at one time
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
-    unsigned int mask_16x16_l = mask_16x16 & 0xffff;
-    unsigned int mask_8x8_l = mask_8x8 & 0xffff;
-    unsigned int mask_4x4_l = mask_4x4 & 0xffff;
-    unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_vert_row2(
-          plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr,
+                                          &lfm->lfl_y[r << 3],
+                                          (int)cm->bit_depth);
     } else {
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_vert_row2(
-        plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     dst->buf += 16 * dst->stride;
     mask_16x16 >>= 16;
@@ -1403,19 +1354,18 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
-          (int)cm->bit_depth);
+      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                      dst->stride, mask_16x16_r, mask_8x8_r,
+                                      mask_4x4_r, mask_4x4_int & 0xff,
+                                      cm->lf_info.lfthr, &lfm->lfl_y[r << 3],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+                               mask_4x4_r, mask_4x4_int & 0xff,
+                               cm->lf_info.lfthr, &lfm->lfl_y[r << 3]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << 3]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1449,38 +1399,35 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
       lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
     }
 
-    {
-      unsigned int mask_16x16_l = mask_16x16 & 0xff;
-      unsigned int mask_8x8_l = mask_8x8 & 0xff;
-      unsigned int mask_4x4_l = mask_4x4 & 0xff;
-      unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
-
-// Disable filtering on the leftmost column.
+    // Disable filtering on the leftmost column.
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        highbd_filter_selectively_vert_row2(
-            plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1], (int)cm->bit_depth);
-      } else {
-        filter_selectively_vert_row2(
-            plane->subsampling_x, dst->buf, dst->stride,
-            mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfl_uv[r << 1]);
-      }
-#else
-      filter_selectively_vert_row2(
-          plane->subsampling_x, dst->buf, dst->stride,
-          mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfl_uv[r << 1]);
+    if (cm->use_highbitdepth) {
+      highbd_filter_selectively_vert_row2(plane->subsampling_x,
+                                          CONVERT_TO_SHORTPTR(dst->buf),
+                                          dst->stride,
+                                          (unsigned int)mask_16x16,
+                                          (unsigned int)mask_8x8,
+                                          (unsigned int)mask_4x4,
+                                          (unsigned int)mask_4x4_int,
+                                          cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                          (int)cm->bit_depth);
+    } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
+      filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride,
+                                   (unsigned int)mask_16x16,
+                                   (unsigned int)mask_8x8,
+                                   (unsigned int)mask_4x4,
+                                   (unsigned int)mask_4x4_int,
+                                   cm->lf_info.lfthr, &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    dst->buf += 16 * dst->stride;
+    mask_16x16 >>= 8;
+    mask_8x8 >>= 8;
+    mask_4x4 >>= 8;
+    mask_4x4_int >>= 8;
   }
 
   // Horizontal pass
@@ -1512,17 +1459,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
-                                      mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfl_uv[r << 1], (int)cm->bit_depth);
+                                      mask_4x4_r, mask_4x4_int_r,
+                                      cm->lf_info.lfthr, &lfl_uv[r << 1],
+                                      (int)cm->bit_depth);
     } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                               mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+                               mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr,
                                &lfl_uv[r << 1]);
+#if CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
-                             mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfl_uv[r << 1]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     dst->buf += 8 * dst->stride;
@@ -1558,7 +1504,7 @@ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm,
 
       vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      // TODO(JBB): Make setup_mask work for non 420.
+      // TODO(jimbankoski): For 444 only need to do y mask.
       vp9_adjust_mask(cm, mi_row, mi_col, lfm);
 
       vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, lfm);
@@ -1598,6 +1544,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
 }
 
 // Used by the encoder to build the loopfilter masks.
+// TODO(slavarnway): Do the encoder the same way the decoder does it and
+//                   build the masks in line as part of the encode process.
 void vp9_build_mask_frame(VP9_COMMON *cm, int frame_filter_level,
                           int partial_frame) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
@@ -1640,12 +1588,12 @@ static const uint8_t first_block_in_16x16[8][8] = {
 // This function sets up the bit masks for a block represented
 // by mi_row, mi_col in a 64x64 region.
 // TODO(SJL): This function only works for yv12.
-void vp9_build_mask(VP9_COMMON *cm, const MB_MODE_INFO *mbmi, int mi_row,
+void vp9_build_mask(VP9_COMMON *cm, const MODE_INFO *mi, int mi_row,
                     int mi_col, int bw, int bh) {
-  const BLOCK_SIZE block_size = mbmi->sb_type;
-  const TX_SIZE tx_size_y = mbmi->tx_size;
+  const BLOCK_SIZE block_size = mi->sb_type;
+  const TX_SIZE tx_size_y = mi->tx_size;
   const loop_filter_info_n *const lfi_n = &cm->lf_info;
-  const int filter_level = get_filter_level(lfi_n, mbmi);
+  const int filter_level = get_filter_level(lfi_n, mi);
   const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
   LOOP_FILTER_MASK *const lfm = get_lfm(&cm->lf, mi_row, mi_col);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
@@ -1693,7 +1641,7 @@ void vp9_build_mask(VP9_COMMON *cm, const MB_MODE_INFO *mbmi, int mi_row,
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-  if (mbmi->skip && is_inter_block(mbmi))
+  if (mi->skip && is_inter_block(mi))
     return;
 
   // Add a mask for the transform size. The transform size mask is set to
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 7f943ea..fca8830 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -69,6 +69,7 @@ typedef struct {
 
 struct loopfilter {
   int filter_level;
+  int last_filt_level;
 
   int sharpness_level;
   int last_sharpness_level;
@@ -134,7 +135,7 @@ static INLINE LOOP_FILTER_MASK *get_lfm(const struct loopfilter *lf,
   return &lf->lfm[(mi_col >> 3) + ((mi_row >> 3) * lf->lfm_stride)];
 }
 
-void vp9_build_mask(struct VP9Common *cm, const MB_MODE_INFO *mbmi, int mi_row,
+void vp9_build_mask(struct VP9Common *cm, const MODE_INFO *mi, int mi_row,
                     int mi_col, int bw, int bh);
 void vp9_adjust_mask(struct VP9Common *const cm, const int mi_row,
                      const int mi_col, LOOP_FILTER_MASK *lfm);
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 6d560f4..f526466 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -203,12 +203,12 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
 static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
   // Check the motion in current block(for inter frame),
   // or check the motion in the correlated block in last frame (for keyframe).
-  const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
-                            mi->mbmi.mv[0].as_mv.row +
-                            mi->mbmi.mv[0].as_mv.col *
-                            mi->mbmi.mv[0].as_mv.col;
+  const int mv_len_square = mi->mv[0].as_mv.row *
+                            mi->mv[0].as_mv.row +
+                            mi->mv[0].as_mv.col *
+                            mi->mv[0].as_mv.col;
   const int mv_threshold = 100;
-  return mi->mbmi.mode >= NEARESTMV &&  // Not an intra block
+  return mi->mode >= NEARESTMV &&  // Not an intra block
          cur_bs >= BLOCK_16X16 &&
          mv_len_square <= mv_threshold;
 }
@@ -220,7 +220,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
                            uint8_t *yd, uint8_t *ud, uint8_t *vd,
                            int yd_stride, int uvd_stride) {
   int mi_offset, y_offset, uv_offset;
-  const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+  const BLOCK_SIZE cur_bs = mi->sb_type;
   const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
   const int bsl = b_width_log2_lookup[bs];
   PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 77d1ff4..0eb01a5 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,20 +11,19 @@
 
 #include "vp9/common/vp9_mvref_common.h"
 
-// This function searches the neighbourhood of a given MB/SB
+// This function searches the neighborhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
-                             find_mv_refs_sync sync, void *const data,
                              uint8_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type];
   int different_ref_found = 0;
   int context_counter = 0;
-  const MV_REF *const  prev_frame_mvs = cm->use_prev_frame_mvs ?
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
       cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
   const TileInfo *const tile = &xd->tile;
 
@@ -39,15 +38,14 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
       const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
                                                    xd->mi_stride];
-      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
       // Keep counts for entropy encoding.
-      context_counter += mode_2_counter[candidate->mode];
+      context_counter += mode_2_counter[candidate_mi->mode];
       different_ref_found = 1;
 
-      if (candidate->ref_frame[0] == ref_frame)
+      if (candidate_mi->ref_frame[0] == ref_frame)
         ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
                         refmv_count, mv_ref_list, Done);
-      else if (candidate->ref_frame[1] == ref_frame)
+      else if (candidate_mi->ref_frame[1] == ref_frame)
         ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
                         refmv_count, mv_ref_list, Done);
     }
@@ -59,34 +57,19 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   for (; i < MVREF_NEIGHBOURS; ++i) {
     const POSITION *const mv_ref = &mv_ref_search[i];
     if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
-                                                    xd->mi_stride]->mbmi;
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
       different_ref_found = 1;
 
-      if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
-      else if (candidate->ref_frame[1] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, Done);
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done);
+      else if (candidate_mi->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[1], refmv_count, mv_ref_list, Done);
     }
   }
 
-  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
-  // on windows platform. The sync here is unncessary if use_perv_frame_mvs
-  // is 0. But after removing it, there will be hang in the unit test on windows
-  // due to several threads waiting for a thread's signal.
-#if defined(_WIN32) && !HAVE_PTHREAD_H
-    if (cm->frame_parallel_decode && sync != NULL) {
-      sync(data, mi_row);
-    }
-#endif
-
   // Check the last frame's mode and mv info.
   if (cm->use_prev_frame_mvs) {
-    // Synchronize here for frame parallel decode if sync function is provided.
-    if (cm->frame_parallel_decode && sync != NULL) {
-      sync(data, mi_row);
-    }
-
     if (prev_frame_mvs->ref_frame[0] == ref_frame) {
       ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
     } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
@@ -101,11 +84,11 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
       const POSITION *mv_ref = &mv_ref_search[i];
       if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
-                                              * xd->mi_stride]->mbmi;
+        const MODE_INFO *const candidate_mi =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
 
         // If the candidate is INTRA we don't want to consider its mv.
-        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+        IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias,
                                  refmv_count, mv_ref_list, Done);
       }
     }
@@ -150,20 +133,9 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list,
                       int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data,
                       uint8_t *mode_context) {
   find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
-                   mi_row, mi_col, sync, data, mode_context);
-}
-
-static void lower_mv_precision(MV *mv, int allow_hp) {
-  const int use_hp = allow_hp && vp9_use_mv_hp(mv);
-  if (!use_hp) {
-    if (mv->row & 1)
-      mv->row += (mv->row > 0 ? -1 : 1);
-    if (mv->col & 1)
-      mv->col += (mv->col > 0 ? -1 : 1);
-  }
+                   mi_row, mi_col, mode_context);
 }
 
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
@@ -190,8 +162,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col, NULL, NULL, mode_context);
+  find_mv_refs_idx(cm, xd, mi, mi->ref_frame[ref], mv_list, block,
+                   mi_row, mi_col, mode_context);
 
   near_mv->as_int = 0;
   switch (block) {
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index bd216d4..4380843 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -136,19 +136,19 @@ static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) {
 // on whether the block_size < 8x8 and we have check_sub_blocks set.
 static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
                                       int search_col, int block_idx) {
-  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+  return block_idx >= 0 && candidate->sb_type < BLOCK_8X8
           ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
               .as_mv[which_mv]
-          : candidate->mbmi.mv[which_mv];
+          : candidate->mv[which_mv];
 }
 
 
 // Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref,
                               const MV_REFERENCE_FRAME this_ref_frame,
                               const int *ref_sign_bias) {
-  int_mv mv = mbmi->mv[ref];
-  if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+  int_mv mv = mi->mv[ref];
+  if (ref_sign_bias[mi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
     mv.as_mv.row *= -1;
     mv.as_mv.col *= -1;
   }
@@ -157,7 +157,7 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
 
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector it will also
-// skip all additional processing and jump to done!
+// skip all additional processing and jump to Done!
 #define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
   do { \
     if (refmv_count) { \
@@ -207,11 +207,20 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
+  const int use_hp = allow_hp && use_mv_hp(mv);
+  if (!use_hp) {
+    if (mv->row & 1)
+      mv->row += (mv->row > 0 ? -1 : 1);
+    if (mv->col & 1)
+      mv->col += (mv->col > 0 ? -1 : 1);
+  }
+}
+
 typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
                       int_mv *mv_ref_list, int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data,
                       uint8_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index ceffded..3fd935e 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -404,25 +404,8 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
   xd->mb_to_right_edge  = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
 
   // Are edges available for intra prediction?
-  xd->up_available    = (mi_row != 0);
-  xd->left_available  = (mi_col > tile->mi_col_start);
-  if (xd->up_available) {
-    xd->above_mi = xd->mi[-xd->mi_stride];
-    // above_mi may be NULL in VP9 encoder's first pass.
-    xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
-  } else {
-    xd->above_mi = NULL;
-    xd->above_mbmi = NULL;
-  }
-
-  if (xd->left_available) {
-    xd->left_mi = xd->mi[-1];
-    // left_mi may be NULL in VP9 encoder's first pass.
-    xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
-  } else {
-    xd->left_mi = NULL;
-    xd->left_mbmi = NULL;
-  }
+  xd->above_mi = (mi_row != 0) ? xd->mi[-xd->mi_stride] : NULL;
+  xd->left_mi  = (mi_col > tile->mi_col_start) ? xd->mi[-1] : NULL;
 }
 
 static INLINE void update_partition_context(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b685d81..c04cc8f 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
@@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
   state->last_noise = a;
 }
 
-void vp9_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -726,8 +701,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
         ppstate->last_noise != noise_level) {
       fillrd(ppstate, 63 - q, noise_level);
     }
-
-    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 1f16325..8f90e70 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -17,82 +17,57 @@
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
   // Note:
   // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
-                            left_mbmi->interp_filter : SWITCHABLE_FILTERS;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
-                             above_mbmi->interp_filter : SWITCHABLE_FILTERS;
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int left_type = left_mi && is_inter_block(left_mi) ?
+                            left_mi->interp_filter : SWITCHABLE_FILTERS;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const int above_type = above_mi && is_inter_block(above_mi) ?
+                             above_mi->interp_filter : SWITCHABLE_FILTERS;
 
   if (left_type == above_type)
     return left_type;
-  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+  else if (left_type == SWITCHABLE_FILTERS)
     return above_type;
-  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+  else if (above_type == SWITCHABLE_FILTERS)
     return left_type;
   else
     return SWITCHABLE_FILTERS;
 }
 
-// The mode info data structure has a one element border above and to the
-// left of the entries corresponding to real macroblocks.
-// The prediction flags in these dummy entries are initialized to 0.
-// 0 - inter/inter, inter/--, --/inter, --/--
-// 1 - intra/inter, inter/intra
-// 2 - intra/--, --/intra
-// 3 - intra/intra
-int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-
-  if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
-    return left_intra && above_intra ? 3
-                                     : left_intra || above_intra;
-  } else if (has_above || has_left) {  // one edge available
-    return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi);
-  } else {
-    return 0;
-  }
-}
-
 int vp9_get_reference_mode_context(const VP9_COMMON *cm,
                                    const MACROBLOCKD *xd) {
   int ctx;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = !!above_mi;
+  const int has_left = !!left_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
   if (has_above && has_left) {  // both edges available
-    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+    if (!has_second_ref(above_mi) && !has_second_ref(left_mi))
       // neither edge uses comp pred (0/1)
-      ctx = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
-            (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
-    else if (!has_second_ref(above_mbmi))
+      ctx = (above_mi->ref_frame[0] == cm->comp_fixed_ref) ^
+            (left_mi->ref_frame[0] == cm->comp_fixed_ref);
+    else if (!has_second_ref(above_mi))
       // one of two edges uses comp pred (2/3)
-      ctx = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                 !is_inter_block(above_mbmi));
-    else if (!has_second_ref(left_mbmi))
+      ctx = 2 + (above_mi->ref_frame[0] == cm->comp_fixed_ref ||
+                 !is_inter_block(above_mi));
+    else if (!has_second_ref(left_mi))
       // one of two edges uses comp pred (2/3)
-      ctx = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                 !is_inter_block(left_mbmi));
+      ctx = 2 + (left_mi->ref_frame[0] == cm->comp_fixed_ref ||
+                 !is_inter_block(left_mi));
     else  // both edges use comp pred (4)
       ctx = 4;
   } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
 
-    if (!has_second_ref(edge_mbmi))
+    if (!has_second_ref(edge_mi))
       // edge does not use comp pred (0/1)
-      ctx = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
+      ctx = edge_mi->ref_frame[0] == cm->comp_fixed_ref;
     else
       // edge uses comp pred (3)
       ctx = 3;
@@ -107,39 +82,39 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm,
 int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                     const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int above_in_image = xd->up_available;
-  const int left_in_image = xd->left_available;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int above_in_image = !!above_mi;
+  const int left_in_image = !!left_mi;
 
   // Note:
   // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
   const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
   const int var_ref_idx = !fix_ref_idx;
 
   if (above_in_image && left_in_image) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
 
     if (above_intra && left_intra) {  // intra/intra (2)
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
 
-      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+      if (!has_second_ref(edge_mi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]);
       else  // comp pred (1/3)
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
+        pred_context = 1 + 2 * (edge_mi->ref_frame[var_ref_idx]
                                     != cm->comp_var_ref[1]);
     } else {  // inter/inter
-      const int l_sg = !has_second_ref(left_mbmi);
-      const int a_sg = !has_second_ref(above_mbmi);
-      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
-                                           : above_mbmi->ref_frame[var_ref_idx];
-      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
-                                           : left_mbmi->ref_frame[var_ref_idx];
+      const int l_sg = !has_second_ref(left_mi);
+      const int a_sg = !has_second_ref(above_mi);
+      const MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->ref_frame[0]
+                                           : above_mi->ref_frame[var_ref_idx];
+      const MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->ref_frame[0]
+                                           : left_mi->ref_frame[var_ref_idx];
 
       if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
         pred_context = 0;
@@ -167,16 +142,16 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
       }
     }
   } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+    const MODE_INFO *edge_mi = above_in_image ? above_mi : left_mi;
 
-    if (!is_inter_block(edge_mbmi)) {
+    if (!is_inter_block(edge_mi)) {
       pred_context = 2;
     } else {
-      if (has_second_ref(edge_mbmi))
-        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+      if (has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[var_ref_idx]
                               != cm->comp_var_ref[1]);
       else
-        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+        pred_context = 3 * (edge_mi->ref_frame[0] != cm->comp_var_ref[1]);
     }
   } else {  // no edges available (2)
     pred_context = 2;
@@ -188,34 +163,34 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
 
 int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = !!above_mi;
+  const int has_left = !!left_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
   if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
 
     if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi))
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
+      if (!has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME);
       else
-        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+        pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME ||
+                            edge_mi->ref_frame[1] == LAST_FRAME);
     } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+      const int above_has_second = has_second_ref(above_mi);
+      const int left_has_second = has_second_ref(left_mi);
+      const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1];
 
       if (above_has_second && left_has_second) {
         pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME ||
@@ -234,15 +209,15 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
       }
     }
   } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
-    if (!is_inter_block(edge_mbmi)) {  // intra
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
+    if (!is_inter_block(edge_mi)) {  // intra
       pred_context = 2;
     } else {  // inter
-      if (!has_second_ref(edge_mbmi))
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      if (!has_second_ref(edge_mi))
+        pred_context = 4 * (edge_mi->ref_frame[0] == LAST_FRAME);
       else
-        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+        pred_context = 1 + (edge_mi->ref_frame[0] == LAST_FRAME ||
+                            edge_mi->ref_frame[1] == LAST_FRAME);
     }
   } else {  // no edges available
     pred_context = 2;
@@ -254,39 +229,39 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
 
 int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
   int pred_context;
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = !!above_mi;
+  const int has_left = !!left_mi;
 
   // Note:
   // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
   if (has_above && has_left) {  // both edges available
-    const int above_intra = !is_inter_block(above_mbmi);
-    const int left_intra = !is_inter_block(left_mbmi);
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
 
     if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
-      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
-        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+      const MODE_INFO *edge_mi = above_intra ? left_mi : above_mi;
+      if (!has_second_ref(edge_mi)) {
+        if (edge_mi->ref_frame[0] == LAST_FRAME)
           pred_context = 3;
         else
-          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+          pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
       } else {
-        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                                edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+        pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                                edge_mi->ref_frame[1] == GOLDEN_FRAME);
       }
     } else {  // inter/inter
-      const int above_has_second = has_second_ref(above_mbmi);
-      const int left_has_second = has_second_ref(left_mbmi);
-      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
-      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+      const int above_has_second = has_second_ref(above_mi);
+      const int left_has_second = has_second_ref(left_mi);
+      const MV_REFERENCE_FRAME above0 = above_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mi->ref_frame[1];
 
       if (above_has_second && left_has_second) {
         if (above0 == left0 && above1 == left1)
@@ -321,16 +296,16 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
       }
     }
   } else if (has_above || has_left) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    const MODE_INFO *edge_mi = has_above ? above_mi : left_mi;
 
-    if (!is_inter_block(edge_mbmi) ||
-        (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
+    if (!is_inter_block(edge_mi) ||
+        (edge_mi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mi)))
       pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+    else if (!has_second_ref(edge_mi))
+      pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
     else
-      pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                          edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
+      pred_context = 3 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+                          edge_mi->ref_frame[1] == GOLDEN_FRAME);
   } else {  // no edges available (2)
     pred_context = 2;
   }
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 6f7af4a..f3c676e 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -42,8 +42,8 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
   const int above_sip = (above_mi != NULL) ?
-                        above_mi->mbmi.seg_id_predicted : 0;
-  const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
+                        above_mi->seg_id_predicted : 0;
+  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
 
   return above_sip + left_sip;
 }
@@ -56,8 +56,8 @@ static INLINE vpx_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
 static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
-  const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
-  const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
+  const int above_skip = (above_mi != NULL) ? above_mi->skip : 0;
+  const int left_skip = (left_mi != NULL) ? left_mi->skip : 0;
   return above_skip + left_skip;
 }
 
@@ -68,11 +68,32 @@ static INLINE vpx_prob vp9_get_skip_prob(const VP9_COMMON *cm,
 
 int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
-int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - inter/inter, inter/--, --/inter, --/--
+// 1 - intra/inter, inter/intra
+// 2 - intra/--, --/intra
+// 3 - intra/intra
+static INLINE int get_intra_inter_context(const MACROBLOCKD *xd) {
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = !!above_mi;
+  const int has_left = !!left_mi;
+
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mi);
+    const int left_intra = !is_inter_block(left_mi);
+    return left_intra && above_intra ? 3 : left_intra || above_intra;
+  } else if (has_above || has_left) {  // one edge available
+    return 2 * !is_inter_block(has_above ? above_mi : left_mi);
+  }
+  return 0;
+}
 
 static INLINE vpx_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
                                                 const MACROBLOCKD *xd) {
-  return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
+  return cm->fc->intra_inter_prob[get_intra_inter_context(xd)];
 }
 
 int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
@@ -110,15 +131,15 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
 // left of the entries corresponding to real blocks.
 // The prediction flags in these dummy entries are initialized to 0.
 static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
-  const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
-  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
-  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
-  const int has_above = xd->up_available;
-  const int has_left = xd->left_available;
-  int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
-                                                   : max_tx_size;
-  int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
-                                                : max_tx_size;
+  const int max_tx_size = max_txsize_lookup[xd->mi[0]->sb_type];
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int has_above = !!above_mi;
+  const int has_left = !!left_mi;
+  int above_ctx = (has_above && !above_mi->skip) ? (int)above_mi->tx_size
+                                                 : max_tx_size;
+  int left_ctx = (has_left && !left_mi->skip) ? (int)left_mi->tx_size
+                                              : max_tx_size;
   if (!has_left)
     left_ctx = above_ctx;
 
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index d8c14ec..84718e9 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,19 +20,6 @@
 #include "vp9/common/vp9_reconintra.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd) {
-  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
-}
-
 void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride,
                                       const MV *src_mv,
@@ -50,8 +37,9 @@ void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
 
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
-  high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                       sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+  highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4,
+                         bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -159,8 +147,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
                                    int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MODE_INFO *mi = xd->mi[0];
-  const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
+  const int is_compound = has_second_ref(mi);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
   int ref;
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -168,9 +156,9 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     struct buf_2d *const pre_buf = &pd->pre[ref];
     struct buf_2d *const dst_buf = &pd->dst;
     uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-    const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+    const MV mv = mi->sb_type < BLOCK_8X8
                ? average_split_mvs(pd, mi, ref, block)
-               : mi->mbmi.mv[ref].as_mv;
+               : mi->mv[ref].as_mv;
 
     // TODO(jkoleszar): This clamping is done in the incorrect place for the
     // scaling case. It needs to be done on the scaled MV, not the pre-scaling
@@ -190,6 +178,12 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       // Co-ordinate of containing block to pixel precision.
       const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
       const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+#if CONFIG_BETTER_HW_COMPATIBILITY
+      assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
+             xd->mi[0]->sb_type != BLOCK_8X4);
+      assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) &&
+             mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x)));
+#endif
       if (plane == 0)
         pre_buf->buf = xd->block_refs[ref]->buf->y_buffer;
       else if (plane == 1)
@@ -216,9 +210,9 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
-                           xd->bd);
+      highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                             subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
+                             xd->bd);
     } else {
       inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
                       subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
@@ -244,7 +238,7 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
     const int bw = 4 * num_4x4_w;
     const int bh = 4 * num_4x4_h;
 
-    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+    if (xd->mi[0]->sb_type < BLOCK_8X8) {
       int i = 0, x, y;
       assert(bsize == BLOCK_8X8);
       for (y = 0; y < num_4x4_h; ++y)
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 7d90774..07745e3 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -34,14 +34,18 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void high_inter_predictor(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int subpel_x,
-                                 const int subpel_y,
-                                 const struct scale_factors *sf,
-                                 int w, int h, int ref,
-                                 const InterpKernel *kernel,
-                                 int xs, int ys, int bd);
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const int subpel_x,
+                                          const int subpel_y,
+                                          const struct scale_factors *sf,
+                                          int w, int h, int ref,
+                                          const InterpKernel *kernel,
+                                          int xs, int ys, int bd) {
+  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+      src, src_stride, dst, dst_stride,
+      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 3d84a28..4457858 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -133,12 +133,16 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
   int frame_width, frame_height;
   int x0, y0;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int need_left = extend_modes[mode] & NEED_LEFT;
+  const int need_above = extend_modes[mode] & NEED_ABOVE;
+  const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
   int base = 128 << (bd - 8);
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
+  // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1.
 
   // Get current frame pointer, width and height.
   if (plane == 0) {
@@ -153,79 +157,106 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
   x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
   y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
 
-  // left
-  if (left_available) {
-    if (xd->mb_to_bottom_edge < 0) {
-      /* slower path if the block needs border extension */
-      if (y0 + bs <= frame_height) {
-        for (i = 0; i < bs; ++i)
-          left_col[i] = ref[i * ref_stride - 1];
+  // NEED_LEFT
+  if (need_left) {
+    if (left_available) {
+      if (xd->mb_to_bottom_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (y0 + bs <= frame_height) {
+          for (i = 0; i < bs; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+        } else {
+          const int extend_bottom = frame_height - y0;
+          for (i = 0; i < extend_bottom; ++i)
+            left_col[i] = ref[i * ref_stride - 1];
+          for (; i < bs; ++i)
+            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+        }
       } else {
-        const int extend_bottom = frame_height - y0;
-        for (i = 0; i < extend_bottom; ++i)
+        /* faster path if the block does not need extension */
+        for (i = 0; i < bs; ++i)
           left_col[i] = ref[i * ref_stride - 1];
-        for (; i < bs; ++i)
-          left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
       }
     } else {
-      /* faster path if the block does not need extension */
-      for (i = 0; i < bs; ++i)
-        left_col[i] = ref[i * ref_stride - 1];
+      vpx_memset16(left_col, base + 1, bs);
     }
-  } else {
-    // TODO(Peter): this value should probably change for high bitdepth
-    vpx_memset16(left_col, base + 1, bs);
   }
 
-  // TODO(hkuang) do not extend 2*bs pixels for all modes.
-  // above
-  if (up_available) {
-    const uint16_t *above_ref = ref - ref_stride;
-    if (xd->mb_to_right_edge < 0) {
-      /* slower path if the block needs border extension */
-      if (x0 + 2 * bs <= frame_width) {
-        if (right_available && bs == 4) {
-          memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
+  // NEED_ABOVE
+  if (need_above) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + bs <= frame_width) {
+          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
+          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+          vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
+        }
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
         } else {
           memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
         }
-      } else if (x0 + bs <= frame_width) {
-        const int r = frame_width - x0;
-        if (right_available && bs == 4) {
+      }
+      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+    } else {
+      vpx_memset16(above_row, base - 1, bs);
+      above_row[-1] = base - 1;
+    }
+  }
+
+  // NEED_ABOVERIGHT
+  if (need_aboveright) {
+    if (up_available) {
+      const uint16_t *above_ref = ref - ref_stride;
+      if (xd->mb_to_right_edge < 0) {
+        /* slower path if the block needs border extension */
+        if (x0 + 2 * bs <= frame_width) {
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 + bs <= frame_width) {
+          const int r = frame_width - x0;
+          if (right_available && bs == 4) {
+            memcpy(above_row, above_ref, r * sizeof(above_row[0]));
+            vpx_memset16(above_row + r, above_row[r - 1],
+                         x0 + 2 * bs - frame_width);
+          } else {
+            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          }
+        } else if (x0 <= frame_width) {
+          const int r = frame_width - x0;
           memcpy(above_row, above_ref, r * sizeof(above_row[0]));
           vpx_memset16(above_row + r, above_row[r - 1],
                        x0 + 2 * bs - frame_width);
+        }
+        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
+      } else {
+        /* faster path if the block does not need extension */
+        if (bs == 4 && right_available && left_available) {
+          const_above_row = above_ref;
         } else {
           memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          if (bs == 4 && right_available)
+            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
+          else
+            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
+          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
         }
-      } else if (x0 <= frame_width) {
-        const int r = frame_width - x0;
-        memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-        vpx_memset16(above_row + r, above_row[r - 1],
-                       x0 + 2 * bs - frame_width);
       }
-      // TODO(Peter) this value should probably change for high bitdepth
-      above_row[-1] = left_available ? above_ref[-1] : (base+1);
     } else {
-      /* faster path if the block does not need extension */
-      if (bs == 4 && right_available && left_available) {
-        const_above_row = above_ref;
-      } else {
-        memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-        if (bs == 4 && right_available)
-          memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
-        else
-          vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-        // TODO(Peter): this value should probably change for high bitdepth
-        above_row[-1] = left_available ? above_ref[-1] : (base+1);
-      }
+      vpx_memset16(above_row, base - 1, bs * 2);
+      above_row[-1] = base - 1;
     }
-  } else {
-    vpx_memset16(above_row, base - 1, bs * 2);
-    // TODO(Peter): this value should probably change for high bitdepth
-    above_row[-1] = base - 1;
   }
 
   // predict
@@ -391,8 +422,8 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in,
                              int aoff, int loff, int plane) {
   const int bw = (1 << bwl_in);
   const int txw = (1 << tx_size);
-  const int have_top = loff || xd->up_available;
-  const int have_left = aoff || xd->left_available;
+  const int have_top = loff || (xd->above_mi != NULL);
+  const int have_left = aoff || (xd->left_mi != NULL);
   const int have_right = (aoff + txw) < bw;
   const int x = aoff * 4;
   const int y = loff * 4;
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 5bf71ef..8461336 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -70,10 +70,6 @@ add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8
 specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise sse2/;
-$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
-
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
@@ -169,9 +165,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp9_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp9_highbd_plane_add_noise/;
   }
 
   #
@@ -194,42 +187,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
-add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
-
-add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp9_minmax_8x8 sse2/;
-
-add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_16x16 sse2/;
-
-add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
-
-add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2 neon/;
-
-add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp9_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_8x8/;
-  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_4x4/;
-  add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vp9_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE
 
 #
@@ -288,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp9_fht4x4 sse2 msa/;
@@ -300,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+  specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
 }
 
 #
@@ -312,10 +269,7 @@ $vp9_full_search_sad_sse3=vp9_full_search_sadx3;
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad/;
-
-add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_full_range_search/;
+specialize qw/vp9_diamond_search_sad avx/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2 msa/;
@@ -349,6 +303,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 }
 # End vp9_high encoder functions
 
+#
+# frame based scale
+#
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+} else {
+  add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst";
+  specialize qw/vp9_scale_and_extend_frame ssse3/;
+}
+
 }
 # end encoder functions
 1;
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
index d6fb8b2..8b8b09f 100644
--- a/vp9/common/vp9_scan.c
+++ b/vp9/common/vp9_scan.c
@@ -229,10 +229,8 @@ DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
   990, 959, 1022, 991, 1023,
 };
 
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
+// Neighborhood 2-tuples for various scans and blocksizes,
+// in {top, left} order for each position in corresponding scan order.
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
   0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 1d86b5c..4c1ee81 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -42,7 +42,7 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                          PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi[0];
 
-  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+  if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) {
     return &vp9_default_scan_orders[tx_size];
   } else {
     const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index c8ef618..7af6162 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -28,6 +28,7 @@ static const int seg_feature_data_max[SEG_LVL_MAX] = {
 void vp9_clearall_segfeatures(struct segmentation *seg) {
   vp9_zero(seg->feature_data);
   vp9_zero(seg->feature_mask);
+  seg->aq_av_offset = 0;
 }
 
 void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index 5b75d8d..99a9440 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -46,7 +46,8 @@ struct segmentation {
   vpx_prob pred_probs[PREDICTION_PROBS];
 
   int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
-  unsigned int feature_mask[MAX_SEGMENTS];
+  uint32_t feature_mask[MAX_SEGMENTS];
+  int aq_av_offset;
 };
 
 static INLINE int segfeature_active(const struct segmentation *seg,
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 8d312d0..1c77b57 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp9_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index ec8bfdb..4307628 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,68 +624,6 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 %undef flimit4
 
 
-;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_wmt) PRIVATE
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index f191663..d639129 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,54 +189,31 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_idct4x4_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_idct8x8_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_idct16x16_add(dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -258,18 +235,40 @@ static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
@@ -279,54 +278,31 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
                                           uint8_t *dst, int stride,
                                           int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
+  tran_low_t *const dqcoeff = pd->dqcoeff;
+  assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless) {
-        vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_8X8:
-            vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_16X16:
-            vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          case TX_32X32:
-            vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-        }
-      }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (xd->lossless) {
+      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
     } else {
-      if (xd->lossless) {
-        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
-      } else {
-        switch (tx_size) {
-          case TX_4X4:
-            vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_8X8:
-            vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_16X16:
-            vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
-            break;
-          case TX_32X32:
-            vp9_idct32x32_add(dqcoeff, dst, stride, eob);
-            break;
-          default:
-            assert(0 && "Invalid transform size");
-            return;
-        }
+      switch (tx_size) {
+        case TX_4X4:
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_8X8:
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_16X16:
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        case TX_32X32:
+          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          break;
+        default:
+          assert(0 && "Invalid transform size");
       }
     }
-#else
+  } else {
     if (xd->lossless) {
       vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
     } else {
@@ -348,33 +324,55 @@ static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
           return;
       }
     }
+  }
+#else
+  if (xd->lossless) {
+    vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+  } else {
+    switch (tx_size) {
+      case TX_4X4:
+        vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_8X8:
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_16X16:
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+        break;
+      case TX_32X32:
+        vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+        break;
+      default:
+        assert(0 && "Invalid transform size");
+        return;
+    }
+  }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
+  if (eob == 1) {
+    dqcoeff[0] = 0;
+  } else {
+    if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+      memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+    else if (tx_size == TX_32X32 && eob <= 34)
+      memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+    else
+      memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
   }
 }
 
 static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
                                                 vpx_reader *r,
-                                                MB_MODE_INFO *const mbmi,
+                                                MODE_INFO *const mi,
                                                 int plane,
                                                 int row, int col,
                                                 TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  PREDICTION_MODE mode = (plane == 0) ? mbmi->mode : mbmi->uv_mode;
+  PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode;
   uint8_t *dst;
   dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
 
-  if (mbmi->sb_type < BLOCK_8X8)
+  if (mi->sb_type < BLOCK_8X8)
     if (plane == 0)
       mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
 
@@ -382,29 +380,33 @@ static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
                           dst, pd->dst.stride, dst, pd->dst.stride,
                           col, row, plane);
 
-  if (!mbmi->skip) {
+  if (!mi->skip) {
     const TX_TYPE tx_type = (plane || xd->lossless) ?
         DCT_DCT : intra_mode_to_tx_type_lookup[mode];
     const scan_order *sc = (plane || xd->lossless) ?
         &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
     const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
-                                            r, mbmi->segment_id);
-    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
-                                  dst, pd->dst.stride, eob);
+                                            r, mi->segment_id);
+    if (eob > 0) {
+      inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+                                    dst, pd->dst.stride, eob);
+    }
   }
 }
 
 static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
-                                   MB_MODE_INFO *const mbmi, int plane,
+                                   MODE_INFO *const mi, int plane,
                                    int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const scan_order *sc = &vp9_default_scan_orders[tx_size];
   const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
-                                          mbmi->segment_id);
+                                          mi->segment_id);
 
-  inverse_transform_block_inter(xd, plane, tx_size,
-                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
-                            pd->dst.stride, eob);
+  if (eob > 0) {
+    inverse_transform_block_inter(
+        xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+        pd->dst.stride, eob);
+  }
   return eob;
 }
 
@@ -523,8 +525,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
   }
 
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
                     subpel_y, sf, w, h, ref, kernel, xs, ys);
@@ -552,7 +554,7 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+static void dec_build_inter_predictors(VPxWorker *const worker, MACROBLOCKD *xd,
                                        int plane, int bw, int bh, int x,
                                        int y, int w, int h, int mi_x, int mi_y,
                                        const InterpKernel *kernel,
@@ -587,7 +589,12 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
     // Co-ordinate of containing block to pixel precision.
     int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
     int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    assert(xd->mi[0]->sb_type != BLOCK_4X8 &&
+           xd->mi[0]->sb_type != BLOCK_8X4);
+    assert(mv_q4.row == mv->row * (1 << (1 - pd->subsampling_y)) &&
+           mv_q4.col == mv->col * (1 << (1 - pd->subsampling_x)));
+#endif
     // Co-ordinate of the block to 1/16th pixel precision.
     x0_16 = (x_start + x) << SUBPEL_BITS;
     y0_16 = (y_start + y) << SUBPEL_BITS;
@@ -657,8 +664,8 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
 
     // Wait until reference block is ready. Pad 7 more pixels as last 7
     // pixels of each superblock row can be changed by next superblock row.
-    if (pbi->frame_parallel_decode)
-      vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+    if (worker != NULL)
+      vp9_frameworker_wait(worker, ref_frame_buf,
                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
 
     // Skip border extension if block is inside the frame.
@@ -684,16 +691,16 @@ static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
   } else {
     // Wait until reference block is ready. Pad 7 more pixels as last 7
     // pixels of each superblock row can be changed by next superblock row.
-     if (pbi->frame_parallel_decode) {
-       const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-       vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-     }
+    if (worker != NULL) {
+      const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+      vp9_frameworker_wait(worker, ref_frame_buf,
+                           VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+    }
   }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+    highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+                           subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
   } else {
     inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                     subpel_y, sf, w, h, ref, kernel, xs, ys);
@@ -711,55 +718,75 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
   const MODE_INFO *mi = xd->mi[0];
-  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
-  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
-  const int is_compound = has_second_ref(&mi->mbmi);
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = pd->n4_w;
-    const int num_4x4_h = pd->n4_h;
-
-    const int n4w_x4 = 4 * num_4x4_w;
-    const int n4h_x4 = 4 * num_4x4_h;
-    int ref;
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-      const int idx = xd->block_refs[ref]->idx;
-      BufferPool *const pool = pbi->common.buffer_pool;
-      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-      const int is_scaled = vp9_is_scaled(sf);
-
-      if (sb_type < BLOCK_8X8) {
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+  const BLOCK_SIZE sb_type = mi->sb_type;
+  const int is_compound = has_second_ref(mi);
+  int ref;
+  int is_scaled;
+  VPxWorker *const fwo = pbi->frame_parallel_decode ?
+      pbi->frame_worker_owner : NULL;
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+    RefBuffer *ref_buf = &pbi->common.frame_refs[frame - LAST_FRAME];
+    const struct scale_factors *const sf = &ref_buf->sf;
+    const int idx = ref_buf->idx;
+    BufferPool *const pool = pbi->common.buffer_pool;
+    RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+
+    if (!vp9_is_valid_scale(sf))
+      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Reference frame has invalid dimensions");
+
+    is_scaled = vp9_is_scaled(sf);
+    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
+                         is_scaled ? sf : NULL);
+    xd->block_refs[ref] = ref_buf;
+
+    if (sb_type < BLOCK_8X8) {
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        struct macroblockd_plane *const pd = &xd->plane[plane];
+        struct buf_2d *const dst_buf = &pd->dst;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int n4w_x4 = 4 * num_4x4_w;
+        const int n4h_x4 = 4 * num_4x4_h;
+        struct buf_2d *const pre_buf = &pd->pre[ref];
         int i = 0, x, y;
         for (y = 0; y < num_4x4_h; ++y) {
           for (x = 0; x < num_4x4_w; ++x) {
             const MV mv = average_split_mvs(pd, mi, ref, i++);
-            dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+            dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4,
                                        4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
                                        sf, pre_buf, dst_buf, &mv,
                                        ref_frame_buf, is_scaled, ref);
           }
         }
-      } else {
-        const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
+      }
+    } else {
+      const MV mv = mi->mv[ref].as_mv;
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        struct macroblockd_plane *const pd = &xd->plane[plane];
+        struct buf_2d *const dst_buf = &pd->dst;
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        const int n4w_x4 = 4 * num_4x4_w;
+        const int n4h_x4 = 4 * num_4x4_h;
+        struct buf_2d *const pre_buf = &pd->pre[ref];
+        dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4,
                                    0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
-                                   sf, pre_buf, dst_buf, &mv, ref_frame_buf,
-                                   is_scaled, ref);
+                                   sf, pre_buf, dst_buf, &mv,
+                                   ref_frame_buf, is_scaled, ref);
       }
     }
   }
 }
 
-static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
+static INLINE TX_SIZE dec_get_uv_tx_size(const MODE_INFO *mi,
                                          int n4_wl, int n4_hl) {
   // get minimum log2 num4x4s dimension
   const int x = VPXMIN(n4_wl, n4_hl);
-  return VPXMIN(mbmi->tx_size,  x);
+  return VPXMIN(mi->tx_size,  x);
 }
 
 static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
@@ -782,10 +809,10 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
   }
 }
 
-static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                 int bw, int bh, int x_mis, int y_mis,
-                                 int bwl, int bhl) {
+static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                              BLOCK_SIZE bsize, int mi_row, int mi_col,
+                              int bw, int bh, int x_mis, int y_mis,
+                              int bwl, int bhl) {
   const int offset = mi_row * cm->mi_stride + mi_col;
   int x, y;
   const TileInfo *const tile = &xd->tile;
@@ -794,7 +821,7 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   xd->mi[0] = &cm->mi[offset];
   // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
   // passing bsize from decode_partition().
-  xd->mi[0]->mbmi.sb_type = bsize;
+  xd->mi[0]->sb_type = bsize;
   for (y = 0; y < y_mis; ++y)
     for (x = !y; x < x_mis; ++x) {
       xd->mi[y * cm->mi_stride + x] = xd->mi[0];
@@ -809,7 +836,7 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
   vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
-  return &xd->mi[0]->mbmi;
+  return xd->mi[0];
 }
 
 static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
@@ -823,8 +850,8 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
-                                   bw, bh, x_mis, y_mis, bwl, bhl);
+  MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                              bw, bh, x_mis, y_mis, bwl, bhl);
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -834,19 +861,19 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vpx_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+  vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
-  if (mbmi->skip) {
+  if (mi->skip) {
     dec_reset_skip_context(xd);
   }
 
-  if (!is_inter_block(mbmi)) {
+  if (!is_inter_block(mi)) {
     int plane;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       const struct macroblockd_plane *const pd = &xd->plane[plane];
       const TX_SIZE tx_size =
-          plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                  : mbmi->tx_size;
+          plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl)
+                  : mi->tx_size;
       const int num_4x4_w = pd->n4_w;
       const int num_4x4_h = pd->n4_h;
       const int step = (1 << tx_size);
@@ -856,9 +883,12 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
       const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
           0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+      xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+      xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
       for (row = 0; row < max_blocks_high; row += step)
         for (col = 0; col < max_blocks_wide; col += step)
-          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+          predict_and_reconstruct_intra_block(xd, r, mi, plane,
                                               row, col, tx_size);
     }
   } else {
@@ -866,15 +896,15 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
     dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
 
     // Reconstruction
-    if (!mbmi->skip) {
+    if (!mi->skip) {
       int eobtotal = 0;
       int plane;
 
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
         const TX_SIZE tx_size =
-            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                    : mbmi->tx_size;
+            plane ? dec_get_uv_tx_size(mi, pd->n4_wl, pd->n4_hl)
+                    : mi->tx_size;
         const int num_4x4_w = pd->n4_w;
         const int num_4x4_h = pd->n4_h;
         const int step = (1 << tx_size);
@@ -884,21 +914,24 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
         const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+        xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+        xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
-            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+            eobtotal += reconstruct_inter_block(xd, r, mi, plane, row, col,
                                                 tx_size);
       }
 
       if (!less8x8 && eobtotal == 0)
-        mbmi->skip = 1;  // skip loopfilter
+        mi->skip = 1;  // skip loopfilter
     }
   }
 
   xd->corrupted |= vpx_reader_has_error(r);
 
   if (cm->lf.filter_level) {
-    vp9_build_mask(cm, mbmi, mi_row, mi_col, bw, bh);
+    vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh);
   }
 }
 
@@ -1196,8 +1229,9 @@ static void resize_mv_buffer(VP9_COMMON *cm) {
   vpx_free(cm->cur_frame->mvs);
   cm->cur_frame->mi_rows = cm->mi_rows;
   cm->cur_frame->mi_cols = cm->mi_cols;
-  cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                                            sizeof(*cm->cur_frame->mvs));
+  CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+                  (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                       sizeof(*cm->cur_frame->mvs)));
 }
 
 static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
@@ -1281,11 +1315,16 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   BufferPool *const pool = cm->buffer_pool;
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     if (vpx_rb_read_bit(rb)) {
-      YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
-      width = buf->y_crop_width;
-      height = buf->y_crop_height;
-      found = 1;
-      break;
+      if (cm->frame_refs[i].idx != INVALID_IDX) {
+        YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+        width = buf->y_crop_width;
+        height = buf->y_crop_height;
+        found = 1;
+        break;
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode frame size");
+      }
     }
   }
 
@@ -1300,22 +1339,23 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
   // has valid dimensions.
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width,
-                                                ref_frame->buf->y_crop_height,
-                                                width, height);
+    has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX &&
+                            valid_ref_frame_size(ref_frame->buf->y_crop_width,
+                                                 ref_frame->buf->y_crop_height,
+                                                 width, height));
   }
   if (!has_valid_ref_frame)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
   for (i = 0; i < REFS_PER_FRAME; ++i) {
     RefBuffer *const ref_frame = &cm->frame_refs[i];
-    if (!valid_ref_frame_img_fmt(
-            ref_frame->buf->bit_depth,
-            ref_frame->buf->subsampling_x,
-            ref_frame->buf->subsampling_y,
-            cm->bit_depth,
-            cm->subsampling_x,
-            cm->subsampling_y))
+    if (ref_frame->idx == INVALID_IDX ||
+        !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
+                                 ref_frame->buf->subsampling_x,
+                                 ref_frame->buf->subsampling_y,
+                                 cm->bit_depth,
+                                 cm->subsampling_x,
+                                 cm->subsampling_y))
       vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
@@ -1434,7 +1474,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
   TileBuffer tile_buffers[4][1 << 6];
   int tile_row, tile_col;
   int mi_row, mi_col;
-  TileData *tile_data = NULL;
+  TileWorkerData *tile_data = NULL;
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
       pbi->lf_worker.data1 == NULL) {
@@ -1470,28 +1510,17 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
 
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
-  if (pbi->tile_data == NULL ||
-      (tile_cols * tile_rows) != pbi->total_tiles) {
-    vpx_free(pbi->tile_data);
-    CHECK_MEM_ERROR(
-        cm,
-        pbi->tile_data,
-        vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
-    pbi->total_tiles = tile_rows * tile_cols;
-  }
-
   // Load all tile information into tile_data.
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
       const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
-      tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
-      tile_data->cm = cm;
+      tile_data = pbi->tile_worker_data + tile_cols * tile_row + tile_col;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      tile_data->xd.counts = cm->frame_parallel_decoding_mode ?
-                             NULL : &cm->counts;
+      tile_data->xd.counts =
+          cm->frame_parallel_decoding_mode ? NULL : &cm->counts;
       vp9_zero(tile_data->dqcoeff);
-      vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
+      vp9_tile_init(&tile_data->xd.tile, cm, tile_row, tile_col);
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
@@ -1507,8 +1536,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
       for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
         const int col = pbi->inv_tile_order ?
                         tile_cols - tile_col - 1 : tile_col;
-        tile_data = pbi->tile_data + tile_cols * tile_row + col;
-        vp9_tile_set_col(&tile, tile_data->cm, col);
+        tile_data = pbi->tile_worker_data + tile_cols * tile_row + col;
+        vp9_tile_set_col(&tile, cm, col);
         vp9_zero(tile_data->xd.left_context);
         vp9_zero(tile_data->xd.left_seg_context);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
@@ -1560,7 +1589,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
   }
 
   // Get last tile data.
-  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+  tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1;
 
   if (pbi->frame_parallel_decode)
     vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
@@ -1645,12 +1674,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
     const int num_threads = pbi->max_threads;
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
-    // Ensure tile data offsets will be properly aligned. This may fail on
-    // platforms without DECLARE_ALIGNED().
-    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
-    CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
-                    vpx_memalign(32, num_threads *
-                                 sizeof(*pbi->tile_worker_data)));
     for (n = 0; n < num_threads; ++n) {
       VPxWorker *const worker = &pbi->tile_workers[n];
       ++pbi->num_tile_workers;
@@ -1666,7 +1689,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
     VPxWorker *const worker = &pbi->tile_workers[n];
-    TileWorkerData *const tile_data = &pbi->tile_worker_data[n];
+    TileWorkerData *const tile_data =
+        &pbi->tile_worker_data[n + pbi->total_tiles];
     winterface->sync(worker);
     tile_data->xd = pbi->mb;
     tile_data->xd.counts =
@@ -1979,6 +2003,8 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
   if (!cm->error_resilient_mode) {
     cm->refresh_frame_context = vpx_rb_read_bit(rb);
     cm->frame_parallel_decoding_mode = vpx_rb_read_bit(rb);
+    if (!cm->frame_parallel_decoding_mode)
+      vp9_zero(cm->counts);
   } else {
     cm->refresh_frame_context = 0;
     cm->frame_parallel_decoding_mode = 1;
@@ -2082,43 +2108,6 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
   return vpx_reader_has_error(&r);
 }
 
-#ifdef NDEBUG
-#define debug_check_frame_counts(cm) (void)0
-#else  // !NDEBUG
-// Counts should only be incremented when frame_parallel_decoding_mode and
-// error_resilient_mode are disabled.
-static void debug_check_frame_counts(const VP9_COMMON *const cm) {
-  FRAME_COUNTS zero_counts;
-  vp9_zero(zero_counts);
-  assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode);
-  assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode,
-                 sizeof(cm->counts.y_mode)));
-  assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode,
-                 sizeof(cm->counts.uv_mode)));
-  assert(!memcmp(cm->counts.partition, zero_counts.partition,
-                 sizeof(cm->counts.partition)));
-  assert(!memcmp(cm->counts.coef, zero_counts.coef,
-                 sizeof(cm->counts.coef)));
-  assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch,
-                 sizeof(cm->counts.eob_branch)));
-  assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp,
-                 sizeof(cm->counts.switchable_interp)));
-  assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
-                 sizeof(cm->counts.inter_mode)));
-  assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
-                 sizeof(cm->counts.intra_inter)));
-  assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
-                 sizeof(cm->counts.comp_inter)));
-  assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
-                 sizeof(cm->counts.single_ref)));
-  assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
-                 sizeof(cm->counts.comp_ref)));
-  assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
-  assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
-  assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
-}
-#endif  // NDEBUG
-
 static struct vpx_read_bit_buffer *init_read_bit_buffer(
     VP9Decoder *pbi,
     struct vpx_read_bit_buffer *rb,
@@ -2202,8 +2191,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                        "Uninitialized entropy context.");
 
-  vp9_zero(cm->counts);
-
   xd->corrupted = 0;
   new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
   if (new_fb->corrupted)
@@ -2232,6 +2219,19 @@ void vp9_decode_frame(VP9Decoder *pbi,
     vp9_frameworker_unlock_stats(worker);
   }
 
+  if (pbi->tile_worker_data == NULL ||
+      (tile_cols * tile_rows) != pbi->total_tiles) {
+    const int num_tile_workers = tile_cols * tile_rows +
+        ((pbi->max_threads > 1) ? pbi->max_threads : 0);
+    const size_t twd_size = num_tile_workers * sizeof(*pbi->tile_worker_data);
+    // Ensure tile data offsets will be properly aligned. This may fail on
+    // platforms without DECLARE_ALIGNED().
+    assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+    vpx_free(pbi->tile_worker_data);
+    CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size));
+    pbi->total_tiles = tile_rows * tile_cols;
+  }
+
   if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
     // Multi-threaded tile decoder
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
@@ -2259,8 +2259,6 @@ void vp9_decode_frame(VP9Decoder *pbi,
         vp9_adapt_mode_probs(cm);
         vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
       }
-    } else {
-      debug_check_frame_counts(cm);
     }
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d3ca7b3..ffc6839 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -81,10 +81,10 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
   return (TX_SIZE)tx_size;
 }
 
-static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vpx_reader *r) {
+static INLINE TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int allow_select, vpx_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
     return read_selected_tx_size(cm, xd, max_tx_size, r);
@@ -149,17 +149,12 @@ static int read_intra_segment_id(VP9_COMMON *const cm, int mi_offset,
 }
 
 static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vpx_reader *r) {
+                                 int mi_row, int mi_col, vpx_reader *r,
+                                 int x_mis, int y_mis) {
   struct segmentation *const seg = &cm->seg;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   int predicted_segment_id, segment_id;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = xd->plane[0].n4_w >> 1;
-  const int bh = xd->plane[0].n4_h >> 1;
-
-  // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
 
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
@@ -176,9 +171,9 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
 
   if (seg->temporal_update) {
     const vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
-    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
-    segment_id = mbmi->seg_id_predicted ? predicted_segment_id
-                                        : read_segment_id(r, seg);
+    mi->seg_id_predicted = vpx_read(r, pred_prob);
+    segment_id = mi->seg_id_predicted ? predicted_segment_id
+                                      : read_segment_id(r, seg);
   } else {
     segment_id = read_segment_id(r, seg);
   }
@@ -202,52 +197,46 @@ static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
 
 static void read_intra_frame_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r,
+                                       int x_mis, int y_mis) {
   MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi  = xd->left_mi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   int i;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = xd->plane[0].n4_w >> 1;
-  const int bh = xd->plane[0].n4_h >> 1;
 
-  // TODO(slavarnway): move x_mis, y_mis into xd ?????
-  const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
-  const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
-
-  mbmi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, 1, r);
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->ref_frame[1] = NONE;
+  mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
+  mi->skip = read_skip(cm, xd, mi->segment_id, r);
+  mi->tx_size = read_tx_size(cm, xd, 1, r);
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
 
   switch (bsize) {
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
         mi->bmi[i].as_mode =
             read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i));
-      mbmi->mode = mi->bmi[3].as_mode;
+      mi->mode = mi->bmi[3].as_mode;
       break;
     case BLOCK_4X8:
       mi->bmi[0].as_mode = mi->bmi[2].as_mode =
           read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
-      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode =
           read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1));
       break;
     case BLOCK_8X4:
       mi->bmi[0].as_mode = mi->bmi[1].as_mode =
           read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0));
-      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode =
           read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2));
       break;
     default:
-      mbmi->mode = read_intra_mode(r,
-                                   get_y_mode_probs(mi, above_mi, left_mi, 0));
+      mi->mode = read_intra_mode(r,
+                                 get_y_mode_probs(mi, above_mi, left_mi, 0));
   }
 
-  mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
+  mi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mi->mode]);
 }
 
 static int read_mv_component(vpx_reader *r,
@@ -289,7 +278,7 @@ static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
                            nmv_context_counts *counts, int allow_hp) {
   const MV_JOINT_TYPE joint_type =
       (MV_JOINT_TYPE)vpx_read_tree(r, vp9_mv_joint_tree, ctx->joints);
-  const int use_hp = allow_hp && vp9_use_mv_hp(ref);
+  const int use_hp = allow_hp && use_mv_hp(ref);
   MV diff = {0, 0};
 
   if (mv_joint_vertical(joint_type))
@@ -364,11 +353,36 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
+// TODO(slavarnway): Move this decoder version of
+// vp9_get_pred_context_switchable_interp() to vp9_pred_common.h and update the
+// encoder.
+//
+// Returns a context number for the given MB prediction signal
+static int dec_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int left_type = left_mi ? left_mi->interp_filter : SWITCHABLE_FILTERS;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const int above_type = above_mi ? above_mi->interp_filter
+                             : SWITCHABLE_FILTERS;
+
+  if (left_type == above_type)
+    return left_type;
+  else if (left_type == SWITCHABLE_FILTERS)
+    return above_type;
+  else if (above_type == SWITCHABLE_FILTERS)
+    return left_type;
+  else
+    return SWITCHABLE_FILTERS;
+}
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP9_COMMON *const cm, MACROBLOCKD *const xd,
     vpx_reader *r) {
-  const int ctx = vp9_get_pred_context_switchable_interp(xd);
+  const int ctx = dec_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
       (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
                                    cm->fc->switchable_interp_prob[ctx]);
@@ -381,36 +395,39 @@ static INLINE INTERP_FILTER read_switchable_interp_filter(
 static void read_intra_block_mode_info(VP9_COMMON *const cm,
                                        MACROBLOCKD *const xd, MODE_INFO *mi,
                                        vpx_reader *r) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   int i;
 
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->ref_frame[1] = NONE;
-
   switch (bsize) {
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
         mi->bmi[i].as_mode = read_intra_mode_y(cm, xd, r, 0);
-      mbmi->mode = mi->bmi[3].as_mode;
+      mi->mode = mi->bmi[3].as_mode;
       break;
     case BLOCK_4X8:
       mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, xd,
                                                                   r, 0);
-      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+      mi->bmi[1].as_mode = mi->bmi[3].as_mode = mi->mode =
           read_intra_mode_y(cm, xd, r, 0);
       break;
     case BLOCK_8X4:
       mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, xd,
                                                                   r, 0);
-      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+      mi->bmi[2].as_mode = mi->bmi[3].as_mode = mi->mode =
           read_intra_mode_y(cm, xd, r, 0);
       break;
     default:
-      mbmi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
+      mi->mode = read_intra_mode_y(cm, xd, r, size_group_lookup[bsize]);
   }
 
-  mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+  mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode);
+
+  // Initialize interp_filter here so we do not have to check for inter block
+  // modes in dec_get_pred_context_switchable_interp()
+  mi->interp_filter = SWITCHABLE_FILTERS;
+
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -418,10 +435,18 @@ static INLINE int is_mv_valid(const MV *mv) {
          mv->col > MV_LOW && mv->col < MV_UPP;
 }
 
+static INLINE void copy_mv_pair(int_mv *dst, const int_mv *src) {
+  memcpy(dst, src, sizeof(*dst) * 2);
+}
+
+static INLINE void zero_mv_pair(int_mv *dst) {
+  memset(dst, 0, sizeof(*dst) * 2);
+}
+
 static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
                             PREDICTION_MODE mode,
                             int_mv mv[2], int_mv ref_mv[2],
-                            int_mv nearest_mv[2], int_mv near_mv[2],
+                            int_mv near_nearest_mv[2],
                             int is_compound, int allow_hp, vpx_reader *r) {
   int i;
   int ret = 1;
@@ -437,22 +462,13 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
       }
       break;
     }
+    case NEARMV:
     case NEARESTMV: {
-      mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound)
-        mv[1].as_int = nearest_mv[1].as_int;
-      break;
-    }
-    case NEARMV: {
-      mv[0].as_int = near_mv[0].as_int;
-      if (is_compound)
-        mv[1].as_int = near_mv[1].as_int;
+      copy_mv_pair(mv, near_nearest_mv);
       break;
     }
     case ZEROMV: {
-      mv[0].as_int = 0;
-      if (is_compound)
-        mv[1].as_int = 0;
+      zero_mv_pair(mv);
       break;
     }
     default: {
@@ -467,7 +483,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
-    const int ctx = vp9_get_intra_inter_context(xd);
+    const int ctx = get_intra_inter_context(xd);
     const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
@@ -476,44 +492,295 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
+static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
+                                  int refmv_count) {
+  int i;
+
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < refmv_count; ++i) {
+    lower_mv_precision(&mvlist[i].as_mv, allow_hp);
+    *best_mv = mvlist[i];
+  }
+}
+
 static void fpm_sync(void *const data, int mi_row) {
   VP9Decoder *const pbi = (VP9Decoder *)data;
   vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
                        mi_row << MI_BLOCK_SIZE_LOG2);
 }
 
+// This macro is used to add a motion vector mv_ref list if it isn't
+// already in the list.  If it's the second motion vector or early_break
+// it will also skip all additional processing and jump to Done!
+#define ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done) \
+  do { \
+    if (refmv_count) { \
+      if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+        (mv_ref_list)[(refmv_count)] = (mv); \
+        refmv_count++; \
+        goto Done; \
+      } \
+    } else { \
+      (mv_ref_list)[(refmv_count)++] = (mv); \
+      if (early_break) \
+        goto Done; \
+    } \
+  } while (0)
+
+// If either reference frame is different, not INTRA, and they
+// are different from each other scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV_EB(mbmi, ref_frame, ref_sign_bias, \
+                                    refmv_count, mv_ref_list, Done) \
+  do { \
+    if (is_inter_block(mbmi)) { \
+      if ((mbmi)->ref_frame[0] != ref_frame) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+      if (has_second_ref(mbmi) && \
+          (mbmi)->ref_frame[1] != ref_frame && \
+          (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
+        ADD_MV_REF_LIST_EB(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+                           refmv_count, mv_ref_list, Done); \
+    } \
+  } while (0)
+
+// This function searches the neighborhood of a given MB/SB
+// to try and find candidate reference vectors.
+static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                            PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+                            const POSITION *const mv_ref_search,
+                            int_mv *mv_ref_list,
+                            int mi_row, int mi_col, int block, int is_sub8x8,
+                            find_mv_refs_sync sync, void *const data) {
+  const int *ref_sign_bias = cm->ref_frame_sign_bias;
+  int i, refmv_count = 0;
+  int different_ref_found = 0;
+  const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+  const TileInfo *const tile = &xd->tile;
+  // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop
+  // searching after the first mv is found.
+  const int early_break = (mode != NEARMV);
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  i = 0;
+  if (is_sub8x8) {
+    // If the size < 8x8 we get the mv from the bmi substructure for the
+    // nearest two blocks.
+    for (i = 0; i < 2; ++i) {
+      const POSITION *const mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate_mi =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+        different_ref_found = 1;
+
+        if (candidate_mi->ref_frame[0] == ref_frame)
+          ADD_MV_REF_LIST_EB(
+              get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                  refmv_count, mv_ref_list, Done);
+        else if (candidate_mi->ref_frame[1] == ref_frame)
+          ADD_MV_REF_LIST_EB(
+              get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                  refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; i < MVREF_NEIGHBOURS; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+      different_ref_found = 1;
+
+      if (candidate->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[0], refmv_count, mv_ref_list, Done);
+      else if (candidate->ref_frame[1] == ref_frame)
+        ADD_MV_REF_LIST_EB(candidate->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on windows platform. The sync here is unnecessary if use_prev_frame_mvs
+  // is 0. But after removing it, there will be hang in the unit test on windows
+  // due to several threads waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+#endif
+
+  // Check the last frame's mode and mv info.
+  if (prev_frame_mvs) {
+    // Synchronize here for frame parallel decode if sync function is provided.
+    if (cm->frame_parallel_decode && sync != NULL) {
+      sync(data, mi_row);
+    }
+
+    if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+    } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      const POSITION *mv_ref = &mv_ref_search[i];
+      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+        const MODE_INFO *const candidate =
+            xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+
+        // If the candidate is INTRA we don't want to consider its mv.
+        IF_DIFF_REF_FRAME_ADD_MV_EB(candidate, ref_frame, ref_sign_bias,
+                                    refmv_count, mv_ref_list, Done);
+      }
+    }
+  }
+
+  // Since we still don't have a candidate we'll try the last frame.
+  if (prev_frame_mvs) {
+    if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+        prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+      int_mv mv = prev_frame_mvs->mv[0];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+
+    if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+        prev_frame_mvs->ref_frame[1] != ref_frame &&
+        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+      int_mv mv = prev_frame_mvs->mv[1];
+      if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+          ref_sign_bias[ref_frame]) {
+        mv.as_mv.row *= -1;
+        mv.as_mv.col *= -1;
+      }
+      ADD_MV_REF_LIST_EB(mv, refmv_count, mv_ref_list, Done);
+    }
+  }
+
+  if (mode == NEARMV)
+    refmv_count = MAX_MV_REF_CANDIDATES;
+  else
+    // we only care about the nearestmv for the remaining modes
+    refmv_count = 1;
+
+ Done:
+  // Clamp vectors
+  for (i = 0; i < refmv_count; ++i)
+    clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+  return refmv_count;
+}
+
+static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                      const POSITION *const mv_ref_search,
+                                      PREDICTION_MODE b_mode, int block,
+                                      int ref, int mi_row, int mi_col,
+                                      int_mv *best_sub8x8) {
+  int_mv mv_list[MAX_MV_REF_CANDIDATES];
+  MODE_INFO *const mi = xd->mi[0];
+  b_mode_info *bmi = mi->bmi;
+  int n;
+  int refmv_count;
+
+  assert(MAX_MV_REF_CANDIDATES == 2);
+
+  refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref],
+                                 mv_ref_search, mv_list, mi_row, mi_col, block,
+                                 1, NULL, NULL);
+
+  switch (block) {
+    case 0:
+      best_sub8x8->as_int = mv_list[refmv_count - 1].as_int;
+      break;
+    case 1:
+    case 2:
+      if (b_mode == NEARESTMV) {
+        best_sub8x8->as_int = bmi[0].as_mv[ref].as_int;
+      } else {
+        best_sub8x8->as_int = 0;
+        for (n = 0; n < refmv_count; ++n)
+          if (bmi[0].as_mv[ref].as_int != mv_list[n].as_int) {
+            best_sub8x8->as_int = mv_list[n].as_int;
+            break;
+          }
+      }
+      break;
+    case 3:
+      if (b_mode == NEARESTMV) {
+        best_sub8x8->as_int = bmi[2].as_mv[ref].as_int;
+      } else {
+        int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+        candidates[0] = bmi[1].as_mv[ref];
+        candidates[1] = bmi[0].as_mv[ref];
+        candidates[2] = mv_list[0];
+        candidates[3] = mv_list[1];
+        best_sub8x8->as_int = 0;
+        for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
+          if (bmi[2].as_mv[ref].as_int != candidates[n].as_int) {
+            best_sub8x8->as_int = candidates[n].as_int;
+            break;
+          }
+      }
+      break;
+    default:
+      assert(0 && "Invalid block index.");
+  }
+}
+
+static uint8_t get_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                                const POSITION *const mv_ref_search,
+                                int mi_row, int mi_col) {
+  int i;
+  int context_counter = 0;
+  const TileInfo *const tile = &xd->tile;
+
+  // Get mode count from nearest 2 blocks
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      const MODE_INFO *const candidate =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+    }
+  }
+
+  return counter_to_context[context_counter];
+}
+
 static void read_inter_block_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
                                        int mi_row, int mi_col, vpx_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-  int_mv nearestmv[2], nearmv[2];
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int_mv best_ref_mvs[2];
   int ref, is_compound;
-  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
-
-  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
-  is_compound = has_second_ref(mbmi);
-
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
-    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
-
-    xd->block_refs[ref] = ref_buf;
-    if ((!vp9_is_valid_scale(&ref_buf->sf)))
-      vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Reference frame has invalid dimensions");
-    vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
-                         &ref_buf->sf);
-    vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
-                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
-  }
+  uint8_t inter_mode_ctx;
+  const POSITION *const mv_ref_search = mv_ref_blocks[bsize];
+
+  read_ref_frames(cm, xd, r, mi->segment_id, mi->ref_frame);
+  is_compound = has_second_ref(mi);
+  inter_mode_ctx = get_mode_context(cm, xd, mv_ref_search, mi_row, mi_col);
 
-  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    mbmi->mode = ZEROMV;
+  if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+    mi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
         vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
                            "Invalid usage of segement feature on small blocks");
@@ -521,18 +788,31 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     }
   } else {
     if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, xd, r,
-                                   inter_mode_ctx[mbmi->ref_frame[0]]);
-  }
-
-  if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      vp9_find_best_ref_mvs(xd, allow_hp, ref_mvs[mbmi->ref_frame[ref]],
-                            &nearestmv[ref], &nearmv[ref]);
+      mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
+    else
+      // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV.
+      // Setting mode to NEARESTMV forces the search to stop after the nearestmv
+      // has been found. After b_modes have been read, mode will be overwritten
+      // by the last b_mode.
+      mi->mode = NEARESTMV;
+
+    if (mi->mode != ZEROMV) {
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        int_mv tmp_mvs[MAX_MV_REF_CANDIDATES];
+        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+        int refmv_count;
+
+        refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search,
+                                       tmp_mvs, mi_row, mi_col, -1, 0,
+                                       fpm_sync, (void *)pbi);
+
+        dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
+                              refmv_count);
+      }
     }
   }
 
-  mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
+  mi->interp_filter = (cm->interp_filter == SWITCHABLE)
                       ? read_switchable_interp_filter(cm, xd, r)
                       : cm->interp_filter;
 
@@ -541,33 +821,24 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
     const int num_4x4_h = 1 << xd->bmode_blocks_hl;
     int idx, idy;
     PREDICTION_MODE b_mode;
-    int_mv nearest_sub8x8[2], near_sub8x8[2];
+    int_mv best_sub8x8[2];
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
-        int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
+        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx);
 
         if (b_mode == NEARESTMV || b_mode == NEARMV) {
-          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
           for (ref = 0; ref < 1 + is_compound; ++ref)
-            vp9_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
-                                          &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref],
-                                          dummy_mode_ctx);
+            append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref,
+                                      mi_row, mi_col, &best_sub8x8[ref]);
         }
 
-        if (!assign_mv(cm, xd, b_mode, block, nearestmv,
-                       nearest_sub8x8, near_sub8x8,
-                       is_compound, allow_hp, r)) {
+        if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
+                       best_sub8x8, is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
           break;
         }
 
-        mi->bmi[j].as_mv[0].as_int = block[0].as_int;
-        if (is_compound)
-          mi->bmi[j].as_mv[1].as_int = block[1].as_int;
-
         if (num_4x4_h == 2)
           mi->bmi[j + 2] = mi->bmi[j];
         if (num_4x4_w == 2)
@@ -575,30 +846,28 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
       }
     }
 
-    mi->mbmi.mode = b_mode;
+    mi->mode = b_mode;
 
-    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+    copy_mv_pair(mi->mv, mi->bmi[3].as_mv);
   } else {
-    xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv,
-                                nearestmv, nearmv, is_compound, allow_hp, r);
+    xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs,
+                                best_ref_mvs, is_compound, allow_hp, r);
   }
 }
 
 static void read_inter_frame_mode_info(VP9Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vpx_reader *r,
+                                       int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
   int inter_block;
 
-  mbmi->mv[0].as_int = 0;
-  mbmi->mv[1].as_int = 0;
-  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+  mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r, x_mis,
+                                         y_mis);
+  mi->skip = read_skip(cm, xd, mi->segment_id, r);
+  inter_block = read_is_inter_block(cm, xd, mi->segment_id, r);
+  mi->tx_size = read_tx_size(cm, xd, !mi->skip || !inter_block, r);
 
   if (inter_block)
     read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
@@ -606,7 +875,12 @@ static void read_inter_frame_mode_info(VP9Decoder *const pbi,
     read_intra_block_mode_info(cm, xd, mi, r);
 }
 
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+static INLINE void copy_ref_frame_pair(MV_REFERENCE_FRAME *dst,
+                                       const MV_REFERENCE_FRAME *src) {
+  memcpy(dst, src, sizeof(*dst) * 2);
+}
+
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis) {
   VP9_COMMON *const cm = &pbi->common;
@@ -615,19 +889,23 @@ void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
   int w, h;
 
   if (frame_is_intra_only(cm)) {
-    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis);
   } else {
-    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
 
     for (h = 0; h < y_mis; ++h) {
-      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
       for (w = 0; w < x_mis; ++w) {
-        MV_REF *const mv = frame_mv + w;
-        mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-        mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-        mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-        mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+        MV_REF *const mv = frame_mvs + w;
+        copy_ref_frame_pair(mv->ref_frame, mi->ref_frame);
+        copy_mv_pair(mv->mv, mi->mv);
       }
+      frame_mvs += cm->mi_cols;
     }
   }
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        !is_inter_block(mi) && need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 }
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 75f568c..45569ec 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
                         int mi_row, int mi_col, vpx_reader *r,
                         int x_mis, int y_mis);
 
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 4e88819..935c04f 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -131,11 +131,12 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
 
   vpx_get_worker_interface()->end(&pbi->lf_worker);
   vpx_free(pbi->lf_worker.data1);
-  vpx_free(pbi->tile_data);
+
   for (i = 0; i < pbi->num_tile_workers; ++i) {
     VPxWorker *const worker = &pbi->tile_workers[i];
     vpx_get_worker_interface()->end(worker);
   }
+
   vpx_free(pbi->tile_worker_data);
   vpx_free(pbi->tile_workers);
 
@@ -213,8 +214,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
 
     // Find an empty frame buffer.
     const int free_fb = get_free_fb(cm);
-    if (cm->new_fb_idx == INVALID_IDX)
-      return VPX_CODEC_MEM_ERROR;
+    if (cm->new_fb_idx == INVALID_IDX) {
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Unable to find free frame buffer");
+      return cm->error.error_code;
+    }
 
     // Decrease ref_count since it will be increased again in
     // ref_cnt_fb() below.
@@ -243,7 +247,7 @@ static void swap_frame_buffers(VP9Decoder *pbi) {
     decrease_ref_count(old_idx, frame_bufs, pool);
 
     // Release the reference frame in reference map.
-    if ((mask & 1) && old_idx >= 0) {
+    if (mask & 1) {
       decrease_ref_count(old_idx, frame_bufs, pool);
     }
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
@@ -305,8 +309,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
-  if (cm->new_fb_idx == INVALID_IDX)
-    return VPX_CODEC_MEM_ERROR;
+  if (cm->new_fb_idx == INVALID_IDX) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Unable to find free frame buffer");
+    return cm->error.error_code;
+  }
 
   // Assign a MV array to the frame buffer.
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
@@ -350,7 +357,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
         decrease_ref_count(old_idx, frame_bufs, pool);
 
         // Release the reference frame in reference map.
-        if ((mask & 1) && old_idx >= 0) {
+        if (mask & 1) {
           decrease_ref_count(old_idx, frame_bufs, pool);
         }
         ++ref_index;
@@ -501,7 +508,7 @@ vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
-          this_sz |= (*x++) << (j * 8);
+          this_sz |= ((uint32_t)(*x++)) << (j * 8);
         sizes[i] = this_sz;
       }
       *count = frames;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 4a5188f..7111a36 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -27,15 +27,6 @@
 extern "C" {
 #endif
 
-// TODO(hkuang): combine this with TileWorkerData.
-typedef struct TileData {
-  VP9_COMMON *cm;
-  vpx_reader bit_reader;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-} TileData;
-
 typedef struct TileBuffer {
   const uint8_t *data;
   size_t size;
@@ -74,8 +65,6 @@ typedef struct VP9Decoder {
   TileWorkerData *tile_worker_data;
   TileBuffer tile_buffers[64];
   int num_tile_workers;
-
-  TileData *tile_data;
   int total_tiles;
 
   VP9LfSync lf_row_sync;
@@ -128,7 +117,7 @@ void vp9_decoder_remove(struct VP9Decoder *pbi);
 
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
                                       BufferPool *const pool) {
-  if (idx >= 0) {
+  if (idx >= 0 && frame_bufs[idx].ref_count > 0) {
     --frame_bufs[idx].ref_count;
     // A worker may only get a free framebuffer index when calling get_free_fb.
     // But the private buffer is not set up until finish decoding header.
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 5912365..47dc107 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -23,14 +23,6 @@
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        0
-#define TWO_CONTEXT_NODE            1
-#define THREE_CONTEXT_NODE          2
-#define HIGH_LOW_CONTEXT_NODE       3
-#define CAT_ONE_CONTEXT_NODE        4
-#define CAT_THREEFOUR_CONTEXT_NODE  5
-#define CAT_THREE_CONTEXT_NODE      6
-#define CAT_FIVE_CONTEXT_NODE       7
 
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
@@ -53,7 +45,7 @@ static int decode_coefs(const MACROBLOCKD *xd,
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  const int ref = is_inter_block(xd->mi[0]);
   int band, c = 0;
   const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
@@ -65,52 +57,24 @@ static int decode_coefs(const MACROBLOCKD *xd,
   const int dq_shift = (tx_size == TX_32X32);
   int v, token;
   int16_t dqv = dq[0];
-  const uint8_t *cat1_prob;
-  const uint8_t *cat2_prob;
-  const uint8_t *cat3_prob;
-  const uint8_t *cat4_prob;
-  const uint8_t *cat5_prob;
-  const uint8_t *cat6_prob;
+  const uint8_t *const cat6_prob =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 :
+      (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      vp9_cat6_prob;
+  const int cat6_bits =
+#if CONFIG_VP9_HIGHBITDEPTH
+      (xd->bd == VPX_BITS_12) ? 18 :
+      (xd->bd == VPX_BITS_10) ? 16 :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      14;
 
   if (counts) {
     coef_counts = counts->coef[tx_size][type][ref];
     eob_branch_count = counts->eob_branch[tx_size][type][ref];
   }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->bd > VPX_BITS_8) {
-    if (xd->bd == VPX_BITS_10) {
-      cat1_prob = vp9_cat1_prob_high10;
-      cat2_prob = vp9_cat2_prob_high10;
-      cat3_prob = vp9_cat3_prob_high10;
-      cat4_prob = vp9_cat4_prob_high10;
-      cat5_prob = vp9_cat5_prob_high10;
-      cat6_prob = vp9_cat6_prob_high10;
-    } else {
-      cat1_prob = vp9_cat1_prob_high12;
-      cat2_prob = vp9_cat2_prob_high12;
-      cat3_prob = vp9_cat3_prob_high12;
-      cat4_prob = vp9_cat4_prob_high12;
-      cat5_prob = vp9_cat5_prob_high12;
-      cat6_prob = vp9_cat6_prob_high12;
-    }
-  } else {
-    cat1_prob = vp9_cat1_prob;
-    cat2_prob = vp9_cat2_prob;
-    cat3_prob = vp9_cat3_prob;
-    cat4_prob = vp9_cat4_prob;
-    cat5_prob = vp9_cat5_prob;
-    cat6_prob = vp9_cat6_prob;
-  }
-#else
-  cat1_prob = vp9_cat1_prob;
-  cat2_prob = vp9_cat2_prob;
-  cat3_prob = vp9_cat3_prob;
-  cat4_prob = vp9_cat4_prob;
-  cat5_prob = vp9_cat5_prob;
-  cat6_prob = vp9_cat6_prob;
-#endif
-
   while (c < max_eob) {
     int val = -1;
     band = *band_translate++;
@@ -149,39 +113,22 @@ static int decode_coefs(const MACROBLOCKD *xd,
           val = token;
           break;
         case CATEGORY1_TOKEN:
-          val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, r);
+          val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r);
           break;
         case CATEGORY2_TOKEN:
-          val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, r);
+          val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r);
           break;
         case CATEGORY3_TOKEN:
-          val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, r);
+          val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r);
           break;
         case CATEGORY4_TOKEN:
-          val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, r);
+          val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r);
           break;
         case CATEGORY5_TOKEN:
-          val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
+          val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r);
           break;
         case CATEGORY6_TOKEN:
-#if CONFIG_VP9_HIGHBITDEPTH
-          switch (xd->bd) {
-            case VPX_BITS_8:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
-              break;
-            case VPX_BITS_10:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r);
-              break;
-            case VPX_BITS_12:
-              val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r);
-              break;
-            default:
-              assert(0);
-              return -1;
-          }
-#else
-          val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
-#endif
+          val = CAT6_MIN_VAL + read_coeff(cat6_prob, cat6_bits, r);
           break;
       }
     }
@@ -205,65 +152,73 @@ static int decode_coefs(const MACROBLOCKD *xd,
   return c;
 }
 
-// TODO(slavarnway): Decode version of vp9_set_context.  Modify vp9_set_context
-// after testing is complete, then delete this version.
-static
-void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      TX_SIZE tx_size, int has_eob,
-                      int aoff, int loff) {
-  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
-  ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_size_in_blocks = 1 << tx_size;
-
-  // above
-  if (has_eob && xd->mb_to_right_edge < 0) {
-    int i;
-    const int blocks_wide = pd->n4_w +
-                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_size_in_blocks;
-    if (above_contexts + aoff > blocks_wide)
-      above_contexts = blocks_wide - aoff;
-
-    for (i = 0; i < above_contexts; ++i)
-      a[i] = has_eob;
-    for (i = above_contexts; i < tx_size_in_blocks; ++i)
-      a[i] = 0;
-  } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
+                          int x, int y, unsigned int tx_size_in_blocks) {
+  if (xd->max_blocks_wide) {
+    if (tx_size_in_blocks + x > xd->max_blocks_wide)
+      *ctx_shift_a = (tx_size_in_blocks - (xd->max_blocks_wide - x)) * 8;
   }
-
-  // left
-  if (has_eob && xd->mb_to_bottom_edge < 0) {
-    int i;
-    const int blocks_high = pd->n4_h +
-                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_size_in_blocks;
-    if (left_contexts + loff > blocks_high)
-      left_contexts = blocks_high - loff;
-
-    for (i = 0; i < left_contexts; ++i)
-      l[i] = has_eob;
-    for (i = left_contexts; i < tx_size_in_blocks; ++i)
-      l[i] = 0;
-  } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  if (xd->max_blocks_high) {
+    if (tx_size_in_blocks + y > xd->max_blocks_high)
+      *ctx_shift_l = (tx_size_in_blocks - (xd->max_blocks_high - y)) * 8;
   }
 }
 
-int vp9_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, const scan_order *sc,
-                            int x, int y,
-                            TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
+                            int x, int y, TX_SIZE tx_size, vpx_reader *r,
                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
-  const int ctx = get_entropy_context(tx_size, pd->above_context + x,
-                                               pd->left_context + y);
-  const int eob = decode_coefs(xd, get_plane_type(plane),
-                               pd->dqcoeff, tx_size,
-                               dequant, ctx, sc->scan, sc->neighbors, r);
-  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
+  int eob;
+  ENTROPY_CONTEXT *a = pd->above_context + x;
+  ENTROPY_CONTEXT *l = pd->left_context + y;
+  int ctx;
+  int ctx_shift_a = 0;
+  int ctx_shift_l = 0;
+
+  switch (tx_size) {
+    case TX_4X4:
+      ctx  = a[0] != 0;
+      ctx += l[0] != 0;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      a[0] = l[0] = (eob > 0);
+      break;
+    case TX_8X8:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_8X8);
+      ctx  = !!*(const uint16_t *)a;
+      ctx += !!*(const uint16_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint16_t *)a = ((eob > 0) * 0x0101) >> ctx_shift_a;
+      *(uint16_t *)l = ((eob > 0) * 0x0101) >> ctx_shift_l;
+      break;
+    case TX_16X16:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_16X16);
+      ctx  = !!*(const uint32_t *)a;
+      ctx += !!*(const uint32_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint32_t *)a = ((eob > 0) * 0x01010101) >> ctx_shift_a;
+      *(uint32_t *)l = ((eob > 0) * 0x01010101) >> ctx_shift_l;
+      break;
+    case TX_32X32:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_32X32);
+      // NOTE: casting to uint64_t here is safe because the default memory
+      // alignment is at least 8 bytes and the TX_32X32 is aligned on 8 byte
+      // boundaries.
+      ctx  = !!*(const uint64_t *)a;
+      ctx += !!*(const uint64_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint64_t *)a = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_a;
+      *(uint64_t *)l = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_l;
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      eob = 0;
+      break;
+  }
+
   return eob;
 }
-
-
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index 4fbc6db..05b3853 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -29,7 +29,7 @@ static int decode_uniform(vpx_reader *r) {
 }
 
 static int inv_remap_prob(int v, int m) {
-  static int inv_map_table[MAX_PROB] = {
+  static uint8_t inv_map_table[MAX_PROB] = {
       7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
     202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
      12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
diff --git a/vp9/encoder/vp9_aq_360.c b/vp9/encoder/vp9_aq_360.c
new file mode 100644
index 0000000..7d411f6
--- /dev/null
+++ b/vp9/encoder/vp9_aq_360.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const double rate_ratio[MAX_SEGMENTS] =
+  {1.0, 0.75, 0.6, 0.5, 0.4, 0.3, 0.25};
+
+// Sets segment id 0 for the equatorial region, 1 for temperate region
+// and 2 for the polar regions
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows) {
+  if (mi_row < mi_rows / 8 || mi_row > mi_rows - mi_rows / 8)
+    return 2;
+  else if (mi_row < mi_rows / 4 || mi_row > mi_rows - mi_rows / 4)
+    return 1;
+  else
+    return 0;
+}
+
+void vp9_360aq_frame_setup(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  struct segmentation *seg = &cm->seg;
+  int i;
+
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+    vp9_enable_segmentation(seg);
+    vp9_clearall_segfeatures(seg);
+
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    vpx_clear_system_state();
+
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      int qindex_delta =
+          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                     rate_ratio[i], cm->bit_depth);
+
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+
+      // No need to enable SEG_LVL_ALT_Q for this segment.
+      if (rate_ratio[i] == 1.0) {
+        continue;
+      }
+
+      vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    }
+  }
+}
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/encoder/vp9_aq_360.h
similarity index 66%
copy from vp9/decoder/vp9_dsubexp.h
copy to vp9/encoder/vp9_aq_360.h
index a8bcc70..fb861cb 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/encoder/vp9_aq_360.h
@@ -9,19 +9,20 @@
  */
 
 
-#ifndef VP9_DECODER_VP9_DSUBEXP_H_
-#define VP9_DECODER_VP9_DSUBEXP_H_
+#ifndef VP9_ENCODER_VP9_AQ_360_H_
+#define VP9_ENCODER_VP9_AQ_360_H_
 
-#include "vpx_dsp/bitreader.h"
+#include "vp9/encoder/vp9_encoder.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp9_diff_update_prob(vpx_reader *r, vpx_prob* p);
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows);
+void vp9_360aq_frame_setup(VP9_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_DECODER_VP9_DSUBEXP_H_
+#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 30ec191..2d979ec 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -35,9 +35,6 @@ static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
     {-3.5, -2.5, -1.5, 100.00, 100.0},
     {-3.0, -2.0, -1.0, 100.00, 100.0} };
 
-#define DEFAULT_COMPLEXITY 64
-
-
 static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
   // Approximate base quatizer (truncated to int)
   const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
@@ -51,7 +48,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
   // Make SURE use of floating point in this function is safe.
   vpx_clear_system_state();
 
-  if (cm->frame_type == KEY_FRAME ||
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     int segment;
@@ -107,7 +104,6 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
 
 #define DEFAULT_LV_THRESH 10.0
 #define MIN_DEFAULT_LV_THRESH 8.0
-#define VAR_STRENGTH_STEP 0.25
 // Select a segment for the current block.
 // The choice of segment for a block depends on the ratio of the projected
 // bits for the block vs a target average and its spatial complexity.
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 2cd89c0..3e1a0a5 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -23,73 +23,42 @@
 
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
-  size_t consec_zero_mv_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
   if (cr == NULL)
     return NULL;
 
   cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
   if (cr->map == NULL) {
-    vpx_free(cr);
+    vp9_cyclic_refresh_free(cr);
     return NULL;
   }
   last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
   cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
   if (cr->last_coded_q_map == NULL) {
-    vpx_free(cr);
+    vp9_cyclic_refresh_free(cr);
     return NULL;
   }
   assert(MAXQ <= 255);
   memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
-
-  consec_zero_mv_size = mi_rows * mi_cols * sizeof(*cr->consec_zero_mv);
-  cr->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
-  if (cr->consec_zero_mv == NULL) {
-    vpx_free(cr);
-    return NULL;
-  }
-  memset(cr->consec_zero_mv, 0, consec_zero_mv_size);
   return cr;
 }
 
 void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   vpx_free(cr->map);
   vpx_free(cr->last_coded_q_map);
-  vpx_free(cr->consec_zero_mv);
   vpx_free(cr);
 }
 
-// Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
-                                        const RATE_CONTROL *rc) {
-  // Turn off cyclic refresh if bits available per frame is not sufficiently
-  // larger than bit cost of segmentation. Segment map bit cost should scale
-  // with number of seg blocks, so compare available bits to number of blocks.
-  // Average bits available per frame = avg_frame_bandwidth
-  // Number of (8x8) blocks in frame = mi_rows * mi_cols;
-  const float factor = 0.25;
-  const int number_blocks = cm->mi_rows  * cm->mi_cols;
-  // The condition below corresponds to turning off at target bitrates:
-  // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kps for HD/720p.
-  // Also turn off at very small frame sizes, to avoid too large fraction of
-  // superblocks to be refreshed per frame. Threshold below is less than QCIF.
-  if (rc->avg_frame_bandwidth < factor * number_blocks ||
-      number_blocks / 64 < 5)
-    return 0;
-  else
-    return 1;
-}
-
 // Check if this coding block, of size bsize, should be considered for refresh
 // (lower-qp coding). Decision can be based on various factors, such as
 // size of the coding block (i.e., below min_block size rejected), coding
 // mode, and rate/distortion.
 static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
-                                const MB_MODE_INFO *mbmi,
+                                const MODE_INFO *mi,
                                 int64_t rate,
                                 int64_t dist,
                                 int bsize) {
-  MV mv = mbmi->mv[0].as_mv;
+  MV mv = mi->mv[0].as_mv;
   // Reject the block for lower-qp coding if projected distortion
   // is above the threshold, and any of the following is true:
   // 1) mode uses large mv
@@ -98,12 +67,12 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
   if (dist > cr->thresh_dist_sb &&
       (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
        mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
-       !is_inter_block(mbmi)))
+       !is_inter_block(mi)))
     return CR_SEGMENT_ID_BASE;
   else  if (bsize >= BLOCK_16X16 &&
             rate < cr->thresh_rate_sb &&
-            is_inter_block(mbmi) &&
-            mbmi->mv[0].as_int == 0 &&
+            is_inter_block(mi) &&
+            mi->mv[0].as_int == 0 &&
             cr->rate_boost_fac > 10)
     // More aggressive delta-q for bigger blocks with zero motion.
     return CR_SEGMENT_ID_BOOST2;
@@ -186,12 +155,13 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
 void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
-                                       MB_MODE_INFO *const mbmi,
+                                       MODE_INFO *const mi,
                                        int mi_row, int mi_col,
                                        BLOCK_SIZE bsize,
                                        int64_t rate,
                                        int64_t dist,
-                                       int skip) {
+                                       int skip,
+                                       struct macroblock_plane *const p) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -199,26 +169,44 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
   const int block_index = mi_row * cm->mi_cols + mi_col;
-  const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
-                                                      bsize);
+  int refresh_this_block = candidate_refresh_aq(cr, mi, rate, dist, bsize);
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
   int x = 0; int y = 0;
 
+  int is_skin = 0;
+  if (refresh_this_block == 0 &&
+      bsize <= BLOCK_16X16 &&
+      cpi->use_skin_detection) {
+    is_skin = vp9_compute_skin_block(p[0].src.buf,
+                                     p[1].src.buf,
+                                     p[2].src.buf,
+                                     p[0].src.stride,
+                                     p[1].src.stride,
+                                     bsize,
+                                     0,
+                                     0);
+    if (is_skin)
+      refresh_this_block = 1;
+  }
+
+  if (cpi->oxcf.rc_mode == VPX_VBR && mi->ref_frame[0] == GOLDEN_FRAME)
+    refresh_this_block = 0;
+
   // If this block is labeled for refresh, check if we should reset the
   // segment_id.
-  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
-    mbmi->segment_id = refresh_this_block;
+  if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+    mi->segment_id = refresh_this_block;
     // Reset segment_id if it will be skipped.
     if (skip)
-      mbmi->segment_id = CR_SEGMENT_ID_BASE;
+      mi->segment_id = CR_SEGMENT_ID_BASE;
   }
 
   // Update the cyclic refresh map, to be used for setting segmentation map
   // for the next frame. If the block  will be refreshed this frame, mark it
   // as clean. The magnitude of the -ve influences how long before we consider
   // it for refresh again.
-  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+  if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
     new_map_value = -cr->time_for_refresh;
   } else if (refresh_this_block) {
     // Else if it is accepted as candidate for refresh, and has not already
@@ -237,17 +225,16 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
     for (x = 0; x < xmis; x++) {
       int map_offset = block_index + y * cm->mi_cols + x;
       cr->map[map_offset] = new_map_value;
-      cpi->segmentation_map[map_offset] = mbmi->segment_id;
+      cpi->segmentation_map[map_offset] = mi->segment_id;
     }
 }
 
 void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
-                                             const MB_MODE_INFO *const mbmi,
+                                             const MODE_INFO *const mi,
                                              int mi_row, int mi_col,
                                              BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  MV mv = mbmi->mv[0].as_mv;
   const int bw = num_8x8_blocks_wide_lookup[bsize];
   const int bh = num_8x8_blocks_high_lookup[bsize];
   const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
@@ -261,25 +248,18 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
       // don't update the map for them. For cases where motion is non-zero or
       // the reference frame isn't the previous frame, the previous value in
       // the map for this spatial location is not entirely correct.
-      if ((!is_inter_block(mbmi) || !mbmi->skip) &&
-          mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+      if ((!is_inter_block(mi) || !mi->skip) &&
+          mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
         cr->last_coded_q_map[map_offset] = clamp(
-            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
-      } else if (is_inter_block(mbmi) && mbmi->skip &&
-                 mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+            cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ);
+      } else if (is_inter_block(mi) && mi->skip &&
+                 mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
         cr->last_coded_q_map[map_offset] = VPXMIN(
-            clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+            clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id],
                   0, MAXQ),
             cr->last_coded_q_map[map_offset]);
-      // Update the consecutive zero/low_mv count.
-      if (is_inter_block(mbmi) && (abs(mv.row) < 8 && abs(mv.col) < 8)) {
-        if (cr->consec_zero_mv[map_offset] < 255)
-          cr->consec_zero_mv[map_offset]++;
-      } else {
-        cr->consec_zero_mv[map_offset] = 0;
       }
     }
-  }
 }
 
 // Update the actual number of blocks that were applied the segment delta q.
@@ -305,13 +285,15 @@ void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
 void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  // Set minimum gf_interval for GF update to a multiple (== 2) of refresh
-  // period. Depending on past encoding stats, GF flag may be reset and update
-  // may not occur until next baseline_gf_interval.
+  // Set minimum gf_interval for GF update to a multiple of the refresh period,
+  // with some max limit. Depending on past encoding stats, GF flag may be
+  // reset and update may not occur until next baseline_gf_interval.
   if (cr->percent_refresh > 0)
-    rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+    rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40);
   else
     rc->baseline_gf_interval = 40;
+  if (cpi->oxcf.rc_mode == VPX_VBR)
+    rc->baseline_gf_interval = 20;
 }
 
 // Update some encoding stats (from the just encoded frame). If this frame's
@@ -324,42 +306,40 @@ void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
   int mi_row, mi_col;
   double fraction_low = 0.0;
   int low_content_frame = 0;
-
   MODE_INFO **mi = cm->mi_grid_visible;
   RATE_CONTROL *const rc = &cpi->rc;
   const int rows = cm->mi_rows, cols = cm->mi_cols;
   int cnt1 = 0, cnt2 = 0;
   int force_gf_refresh = 0;
-
+  int flag_force_gf_high_motion = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
-          mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
-      int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 ?
-          mi[0]->mbmi.mv[0].as_mv.col : -1 * mi[0]->mbmi.mv[0].as_mv.col;
-
-      // Calculate the motion of the background.
-      if (abs_mvr <= 16 && abs_mvc <= 16) {
-        cnt1++;
-        if (abs_mvr == 0 && abs_mvc == 0)
-          cnt2++;
+      if (flag_force_gf_high_motion == 1) {
+        int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
+            mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
+        int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
+            mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
+        // Calculate the motion of the background.
+        if (abs_mvr <= 16 && abs_mvc <= 16) {
+          cnt1++;
+          if (abs_mvr == 0 && abs_mvc == 0)
+            cnt2++;
+        }
       }
       mi++;
-
       // Accumulate low_content_frame.
       if (cr->map[mi_row * cols + mi_col] < 1)
         low_content_frame++;
     }
     mi += 8;
   }
-
   // For video conference clips, if the background has high motion in current
   // frame because of the camera movement, set this frame as the golden frame.
   // Use 70% and 5% as the thresholds for golden frame refreshing.
   // Also, force this frame as a golden update frame if this frame will change
   // the resolution (resize_pending != 0).
   if (cpi->resize_pending != 0 ||
-     (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+     (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
     vp9_cyclic_refresh_set_golden_update(cpi);
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
@@ -368,7 +348,6 @@ void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
     cpi->refresh_golden_frame = 1;
     force_gf_refresh = 1;
   }
-
   fraction_low =
       (double)low_content_frame / (rows * cols);
   // Update average.
@@ -412,12 +391,20 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
   assert(cr->sb_index < sbs_in_frame);
   i = cr->sb_index;
   cr->target_num_seg_blocks = 0;
-  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
     consec_zero_mv_thresh = 100;
+  }
   qindex_thresh =
       cpi->oxcf.content == VP9E_CONTENT_SCREEN
       ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
       : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
+  // More aggressive settings for noisy content.
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+    consec_zero_mv_thresh = 80;
+    qindex_thresh =
+        VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
+                              7 * cm->base_qindex >> 3);
+  }
   do {
     int sum_map = 0;
     // Get the mi_row/mi_col corresponding to superblock index i.
@@ -442,7 +429,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         if (cr->map[bl_index2] == 0) {
           count_tot++;
           if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
-              cr->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+              cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
             sum_map++;
             count_sel++;
           }
@@ -481,29 +468,46 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
     cr->percent_refresh = 5;
   cr->max_qdelta_perc = 50;
   cr->time_for_refresh = 0;
+  cr->motion_thresh = 32;
+  cr->rate_boost_fac = 15;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
   // periods of the refresh cycle, after a key frame.
   // Account for larger interval on base layer for temporal layers.
   if (cr->percent_refresh > 0 &&
       rc->frames_since_key <  (4 * cpi->svc.number_temporal_layers) *
-      (100 / cr->percent_refresh))
+      (100 / cr->percent_refresh)) {
     cr->rate_ratio_qdelta = 3.0;
-  else
+  } else {
     cr->rate_ratio_qdelta = 2.0;
+    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
+      // Reduce the delta-qp if the estimated source noise is above threshold.
+      cr->rate_ratio_qdelta = 1.7;
+      cr->rate_boost_fac = 13;
+    }
+  }
   // Adjust some parameters for low resolutions at low bitrates.
   if (cm->width <= 352 &&
       cm->height <= 288 &&
       rc->avg_frame_bandwidth < 3400) {
     cr->motion_thresh = 4;
     cr->rate_boost_fac = 10;
-  } else {
-    cr->motion_thresh = 32;
-    cr->rate_boost_fac = 15;
   }
   if (cpi->svc.spatial_layer_id > 0) {
     cr->motion_thresh = 4;
     cr->rate_boost_fac = 12;
   }
+  if (cpi->oxcf.rc_mode == VPX_VBR) {
+    // To be adjusted for VBR mode, e.g., based on gf period and boost.
+    // For now use smaller qp-delta (than CBR), no second boosted seg, and
+    // turn-off (no refresh) on golden refresh (since it's already boosted).
+    cr->percent_refresh = 10;
+    cr->rate_ratio_qdelta = 1.5;
+    cr->rate_boost_fac = 10;
+    if (cpi->refresh_golden_frame == 1) {
+      cr->percent_refresh = 0;
+      cr->rate_ratio_qdelta = 1.0;
+    }
+  }
 }
 
 // Setup cyclic background refresh: set delta q and segmentation map.
@@ -512,7 +516,10 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   struct segmentation *const seg = &cm->seg;
-  const int apply_cyclic_refresh  = apply_cyclic_refresh_bitrate(cm, rc);
+  // TODO(marpan): Look into whether we should reduce the amount/delta-qp
+  // instead of completely shutting off at low bitrates. For now keep it on.
+  // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+  const int apply_cyclic_refresh = 1;
   if (cm->current_video_frame == 0)
     cr->low_content_avg = 0.0;
   // Don't apply refresh on key frame or temporal enhancement layer frames.
@@ -526,8 +533,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     if (cm->frame_type == KEY_FRAME) {
       memset(cr->last_coded_q_map, MAXQ,
              cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
-      memset(cr->consec_zero_mv, 0,
-             cm->mi_rows * cm->mi_cols * sizeof(*cr->consec_zero_mv));
       cr->sb_index = 0;
     }
     return;
@@ -602,7 +607,7 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
   memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols);
-  memset(cr->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols);
   cr->sb_index = 0;
   cpi->refresh_golden_frame = 1;
+  cpi->refresh_alt_ref_frame = 1;
 }
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index a5b3813..35eea18 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -14,6 +14,8 @@
 
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,8 +53,6 @@ struct CYCLIC_REFRESH {
   signed char *map;
   // Map of the last q a block was coded at.
   uint8_t *last_coded_q_map;
-  // Count on how many consecutive times a block uses ZER0MV for encoding.
-  uint8_t *consec_zero_mv;
   // Thresholds applied to the projected rate/distortion of the coding block,
   // when deciding whether block should be refreshed.
   int64_t thresh_rate_sb;
@@ -91,12 +91,13 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
 // check if we should reset the segment_id, and update the cyclic_refresh map
 // and segmentation map.
 void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
-                                       MB_MODE_INFO *const mbmi,
+                                       MODE_INFO *const mi,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
-                                       int64_t rate, int64_t dist, int skip);
+                                       int64_t rate, int64_t dist, int skip,
+                                       struct macroblock_plane *const p);
 
 void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
-                                             const MB_MODE_INFO *const mbmi,
+                                             const MODE_INFO *const mi,
                                              int mi_row, int mi_col,
                                              BLOCK_SIZE bsize);
 
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 1c99105..59ef5fa 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -48,7 +48,7 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   struct segmentation *seg = &cm->seg;
   int i;
 
-  if (cm->frame_type == KEY_FRAME ||
+  if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     vp9_enable_segmentation(seg);
@@ -167,7 +167,7 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                 vp9_64_zeros, 0, bw, bh, &sse, &avg);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
-    return (256 * var) / (bw * bh);
+    return (unsigned int)(((uint64_t)256 * var) / (bw * bh));
   } else {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -185,7 +185,7 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
                              x->plane[0].src.stride,
                              vp9_64_zeros, 0, &sse);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    return (256 * var) >> num_pels_log2_lookup[bs];
+    return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]);
   }
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 4615554..73a2db0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -79,8 +79,8 @@ static void prob_diff_update(const vpx_tree_index *tree,
 
 static void write_selected_tx_size(const VP9_COMMON *cm,
                                    const MACROBLOCKD *xd, vpx_writer *w) {
-  TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  TX_SIZE tx_size = xd->mi[0]->tx_size;
+  BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
                                                  &cm->fc->tx_probs);
@@ -97,7 +97,7 @@ static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
-    const int skip = mi->mbmi.skip;
+    const int skip = mi->skip;
     vpx_write(w, skip, vp9_get_skip_prob(cm, xd));
     return skip;
   }
@@ -123,72 +123,66 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
 static void pack_mb_tokens(vpx_writer *w,
                            TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth) {
-  TOKENEXTRA *p = *tp;
-
-  while (p < stop && p->token != EOSB_TOKEN) {
-    const int t = p->token;
-    const struct vp9_token *const a = &vp9_coef_encodings[t];
-    int i = 0;
-    int v = a->value;
-    int n = a->len;
+  const TOKENEXTRA *p;
+  const vp9_extra_bit *const extra_bits =
 #if CONFIG_VP9_HIGHBITDEPTH
-    const vp9_extra_bit *b;
-    if (bit_depth == VPX_BITS_12)
-      b = &vp9_extra_bits_high12[t];
-    else if (bit_depth == VPX_BITS_10)
-      b = &vp9_extra_bits_high10[t];
-    else
-      b = &vp9_extra_bits[t];
+    (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 :
+    (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 :
+    vp9_extra_bits;
 #else
-    const vp9_extra_bit *const b = &vp9_extra_bits[t];
+    vp9_extra_bits;
     (void) bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    /* skip one or two nodes */
-    if (p->skip_eob_node) {
-      n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
+  for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
+    if (p->token == EOB_TOKEN) {
+      vpx_write(w, 0, p->context_tree[0]);
+      continue;
     }
-
-    // TODO(jbb): expanding this can lead to big gains.  It allows
-    // much better branch prediction and would enable us to avoid numerous
-    // lookups and compares.
-
-    // If we have a token that's in the constrained set, the coefficient tree
-    // is split into two treed writes.  The first treed write takes care of the
-    // unconstrained nodes.  The second treed write takes care of the
-    // constrained nodes.
-    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
-      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-      int bits = v >> (n - len);
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
-      vp9_write_tree(w, vp9_coef_con_tree,
-                     vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                     v, n - len, 0);
-    } else {
-      vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
+    vpx_write(w, 1, p->context_tree[0]);
+    while (p->token == ZERO_TOKEN) {
+      vpx_write(w, 0, p->context_tree[1]);
+      ++p;
+      if (p == stop || p->token == EOSB_TOKEN) {
+        *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
+        return;
+      }
     }
 
-    if (b->base_val) {
-      const int e = p->extra, l = b->len;
-
-      if (l) {
-        const unsigned char *pb = b->prob;
-        int v = e >> 1;
-        int n = l;              /* number of bits in v, assumed nonzero */
-
-        do {
-          const int bb = (v >> --n) & 1;
-          vpx_write(w, bb, *pb++);
-        } while (n);
+    {
+      const int t = p->token;
+      const vpx_prob *const context_tree = p->context_tree;
+      assert(t != ZERO_TOKEN);
+      assert(t != EOB_TOKEN);
+      assert(t != EOSB_TOKEN);
+      vpx_write(w, 1, context_tree[1]);
+      if (t == ONE_TOKEN) {
+        vpx_write(w, 0, context_tree[2]);
+        vpx_write_bit(w, p->extra & 1);
+      } else {  // t >= TWO_TOKEN && t < EOB_TOKEN
+        const struct vp9_token *const a = &vp9_coef_encodings[t];
+        const int v = a->value;
+        const int n = a->len;
+        const int e = p->extra;
+        vpx_write(w, 1, context_tree[2]);
+        vp9_write_tree(w, vp9_coef_con_tree,
+                       vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
+                       n - UNCONSTRAINED_NODES, 0);
+        if (t >= CATEGORY1_TOKEN) {
+          const vp9_extra_bit *const b = &extra_bits[t];
+          const unsigned char *pb = b->prob;
+          int v = e >> 1;
+          int n = b->len;  // number of bits in v, assumed nonzero
+          do {
+            const int bb = (v >> --n) & 1;
+            vpx_write(w, bb, *pb++);
+          } while (n);
+        }
+        vpx_write_bit(w, e & 1);
       }
-
-      vpx_write_bit(w, e & 1);
     }
-    ++p;
   }
-
-  *tp = p + (p->token == EOSB_TOKEN);
+  *tp = (TOKENEXTRA*)(uintptr_t)p + (p->token == EOSB_TOKEN);
 }
 
 static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
@@ -200,15 +194,15 @@ static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
 // This function encodes the reference frame
 static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              vpx_writer *w) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int is_compound = has_second_ref(mbmi);
-  const int segment_id = mbmi->segment_id;
+  const MODE_INFO *const mi = xd->mi[0];
+  const int is_compound = has_second_ref(mi);
+  const int segment_id = mi->segment_id;
 
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     assert(!is_compound);
-    assert(mbmi->ref_frame[0] ==
+    assert(mi->ref_frame[0] ==
                get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
   } else {
     // does the feature use compound prediction or not
@@ -220,13 +214,13 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     }
 
     if (is_compound) {
-      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
+      vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
-      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+      const int bit0 = mi->ref_frame[0] != LAST_FRAME;
       vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
       if (bit0) {
-        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        const int bit1 = mi->ref_frame[0] != GOLDEN_FRAME;
         vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
       }
     }
@@ -240,19 +234,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
   const MACROBLOCK *const x = &cpi->td.mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const PREDICTION_MODE mode = mbmi->mode;
-  const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const PREDICTION_MODE mode = mi->mode;
+  const int segment_id = mi->segment_id;
+  const BLOCK_SIZE bsize = mi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
-  const int is_inter = is_inter_block(mbmi);
-  const int is_compound = has_second_ref(mbmi);
+  const int is_inter = is_inter_block(mi);
+  const int is_compound = has_second_ref(mi);
   int skip, ref;
 
   if (seg->update_map) {
     if (seg->temporal_update) {
-      const int pred_flag = mbmi->seg_id_predicted;
+      const int pred_flag = mi->seg_id_predicted;
       vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
       vpx_write(w, pred_flag, pred_prob);
       if (!pred_flag)
@@ -286,9 +279,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
         }
       }
     }
-    write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
+    write_intra_mode(w, mi->uv_mode, cm->fc->uv_mode_prob[mode]);
   } else {
-    const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+    const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
     const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
     write_ref_frames(cm, xd, w);
 
@@ -303,10 +296,10 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       vp9_write_token(w, vp9_switchable_interp_tree,
                       cm->fc->switchable_interp_prob[ctx],
-                      &switchable_interp_encodings[mbmi->interp_filter]);
-      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+                      &switchable_interp_encodings[mi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mi->interp_filter];
     } else {
-      assert(mbmi->interp_filter == cm->interp_filter);
+      assert(mi->interp_filter == cm->interp_filter);
     }
 
     if (bsize < BLOCK_8X8) {
@@ -321,7 +314,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
           if (b_mode == NEWMV) {
             for (ref = 0; ref < 1 + is_compound; ++ref)
               vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
-                            &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+                            &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv,
                             nmvc, allow_hp);
           }
         }
@@ -329,8 +322,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
     } else {
       if (mode == NEWMV) {
         for (ref = 0; ref < 1 + is_compound; ++ref)
-          vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
-                        &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
+          vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv,
+                        &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc,
                         allow_hp);
       }
     }
@@ -343,19 +336,18 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
 
   if (seg->update_map)
-    write_segment_id(w, seg, mbmi->segment_id);
+    write_segment_id(w, seg, mi->segment_id);
 
-  write_skip(cm, xd, mbmi->segment_id, mi, w);
+  write_skip(cm, xd, mi->segment_id, mi, w);
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
     write_selected_tx_size(cm, xd, w);
 
   if (bsize >= BLOCK_8X8) {
-    write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
+    write_intra_mode(w, mi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
   } else {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -370,7 +362,7 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     }
   }
 
-  write_intra_mode(w, mbmi->uv_mode, vp9_kf_uv_mode_prob[mbmi->mode]);
+  write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
@@ -388,8 +380,8 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
       (mi_row * cm->mi_cols + mi_col);
 
   set_mi_row_col(xd, tile,
-                 mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
-                 mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
+                 mi_row, num_8x8_blocks_high_lookup[m->sb_type],
+                 mi_col, num_8x8_blocks_wide_lookup[m->sb_type],
                  cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cm, xd, xd->mi, w);
@@ -441,7 +433,7 @@ static void write_modes_sb(VP9_COMP *cpi,
 
   m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
 
-  partition = partition_lookup[bsl][m->mbmi.sb_type];
+  partition = partition_lookup[bsl][m->sb_type];
   write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
@@ -553,8 +545,8 @@ static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
-                      frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+                      stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -592,7 +584,7 @@ static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      *oldp, &newp, upd, stepsize);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -630,7 +622,7 @@ static void update_coef_probs_common(vpx_writer* const bc, VP9_COMP *cpi,
                 if (t == PIVOT_NODE) {
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      *oldp, &newp, upd, stepsize);
                 } else {
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -899,7 +891,7 @@ static void write_tile_info(const VP9_COMMON *const cm,
     vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
 }
 
-static int get_refresh_mask(VP9_COMP *cpi) {
+int vp9_get_refresh_mask(VP9_COMP *cpi) {
   if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -1115,11 +1107,11 @@ static void write_uncompressed_header(VP9_COMP *cpi,
         write_bitdepth_colorspace_sampling(cm, wb);
       }
 
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
-      vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+      vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index da6b414..f24d20f 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -18,6 +18,8 @@ extern "C" {
 
 #include "vp9/encoder/vp9_encoder.h"
 
+int vp9_get_refresh_mask(VP9_COMP *cpi);
+
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index fc34786..069c335 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -65,12 +65,20 @@ struct macroblock {
   int skip_optimize;
   int q_index;
 
+  // The equivalent error at the current rdmult of one whole bit (not one
+  // bitcost unit).
   int errorperbit;
+  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // for large blocks.
   int sadperbit16;
+  // The equivalend SAD error of one (whole) bit at the current quantizer
+  // for sub-8x8 blocks.
   int sadperbit4;
   int rddiv;
   int rdmult;
   int mb_energy;
+  int * m_search_count_ptr;
+  int * ex_search_count_ptr;
 
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
@@ -135,6 +143,13 @@ struct macroblock {
   // the visual quality at the boundary of moving color objects.
   uint8_t color_sensitivity[2];
 
+  uint8_t sb_is_skin;
+
+  // Used to save the status of whether a block has a low variance in
+  // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+  // 32x32, 9~24 for 16x16.
+  uint8_t variance_low[25];
+
   void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
   void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 8e365ce..86ba03d 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -60,6 +60,7 @@ typedef struct {
 #if CONFIG_VP9_TEMPORAL_DENOISING
   unsigned int newmv_sse;
   unsigned int zeromv_sse;
+  unsigned int zeromv_lastref_sse;
   PREDICTION_MODE best_sse_inter_mode;
   int_mv best_sse_mv;
   MV_REFERENCE_FRAME best_reference_frame;
diff --git a/vp9/encoder/vp9_cost.c b/vp9/encoder/vp9_cost.c
index e2fbb34..5d14742 100644
--- a/vp9/encoder/vp9_cost.c
+++ b/vp9/encoder/vp9_cost.c
@@ -11,35 +11,38 @@
 
 #include "vp9/encoder/vp9_cost.h"
 
-const unsigned int vp9_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
-  1129, 1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,
-  873,  858,  843,  829,  816,  803,  790,  778,  767,  755,  744,  733,
-  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,
-  534,  528,  522,  516,  511,  505,  499,  494,  488,  483,  477,  472,
-  467,  462,  457,  452,  447,  442,  437,  433,  428,  424,  419,  415,
-  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,
-  317,  314,  311,  307,  304,  301,  297,  294,  291,  288,  285,  281,
-  278,  275,  272,  269,  266,  263,  260,  257,  255,  252,  249,  246,
-  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,
-  181,  179,  177,  174,  172,  170,  168,  165,  163,  161,  159,  156,
-  154,  152,  150,  148,  145,  143,  141,  139,  137,  135,  133,  131,
-  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,  99,   97,   95,   93,   92,   90,   88,   86,   84,
-  82,   81,   79,   77,   75,   73,   72,   70,   68,   66,   65,   63,
-  61,   60,   58,   56,   55,   53,   51,   50,   48,   46,   45,   43,
-  41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
-  4,    3,    1,    1};
+/* round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT))
+   Begins with a bogus entry for simpler addressing. */
+const uint16_t vp9_prob_cost[256] = {
+    4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325,
+    2260, 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780,
+    1748, 1718, 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470,
+    1449, 1429, 1409, 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252,
+    1236, 1221, 1206, 1192, 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084,
+    1072, 1059, 1047, 1036, 1024, 1013, 1001, 990,  979,  968,  958,  947,
+    937,  927,  917,  907,  897,  887,  878,  868,  859,  850,  841,  832,
+    823,  814,  806,  797,  789,  780,  772,  764,  756,  748,  740,  732,
+    724,  717,  709,  702,  694,  687,  680,  673,  665,  658,  651,  644,
+    637,  631,  624,  617,  611,  604,  598,  591,  585,  578,  572,  566,
+    560,  554,  547,  541,  535,  530,  524,  518,  512,  506,  501,  495,
+    489,  484,  478,  473,  467,  462,  456,  451,  446,  441,  435,  430,
+    425,  420,  415,  410,  405,  400,  395,  390,  385,  380,  375,  371,
+    366,  361,  356,  352,  347,  343,  338,  333,  329,  324,  320,  316,
+    311,  307,  302,  298,  294,  289,  285,  281,  277,  273,  268,  264,
+    260,  256,  252,  248,  244,  240,  236,  232,  228,  224,  220,  216,
+    212,  209,  205,  201,  197,  194,  190,  186,  182,  179,  175,  171,
+    168,  164,  161,  157,  153,  150,  146,  143,  139,  136,  132,  129,
+    125,  122,  119,  115,  112,  109,  105,  102,  99,   95,   92,   89,
+    86,   82,   79,   76,   73,   70,   66,   63,   60,   57,   54,   51,
+    48,   45,   42,   38,   35,   32,   29,   26,   23,   20,   18,   15,
+    12,   9,    6,    3};
 
 static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
                  int i, int c) {
   const vpx_prob prob = probs[i / 2];
   int b;
 
+  assert(prob != 0);
   for (b = 0; b <= 1; ++b) {
     const int cc = c + vp9_cost_bit(prob, b);
     const vpx_tree_index ii = tree[i + b];
diff --git a/vp9/encoder/vp9_cost.h b/vp9/encoder/vp9_cost.h
index eac74c4..0c70b78 100644
--- a/vp9/encoder/vp9_cost.h
+++ b/vp9/encoder/vp9_cost.h
@@ -12,18 +12,22 @@
 #define VP9_ENCODER_VP9_COST_H_
 
 #include "vpx_dsp/prob.h"
+#include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-extern const unsigned int vp9_prob_cost[256];
+extern const uint16_t vp9_prob_cost[256];
+
+// The factor to scale from cost in bits to cost in vp9_prob_cost units.
+#define VP9_PROB_COST_SHIFT 9
 
 #define vp9_cost_zero(prob) (vp9_prob_cost[prob])
 
-#define vp9_cost_one(prob) vp9_cost_zero(vpx_complement(prob))
+#define vp9_cost_one(prob) vp9_cost_zero(256 - (prob))
 
-#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vpx_complement(prob) \
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) \
                                                     : (prob))
 
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 8623b42..42d456e 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -21,12 +21,6 @@
 #include "vp9/encoder/vp9_denoiser.h"
 #include "vp9/encoder/vp9_encoder.h"
 
-/* The VP9 denoiser is similar to that of the VP8 denoiser. While
- * choosing the motion vectors / reference frames, the denoiser is run, and if
- * it did not modify the signal to much, the denoised block is copied to the
- * signal.
- */
-
 #ifdef OUTPUT_YUV_DENOISED
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
 #endif
@@ -49,16 +43,19 @@ static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
 }
 
 static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
-  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
+  return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
 }
 
 static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
                            int motion_magnitude) {
   if (motion_magnitude >
       noise_motion_thresh(bs, increase_denoising)) {
-    return 0;
+    if (increase_denoising)
+      return (1 << num_pels_log2_lookup[bs]) << 2;
+    else
+      return 0;
   } else {
-    return (1 << num_pels_log2_lookup[bs]) * 20;
+    return (1 << num_pels_log2_lookup[bs]) << 4;
   }
 }
 
@@ -183,7 +180,7 @@ int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
 
 static uint8_t *block_start(uint8_t *framebuf, int stride,
                             int mi_row, int mi_col) {
-  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+  return framebuf + (stride * mi_row  << 3) + (mi_col << 3);
 }
 
 static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
@@ -193,90 +190,101 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                                                          int mi_row,
                                                          int mi_col,
                                                          PICK_MODE_CONTEXT *ctx,
-                                                         int *motion_magnitude,
-                                                         int is_skin) {
-  int mv_col, mv_row;
+                                                         int motion_magnitude,
+                                                         int is_skin,
+                                                         int *zeromv_filter,
+                                                         int consec_zeromv) {
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
-  MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
-  MB_MODE_INFO saved_mbmi;
-  int i, j;
+  MODE_INFO *mi = filter_mbd->mi[0];
+  MODE_INFO saved_mi;
+  int i;
   struct buf_2d saved_dst[MAX_MB_PLANE];
-  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+  struct buf_2d saved_pre[MAX_MB_PLANE];
 
-  mv_col = ctx->best_sse_mv.as_mv.col;
-  mv_row = ctx->best_sse_mv.as_mv.row;
-  *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
   frame = ctx->best_reference_frame;
+  saved_mi = *mi;
 
-  saved_mbmi = *mbmi;
+  if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4))
+    return COPY_BLOCK;
 
-  if (is_skin && *motion_magnitude > 16)
+  // Avoid denoising for small block (unless motion is small).
+  // Small blocks are selected in variance partition (before encoding) and
+  // will typically lie on moving areas.
+  if (denoiser->denoising_level < kDenHigh &&
+      motion_magnitude > 16 && bs <= BLOCK_8X8)
     return COPY_BLOCK;
 
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
   if (frame != INTRA_FRAME &&
-      sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
-    mbmi->ref_frame[0] = ctx->best_reference_frame;
-    mbmi->mode = ctx->best_sse_inter_mode;
-    mbmi->mv[0] = ctx->best_sse_mv;
+      ctx->newmv_sse != UINT_MAX &&
+      sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+    mi->ref_frame[0] = ctx->best_reference_frame;
+    mi->mode = ctx->best_sse_inter_mode;
+    mi->mv[0] = ctx->best_sse_mv;
   } else {
     // Otherwise, use the zero reference frame.
     frame = ctx->best_zeromv_reference_frame;
-
-    mbmi->ref_frame[0] = ctx->best_zeromv_reference_frame;
-    mbmi->mode = ZEROMV;
-    mbmi->mv[0].as_int = 0;
-
+    ctx->newmv_sse = ctx->zeromv_sse;
+    // Bias to last reference.
+    if (frame != LAST_FRAME &&
+        ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+         denoiser->denoising_level >= kDenHigh)) {
+      frame = LAST_FRAME;
+      ctx->newmv_sse = ctx->zeromv_lastref_sse;
+    }
+    mi->ref_frame[0] = frame;
+    mi->mode = ZEROMV;
+    mi->mv[0].as_int = 0;
     ctx->best_sse_inter_mode = ZEROMV;
     ctx->best_sse_mv.as_int = 0;
-    ctx->newmv_sse = ctx->zeromv_sse;
+    *zeromv_filter = 1;
+    if (denoiser->denoising_level > kDenMedium) {
+      motion_magnitude = 0;
+    }
   }
 
   if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
     // Restore everything to its original state
-    *mbmi = saved_mbmi;
+    *mi = saved_mi;
     return COPY_BLOCK;
   }
-  if (*motion_magnitude >
+  if (motion_magnitude >
      (noise_motion_thresh(bs, increase_denoising) << 3)) {
     // Restore everything to its original state
-    *mbmi = saved_mbmi;
+    *mi = saved_mi;
     return COPY_BLOCK;
   }
 
   // We will restore these after motion compensation.
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
-    }
+    saved_pre[i] = filter_mbd->plane[i].pre[0];
     saved_dst[i] = filter_mbd->plane[i].dst;
   }
 
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
-  for (j = 0; j < 2; ++j) {
-    filter_mbd->plane[0].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].y_buffer,
-                    denoiser->running_avg_y[frame].y_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[0].pre[j].stride =
-        denoiser->running_avg_y[frame].y_stride;
-    filter_mbd->plane[1].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].u_buffer,
-                    denoiser->running_avg_y[frame].uv_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[1].pre[j].stride =
-        denoiser->running_avg_y[frame].uv_stride;
-    filter_mbd->plane[2].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].v_buffer,
-                    denoiser->running_avg_y[frame].uv_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[2].pre[j].stride =
-        denoiser->running_avg_y[frame].uv_stride;
-  }
+  filter_mbd->plane[0].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].y_buffer,
+                  denoiser->running_avg_y[frame].y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].pre[0].stride =
+      denoiser->running_avg_y[frame].y_stride;
+  filter_mbd->plane[1].pre[0].buf =
+       block_start(denoiser->running_avg_y[frame].u_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].pre[0].stride =
+      denoiser->running_avg_y[frame].uv_stride;
+  filter_mbd->plane[2].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].v_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].pre[0].stride =
+      denoiser->running_avg_y[frame].uv_stride;
+
   filter_mbd->plane[0].dst.buf =
       block_start(denoiser->mc_running_avg_y.y_buffer,
                   denoiser->mc_running_avg_y.y_stride,
@@ -293,27 +301,26 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                   mi_row, mi_col);
   filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
 
-  vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
+  vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
 
   // Restore everything to its original state
-  *mbmi = saved_mbmi;
+  *mi = saved_mi;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
-    }
+    filter_mbd->plane[i].pre[0] = saved_pre[i];
     filter_mbd->plane[i].dst = saved_dst[i];
   }
 
-  mv_row = ctx->best_sse_mv.as_mv.row;
-  mv_col = ctx->best_sse_mv.as_mv.col;
-
   return FILTER_BLOCK;
 }
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
-                          PICK_MODE_CONTEXT *ctx) {
+                          PICK_MODE_CONTEXT *ctx,
+                          VP9_DENOISER_DECISION *denoiser_decision) {
+  int mv_col, mv_row;
   int motion_magnitude = 0;
+  int zeromv_filter = 0;
+  VP9_DENOISER *denoiser = &cpi->denoiser;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
   YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
   YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -322,36 +329,75 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
-
-  if (bs <= BLOCK_16X16 && denoiser->denoising_on) {
-    // Take center pixel in block to determine is_skin.
-    const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
-    const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
-    const int uv_width_shift = y_width_shift >> 1;
-    const int uv_height_shift = y_height_shift >> 1;
-    const int stride = mb->plane[0].src.stride;
-    const int strideuv = mb->plane[1].src.stride;
-    const uint8_t ysource =
-      mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
-    const uint8_t usource =
-      mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
-    const uint8_t vsource =
-      mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
-    is_skin = vp9_skin_pixel(ysource, usource, vsource);
+  int consec_zeromv = 0;
+  mv_col = ctx->best_sse_mv.as_mv.col;
+  mv_row = ctx->best_sse_mv.as_mv.row;
+  motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+  if (cpi->use_skin_detection &&
+      bs <= BLOCK_32X32 &&
+      denoiser->denoising_level < kDenHigh) {
+    int motion_level = (motion_magnitude < 16) ? 0 : 1;
+    // If motion for current block is small/zero, compute consec_zeromv for
+    // skin detection (early exit in skin detection is done for large
+    // consec_zeromv when current block has small/zero motion).
+    consec_zeromv = 0;
+    if (motion_level == 0) {
+      VP9_COMMON * const cm = &cpi->common;
+      int j, i;
+      // Loop through the 8x8 sub-blocks.
+      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+      const int block_index = mi_row * cm->mi_cols + mi_col;
+      consec_zeromv = 100;
+      for (i = 0; i < ymis; i++) {
+        for (j = 0; j < xmis; j++) {
+          int bl_index = block_index + i * cm->mi_cols + j;
+          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
+          // No need to keep checking 8x8 blocks if any of the sub-blocks
+          // has small consec_zeromv (since threshold for no_skin based on
+          // zero/small motion in skin detection is high, i.e, > 4).
+          if (consec_zeromv < 4) {
+            i = ymis;
+            j = xmis;
+          }
+        }
+      }
+    }
+    // TODO(marpan): Compute skin detection over sub-blocks.
+    is_skin = vp9_compute_skin_block(mb->plane[0].src.buf,
+                                     mb->plane[1].src.buf,
+                                     mb->plane[2].src.buf,
+                                     mb->plane[0].src.stride,
+                                     mb->plane[1].src.stride,
+                                     bs,
+                                     consec_zeromv,
+                                     motion_level);
+  }
+  if (!is_skin &&
+      denoiser->denoising_level == kDenHigh) {
+    denoiser->increase_denoising = 1;
+  } else {
+    denoiser->increase_denoising = 0;
   }
 
-  if (denoiser->denoising_on)
+  if (denoiser->denoising_level >= kDenLow)
     decision = perform_motion_compensation(denoiser, mb, bs,
                                            denoiser->increase_denoising,
                                            mi_row, mi_col, ctx,
-                                           &motion_magnitude,
-                                           is_skin);
+                                           motion_magnitude,
+                                           is_skin,
+                                           &zeromv_filter,
+                                           consec_zeromv);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,
                                  mc_avg_start, mc_avg.y_stride,
                                  avg_start, avg.y_stride,
-                                 0, bs, motion_magnitude);
+                                 denoiser->increase_denoising,
+                                 bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
@@ -365,6 +411,9 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                       num_4x4_blocks_wide_lookup[bs] << 2,
                       num_4x4_blocks_high_lookup[bs] << 2);
   }
+  *denoiser_decision = decision;
+  if (decision == FILTER_BLOCK && zeromv_filter == 1)
+    *denoiser_decision = FILTER_ZEROMV_BLOCK;
 }
 
 static void copy_frame(YV12_BUFFER_CONFIG * const dest,
@@ -401,11 +450,12 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     int resized) {
   // Copy source into denoised reference buffers on KEY_FRAME or
   // if the just encoded frame was resized.
-  if (frame_type == KEY_FRAME || resized != 0) {
+  if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
     for (i = 1; i < MAX_REF_FRAMES; ++i)
       copy_frame(&denoiser->running_avg_y[i], &src);
+    denoiser->reset = 0;
     return;
   }
 
@@ -443,22 +493,25 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
   ctx->zeromv_sse = UINT_MAX;
   ctx->newmv_sse = UINT_MAX;
+  ctx->zeromv_lastref_sse = UINT_MAX;
+  ctx->best_sse_mv.as_int = 0;
 }
 
-void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                      PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx) {
-  // TODO(tkopp): Use both MVs if possible
-  if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+  if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
     ctx->zeromv_sse = sse;
-    ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
+    ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+    if (mi->ref_frame[0] == LAST_FRAME)
+      ctx->zeromv_lastref_sse = sse;
   }
 
-  if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+  if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
     ctx->newmv_sse = sse;
     ctx->best_sse_inter_mode = mode;
-    ctx->best_sse_mv = mbmi->mv[0];
-    ctx->best_reference_frame = mbmi->ref_frame[0];
+    ctx->best_sse_mv = mi->mv[0];
+    ctx->best_reference_frame = mi->ref_frame[0];
   }
 }
 
@@ -514,27 +567,12 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
 #endif
   denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
-  vp9_denoiser_init_noise_estimate(denoiser, width, height);
+  denoiser->denoising_level = kDenLow;
+  denoiser->prev_denoising_level = kDenLow;
+  denoiser->reset = 0;
   return 0;
 }
 
-void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
-                                      int width,
-                                      int height) {
-  // Denoiser is off by default, i.e., no denoising is performed.
-  // Noise level is measured periodically, and if observed to be above
-  // thresh_noise_estimate, then denoising is performed, i.e., denoising_on = 1.
-  denoiser->denoising_on = 0;
-  denoiser->noise_estimate = 0;
-  denoiser->noise_estimate_count = 0;
-  denoiser->thresh_noise_estimate = 20;
-  if (width * height >= 1920 * 1080) {
-    denoiser->thresh_noise_estimate = 70;
-  } else if (width * height >= 1280 * 720) {
-    denoiser->thresh_noise_estimate = 40;
-  }
-}
-
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   int i;
   denoiser->frame_buffer_initialized = 0;
@@ -548,117 +586,15 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   vpx_free_frame_buffer(&denoiser->last_source);
 }
 
-void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
-  const VP9_COMMON *const cm = &cpi->common;
-  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
-  int frame_period = 10;
-  int thresh_consec_zeromv = 8;
-  unsigned int thresh_sum_diff = 128;
-  int num_frames_estimate = 20;
-  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
-  // Estimate of noise level every frame_period frames.
-  // Estimate is between current source and last source.
-  if (cm->current_video_frame % frame_period != 0 ||
-     cpi->denoiser.last_source.y_buffer == NULL) {
-    copy_frame(&cpi->denoiser.last_source, cpi->Source);
-    return;
-  } else {
-    int num_samples = 0;
-    uint64_t avg_est = 0;
-    int bsize = BLOCK_16X16;
-    static const unsigned char const_source[16] = {
-         128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-         128, 128};
-    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
-    // been encoded as zero/small mv at least x consecutive frames, compute
-    // the variance to update estimate of noise in the source.
-    const uint8_t *src_y = cpi->Source->y_buffer;
-    const int src_ystride = cpi->Source->y_stride;
-    const uint8_t *last_src_y = cpi->denoiser.last_source.y_buffer;
-    const int last_src_ystride = cpi->denoiser.last_source.y_stride;
-    const uint8_t *src_u = cpi->Source->u_buffer;
-    const uint8_t *src_v = cpi->Source->v_buffer;
-    const int src_uvstride = cpi->Source->uv_stride;
-    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
-    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
-    const int uv_width_shift = y_width_shift >> 1;
-    const int uv_height_shift = y_height_shift >> 1;
-    int mi_row, mi_col;
-    for (mi_row = 0; mi_row < cm->mi_rows; mi_row ++) {
-      for (mi_col = 0; mi_col < cm->mi_cols; mi_col ++) {
-        // 16x16 blocks, 1/4 sample of frame.
-        if (mi_row % 4 == 0 && mi_col % 4 == 0) {
-          int bl_index = mi_row * cm->mi_cols + mi_col;
-          int bl_index1 = bl_index + 1;
-          int bl_index2 = bl_index + cm->mi_cols;
-          int bl_index3 = bl_index2 + 1;
-          // Only consider blocks that are likely steady background. i.e, have
-          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
-          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
-          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
-          const uint8_t ysource =
-            src_y[y_height_shift * src_ystride + y_width_shift];
-          const uint8_t usource =
-            src_u[uv_height_shift * src_uvstride + uv_width_shift];
-          const uint8_t vsource =
-            src_v[uv_height_shift * src_uvstride + uv_width_shift];
-          int is_skin = vp9_skin_pixel(ysource, usource, vsource);
-          if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
-              cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
-              !is_skin) {
-            // Compute variance.
-            unsigned int sse;
-            unsigned int variance = cpi->fn_ptr[bsize].vf(src_y,
-                                                          src_ystride,
-                                                          last_src_y,
-                                                          last_src_ystride,
-                                                          &sse);
-            // Only consider this block as valid for noise measurement if the
-            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
-            // temporal residual is small (avoid effects from lighting change).
-            if ((sse - variance) < thresh_sum_diff) {
-              unsigned int sse2;
-              const unsigned int spatial_variance =
-                  cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
-                                        0, &sse2);
-              avg_est += variance / (10 + spatial_variance);
-              num_samples++;
-            }
-          }
-        }
-        src_y += 8;
-        last_src_y += 8;
-        src_u += 4;
-        src_v += 4;
-      }
-      src_y += (src_ystride << 3) - (cm->mi_cols << 3);
-      last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
-      src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
-      src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
-    }
-    // Update noise estimate if we have at a minimum number of block samples,
-    // and avg_est > 0 (avg_est == 0 can happen if the application inputs
-    // duplicate frames).
-    if (num_samples > min_blocks_estimate && avg_est > 0) {
-      // Normalize.
-      avg_est = (avg_est << 8) / num_samples;
-      // Update noise estimate.
-      cpi->denoiser.noise_estimate =  (3 * cpi->denoiser.noise_estimate +
-          avg_est) >> 2;
-      cpi->denoiser.noise_estimate_count++;
-      if (cpi->denoiser.noise_estimate_count == num_frames_estimate) {
-        // Reset counter and check noise level condition.
-        cpi->denoiser.noise_estimate_count = 0;
-       if (cpi->denoiser.noise_estimate > cpi->denoiser.thresh_noise_estimate)
-         cpi->denoiser.denoising_on = 1;
-       else
-         cpi->denoiser.denoising_on = 0;
-      }
-    }
-  }
-  copy_frame(&cpi->denoiser.last_source, cpi->Source);
+void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser,
+                                  int noise_level) {
+  denoiser->denoising_level = noise_level;
+  if (denoiser->denoising_level > kDenLowLow &&
+      denoiser->prev_denoising_level == kDenLowLow)
+    denoiser->reset = 1;
+  else
+    denoiser->reset = 0;
+  denoiser->prev_denoising_level = denoiser->denoising_level;
 }
 
 #ifdef OUTPUT_YUV_DENOISED
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index f8ad4ac..9c86e5a 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -23,21 +23,40 @@ extern "C" {
 
 typedef enum vp9_denoiser_decision {
   COPY_BLOCK,
-  FILTER_BLOCK
+  FILTER_BLOCK,
+  FILTER_ZEROMV_BLOCK
 } VP9_DENOISER_DECISION;
 
+typedef enum vp9_denoiser_level {
+  kDenLowLow,
+  kDenLow,
+  kDenMedium,
+  kDenHigh
+} VP9_DENOISER_LEVEL;
+
 typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
   YV12_BUFFER_CONFIG last_source;
   int increase_denoising;
   int frame_buffer_initialized;
-  int denoising_on;
-  int noise_estimate;
-  int thresh_noise_estimate;
-  int noise_estimate_count;
+  int reset;
+  VP9_DENOISER_LEVEL denoising_level;
+  VP9_DENOISER_LEVEL prev_denoising_level;
 } VP9_DENOISER;
 
+typedef struct {
+  int64_t zero_last_cost_orig;
+  int *ref_frame_cost;
+  int_mv (*frame_mv)[MAX_REF_FRAMES];
+  int reuse_inter_pred;
+  TX_SIZE best_tx_size;
+  PREDICTION_MODE best_mode;
+  MV_REFERENCE_FRAME best_ref_frame;
+  INTERP_FILTER best_pred_filter;
+  uint8_t best_mode_skip_txfm;
+} VP9_PICKMODE_CTX_DEN;
+
 struct VP9_COMP;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
@@ -48,13 +67,14 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     int refresh_last_frame,
                                     int resized);
 
-void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
+void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
-                          PICK_MODE_CONTEXT *ctx);
+                          PICK_MODE_CONTEXT *ctx ,
+                          VP9_DENOISER_DECISION *denoiser_decision);
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
-void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi,
                                      unsigned int sse, PREDICTION_MODE mode,
                                      PICK_MODE_CONTEXT *ctx);
 
@@ -69,18 +89,16 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
 // This function is used by both c and sse2 denoiser implementations.
 // Define it as a static function within the scope where vp9_denoiser.h
 // is referenced.
-static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+                                          int increase_denoising) {
   return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
 }
 #endif
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
-void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
-                                      int width,
-                                      int height);
-
-void vp9_denoiser_update_noise_estimate(struct VP9_COMP *const cpi);
+void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser,
+                                  int noise_level);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2333a13..f66ed9e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -33,6 +33,7 @@
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_complexity.h"
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_aq_variance.h"
@@ -133,7 +134,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(
                                0, &sse);
       break;
   }
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+  return ROUND_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -186,7 +187,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                         BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
+  MODE_INFO *mi;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   const struct segmentation *const seg = &cm->seg;
@@ -195,7 +196,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-  mbmi = &xd->mi[0]->mbmi;
+  mi = xd->mi[0];
 
   // Set up destination pointers.
   vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -221,16 +222,17 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
 
   // Setup segment ID.
   if (seg->enabled) {
-    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+    if (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+        cpi->oxcf.aq_mode != EQUATOR360_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
     vp9_init_plane_quantizers(cpi, x);
 
-    x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
+    x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id];
   } else {
-    mbmi->segment_id = 0;
+    mi->segment_id = 0;
     x->encode_breakout = cpi->encode_breakout;
   }
 
@@ -241,14 +243,16 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
 static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
                                       int mi_row, int mi_col,
                                       BLOCK_SIZE bsize) {
-  const int block_width = num_8x8_blocks_wide_lookup[bsize];
-  const int block_height = num_8x8_blocks_high_lookup[bsize];
+  const int block_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+                                 cm->mi_cols - mi_col);
+  const int block_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+                                  cm->mi_rows - mi_row);
+  const int mi_stride = xd->mi_stride;
+  MODE_INFO *const src_mi = xd->mi[0];
   int i, j;
   for (j = 0; j < block_height; ++j)
-    for (i = 0; i < block_width; ++i) {
-      if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
-        xd->mi[j * xd->mi_stride + i] = xd->mi[0];
-    }
+    for (i = 0; i < block_width; ++i)
+      xd->mi[j * mi_stride + i] = src_mi;
 }
 
 static void set_block_size(VP9_COMP * const cpi,
@@ -258,7 +262,7 @@ static void set_block_size(VP9_COMP * const cpi,
                            BLOCK_SIZE bsize) {
   if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
     set_mode_info_offsets(&cpi->common, x, xd, mi_row, mi_col);
-    xd->mi[0]->mbmi.sb_type = bsize;
+    xd->mi[0]->sb_type = bsize;
   }
 }
 
@@ -401,7 +405,6 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
 
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
@@ -414,7 +417,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME)
       get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
@@ -425,7 +428,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
     return 0;
   } else if (bsize > bsize_min) {
     // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
+    if (cm->frame_type == KEY_FRAME)
       get_variance(&vt.part_variances->none);
     // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
@@ -481,7 +484,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
   VP9_COMMON *const cm = &cpi->common;
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
   const int threshold_multiplier = is_key_frame ? 20 : 1;
-  const int64_t threshold_base = (int64_t)(threshold_multiplier *
+  int64_t threshold_base = (int64_t)(threshold_multiplier *
       cpi->y_dequant[q][1]);
   if (is_key_frame) {
     thresholds[0] = threshold_base;
@@ -489,9 +492,20 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
     thresholds[2] = threshold_base >> 2;
     thresholds[3] = threshold_base << 2;
   } else {
-    thresholds[1] = threshold_base;
+    // Increase base variance threshold based on  estimated noise level.
+    if (cpi->noise_estimate.enabled) {
+      NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(
+          &cpi->noise_estimate);
+      if (noise_level == kHigh)
+        threshold_base = 3 * threshold_base;
+      else if (noise_level == kMedium)
+        threshold_base = threshold_base << 1;
+      else if (noise_level < kLow)
+        threshold_base = (7 * threshold_base) >> 3;
+    }
     if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[0] = threshold_base >> 2;
+      thresholds[0] = threshold_base >> 3;
+      thresholds[1] = threshold_base >> 1;
       thresholds[2] = threshold_base << 3;
     } else {
       thresholds[0] = threshold_base;
@@ -518,7 +532,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
       if (cm->width <= 352 && cm->height <= 288)
-        cpi->vbp_threshold_sad = 100;
+        cpi->vbp_threshold_sad = 10;
       else
         cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
             (cpi->y_dequant[q][1] << 1) : 1000;
@@ -548,16 +562,16 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                               d + y8_idx * dp + x8_idx, dp,
                               &min, &max);
       } else {
-        vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                        d + y8_idx * dp + x8_idx, dp,
                        &min, &max);
       }
 #else
-      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                      d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
 #endif
@@ -589,18 +603,18 @@ static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       } else {
-        s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       }
 #else
-      s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -628,18 +642,18 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       } else {
-        s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       }
 #else
-      s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -648,45 +662,177 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+// Check if most of the superblock is skin content, and if so, force split to
+// 32x32, and set x->sb_is_skin for use in mode selection.
+static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
+                         int mi_row, int mi_col, int *force_split) {
+  VP9_COMMON * const cm = &cpi->common;
+  // Avoid checking superblocks on/near boundary and avoid low resolutions.
+  // Note superblock may still pick 64X64 if y_sad is very small
+  // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+  if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 &&
+      mi_row + 8 < cm->mi_rows)) {
+    int num_16x16_skin = 0;
+    int num_16x16_nonskin = 0;
+    uint8_t *ysignal = x->plane[0].src.buf;
+    uint8_t *usignal = x->plane[1].src.buf;
+    uint8_t *vsignal = x->plane[2].src.buf;
+    int sp = x->plane[0].src.stride;
+    int spuv = x->plane[1].src.stride;
+    const int block_index = mi_row * cm->mi_cols + mi_col;
+    const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+    const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+    const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+    const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+    // Loop through the 16x16 sub-blocks.
+    int i, j;
+    for (i = 0; i < ymis; i+=2) {
+      for (j = 0; j < xmis; j+=2) {
+        int bl_index = block_index + i * cm->mi_cols + j;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+        int is_skin = vp9_compute_skin_block(ysignal,
+                                             usignal,
+                                             vsignal,
+                                             sp,
+                                             spuv,
+                                             BLOCK_16X16,
+                                             consec_zeromv,
+                                             0);
+        num_16x16_skin += is_skin;
+        num_16x16_nonskin += (1 - is_skin);
+        if (num_16x16_nonskin > 3) {
+          // Exit loop if at least 4 of the 16x16 blocks are not skin.
+          i = ymis;
+          break;
+        }
+        ysignal += 16;
+        usignal += 8;
+        vsignal += 8;
+      }
+      ysignal += (sp << 4) - 64;
+      usignal += (spuv << 3) - 32;
+      vsignal += (spuv << 3) - 32;
+    }
+    if (num_16x16_skin > 12) {
+      *force_split = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+#endif
+
+static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x,
+                                  MACROBLOCKD *xd, v64x64 *vt,
+                                  int force_split[], int64_t thresholds[],
+                                  MV_REFERENCE_FRAME ref_frame_partition,
+                                  int mi_col, int mi_row) {
+  int i, j;
+  VP9_COMMON * const cm = &cpi->common;
+  const int mv_thr = cm->width > 640 ? 8 : 4;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
+  // int_pro mv is small. If the temporal variance is small set the flag
+  // variance_low for the block. The variance threshold can be adjusted, the
+  // higher the more aggressive.
+  if (ref_frame_partition == LAST_FRAME &&
+      (cpi->sf.short_circuit_low_temp_var == 1 ||
+       (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+        xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+    if (xd->mi[0]->sb_type == BLOCK_64X64 &&
+        (vt->part_variances).none.variance < (thresholds[0] >> 1)) {
+      x->variance_low[0] = 1;
+    } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+      for (i = 0; i < 2; i++) {
+        if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+          x->variance_low[i + 1] = 1;
+      }
+    } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+      for (i = 0; i < 2; i++) {
+        if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+          x->variance_low[i + 3] = 1;
+      }
+    } else {
+      for (i = 0; i < 4; i++) {
+        if (!force_split[i + 1]) {
+          // 32x32
+          if (vt->split[i].part_variances.none.variance <
+              (thresholds[1] >> 1))
+            x->variance_low[i + 5] = 1;
+        } else if (cpi->sf.short_circuit_low_temp_var == 2) {
+          int idx[4] = {0, 4, xd->mi_stride << 2, (xd->mi_stride << 2) + 4};
+          const int idx_str = cm->mi_stride * mi_row + mi_col + idx[i];
+          MODE_INFO **this_mi = cm->mi_grid_visible + idx_str;
+          // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+          // inside.
+          if ((*this_mi)->sb_type == BLOCK_16X16 ||
+              (*this_mi)->sb_type == BLOCK_32X16 ||
+              (*this_mi)->sb_type == BLOCK_16X32) {
+            for (j = 0; j < 4; j++) {
+              if (vt->split[i].split[j].part_variances.none.variance <
+                  (thresholds[2] >> 8))
+                x->variance_low[(i << 2) + j + 9] = 1;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static int choose_partitioning(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
+                               const TileInfo *const tile,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int i, j, k, m;
   v64x64 vt;
   v16x16 vt2[16];
   int force_split[21];
+  int avg_32x32;
+  int avg_16x16[4];
   uint8_t *s;
   const uint8_t *d;
   int sp;
   int dp;
+  // Ref frame used in partitioning.
+  MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
       cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
 
+  // For the variance computation under SVC mode, we treat the frame as key if
+  // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
+  const int is_key_frame = (cm->frame_type == KEY_FRAME ||
+      (is_one_pass_cbr_svc(cpi) &&
+      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
   // Always use 4x4 partition for key frame.
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int use_4x4_partition = is_key_frame;
+  const int use_4x4_partition = cm->frame_type == KEY_FRAME;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
+  int segment_id;
 
-  int segment_id = CR_SEGMENT_ID_BASE;
+  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  segment_id = xd->mi[0]->segment_id;
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
-                                                    cm->last_frame_seg_map;
-    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
       set_vbp_thresholds(cpi, thresholds, q);
     }
   }
 
-  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  memset(x->variance_low, 0, sizeof(x->variance_low));
 
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -696,17 +842,20 @@ static int choose_partitioning(VP9_COMP *cpi,
   s = x->plane[0].src.buf;
   sp = x->plane[0].src.stride;
 
-  if (!is_key_frame && !(is_one_pass_cbr_svc(cpi) &&
-      cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
+  force_split[0] = 0;
+
+  if (!is_key_frame) {
     // In the case of spatial/temporal scalable coding, the assumption here is
     // that the temporal reference frame will always be of type LAST_FRAME.
     // TODO(marpan): If that assumption is broken, we need to revisit this code.
-    MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+    MODE_INFO *mi = xd->mi[0];
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
 
     const YV12_BUFFER_CONFIG *yv12_g = NULL;
-    unsigned int y_sad, y_sad_g;
+    unsigned int y_sad, y_sad_g, y_sad_thr;
     const BLOCK_SIZE bsize = BLOCK_32X32
         + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
 
@@ -732,25 +881,38 @@ static int choose_partitioning(VP9_COMP *cpi,
 
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
                          &cm->frame_refs[LAST_FRAME - 1].sf);
-    mbmi->ref_frame[0] = LAST_FRAME;
-    mbmi->ref_frame[1] = NONE;
-    mbmi->sb_type = BLOCK_64X64;
-    mbmi->mv[0].as_int = 0;
-    mbmi->interp_filter = BILINEAR;
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->sb_type = BLOCK_64X64;
+    mi->mv[0].as_int = 0;
+    mi->interp_filter = BILINEAR;
 
     y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
-    if (y_sad_g < y_sad) {
+    // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
+    // are close if short_circuit_low_temp_var is on.
+    y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+    if (y_sad_g < y_sad_thr) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
-      mbmi->ref_frame[0] = GOLDEN_FRAME;
-      mbmi->mv[0].as_int = 0;
+      mi->ref_frame[0] = GOLDEN_FRAME;
+      mi->mv[0].as_int = 0;
       y_sad = y_sad_g;
+      ref_frame_partition = GOLDEN_FRAME;
     } else {
-      x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+      x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+      ref_frame_partition = LAST_FRAME;
     }
 
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
+    x->sb_is_skin = 0;
+#if !CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->use_skin_detection)
+      x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col,
+                                    &force_split[0]);
+#endif
+
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
@@ -762,7 +924,9 @@ static int choose_partitioning(VP9_COMP *cpi,
         uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
                                      pd->dst.buf, pd->dst.stride);
 
-      x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+        // TODO(marpan): Investigate if we should lower this threshold if
+        // superblock is detected as skin.
+        x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
     d = xd->plane[0].dst.buf;
@@ -801,9 +965,6 @@ static int choose_partitioning(VP9_COMP *cpi,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
   for (i = 0; i < 4; i++) {
@@ -811,6 +972,7 @@ static int choose_partitioning(VP9_COMP *cpi,
     const int y32_idx = ((i >> 1) << 5);
     const int i2 = i << 2;
     force_split[i + 1] = 0;
+    avg_16x16[i] = 0;
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
@@ -828,6 +990,7 @@ static int choose_partitioning(VP9_COMP *cpi,
                             is_key_frame);
         fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
         get_variance(&vt.split[i].split[j].part_variances.none);
+        avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance;
         if (vt.split[i].split[j].part_variances.none.variance >
             thresholds[2]) {
           // 16X16 variance is above threshold for split, so force split to 8x8
@@ -835,7 +998,8 @@ static int choose_partitioning(VP9_COMP *cpi,
           force_split[split_index] = 1;
           force_split[i + 1] = 1;
           force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
+        } else if (cpi->oxcf.speed < 8 &&
+                   vt.split[i].split[j].part_variances.none.variance >
                    thresholds[1] &&
                    !cyclic_refresh_segment_id_boosted(segment_id)) {
           // We have some nominal amount of 16x16 variance (based on average),
@@ -853,9 +1017,7 @@ static int choose_partitioning(VP9_COMP *cpi,
           }
         }
       }
-      // TODO(marpan): There is an issue with variance based on 4x4 average in
-      // svc mode, don't allow it for now.
-      if (is_key_frame || (low_res && !cpi->use_svc &&
+      if (is_key_frame || (low_res &&
           vt.split[i].split[j].part_variances.none.variance >
           (thresholds[1] << 1))) {
         force_split[split_index] = 0;
@@ -877,8 +1039,8 @@ static int choose_partitioning(VP9_COMP *cpi,
       }
     }
   }
-
   // Fill the rest of the variance tree by summing split partition values.
+  avg_32x32 = 0;
   for (i = 0; i < 4; i++) {
     const int i2 = i << 2;
     for (j = 0; j < 4; j++) {
@@ -888,22 +1050,41 @@ static int choose_partitioning(VP9_COMP *cpi,
         for (m = 0; m < 4; m++)
           fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
         fill_variance_tree(vtemp, BLOCK_16X16);
+        // If variance of this 16x16 block is above the threshold, force block
+        // to split. This also forces a split on the upper levels.
+        get_variance(&vtemp->part_variances.none);
+        if (vtemp->part_variances.none.variance > thresholds[2]) {
+          force_split[5 + i2 + j] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        }
       }
     }
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
-    // If variance of this 32x32 block is above the threshold, force the block
-    // to split. This also forces a split on the upper (64x64) level.
+    // If variance of this 32x32 block is above the threshold, or if its above
+    // (some threshold of) the average variance over the sub-16x16 blocks, then
+    // force this block to split. This also forces a split on the upper
+    // (64x64) level.
     if (!force_split[i + 1]) {
       get_variance(&vt.split[i].part_variances.none);
-      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+      if (vt.split[i].part_variances.none.variance > thresholds[1] ||
+          (!is_key_frame &&
+          vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) &&
+          vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) {
         force_split[i + 1] = 1;
         force_split[0] = 1;
       }
+      avg_32x32 += vt.split[i].part_variances.none.variance;
     }
   }
   if (!force_split[0]) {
     fill_variance_tree(&vt, BLOCK_64X64);
     get_variance(&vt.part_variances.none);
+    // If variance of this 64x64 block is above (some threshold of) the average
+    // variance over the sub-32x32 blocks, then force this block to split.
+    if (!is_key_frame &&
+        vt.part_variances.none.variance > (5 * avg_32x32) >> 4)
+      force_split[0] = 1;
   }
 
   // Now go through the entire structure, splitting every block size until
@@ -960,6 +1141,11 @@ static int choose_partitioning(VP9_COMP *cpi,
       }
     }
   }
+
+  if (cpi->sf.short_circuit_low_temp_var) {
+    set_low_temp_var_flag(cpi, x, xd, &vt, force_split, thresholds,
+                          ref_frame_partition, mi_col, mi_row);
+  }
   return 0;
 }
 
@@ -975,11 +1161,11 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const xdmi = xd->mi[0];
   MODE_INFO *mi_addr = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
-  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
-  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
   MV_REF *const frame_mvs =
@@ -991,7 +1177,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   int max_plane;
 
-  assert(mi->mbmi.sb_type == bsize);
+  assert(mi->sb_type == bsize);
 
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
@@ -1002,19 +1188,19 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
     if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      mi_addr->mbmi.segment_id =
+      mi_addr->segment_id =
         get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
     // Else for cyclic refresh mode update the segment map, set the segment id
     // and then update the quantizer.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-      vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
+      vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row,
                                         mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                        x->skip, p);
     }
   }
 
-  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+  max_plane = is_inter_block(xdmi) ? MAX_MB_PLANE : 1;
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
     p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
@@ -1038,16 +1224,16 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
         xd->mi[x_idx + y * mis] = mi_addr;
       }
 
-  if (cpi->oxcf.aq_mode)
+  if (cpi->oxcf.aq_mode != NO_AQ)
     vp9_init_plane_quantizers(cpi, x);
 
-  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
-    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  if (is_inter_block(xdmi) && xdmi->sb_type < BLOCK_8X8) {
+    xdmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    xdmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   }
 
   x->skip = ctx->skip;
-  memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+  memcpy(x->zcoeff_blk[xdmi->tx_size], ctx->zcoeff_blk,
          sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
   if (!output_enabled)
@@ -1067,19 +1253,19 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
       THR_D63_PRED  /*D63_PRED*/,
       THR_TM        /*TM_PRED*/,
     };
-    ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]];
+    ++cpi->mode_chosen_counts[kf_mode_index[xdmi->mode]];
   } else {
     // Note how often each mode chosen as best
     ++cpi->mode_chosen_counts[ctx->best_mode_index];
   }
 #endif
   if (!frame_is_intra_only(cm)) {
-    if (is_inter_block(mbmi)) {
+    if (is_inter_block(xdmi)) {
       vp9_update_mv_count(td);
 
       if (cm->interp_filter == SWITCHABLE) {
         const int ctx = vp9_get_pred_context_switchable_interp(xd);
-        ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
+        ++td->counts->switchable_interp[ctx][xdmi->interp_filter];
       }
     }
 
@@ -1095,10 +1281,10 @@ static void update_state(VP9_COMP *cpi, ThreadData *td,
     MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
     for (w = 0; w < x_mis; ++w) {
       MV_REF *const mv = frame_mv + w;
-      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+      mv->ref_frame[0] = mi->ref_frame[0];
+      mv->ref_frame[1] = mi->ref_frame[1];
+      mv->mv[0].as_int = mi->mv[0].as_int;
+      mv->mv[1].as_int = mi->mv[1].as_int;
     }
   }
 }
@@ -1121,26 +1307,23 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
 static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
                                    RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   INTERP_FILTER filter_ref;
 
-  if (xd->up_available)
-    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-  else if (xd->left_available)
-    filter_ref = xd->mi[-1]->mbmi.interp_filter;
-  else
+  filter_ref = vp9_get_pred_context_switchable_interp(xd);
+  if (filter_ref == SWITCHABLE_FILTERS)
     filter_ref = EIGHTTAP;
 
-  mbmi->sb_type = bsize;
-  mbmi->mode = ZEROMV;
-  mbmi->tx_size =
+  mi->sb_type = bsize;
+  mi->mode = ZEROMV;
+  mi->tx_size =
       VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[tx_mode]);
-  mbmi->skip = 1;
-  mbmi->uv_mode = DC_PRED;
-  mbmi->ref_frame[0] = LAST_FRAME;
-  mbmi->ref_frame[1] = NONE;
-  mbmi->mv[0].as_int = 0;
-  mbmi->interp_filter = filter_ref;
+  mi->skip = 1;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->mv[0].as_int = 0;
+  mi->interp_filter = filter_ref;
 
   xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
   x->skip = 1;
@@ -1155,8 +1338,7 @@ static int set_segment_rdmult(VP9_COMP *const cpi,
   VP9_COMMON *const cm = &cpi->common;
   vp9_init_plane_quantizers(cpi, x);
   vpx_clear_system_state();
-  segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
-                                  cm->base_qindex);
+  segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
 }
 
@@ -1169,7 +1351,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
+  MODE_INFO *mi;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
   const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
@@ -1181,8 +1363,8 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
   x->use_lp32x32fdct = 1;
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-  mbmi = &xd->mi[0]->mbmi;
-  mbmi->sb_type = bsize;
+  mi = xd->mi[0];
+  mi->sb_type = bsize;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -1196,7 +1378,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
   x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
-  mbmi->skip = 0;
+  mi->skip = 0;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1221,15 +1403,24 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
     if (cm->frame_type == KEY_FRAME ||
         cpi->refresh_alt_ref_frame ||
         (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
-      mbmi->segment_id = vp9_vaq_segment_id(energy);
+      mi->segment_id = vp9_vaq_segment_id(energy);
     } else {
       const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
                                                     : cm->last_frame_seg_map;
-      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
+  } else if (aq_mode == EQUATOR360_AQ) {
+    if (cm->frame_type == KEY_FRAME) {
+      mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
+    } else {
+      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
+                                                    : cm->last_frame_seg_map;
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
   } else if (aq_mode == COMPLEXITY_AQ) {
-    x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+    x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id);
   } else if (aq_mode == CYCLIC_REFRESH_AQ) {
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
                                                   : cm->last_frame_seg_map;
@@ -1245,7 +1436,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi,
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
         vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
       else
@@ -1282,27 +1473,26 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
   const MACROBLOCK *x = &td->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MODE_INFO *const mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
 
   if (!frame_is_intra_only(cm)) {
     FRAME_COUNTS *const counts = td->counts;
-    const int inter_block = is_inter_block(mbmi);
-    const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
+    const int inter_block = is_inter_block(mi);
+    const int seg_ref_active = segfeature_active(&cm->seg, mi->segment_id,
                                                  SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
-      counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
+      counts->intra_inter[get_intra_inter_context(xd)][inter_block]++;
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
-        const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
         if (cm->reference_mode == REFERENCE_MODE_SELECT)
           counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
-                            [has_second_ref(mbmi)]++;
+                            [has_second_ref(mi)]++;
 
-        if (has_second_ref(mbmi)) {
+        if (has_second_ref(mi)) {
           counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)]
                           [ref0 == GOLDEN_FRAME]++;
         } else {
@@ -1315,10 +1505,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) {
       }
     }
     if (inter_block &&
-        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+        !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+      const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
       if (bsize >= BLOCK_8X8) {
-        const PREDICTION_MODE mode = mbmi->mode;
+        const PREDICTION_MODE mode = mi->mode;
         ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
       } else {
         const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -1520,7 +1710,7 @@ static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
     for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
       const int index = r * mis + c;
       mi_8x8[index] = mi + index;
-      mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
+      mi_8x8[index]->sb_type = find_partition_size(bsize,
           row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
     }
   }
@@ -1552,7 +1742,7 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
       for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
         int index = block_row * mis + block_col;
         mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = bsize;
+        mi_8x8[index]->sb_type = bsize;
       }
     }
   } else {
@@ -1617,7 +1807,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
 
         index = b_mi_row * mis + b_mi_col;
         mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = BLOCK_16X16;
+        mi_8x8[index]->sb_type = BLOCK_16X16;
 
         // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
         // size to further improve quality.
@@ -1639,7 +1829,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
 
         index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
         mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
+        mi_8x8[index]->sb_type = BLOCK_32X32;
       }
     }
 
@@ -1651,7 +1841,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
       // Use 64x64 partition
       if (is_larger_better) {
         mi_8x8[0] = mi_upper_left;
-        mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
+        mi_8x8[0]->sb_type = BLOCK_64X64;
       }
     }
   } else {   // partial in-image SB64
@@ -1669,46 +1859,47 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = x->plane;
   const struct segmentation *const seg = &cm->seg;
-  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
-  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+  const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
   *(xd->mi[0]) = ctx->mic;
   *(x->mbmi_ext) = ctx->mbmi_ext;
 
-  if (seg->enabled && cpi->oxcf.aq_mode) {
+  if (seg->enabled && cpi->oxcf.aq_mode != NO_AQ) {
     // For in frame complexity AQ or variance AQ, copy segment_id from
     // segmentation_map.
-    if (cpi->oxcf.aq_mode == COMPLEXITY_AQ ||
-        cpi->oxcf.aq_mode == VARIANCE_AQ ) {
+    if (cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     } else {
     // Setting segmentation map for cyclic_refresh.
-      vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
-                                        ctx->rate, ctx->dist, x->skip);
+      vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
+                                        ctx->rate, ctx->dist, x->skip, p);
     }
     vp9_init_plane_quantizers(cpi, x);
   }
 
-  if (is_inter_block(mbmi)) {
+  if (is_inter_block(mi)) {
     vp9_update_mv_count(td);
     if (cm->interp_filter == SWITCHABLE) {
       const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
-      ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+      ++td->counts->switchable_interp[pred_ctx][mi->interp_filter];
     }
 
-    if (mbmi->sb_type < BLOCK_8X8) {
-      mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
-      mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+    if (mi->sb_type < BLOCK_8X8) {
+      mi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+      mi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
     }
   }
 
-  if (cm->use_prev_frame_mvs) {
+  if (cm->use_prev_frame_mvs ||
+      (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1
+        && cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) {
     MV_REF *const frame_mvs =
         cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
     int w, h;
@@ -1717,16 +1908,16 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
       MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
       for (w = 0; w < x_mis; ++w) {
         MV_REF *const mv = frame_mv + w;
-        mv->ref_frame[0] = mi->mbmi.ref_frame[0];
-        mv->ref_frame[1] = mi->mbmi.ref_frame[1];
-        mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
-        mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+        mv->ref_frame[0] = mi->ref_frame[0];
+        mv->ref_frame[1] = mi->ref_frame[1];
+        mv->mv[0].as_int = mi->mv[0].as_int;
+        mv->mv[1].as_int = mi->mv[1].as_int;
       }
     }
   }
 
   x->skip = ctx->skip;
-  x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
+  x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0];
 }
 
 static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
@@ -1738,16 +1929,6 @@ static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
   update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      output_enabled &&
-      cpi->common.frame_type != KEY_FRAME &&
-      cpi->resize_pending == 0) {
-    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
-                         VPXMAX(BLOCK_8X8, bsize), ctx);
-  }
-#endif
-
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
   update_stats(&cpi->common, td);
 
@@ -1776,7 +1957,7 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
     const int idx_str = xd->mi_stride * mi_row + mi_col;
     MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
     ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = mi_8x8[0]->mbmi.sb_type;
+    subsize = mi_8x8[0]->sb_type;
   } else {
     ctx = 0;
     subsize = BLOCK_4X4;
@@ -1851,7 +2032,7 @@ static void rd_use_partition(VP9_COMP *cpi,
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
-  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  BLOCK_SIZE bs_type = mi_8x8[0]->sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
 
@@ -1871,7 +2052,7 @@ static void rd_use_partition(VP9_COMP *cpi,
   pc_tree->partitioning = partition;
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
@@ -1886,7 +2067,7 @@ static void rd_use_partition(VP9_COMP *cpi,
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
         MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
-        if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
+        if (this_mi && this_mi->sb_type >= sub_subsize) {
           splits_below = 0;
         }
       }
@@ -1910,7 +2091,7 @@ static void rd_use_partition(VP9_COMP *cpi,
       }
 
       restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-      mi_8x8[0]->mbmi.sb_type = bs_type;
+      mi_8x8[0]->sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
   }
@@ -2068,7 +2249,7 @@ static void rd_use_partition(VP9_COMP *cpi,
 
   // If last_part is better set the partitioning to that.
   if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mi_8x8[0]->mbmi.sb_type = bsize;
+    mi_8x8[0]->sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
     chosen_rdc = last_part_rdc;
@@ -2134,7 +2315,7 @@ static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
   for (i = 0; i < sb_height_in_blocks; ++i) {
     for (j = 0; j < sb_width_in_blocks; ++j) {
       MODE_INFO *mi = mi_8x8[index+j];
-      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
+      BLOCK_SIZE sb_type = mi ? mi->sb_type : 0;
       bs_hist[sb_type]++;
       *min_block_size = VPXMIN(*min_block_size, sb_type);
       *max_block_size = VPXMAX(*max_block_size, sb_type);
@@ -2161,8 +2342,8 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
                                     BLOCK_SIZE *max_block_size) {
   VP9_COMMON *const cm = &cpi->common;
   MODE_INFO **mi = xd->mi;
-  const int left_in_image = xd->left_available && mi[-1];
-  const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+  const int left_in_image = !!xd->left_mi;
+  const int above_in_image = !!xd->above_mi;
   const int row8x8_remaining = tile->mi_row_end - mi_row;
   const int col8x8_remaining = tile->mi_col_end - mi_col;
   int bh, bw;
@@ -2250,26 +2431,26 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
     for (idy = 0; idy < mi_height; ++idy) {
       for (idx = 0; idx < mi_width; ++idx) {
         mi = prev_mi[idy * cm->mi_stride + idx];
-        bs = mi ? mi->mbmi.sb_type : bsize;
+        bs = mi ? mi->sb_type : bsize;
         min_size = VPXMIN(min_size, bs);
         max_size = VPXMAX(max_size, bs);
       }
     }
   }
 
-  if (xd->left_available) {
+  if (xd->left_mi) {
     for (idy = 0; idy < mi_height; ++idy) {
       mi = xd->mi[idy * cm->mi_stride - 1];
-      bs = mi ? mi->mbmi.sb_type : bsize;
+      bs = mi ? mi->sb_type : bsize;
       min_size = VPXMIN(min_size, bs);
       max_size = VPXMAX(max_size, bs);
     }
   }
 
-  if (xd->up_available) {
+  if (xd->above_mi) {
     for (idx = 0; idx < mi_width; ++idx) {
       mi = xd->mi[idx - cm->mi_stride];
-      bs = mi ? mi->mbmi.sb_type : bsize;
+      bs = mi ? mi->sb_type : bsize;
       min_size = VPXMIN(min_size, bs);
       max_size = VPXMAX(max_size, bs);
     }
@@ -2354,7 +2535,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
-  int i, pl;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   BLOCK_SIZE subsize;
   RD_COST this_rdc, sum_rdc, best_rdc;
   int do_split = bsize >= BLOCK_8X8;
@@ -2400,7 +2582,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
-  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
+  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ)
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
 
   if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
@@ -2424,8 +2606,15 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
 
   if (cpi->sf.use_square_partition_only &&
       bsize > cpi->sf.use_square_only_threshold) {
+    if (cpi->use_svc) {
+      if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_horz_allowed &= force_horz_split;
+      if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_vert_allowed &= force_vert_split;
+    } else {
       partition_horz_allowed &= force_horz_split;
       partition_vert_allowed &= force_vert_split;
+    }
   }
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2495,7 +2684,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                      &this_rdc, bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  this_rdc.rate, this_rdc.dist);
@@ -2549,6 +2737,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
               break;
             }
           }
+
           if (skip) {
             if (src_diff_var == UINT_MAX) {
               set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
@@ -2580,15 +2769,16 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       i = 4;
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
-            ctx->mic.mbmi.interp_filter;
+            ctx->mic.interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                        pc_tree->leaf_split[0], best_rdc.rdcost);
+
       if (sum_rdc.rate == INT_MAX)
         sum_rdc.rdcost = INT64_MAX;
     } else {
       for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
-      const int x_idx = (i & 1) * mi_step;
-      const int y_idx = (i >> 1) * mi_step;
+        const int x_idx =  (i & 1) * mi_step;
+        const int y_idx = (i >> 1) * mi_step;
 
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
@@ -2614,7 +2804,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -2651,7 +2840,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
-          ctx->mic.mbmi.interp_filter;
+          ctx->mic.interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                      &pc_tree->horizontal[0], best_rdc.rdcost);
 
@@ -2666,7 +2855,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
-            ctx->mic.mbmi.interp_filter;
+            ctx->mic.interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
                        &this_rdc, subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
@@ -2680,7 +2869,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -2694,6 +2882,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
     restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
   }
+
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
@@ -2704,7 +2893,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
-          ctx->mic.mbmi.interp_filter;
+          ctx->mic.interp_filter;
     rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                      &pc_tree->vertical[0], best_rdc.rdcost);
     if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
@@ -2718,7 +2907,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
-            ctx->mic.mbmi.interp_filter;
+            ctx->mic.interp_filter;
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
                        &this_rdc, subsize,
                        &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
@@ -2732,7 +2921,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
@@ -2777,6 +2965,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
+  const int mi_col_start = tile_info->mi_col_start;
+  const int mi_col_end = tile_info->mi_col_end;
   int mi_col;
 
   // Initialize the left context for the new SB row
@@ -2784,8 +2974,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi,
   memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
+  for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
     const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
@@ -2890,8 +3079,8 @@ static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
 
   for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
     for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-      if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size)
-        mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
+      if (mi_ptr[mi_col]->tx_size > max_tx_size)
+        mi_ptr[mi_col]->tx_size = max_tx_size;
     }
   }
 }
@@ -2938,18 +3127,32 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi,
   VP9_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
+  MODE_INFO *mi;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  BLOCK_SIZE bs = VPXMAX(bsize, BLOCK_8X8);  // processing unit block size
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
+  int plane;
+
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
-  mbmi = &xd->mi[0]->mbmi;
-  mbmi->sb_type = bsize;
+  mi = xd->mi[0];
+  mi->sb_type = bsize;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    memcpy(a + num_4x4_blocks_wide * plane, pd->above_context,
+           (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+    memcpy(l + num_4x4_blocks_high * plane, pd->left_context,
+           (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+  }
 
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
-    if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+    if (cyclic_refresh_segment_id_boosted(mi->segment_id))
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
 
   if (cm->frame_type == KEY_FRAME)
     hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
-  else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+  else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
     set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8)
     vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
@@ -2960,6 +3163,14 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi,
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
 
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    memcpy(pd->above_context, a + num_4x4_blocks_wide * plane,
+           (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+    memcpy(pd->left_context, l + num_4x4_blocks_high * plane,
+           (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+  }
+
   if (rd_cost->rate == INT_MAX)
     vp9_rd_cost_reset(rd_cost);
 
@@ -3109,7 +3320,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
   if (partition_none_allowed) {
     nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
                         &this_rdc, bsize, ctx);
-    ctx->mic.mbmi = xd->mi[0]->mbmi;
+    ctx->mic = *xd->mi[0];
     ctx->mbmi_ext = *x->mbmi_ext;
     ctx->skip_txfm[0] = x->skip_txfm[0];
     ctx->skip = x->skip;
@@ -3192,7 +3403,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->horizontal[0]);
 
-    pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+    pc_tree->horizontal[0].mic = *xd->mi[0];
     pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
     pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->horizontal[0].skip = x->skip;
@@ -3204,7 +3415,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                           &this_rdc, subsize,
                           &pc_tree->horizontal[1]);
 
-      pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->horizontal[1].mic = *xd->mi[0];
       pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
       pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[1].skip = x->skip;
@@ -3237,7 +3448,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     pc_tree->vertical[0].pred_pixel_ready = 1;
     nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
                         &pc_tree->vertical[0]);
-    pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+    pc_tree->vertical[0].mic = *xd->mi[0];
     pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
     pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
     pc_tree->vertical[0].skip = x->skip;
@@ -3248,7 +3459,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
       nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms,
                           &this_rdc, subsize,
                           &pc_tree->vertical[1]);
-      pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->vertical[1].mic = *xd->mi[0];
       pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
       pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[1].skip = x->skip;
@@ -3320,7 +3531,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
   partition = partition_lookup[bsl][subsize];
 
   if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) {
@@ -3345,7 +3556,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
         pc_tree->none.pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->none);
-        pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->none.mic = *xd->mi[0];
         pc_tree->none.mbmi_ext = *x->mbmi_ext;
         pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
         pc_tree->none.skip = x->skip;
@@ -3354,7 +3565,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
         pc_tree->vertical[0].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->vertical[0]);
-        pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->vertical[0].mic = *xd->mi[0];
         pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
         pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[0].skip = x->skip;
@@ -3362,7 +3573,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
           pc_tree->vertical[1].pred_pixel_ready = 1;
           nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
                               &this_rdc, subsize, &pc_tree->vertical[1]);
-          pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+          pc_tree->vertical[1].mic = *xd->mi[0];
           pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
           pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
           pc_tree->vertical[1].skip = x->skip;
@@ -3377,7 +3588,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
         pc_tree->horizontal[0].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
                             subsize, &pc_tree->horizontal[0]);
-        pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->horizontal[0].mic = *xd->mi[0];
         pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
         pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[0].skip = x->skip;
@@ -3385,7 +3596,7 @@ static void nonrd_select_partition(VP9_COMP *cpi,
           pc_tree->horizontal[1].pred_pixel_ready = 1;
           nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
                               &this_rdc, subsize, &pc_tree->horizontal[1]);
-          pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+          pc_tree->horizontal[1].mic = *xd->mi[0];
           pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
           pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
           pc_tree->horizontal[1].skip = x->skip;
@@ -3457,7 +3668,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+  subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
   partition = partition_lookup[bsl][subsize];
 
   if (output_enabled && bsize != BLOCK_4X4) {
@@ -3470,7 +3681,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       pc_tree->none.pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->none);
-      pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->none.mic = *xd->mi[0];
       pc_tree->none.mbmi_ext = *x->mbmi_ext;
       pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
       pc_tree->none.skip = x->skip;
@@ -3481,7 +3692,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       pc_tree->vertical[0].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->vertical[0]);
-      pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->vertical[0].mic = *xd->mi[0];
       pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
       pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->vertical[0].skip = x->skip;
@@ -3491,7 +3702,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
         pc_tree->vertical[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
                             dummy_cost, subsize, &pc_tree->vertical[1]);
-        pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->vertical[1].mic = *xd->mi[0];
         pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
         pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->vertical[1].skip = x->skip;
@@ -3503,7 +3714,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
       pc_tree->horizontal[0].pred_pixel_ready = 1;
       nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
                           subsize, &pc_tree->horizontal[0]);
-      pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+      pc_tree->horizontal[0].mic = *xd->mi[0];
       pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
       pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
       pc_tree->horizontal[0].skip = x->skip;
@@ -3514,7 +3725,7 @@ static void nonrd_use_partition(VP9_COMP *cpi,
         pc_tree->horizontal[1].pred_pixel_ready = 1;
         nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
                             dummy_cost, subsize, &pc_tree->horizontal[1]);
-        pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+        pc_tree->horizontal[1].mic = *xd->mi[0];
         pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
         pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
         pc_tree->horizontal[1].skip = x->skip;
@@ -3563,6 +3774,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_col_start = tile_info->mi_col_start;
+  const int mi_col_end = tile_info->mi_col_end;
   int mi_col;
 
   // Initialize the left context for the new SB row
@@ -3570,8 +3783,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
   memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
 
   // Code each SB in the row
-  for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
+  for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
     const struct segmentation *const seg = &cm->seg;
     RD_COST dummy_rdc;
     const int idx_str = cm->mi_stride * mi_row + mi_col;
@@ -3584,6 +3796,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
     vp9_rd_cost_init(&dummy_rdc);
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
+    x->sb_is_skin = 0;
 
     if (seg->enabled) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
@@ -3620,8 +3833,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi,
         break;
       case REFERENCE_PARTITION:
         set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
-        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
-            xd->mi[0]->mbmi.segment_id) {
+        // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment
+        // if cyclic_refresh is enabled.
+        // nonrd_pick_partition does not support 4x4 partition, so avoid it
+        // on key frame for now.
+        if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
+            cm->frame_type != KEY_FRAME) ||
+            (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+            xd->mi[0]->segment_id)) {
           // Use lower max_partition_size for low resoultions.
           if (cm->width <= 352 && cm->height <= 288)
             x->max_partition_size = BLOCK_32X32;
@@ -3775,8 +3994,7 @@ static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
   }
 
   return (intra_count << 2) < inter_count &&
-         cm->frame_type != KEY_FRAME &&
-         cm->show_frame;
+         cm->frame_type != KEY_FRAME && cm->show_frame;
 }
 
 void vp9_init_tile_data(VP9_COMP *cpi) {
@@ -3829,10 +4047,15 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
       &cpi->tile_data[tile_row * tile_cols + tile_col];
   const TileInfo * const tile_info = &this_tile->tile_info;
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+  const int mi_row_start = tile_info->mi_row_start;
+  const int mi_row_end = tile_info->mi_row_end;
   int mi_row;
 
-  for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
+  // Set up pointers to per thread motion search counters.
+  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
+  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) {
     if (cpi->sf.use_nonrd_pick_mode)
       encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
     else
@@ -3887,6 +4110,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   vp9_zero(rdc->coef_counts);
   vp9_zero(rdc->comp_pred_diff);
   vp9_zero(rdc->filter_diff);
+  rdc->m_search_count = 0;   // Count of motion search hits.
+  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
   xd->lossless = cm->base_qindex == 0 &&
                  cm->y_dc_delta_q == 0 &&
@@ -3957,10 +4182,10 @@ static void encode_frame_internal(VP9_COMP *cpi) {
     vpx_usec_timer_start(&emr_timer);
 
 #if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
-                     &cpi->twopass.this_frame_mb_stats);
-  }
+    if (cpi->use_fp_mb_stats) {
+      input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+                       &cpi->twopass.this_frame_mb_stats);
+    }
 #endif
 
     // If allowed, encoding tiles in parallel with one thread handling one tile.
@@ -3999,6 +4224,31 @@ static INTERP_FILTER get_interp_filter(
   }
 }
 
+static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+  struct segmentation *const seg = &cm->seg;
+
+  int mi_row, mi_col;
+  int sum_delta = 0;
+  int map_index = 0;
+  int qdelta_index;
+  int segment_id;
+
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+    MODE_INFO **mi_8x8 = mi_8x8_ptr;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi_8x8++) {
+      segment_id = mi_8x8[0]->segment_id;
+      qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+      sum_delta += qdelta_index;
+      map_index++;
+    }
+    mi_8x8_ptr += cm->mi_stride;
+  }
+
+  return sum_delta / (cm->mi_rows * cm->mi_cols);
+}
+
 void vp9_encode_frame(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
 
@@ -4121,12 +4371,17 @@ void vp9_encode_frame(VP9_COMP *cpi) {
     cm->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
-}
 
+  // If segmentated AQ is enabled compute the average AQ weighting.
+  if (cm->seg.enabled && (cpi->oxcf.aq_mode != NO_AQ) &&
+      (cm->seg.update_map || cm->seg.update_data)) {
+    cm->seg.aq_av_offset = compute_frame_aq_offset(cpi);
+  }
+}
 static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
-  const PREDICTION_MODE y_mode = mi->mbmi.mode;
-  const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
-  const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+  const PREDICTION_MODE y_mode = mi->mode;
+  const PREDICTION_MODE uv_mode = mi->uv_mode;
+  const BLOCK_SIZE bsize = mi->sb_type;
 
   if (bsize < BLOCK_8X8) {
     int idx, idy;
@@ -4142,6 +4397,32 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
+static void update_zeromv_cnt(VP9_COMP *const cpi,
+                              const MODE_INFO *const mi,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  MV mv = mi->mv[0].as_mv;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  const int block_index = mi_row * cm->mi_cols + mi_col;
+  int x, y;
+  for (y = 0; y < ymis; y++)
+    for (x = 0; x < xmis; x++) {
+      int map_offset = block_index + y * cm->mi_cols + x;
+      if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        if (abs(mv.row) < 8 && abs(mv.col) < 8) {
+          if (cpi->consec_zero_mv[map_offset] < 255)
+           cpi->consec_zero_mv[map_offset]++;
+        } else {
+          cpi->consec_zero_mv[map_offset] = 0;
+        }
+      }
+    }
+}
+
 static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -4149,16 +4430,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO **mi_8x8 = xd->mi;
-  MODE_INFO *mi = mi_8x8[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id,
+  MODE_INFO *mi = xd->mi[0];
+  const int seg_skip = segfeature_active(&cm->seg, mi->segment_id,
                                          SEG_LVL_SKIP);
-  const int mis = cm->mi_stride;
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
-  x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 &&
+  x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
                    cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
                    cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
                    cpi->sf.allow_skip_recode;
@@ -4175,21 +4451,28 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
   if (x->skip_encode)
     return;
 
-  if (!is_inter_block(mbmi)) {
+  if (!is_inter_block(mi)) {
     int plane;
-    mbmi->skip = 1;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) &&
+        need_top_left[mi->uv_mode])
+      assert(0);
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    mi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane);
+      vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1);
     if (output_enabled)
       sum_intra_stats(td->counts, mi);
-    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
+                    VPXMAX(bsize, BLOCK_8X8));
   } else {
     int ref;
-    const int is_compound = has_second_ref(mbmi);
-    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    const int is_compound = has_second_ref(mi);
+    set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
-                                                     mbmi->ref_frame[ref]);
+                                                     mi->ref_frame[ref]);
       assert(cfg != NULL);
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
@@ -4202,34 +4485,31 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
                                     VPXMAX(bsize, BLOCK_8X8));
 
     vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+    vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
+                    VPXMAX(bsize, BLOCK_8X8));
   }
 
   if (output_enabled) {
     if (cm->tx_mode == TX_MODE_SELECT &&
-        mbmi->sb_type >= BLOCK_8X8  &&
-        !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
+        mi->sb_type >= BLOCK_8X8  &&
+        !(is_inter_block(mi) && (mi->skip || seg_skip))) {
       ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
-                      &td->counts->tx)[mbmi->tx_size];
+                      &td->counts->tx)[mi->tx_size];
     } else {
-      int x, y;
-      TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
-      if (is_inter_block(&mi->mbmi)) {
-        tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
-                         max_txsize_lookup[bsize]);
+      if (is_inter_block(mi)) {
+        mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                             max_txsize_lookup[bsize]);
       } else {
-        tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
+        mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
       }
-
-      for (y = 0; y < mi_height; y++)
-        for (x = 0; x < mi_width; x++)
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
-            mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
     }
-    ++td->counts->tx.tx_totals[mbmi->tx_size];
-    ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+
+    ++td->counts->tx.tx_totals[mi->tx_size];
+    ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
     if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
-      vp9_cyclic_refresh_update_sb_postencode(cpi, mbmi, mi_row, mi_col, bsize);
+      vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
+    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0)
+      update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize);
   }
 }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3c6a928..169943c 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -50,27 +50,21 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                      pd->dst.buf, pd->dst.stride);
 }
 
-#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
-
 typedef struct vp9_token_state {
+  int64_t       error;
   int           rate;
-  int           error;
-  int           next;
+  int16_t       next;
   int16_t       token;
-  int16_t       qc;
+  tran_low_t    qc;
+  tran_low_t    dqc;
 } vp9_token_state;
 
-// TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] ={ {10, 6}, {8, 7}, };
 
 #define UPDATE_RD_COST()\
 {\
   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
 }
 
 // This function is a place holder for now but may ultimately need
@@ -91,7 +85,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  const int ref = is_inter_block(xd->mi[0]);
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
   uint8_t token_cache[1024];
@@ -101,32 +95,32 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   const int eob = p->eobs[block];
   const PLANE_TYPE type = get_plane_type(plane);
   const int default_eob = 16 << (tx_size << 1);
-  const int mul = 1 + (tx_size == TX_32X32);
-  const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const int shift = (tx_size == TX_32X32);
+  const int16_t* const dequant_ptr = pd->dequant;
+  const uint8_t* const band_translate = get_band_translate(tx_size);
   const scan_order *const so = get_scan(xd, tx_size, type, block);
   const int16_t *const scan = so->scan;
   const int16_t *const nb = so->neighbors;
+  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
   int next = eob, sz = 0;
-  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
+  const int64_t rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1;
+  int rate0, rate1;
+  int64_t error0, error1;
   int16_t t0, t1;
   EXTRABIT e0;
   int best, band, pt, i, final_eob;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
 #else
-  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+  const int *cat6_high_cost = vp9_get_high_cost_table(8);
 #endif
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  if (!ref)
-    rdmult = (rdmult * 9) >> 4;
-
   /* Initialize the sentinel node of the trellis. */
   tokens[eob][0].rate = 0;
   tokens[eob][0].error = 0;
@@ -165,7 +159,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       /* And pick the best. */
       best = rd_cost1 < rd_cost0;
       base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
-      dx = mul * (dqcoeff[rc] - coeff[rc]);
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         dx >>= xd->bd - 8;
@@ -177,14 +171,15 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       tokens[i][0].next = next;
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
+      tokens[i][0].dqc = dqcoeff[rc];
       best_index[i][0] = best;
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
-          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
+      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+          (abs(x) * dequant_ptr[rc != 0] < (abs(coeff[rc]) << shift) +
                                                dequant_ptr[rc != 0]))
         shortcut = 1;
       else
@@ -193,6 +188,11 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       if (shortcut) {
         sz = -(x < 0);
         x -= 2 * sz + 1;
+      } else {
+        tokens[i][1] = tokens[i][0];
+        best_index[i][1] = best_index[i][0];
+        next = i;
+        continue;
       }
 
       /* Consider both possible successor states. */
@@ -243,6 +243,24 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
       tokens[i][1].next = next;
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
+
+      if (x) {
+        tran_low_t offset = dq_step[rc != 0];
+        // The 32x32 transform coefficient uses half quantization step size.
+        // Account for the rounding difference in the dequantized coefficeint
+        // value when the quantization index is dropped from an even number
+        // to an odd number.
+        if (shift & x)
+          offset += (dequant_ptr[rc != 0] & 0x01);
+
+        if (sz == 0)
+          tokens[i][1].dqc = dqcoeff[rc] - offset;
+        else
+          tokens[i][1].dqc = dqcoeff[rc] + offset;
+      } else {
+        tokens[i][1].dqc = 0;
+      }
+
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
@@ -282,18 +300,13 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = -1;
-  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
-  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
-    if (x) {
-      final_eob = i;
-    }
-
+    if (x) final_eob = i;
     qcoeff[rc] = x;
-    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
-
+    dqcoeff[rc] = tokens[i][best].dqc;
     next = tokens[i][best].next;
     best = best_index[i][best];
   }
@@ -736,11 +749,11 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  MODE_INFO *mi = xd->mi[0];
+  struct encode_b_args arg = {x, &ctx, &mi->skip};
   int plane;
 
-  mbmi->skip = 1;
+  mi->skip = 1;
 
   if (x->skip)
     return;
@@ -751,7 +764,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
 
     if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
       const struct macroblockd_plane* const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
       vp9_get_entropy_contexts(bsize, tx_size, pd,
                                ctx.ta[plane], ctx.tl[plane]);
     }
@@ -766,7 +779,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   struct encode_b_args* const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -783,17 +796,26 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   int i, j;
+  struct optimize_ctx *const ctx = args->ctx;
+  ENTROPY_CONTEXT *a = NULL;
+  ENTROPY_CONTEXT *l = NULL;
+  int entropy_ctx = 0;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   dst = &pd->dst.buf[4 * (j * dst_stride + i)];
   src = &p->src.buf[4 * (j * src_stride + i)];
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+  if (args->ctx != NULL) {
+    a = &ctx->ta[plane][i];
+    l = &ctx->tl[plane][j];
+    entropy_ctx = combine_entropy_contexts(*a, *l);
+  }
 
   if (tx_size == TX_4X4) {
     tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
     scan_order = &vp9_scan_orders[TX_4X4][tx_type];
-    mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
+    mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode;
   } else {
-    mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
+    mode = plane == 0 ? mi->mode : mi->uv_mode;
     if (tx_size == TX_32X32) {
       scan_order = &vp9_default_scan_orders[TX_32X32];
     } else {
@@ -905,6 +927,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                              pd->dequant, eob, scan_order->scan,
                              scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+       *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
@@ -918,6 +943,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
@@ -931,6 +959,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob)
         vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
@@ -947,7 +978,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                        pd->dequant, eob, scan_order->scan,
                        scan_order->iscan);
       }
-
+      if (args->ctx != NULL && !x->skip_recode) {
+        *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+      }
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -966,9 +999,20 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     *(args->skip) = 0;
 }
 
-void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->skip};
+
+  if (enable_optimize_b && x->optimize &&
+      (!x->skip_recode || !x->skip_optimize)) {
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) :
+        xd->mi[0]->tx_size;
+    vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+    arg.ctx = &ctx;
+  }
 
   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
                                          vp9_encode_block_intra, &arg);
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 97df8a6..25b0b23 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -37,7 +37,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                             TX_SIZE tx_size, void *arg);
 
-void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index e719663..71f27cc 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -75,11 +75,12 @@ static void encode_mv_component(vpx_writer* w, int comp,
 static void build_nmv_component_cost_table(int *mvcost,
                                            const nmv_component* const mvcomp,
                                            int usehp) {
-  int i, v;
   int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
   int bits_cost[MV_OFFSET_BITS][2];
   int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
   int class0_hp_cost[2], hp_cost[2];
+  int i;
+  int c, o;
 
   sign_cost[0] = vp9_cost_zero(mvcomp->sign);
   sign_cost[1] = vp9_cost_one(mvcomp->sign);
@@ -94,51 +95,64 @@ static void build_nmv_component_cost_table(int *mvcost,
     vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
   vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
 
-  if (usehp) {
-    class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
-    class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
-    hp_cost[0] = vp9_cost_zero(mvcomp->hp);
-    hp_cost[1] = vp9_cost_one(mvcomp->hp);
-  }
+  // Always build the hp costs to avoid an uninitialized warning from gcc
+  class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+  class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+  hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+  hp_cost[1] = vp9_cost_one(mvcomp->hp);
+
   mvcost[0] = 0;
-  for (v = 1; v <= MV_MAX; ++v) {
-    int z, c, o, d, e, f, cost = 0;
-    z = v - 1;
-    c = vp9_get_mv_class(z, &o);
-    cost += class_cost[c];
+  // MV_CLASS_0
+  for (o = 0; o < (CLASS0_SIZE << 3); ++o) {
+    int d, e, f;
+    int cost = class_cost[MV_CLASS_0];
+    int v = o + 1;
     d = (o >> 3);               /* int mv data */
     f = (o >> 1) & 3;           /* fractional pel mv data */
-    e = (o & 1);                /* high precision mv data */
-    if (c == MV_CLASS_0) {
-      cost += class0_cost[d];
-    } else {
-      int i, b;
-      b = c + CLASS0_BITS - 1;  /* number of bits */
-      for (i = 0; i < b; ++i)
-        cost += bits_cost[i][((d >> i) & 1)];
-    }
-    if (c == MV_CLASS_0) {
-      cost += class0_fp_cost[d][f];
-    } else {
-      cost += fp_cost[f];
-    }
+    cost += class0_cost[d];
+    cost += class0_fp_cost[d][f];
     if (usehp) {
-      if (c == MV_CLASS_0) {
-        cost += class0_hp_cost[e];
-      } else {
-        cost += hp_cost[e];
-      }
+      e = (o & 1);                /* high precision mv data */
+      cost += class0_hp_cost[e];
     }
     mvcost[v] = cost + sign_cost[0];
     mvcost[-v] = cost + sign_cost[1];
   }
+  for (c = MV_CLASS_1; c < MV_CLASSES; ++c) {
+    int d;
+    for (d = 0; d < (1 << c); ++d) {
+      int f;
+      int whole_cost = class_cost[c];
+      int b = c + CLASS0_BITS - 1;  /* number of bits */
+      for (i = 0; i < b; ++i)
+        whole_cost += bits_cost[i][((d >> i) & 1)];
+      for (f = 0; f < 4; ++f) {
+        int cost = whole_cost + fp_cost[f];
+        int v = (CLASS0_SIZE << (c + 2)) + d * 8 + f * 2 /* + e */ + 1;
+        if (usehp) {
+          mvcost[v] = cost + hp_cost[0] + sign_cost[0];
+          mvcost[-v] = cost + hp_cost[0] + sign_cost[1];
+          if (v + 1 > MV_MAX) break;
+          mvcost[v + 1] = cost + hp_cost[1] + sign_cost[0];
+          mvcost[-v - 1] = cost + hp_cost[1] + sign_cost[1];
+        } else {
+          mvcost[v] = cost + sign_cost[0];
+          mvcost[-v] = cost + sign_cost[1];
+          if (v + 1 > MV_MAX) break;
+          mvcost[v + 1] = cost + sign_cost[0];
+          mvcost[-v - 1] = cost + sign_cost[1];
+        }
+      }
+    }
+  }
 }
 
 static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
                      vpx_prob upd_p) {
   const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
   const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
-                     cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
+                     cost_branch256(ct, new_p) + vp9_cost_one(upd_p) +
+                         (7 << VP9_PROB_COST_SHIFT);
   vpx_write(w, update, upd_p);
   if (update) {
     *cur_p = new_p;
@@ -206,7 +220,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vpx_writer* w,
   const MV diff = {mv->row - ref->row,
                    mv->col - ref->col};
   const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
-  usehp = usehp && vp9_use_mv_hp(ref);
+  usehp = usehp && use_mv_hp(ref);
 
   vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
   if (mv_joint_vertical(j))
@@ -230,13 +244,13 @@ void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+static void inc_mvs(const MODE_INFO *mi, const MB_MODE_INFO_EXT *mbmi_ext,
                     const int_mv mvs[2],
                     nmv_context_counts *counts) {
   int i;
 
-  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
-    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+  for (i = 0; i < 1 + has_second_ref(mi); ++i) {
+    const MV *ref = &mbmi_ext->ref_mvs[mi->ref_frame[i]][0].as_mv;
     const MV diff = {mvs[i].as_mv.row - ref->row,
                      mvs[i].as_mv.col - ref->col};
     vp9_inc_mv(&diff, counts);
@@ -246,24 +260,23 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
 void vp9_update_mv_count(ThreadData *td) {
   const MACROBLOCKD *xd = &td->mb.e_mbd;
   const MODE_INFO *mi = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
 
-  if (mbmi->sb_type < BLOCK_8X8) {
-    const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-    const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  if (mi->sb_type < BLOCK_8X8) {
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[mi->sb_type];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[mi->sb_type];
     int idx, idy;
 
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
         if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
+          inc_mvs(mi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
       }
     }
   } else {
-    if (mbmi->mode == NEWMV)
-      inc_mvs(mbmi, mbmi_ext, mbmi->mv, &td->counts->mv);
+    if (mi->mode == NEWMV)
+      inc_mvs(mi, mbmi_ext, mi->mv, &td->counts->mv);
   }
 }
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 72eafec..147f970 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -36,6 +36,7 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_complexity.h"
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_aq_variance.h"
@@ -47,6 +48,7 @@
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
@@ -60,8 +62,6 @@
 #define AM_SEGMENT_ID_INACTIVE 7
 #define AM_SEGMENT_ID_ACTIVE 0
 
-#define SHARP_FILTER_QTHRESH 0          /* Q threshold for 8-tap sharp filter */
-
 #define ALTREF_HIGH_PRECISION_MV 1      // Whether to use high precision mv
                                          //  for altref computation.
 #define HIGH_PRECISION_MV_QTHRESH 200   // Q threshold for high precision
@@ -86,6 +86,25 @@ FILE *kf_list;
 FILE *keyfile;
 #endif
 
+static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
+  {LEVEL_1,   829440,      36864,    200,    400,   2, 1,  4,  8},
+  {LEVEL_1_1, 2764800,     73728,    800,    1000,  2, 1,  4,  8},
+  {LEVEL_2,   4608000,     122880,   1800,   1500,  2, 1,  4,  8},
+  {LEVEL_2_1, 9216000,     245760,   3600,   2800,  2, 2,  4,  8},
+  {LEVEL_3,   20736000,    552960,   7200,   6000,  2, 4,  4,  8},
+  {LEVEL_3_1, 36864000,    983040,   12000,  10000, 2, 4,  4,  8},
+  {LEVEL_4,   83558400,    2228224,  18000,  16000, 4, 4,  4,  8},
+  {LEVEL_4_1, 160432128,   2228224,  30000,  18000, 4, 4,  5,  6},
+  {LEVEL_5,   311951360,   8912896,  60000,  36000, 6, 8,  6,  4},
+  {LEVEL_5_1, 588251136,   8912896,  120000, 46000, 8, 8,  10, 4},
+  // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
+  // they are finalized (currently TBD).
+  {LEVEL_5_2, 1176502272,  8912896,  180000, 0,     8, 8,  10, 4},
+  {LEVEL_6,   1176502272,  35651584, 180000, 0,     8, 16, 10, 4},
+  {LEVEL_6_1, 2353004544u, 35651584, 240000, 0,     8, 16, 10, 4},
+  {LEVEL_6_2, 4706009088u, 35651584, 480000, 0,     8, 16, 10, 4},
+};
+
 static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
   switch (mode) {
     case NORMAL:
@@ -116,11 +135,16 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
 // so memset cannot be used, instead only inactive blocks should be reset.
 static void suppress_active_map(VP9_COMP *cpi) {
   unsigned char *const seg_map = cpi->segmentation_map;
-  int i;
-  if (cpi->active_map.enabled || cpi->active_map.update)
-    for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+
+  if (cpi->active_map.enabled || cpi->active_map.update) {
+    const int rows = cpi->common.mi_rows;
+    const int cols = cpi->common.mi_cols;
+    int i;
+
+    for (i = 0; i < rows * cols; ++i)
       if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
         seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+  }
 }
 
 static void apply_active_map(VP9_COMP *cpi) {
@@ -159,6 +183,39 @@ static void apply_active_map(VP9_COMP *cpi) {
   }
 }
 
+static void init_level_info(Vp9LevelInfo *level_info) {
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+
+  memset(level_stats, 0, sizeof(*level_stats));
+  memset(level_spec, 0, sizeof(*level_spec));
+  level_spec->level = LEVEL_UNKNOWN;
+  level_spec->min_altref_distance = INT_MAX;
+}
+
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec * const level_spec) {
+  int i;
+  const Vp9LevelSpec *this_level;
+
+  vpx_clear_system_state();
+
+  for (i = 0; i < VP9_LEVELS; ++i) {
+    this_level = &vp9_level_defs[i];
+    if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) >
+        (double)this_level->max_luma_sample_rate ||
+        level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+        level_spec->average_bitrate > this_level->average_bitrate ||
+        level_spec->max_cpb_size > this_level->max_cpb_size ||
+        level_spec->compression_ratio < this_level->compression_ratio ||
+        level_spec->max_col_tiles > this_level->max_col_tiles ||
+        level_spec->min_altref_distance < this_level->min_altref_distance ||
+        level_spec->max_ref_frame_buffers > this_level->max_ref_frame_buffers)
+      continue;
+    break;
+  }
+  return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
+}
+
 int vp9_set_active_map(VP9_COMP* cpi,
                        unsigned char* new_map_16x16,
                        int rows,
@@ -375,6 +432,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+  vpx_free(cpi->consec_zero_mv);
+  cpi->consec_zero_mv = NULL;
+
   vp9_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp9_free_postproc_buffers(cm);
@@ -410,6 +470,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   memset(&cpi->svc.scaled_frames[0], 0,
          MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
 
+  vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+  memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
   vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
   memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
 
@@ -607,7 +670,7 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) {
     MODE_INFO **mi_8x8 = mi_8x8_ptr;
     uint8_t *cache = cache_ptr;
     for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
-      cache[0] = mi_8x8[0]->mbmi.segment_id;
+      cache[0] = mi_8x8[0]->segment_id;
     mi_8x8_ptr += cm->mi_stride;
     cache_ptr += cm->mi_cols;
   }
@@ -768,7 +831,6 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
 
   cpi->oxcf = *oxcf;
   cpi->framerate = oxcf->init_framerate;
-
   cm->profile = oxcf->profile;
   cm->bit_depth = oxcf->bit_depth;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -777,6 +839,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   cm->color_space = oxcf->color_space;
   cm->color_range = oxcf->color_range;
 
+  cpi->target_level = oxcf->target_level;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+
   cm->width = oxcf->width;
   cm->height = oxcf->height;
   alloc_compressor_data(cpi);
@@ -805,6 +870,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   cpi->ref_frame_flags = 0;
 
   init_buffer_indices(cpi);
+
+  vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
 }
 
 static void set_rc_buffer_sizes(RATE_CONTROL *rc,
@@ -1465,6 +1532,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   cm->color_space = oxcf->color_space;
   cm->color_range = oxcf->color_range;
 
+  cpi->target_level = oxcf->target_level;
+  cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+
   if (cm->profile <= PROFILE_1)
     assert(cm->bit_depth == VPX_BITS_8);
   else
@@ -1475,7 +1545,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+  }
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
@@ -1519,6 +1593,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
     cm->width = cpi->oxcf.width;
     cm->height = cpi->oxcf.height;
+    cpi->external_resize = 1;
   }
 
   if (cpi->initial_width) {
@@ -1530,10 +1605,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
       alloc_compressor_data(cpi);
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
+      cpi->external_resize = 0;
+    } else if (cm->mi_alloc_size == new_mi_size &&
+             (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+        vp9_alloc_loop_filter(cm);
     }
   }
+
   update_frame_size(cpi);
 
+  if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+    memset(cpi->consec_zero_mv, 0,
+               cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      vp9_cyclic_refresh_reset_resize(cpi);
+  }
+
   if ((cpi->svc.number_temporal_layers > 1 &&
       cpi->oxcf.rc_mode == VPX_CBR) ||
       ((cpi->svc.number_temporal_layers > 1 ||
@@ -1567,7 +1654,30 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 #endif
 #define log2f(x) (log (x) / (float) M_LOG2_E)
 
+/***********************************************************************
+ * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts'    *
+ ***********************************************************************
+ * The following 2 functions ('cal_nmvjointsadcost' and                *
+ * 'cal_nmvsadcosts') are used to calculate cost lookup tables         *
+ * used by 'vp9_diamond_search_sad'. The C implementation of the       *
+ * function is generic, but the AVX intrinsics optimised version       *
+ * relies on the following properties of the computed tables:          *
+ * For cal_nmvjointsadcost:                                            *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]     *
+ * For cal_nmvsadcosts:                                                *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                   *
+ *         (Equal costs for both components)                           *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                  *
+ *         (Cost function is even)                                     *
+ * If these do not hold, then the AVX optimised version of the         *
+ * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
+ * case you can revert to using the C function instead.                *
+ ***********************************************************************/
+
 static void cal_nmvjointsadcost(int *mvjointsadcost) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
   mvjointsadcost[0] = 600;
   mvjointsadcost[1] = 300;
   mvjointsadcost[2] = 300;
@@ -1575,6 +1685,9 @@ static void cal_nmvjointsadcost(int *mvjointsadcost) {
 }
 
 static void cal_nmvsadcosts(int *mvsadcost[2]) {
+  /*********************************************************************
+   * Warning: Read the comments above before modifying this function   *
+   *********************************************************************/
   int i = 1;
 
   mvsadcost[0][0] = 0;
@@ -1604,7 +1717,6 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
   } while (++i <= MV_MAX);
 }
 
-
 VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
                                 BufferPool *const pool) {
   unsigned int i;
@@ -1635,12 +1747,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   cpi->use_svc = 0;
   cpi->resize_state = 0;
+  cpi->external_resize = 0;
   cpi->resize_avg_qp = 0;
   cpi->resize_buffer_underflow = 0;
+  cpi->use_skin_detection = 0;
   cpi->common.buffer_pool = pool;
 
-  cpi->rc.high_source_sad = 0;
-
   init_config(cpi, oxcf);
   vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
 
@@ -1650,6 +1762,10 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   realloc_segmentation_maps(cpi);
 
+  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols,
+                             sizeof(*cpi->consec_zero_mv)));
+
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
@@ -1689,6 +1805,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   cpi->multi_arf_last_grp_enabled = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+
+  init_level_info(&cpi->level_info);
+
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
   cpi->b_calculate_blockiness = 1;
@@ -1727,8 +1846,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   }
 
   if (cpi->b_calculate_consistency) {
-    cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) *
-                                4 * cpi->common.mi_rows * cpi->common.mi_cols);
+    CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+                    vpx_malloc(sizeof(*cpi->ssim_vars) * 4 *
+                               cpi->common.mi_rows * cpi->common.mi_cols));
     cpi->worst_consistency = 100.0;
   }
 
@@ -1736,6 +1856,10 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
+  /*********************************************************************
+   * Warning: Read the comments around 'cal_nmvjointsadcost' and       *
+   * 'cal_nmvsadcosts' before modifying how these tables are computed. *
+   *********************************************************************/
   cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
   cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
   cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
@@ -1928,11 +2052,14 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
 
   return cpi;
 }
+
+#if CONFIG_INTERNAL_STATS
 #define SNPRINT(H, T) \
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
 
 #define SNPRINT2(H, T, V) \
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif  // CONFIG_INTERNAL_STATS
 
 void vp9_remove_compressor(VP9_COMP *cpi) {
   VP9_COMMON *cm;
@@ -1958,6 +2085,8 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
       const double dr =
           (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
       const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
@@ -2009,8 +2138,9 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
           SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
         }
 
-        fprintf(f, "%s\t    Time\n", headings);
-        fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
+        fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
+        fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
+                total_encode_time, rate_err, fabs(rate_err));
       }
 
       fclose(f);
@@ -2128,7 +2258,7 @@ static void encoder_variance(const uint8_t *a, int  a_stride,
 static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
                                       const uint8_t *b8, int  b_stride,
                                       int w, int h, uint64_t *sse,
-                                      uint64_t *sum) {
+                                      int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -2152,7 +2282,7 @@ static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
                                       int w, int h,
                                       unsigned int *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
                             &sse_long, &sum_long);
   *sse = (unsigned int)sse_long;
@@ -2574,10 +2704,6 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
 #if CONFIG_VP9_HIGHBITDEPTH
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                                    YV12_BUFFER_CONFIG *dst, int bd) {
-#else
-static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst) {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
   const int dst_w = dst->y_crop_width;
@@ -2589,19 +2715,18 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
   const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
   int x, y, i;
 
-  for (y = 0; y < dst_h; y += 16) {
-    for (x = 0; x < dst_w; x += 16) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        const int factor = (i == 0 || i == 3 ? 1 : 2);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    for (y = 0; y < dst_h; y += 16) {
+      const int y_q4 = y * (16 / factor) * src_h / dst_h;
+      for (x = 0; x < dst_w; x += 16) {
         const int x_q4 = x * (16 / factor) * src_w / dst_w;
-        const int y_q4 = y * (16 / factor) * src_h / dst_h;
-        const int src_stride = src_strides[i];
-        const int dst_stride = dst_strides[i];
         const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
-                                     src_stride + (x / factor) * src_w / dst_w;
+                                   src_stride + (x / factor) * src_w / dst_w;
         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
-#if CONFIG_VP9_HIGHBITDEPTH
         if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
           vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
                                kernel[x_q4 & 0xf], 16 * src_w / dst_w,
@@ -2613,18 +2738,49 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
                         kernel[y_q4 & 0xf], 16 * src_h / dst_h,
                         16 / factor, 16 / factor);
         }
+      }
+    }
+  }
+
+  vpx_extend_frame_borders(dst);
+}
 #else
+void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                  YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+  int x, y, i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const int factor = (i == 0 || i == 3 ? 1 : 2);
+    const int src_stride = src_strides[i];
+    const int dst_stride = dst_strides[i];
+    for (y = 0; y < dst_h; y += 16) {
+      const int y_q4 = y * (16 / factor) * src_h / dst_h;
+      for (x = 0; x < dst_w; x += 16) {
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                   src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
         vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
                       kernel[x_q4 & 0xf], 16 * src_w / dst_w,
                       kernel[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
 
   vpx_extend_frame_borders(dst);
 }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static int scale_down(VP9_COMP *cpi, int q) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2641,6 +2797,13 @@ static int scale_down(VP9_COMP *cpi, int q) {
   return scale;
 }
 
+static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  return (rc->projected_frame_size > ((high_limit * 3) / 2)) ||
+         (rc->projected_frame_size < (low_limit / 2));
+}
+
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
 static int recode_loop_test(VP9_COMP *cpi,
@@ -2652,6 +2815,7 @@ static int recode_loop_test(VP9_COMP *cpi,
   int force_recode = 0;
 
   if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      big_rate_miss(cpi, high_limit, low_limit) ||
       (cpi->sf.recode_loop == ALLOW_RECODE) ||
       (frame_is_kfgfarf &&
        (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
@@ -2693,7 +2857,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
   } else if (vp9_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
-    // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+    // vp9_get_refresh_mask() we left it in the GF slot and, if
     // we're updating the GF with the current decoded frame, we save it to the
     // ARF slot instead.
     // We now have to update the ARF with the current frame and swap gld_fb_idx
@@ -2750,7 +2914,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
              sizeof(cpi->interp_filter_selected[0]));
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->denoiser.denoising_level > kDenLowLow) {
     vp9_denoiser_update_frame_info(&cpi->denoiser,
                                    *cpi->Source,
                                    cpi->common.frame_type,
@@ -2760,6 +2925,22 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
                                    cpi->resize_pending);
   }
 #endif
+  if (is_one_pass_cbr_svc(cpi)) {
+    // Keep track of frame index for each reference frame.
+    SVC *const svc = &cpi->svc;
+    if (cm->frame_type == KEY_FRAME) {
+      svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    } else {
+      if (cpi->refresh_last_frame)
+        svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_golden_frame)
+        svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
+      if (cpi->refresh_alt_ref_frame)
+        svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+    }
+  }
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
@@ -2768,6 +2949,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
 
   if (xd->lossless) {
       lf->filter_level = 0;
+      lf->last_filt_level = 0;
   } else {
     struct vpx_usec_timer timer;
 
@@ -2775,7 +2957,16 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
 
     vpx_usec_timer_start(&timer);
 
-    vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+    if (!cpi->rc.is_src_frame_alt_ref) {
+      if ((cpi->common.frame_type == KEY_FRAME) &&
+          (!cpi->rc.this_key_frame_forced)) {
+        lf->last_filt_level = 0;
+      }
+      vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+      lf->last_filt_level = lf->filter_level;
+    } else {
+      lf->filter_level = 0;
+    }
 
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
@@ -2796,16 +2987,16 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   vpx_extend_frame_inner_borders(cm->frame_to_show);
 }
 
-static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+static INLINE void alloc_frame_mvs(VP9_COMMON *const cm,
                                    int buffer_idx) {
   RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
   if (new_fb_ptr->mvs == NULL ||
       new_fb_ptr->mi_rows < cm->mi_rows ||
       new_fb_ptr->mi_cols < cm->mi_cols) {
     vpx_free(new_fb_ptr->mvs);
-    new_fb_ptr->mvs =
-      (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                           sizeof(*new_fb_ptr->mvs));
+    CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+                    (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                                         sizeof(*new_fb_ptr->mvs)));
     new_fb_ptr->mi_rows = cm->mi_rows;
     new_fb_ptr->mi_cols = cm->mi_cols;
   }
@@ -2843,12 +3034,13 @@ void vp9_scale_references(VP9_COMP *cpi) {
         if (force_scaling ||
             new_fb_ptr->buf.y_crop_width != cm->width ||
             new_fb_ptr->buf.y_crop_height != cm->height) {
-          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
-                                   cm->width, cm->height,
-                                   cm->subsampling_x, cm->subsampling_y,
-                                   cm->use_highbitdepth,
-                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                                   NULL, NULL, NULL);
+          if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+                                       cm->subsampling_x, cm->subsampling_y,
+                                       cm->use_highbitdepth,
+                                       VP9_ENC_BORDER_IN_PIXELS,
+                                       cm->byte_alignment, NULL, NULL, NULL))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
           scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
@@ -2868,19 +3060,31 @@ void vp9_scale_references(VP9_COMP *cpi) {
         if (force_scaling ||
             new_fb_ptr->buf.y_crop_width != cm->width ||
             new_fb_ptr->buf.y_crop_height != cm->height) {
-          vpx_realloc_frame_buffer(&new_fb_ptr->buf,
-                                   cm->width, cm->height,
-                                   cm->subsampling_x, cm->subsampling_y,
-                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                                   NULL, NULL, NULL);
-          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+                                       cm->subsampling_x, cm->subsampling_y,
+                                       VP9_ENC_BORDER_IN_PIXELS,
+                                       cm->byte_alignment, NULL, NULL, NULL))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffer");
+          vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       } else {
-        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-        RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+        int buf_idx;
+        RefCntBuffer *buf = NULL;
+        if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+          // Check for release of scaled reference.
+          buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
+          buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL;
+          if (buf != NULL) {
+            --buf->ref_count;
+            cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+          }
+        }
+        buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        buf = &pool->frame_bufs[buf_idx];
         buf->buf.y_crop_width = ref->y_crop_width;
         buf->buf.y_crop_height = ref->y_crop_height;
         cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
@@ -2959,18 +3163,49 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
 
   vpx_clear_system_state();
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    recon_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  } else {
+    recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  }
+#else
   recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
+  if (cpi->twopass.total_left_stats.coded_error != 0.0) {
+    double dc_quant_devisor;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        dc_quant_devisor = 4.0;
+        break;
+      case VPX_BITS_10:
+        dc_quant_devisor = 16.0;
+        break;
+      case VPX_BITS_12:
+        dc_quant_devisor = 64.0;
+        break;
+      default:
+        assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+        break;
+    }
+#else
+    dc_quant_devisor = 4.0;
+#endif
 
-  if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
         "%6d %6d %5d %5d %5d "
         "%10"PRId64" %10.3lf"
-        "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+        "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->td.rd_counts.m_search_count,
+        cpi->td.rd_counts.ex_search_count,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
@@ -2985,7 +3220,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
         (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
         cpi->rc.total_actual_bits, cm->base_qindex,
         vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
-        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+        (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) /
+            dc_quant_devisor,
         vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
                                 cm->bit_depth),
         cpi->rc.avg_q,
@@ -2998,8 +3234,10 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
             (1 + cpi->twopass.total_left_stats.coded_error),
         cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
         cpi->twopass.kf_zeromotion_pct,
-        cpi->twopass.fr_content_type);
-
+        cpi->twopass.fr_content_type,
+        cm->lf.filter_level,
+        cm->seg.aq_av_offset);
+  }
   fclose(f);
 
   if (0) {
@@ -3074,7 +3312,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
   if (oxcf->pass == 2 && cpi->sf.static_segmentation)
     configure_static_seg_features(cpi);
 
-#if CONFIG_VP9_POSTPROC
+#if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
   if (oxcf->noise_sensitivity > 0) {
     int l = 0;
     switch (oxcf->noise_sensitivity) {
@@ -3105,12 +3343,14 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (cpi->oxcf.noise_sensitivity > 0 &&
       !cpi->denoiser.frame_buffer_initialized) {
-    vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
-                       cm->subsampling_x, cm->subsampling_y,
+    if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                       cm->use_highbitdepth,
+                           cm->use_highbitdepth,
 #endif
-                       VP9_ENC_BORDER_IN_PIXELS);
+                           VP9_ENC_BORDER_IN_PIXELS))
+      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate denoiser");
   }
 }
 #endif
@@ -3160,6 +3400,7 @@ static void set_frame_size(VP9_COMP *cpi) {
     // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
     set_mv_search_params(cpi);
 
+    vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
 #if CONFIG_VP9_TEMPORAL_DENOISING
     // Reset the denoiser on the resized frame.
     if (cpi->oxcf.noise_sensitivity > 0) {
@@ -3182,14 +3423,15 @@ static void set_frame_size(VP9_COMP *cpi) {
   alloc_frame_mvs(cm, cm->new_fb_idx);
 
   // Reset the frame pointers to the current frame size.
-  vpx_realloc_frame_buffer(get_frame_new_buffer(cm),
-                           cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
+  if (vpx_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                           cm->use_highbitdepth,
+                               cm->use_highbitdepth,
 #endif
-                           VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                           NULL, NULL, NULL);
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
 
   alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
@@ -3234,43 +3476,70 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
 
   set_frame_size(cpi);
 
-  cpi->Source = vp9_scale_if_required(cm,
-                                      cpi->un_scaled_source,
-                                      &cpi->scaled_source,
-                                      (cpi->oxcf.pass == 0));
-
+  if (is_one_pass_cbr_svc(cpi) &&
+      cpi->un_scaled_source->y_width == cm->width << 2 &&
+      cpi->un_scaled_source->y_height == cm->height << 2 &&
+      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
+      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+    cpi->Source = vp9_svc_twostage_scale(cm,
+                                         cpi->un_scaled_source,
+                                         &cpi->scaled_source,
+                                         &cpi->svc.scaled_temp);
+  } else {
+    cpi->Source = vp9_scale_if_required(cm,
+                                        cpi->un_scaled_source,
+                                        &cpi->scaled_source,
+                                        (cpi->oxcf.pass == 0));
+  }
   // Avoid scaling last_source unless its needed.
-  // Last source is currently only used for screen-content mode,
-  // or if partition_search_type == SOURCE_VAR_BASED_PARTITION.
+  // Last source is needed if vp9_avg_source_sad() is used, or if
+  // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
+  // estimation is enabled.
   if (cpi->unscaled_last_source != NULL &&
       (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
-      cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION))
+      (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+      cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) ||
+      cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
+      cpi->noise_estimate.enabled))
     cpi->Last_Source = vp9_scale_if_required(cm,
                                              cpi->unscaled_last_source,
                                              &cpi->scaled_last_source,
                                              (cpi->oxcf.pass == 0));
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
-    vp9_denoiser_update_noise_estimate(cpi);
+  if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) {
+    memset(cpi->consec_zero_mv, 0,
+           cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
   }
-#endif
+
+  vp9_update_noise_estimate(cpi);
 
   if (cpi->oxcf.pass == 0 &&
-      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.mode == REALTIME &&
+      cpi->oxcf.speed >= 5 &&
       cpi->resize_state == 0 &&
       cm->frame_type != KEY_FRAME &&
-      cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+      (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+       cpi->oxcf.rc_mode == VPX_VBR))
     vp9_avg_source_sad(cpi);
 
-  if (frame_is_intra_only(cm) == 0) {
+  // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
+  // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this
+  // frame-level upsampling.
+  if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
     vp9_scale_references(cpi);
   }
 
   set_size_independent_vars(cpi);
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
+  if (cpi->oxcf.speed >= 5 &&
+      cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    cpi->use_skin_detection = 1;
+  }
+
   vp9_set_quantizer(cm, q);
   vp9_set_variance_partition_thresholds(cpi, q);
 
@@ -3281,6 +3550,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
   // exclusive.
   if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
     vp9_vaq_frame_setup(cpi);
+  } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+    vp9_360aq_frame_setup(cpi);
   } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
     vp9_setup_in_frame_q_adj(cpi);
   } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
@@ -3411,6 +3682,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     // exclusive.
     if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
       vp9_vaq_frame_setup(cpi);
+    } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+      vp9_360aq_frame_setup(cpi);
     } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
       vp9_setup_in_frame_q_adj(cpi);
     }
@@ -3642,6 +3915,25 @@ static void set_ext_overrides(VP9_COMP *cpi) {
   }
 }
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp) {
+  if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+      cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
+    scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+#else
+    vp9_scale_and_extend_frame(unscaled, scaled_temp);
+    vp9_scale_and_extend_frame(scaled_temp, scaled);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return scaled;
+  } else {
+    return unscaled;
+  }
+}
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
@@ -3649,13 +3941,17 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (use_normative_scaler)
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
       scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
 #else
-    if (use_normative_scaler)
-      scale_and_extend_frame(unscaled, scaled);
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
+      vp9_scale_and_extend_frame(unscaled, scaled);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -3797,14 +4093,23 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   }
 
   // For 1 pass CBR, check if we are dropping this frame.
-  // Never drop on key frame.
+  // For spatial layers, for now only check for frame-dropping on first spatial
+  // layer, and if decision is to drop, we drop whole super-frame.
   if (oxcf->pass == 0 &&
       oxcf->rc_mode == VPX_CBR &&
       cm->frame_type != KEY_FRAME) {
-    if (vp9_rc_drop_frame(cpi)) {
+    if (vp9_rc_drop_frame(cpi) ||
+        (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) {
       vp9_rc_postencode_update_drop_frame(cpi);
       ++cm->current_video_frame;
       cpi->ext_refresh_frame_flags_pending = 0;
+      cpi->svc.rc_drop_superframe = 1;
+      // TODO(marpan): Advancing the svc counters on dropped frames can break
+      // the referencing scheme for the fixed svc patterns defined in
+      // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but
+      // for now, don't advance the svc frame counters on dropped frame.
+      // if (cpi->use_svc)
+      //   vp9_inc_frame_in_layer(cpi);
       return;
     }
   }
@@ -4020,13 +4325,16 @@ static void check_initial_width(VP9_COMP *cpi,
 int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
   struct vpx_usec_timer timer;
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int use_highbitdepth = sd->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 #else
   check_initial_width(cpi, subsampling_x, subsampling_y);
@@ -4171,6 +4479,124 @@ static void adjust_image_stat(double y, double u, double v, double all,
 }
 #endif  // CONFIG_INTERNAL_STATS
 
+static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  Vp9LevelInfo *const level_info = &cpi->level_info;
+  Vp9LevelSpec *const level_spec = &level_info->level_spec;
+  Vp9LevelStats *const level_stats = &level_info->level_stats;
+  int i, idx;
+  uint64_t luma_samples, dur_end;
+  const uint32_t luma_pic_size = cm->width * cm->height;
+  double cpb_data_size;
+
+  vpx_clear_system_state();
+
+  // update level_stats
+  level_stats->total_compressed_size += *size;
+  if (cm->show_frame) {
+    level_stats->total_uncompressed_size +=
+        luma_pic_size +
+        2 * (luma_pic_size >> (cm->subsampling_x + cm->subsampling_y));
+    level_stats->time_encoded =
+        (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+        (double)TICKS_PER_SEC;
+  }
+
+  if (arf_src_index > 0) {
+    if (!level_stats->seen_first_altref) {
+      level_stats->seen_first_altref = 1;
+    } else if (level_stats->frames_since_last_altref <
+             level_spec->min_altref_distance) {
+      level_spec->min_altref_distance = level_stats->frames_since_last_altref;
+    }
+    level_stats->frames_since_last_altref = 0;
+  } else {
+    ++level_stats->frames_since_last_altref;
+  }
+
+  if (level_stats->frame_window_buffer.len < FRAME_WINDOW_SIZE - 1) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len++) % FRAME_WINDOW_SIZE;
+  } else {
+    idx = level_stats->frame_window_buffer.start;
+    level_stats->frame_window_buffer.start = (idx + 1) % FRAME_WINDOW_SIZE;
+  }
+  level_stats->frame_window_buffer.buf[idx].ts = cpi->last_time_stamp_seen;
+  level_stats->frame_window_buffer.buf[idx].size = (uint32_t)(*size);
+  level_stats->frame_window_buffer.buf[idx].luma_samples = luma_pic_size;
+
+  if (cm->frame_type == KEY_FRAME) {
+    level_stats->ref_refresh_map = 0;
+  } else {
+    int count = 0;
+    level_stats->ref_refresh_map |= vp9_get_refresh_mask(cpi);
+    // Also need to consider the case where the encoder refers to a buffer
+    // that has been implicitly refreshed after encoding a keyframe.
+    if (!cm->intra_only) {
+      level_stats->ref_refresh_map |= (1 << cpi->lst_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->gld_fb_idx);
+      level_stats->ref_refresh_map |= (1 << cpi->alt_fb_idx);
+    }
+    for (i = 0; i < REF_FRAMES; ++i) {
+      count += (level_stats->ref_refresh_map >> i) & 1;
+    }
+    if (count > level_spec->max_ref_frame_buffers) {
+      level_spec->max_ref_frame_buffers = count;
+    }
+  }
+
+  // update average_bitrate
+  level_spec->average_bitrate =
+      (double)level_stats->total_compressed_size / 125.0 /
+      level_stats->time_encoded;
+
+  // update max_luma_sample_rate
+  luma_samples = 0;
+  for (i = 0; i < level_stats->frame_window_buffer.len; ++i) {
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    if (i == 0) {
+      dur_end = level_stats->frame_window_buffer.buf[idx].ts;
+    }
+    if (dur_end - level_stats->frame_window_buffer.buf[idx].ts >=
+        TICKS_PER_SEC) {
+      break;
+    }
+    luma_samples += level_stats->frame_window_buffer.buf[idx].luma_samples;
+  }
+  if (luma_samples > level_spec->max_luma_sample_rate) {
+    level_spec->max_luma_sample_rate = luma_samples;
+  }
+
+  // update max_cpb_size
+  cpb_data_size = 0;
+  for (i = 0; i < CPB_WINDOW_SIZE; ++i) {
+    if (i >= level_stats->frame_window_buffer.len) break;
+    idx = (level_stats->frame_window_buffer.start +
+           level_stats->frame_window_buffer.len - 1 - i) % FRAME_WINDOW_SIZE;
+    cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+  }
+  cpb_data_size = cpb_data_size / 125.0;
+  if (cpb_data_size > level_spec->max_cpb_size) {
+    level_spec->max_cpb_size = cpb_data_size;
+  }
+
+  // update max_luma_picture_size
+  if (luma_pic_size > level_spec->max_luma_picture_size) {
+    level_spec->max_luma_picture_size = luma_pic_size;
+  }
+
+  // update compression_ratio
+  level_spec->compression_ratio =
+      (double)level_stats->total_uncompressed_size * cm->bit_depth /
+      level_stats->total_compressed_size / 8.0;
+
+  // update max_col_tiles
+  if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) {
+    level_spec->max_col_tiles = (1 << cm->log2_tile_cols);
+  }
+}
+
 int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
@@ -4228,6 +4654,20 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     arf_src_index = 0;
 
   if (arf_src_index) {
+    for (i = 0; i <= arf_src_index; ++i) {
+      struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
+      // Avoid creating an alt-ref if there's a forced keyframe pending.
+      if (e == NULL) {
+        break;
+      } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+        arf_src_index = 0;
+        flush = 1;
+        break;
+      }
+    }
+  }
+
+  if (arf_src_index) {
     assert(arf_src_index <= rc->frames_to_key);
 
     if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
@@ -4247,7 +4687,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
       cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
 #endif
 
-      if (oxcf->arnr_max_frames > 0) {
+      if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) {
         // Produce the filtered ARF frame.
         vp9_temporal_filter(cpi, arf_src_index);
         vpx_extend_frame_borders(&cpi->alt_ref_buffer);
@@ -4427,6 +4867,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
 
+  if (cpi->keep_level_stats && oxcf->pass != 1)
+    update_level_info(cpi, size, arf_src_index);
+
 #if CONFIG_INTERNAL_STATS
 
   if (oxcf->pass != 1) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 159c03a..128b623 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -20,6 +20,7 @@
 #include "vpx_dsp/ssim.h"
 #endif
 #include "vpx_dsp/variance.h"
+#include "vpx_ports/system_state.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -35,6 +36,7 @@
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
@@ -50,6 +52,9 @@
 extern "C" {
 #endif
 
+// vp9 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000
+
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -111,6 +116,7 @@ typedef enum {
   VARIANCE_AQ = 1,
   COMPLEXITY_AQ = 2,
   CYCLIC_REFRESH_AQ = 3,
+  EQUATOR360_AQ = 4,
   AQ_MODE_COUNT  // This should always be the last member of the enum
 } AQ_MODE;
 
@@ -127,7 +133,7 @@ typedef struct VP9EncoderConfig {
   int height;  // height of data passed to the compressor
   unsigned int input_bit_depth;  // Input bit depth.
   double init_framerate;  // set to passed in framerate
-  int64_t target_bandwidth;  // bandwidth to be used in kilobits per second
+  int64_t target_bandwidth;  // bandwidth to be used in bits per second
 
   int noise_sensitivity;  // pre processing blur: recommendation 0
   int sharpness;  // sharpening output: recommendation 0:
@@ -225,6 +231,8 @@ typedef struct VP9EncoderConfig {
 
   int max_threads;
 
+  int target_level;
+
   vpx_fixed_buf_t two_pass_stats_in;
   struct vpx_codec_pkt_list *output_pkt_list;
 
@@ -259,6 +267,8 @@ typedef struct RD_COUNTS {
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int m_search_count;
+  int ex_search_count;
 } RD_COUNTS;
 
 typedef struct ThreadData {
@@ -291,6 +301,69 @@ typedef struct IMAGE_STAT {
   double worst;
 } ImageStat;
 
+#define CPB_WINDOW_SIZE 4
+#define FRAME_WINDOW_SIZE 128
+#define SAMPLE_RATE_GRACE_P 0.015
+#define VP9_LEVELS 14
+
+typedef enum {
+  LEVEL_UNKNOWN = 0,
+  LEVEL_1 = 10,
+  LEVEL_1_1 = 11,
+  LEVEL_2 = 20,
+  LEVEL_2_1 = 21,
+  LEVEL_3 = 30,
+  LEVEL_3_1 = 31,
+  LEVEL_4 = 40,
+  LEVEL_4_1 = 41,
+  LEVEL_5 = 50,
+  LEVEL_5_1 = 51,
+  LEVEL_5_2 = 52,
+  LEVEL_6 = 60,
+  LEVEL_6_1 = 61,
+  LEVEL_6_2 = 62,
+  LEVEL_MAX = 255
+} VP9_LEVEL;
+
+typedef struct {
+  VP9_LEVEL level;
+  uint64_t max_luma_sample_rate;
+  uint32_t max_luma_picture_size;
+  double average_bitrate;  // in kilobits per second
+  double max_cpb_size;  // in kilobits
+  double compression_ratio;
+  uint8_t max_col_tiles;
+  uint32_t min_altref_distance;
+  uint8_t max_ref_frame_buffers;
+} Vp9LevelSpec;
+
+typedef struct {
+  int64_t ts;  // timestamp
+  uint32_t luma_samples;
+  uint32_t size;  // in bytes
+} FrameRecord;
+
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  uint8_t start;
+  uint8_t len;
+} FrameWindowBuffer;
+
+typedef struct {
+  uint8_t seen_first_altref;
+  uint32_t frames_since_last_altref;
+  uint64_t total_compressed_size;
+  uint64_t total_uncompressed_size;
+  double time_encoded;  // in seconds
+  FrameWindowBuffer frame_window_buffer;
+  int ref_refresh_map;
+} Vp9LevelStats;
+
+typedef struct {
+  Vp9LevelStats level_stats;
+  Vp9LevelSpec level_spec;
+} Vp9LevelInfo;
+
 typedef struct VP9_COMP {
   QUANTS quants;
   ThreadData td;
@@ -335,7 +408,7 @@ typedef struct VP9_COMP {
   YV12_BUFFER_CONFIG last_frame_uf;
 
   TOKENEXTRA *tile_tok[4][1 << 6];
-  unsigned int tok_count[4][1 << 6];
+  uint32_t tok_count[4][1 << 6];
 
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
@@ -367,7 +440,7 @@ typedef struct VP9_COMP {
 
   SPEED_FEATURES sf;
 
-  unsigned int max_mv_magnitude;
+  uint32_t max_mv_magnitude;
   int mv_step_param;
 
   int allow_comp_inter_inter;
@@ -379,10 +452,10 @@ typedef struct VP9_COMP {
   // clips, and 300 for < HD clips.
   int encode_breakout;
 
-  unsigned char *segmentation_map;
+  uint8_t *segmentation_map;
 
   // segment threashold for encode breakout
-  int  segment_encode_breakout[MAX_SEGMENTS];
+  int segment_encode_breakout[MAX_SEGMENTS];
 
   CYCLIC_REFRESH *cyclic_refresh;
   ActiveMap active_map;
@@ -404,11 +477,10 @@ typedef struct VP9_COMP {
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
 
-
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
 
-  int    count;
+  int count;
   uint64_t total_sq_error;
   uint64_t total_samples;
   ImageStat psnr;
@@ -469,7 +541,7 @@ typedef struct VP9_COMP {
 
   int mbmode_cost[INTRA_MODES];
   unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
-  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
+  int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
@@ -484,12 +556,22 @@ typedef struct VP9_COMP {
 
   int resize_pending;
   int resize_state;
+  int external_resize;
   int resize_scale_num;
   int resize_scale_den;
   int resize_avg_qp;
   int resize_buffer_underflow;
   int resize_count;
 
+  int use_skin_detection;
+
+  int target_level;
+
+  NOISE_ESTIMATE noise_estimate;
+
+  // Count on how many consecutive times a block uses small/zeromv for encoding.
+  uint8_t *consec_zero_mv;
+
   // VAR_BASED_PARTITION thresholds
   // 0 - threshold_64x64; 1 - threshold_32x32;
   // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
@@ -503,6 +585,9 @@ typedef struct VP9_COMP {
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+
+  int keep_level_stats;
+  Vp9LevelInfo level_info;
 } VP9_COMP;
 
 void vp9_initialize_enc(void);
@@ -614,6 +699,11 @@ void vp9_update_reference_frames(VP9_COMP *cpi);
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
 
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+                                           YV12_BUFFER_CONFIG *unscaled,
+                                           YV12_BUFFER_CONFIG *scaled,
+                                           YV12_BUFFER_CONFIG *scaled_temp);
+
 YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
                                           YV12_BUFFER_CONFIG *unscaled,
                                           YV12_BUFFER_CONFIG *scaled,
@@ -653,6 +743,8 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
+
 void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index ad25712..1d1926c 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -30,6 +30,10 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
             for (n = 0; n < ENTROPY_TOKENS; n++)
               td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+
+  // Counts of all motion searches and exhuastive mesh searches.
+  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
 }
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 30738b5..53a3ec7 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,11 +41,8 @@
 #define OUTPUT_FPF          0
 #define ARF_STATS_OUTPUT    0
 
-#define GROUP_ADAPTIVE_MAXQ 1
-
 #define BOOST_BREAKOUT      12.5
 #define BOOST_FACTOR        12.5
-#define ERR_DIVISOR         128.0
 #define FACTOR_PT_LOW       0.70
 #define FACTOR_PT_HIGH      0.90
 #define FIRST_PASS_Q        10.0
@@ -65,7 +62,7 @@
 
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
-#define NCOUNT_FRAME_II_THRESH 5.0
+
 
 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
@@ -115,7 +112,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
 
     fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
             "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
-            "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
+            "%12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
             stats->frame,
             stats->weight,
             stats->intra_error,
@@ -126,6 +123,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
             stats->pcnt_second_ref,
             stats->pcnt_neutral,
             stats->intra_skip_pct,
+            stats->intra_smooth_pct,
             stats->inactive_zone_rows,
             stats->inactive_zone_cols,
             stats->MVr,
@@ -155,82 +153,85 @@ static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
 #endif
 
 static void zero_stats(FIRSTPASS_STATS *section) {
-  section->frame = 0.0;
-  section->weight = 0.0;
-  section->intra_error = 0.0;
-  section->coded_error = 0.0;
-  section->sr_coded_error = 0.0;
-  section->pcnt_inter  = 0.0;
-  section->pcnt_motion  = 0.0;
-  section->pcnt_second_ref = 0.0;
-  section->pcnt_neutral = 0.0;
-  section->intra_skip_pct = 0.0;
+  section->frame              = 0.0;
+  section->weight             = 0.0;
+  section->intra_error        = 0.0;
+  section->coded_error        = 0.0;
+  section->sr_coded_error     = 0.0;
+  section->pcnt_inter         = 0.0;
+  section->pcnt_motion        = 0.0;
+  section->pcnt_second_ref    = 0.0;
+  section->pcnt_neutral       = 0.0;
+  section->intra_skip_pct     = 0.0;
+  section->intra_smooth_pct   = 0.0;
   section->inactive_zone_rows = 0.0;
   section->inactive_zone_cols = 0.0;
-  section->MVr = 0.0;
-  section->mvr_abs     = 0.0;
-  section->MVc        = 0.0;
-  section->mvc_abs     = 0.0;
-  section->MVrv       = 0.0;
-  section->MVcv       = 0.0;
-  section->mv_in_out_count  = 0.0;
-  section->new_mv_count = 0.0;
-  section->count      = 0.0;
-  section->duration   = 1.0;
-  section->spatial_layer_id = 0;
+  section->MVr                = 0.0;
+  section->mvr_abs            = 0.0;
+  section->MVc                = 0.0;
+  section->mvc_abs            = 0.0;
+  section->MVrv               = 0.0;
+  section->MVcv               = 0.0;
+  section->mv_in_out_count    = 0.0;
+  section->new_mv_count       = 0.0;
+  section->count              = 0.0;
+  section->duration           = 1.0;
+  section->spatial_layer_id   = 0;
 }
 
 static void accumulate_stats(FIRSTPASS_STATS *section,
                              const FIRSTPASS_STATS *frame) {
-  section->frame += frame->frame;
-  section->weight += frame->weight;
-  section->spatial_layer_id = frame->spatial_layer_id;
-  section->intra_error += frame->intra_error;
-  section->coded_error += frame->coded_error;
-  section->sr_coded_error += frame->sr_coded_error;
-  section->pcnt_inter  += frame->pcnt_inter;
-  section->pcnt_motion += frame->pcnt_motion;
-  section->pcnt_second_ref += frame->pcnt_second_ref;
-  section->pcnt_neutral += frame->pcnt_neutral;
-  section->intra_skip_pct += frame->intra_skip_pct;
+  section->frame              += frame->frame;
+  section->weight             += frame->weight;
+  section->spatial_layer_id    = frame->spatial_layer_id;
+  section->intra_error        += frame->intra_error;
+  section->coded_error        += frame->coded_error;
+  section->sr_coded_error     += frame->sr_coded_error;
+  section->pcnt_inter         += frame->pcnt_inter;
+  section->pcnt_motion        += frame->pcnt_motion;
+  section->pcnt_second_ref    += frame->pcnt_second_ref;
+  section->pcnt_neutral       += frame->pcnt_neutral;
+  section->intra_skip_pct     += frame->intra_skip_pct;
+  section->intra_smooth_pct   += frame->intra_smooth_pct;
   section->inactive_zone_rows += frame->inactive_zone_rows;
   section->inactive_zone_cols += frame->inactive_zone_cols;
-  section->MVr += frame->MVr;
-  section->mvr_abs     += frame->mvr_abs;
-  section->MVc        += frame->MVc;
-  section->mvc_abs     += frame->mvc_abs;
-  section->MVrv       += frame->MVrv;
-  section->MVcv       += frame->MVcv;
-  section->mv_in_out_count  += frame->mv_in_out_count;
-  section->new_mv_count += frame->new_mv_count;
-  section->count      += frame->count;
-  section->duration   += frame->duration;
+  section->MVr                += frame->MVr;
+  section->mvr_abs            += frame->mvr_abs;
+  section->MVc                += frame->MVc;
+  section->mvc_abs            += frame->mvc_abs;
+  section->MVrv               += frame->MVrv;
+  section->MVcv               += frame->MVcv;
+  section->mv_in_out_count    += frame->mv_in_out_count;
+  section->new_mv_count       += frame->new_mv_count;
+  section->count              += frame->count;
+  section->duration           += frame->duration;
 }
 
 static void subtract_stats(FIRSTPASS_STATS *section,
                            const FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->weight -= frame->weight;
-  section->intra_error -= frame->intra_error;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->pcnt_inter  -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->frame              -= frame->frame;
+  section->weight             -= frame->weight;
+  section->intra_error        -= frame->intra_error;
+  section->coded_error        -= frame->coded_error;
+  section->sr_coded_error     -= frame->sr_coded_error;
+  section->pcnt_inter         -= frame->pcnt_inter;
+  section->pcnt_motion        -= frame->pcnt_motion;
+  section->pcnt_second_ref    -= frame->pcnt_second_ref;
+  section->pcnt_neutral       -= frame->pcnt_neutral;
+  section->intra_skip_pct     -= frame->intra_skip_pct;
+  section->intra_smooth_pct   -= frame->intra_smooth_pct;
   section->inactive_zone_rows -= frame->inactive_zone_rows;
   section->inactive_zone_cols -= frame->inactive_zone_cols;
-  section->MVr -= frame->MVr;
-  section->mvr_abs     -= frame->mvr_abs;
-  section->MVc        -= frame->MVc;
-  section->mvc_abs     -= frame->mvc_abs;
-  section->MVrv       -= frame->MVrv;
-  section->MVcv       -= frame->MVcv;
-  section->mv_in_out_count  -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count      -= frame->count;
-  section->duration   -= frame->duration;
+  section->MVr                -= frame->MVr;
+  section->mvr_abs            -= frame->mvr_abs;
+  section->MVc                -= frame->MVc;
+  section->mvc_abs            -= frame->mvc_abs;
+  section->MVrv               -= frame->MVrv;
+  section->MVcv               -= frame->MVcv;
+  section->mv_in_out_count    -= frame->mv_in_out_count;
+  section->new_mv_count       -= frame->new_mv_count;
+  section->count              -= frame->count;
+  section->duration           -= frame->duration;
 }
 
 // Calculate an active area of the image that discounts formatting
@@ -396,7 +397,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   MV tmp_mv = {0, 0};
   MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
   int num00, tmp_err, n;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
 
@@ -490,7 +491,63 @@ static void set_first_pass_params(VP9_COMP *cpi) {
   cpi->rc.frames_to_key = INT_MAX;
 }
 
+// This threshold is used to track blocks where to all intents and purposes
+// the intra prediction error 0. Though the metric we test against
+// is technically a sse we are mainly interested in blocks where all the pixels
+// in the 8 bit domain have an error of <= 1 (where error = sse) so a
+// linear scaling for 10 and 12 bit gives similar results.
 #define UL_INTRA_THRESH 50
+static int get_ul_intra_threshold(VP9_COMMON *cm) {
+  int ret_val = UL_INTRA_THRESH;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        ret_val = UL_INTRA_THRESH;
+        break;
+      case VPX_BITS_10:
+        ret_val = UL_INTRA_THRESH >> 2;
+        break;
+      case VPX_BITS_12:
+        ret_val = UL_INTRA_THRESH >> 4;
+        break;
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+#else
+  (void) cm;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return ret_val;
+}
+
+#define SMOOTH_INTRA_THRESH 4000
+static int get_smooth_intra_threshold(VP9_COMMON *cm) {
+  int ret_val = SMOOTH_INTRA_THRESH;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        ret_val = SMOOTH_INTRA_THRESH;
+        break;
+      case VPX_BITS_10:
+        ret_val = SMOOTH_INTRA_THRESH >> 2;
+        break;
+      case VPX_BITS_12:
+        ret_val = SMOOTH_INTRA_THRESH >> 4;
+        break;
+      default:
+        assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+                    "VPX_BITS_10 or VPX_BITS_12");
+    }
+  }
+#else
+  (void) cm;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return ret_val;
+}
+
 #define INVALID_ROW -1
 void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
@@ -517,6 +574,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   const int intrapenalty = INTRA_MODE_PENALTY;
   double neutral_count;
   int intra_skip_count = 0;
+  int intra_smooth_count = 0;
   int image_data_start_row = INVALID_ROW;
   int new_mv_count = 0;
   int sum_in_vectors = 0;
@@ -535,6 +593,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   double intra_factor;
   double brightness_factor;
   BufferPool *const pool = cm->buffer_pool;
+  MODE_INFO mi_above, mi_left;
 
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
@@ -636,7 +695,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     MV best_ref_mv = {0, 0};
 
     // Reset above block coeffs.
-    xd->up_available = (mb_row != 0);
     recon_yoffset = (mb_row * recon_y_stride * 16);
     recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
 
@@ -662,20 +720,25 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
-      xd->left_available = (mb_col != 0);
-      xd->mi[0]->mbmi.sb_type = bsize;
-      xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+      xd->mi[0]->sb_type = bsize;
+      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
       set_mi_row_col(xd, &tile,
                      mb_row << 1, num_8x8_blocks_high_lookup[bsize],
                      mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
                      cm->mi_rows, cm->mi_cols);
+      // Are edges available for intra prediction?
+      // Since the firstpass does not populate the mi_grid_visible,
+      // above_mi/left_mi must be overwritten with a nonzero value when edges
+      // are available.  Required by vp9_predict_intra_block().
+      xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+      xd->left_mi  = (mb_col > tile.mi_col_start) ? &mi_left : NULL;
 
       // Do intra 16x16 prediction.
       x->skip_encode = 0;
-      xd->mi[0]->mbmi.mode = DC_PRED;
-      xd->mi[0]->mbmi.tx_size = use_dc_pred ?
+      xd->mi[0]->mode = DC_PRED;
+      xd->mi[0]->tx_size = use_dc_pred ?
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      vp9_encode_intra_block_plane(x, bsize, 0);
+      vp9_encode_intra_block_plane(x, bsize, 0, 0);
       this_error = vpx_get_mb_ss(x->plane[0].src_diff);
 
       // Keep a record of blocks that have almost no intra error residual
@@ -683,11 +746,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
       // domain). In natural videos this is uncommon, but it is much more
       // common in animations, graphics and screen content, so may be used
       // as a signal to detect these types of content.
-      if (this_error < UL_INTRA_THRESH) {
+      if (this_error < get_ul_intra_threshold(cm)) {
         ++intra_skip_count;
       } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
         image_data_start_row = mb_row;
       }
+      if (this_error < get_smooth_intra_threshold(cm)) {
+        ++intra_smooth_count;
+      }
 
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
@@ -897,11 +963,11 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
           mv.row *= 8;
           mv.col *= 8;
           this_error = motion_error;
-          xd->mi[0]->mbmi.mode = NEWMV;
-          xd->mi[0]->mbmi.mv[0].as_mv = mv;
-          xd->mi[0]->mbmi.tx_size = TX_4X4;
-          xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->mbmi.ref_frame[1] = NONE;
+          xd->mi[0]->mode = NEWMV;
+          xd->mi[0]->mv[0].as_mv = mv;
+          xd->mi[0]->tx_size = TX_4X4;
+          xd->mi[0]->ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
           vp9_encode_sby_pass1(x, bsize);
           sum_mvr += mv.row;
@@ -1053,8 +1119,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
     fps.pcnt_neutral = (double)neutral_count / num_mbs;
     fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
     fps.inactive_zone_rows = (double)image_data_start_row;
-    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+    // Currently set to 0 as most issues relate to letter boxing.
+    fps.inactive_zone_cols = (double)0;
 
     if (mvcount > 0) {
       fps.MVr = (double)sum_mvr / mvcount;
@@ -1080,10 +1148,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
       fps.pcnt_motion = 0.0;
     }
 
-    // TODO(paulwilkins):  Handle the case when duration is set to 0, or
-    // something less than the full time between subsequent values of
-    // cpi->source_time_stamp.
-    fps.duration = (double)(source->ts_end - source->ts_start);
+    // Dont allow a value of 0 for duration.
+    // (Section duration is also defaulted to minimum of 1.0).
+    fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
@@ -1171,18 +1238,15 @@ static double calc_correction_factor(double err_per_mb,
   return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
-// Larger image formats are expected to be a little harder to code relatively
-// given the same prediction error score. This in part at least relates to the
-// increased size and hence coding cost of motion vectors.
-#define EDIV_SIZE_FACTOR 800
-
-static int get_twopass_worst_quality(const VP9_COMP *cpi,
+#define ERR_DIVISOR         115.0
+static int get_twopass_worst_quality(VP9_COMP *cpi,
                                      const double section_err,
                                      double inactive_zone,
-                                     int section_target_bandwidth,
-                                     double group_weight_factor) {
+                                     int section_target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+
   // Clamp the target rate to VBR min / max limts.
   const int target_rate =
       vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
@@ -1197,29 +1261,36 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
     const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
     const double av_err_per_mb = section_err / active_mbs;
     const double speed_term = 1.0 + 0.04 * oxcf->speed;
-    const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+    double last_group_rate_err;
     const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
                                          BPER_MB_NORMBITS) / active_mbs;
-
     int q;
     int is_svc_upper_layer = 0;
 
     if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
       is_svc_upper_layer = 1;
 
+    // based on recent history adjust expectations of bits per macroblock.
+    last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits /
+        DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+    last_group_rate_err =
+        VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+    twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+    twopass->bpm_factor =
+        VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
 
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     for (q = rc->best_quality; q < rc->worst_quality; ++q) {
       const double factor =
           calc_correction_factor(av_err_per_mb,
-                                 ERR_DIVISOR - ediv_size_correction,
+                                 ERR_DIVISOR,
                                  is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
                                  FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
                                  cpi->common.bit_depth);
       const int bits_per_mb =
         vp9_rc_bits_per_mb(INTER_FRAME, q,
-                           factor * speed_term * group_weight_factor,
+                           factor * speed_term * cpi->twopass.bpm_factor,
                            cpi->common.bit_depth);
       if (bits_per_mb <= target_norm_bits_per_mb)
         break;
@@ -1270,6 +1341,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
                               (svc->number_temporal_layers > 1);
+  RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = is_two_pass_svc ?
       &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
   double frame_rate;
@@ -1326,26 +1398,33 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   }
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
-
-  cpi->rc.rate_error_estimate = 0;
+  rc->vbr_bits_off_target = 0;
+  rc->vbr_bits_off_target_fast = 0;
+  rc->rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
   twopass->last_kfgroup_zeromotion_pct = 100;
 
+  // Initialize bits per macro_block estimate correction factor.
+  twopass->bpm_factor = 1.0;
+  // Initiallize actual and target bits counters for ARF groups so that
+  // at the start we have a neutral bpm adjustment.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
   if (oxcf->resize_mode != RESIZE_NONE) {
     init_subsampling(cpi);
   }
 }
 
 #define SR_DIFF_PART 0.0015
-#define MOTION_AMP_PART 0.003
 #define INTRA_PART 0.005
 #define DEFAULT_DECAY_LIMIT 0.75
 #define LOW_SR_DIFF_TRHESH 0.1
 #define SR_DIFF_MAX 128.0
+#define LOW_CODED_ERR_PER_MB 10.0
+#define NCOUNT_FRAME_II_THRESH 6.0
 
 static double get_sr_decay_rate(const VP9_COMP *cpi,
                                 const FIRSTPASS_STATS *frame) {
@@ -1356,12 +1435,15 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
   double sr_decay = 1.0;
   double modified_pct_inter;
   double modified_pcnt_intra;
-  const double motion_amplitude_factor =
-    frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+  const double motion_amplitude_part =
+      frame->pcnt_motion *
+      ((frame->mvc_abs + frame->mvr_abs) /
+       (cpi->initial_height + cpi->initial_width));
 
   modified_pct_inter = frame->pcnt_inter;
-  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-      (double)NCOUNT_FRAME_II_THRESH) {
+  if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) &&
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)) {
     modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
   }
   modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
@@ -1370,7 +1452,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi,
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
     sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX);
     sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
-               (MOTION_AMP_PART * motion_amplitude_factor) -
+               motion_amplitude_part -
                (INTRA_PART * modified_pcnt_intra);
   }
   return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
@@ -1722,15 +1804,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
       gf_group->update_type[0] = OVERLAY_UPDATE;
       gf_group->rf_level[0] = INTER_NORMAL;
       gf_group->bit_allocation[0] = 0;
-      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
-      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     } else {
       gf_group->update_type[0] = GF_UPDATE;
       gf_group->rf_level[0] = GF_ARF_STD;
       gf_group->bit_allocation[0] = gf_arf_bits;
-      gf_group->arf_update_idx[0] = arf_buffer_indices[0];
-      gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
     }
+    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
 
     // Step over the golden frame / overlay frame
     if (EOF == input_stats(twopass, &frame_stats))
@@ -1857,9 +1937,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double boost_score = 0.0;
   double old_boost_score = 0.0;
   double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
   double gf_group_raw_error = 0.0;
-#endif
   double gf_group_skip_pct = 0.0;
   double gf_group_inactive_zone_rows = 0.0;
   double gf_first_frame_err = 0.0;
@@ -1909,9 +1987,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // the error score / cost of this frame has already been accounted for.
   if (arf_active_or_kf) {
     gf_group_err -= gf_first_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error -= this_frame->coded_error;
-#endif
     gf_group_skip_pct -= this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
   }
@@ -1929,7 +2005,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     int int_lbq =
       (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
                                    cpi->common.bit_depth));
-    active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
+    active_min_gf_interval =
+      rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200);
     if (active_min_gf_interval > rc->max_gf_interval)
       active_min_gf_interval = rc->max_gf_interval;
 
@@ -1940,14 +2017,20 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // bits to spare and are better with a smaller interval and smaller boost.
       // At high Q when there are few bits to spare we are better with a longer
       // interval to spread the cost of the GF.
-      active_max_gf_interval = 12 + VPXMIN(4, (int_lbq / 6));
-      if (active_max_gf_interval < active_min_gf_interval)
-        active_max_gf_interval = active_min_gf_interval;
+      active_max_gf_interval =
+        12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6));
 
-      if (active_max_gf_interval > rc->max_gf_interval)
-        active_max_gf_interval = rc->max_gf_interval;
+      // We have: active_min_gf_interval <= rc->max_gf_interval
       if (active_max_gf_interval < active_min_gf_interval)
         active_max_gf_interval = active_min_gf_interval;
+      else if (active_max_gf_interval > rc->max_gf_interval)
+        active_max_gf_interval = rc->max_gf_interval;
+
+      // Would the active max drop us out just before the near the next kf?
+      if ((active_max_gf_interval <= rc->frames_to_key) &&
+          (active_max_gf_interval >=
+              (rc->frames_to_key - rc->min_gf_interval)))
+        active_max_gf_interval = rc->frames_to_key / 2;
     }
   }
 
@@ -1958,9 +2041,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Accumulate error score of frames in this gf group.
     mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error += this_frame->coded_error;
-#endif
     gf_group_skip_pct += this_frame->intra_skip_pct;
     gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
 
@@ -2005,11 +2086,13 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Break out conditions.
     if (
       // Break at active_max_gf_interval unless almost totally static.
-      (i >= (active_max_gf_interval + arf_active_or_kf) &&
-            zero_motion_accumulator < 0.995) ||
+      ((i >= active_max_gf_interval) &&
+       (zero_motion_accumulator < 0.995)) ||
       (
         // Don't break out with a very short interval.
-        (i >= active_min_gf_interval + arf_active_or_kf) &&
+        (i >= active_min_gf_interval) &&
+        // If possible dont break very close to a kf
+        ((rc->frames_to_key - i) >= rc->min_gf_interval) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
@@ -2023,8 +2106,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     old_boost_score = boost_score;
   }
 
-  twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
@@ -2060,9 +2141,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       if (EOF == input_stats(twopass, this_frame))
         break;
       gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-#if GROUP_ADAPTIVE_MAXQ
       gf_group_raw_error += this_frame->coded_error;
-#endif
       gf_group_skip_pct += this_frame->intra_skip_pct;
       gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
     }
@@ -2077,7 +2156,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
 
-#if GROUP_ADAPTIVE_MAXQ
   // Calculate an estimate of the maxq needed for the group.
   // We are more agressive about correcting for sections
   // where there could be significant overshoot than for easier
@@ -2092,26 +2170,13 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     const double group_av_inactive_zone =
       ((gf_group_inactive_zone_rows * 2) /
        (rc->baseline_gf_interval * (double)cm->mb_rows));
-
-    int tmp_q;
-    // rc factor is a weight factor that corrects for local rate control drift.
-    double rc_factor = 1.0;
-    if (rc->rate_error_estimate > 0) {
-      rc_factor = VPXMAX(RC_FACTOR_MIN,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    } else {
-      rc_factor = VPXMIN(RC_FACTOR_MAX,
-                         (double)(100 - rc->rate_error_estimate) / 100.0);
-    }
-    tmp_q =
-      get_twopass_worst_quality(cpi, group_av_err,
-                                (group_av_skip_pct + group_av_inactive_zone),
-                                vbr_group_bits_per_frame,
-                                twopass->kfgroup_inter_fraction * rc_factor);
+    int tmp_q =
+        get_twopass_worst_quality(cpi, group_av_err,
+                                  (group_av_skip_pct + group_av_inactive_zone),
+                                  vbr_group_bits_per_frame);
     twopass->active_worst_quality =
-        VPXMAX(tmp_q, twopass->active_worst_quality >> 1);
+        (tmp_q + (twopass->active_worst_quality * 3)) >> 2;
   }
-#endif
 
   // Calculate the extra bits to be used for boosted frame(s)
   gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
@@ -2151,6 +2216,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Default to starting GF groups at normal frame size.
     cpi->rc.next_frame_size_selector = UNSCALED;
   }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 0;
+  twopass->rolling_arf_group_actual_bits = 0;
 }
 
 // Threshold for use of the lagging second reference frame. High second ref
@@ -2265,6 +2334,8 @@ static int test_candidate_kf(TWO_PASS *twopass,
   return is_viable_kf;
 }
 
+#define FRAMES_TO_CHECK_DECAY 8
+
 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   int i, j;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2283,7 +2354,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double boost_score = 0.0;
   double kf_mod_err = 0.0;
   double kf_group_err = 0.0;
-  double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
 
   vp9_zero(next_frame);
 
@@ -2310,6 +2381,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
+  // Initialize the decay rates for the recent frames to check
+  for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+    recent_loop_decay[j] = 1.0;
+
   // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end &&
@@ -2336,9 +2411,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
       // quality since the last GF or KF.
-      recent_loop_decay[i % 8] = loop_decay_rate;
+      recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; ++j)
+      for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
         decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
@@ -2482,16 +2557,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
                                   rc->kf_boost, twopass->kf_group_bits);
 
-  // Work out the fraction of the kf group bits reserved for the inter frames
-  // within the group after discounting the bits for the kf itself.
-  if (twopass->kf_group_bits) {
-    twopass->kfgroup_inter_fraction =
-      (double)(twopass->kf_group_bits - kf_bits) /
-      (double)twopass->kf_group_bits;
-  } else {
-    twopass->kfgroup_inter_fraction = 1.0;
-  }
-
   twopass->kf_group_bits -= kf_bits;
 
   // Save the bits to spend on the key frame.
@@ -2585,21 +2650,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
   TWO_PASS *const twopass = &cpi->twopass;
   GF_GROUP *const gf_group = &twopass->gf_group;
-  int frames_left;
   FIRSTPASS_STATS this_frame;
 
   int target_rate;
   LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
         &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0;
 
-  if (lc != NULL) {
-    frames_left = (int)(twopass->total_stats.count -
-                  lc->current_video_frame_in_layer);
-  } else {
-    frames_left = (int)(twopass->total_stats.count -
-                  cm->current_video_frame);
-  }
-
   if (!twopass->stats_in)
     return;
 
@@ -2641,6 +2697,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     twopass->active_worst_quality = cpi->oxcf.cq_level;
   } else if (cm->current_video_frame == 0 ||
              (lc != NULL && lc->current_video_frame_in_layer == 0)) {
+    const int frames_left = (int)(twopass->total_stats.count -
+        ((lc != NULL) ? lc->current_video_frame_in_layer
+                      : cm->current_video_frame));
     // Special case code for first frame.
     const int section_target_bandwidth = (int)(twopass->bits_left /
                                                frames_left);
@@ -2652,10 +2711,10 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     const double section_inactive_zone =
       (twopass->total_left_stats.inactive_zone_rows * 2) /
       ((double)cm->mb_rows * section_length);
-    const int tmp_q =
-      get_twopass_worst_quality(cpi, section_error,
-                                section_intra_skip + section_inactive_zone,
-                                section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+    int tmp_q;
+
+    tmp_q = get_twopass_worst_quality(cpi, section_error,
+        section_intra_skip + section_inactive_zone, section_target_bandwidth);
 
     twopass->active_worst_quality = tmp_q;
     twopass->baseline_active_worst_quality = tmp_q;
@@ -2749,6 +2808,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
     // applied when combining MB error values for the frame.
     twopass->mb_av_energy =
       log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+    twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
   }
 
   // Update the total stats remaining structure.
@@ -2761,6 +2821,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
 void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   TWO_PASS *const twopass = &cpi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
+  VP9_COMMON *const cm = &cpi->common;
   const int bits_used = rc->base_frame_target;
 
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
@@ -2771,6 +2832,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
   rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
 
+  // Target vs actual bits for this arf group.
+  twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+  twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
   // Calculate the pct rc error.
   if (rc->total_actual_bits) {
     rc->rate_error_estimate =
@@ -2792,12 +2857,27 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
       !cpi->rc.is_src_frame_alt_ref) {
     const int maxq_adj_limit =
       rc->worst_quality - twopass->active_worst_quality;
     const int minq_adj_limit =
         (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+    int aq_extend_min = 0;
+    int aq_extend_max = 0;
+
+    // Extend min or Max Q range to account for imbalance from the base
+    // value when using AQ.
+    if (cpi->oxcf.aq_mode != NO_AQ) {
+      if (cm->seg.aq_av_offset < 0) {
+        // The balance of the AQ map tends towarda lowering the average Q.
+        aq_extend_min = 0;
+        aq_extend_max = VPXMIN(maxq_adj_limit, -cm->seg.aq_av_offset);
+      } else {
+        // The balance of the AQ map tends towards raising the average Q.
+        aq_extend_min = VPXMIN(minq_adj_limit, cm->seg.aq_av_offset);
+        aq_extend_max = 0;
+      }
+    }
 
     // Undershoot.
     if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
@@ -2822,8 +2902,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
         --twopass->extend_maxq;
     }
 
-    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
-    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+    twopass->extend_minq =
+        clamp(twopass->extend_minq, aq_extend_min, minq_adj_limit);
+    twopass->extend_maxq =
+        clamp(twopass->extend_maxq, aq_extend_max, maxq_adj_limit);
 
     // If there is a big and undexpected undershoot then feed the extra
     // bits back in quickly. One situation where this may happen is if a
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 5875a7b..7607288 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,8 +39,6 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif
 
-#define VLOW_MOTION_THRESHOLD 950
-
 typedef struct {
   double frame;
   double weight;
@@ -52,6 +50,7 @@ typedef struct {
   double pcnt_second_ref;
   double pcnt_neutral;
   double intra_skip_pct;
+  double intra_smooth_pct;    // % of blocks that are smooth
   double inactive_zone_rows;  // Image mask rows top and bottom.
   double inactive_zone_cols;  // Image mask columns at left and right edges.
   double MVr;
@@ -107,6 +106,7 @@ typedef struct {
   double modified_error_max;
   double modified_error_left;
   double mb_av_energy;
+  double mb_smooth_pct;
 
 #if CONFIG_FP_MB_STATS
   uint8_t *frame_mb_stats_buf;
@@ -122,14 +122,13 @@ typedef struct {
   // Error score of frames still to be coded in kf group
   int64_t kf_group_error_left;
 
-  // The fraction for a kf groups total bits allocated to the inter frames
-  double kfgroup_inter_fraction;
+  double bpm_factor;
+  int rolling_arf_group_target_bits;
+  int rolling_arf_group_actual_bits;
 
   int sr_update_lag;
-
   int kf_zeromotion_pct;
   int last_kfgroup_zeromotion_pct;
-  int gf_zeromotion_pct;
   int active_worst_quality;
   int baseline_active_worst_quality;
   int extend_minq;
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 8787be8..787bcf4 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@ static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
 void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -89,7 +89,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
 
 #define USE_PARTIAL_COPY 0
 
-int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
                        int64_t ts_start, int64_t ts_end,
 #if CONFIG_VP9_HIGHBITDEPTH
                        int use_highbitdepth,
@@ -207,7 +207,7 @@ struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
                                           int drain) {
   struct lookahead_entry *buf = NULL;
 
-  if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+  if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
     buf = pop(ctx, &ctx->read_idx);
     ctx->sz--;
   }
@@ -221,9 +221,9 @@ struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 1382038..db0fd1c 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -36,10 +36,10 @@ struct lookahead_entry {
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 41b6d19..14a0b16 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -59,8 +59,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
   {
-    int distortion;
-    unsigned int sse;
+    uint32_t distortion;
+    uint32_t sse;
     cpi->find_fractional_mv_step(
         x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
@@ -69,8 +69,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
         &distortion, &sse, NULL, 0, 0);
   }
 
-  xd->mi[0]->mbmi.mode = NEWMV;
-  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+  xd->mi[0]->mode = NEWMV;
+  xd->mi[0]->mv[0].as_mv = *dst_mv;
 
   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
 
@@ -147,7 +147,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     unsigned int err;
 
-    xd->mi[0]->mbmi.mode = mode;
+    xd->mi[0]->mode = mode;
     vp9_predict_intra_block(xd, 2, TX_16X16, mode,
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
@@ -243,20 +243,23 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
   MV gld_top_mv = {0, 0};
   MODE_INFO mi_local;
+  MODE_INFO mi_above, mi_left;
 
   vp9_zero(mi_local);
   // Set up limit values for motion vectors to prevent them extending outside
   // the UMV borders.
   x->mv_row_min     = -BORDER_MV_PIXELS_B16;
   x->mv_row_max     = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
-  xd->up_available  = 0;
+  // Signal to vp9_predict_intra_block() that above is not available
+  xd->above_mi = NULL;
+
   xd->plane[0].dst.stride  = buf->y_stride;
   xd->plane[0].pre[0].stride  = buf->y_stride;
   xd->plane[1].dst.stride = buf->uv_stride;
   xd->mi[0] = &mi_local;
-  mi_local.mbmi.sb_type = BLOCK_16X16;
-  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
-  mi_local.mbmi.ref_frame[1] = NONE;
+  mi_local.sb_type = BLOCK_16X16;
+  mi_local.ref_frame[0] = LAST_FRAME;
+  mi_local.ref_frame[1] = NONE;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     MV gld_left_mv = gld_top_mv;
@@ -268,7 +271,8 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
     // the UMV borders.
     x->mv_col_min      = -BORDER_MV_PIXELS_B16;
     x->mv_col_max      = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
-    xd->left_available = 0;
+    // Signal to vp9_predict_intra_block() that left is not available
+    xd->left_mi = NULL;
 
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
@@ -280,14 +284,19 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
       if (mb_col == 0) {
         gld_top_mv = gld_left_mv;
       }
-      xd->left_available = 1;
+      // Signal to vp9_predict_intra_block() that left is available
+      xd->left_mi = &mi_left;
+
       mb_y_in_offset    += 16;
       gld_y_in_offset   += 16;
       arf_y_in_offset   += 16;
       x->mv_col_min     -= 16;
       x->mv_col_max     -= 16;
     }
-    xd->up_available = 1;
+
+    // Signal to vp9_predict_intra_block() that above is available
+    xd->above_mi = &mi_above;
+
     mb_y_offset     += buf->y_stride * 16;
     gld_y_offset    += golden_ref->y_stride * 16;
     if (alt_ref)
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index be8f57f..2ebacc0 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
@@ -69,6 +70,8 @@ int vp9_init_search_range(int size) {
 
 static INLINE int mv_cost(const MV *mv,
                           const int *joint_cost, int *const comp_cost[2]) {
+  assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
+  assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
   return joint_cost[vp9_get_mv_joint(mv)] +
              comp_cost[0][mv->row] + comp_cost[1][mv->col];
 }
@@ -80,52 +83,52 @@ int vp9_mv_bit_cost(const MV *mv, const MV *ref,
   return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
 }
 
-static int mv_err_cost(const MV *mv, const MV *ref,
-                       const int *mvjcost, int *mvcost[2],
-                       int error_per_bit) {
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
   if (mvcost) {
-    const MV diff = { mv->row - ref->row,
-                      mv->col - ref->col };
-    return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) *
-                                  error_per_bit, 13);
+    const MV diff = {mv->row - ref->row, mv->col - ref->col};
+    // This product sits at a 32-bit ceiling right now and any additional
+    // accuracy in either bit cost or error cost will cause it to overflow.
+    return ROUND_POWER_OF_TWO(
+        (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+        RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
+            PIXEL_TRANSFORM_ERROR_SCALE);
   }
   return 0;
 }
 
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int error_per_bit) {
+                          int sad_per_bit) {
   const MV diff = { mv->row - ref->row,
                     mv->col - ref->col };
-  return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
-                                    x->nmvsadcost) * error_per_bit, 8);
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) *
+          sad_per_bit,
+      VP9_PROB_COST_SHIFT);
 }
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
-  cfg->ss[0].offset = 0;
+  int len;
+  int ss_count = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 4 search sites per step.
     const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
     int i;
-    for (i = 0; i < 4; ++i) {
-      search_site *const ss = &cfg->ss[ss_count++];
-      ss->mv = ss_mvs[i];
-      ss->offset = ss->mv.row * stride + ss->mv.col;
+    for (i = 0; i < 4; ++i, ++ss_count) {
+      cfg->ss_mv[ss_count] = ss_mvs[i];
+      cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col;
     }
   }
 
-  cfg->ss_count = ss_count;
   cfg->searches_per_step = 4;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
 void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
-  int len, ss_count = 1;
-
-  cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
-  cfg->ss[0].offset = 0;
+  int len;
+  int ss_count = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
     // Generate offsets for 8 search sites per step.
@@ -134,33 +137,23 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
       {-len, -len}, {-len, len}, {len,  -len}, {len,  len}
     };
     int i;
-    for (i = 0; i < 8; ++i) {
-      search_site *const ss = &cfg->ss[ss_count++];
-      ss->mv = ss_mvs[i];
-      ss->offset = ss->mv.row * stride + ss->mv.col;
+    for (i = 0; i < 8; ++i, ++ss_count) {
+      cfg->ss_mv[ss_count] = ss_mvs[i];
+      cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col;
     }
   }
 
-  cfg->ss_count = ss_count;
   cfg->searches_per_step = 8;
+  cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
-/* estimated cost of a motion vector (r,c) */
-#define MVC(r, c)                                       \
-    (mvcost ?                                           \
-     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
-       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
-      error_per_bit + 4096) >> 13 : 0)
+/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
+ * from the same math as in mv_err_cost(). */
+#define MVC(r, c)                                              \
+    (mvcost ?                                                  \
+     ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] +      \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *        \
+      error_per_bit + 8192) >> 14 : 0)
 
 
 // convert motion vector component to offset for sv[a]f calc
@@ -172,6 +165,33 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   return &buf[(r >> 3) * stride + (c >> 3)];
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    int64_t tmpmse;                                                    \
+    if (second_pred == NULL) {                                         \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c),      \
+                         sp(r), z, src_stride, &sse);                  \
+    } else {                                                           \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c),     \
+                          sp(r), z, src_stride, &sse, second_pred);    \
+    }                                                                  \
+    tmpmse = thismse;                                                  \
+    tmpmse += MVC(r, c);                                               \
+    if (tmpmse >= INT_MAX) {                                           \
+      v = INT_MAX;                                                     \
+    } else if ((v = (uint32_t)tmpmse) < besterr) {                     \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+#else
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
@@ -192,6 +212,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
     v = INT_MAX;                                                       \
   }
 
+#endif
 #define FIRST_LEVEL_CHECKS                              \
   {                                                     \
     unsigned int left, right, up, down, diag;           \
@@ -320,10 +341,10 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd,
                                        const uint8_t *second_pred,
                                        int w, int h, int offset,
                                        int *mvjcost, int *mvcost[2],
-                                       unsigned int *sse1,
-                                       int *distortion) {
-  unsigned int besterr;
+                                       uint32_t *sse1,
+                                       uint32_t *distortion) {
 #if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t besterr;
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
@@ -339,9 +360,13 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd,
   } else {
     besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
   }
-  *distortion = besterr;
+  *distortion = (uint32_t)besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  if (besterr >= UINT32_MAX)
+    return UINT32_MAX;
+  return (uint32_t)besterr;
 #else
+  uint32_t besterr;
   (void) xd;
   if (second_pred != NULL) {
     DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
@@ -352,8 +377,8 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd,
   }
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
   return besterr;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 static INLINE int divide_and_round(const int n, const int d) {
@@ -383,7 +408,7 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic,
                          (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_evenmore(
+uint32_t vp9_skip_sub_pixel_tree(
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -393,8 +418,53 @@ int vp9_find_best_sub_pixel_tree_pruned_evenmore(
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion,
-    unsigned int *sse1,
+    uint32_t *distortion,
+    uint32_t *sse1,
+    const uint8_t *second_pred,
+    int w, int h) {
+  SETUP_SUBPEL_SEARCH;
+  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                               z, src_stride, y, y_stride, second_pred,
+                               w, h, offset, mvjcost, mvcost,
+                               sse1, distortion);
+  (void) halfiters;
+  (void) quarteriters;
+  (void) eighthiters;
+  (void) whichdir;
+  (void) allow_hp;
+  (void) forced_stop;
+  (void) hstep;
+  (void) rr;
+  (void) rc;
+  (void) minr;
+  (void) minc;
+  (void) maxr;
+  (void) maxc;
+  (void) tr;
+  (void) tc;
+  (void) sse;
+  (void) thismse;
+  (void) cost_list;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return UINT32_MAX;
+
+  return besterr;
+}
+
+uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
+    const MACROBLOCK *x,
+    MV *bestmv, const MV *ref_mv,
+    int allow_hp,
+    int error_per_bit,
+    const vp9_variance_fn_ptr_t *vfp,
+    int forced_stop,
+    int iters_per_step,
+    int *cost_list,
+    int *mvjcost, int *mvcost[2],
+    uint32_t *distortion,
+    uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -445,7 +515,7 @@ int vp9_find_best_sub_pixel_tree_pruned_evenmore(
   tr = br;
   tc = bc;
 
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -463,7 +533,7 @@ int vp9_find_best_sub_pixel_tree_pruned_evenmore(
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                              MV *bestmv, const MV *ref_mv,
                                              int allow_hp,
                                              int error_per_bit,
@@ -472,8 +542,8 @@ int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
                                              int iters_per_step,
                                              int *cost_list,
                                              int *mvjcost, int *mvcost[2],
-                                             int *distortion,
-                                             unsigned int *sse1,
+                                             uint32_t *distortion,
+                                             uint32_t *sse1,
                                              const uint8_t *second_pred,
                                              int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -513,7 +583,7 @@ int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
     }
   }
 
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
     tr = br;
     tc = bc;
     hstep >>= 1;
@@ -532,12 +602,12 @@ int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
 
   if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
       (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
-    return INT_MAX;
+    return UINT32_MAX;
 
   return besterr;
 }
 
-int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+uint32_t vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         MV *bestmv, const MV *ref_mv,
                                         int allow_hp,
                                         int error_per_bit,
@@ -546,8 +616,8 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
                                         int iters_per_step,
                                         int *cost_list,
                                         int *mvjcost, int *mvcost[2],
-                                        int *distortion,
-                                        unsigned int *sse1,
+                                        uint32_t *distortion,
+                                        uint32_t *sse1,
                                         const uint8_t *second_pred,
                                         int w, int h) {
   SETUP_SUBPEL_SEARCH;
@@ -608,7 +678,7 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
     tc = bc;
   }
 
-  if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -639,19 +709,19 @@ static const MV search_step_table[12] = {
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *cost_list,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h) {
+uint32_t vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
+                                      MV *bestmv, const MV *ref_mv,
+                                      int allow_hp,
+                                      int error_per_bit,
+                                      const vp9_variance_fn_ptr_t *vfp,
+                                      int forced_stop,
+                                      int iters_per_step,
+                                      int *cost_list,
+                                      int *mvjcost, int *mvcost[2],
+                                      uint32_t *distortion,
+                                      uint32_t *sse1,
+                                      const uint8_t *second_pred,
+                                      int w, int h) {
   const uint8_t *const z = x->plane[0].src.buf;
   const uint8_t *const src_address = z;
   const int src_stride = x->plane[0].src.stride;
@@ -680,7 +750,7 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
   unsigned int cost_array[5];
   int kr, kc;
 
-  if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+  if (!(allow_hp && use_mv_hp(ref_mv)))
     if (round == 3)
       round = 2;
 
@@ -790,7 +860,6 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
 }
 
 #undef MVC
-#undef PRE
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
@@ -852,9 +921,9 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x,
       cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                     get_buf_from_mv(in_what, &this_mv),
                                     in_what->stride, &sse) +
-          // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
-          mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
-                      x->errorperbit);
+                                    mv_err_cost(&this_mv, &fcenter_mv,
+                                                x->nmvjointcost, x->mvcost,
+                                                x->errorperbit);
     }
   } else {
     for (i = 0; i < 4; i++) {
@@ -866,9 +935,9 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x,
         cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                       get_buf_from_mv(in_what, &this_mv),
                                       in_what->stride, &sse) +
-            // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
-            mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
-                        x->errorperbit);
+                                      mv_err_cost(&this_mv, &fcenter_mv,
+                                                  x->nmvjointcost, x->mvcost,
+                                                  x->errorperbit);
     }
   }
 }
@@ -1347,12 +1416,22 @@ int vp9_get_mvpred_var(const MACROBLOCK *x,
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV mv = {best_mv->row * 8, best_mv->col * 8};
-  unsigned int unused;
-
+  uint32_t unused;
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint64_t err= vfp->vf(what->buf, what->stride,
+                        get_buf_from_mv(in_what, best_mv),
+                        in_what->stride, &unused);
+  err += (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                    x->mvcost, x->errorperbit) : 0);
+  if (err >= INT_MAX)
+    return INT_MAX;
+  return (int)err;
+#else
   return vfp->vf(what->buf, what->stride,
                  get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) +
       (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
                                  x->mvcost, x->errorperbit) : 0);
+#endif
 }
 
 int vp9_get_mvpred_av_var(const MACROBLOCK *x,
@@ -1523,69 +1602,83 @@ static int fast_dia_search(const MACROBLOCK *x,
 
 #undef CHECK_BETTER
 
-int vp9_full_range_search_c(const MACROBLOCK *x,
-                            const search_site_config *cfg,
-                            MV *ref_mv, MV *best_mv,
-                            int search_param, int sad_per_bit, int *num00,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
-                            const MV *center_mv) {
+// Exhuastive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+                                  MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  const int range = 64;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  MV fcenter_mv = {center_mv->row, center_mv->col};
   unsigned int best_sad = INT_MAX;
   int r, c, i;
   int start_col, end_col, start_row, end_row;
+  int col_step = (step > 1) ? step : 4;
 
-  // The cfg and search_param parameters are not used in this search variant
-  (void)cfg;
-  (void)search_param;
+  assert(step >= 1);
 
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  *best_mv = *ref_mv;
-  *num00 = 11;
+  clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+           x->mv_row_min, x->mv_row_max);
+  *best_mv = fcenter_mv;
   best_sad = fn_ptr->sdf(what->buf, what->stride,
-                         get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-                 mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-  start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
-  start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
-  end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
-  end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
-
-  for (r = start_row; r <= end_row; ++r) {
-    for (c = start_col; c <= end_col; c += 4) {
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          addrs[i] = get_buf_from_mv(in_what, &mv);
-        }
-
-        fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
-
-        for (i = 0; i < 4; ++i) {
-          if (sads[i] < best_sad) {
-            const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-            const unsigned int sad = sads[i] +
-                mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+             get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+             mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+  start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+  end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+  end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+        unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                           get_buf_from_mv(in_what, &mv), in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       } else {
-        for (i = 0; i < end_col - c; ++i) {
-          const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
-          unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
-              get_buf_from_mv(in_what, &mv), in_what->stride);
-          if (sad < best_sad) {
-            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs,
+                         in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+              const unsigned int sad = sads[i] +
+                  mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+            unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                get_buf_from_mv(in_what, &mv), in_what->stride);
             if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                *best_mv = mv;
+              }
             }
           }
         }
@@ -1612,8 +1705,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
   const uint8_t *best_address;
 
   unsigned int bestsad = INT_MAX;
-  int best_site = 0;
-  int last_site = 0;
+  int best_site = -1;
+  int last_site = -1;
 
   int ref_row;
   int ref_col;
@@ -1623,8 +1716,10 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
   // 0 = initial step (MAX_FIRST_STEP) pel
   // 1 = (MAX_FIRST_STEP/2) pel,
   // 2 = (MAX_FIRST_STEP/4) pel...
-  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
-  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+//  const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+  const int tot_steps = cfg->total_steps - search_param;
 
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
@@ -1642,17 +1737,17 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
   bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
                 + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
-  i = 1;
+  i = 0;
 
   for (step = 0; step < tot_steps; step++) {
     int all_in = 1, t;
 
     // All_in is true if every one of the points we are checking are within
     // the bounds of the image.
-    all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_row_min);
-    all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_row_max);
-    all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_col_min);
-    all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_col_max);
+    all_in &= ((best_mv->row + ss_mv[i].row) > x->mv_row_min);
+    all_in &= ((best_mv->row + ss_mv[i + 1].row) < x->mv_row_max);
+    all_in &= ((best_mv->col + ss_mv[i + 2].col) > x->mv_col_min);
+    all_in &= ((best_mv->col + ss_mv[i + 3].col) < x->mv_col_max);
 
     // If all the pixels are within the bounds we don't check whether the
     // search point is valid in this loop,  otherwise we check each point
@@ -1664,15 +1759,15 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
         unsigned char const *block_offset[4];
 
         for (t = 0; t < 4; t++)
-          block_offset[t] = ss[i + t].offset + best_address;
+          block_offset[t] = ss_os[i + t] + best_address;
 
         fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
                        sad_array);
 
         for (t = 0; t < 4; t++, i++) {
           if (sad_array[t] < bestsad) {
-            const MV this_mv = {best_mv->row + ss[i].mv.row,
-                                best_mv->col + ss[i].mv.col};
+            const MV this_mv = {best_mv->row + ss_mv[i].row,
+                                best_mv->col + ss_mv[i].col};
             sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv,
                                            sad_per_bit);
             if (sad_array[t] < bestsad) {
@@ -1685,11 +1780,11 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
     } else {
       for (j = 0; j < cfg->searches_per_step; j++) {
         // Trap illegal vectors
-        const MV this_mv = {best_mv->row + ss[i].mv.row,
-                            best_mv->col + ss[i].mv.col};
+        const MV this_mv = {best_mv->row + ss_mv[i].row,
+                            best_mv->col + ss_mv[i].col};
 
         if (is_mv_in(x, &this_mv)) {
-          const uint8_t *const check_here = ss[i].offset + best_address;
+          const uint8_t *const check_here = ss_os[i] + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
                                              in_what_stride);
 
@@ -1705,25 +1800,25 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
       }
     }
     if (best_site != last_site) {
-      best_mv->row += ss[best_site].mv.row;
-      best_mv->col += ss[best_site].mv.col;
-      best_address += ss[best_site].offset;
+      best_mv->row += ss_mv[best_site].row;
+      best_mv->col += ss_mv[best_site].col;
+      best_address += ss_os[best_site];
       last_site = best_site;
 #if defined(NEW_DIAMOND_SEARCH)
       while (1) {
-        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
-                            best_mv->col + ss[best_site].mv.col};
+        const MV this_mv = {best_mv->row + ss_mv[best_site].row,
+                            best_mv->col + ss_mv[best_site].col};
         if (is_mv_in(x, &this_mv)) {
-          const uint8_t *const check_here = ss[best_site].offset + best_address;
+          const uint8_t *const check_here = ss_os[best_site] + best_address;
           unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
                                              in_what_stride);
           if (thissad < bestsad) {
             thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
             if (thissad < bestsad) {
               bestsad = thissad;
-              best_mv->row += ss[best_site].mv.row;
-              best_mv->col += ss[best_site].mv.col;
-              best_address += ss[best_site].offset;
+              best_mv->row += ss_mv[best_site].row;
+              best_mv->col += ss_mv[best_site].col;
+              best_address += ss_os[best_site];
               continue;
             }
           }
@@ -1745,7 +1840,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
   int center, offset = 0;
   int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_var(&ref[d], src, bwl);
+    this_sad = vpx_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -1758,7 +1853,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1771,7 +1866,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1784,7 +1879,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1797,7 +1892,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1815,7 +1910,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
                                            BLOCK_SIZE bsize,
                                            int mi_row, int mi_col) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
   DECLARE_ALIGNED(16, int16_t, hbuf[128]);
   DECLARE_ALIGNED(16, int16_t, vbuf[128]);
@@ -1829,12 +1924,12 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   const int src_stride = x->plane[0].src.stride;
   const int ref_stride = xd->plane[0].pre[0].stride;
   uint8_t const *ref_buf, *src_buf;
-  MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
+  MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
   unsigned int best_sad, tmp_sad, this_sad[4];
   MV this_mv;
   const int norm_factor = 3 + (bw >> 5);
   const YV12_BUFFER_CONFIG *scaled_ref_frame =
-      vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+      vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
 
   if (scaled_ref_frame) {
     int i;
@@ -1866,25 +1961,25 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
     ref_buf += ref_stride;
   }
 
   // Set up src 1-D reference set
   for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
   for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
     src_buf += src_stride;
   }
 
@@ -2015,6 +2110,70 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
   return bestsme;
 }
 
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs an limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
+                                 MV *centre_mv_full, int sadpb,  int *cost_list,
+                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+  MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int bestsme;
+  int i;
+  int interval = sf->mesh_patterns[0].interval;
+  int range = sf->mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+      (interval < MIN_INTERVAL) || (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+  range = VPXMIN(range, MAX_RANGE);
+  interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+                                  interval, sadpb, fn_ptr, &temp_mv);
+
+  if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[i].range,
+                                       sf->mesh_patterns[i].interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+      if (sf->mesh_patterns[i].interval == 1)
+        break;
+    }
+  }
+
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
 int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2328,6 +2487,18 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x,
   return best_sad;
 }
 
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+  return sf->allow_exhaustive_searches &&
+      (sf->exhaustive_searches_thresh < INT_MAX) &&
+      (*x->ex_search_count_ptr <= max_ex) &&
+      !cpi->rc.is_src_frame_alt_ref;
+}
+
 int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
                           BLOCK_SIZE bsize, MV *mvp_full,
                           int step_param, int error_per_bit,
@@ -2346,6 +2517,9 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
     cost_list[4] = INT_MAX;
   }
 
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
   switch (method) {
     case FAST_DIAMOND:
       var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2371,6 +2545,27 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
       var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
                                MAX_MVSEARCH_STEPS - 1 - step_param,
                                1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhuastive_thr) {
+            int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {
+            var = var_ex;
+            *tmp_mv = tmp_mv_ex;
+          }
+        }
+      }
       break;
     default:
       assert(0 && "Invalid search method.");
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 5efd543..86cd267 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -31,16 +31,12 @@ extern "C" {
 // for Block_16x16
 #define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
 
-// motion search site
-typedef struct search_site {
-  MV mv;
-  int offset;
-} search_site;
-
 typedef struct search_site_config {
-  search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
-  int ss_count;
+  // motion search sites
+  MV  ss_mv[8 * MAX_MVSEARCH_STEPS];        // Motion vector
+  intptr_t ss_os[8 * MAX_MVSEARCH_STEPS];   // Offset
   int searches_per_step;
+  int total_steps;
 } search_site_config;
 
 void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
@@ -78,7 +74,7 @@ unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
                                            BLOCK_SIZE bsize,
                                            int mi_row, int mi_col);
 
-typedef int (fractional_mv_step_fp) (
+typedef uint32_t (fractional_mv_step_fp) (
     const MACROBLOCK *x,
     MV *bestmv, const MV *ref_mv,
     int allow_hp,
@@ -88,7 +84,7 @@ typedef int (fractional_mv_step_fp) (
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
-    int *distortion, unsigned int *sse1,
+    uint32_t *distortion, uint32_t *sse1,
     const uint8_t *second_pred,
     int w, int h);
 
@@ -96,6 +92,7 @@ extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
 
 typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     const MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
new file mode 100644
index 0000000..4b43b38
--- /dev/null
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
+                             int width,
+                             int height) {
+  ne->enabled = 0;
+  ne->level = kLowLow;
+  ne->value = 0;
+  ne->count = 0;
+  ne->thresh = 90;
+  ne->last_w = 0;
+  ne->last_h = 0;
+  if (width * height >= 1920 * 1080) {
+    ne->thresh = 200;
+  } else if (width * height >= 1280 * 720) {
+    ne->thresh = 130;
+  }
+  ne->num_frames_estimate = 20;
+}
+
+static int enable_noise_estimation(VP9_COMP *const cpi) {
+  // Enable noise estimation if denoising is on.
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    return 1;
+#endif
+  // Only allow noise estimate under certain encoding mode.
+  // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
+  // Not enabled for SVC mode and screen_content_mode.
+  // Not enabled for low resolutions.
+  if (cpi->oxcf.pass == 0 &&
+      cpi->oxcf.rc_mode == VPX_CBR &&
+      cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+      cpi->oxcf.speed >= 5 &&
+      cpi->resize_state == ORIG &&
+      cpi->resize_pending == 0 &&
+      !cpi->use_svc &&
+      cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+      cpi->common.width >= 640 &&
+      cpi->common.height >= 480)
+    return 1;
+  else
+    return 0;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG * const dest,
+                       const YV12_BUFFER_CONFIG * const src) {
+  int r;
+  const uint8_t *srcbuf = src->y_buffer;
+  uint8_t *destbuf = dest->y_buffer;
+
+  assert(dest->y_width == src->y_width);
+  assert(dest->y_height == src->y_height);
+
+  for (r = 0; r < dest->y_height; ++r) {
+    memcpy(destbuf, srcbuf, dest->y_width);
+    destbuf += dest->y_stride;
+    srcbuf += src->y_stride;
+  }
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+  int noise_level = kLowLow;
+  if (ne->value > (ne->thresh << 1)) {
+    noise_level = kHigh;
+  } else {
+    if (ne->value > ne->thresh)
+      noise_level = kMedium;
+    else if (ne->value > ((9 * ne->thresh) >> 4))
+      noise_level = kLow;
+    else
+      noise_level = kLowLow;
+  }
+  return noise_level;
+}
+
+void vp9_update_noise_estimate(VP9_COMP *const cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+  // Estimate of noise level every frame_period frames.
+  int frame_period = 8;
+  int thresh_consec_zeromv = 6;
+  unsigned int thresh_sum_diff = 100;
+  unsigned int thresh_sum_spatial = (200 * 200) << 8;
+  unsigned int thresh_spatial_var = (32 * 32) << 8;
+  int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+  // Estimate is between current source and last source.
+  YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    last_source = &cpi->denoiser.last_source;
+#endif
+  ne->enabled = enable_noise_estimation(cpi);
+  if (!ne->enabled ||
+      cm->current_video_frame % frame_period != 0 ||
+      last_source == NULL ||
+      ne->last_w != cm->width ||
+      ne->last_h != cm->height) {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+    if (last_source != NULL) {
+      ne->last_w = cm->width;
+      ne->last_h = cm->height;
+    }
+    return;
+  } else if (cpi->rc.avg_frame_low_motion < 50) {
+    // Force noise estimation to 0 and denoiser off if content has high motion.
+    ne->level = kLowLow;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0)
+      vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
+    return;
+  } else {
+    int num_samples = 0;
+    uint64_t avg_est = 0;
+    int bsize = BLOCK_16X16;
+    static const unsigned char const_source[16] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+    // been encoded as zero/small mv at least x consecutive frames, compute
+    // the variance to update estimate of noise in the source.
+    const uint8_t *src_y = cpi->Source->y_buffer;
+    const int src_ystride = cpi->Source->y_stride;
+    const uint8_t *last_src_y = last_source->y_buffer;
+    const int last_src_ystride = last_source->y_stride;
+    const uint8_t *src_u = cpi->Source->u_buffer;
+    const uint8_t *src_v = cpi->Source->v_buffer;
+    const int src_uvstride = cpi->Source->uv_stride;
+    int mi_row, mi_col;
+    int num_low_motion = 0;
+    int frame_low_motion = 1;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+          num_low_motion++;
+      }
+    }
+    if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3))
+      frame_low_motion = 0;
+    for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+      for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+        // 16x16 blocks, 1/4 sample of frame.
+        if (mi_row % 4 == 0 && mi_col % 4 == 0 &&
+            mi_row < cm->mi_rows - 1 &&
+            mi_col < cm->mi_cols - 1) {
+          int bl_index = mi_row * cm->mi_cols + mi_col;
+          int bl_index1 = bl_index + 1;
+          int bl_index2 = bl_index + cm->mi_cols;
+          int bl_index3 = bl_index2 + 1;
+          // Only consider blocks that are likely steady background. i.e, have
+          // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+          // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+          // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+          int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                     VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                     VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                     cpi->consec_zero_mv[bl_index3])));
+          int is_skin = 0;
+          if (cpi->use_skin_detection) {
+            is_skin = vp9_compute_skin_block(src_y,
+                                             src_u,
+                                             src_v,
+                                             src_ystride,
+                                             src_uvstride,
+                                             bsize,
+                                             consec_zeromv,
+                                             0);
+          }
+          if (frame_low_motion &&
+              cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+              cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+              !is_skin) {
+            // Compute variance.
+            unsigned int sse;
+            unsigned int variance = cpi->fn_ptr[bsize].vf(src_y,
+                                                          src_ystride,
+                                                          last_src_y,
+                                                          last_src_ystride,
+                                                          &sse);
+            // Only consider this block as valid for noise measurement if the
+            // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+            // temporal residual is small (avoid effects from lighting change).
+            if ((sse - variance) < thresh_sum_diff) {
+              unsigned int sse2;
+              const unsigned int spatial_variance =
+                  cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
+                                        0, &sse2);
+              // Avoid blocks with high brightness and high spatial variance.
+              if ((sse2 - spatial_variance) < thresh_sum_spatial &&
+                  spatial_variance < thresh_spatial_var) {
+                avg_est += variance / ((spatial_variance >> 9) + 1);
+                num_samples++;
+              }
+            }
+          }
+        }
+        src_y += 8;
+        last_src_y += 8;
+        src_u += 4;
+        src_v += 4;
+      }
+      src_y += (src_ystride << 3) - (cm->mi_cols << 3);
+      last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+      src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+      src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+    }
+    ne->last_w = cm->width;
+    ne->last_h = cm->height;
+    // Update noise estimate if we have at a minimum number of block samples,
+    // and avg_est > 0 (avg_est == 0 can happen if the application inputs
+    // duplicate frames).
+    if (num_samples > min_blocks_estimate && avg_est > 0) {
+      // Normalize.
+      avg_est = avg_est / num_samples;
+      // Update noise estimate.
+      ne->value = (int)((15 * ne->value + avg_est) >> 4);
+      ne->count++;
+      if (ne->count == ne->num_frames_estimate) {
+        // Reset counter and check noise level condition.
+        ne->num_frames_estimate = 30;
+        ne->count = 0;
+        ne->level = vp9_noise_estimate_extract_level(ne);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity > 0)
+          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
+      }
+    }
+  }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0)
+    copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+}
diff --git a/vp9/encoder/vp9_noise_estimate.h b/vp9/encoder/vp9_noise_estimate.h
new file mode 100644
index 0000000..826d125
--- /dev/null
+++ b/vp9/encoder/vp9_noise_estimate.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_
+#define VP9_ENCODER_NOISE_ESTIMATE_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp9/encoder/vp9_denoiser.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum noise_level {
+  kLowLow,
+  kLow,
+  kMedium,
+  kHigh
+} NOISE_LEVEL;
+
+typedef struct noise_estimate {
+  int enabled;
+  NOISE_LEVEL level;
+  int value;
+  int thresh;
+  int count;
+  int last_w;
+  int last_h;
+  int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct VP9_COMP;
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne,
+                             int width,
+                             int height);
+
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void vp9_update_noise_estimate(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_NOISE_ESTIMATE_H_
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 5444bc8..f6b1dfc 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -78,7 +78,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
-  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filt_mid =
+      clamp(lf->last_filt_level, min_filter_level, max_filter_level);
   int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
   // Sum squared error at each filter level
   int64_t ss_err[MAX_LOOP_FILTER + 1];
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index fc4d9ae..ba6a0c6 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -40,16 +40,25 @@ typedef struct {
   int in_use;
 } PRED_BUFFER;
 
-static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
+
+static const int pos_shift_16x16[4][4] = {
+  {9, 10, 13, 14},
+  {11, 12, 15, 16},
+  {17, 18, 21, 22},
+  {19, 20, 23, 24}
+};
+
+static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm,
+                      const MACROBLOCK *x,
                       const MACROBLOCKD *xd,
                       const TileInfo *const tile,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list,
-                      int mi_row, int mi_col) {
+                      int_mv *mv_ref_list, int_mv *base_mv,
+                      int mi_row, int mi_col, int use_base_mv) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
 
-  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type];
 
   int different_ref_found = 0;
   int context_counter = 0;
@@ -66,12 +75,11 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
     if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
       const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
                                                    xd->mi_stride];
-      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
       // Keep counts for entropy encoding.
-      context_counter += mode_2_counter[candidate->mode];
+      context_counter += mode_2_counter[candidate_mi->mode];
       different_ref_found = 1;
 
-      if (candidate->ref_frame[0] == ref_frame)
+      if (candidate_mi->ref_frame[0] == ref_frame)
         ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
                         refmv_count, mv_ref_list, Done);
     }
@@ -85,12 +93,12 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
   for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
     const POSITION *const mv_ref = &mv_ref_search[i];
     if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-      const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
-                                                    xd->mi_stride]->mbmi;
+      const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
+                                                   xd->mi_stride];
       different_ref_found = 1;
 
-      if (candidate->ref_frame[0] == ref_frame)
-        ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
+      if (candidate_mi->ref_frame[0] == ref_frame)
+        ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done);
     }
   }
 
@@ -101,15 +109,29 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
       const POSITION *mv_ref = &mv_ref_search[i];
       if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
-        const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
-                                              * xd->mi_stride]->mbmi;
+        const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row
+                                              * xd->mi_stride];
 
         // If the candidate is INTRA we don't want to consider its mv.
-        IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+        IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias,
                                  refmv_count, mv_ref_list, Done);
       }
     }
   }
+  if (use_base_mv &&
+      !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+      ref_frame == LAST_FRAME) {
+    // Get base layer mv.
+    MV_REF *candidate =
+        &cm->prev_frame->mvs[(mi_col>>1) + (mi_row>>1) * (cm->mi_cols>>1)];
+    if (candidate->mv[0].as_int != INVALID_MV) {
+        base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2);
+        base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2);
+      clamp_mv_ref(&base_mv->as_mv, xd);
+    } else {
+      base_mv->as_int = INVALID_MV;
+    }
+  }
 
  Done:
 
@@ -125,16 +147,17 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCK *x,
 static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
                                   int_mv *tmp_mv, int *rate_mv,
-                                  int64_t best_rd_sofar) {
+                                  int64_t best_rd_sofar, int use_base_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
   const int step_param = cpi->sf.mv.fullpel_search_step_param;
   const int sadpb = x->sadperbit16;
   MV mvp_full;
-  const int ref = mbmi->ref_frame[0];
+  const int ref = mi->ref_frame[0];
   const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
-  int dis;
+  MV center_mv;
+  uint32_t dis;
   int rate_mode;
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
@@ -164,9 +187,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
+  if (!use_base_mv)
+    center_mv = ref_mv;
+  else
+    center_mv = tmp_mv->as_mv;
+
   vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
                         cond_cost_list(cpi, cost_list),
-                        &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
+                        &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -224,7 +252,7 @@ static void block_variance(const uint8_t *src, int src_stride,
                     &sse8x8[k], &sum8x8[k]);
       *sse += sse8x8[k];
       *sum += sum8x8[k];
-      var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+      var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
       k++;
     }
   }
@@ -245,7 +273,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
           sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
       sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
           sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
-      var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
           (b_width_log2_lookup[unit_size] +
               b_height_log2_lookup[unit_size] + 6));
       k++;
@@ -300,7 +328,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
       tx_size = TX_8X8;
 
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
       tx_size = TX_8X8;
     else if (tx_size > TX_16X16)
       tx_size = TX_16X16;
@@ -310,7 +338,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
   }
 
   assert(tx_size >= TX_8X8);
-  xd->mi[0]->mbmi.tx_size = tx_size;
+  xd->mi[0]->tx_size = tx_size;
 
   // Evaluate if the partition block is a skippable block in Y plane.
   {
@@ -379,7 +407,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
     for (i = 1; i <= 2; i++) {
       struct macroblock_plane *const p = &x->plane[i];
       struct macroblockd_plane *const pd = &xd->plane[i];
-      const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0]->mbmi, pd);
+      const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd);
       const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
       const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
       const int uv_bw = b_width_log2_lookup[uv_bsize];
@@ -475,19 +503,19 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
 
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
-      xd->mi[0]->mbmi.tx_size =
+      xd->mi[0]->tx_size =
           VPXMIN(max_txsize_lookup[bsize],
                  tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
     else
-      xd->mi[0]->mbmi.tx_size = TX_8X8;
+      xd->mi[0]->tx_size = TX_8X8;
 
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
-        cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
-      xd->mi[0]->mbmi.tx_size = TX_8X8;
-    else if (xd->mi[0]->mbmi.tx_size > TX_16X16)
-      xd->mi[0]->mbmi.tx_size = TX_16X16;
+        cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+      xd->mi[0]->tx_size = TX_8X8;
+    else if (xd->mi[0]->tx_size > TX_16X16)
+      xd->mi[0]->tx_size = TX_16X16;
   } else {
-    xd->mi[0]->mbmi.tx_size =
+    xd->mi[0]->tx_size =
         VPXMIN(max_txsize_lookup[bsize],
                tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   }
@@ -495,7 +523,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   // Evaluate if the partition block is a skippable block in Y plane.
   {
     const BLOCK_SIZE unit_size =
-        txsize_to_bsize[xd->mi[0]->mbmi.tx_size];
+        txsize_to_bsize[xd->mi[0]->tx_size];
     const unsigned int num_blk_log2 =
         (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
         (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
@@ -562,39 +590,46 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
   unsigned int var_y, sse_y;
-  (void)plane;
+
   (void)tx_size;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
+                    &sse_y);
   *sse = INT_MAX;
   *skippable = 0;
   return;
 }
 #else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
-                      int *skippable, int64_t *sse, int plane,
-                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+                      TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  const struct macroblockd_plane *pd = &xd->plane[plane];
-  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[0];
+  struct macroblock_plane *const p = &x->plane[0];
   const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
   const int step = 1 << (tx_size << 1);
   const int block_step = (1 << tx_size);
   int block = 0, r, c;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
   const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
-      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      xd->mb_to_right_edge >> 5);
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
-      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      xd->mb_to_bottom_edge >> 5);
   int eob_cost = 0;
+  const int bw = 4 * num_4x4_w;
+  const int bh = 4 * num_4x4_h;
 
   (void)cpi;
-  vp9_subtract_plane(x, bsize, plane);
+
+  // The max tx_size passed in is TX_16X16.
+  assert(tx_size != TX_32X32);
+
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
   *skippable = 1;
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
@@ -606,27 +641,20 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
         tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
         tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
         uint16_t *const eob = &p->eobs[block];
-        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        const int diff_stride = bw;
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
         switch (tx_size) {
-          case TX_32X32:
-            vpx_fdct32x32_rd(src_diff, coeff, diff_stride);
-            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                  p->round_fp, p->quant_fp, p->quant_shift,
-                                  qcoeff, dqcoeff, pd->dequant, eob,
-                                  scan_order->scan, scan_order->iscan);
-            break;
           case TX_16X16:
-            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
                             scan_order->scan, scan_order->iscan);
             break;
           case TX_8X8:
-            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
@@ -650,18 +678,17 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
     }
   }
 
-  if (*skippable && *sse < INT64_MAX) {
-    *rate = 0;
-    *dist = (*sse << 6) >> shift;
-    *sse = *dist;
-    return;
+  this_rdc->rate = 0;
+  if (*sse < INT64_MAX) {
+    *sse = (*sse << 6) >> 2;
+    if (*skippable) {
+      this_rdc->dist = *sse;
+      return;
+    }
   }
 
   block = 0;
-  *rate = 0;
-  *dist = 0;
-  if (*sse < INT64_MAX)
-    *sse = (*sse << 6) >> shift;
+  this_rdc->dist = 0;
   for (r = 0; r < max_blocks_high; r += block_step) {
     for (c = 0; c < num_4x4_w; c += block_step) {
       if (c < max_blocks_wide) {
@@ -671,26 +698,26 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
         uint16_t *const eob = &p->eobs[block];
 
         if (*eob == 1)
-          *rate += (int)abs(qcoeff[0]);
+          this_rdc->rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+          this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
 
-        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+        this_rdc->dist +=
+            vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
       }
       block += step;
     }
   }
 
-  if (*skippable == 0) {
-    *rate <<= 10;
-    *rate += (eob_cost << 8);
-  }
+  // If skippable is set, rate gets clobbered later.
+  this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
+  this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
 }
 #endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
-                               int *out_rate_sum, int64_t *out_dist_sum,
+                               RD_COST *this_rdc,
                                unsigned int *var_y, unsigned int *sse_y,
                                int start_plane, int stop_plane) {
   // Note our transform coeffs are 8 times an orthogonal transform.
@@ -701,8 +728,8 @@ static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
   int64_t dist;
   int i;
 
-  *out_rate_sum = 0;
-  *out_dist_sum = 0;
+  this_rdc->rate = 0;
+  this_rdc->dist = 0;
 
   for (i = start_plane; i <= stop_plane; ++i) {
     struct macroblock_plane *const p = &x->plane[i];
@@ -733,8 +760,8 @@ static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                  dc_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate >> 1;
-    *out_dist_sum += dist << 3;
+    this_rdc->rate += rate >> 1;
+    this_rdc->dist += dist << 3;
 
   #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -749,8 +776,8 @@ static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                  ac_quant >> 3, &rate, &dist);
   #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    *out_rate_sum += rate;
-    *out_dist_sum += dist << 4;
+    this_rdc->rate += rate;
+    this_rdc->dist += dist << 4;
   }
 }
 
@@ -779,14 +806,20 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
                                  struct buf_2d yv12_mb[][MAX_MB_PLANE],
                                  int *rate, int64_t *dist) {
   MACROBLOCKD *xd = &x->e_mbd;
-
+  MODE_INFO *const mi = xd->mi[0];
   const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
   unsigned int var = var_y, sse = sse_y;
   // Skipping threshold for ac.
   unsigned int thresh_ac;
   // Skipping threshold for dc.
   unsigned int thresh_dc;
-  if (x->encode_breakout > 0) {
+  int motion_low = 1;
+  if (mi->mv[0].as_mv.row > 64 ||
+      mi->mv[0].as_mv.row < -64 ||
+      mi->mv[0].as_mv.col > 64 ||
+      mi->mv[0].as_mv.col < -64)
+    motion_low = 0;
+  if (x->encode_breakout > 0 && motion_low == 1) {
     // Set a maximum for threshold to avoid big PSNR loss in low bit rate
     // case. Use extreme low threshold for static frames to limit
     // skipping.
@@ -826,6 +859,12 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
   if (var <= thresh_ac && (sse - var) <= thresh_dc) {
     unsigned int sse_u, sse_v;
     unsigned int var_u, var_v;
+    unsigned int thresh_ac_uv = thresh_ac;
+    unsigned int thresh_dc_uv = thresh_dc;
+    if (x->sb_is_skin) {
+      thresh_ac_uv = 0;
+      thresh_dc_uv = 0;
+    }
 
     // Skip UV prediction unless breakout is zero (lossless) to save
     // computation with low impact on the result
@@ -841,14 +880,14 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
                                     xd->plane[1].dst.stride, &sse_u);
 
     // U skipping condition checking
-    if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+    if (((var_u << 2) <= thresh_ac_uv) && (sse_u - var_u <= thresh_dc_uv)) {
       var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
                                       x->plane[2].src.stride,
                                       xd->plane[2].dst.buf,
                                       xd->plane[2].dst.stride, &sse_v);
 
       // V skipping condition checking
-      if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+      if (((var_v << 2) <= thresh_ac_uv) && (sse_v - var_v <= thresh_dc_uv)) {
         x->skip = 1;
 
         // The cost of skip bit needs to be added.
@@ -874,8 +913,8 @@ struct estimate_block_intra_args {
   VP9_COMP *cpi;
   MACROBLOCK *x;
   PREDICTION_MODE mode;
-  int rate;
-  int64_t dist;
+  int skippable;
+  RD_COST *rdc;
 };
 
 static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -892,8 +931,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
   int i, j;
-  int rate;
-  int64_t dist;
+  RD_COST this_rdc;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
 
@@ -909,23 +947,20 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
 
   if (plane == 0) {
     int64_t this_sse = INT64_MAX;
-    int is_skippable;
     // TODO(jingning): This needs further refactoring.
-    block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
-              bsize_tx, VPXMIN(tx_size, TX_16X16));
-    x->skip_txfm[0] = is_skippable;
-    // TODO(jingning): Skip is signalled per prediciton block not per tx block.
-    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+    block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+              VPXMIN(tx_size, TX_16X16));
   } else {
-    unsigned int var, sse;
-    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &rate, &dist, &var, &sse,
-                       plane, plane);
+    unsigned int var = 0;
+    unsigned int sse = 0;
+    model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane,
+                       plane);
   }
 
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
-  args->rate += rate;
-  args->dist += dist;
+  args->rdc->rate += this_rdc.rate;
+  args->rdc->dist += this_rdc.dist;
 }
 
 static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
@@ -975,17 +1010,17 @@ static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
 void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   RD_COST this_rdc, best_rdc;
   PREDICTION_MODE this_mode;
-  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
   const TX_SIZE intra_tx_size =
       VPXMIN(max_txsize_lookup[bsize],
              tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
   MODE_INFO *const mic = xd->mi[0];
   int *bmode_costs;
-  const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
-  const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
   bmode_costs = cpi->y_mode_costs[A][L];
@@ -994,29 +1029,35 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   vp9_rd_cost_reset(&best_rdc);
   vp9_rd_cost_reset(&this_rdc);
 
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->mv[0].as_int = INVALID_MV;
-  mbmi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->mv[0].as_int = INVALID_MV;
+  mi->uv_mode = DC_PRED;
   memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
 
   // Change the limit of this loop to add other intra prediction
   // mode tests.
   for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+    this_rdc.dist = this_rdc.rate = 0;
     args.mode = this_mode;
-    args.rate = 0;
-    args.dist = 0;
-    mbmi->tx_size = intra_tx_size;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
     vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                            estimate_block_intra, &args);
-    this_rdc.rate = args.rate;
-    this_rdc.dist = args.dist;
+    if (args.skippable) {
+      x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+    } else {
+      x->skip_txfm[0] = SKIP_TXFM_NONE;
+      this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+    }
     this_rdc.rate += bmode_costs[this_mode];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                              this_rdc.rate, this_rdc.dist);
 
     if (this_rdc.rdcost < best_rdc.rdcost) {
       best_rdc = this_rdc;
-      mbmi->mode = this_mode;
+      mi->mode = this_mode;
     }
   }
 
@@ -1068,17 +1109,228 @@ static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = {
     {GOLDEN_FRAME, NEWMV}
 };
 
-// TODO(jingning) placeholder for inter-frame non-RD mode decision.
-// this needs various further optimizations. to be continued..
+static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
+  const VP9_COMMON *const cm = &cpi->common;
+  // Reduce the intra cost penalty for small blocks (<=16x16).
+  int reduction_fac =
+      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
+     // Don't reduce intra cost penalty if estimated noise level is high.
+     reduction_fac = 0;
+  return vp9_get_intra_cost_penalty(
+      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
+}
+
+static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x,
+                                 MV_REFERENCE_FRAME ref_frame,
+                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                                 int const_motion[MAX_REF_FRAMES],
+                                 int *ref_frame_skip_mask,
+                                 const int flag_list[4],
+                                 TileDataEnc *tile_data,
+                                 int mi_row, int mi_col,
+                                 struct buf_2d yv12_mb[4][MAX_MB_PLANE],
+                                 BLOCK_SIZE bsize,
+                                 int force_skip_low_temp_var) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  // TODO(jingning) placeholder for inter-frame non-RD mode decision.
+  x->pred_mv_sad[ref_frame] = INT_MAX;
+  frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  frame_mv[ZEROMV][ref_frame].as_int = 0;
+  // this needs various further optimizations. to be continued..
+  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+    int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+    const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+    vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+                         sf, sf);
+    if (cm->use_prev_frame_mvs) {
+      vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
+                       candidates, mi_row, mi_col,
+                       x->mbmi_ext->mode_context);
+    } else {
+      const_motion[ref_frame] =
+          mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
+                     candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+                     (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+    }
+    vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                          &frame_mv[NEARESTMV][ref_frame],
+                          &frame_mv[NEARMV][ref_frame]);
+    // Early exit for golden frame if force_skip_low_temp_var is set.
+    if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+        !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) {
+      vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+                  ref_frame, bsize);
+    }
+  } else {
+    *ref_frame_skip_mask |= (1 << ref_frame);
+  }
+}
+
+static void vp9_large_block_mv_bias(const NOISE_ESTIMATE *ne, RD_COST *this_rdc,
+                                    BLOCK_SIZE bsize, int mv_row, int mv_col,
+                                    int is_last_frame) {
+  // Bias against non-zero (above some threshold) motion for large blocks.
+  // This is temporary fix to avoid selection of large mv for big blocks.
+  if (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64) {
+    if (bsize == BLOCK_64X64)
+      this_rdc->rdcost = this_rdc->rdcost << 1;
+    else if (bsize >= BLOCK_32X32)
+      this_rdc->rdcost = 3 * this_rdc->rdcost >> 1;
+  }
+  // If noise estimation is enabled, and estimated level is above threshold,
+  // add a bias to LAST reference with small motion, for large blocks.
+  if (ne->enabled && ne->level >= kMedium &&
+      bsize >= BLOCK_32X32 && is_last_frame &&
+      mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) {
+    this_rdc->rdcost = 7 * this_rdc->rdcost >> 3;
+  }
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void vp9_pickmode_ctx_den_update(
+    VP9_PICKMODE_CTX_DEN *ctx_den,
+    int64_t zero_last_cost_orig,
+    int ref_frame_cost[MAX_REF_FRAMES],
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+    int reuse_inter_pred,
+    TX_SIZE best_tx_size,
+    PREDICTION_MODE best_mode,
+    MV_REFERENCE_FRAME best_ref_frame,
+    INTERP_FILTER best_pred_filter,
+    uint8_t best_mode_skip_txfm) {
+  ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+  ctx_den->ref_frame_cost = ref_frame_cost;
+  ctx_den->frame_mv = frame_mv;
+  ctx_den->reuse_inter_pred = reuse_inter_pred;
+  ctx_den->best_tx_size = best_tx_size;
+  ctx_den->best_mode = best_mode;
+  ctx_den->best_ref_frame = best_ref_frame;
+  ctx_den->best_pred_filter = best_pred_filter;
+  ctx_den->best_mode_skip_txfm = best_mode_skip_txfm;
+}
+
+static void recheck_zeromv_after_denoising(
+    VP9_COMP *cpi, MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+    VP9_DENOISER_DECISION decision, VP9_PICKMODE_CTX_DEN *ctx_den,
+    struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_COST *best_rdc, BLOCK_SIZE bsize,
+    int mi_row, int mi_col) {
+  // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+  // denoised result. Only do this under noise conditions, and if rdcost of
+  // ZEROMV onoriginal source is not significantly higher than rdcost of best
+  // mode.
+  if (cpi->noise_estimate.enabled &&
+      cpi->noise_estimate.level > kLow &&
+      ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+      ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+       (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+        decision == FILTER_ZEROMV_BLOCK))) {
+    // Check if we should pick ZEROMV on denoised signal.
+    int rate = 0;
+    int64_t dist = 0;
+    uint32_t var_y = UINT_MAX;
+    uint32_t sse_y = UINT_MAX;
+    RD_COST this_rdc;
+    mi->mode = ZEROMV;
+    mi->ref_frame[0] = LAST_FRAME;
+    mi->ref_frame[1] = NONE;
+    mi->mv[0].as_int = 0;
+    mi->interp_filter = EIGHTTAP;
+    xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+    this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] +
+        cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+                            [INTER_OFFSET(ZEROMV)];
+    this_rdc.dist = dist;
+    this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+    // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
+    // is lower than best_ref mode (on original source).
+    if (this_rdc.rdcost > best_rdc->rdcost) {
+      this_rdc = *best_rdc;
+      mi->mode = ctx_den->best_mode;
+      mi->ref_frame[0] = ctx_den->best_ref_frame;
+      mi->interp_filter = ctx_den->best_pred_filter;
+      if (ctx_den->best_ref_frame == INTRA_FRAME)
+        mi->mv[0].as_int = INVALID_MV;
+      else if (ctx_den->best_ref_frame == GOLDEN_FRAME) {
+        mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode]
+                                            [ctx_den->best_ref_frame].as_int;
+        if (ctx_den->reuse_inter_pred) {
+          xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+          vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        }
+      }
+      mi->tx_size = ctx_den->best_tx_size;
+      x->skip_txfm[0] = ctx_den->best_mode_skip_txfm;
+    } else {
+      ctx_den->best_ref_frame = LAST_FRAME;
+      *best_rdc = this_rdc;
+    }
+  }
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize) {
+  const int i = (mi_row & 0x7) >> 1;
+  const int j = (mi_col & 0x7) >> 1;
+  int force_skip_low_temp_var = 0;
+  // Set force_skip_low_temp_var based on the block size and block offset.
+  if (bsize == BLOCK_64X64) {
+    force_skip_low_temp_var = variance_low[0];
+  } else if (bsize == BLOCK_64X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[1];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[2];
+    }
+  } else if (bsize == BLOCK_32X64) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[3];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[4];
+    }
+  } else if (bsize == BLOCK_32X32) {
+    if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[5];
+    } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[6];
+    } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[7];
+    } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
+      force_skip_low_temp_var = variance_low[8];
+    }
+  } else if (bsize == BLOCK_16X16) {
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+  } else if (bsize == BLOCK_32X16) {
+    // The col shift index for the second 16x16 block.
+    const int j2 = ((mi_col + 2) & 0x7) >> 1;
+    // Only if each 16x16 block inside has low temporal variance.
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i][j2]];
+  } else if (bsize == BLOCK_16X32) {
+    // The row shift index for the second 16x16 block.
+    const int i2 = ((mi_row + 2) & 0x7) >> 1;
+    force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+                              variance_low[pos_shift_16x16[i2][j]];
+  }
+  return force_skip_low_temp_var;
+}
+
 void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                          TileDataEnc *tile_data,
                          int mi_row, int mi_col, RD_COST *rd_cost,
                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  TileInfo *const tile_info = &tile_data->tile_info;
+  const SVC *const svc = &cpi->svc;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
   PREDICTION_MODE best_mode = ZEROMV;
   MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
@@ -1094,14 +1346,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
   unsigned int sse_y = UINT_MAX;
-  // Reduce the intra cost penalty for small blocks (<=16x16).
-  const int reduction_fac = (bsize <= BLOCK_16X16) ?
-      ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
-  const int intra_cost_penalty = vp9_get_intra_cost_penalty(
-      cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
-  const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
+  const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
                                            intra_cost_penalty, 0);
-  const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
+  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
   INTERP_FILTER filter_ref;
   const int bsl = mi_width_log2_lookup[bsize];
@@ -1129,6 +1377,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int best_pred_sad = INT_MAX;
   int best_early_term = 0;
   int ref_frame_cost[MAX_REF_FRAMES];
+  int svc_force_zero_mode[3] = {0};
+  int perform_intra_pred = 1;
+  int use_golden_nonzeromv = 1;
+  int force_skip_low_temp_var = 0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  VP9_PICKMODE_CTX_DEN ctx_den;
+  int64_t zero_last_cost_orig = INT64_MAX;
+#endif
 
   init_ref_frame_cost(cm, xd, ref_frame_cost);
 
@@ -1154,24 +1410,42 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   x->skip = 0;
 
-  if (xd->up_available)
-    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-  else if (xd->left_available)
-    filter_ref = xd->mi[-1]->mbmi.interp_filter;
+  // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign
+  // filter_ref, we use a less strict condition on assigning filter_ref.
+  // This is to reduce the probabily of entering the flow of not assigning
+  // filter_ref and then skip filter search.
+  if (xd->above_mi && is_inter_block(xd->above_mi))
+    filter_ref = xd->above_mi->interp_filter;
+  else if (xd->left_mi && is_inter_block(xd->left_mi))
+    filter_ref = xd->left_mi->interp_filter;
   else
     filter_ref = cm->interp_filter;
 
   // initialize mode decisions
   vp9_rd_cost_reset(&best_rdc);
   vp9_rd_cost_reset(rd_cost);
-  mbmi->sb_type = bsize;
-  mbmi->ref_frame[0] = NONE;
-  mbmi->ref_frame[1] = NONE;
-  mbmi->tx_size = VPXMIN(max_txsize_lookup[bsize],
-                         tx_mode_to_biggest_tx_size[cm->tx_mode]);
+  mi->sb_type = bsize;
+  mi->ref_frame[0] = NONE;
+  mi->ref_frame[1] = NONE;
+  mi->tx_size = VPXMIN(max_txsize_lookup[bsize],
+                       tx_mode_to_biggest_tx_size[cm->tx_mode]);
+
+  if (sf->short_circuit_flat_blocks) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      x->source_variance = vp9_high_get_sby_perpixel_variance(
+          cpi, &x->plane[0].src, bsize, xd->bd);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      x->source_variance =
+          vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  vp9_denoiser_reset_frame_stats(ctx);
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->denoiser.denoising_level > kDenLowLow) {
+    vp9_denoiser_reset_frame_stats(ctx);
+  }
 #endif
 
   if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) {
@@ -1179,40 +1453,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   } else {
     usable_ref_frame = GOLDEN_FRAME;
   }
-  for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
-    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-
-    x->pred_mv_sad[ref_frame] = INT_MAX;
-    frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
-    frame_mv[ZEROMV][ref_frame].as_int = 0;
 
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
-      int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
-      const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
-
-      vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
-                           sf, sf);
+  // For svc mode, on spatial_layer_id > 0: if the reference has different scale
+  // constrain the inter mode to only test zero motion.
+  if (cpi->use_svc &&
+      svc ->force_zero_mode_spatial_ref &&
+      cpi->svc.spatial_layer_id > 0) {
+    if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
+      struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+      if (vp9_is_scaled(sf))
+        svc_force_zero_mode[LAST_FRAME - 1] = 1;
+    }
+    if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
+      struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+      if (vp9_is_scaled(sf))
+        svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+    }
+  }
 
-      if (cm->use_prev_frame_mvs)
-        vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
-                         candidates, mi_row, mi_col, NULL, NULL,
-                         x->mbmi_ext->mode_context);
-      else
-        const_motion[ref_frame] = mv_refs_rt(cm, x, xd, tile_info,
-                                             xd->mi[0],
-                                             ref_frame, candidates,
-                                             mi_row, mi_col);
+  if (cpi->sf.short_circuit_low_temp_var) {
+    force_skip_low_temp_var =
+        get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+  }
 
-      vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
-                            &frame_mv[NEARESTMV][ref_frame],
-                            &frame_mv[NEARMV][ref_frame]);
+  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
+    use_golden_nonzeromv = 0;
 
-      if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8)
-        vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
-                    ref_frame, bsize);
-    } else {
-      ref_frame_skip_mask |= (1 << ref_frame);
-    }
+  for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+    find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
+                    &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col,
+                    yv12_mb, bsize, force_skip_low_temp_var);
   }
 
   for (idx = 0; idx < RT_INTER_MODES; ++idx) {
@@ -1224,21 +1495,52 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int is_skippable;
     int this_early_term = 0;
     PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+
     if (cpi->use_svc)
       this_mode = ref_mode_set_svc[idx].pred_mode;
 
+    if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
+        this_mode != NEARESTMV) {
+      continue;
+    }
+
     if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
       continue;
 
     ref_frame = ref_mode_set[idx].ref_frame;
-    if (cpi->use_svc)
+    if (cpi->use_svc) {
       ref_frame = ref_mode_set_svc[idx].ref_frame;
+    }
+
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
+
     if (const_motion[ref_frame] && this_mode == NEARMV)
       continue;
 
-    if (!(this_mode == ZEROMV && ref_frame == LAST_FRAME)) {
+    // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
+    // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+    // later.
+    if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME &&
+        frame_mv[this_mode][ref_frame].as_int != 0) {
+      continue;
+    }
+
+    if (cpi->sf.short_circuit_low_temp_var == 2 &&
+        force_skip_low_temp_var && ref_frame == LAST_FRAME &&
+        this_mode == NEWMV) {
+      continue;
+    }
+
+    if (cpi->use_svc) {
+      if (svc_force_zero_mode[ref_frame - 1] &&
+          frame_mv[this_mode][ref_frame].as_int != 0)
+        continue;
+    }
+
+    if (!force_skip_low_temp_var &&
+        !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+          ref_frame == LAST_FRAME)) {
       i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
       if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking)
         if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
@@ -1251,7 +1553,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     for (i = 0; i < MAX_MB_PLANE; i++)
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
 
-    mbmi->ref_frame[0] = ref_frame;
+    mi->ref_frame[0] = ref_frame;
     set_ref_ptrs(cm, xd, ref_frame, NONE);
 
     mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
@@ -1262,9 +1564,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       continue;
 
     if (this_mode == NEWMV) {
-      if (ref_frame > LAST_FRAME && !cpi->use_svc) {
+      if (ref_frame > LAST_FRAME &&
+          !cpi->use_svc &&
+          cpi->oxcf.rc_mode == VPX_CBR) {
         int tmp_sad;
-        int dis, cost_list[5];
+        uint32_t dis;
+        int cost_list[5];
 
         if (bsize < BLOCK_16X16)
           continue;
@@ -1276,7 +1581,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
           continue;
 
-        frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+        frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
         rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
           &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
           x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
@@ -1293,13 +1598,44 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           cond_cost_list(cpi, cost_list),
           x->nmvjointcost, x->mvcost, &dis,
           &x->pred_sse[ref_frame], NULL, 0, 0);
+      } else if (svc->use_base_mv && svc->spatial_layer_id) {
+        if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV &&
+            frame_mv[NEWMV][ref_frame].as_int != 0) {
+          const int pre_stride = xd->plane[0].pre[0].stride;
+          int base_mv_sad = INT_MAX;
+          const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+              (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
+              (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
+          base_mv_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+                                       x->plane[0].src.stride,
+                                       pre_buf, pre_stride);
+
+          // TODO(wonkap): make the decision to use base layer mv on RD;
+          // not just SAD.
+          if (base_mv_sad < x->pred_mv_sad[ref_frame]) {
+            // Base layer mv is good.
+            if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+                &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) {
+                continue;
+            }
+          } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+            &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
+            continue;
+          }
+        } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+          &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
+          continue;
+        }
       } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
-        &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost)) {
+        &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 0)) {
         continue;
       }
     }
 
-    if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+    // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no
+    // need to compute best_pred_sad which is only used to skip golden NEWMV.
+    if (use_golden_nonzeromv && this_mode == NEWMV &&
+        ref_frame == LAST_FRAME &&
         frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
       const int pre_stride = xd->plane[0].pre[0].stride;
       const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
@@ -1311,28 +1647,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
     }
 
-    if (cpi->use_svc) {
-      if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME &&
-          frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) {
-        const int pre_stride = xd->plane[0].pre[0].stride;
-        const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride +
-            (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3);
-        best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
-                                               x->plane[0].src.stride,
-                                               pre_buf, pre_stride);
-        x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad;
-      }
-    }
-
-
     if (this_mode != NEARESTMV &&
         frame_mv[this_mode][ref_frame].as_int ==
             frame_mv[NEARESTMV][ref_frame].as_int)
       continue;
 
-    mbmi->mode = this_mode;
-    mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+    mi->mode = this_mode;
+    mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
 
     // Search for the best prediction filter type, when the resulting
     // motion vector is at sub-pixel accuracy level for luma component, i.e.,
@@ -1349,8 +1670,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
         && (ref_frame == LAST_FRAME ||
-            (ref_frame == GOLDEN_FRAME && cpi->use_svc))
-        && (((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07) != 0)) {
+           (ref_frame == GOLDEN_FRAME &&
+           (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
+           (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
       int pf_rate[3];
       int64_t pf_dist[3];
       unsigned int pf_var[3];
@@ -1362,13 +1684,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
       for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
         int64_t cost;
-        mbmi->interp_filter = filter;
+        mi->interp_filter = filter;
         vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
                           &pf_var[filter], &pf_sse[filter]);
         pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
         cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
-        pf_tx_size[filter] = mbmi->tx_size;
+        pf_tx_size[filter] = mi->tx_size;
         if (cost < best_cost) {
           best_filter = filter;
           best_cost = cost;
@@ -1379,12 +1701,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
               free_pred_buffer(this_mode_pred);
               this_mode_pred = current_pred;
             }
-
-            if (filter < EIGHTTAP_SHARP) {
-              current_pred = &tmp[get_pred_buffer(tmp, 3)];
-              pd->dst.buf = current_pred->data;
-              pd->dst.stride = bw;
-            }
+            current_pred = &tmp[get_pred_buffer(tmp, 3)];
+            pd->dst.buf = current_pred->data;
+            pd->dst.stride = bw;
           }
         }
       }
@@ -1392,8 +1711,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (reuse_inter_pred && this_mode_pred != current_pred)
         free_pred_buffer(current_pred);
 
-      mbmi->interp_filter = best_filter;
-      mbmi->tx_size = pf_tx_size[best_filter];
+      mi->interp_filter = best_filter;
+      mi->tx_size = pf_tx_size[best_filter];
       this_rdc.rate = pf_rate[best_filter];
       this_rdc.dist = pf_dist[best_filter];
       var_y = pf_var[best_filter];
@@ -1404,13 +1723,21 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         pd->dst.stride = this_mode_pred->stride;
       }
     } else {
-      mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+      // TODO(jackychen): the low-bitdepth condition causes a segfault in
+      // high-bitdepth builds.
+      // https://bugs.chromium.org/p/webm/issues/detail?id=1250
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int large_block = bsize > BLOCK_32X32;
+#else
+      const int large_block = bsize >= BLOCK_32X32;
+#endif
+      mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
 
       // For large partition blocks, extra testing is done.
-      if (bsize > BLOCK_32X32 &&
-        !cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id) &&
-        cm->base_qindex) {
+      if (cpi->oxcf.rc_mode == VPX_CBR && large_block &&
+          !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+          cm->base_qindex) {
         model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
                                 &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
                                 &this_early_term);
@@ -1422,8 +1749,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (!this_early_term) {
       this_sse = (int64_t)sse_y;
-      block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
-                &this_sse, 0, bsize, VPXMIN(mbmi->tx_size, TX_16X16));
+      block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
+                VPXMIN(mi->tx_size, TX_16X16));
       x->skip_txfm[0] = is_skippable;
       if (is_skippable) {
         this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -1439,7 +1766,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       }
 
       if (cm->interp_filter == SWITCHABLE) {
-        if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+        if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07)
           this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
       }
     } else {
@@ -1449,17 +1776,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
-      int uv_rate = 0;
-      int64_t uv_dist = 0;
+      RD_COST rdc_uv;
       const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
       if (x->color_sensitivity[0])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
       if (x->color_sensitivity[1])
         vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &uv_rate, &uv_dist,
-                         &var_y, &sse_y, 1, 2);
-      this_rdc.rate += uv_rate;
-      this_rdc.dist += uv_dist;
+      model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+      this_rdc.rate += rdc_uv.rate;
+      this_rdc.dist += rdc_uv.dist;
     }
 
     this_rdc.rate += rate_mv;
@@ -1469,6 +1794,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     this_rdc.rate += ref_frame_cost[ref_frame];
     this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
 
+    // Bias against non-zero motion
+    if (cpi->oxcf.rc_mode == VPX_CBR &&
+        cpi->oxcf.speed >= 5 &&
+        cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+        !x->sb_is_skin) {
+      vp9_large_block_mv_bias(&cpi->noise_estimate, &this_rdc, bsize,
+                              frame_mv[this_mode][ref_frame].as_mv.row,
+                              frame_mv[this_mode][ref_frame].as_mv.col,
+                              ref_frame == LAST_FRAME);
+    }
+
     // Skipping checking: test to see if this block can be reconstructed by
     // prediction only.
     if (cpi->allow_encode_breakout) {
@@ -1483,8 +1819,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0)
-      vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+    if (cpi->oxcf.noise_sensitivity > 0 &&
+        cpi->denoiser.denoising_level > kDenLowLow) {
+      vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+      // Keep track of zero_last cost.
+      if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+        zero_last_cost_orig = this_rdc.rdcost;
+    }
 #else
     (void)ctx;
 #endif
@@ -1492,8 +1833,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
       best_rdc = this_rdc;
       best_mode = this_mode;
-      best_pred_filter = mbmi->interp_filter;
-      best_tx_size = mbmi->tx_size;
+      best_pred_filter = mi->interp_filter;
+      best_tx_size = mi->tx_size;
       best_ref_frame = ref_frame;
       best_mode_skip_txfm = x->skip_txfm[0];
       best_early_term = this_early_term;
@@ -1518,20 +1859,33 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  mbmi->mode          = best_mode;
-  mbmi->interp_filter = best_pred_filter;
-  mbmi->tx_size       = best_tx_size;
-  mbmi->ref_frame[0]  = best_ref_frame;
-  mbmi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
-  xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+  mi->mode          = best_mode;
+  mi->interp_filter = best_pred_filter;
+  mi->tx_size       = best_tx_size;
+  mi->ref_frame[0]  = best_ref_frame;
+  mi->mv[0].as_int  = frame_mv[best_mode][best_ref_frame].as_int;
+  xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
   x->skip_txfm[0] = best_mode_skip_txfm;
 
+  // For spatial enhancemanent layer: perform intra prediction only if base
+  // layer is chosen as the reference. Always perform intra prediction if
+  // LAST is the only reference or is_key_frame is set.
+  if (cpi->svc.spatial_layer_id) {
+    perform_intra_pred =
+        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
+        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME])  ||
+        (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame
+            && svc_force_zero_mode[best_ref_frame - 1]);
+    inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
+  }
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
-  if (best_rdc.rdcost == INT64_MAX ||
-      (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
-       bsize <= cpi->sf.max_intra_bsize)) {
-    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+  if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) &&
+      perform_intra_pred &&
+      (best_rdc.rdcost == INT64_MAX ||
+       (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+        bsize <= cpi->sf.max_intra_bsize))) {
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
     int i;
     TX_SIZE best_intra_tx_size = TX_SIZES;
     TX_SIZE intra_tx_size =
@@ -1566,6 +1920,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       const PREDICTION_MODE this_mode = intra_mode_list[i];
       THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
       int mode_rd_thresh = rd_threshes[mode_index];
+      if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
+          this_mode != DC_PRED) {
+        continue;
+      }
 
       if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
         continue;
@@ -1574,14 +1932,24 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                               rd_thresh_freq_fact[mode_index]))
         continue;
 
-      mbmi->mode = this_mode;
-      mbmi->ref_frame[0] = INTRA_FRAME;
+      mi->mode = this_mode;
+      mi->ref_frame[0] = INTRA_FRAME;
+      this_rdc.dist = this_rdc.rate = 0;
       args.mode = this_mode;
-      args.rate = 0;
-      args.dist = 0;
-      mbmi->tx_size = intra_tx_size;
+      args.skippable = 1;
+      args.rdc = &this_rdc;
+      mi->tx_size = intra_tx_size;
       vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
                                              estimate_block_intra, &args);
+      // Check skip cost here since skippable is not set for for uv, this
+      // mirrors the behavior used by inter
+      if (args.skippable) {
+        x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+      } else {
+        x->skip_txfm[0] = SKIP_TXFM_NONE;
+        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+      }
       // Inter and intra RD will mismatch in scale for non-screen content.
       if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
         if (x->color_sensitivity[0])
@@ -1591,8 +1959,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
                                                  estimate_block_intra, &args);
       }
-      this_rdc.rate = args.rate;
-      this_rdc.dist = args.dist;
       this_rdc.rate += cpi->mbmode_cost[this_mode];
       this_rdc.rate += ref_frame_cost[INTRA_FRAME];
       this_rdc.rate += intra_cost_penalty;
@@ -1602,29 +1968,33 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (this_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = this_rdc;
         best_mode = this_mode;
-        best_intra_tx_size = mbmi->tx_size;
+        best_intra_tx_size = mi->tx_size;
         best_ref_frame = INTRA_FRAME;
-        mbmi->uv_mode = this_mode;
-        mbmi->mv[0].as_int = INVALID_MV;
+        mi->uv_mode = this_mode;
+        mi->mv[0].as_int = INVALID_MV;
         best_mode_skip_txfm = x->skip_txfm[0];
       }
     }
 
     // Reset mb_mode_info to the best inter mode.
     if (best_ref_frame != INTRA_FRAME) {
-      mbmi->tx_size = best_tx_size;
+      mi->tx_size = best_tx_size;
     } else {
-      mbmi->tx_size = best_intra_tx_size;
+      mi->tx_size = best_intra_tx_size;
     }
   }
 
   pd->dst = orig_dst;
-  mbmi->mode = best_mode;
-  mbmi->ref_frame[0] = best_ref_frame;
+  mi->mode = best_mode;
+  mi->ref_frame[0] = best_ref_frame;
   x->skip_txfm[0] = best_mode_skip_txfm;
 
+  if (!is_inter_block(mi)) {
+    mi->interp_filter = SWITCHABLE_FILTERS;
+  }
+
   if (reuse_inter_pred && best_pred != NULL) {
-    if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
+    if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth)
         vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
@@ -1642,8 +2012,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->resize_pending == 0 &&
+      cpi->denoiser.denoising_level > kDenLowLow &&
+      cpi->denoiser.reset == 0) {
+    VP9_DENOISER_DECISION decision = COPY_BLOCK;
+    vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
+                                frame_mv, reuse_inter_pred, best_tx_size,
+                                best_mode, best_ref_frame, best_pred_filter,
+                                best_mode_skip_txfm);
+    vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision);
+    recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb,
+                                   &best_rdc, bsize, mi_row, mi_col);
+    best_ref_frame = ctx_den.best_ref_frame;
+  }
+#endif
+
   if (cpi->sf.adaptive_rd_thresh) {
-    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
+    THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)];
 
     if (best_ref_frame == INTRA_FRAME) {
       // Only consider the modes that are included in the intra_mode_list.
@@ -1677,12 +2064,12 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
   MV_REFERENCE_FRAME best_ref_frame = NONE;
-  unsigned char segment_id = mbmi->segment_id;
+  unsigned char segment_id = mi->segment_id;
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
@@ -1708,8 +2095,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
       vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
                            sf, sf);
       vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame,
-                       candidates, mi_row, mi_col, NULL, NULL,
-                       mbmi_ext->mode_context);
+                       candidates, mi_row, mi_col, mbmi_ext->mode_context);
 
       vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
                             &dummy_mv[0], &dummy_mv[1]);
@@ -1718,13 +2104,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  mbmi->sb_type = bsize;
-  mbmi->tx_size = TX_4X4;
-  mbmi->uv_mode = DC_PRED;
-  mbmi->ref_frame[0] = LAST_FRAME;
-  mbmi->ref_frame[1] = NONE;
-  mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
-                                                        : cm->interp_filter;
+  mi->sb_type = bsize;
+  mi->tx_size = TX_4X4;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                      : cm->interp_filter;
 
   for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
     int64_t this_rd = 0;
@@ -1733,6 +2119,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     if (ref_frame_skip_mask & (1 << ref_frame))
       continue;
 
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) &&
+        ref_frame > INTRA_FRAME &&
+        vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+      continue;
+#endif
+
     // TODO(jingning, agrange): Scaling reference frame not supported for
     // sub8x8 blocks. Is this supported now?
     if (ref_frame > INTRA_FRAME &&
@@ -1745,7 +2138,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
       continue;
 
-    mbmi->ref_frame[0] = ref_frame;
+    mi->ref_frame[0] = ref_frame;
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -1799,7 +2192,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
             const int tmp_col_max = x->mv_col_max;
             const int tmp_row_min = x->mv_row_min;
             const int tmp_row_max = x->mv_row_max;
-            int dummy_dist;
+            uint32_t dummy_dist;
 
             if (i == 0) {
               mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
@@ -1862,7 +2255,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                     &xd->block_refs[0]->sf,
                                     4 * num_4x4_blocks_wide,
                                     4 * num_4x4_blocks_high, 0,
-                                    vp9_filter_kernels[mbmi->interp_filter],
+                                    vp9_filter_kernels[mi->interp_filter],
                                     MV_PRECISION_Q3,
                                     mi_col * MI_SIZE + 4 * (i & 0x01),
                                     mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
@@ -1874,7 +2267,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
                                      &xd->block_refs[0]->sf,
                                      4 * num_4x4_blocks_wide,
                                      4 * num_4x4_blocks_high, 0,
-                                     vp9_filter_kernels[mbmi->interp_filter],
+                                     vp9_filter_kernels[mi->interp_filter],
                                      MV_PRECISION_Q3,
                                      mi_col * MI_SIZE + 4 * (i & 0x01),
                                      mi_row * MI_SIZE + 4 * (i >> 1));
@@ -1916,8 +2309,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }  // reference frames
 
-  mbmi->tx_size = TX_4X4;
-  mbmi->ref_frame[0] = best_ref_frame;
+  mi->tx_size = TX_4X4;
+  mi->ref_frame[0] = best_ref_frame;
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
       const int block = idy * 2 + idx;
@@ -1928,7 +2321,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
     }
   }
-  mbmi->mode = xd->mi[0]->bmi[3].as_mode;
+  mi->mode = xd->mi[0]->bmi[3].as_mode;
   ctx->mic = *(xd->mi[0]);
   ctx->mbmi_ext = *x->mbmi_ext;
   ctx->skip_txfm[0] = SKIP_TXFM_NONE;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index cb3e21a..d68b684 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -94,7 +94,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
       dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
       if (abs_qcoeff)
@@ -219,12 +219,12 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
-  int l;
+  int l, m;
   t = d;
   for (l = 0; t > 1; l++)
     t >>= 1;
-  t = 1 + (1 << (16 + l)) / d;
-  *quant = (int16_t)(t - (1 << 16));
+  m = 1 + (1 << (16 + l)) / d;
+  *quant = (int16_t)(m - (1 << 16));
   *shift = 1 << (16 - l);
 }
 
@@ -308,7 +308,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   QUANTS *const quants = &cpi->quants;
-  const int segment_id = xd->mi[0]->mbmi.segment_id;
+  const int segment_id = xd->mi[0]->segment_id;
   const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
   int i;
@@ -342,8 +342,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
-  x->errorperbit = rdmult >> 6;
-  x->errorperbit += (x->errorperbit == 0);
+  set_error_per_bit(x, rdmult);
 
   vp9_initialize_me_consts(cpi, x, x->q_index);
 }
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index d700685..b45f8d0 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -133,7 +133,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m,
     kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
     arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
     arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
-    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
     rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
   }
 }
@@ -337,6 +337,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   rc->total_actual_bits = 0;
   rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
+  rc->avg_frame_low_motion = 0;
+  rc->high_source_sad = 0;
+  rc->count_last_scene_change = 0;
+  rc->avg_source_sad = 0;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
@@ -370,8 +374,9 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
 int vp9_rc_drop_frame(VP9_COMP *cpi) {
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
-
-  if (!oxcf->drop_frames_water_mark) {
+  if (!oxcf->drop_frames_water_mark ||
+      (is_one_pass_cbr_svc(cpi) &&
+       cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) {
     return 0;
   } else {
     if (rc->buffer_level < 0) {
@@ -499,6 +504,12 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   else
     cpi->rc.rc_1_frame = 0;
 
+  // Turn off oscilation detection in the case of massive overshoot.
+  if (cpi->rc.rc_1_frame == -1 && cpi->rc.rc_2_frame == 1 &&
+      correction_factor > 1000) {
+    cpi->rc.rc_2_frame = 0;
+  }
+
   if (correction_factor > 102) {
     // We are not already at the worst allowable quality
     correction_factor = (int)(100 + ((correction_factor - 100) *
@@ -614,15 +625,16 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
 
   if (cpi->common.frame_type == KEY_FRAME) {
     active_worst_quality = curr_frame == 0 ? rc->worst_quality
-                                           : rc->last_q[KEY_FRAME] * 2;
+                                           : rc->last_q[KEY_FRAME] << 1;
   } else {
     if (!rc->is_src_frame_alt_ref &&
         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
-      active_worst_quality =  curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
+      active_worst_quality =  curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 >> 2
                                               : rc->last_q[INTER_FRAME];
     } else {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
-                                             : rc->last_q[INTER_FRAME] * 2;
+      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] << 1 :
+          VPXMIN(rc->last_q[INTER_FRAME] << 1,
+                (rc->avg_frame_qindex[INTER_FRAME] * 3 >> 1));
     }
   }
   return VPXMIN(active_worst_quality, rc->worst_quality);
@@ -655,7 +667,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
                    VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
                           rc->avg_frame_qindex[KEY_FRAME]) :
                    rc->avg_frame_qindex[INTER_FRAME];
-  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 / 4);
+  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2);
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
@@ -804,8 +816,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
   return q;
 }
 
-static int get_active_cq_level(const RATE_CONTROL *rc,
-                               const VP9EncoderConfig *const oxcf) {
+static int get_active_cq_level_one_pass(
+    const RATE_CONTROL *rc, const VP9EncoderConfig *const oxcf) {
   static const double cq_adjust_threshold = 0.1;
   int active_cq_level = oxcf->cq_level;
   if (oxcf->rc_mode == VPX_CQ &&
@@ -818,13 +830,36 @@ static int get_active_cq_level(const RATE_CONTROL *rc,
   return active_cq_level;
 }
 
+#define SMOOTH_PCT_MIN  0.1
+#define SMOOTH_PCT_DIV  0.05
+static int get_active_cq_level_two_pass(
+    const TWO_PASS *twopass, const RATE_CONTROL *rc,
+    const VP9EncoderConfig *const oxcf) {
+  static const double cq_adjust_threshold = 0.1;
+  int active_cq_level = oxcf->cq_level;
+  if (oxcf->rc_mode == VPX_CQ) {
+    if (twopass->mb_smooth_pct > SMOOTH_PCT_MIN) {
+      active_cq_level -= (int)((twopass->mb_smooth_pct - SMOOTH_PCT_MIN) /
+          SMOOTH_PCT_DIV);
+      active_cq_level = VPXMAX(active_cq_level, 0);
+    }
+    if (rc->total_target_bits > 0) {
+      const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+      if (x < cq_adjust_threshold) {
+        active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+      }
+    }
+  }
+  return active_cq_level;
+}
+
 static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
                                              int *bottom_index,
                                              int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const int cq_level = get_active_cq_level(rc, oxcf);
+  const int cq_level = get_active_cq_level_one_pass(rc, oxcf);
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
   int q;
@@ -832,10 +867,16 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
   ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
   if (frame_is_intra_only(cm)) {
-    // Handle the special case for key frames forced when we have reached
-    // the maximum key frame interval. Here force the Q to a range
-    // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
+    if (oxcf->rc_mode == VPX_Q) {
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25,
+                                            cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    } else if (rc->this_key_frame_forced) {
+      // Handle the special case for key frames forced when we have reached
+      // the maximum key frame interval. Here force the Q to a range
+      // based on the ambient Q to reduce the risk of popping.
       int qindex = rc->last_boosted_qindex;
       double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
       int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
@@ -868,9 +909,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
-    if (rc->frames_since_key > 1 &&
-        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = rc->avg_frame_qindex[INTER_FRAME];
+    if (rc->frames_since_key > 1) {
+      if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+        q = rc->avg_frame_qindex[INTER_FRAME];
+      } else {
+        q = active_worst_quality;
+      }
     } else {
       q = rc->avg_frame_qindex[KEY_FRAME];
     }
@@ -885,23 +929,37 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
       active_best_quality = active_best_quality * 15 / 16;
 
     } else if (oxcf->rc_mode == VPX_Q) {
-      if (!cpi->refresh_alt_ref_frame) {
-        active_best_quality = cq_level;
-      } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-      }
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      int delta_qindex;
+      if (cpi->refresh_alt_ref_frame)
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth);
+      else
+        delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
     }
   } else {
     if (oxcf->rc_mode == VPX_Q) {
-      active_best_quality = cq_level;
+      int qindex = cq_level;
+      double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      double delta_rate[FIXED_GF_INTERVAL] =
+          {0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0};
+      int delta_qindex =
+          vp9_compute_qdelta(rc, q,
+                             q * delta_rate[cm->current_video_frame %
+                             FIXED_GF_INTERVAL], cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      // Use the lower of active_worst_quality and recent/average Q.
-      if (cm->current_video_frame > 1)
-        active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
-      else
+      // Use the min of the average Q and active_worst_quality as basis for
+      // active_best.
+      if (cm->current_video_frame > 1) {
+        q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality);
+        active_best_quality = inter_minq[q];
+      } else {
         active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      }
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
       if ((oxcf->rc_mode == VPX_CQ) &&
@@ -993,7 +1051,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
   const RATE_CONTROL *const rc = &cpi->rc;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  const int cq_level = get_active_cq_level(rc, oxcf);
+  const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
   int q;
@@ -1074,7 +1132,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
       if (!cpi->refresh_alt_ref_frame) {
         active_best_quality = cq_level;
       } else {
-       active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
 
         // Modify best quality for second level arfs. For mode VPX_Q this
         // becomes the baseline frame q.
@@ -1101,8 +1159,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if ((cpi->oxcf.rc_mode != VPX_Q) &&
-      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+  if (cpi->oxcf.rc_mode != VPX_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
          (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
@@ -1256,8 +1313,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
     rc->frames_since_golden = 0;
 
     // If we are not using alt ref in the up and coming group clear the arf
-    // active flag.
-    if (!rc->source_alt_ref_pending) {
+    // active flag. In multi arf group case, if the index is not 0 then
+    // we are overlaying a mid group arf so should not reset the flag.
+    if (cpi->oxcf.pass == 2) {
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
 
@@ -1274,6 +1335,26 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
   }
 }
 
+static void compute_frame_low_motion(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_row, mi_col;
+  MODE_INFO **mi = cm->mi_grid_visible;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  int cnt_zeromv = 0;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      if (abs(mi[0]->mv[0].as_mv.row) < 16 &&
+          abs(mi[0]->mv[0].as_mv.col) < 16)
+        cnt_zeromv++;
+      mi++;
+    }
+    mi += 8;
+  }
+  cnt_zeromv = 100 * cnt_zeromv / (rows * cols);
+  rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2;
+}
+
 void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1308,9 +1389,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
       }
     }
   } else {
-    if (rc->is_src_frame_alt_ref ||
-        !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
-        (cpi->use_svc && oxcf->rc_mode == VPX_CBR)) {
+    if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+        (!rc->is_src_frame_alt_ref &&
+         !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
       rc->last_q[INTER_FRAME] = qindex;
       rc->avg_frame_qindex[INTER_FRAME] =
         ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1383,6 +1464,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
         rc->next_frame_size_selector != rc->frame_size_selector;
     rc->frame_size_selector = rc->next_frame_size_selector;
   }
+
+  if (oxcf->pass == 0) {
+    if (cm->frame_type != KEY_FRAME)
+      compute_frame_low_motion(cpi);
+  }
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1421,6 +1507,24 @@ static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
+static void adjust_gf_key_frame(VP9_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  rc->constrained_gf_group = 0;
+  // Reset gf interval to make more equal spacing for up-coming key frame.
+  if ((rc->frames_to_key <= 7 * rc->baseline_gf_interval >> 2) &&
+      (rc->frames_to_key > rc->baseline_gf_interval)) {
+    rc->baseline_gf_interval = rc->frames_to_key >> 1;
+    if (rc->baseline_gf_interval < 5)
+      rc->baseline_gf_interval = rc->frames_to_key;
+    rc->constrained_gf_group = 1;
+  } else {
+    // Reset since frames_till_gf_update_due must be <= frames_to_key.
+    if (rc->baseline_gf_interval > rc->frames_to_key) {
+      rc->baseline_gf_interval = rc->frames_to_key;
+      rc->constrained_gf_group = 1;
+    }
+  }
+}
 void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1441,24 +1545,41 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
     cm->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
-    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    // NOTE: frames_till_gf_update_due must be <= frames_to_key.
-    if (rc->frames_till_gf_update_due > rc->frames_to_key) {
-      rc->frames_till_gf_update_due = rc->frames_to_key;
-      rc->constrained_gf_group = 1;
+    double rate_err = 1.0;
+    rc->gfu_boost = DEFAULT_GF_BOOST;
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
+      vp9_cyclic_refresh_set_golden_update(cpi);
     } else {
-      rc->constrained_gf_group = 0;
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
     }
+    if (rc->rolling_target_bits > 0)
+      rate_err =
+          (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+    // Increase gf interval at high Q and high overshoot.
+    if (cm->current_video_frame > 30 &&
+        rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
+        rate_err > 3.5) {
+      rc->baseline_gf_interval =
+          VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
+    } else if (cm->current_video_frame > 30 &&
+               rc->avg_frame_low_motion < 20) {
+      // Decrease boost and gf interval for high motion case.
+      rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+      rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1);
+    }
+    adjust_gf_key_frame(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     cpi->refresh_golden_frame = 1;
     rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
-    rc->gfu_boost = DEFAULT_GF_BOOST;
   }
   if (cm->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_vbr(cpi);
   else
     target = calc_pframe_target_size_one_pass_vbr(cpi);
   vp9_rc_set_frame_target(cpi, target);
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
+    vp9_cyclic_refresh_update_parameters(cpi);
 }
 
 static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
@@ -1539,41 +1660,31 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
-// Reset information needed to set proper reference frames and buffer updates
-// for temporal layering. This is called when a key frame is encoded.
-static void reset_temporal_layer_to_zero(VP9_COMP *cpi) {
-  int sl;
-  LAYER_CONTEXT *lc = NULL;
-  cpi->svc.temporal_layer_id = 0;
-
-  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
-    lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
-    lc->current_video_frame_in_layer = 0;
-    lc->frames_from_key_frame = 0;
-  }
-}
-
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target = rc->avg_frame_bandwidth;
-  const int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
+  int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
       cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
-
+  // Periodic key frames is based on the super-frame counter
+  // (svc.current_superframe), also only base spatial layer is key frame.
   if ((cm->current_video_frame == 0) ||
       (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-      (cpi->oxcf.auto_key && (rc->frames_since_key %
-          cpi->oxcf.key_freq == 0))) {
+      (cpi->oxcf.auto_key &&
+       (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) &&
+       cpi->svc.spatial_layer_id == 0)) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
-
     if (is_two_pass_svc(cpi)) {
       cpi->svc.layer_context[layer].is_key_frame = 1;
       cpi->ref_frame_flags &=
           (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
     } else if (is_one_pass_cbr_svc(cpi)) {
+      if (cm->current_video_frame > 0)
+        vp9_svc_reset_key_frame(cpi);
+      layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
+           cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
       cpi->svc.layer_context[layer].is_key_frame = 1;
-      reset_temporal_layer_to_zero(cpi);
       cpi->ref_frame_flags &=
                 (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
       // Assumption here is that LAST_FRAME is being updated for a keyframe.
@@ -1715,29 +1826,36 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
                                   RATE_CONTROL *const rc) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
-  // Set Maximum gf/arf interval
-  rc->max_gf_interval = oxcf->max_gf_interval;
-  rc->min_gf_interval = oxcf->min_gf_interval;
-  if (rc->min_gf_interval == 0)
-    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
-        oxcf->width, oxcf->height, cpi->framerate);
-  if (rc->max_gf_interval == 0)
-    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
-        cpi->framerate, rc->min_gf_interval);
+  // Special case code for 1 pass fixed Q mode tests
+  if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+  } else {
+    // Set Maximum gf/arf interval
+    rc->max_gf_interval = oxcf->max_gf_interval;
+    rc->min_gf_interval = oxcf->min_gf_interval;
+    if (rc->min_gf_interval == 0)
+      rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+          oxcf->width, oxcf->height, cpi->framerate);
+    if (rc->max_gf_interval == 0)
+      rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+          cpi->framerate, rc->min_gf_interval);
+
+    // Extended interval for genuinely static scenes
+    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+
+    if (is_altref_enabled(cpi)) {
+      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    }
 
-  // Extended interval for genuinely static scenes
-  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+    if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+      rc->max_gf_interval = rc->static_scene_max_gf_interval;
 
-  if (is_altref_enabled(cpi)) {
-    if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-      rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+    // Clamp min to max
+    rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
   }
-
-  if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
-    rc->max_gf_interval = rc->static_scene_max_gf_interval;
-
-  // Clamp min to max
-  rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
 }
 
 void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1774,27 +1892,28 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
   int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
   int max_delta;
-  double position_factor = 1.0;
-
-  // How far through the clip are we.
-  // This number is used to damp the per frame rate correction.
-  // Range 0 - 1.0
-  if (cpi->twopass.total_stats.count) {
-    position_factor = sqrt((double)cpi->common.current_video_frame /
-                           cpi->twopass.total_stats.count);
-  }
-  max_delta = (int)(position_factor *
-                    ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
-
-  // vbr_bits_off_target > 0 means we have extra bits to spend
-  if (vbr_bits_off_target > 0) {
-    *this_frame_target +=
-      (vbr_bits_off_target > max_delta) ? max_delta
-                                        : (int)vbr_bits_off_target;
-  } else {
-    *this_frame_target -=
-      (vbr_bits_off_target < -max_delta) ? max_delta
-                                         : (int)-vbr_bits_off_target;
+  int frame_window = VPXMIN(16,
+      ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame));
+
+  // Calcluate the adjustment to rate for this frame.
+  if (frame_window > 0) {
+    max_delta = (vbr_bits_off_target > 0)
+        ? (int)(vbr_bits_off_target / frame_window)
+        : (int)(-vbr_bits_off_target / frame_window);
+
+    max_delta = VPXMIN(max_delta,
+        ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend
+    if (vbr_bits_off_target > 0) {
+      *this_frame_target +=
+        (vbr_bits_off_target > max_delta) ? max_delta
+                                          : (int)vbr_bits_off_target;
+    } else {
+      *this_frame_target -=
+        (vbr_bits_off_target < -max_delta) ? max_delta
+                                           : (int)-vbr_bits_off_target;
+    }
   }
 
   // Fast redistribution of bits arising from massive local undershoot.
@@ -1835,6 +1954,9 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
   RESIZE_ACTION resize_action = NO_RESIZE;
   int avg_qp_thr1 = 70;
   int avg_qp_thr2 = 50;
+  int min_width = 180;
+  int min_height = 180;
+  int down_size_on = 1;
   cpi->resize_scale_num = 1;
   cpi->resize_scale_den = 1;
   // Don't resize on key frame; reset the counters on key frame.
@@ -1843,6 +1965,21 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
     cpi->resize_count = 0;
     return 0;
   }
+  // Check current frame reslution to avoid generating frames smaller than
+  // the minimum resolution.
+  if (ONEHALFONLY_RESIZE) {
+    if ((cm->width >> 1) < min_width || (cm->height >> 1) < min_height)
+      down_size_on = 0;
+  } else {
+    if (cpi->resize_state == ORIG &&
+        (cm->width * 3 / 4 < min_width ||
+         cm->height * 3 / 4 < min_height))
+      return 0;
+    else if (cpi->resize_state == THREE_QUARTER &&
+             ((cpi->oxcf.width >> 1) < min_width ||
+              (cpi->oxcf.height >> 1) < min_height))
+      down_size_on = 0;
+  }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   // If denoiser is on, apply a smaller qp threshold.
@@ -1854,7 +1991,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
 
   // Resize based on average buffer underflow and QP over some window.
   // Ignore samples close to key frame, since QP is usually high after key.
-  if (cpi->rc.frames_since_key > 1 * cpi->framerate) {
+  if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
     const int window = (int)(4 * cpi->framerate);
     cpi->resize_avg_qp += cm->base_qindex;
     if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
@@ -1869,7 +2006,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
       // down state, i.e. 1/2 or 3/4 of original resolution.
       // Currently, use a flag to turn 3/4 resizing feature on/off.
       if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) {
-        if (cpi->resize_state == THREE_QUARTER) {
+        if (cpi->resize_state == THREE_QUARTER && down_size_on) {
           resize_action = DOWN_ONEHALF;
           cpi->resize_state = ONE_HALF;
         } else if (cpi->resize_state == ORIG) {
@@ -1955,13 +2092,17 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
   VP9_COMMON * const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   rc->high_source_sad = 0;
-  if (cpi->Last_Source != NULL) {
+  if (cpi->Last_Source != NULL &&
+      cpi->Last_Source->y_width == cpi->Source->y_width &&
+      cpi->Last_Source->y_height == cpi->Source->y_height) {
     const uint8_t *src_y = cpi->Source->y_buffer;
     const int src_ystride = cpi->Source->y_stride;
     const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
     const int last_src_ystride = cpi->Last_Source->y_stride;
     int sbi_row, sbi_col;
     const BLOCK_SIZE bsize = BLOCK_64X64;
+    uint32_t min_thresh = 4000;
+    float thresh = 8.0f;
     // Loop over sub-sample of frame, and compute average sad over 64x64 blocks.
     uint64_t avg_sad = 0;
     int num_samples = 0;
@@ -1992,12 +2133,37 @@ void vp9_avg_source_sad(VP9_COMP *cpi) {
     // between current and the previous frame value(s). Use a minimum threshold
     // for cases where there is small change from content that is completely
     // static.
-    if (avg_sad > VPXMAX(4000, (rc->avg_source_sad << 3)) &&
+    if (cpi->oxcf.rc_mode == VPX_VBR) {
+      min_thresh = 60000;
+      thresh = 2.1f;
+    }
+    if (avg_sad >
+        VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad  * thresh)) &&
         rc->frames_since_key > 1)
       rc->high_source_sad = 1;
     else
       rc->high_source_sad = 0;
-    rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+    if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
+      rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+    // For VBR, under scene change/high content change, force golden refresh.
+    if (cpi->oxcf.rc_mode == VPX_VBR &&
+        rc->high_source_sad &&
+        rc->frames_to_key > 3 &&
+        rc->count_last_scene_change > 4 &&
+        cpi->ext_refresh_frame_flags_pending == 0) {
+      int target;
+      cpi->refresh_golden_frame = 1;
+      rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+      rc->baseline_gf_interval = VPXMIN(20,
+          VPXMAX(10, rc->baseline_gf_interval));
+      adjust_gf_key_frame(cpi);
+      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+      target = calc_pframe_target_size_one_pass_vbr(cpi);
+      vp9_rc_set_frame_target(cpi, target);
+      rc->count_last_scene_change = 0;
+    } else {
+      rc->count_last_scene_change++;
+    }
   }
 }
 
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 136fd3e..7024bcf 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -26,6 +26,7 @@ extern "C" {
 
 #define MIN_GF_INTERVAL     4
 #define MAX_GF_INTERVAL     16
+#define FIXED_GF_INTERVAL   8    // Used in some testing modes only
 #define ONEHALFONLY_RESIZE  0
 
 typedef enum {
@@ -160,6 +161,8 @@ typedef struct {
 
   uint64_t avg_source_sad;
   int high_source_sad;
+  int count_last_scene_change;
+  int avg_frame_low_motion;
 } RATE_CONTROL;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index b085c7a..91b2911 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -41,7 +41,6 @@
 #include "vp9/encoder/vp9_tokenize.h"
 
 #define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
 
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -76,10 +75,12 @@ static void fill_mode_costs(VP9_COMP *cpi) {
                       vp9_intra_mode_tree);
 
   vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
-  vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME],
-                  vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
-  vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME],
-                  fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
+  for (i = 0; i < INTRA_MODES; ++i) {
+    vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i],
+                    vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree);
+    vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i],
+                    fc->uv_mode_prob[i], vp9_intra_mode_tree);
+  }
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp9_cost_tokens(cpi->switchable_interp_costs[i],
@@ -277,8 +278,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
   rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
-  x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
-  x->errorperbit += (x->errorperbit == 0);
+  set_error_per_bit(x, rd->RDMULT);
 
   x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                        cm->frame_type != KEY_FRAME) ? 0 : 1;
@@ -286,29 +286,37 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   set_block_thresholds(cm, rd);
   set_partition_probs(cm, xd);
 
-  if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
-    fill_token_costs(x->token_costs, cm->fc->coef_probs);
-
-  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
-      cm->frame_type == KEY_FRAME) {
-    for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
-                      vp9_partition_tree);
-  }
+  if (cpi->oxcf.pass == 1) {
+    if (!frame_is_intra_only(cm))
+      vp9_build_nmv_cost_table(
+          x->nmvjointcost,
+          cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+          &cm->fc->nmvc, cm->allow_high_precision_mv);
+  } else {
+    if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+      fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+    if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+        cm->frame_type == KEY_FRAME) {
+      for (i = 0; i < PARTITION_CONTEXTS; ++i)
+        vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
+                        vp9_partition_tree);
+    }
 
-  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
-      cm->frame_type == KEY_FRAME) {
-    fill_mode_costs(cpi);
+    if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
+        cm->frame_type == KEY_FRAME) {
+      fill_mode_costs(cpi);
 
-    if (!frame_is_intra_only(cm)) {
-      vp9_build_nmv_cost_table(x->nmvjointcost,
-                               cm->allow_high_precision_mv ? x->nmvcost_hp
-                                                           : x->nmvcost,
-                               &cm->fc->nmvc, cm->allow_high_precision_mv);
+      if (!frame_is_intra_only(cm)) {
+        vp9_build_nmv_cost_table(
+            x->nmvjointcost,
+            cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+            &cm->fc->nmvc, cm->allow_high_precision_mv);
 
-      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-        vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
-                        cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
+        for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+          vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
+                          cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
+      }
     }
   }
 }
@@ -341,6 +349,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
        38,    28,    21,    16,    12,    10,     8,     6,
         5,     3,     2,     1,     1,     1,     0,     0,
   };
+
   // Normalized distortion:
   // This table models the normalized distortion for a Laplacian source
   // with given variance when quantized with a uniform quantizer
@@ -407,7 +416,7 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
         (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
-    *rate = ((r_q10 << n_log2) + 2) >> 2;
+    *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
   }
 }
@@ -555,10 +564,10 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
 }
 
 int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const MODE_INFO *const mi = xd->mi[0];
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *
-             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+             cpi->switchable_interp_costs[ctx][mi->interp_filter];
 }
 
 void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index 28385c9..9b8e273 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -17,15 +17,17 @@
 
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_cost.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define RDDIV_BITS          7
+#define RD_EPB_SHIFT        6
 
 #define RDCOST(RM, DM, R, D) \
-  (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
+  (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM))
 #define QIDX_SKIP_THRESH     115
 
 #define MV_COST_WEIGHT      108
@@ -167,6 +169,11 @@ static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
     return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
 }
 
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  x->errorperbit += (x->errorperbit == 0);
+}
+
 void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride,
                  int ref_frame, BLOCK_SIZE block_size);
@@ -181,6 +188,15 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
 int vp9_get_intra_cost_penalty(int qindex, int qdelta,
                                vpx_bit_depth_t bit_depth);
 
+unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 4f3a06e..e65e051 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -165,7 +165,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int i;
   int64_t rate_sum = 0;
   int64_t dist_sum = 0;
-  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+  const int ref = xd->mi[0]->ref_frame[0];
   unsigned int sse;
   unsigned int var = 0;
   unsigned int sum_sse = 0;
@@ -248,7 +248,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
       int quantizer = (pd->dequant[1] >> dequant_shift);
 
       if (quantizer < 120)
-        rate = (square_error * (280 - quantizer)) >> 8;
+        rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
       else
         rate = 0;
       dist = (square_error * quantizer) >> 8;
@@ -361,73 +361,96 @@ static int cost_coeffs(MACROBLOCK *x,
                        const int16_t *scan, const int16_t *nb,
                        int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   const struct macroblock_plane *p = &x->plane[plane];
   const PLANE_TYPE type = get_plane_type(plane);
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
+                   x->token_costs[tx_size][type][is_inter_block(mi)];
   uint8_t token_cache[32 * 32];
   int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+  const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
 #else
-  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+  const int *cat6_high_cost = vp9_get_high_cost_table(8);
 #endif
 
   // Check for consistency of tx_size with mode info
-  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size :
-         get_uv_tx_size(mbmi, &xd->plane[plane]) == tx_size);
+  assert(type == PLANE_TYPE_Y ? mi->tx_size == tx_size :
+         get_uv_tx_size(mi, &xd->plane[plane]) == tx_size);
 
   if (eob == 0) {
     // single eob token
     cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
-    int band_left = *band_count++;
-
-    // dc token
-    int v = qcoeff[0];
-    int16_t prev_t;
-    EXTRABIT e;
-    vp9_get_token_extra(v, &prev_t, &e);
-    cost = (*token_costs)[0][pt][prev_t] +
-        vp9_get_cost(prev_t, e, cat6_high_cost);
-
-    token_cache[0] = vp9_pt_energy_class[prev_t];
-    ++token_costs;
-
-    // ac tokens
-    for (c = 1; c < eob; c++) {
-      const int rc = scan[c];
-      int16_t t;
-
-      v = qcoeff[rc];
-      vp9_get_token_extra(v, &t, &e);
-      if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-      } else {
-        pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] +
-            vp9_get_cost(t, e, cat6_high_cost);
-        token_cache[rc] = vp9_pt_energy_class[t];
-      }
-      prev_t = t;
-      if (!--band_left) {
-        band_left = *band_count++;
-        ++token_costs;
+    if (use_fast_coef_costing) {
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t prev_t;
+      cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
+      cost += (*token_costs)[0][pt][prev_t];
+
+      token_cache[0] = vp9_pt_energy_class[prev_t];
+      ++token_costs;
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+        int16_t t;
+
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &t, cat6_high_cost);
+        cost += (*token_costs)[!prev_t][!prev_t][t];
+        prev_t = t;
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
       }
-    }
 
-    // eob token
-    if (band_left) {
-      if (use_fast_coef_costing) {
+      // eob token
+      if (band_left)
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-      } else {
+
+    } else {  // !use_fast_coef_costing
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t tok;
+      unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+      cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
+      cost += (*token_costs)[0][pt][tok];
+
+      token_cache[0] = vp9_pt_energy_class[tok];
+      ++token_costs;
+
+      tok_cost_ptr = &((*token_costs)[!tok]);
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+
+        v = qcoeff[rc];
+        cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*tok_cost_ptr)[pt][tok];
+        token_cache[rc] = vp9_pt_energy_class[tok];
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
+        tok_cost_ptr = &((*token_costs)[!tok]);
+      }
+
+      // eob token
+      if (band_left) {
         pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
@@ -461,7 +484,7 @@ static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   *out_sse = this_sse >> shift;
 
-  if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
+  if (x->skip_encode && !is_inter_block(xd->mi[0])) {
     // TODO(jingning): tune the model to better capture the distortion.
     int64_t p = (pd->dequant[1] * pd->dequant[1] *
                     (1 << ss_txfrm_size)) >>
@@ -491,7 +514,7 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   int64_t rd1, rd2, rd;
   int rate;
   int64_t dist;
@@ -500,8 +523,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   if (args->exit_early)
     return;
 
-  if (!is_inter_block(mbmi)) {
-    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+  if (!is_inter_block(mi)) {
+    struct encode_b_args arg = {x, NULL, &mi->skip};
     vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
     dist_block(x, plane, block, tx_size, &dist, &sse);
   } else if (max_txsize_lookup[plane_bsize] == tx_size) {
@@ -588,7 +611,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   args.skippable = 1;
 
   if (plane == 0)
-    xd->mi[0]->mbmi.tx_size = tx_size;
+    xd->mi[0]->tx_size = tx_size;
 
   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
@@ -618,13 +641,13 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
   VP9_COMMON *const cm = &cpi->common;
   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
 
-  mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+  mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
 
   txfm_rd_in_plane(x, rate, distortion, skip,
                    sse, ref_best_rd, 0, bs,
-                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+                   mi->tx_size, cpi->sf.use_fast_coef_costing);
 }
 
 static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -637,7 +660,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
@@ -684,7 +707,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
     if (d[n] == INT64_MAX || r[n][0] == INT_MAX) {
       rd[n][0] = rd[n][1] = INT64_MAX;
     } else if (s[n]) {
-      if (is_inter_block(mbmi)) {
+      if (is_inter_block(mi)) {
         rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
         r[n][1] -= r_tx_size;
       } else {
@@ -696,7 +719,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
 
-    if (is_inter_block(mbmi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+    if (is_inter_block(mi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
       rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
       rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
     }
@@ -713,12 +736,12 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
       best_rd = rd[n][1];
     }
   }
-  mbmi->tx_size = best_tx;
+  mi->tx_size = best_tx;
 
-  *distortion = d[mbmi->tx_size];
-  *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
-  *skip       = s[mbmi->tx_size];
-  *psse       = sse[mbmi->tx_size];
+  *distortion = d[mi->tx_size];
+  *rate       = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+  *skip       = s[mi->tx_size];
+  *psse       = sse[mi->tx_size];
 }
 
 static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
@@ -729,7 +752,7 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   int64_t sse;
   int64_t *ret_sse = psse ? psse : &sse;
 
-  assert(bs == xd->mi[0]->mbmi.sb_type);
+  assert(bs == xd->mi[0]->sb_type);
 
   if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
     choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
@@ -787,10 +810,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
 #if CONFIG_VP9_HIGHBITDEPTH
   uint16_t best_dst16[8 * 8];
 #endif
+  memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+  memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
 
-  memcpy(ta, a, sizeof(ta));
-  memcpy(tl, l, sizeof(tl));
-  xd->mi[0]->mbmi.tx_size = TX_4X4;
+  xd->mi[0]->tx_size = TX_4X4;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -810,8 +833,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
             continue;
       }
 
-      memcpy(tempa, ta, sizeof(ta));
-      memcpy(templ, tl, sizeof(tl));
+      memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+      memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -874,8 +897,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
         *bestdistortion = distortion;
         best_rd = this_rd;
         *best_mode = mode;
-        memcpy(a, tempa, sizeof(tempa));
-        memcpy(l, templ, sizeof(templ));
+        memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+        memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
         for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
           memcpy(best_dst16 + idy * 8,
                  CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
@@ -914,8 +937,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
     }
 
-    memcpy(tempa, ta, sizeof(ta));
-    memcpy(templ, tl, sizeof(tl));
+    memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+    memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -976,8 +999,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x,
       *bestdistortion = distortion;
       best_rd = this_rd;
       *best_mode = mode;
-      memcpy(a, tempa, sizeof(tempa));
-      memcpy(l, templ, sizeof(templ));
+      memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+      memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
@@ -1005,7 +1028,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
   MODE_INFO *const mic = xd->mi[0];
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
-  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   int idx, idy;
@@ -1013,12 +1036,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
   int64_t total_distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
   const int *bmode_costs = cpi->mbmode_cost;
 
-  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
@@ -1034,8 +1053,11 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
-                                      bmode_costs, t_above + idx, t_left + idy,
+                                      bmode_costs,
+                                      xd->plane[0].above_context + idx,
+                                      xd->plane[0].left_context + idy,
                                       &r, &ry, &d, bsize, best_rd - total_rd);
+
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
 
@@ -1058,7 +1080,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
   *rate = cost;
   *rate_y = tot_rate_y;
   *distortion = total_distortion;
-  mic->mbmi.mode = mic->bmi[3].as_mode;
+  mic->mode = mic->bmi[3].as_mode;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 }
@@ -1095,7 +1117,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
         break;
     }
 
-    mic->mbmi.mode = mode;
+    mic->mode = mode;
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
         &s, NULL, bsize, best_rd);
@@ -1109,7 +1131,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
-      best_tx         = mic->mbmi.tx_size;
+      best_tx         = mic->tx_size;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
@@ -1117,8 +1139,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  mic->mbmi.mode = mode_selected;
-  mic->mbmi.tx_size = best_tx;
+  mic->mode = mode_selected;
+  mic->tx_size = best_tx;
 
   return best_rd;
 }
@@ -1130,8 +1152,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                             int64_t *sse, BLOCK_SIZE bsize,
                             int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+  MODE_INFO *const mi = xd->mi[0];
+  const TX_SIZE uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
   int plane;
   int pnrate = 0, pnskip = 1;
   int64_t pndist = 0, pnsse = 0;
@@ -1140,7 +1162,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
   if (ref_best_rd < 0)
     is_cost_valid = 0;
 
-  if (is_inter_block(mbmi) && is_cost_valid) {
+  if (is_inter_block(mi) && is_cost_valid) {
     int plane;
     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
       vp9_subtract_plane(x, bsize, plane);
@@ -1192,14 +1214,20 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+    if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+        (xd->above_mi == NULL || xd->left_mi == NULL) && need_top_left[mode])
+      continue;
+#endif  // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
 
-    xd->mi[0]->mbmi.uv_mode = mode;
+    xd->mi[0]->uv_mode = mode;
 
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
                           &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
-                cpi->intra_uv_mode_cost[cpi->common.frame_type][mode];
+        cpi->intra_uv_mode_cost[cpi->common.frame_type]
+                                [xd->mi[0]->mode][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
@@ -1214,7 +1242,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  xd->mi[0]->mbmi.uv_mode = mode_selected;
+  xd->mi[0]->uv_mode = mode_selected;
   return best_rd;
 }
 
@@ -1225,11 +1253,13 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
   const VP9_COMMON *cm = &cpi->common;
   int64_t unused;
 
-  x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
+  x->e_mbd.mi[0]->uv_mode = DC_PRED;
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
-  *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
+  *rate = *rate_tokenonly +
+      cpi->intra_uv_mode_cost[cm->frame_type]
+                              [x->e_mbd.mi[0]->mode][DC_PRED];
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
@@ -1251,7 +1281,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
   }
-  *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
+  *mode_uv = x->e_mbd.mi[0]->uv_mode;
 }
 
 static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
@@ -1267,31 +1297,30 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
                                 int_mv seg_mvs[MAX_REF_FRAMES],
                                 int_mv *best_ref_mv[2], const int *mvjcost,
                                 int *mvcost[2]) {
-  MODE_INFO *const mic = xd->mi[0];
-  const MB_MODE_INFO *const mbmi = &mic->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   int thismvcost = 0;
   int idx, idy;
-  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  const int is_compound = has_second_ref(mbmi);
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type];
+  const int is_compound = has_second_ref(mi);
 
   switch (mode) {
     case NEWMV:
-      this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+      this_mv[0].as_int = seg_mvs[mi->ref_frame[0]].as_int;
       thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       if (is_compound) {
-        this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+        this_mv[1].as_int = seg_mvs[mi->ref_frame[1]].as_int;
         thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       }
       break;
     case NEARMV:
     case NEARESTMV:
-      this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+      this_mv[0].as_int = frame_mv[mode][mi->ref_frame[0]].as_int;
       if (is_compound)
-        this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+        this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int;
       break;
     case ZEROMV:
       this_mv[0].as_int = 0;
@@ -1302,17 +1331,17 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
       break;
   }
 
-  mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+  mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
   if (is_compound)
-    mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
+    mi->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
 
-  mic->bmi[i].as_mode = mode;
+  mi->bmi[i].as_mode = mode;
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
-      memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
+      memmove(&mi->bmi[i + idy * 2 + idx], &mi->bmi[i], sizeof(mi->bmi[i]));
 
-  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mbmi->ref_frame[0]]) +
+  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mi->ref_frame[0]]) +
             thismvcost;
 }
 
@@ -1330,7 +1359,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   struct macroblockd_plane *const pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
   MODE_INFO *const mi = xd->mi[0];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->sb_type, pd);
   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
@@ -1342,15 +1371,29 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
-  const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
+  const int is_compound = has_second_ref(mi);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
-                                               pd->pre[ref].stride)];
+    const int bw = b_width_log2_lookup[BLOCK_8X8];
+    const int h = 4 * (i >> bw);
+    const int w = 4 * (i & ((1 << bw) - 1));
+    const struct scale_factors *sf = &xd->block_refs[ref]->sf;
+    int y_stride = pd->pre[ref].stride;
+    uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w);
+
+    if (vp9_is_scaled(sf)) {
+      const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+      const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+      y_stride = xd->block_refs[ref]->buf->y_stride;
+      pre = xd->block_refs[ref]->buf->y_buffer;
+      pre += scaled_buffer_offset(x_start + w, y_start + h,
+                                  y_stride, sf);
+    }
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_highbd_build_inter_predictor(pre, y_stride,
                                      dst, pd->dst.stride,
                                      &mi->bmi[i].as_mv[ref].as_mv,
                                      &xd->block_refs[ref]->sf, width, height,
@@ -1358,7 +1401,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                                      mi_col * MI_SIZE + 4 * (i % 2),
                                      mi_row * MI_SIZE + 4 * (i / 2), xd->bd);
   } else {
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
@@ -1367,7 +1410,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
                               mi_row * MI_SIZE + 4 * (i / 2));
   }
 #else
-    vp9_build_inter_predictor(pre, pd->pre[ref].stride,
+    vp9_build_inter_predictor(pre, y_stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
                               &xd->block_refs[ref]->sf, width, height, ref,
@@ -1467,7 +1510,7 @@ static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
 }
 
 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
-  MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+  MODE_INFO *const mi = x->e_mbd.mi[0];
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
@@ -1476,17 +1519,17 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
   pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
                                                            pd->pre[0].stride)];
-  if (has_second_ref(mbmi))
+  if (has_second_ref(mi))
     pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
                                                            pd->pre[1].stride)];
 }
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
                                   struct buf_2d orig_pre[2]) {
-  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  MODE_INFO *mi = x->e_mbd.mi[0];
   x->plane[0].src = orig_src;
   x->e_mbd.plane[0].pre[0] = orig_pre[0];
-  if (has_second_ref(mbmi))
+  if (has_second_ref(mi))
     x->e_mbd.plane[0].pre[1] = orig_pre[1];
 }
 
@@ -1541,20 +1584,20 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  const int refs[2] = {mbmi->ref_frame[0],
-                       mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+  MODE_INFO *mi = xd->mi[0];
+  const int refs[2] = {mi->ref_frame[0],
+                       mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]};
   int_mv ref_mv[2];
   int ite, ref;
-  const InterpKernel *kernel = vp9_filter_kernels[mbmi->interp_filter];
+  const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
   struct scale_factors sf;
 
   // Do joint motion search in compound mode to get more accurate mv.
   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
-  int last_besterr[2] = {INT_MAX, INT_MAX};
+  uint32_t last_besterr[2] = {UINT32_MAX, UINT32_MAX};
   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
-    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
-    vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+    vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]),
+    vp9_get_scaled_ref_frame(cpi, mi->ref_frame[1])
   };
 
   // Prediction buffer from second frame.
@@ -1597,7 +1640,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   // and break out of the search loop if it couldn't find a better mv.
   for (ite = 0; ite < 4; ite++) {
     struct buf_2d ref_yv12[2];
-    int bestsme = INT_MAX;
+    uint32_t bestsme = UINT32_MAX;
     int sadpb = x->sadperbit16;
     MV tmp_mv;
     int search_range = 3;
@@ -1662,7 +1705,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                        search_range,
                                        &cpi->fn_ptr[bsize],
                                        &ref_mv[id].as_mv, second_pred);
-    if (bestsme < INT_MAX)
+    if (bestsme < UINT32_MAX)
       bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
                                       second_pred, &cpi->fn_ptr[bsize], 1);
 
@@ -1671,9 +1714,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     x->mv_row_min = tmp_row_min;
     x->mv_row_max = tmp_row_max;
 
-    if (bestsme < INT_MAX) {
-      int dis; /* TODO: use dis in distortion calculation later. */
-      unsigned int sse;
+    if (bestsme < UINT32_MAX) {
+      uint32_t dis; /* TODO: use dis in distortion calculation later. */
+      uint32_t sse;
       bestsme = cpi->find_fractional_mv_step(
           x, &tmp_mv,
           &ref_mv[id].as_mv,
@@ -1730,7 +1773,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
-  MB_MODE_INFO *mbmi = &mi->mbmi;
   int mode_idx;
   int k, br = 0, idx, idy;
   int64_t bd = 0, block_sse = 0;
@@ -1742,13 +1784,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int64_t this_segment_rd = 0;
   int label_mv_thresh;
   int segmentyrate = 0;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const BLOCK_SIZE bsize = mi->sb_type;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   ENTROPY_CONTEXT t_above[2], t_left[2];
   int subpelmv = 1, have_ref = 0;
-  const int has_second_rf = has_second_ref(mbmi);
-  const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
+  SPEED_FEATURES *const sf = &cpi->sf;
+  const int has_second_rf = has_second_ref(mi);
+  const int inter_mode_mask = sf->inter_mode_mask[bsize];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
 
   vp9_zero(*bsi);
@@ -1784,7 +1827,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
       int ref;
 
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
-        const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+        const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
         frame_mv[ZEROMV][frame].as_int = 0;
         vp9_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
                                       &frame_mv[NEARESTMV][frame],
@@ -1803,7 +1846,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
           continue;
 
         if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
-                                this_mode, mbmi->ref_frame))
+                                this_mode, mi->ref_frame))
           continue;
 
         memcpy(orig_pre, pd->pre, sizeof(orig_pre));
@@ -1814,10 +1857,10 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         // motion search for newmv (single predictor case only)
         if (!has_second_rf && this_mode == NEWMV &&
-            seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
+            seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) {
           MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
           int step_param = 0;
-          int thissme, bestsme = INT_MAX;
+          uint32_t bestsme = UINT32_MAX;
           int sadpb = x->sadperbit4;
           MV mvp_full;
           int max_mv;
@@ -1837,12 +1880,12 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
             }
           }
           if (i == 0)
-            max_mv = x->max_mv_context[mbmi->ref_frame[0]];
+            max_mv = x->max_mv_context[mi->ref_frame[0]];
           else
             max_mv =
                 VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
 
-          if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+          if (sf->mv.auto_mv_step_size && cm->show_frame) {
             // Take wtd average of the step_params based on the last frame's
             // max mv magnitude and the best ref mvs of the current block for
             // the given reference.
@@ -1855,9 +1898,9 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
 
-          if (cpi->sf.adaptive_motion_search) {
-            mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
-            mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
+          if (sf->adaptive_motion_search) {
+            mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3;
+            mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3;
             step_param = VPXMAX(step_param, 8);
           }
 
@@ -1868,77 +1911,56 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
           bestsme = vp9_full_pixel_search(
               cpi, x, bsize, &mvp_full, step_param, sadpb,
-              cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+              sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
               &bsi->ref_mv[0]->as_mv, new_mv,
               INT_MAX, 1);
 
-          // Should we do a full search (best quality only)
-          if (cpi->oxcf.mode == BEST) {
-            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, &cpi->fn_ptr[bsize],
-                                           &bsi->ref_mv[0]->as_mv,
-                                           &best_mv->as_mv);
-            cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              *new_mv = best_mv->as_mv;
-            } else {
-              // The full search result is actually worse so re-instate the
-              // previous best vector
-              best_mv->as_mv = *new_mv;
-            }
-          }
-
-          if (bestsme < INT_MAX) {
-            int distortion;
+          if (bestsme < UINT32_MAX) {
+            uint32_t distortion;
             cpi->find_fractional_mv_step(
                 x,
                 new_mv,
                 &bsi->ref_mv[0]->as_mv,
                 cm->allow_high_precision_mv,
                 x->errorperbit, &cpi->fn_ptr[bsize],
-                cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
+                sf->mv.subpel_force_stop,
+                sf->mv.subpel_iters_per_step,
                 cond_cost_list(cpi, cost_list),
                 x->nmvjointcost, x->mvcost,
                 &distortion,
-                &x->pred_sse[mbmi->ref_frame[0]],
+                &x->pred_sse[mi->ref_frame[0]],
                 NULL, 0, 0);
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
+            seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv;
           }
 
-          if (cpi->sf.adaptive_motion_search)
-            x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
+          if (sf->adaptive_motion_search)
+            x->pred_mv[mi->ref_frame[0]] = *new_mv;
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
         if (has_second_rf) {
-          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
-              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
+          if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV)
             continue;
         }
 
         if (has_second_rf && this_mode == NEWMV &&
-            mbmi->interp_filter == EIGHTTAP) {
+            mi->interp_filter == EIGHTTAP) {
           // adjust src pointers
           mi_buf_shift(x, i);
-          if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+          if (sf->comp_inter_joint_search_thresh <= bsize) {
             int rate_mv;
             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
                                 mi_row, mi_col, seg_mvs[i],
                                 &rate_mv);
-            seg_mvs[i][mbmi->ref_frame[0]].as_int =
-                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
-            seg_mvs[i][mbmi->ref_frame[1]].as_int =
-                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+            seg_mvs[i][mi->ref_frame[0]].as_int =
+                frame_mv[this_mode][mi->ref_frame[0]].as_int;
+            seg_mvs[i][mi->ref_frame[1]].as_int =
+                frame_mv[this_mode][mi->ref_frame[1]].as_int;
           }
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
@@ -2080,7 +2102,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   for (i = 0; i < 4; i++) {
     mode_idx = INTER_OFFSET(bsi->modes[i]);
     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
-    if (has_second_ref(mbmi))
+    if (has_second_ref(mi))
       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
     mi->bmi[i].as_mode = bsi->modes[i];
@@ -2094,7 +2116,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
   *returnyrate = bsi->segment_yrate;
   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
   *psse = bsi->sse;
-  mbmi->mode = bsi->modes[3];
+  mi->mode = bsi->modes[3];
 
   return bsi->segment_rd;
 }
@@ -2205,7 +2227,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Gets an initial list of candidate vectors from neighbours and orders them
   vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
-                   NULL, NULL, mbmi_ext->mode_context);
+                   mbmi_ext->mode_context);
 
   // Candidate refinement carried out at encoder and decoder
   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -2226,13 +2248,13 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const VP9_COMMON *cm = &cpi->common;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
   int bestsme = INT_MAX;
   int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
-  int ref = mbmi->ref_frame[0];
+  int ref = mi->ref_frame[0];
   MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
 
   int tmp_col_min = x->mv_col_min;
@@ -2324,7 +2346,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   x->mv_row_max = tmp_row_max;
 
   if (bestsme < INT_MAX) {
-    int dis;  /* TODO: use dis in distortion calculation later. */
+    uint32_t dis;  /* TODO: use dis in distortion calculation later. */
     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
@@ -2398,14 +2420,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int64_t filter_cache[]) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-  const int is_comp_pred = has_second_ref(mbmi);
-  const int this_mode = mbmi->mode;
+  const int is_comp_pred = has_second_ref(mi);
+  const int this_mode = mi->mode;
   int_mv *frame_mv = mode_mv[this_mode];
   int i;
-  int refs[2] = { mbmi->ref_frame[0],
-    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  int refs[2] = { mi->ref_frame[0],
+    (mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]) };
   int_mv cur_mv[2];
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
@@ -2443,10 +2465,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (pred_filter_search) {
     INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
-    if (xd->up_available)
-      af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-    if (xd->left_available)
-      lf = xd->mi[-1]->mbmi.interp_filter;
+    if (xd->above_mi)
+      af = xd->above_mi->interp_filter;
+    if (xd->left_mi)
+      lf = xd->left_mi->interp_filter;
 
     if ((this_mode != NEWMV) || (af == lf))
       best_filter = af;
@@ -2514,7 +2536,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (mv_check_bounds(x, &cur_mv[i].as_mv))
       return INT64_MAX;
-    mbmi->mv[i].as_int = cur_mv[i].as_int;
+    mi->mv[i].as_int = cur_mv[i].as_int;
   }
 
   // do first prediction into the destination buffer. Do the next
@@ -2544,14 +2566,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
-      mbmi->mode != NEARESTMV)
+      mi->mode != NEARESTMV)
     return INT64_MAX;
 
   pred_exists = 0;
   // Are all MVs integer pel for Y and UV
-  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
+  intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
   if (is_comp_pred)
-    intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
+    intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
 
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
@@ -2572,7 +2594,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int tmp_skip_sb = 0;
         int64_t tmp_skip_sse = INT64_MAX;
 
-        mbmi->interp_filter = i;
+        mi->interp_filter = i;
         rs = vp9_get_switchable_rate(cpi, xd);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
@@ -2597,7 +2619,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           if ((cm->interp_filter == SWITCHABLE &&
                (!i || best_needs_copy)) ||
               (cm->interp_filter != SWITCHABLE &&
-               (cm->interp_filter == mbmi->interp_filter ||
+               (cm->interp_filter == mi->interp_filter ||
                 (i == 0 && intpel_mv)))) {
             restore_dst_buf(xd, orig_dst, orig_dst_stride);
           } else {
@@ -2634,14 +2656,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
         if (newbest) {
           best_rd = rd;
-          best_filter = mbmi->interp_filter;
+          best_filter = mi->interp_filter;
           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
             best_needs_copy = !best_needs_copy;
         }
 
         if ((cm->interp_filter == SWITCHABLE && newbest) ||
             (cm->interp_filter != SWITCHABLE &&
-             cm->interp_filter == mbmi->interp_filter)) {
+             cm->interp_filter == mi->interp_filter)) {
           pred_exists = 1;
           tmp_rd = best_rd;
 
@@ -2655,7 +2677,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
   // Set the appropriate filter
-  mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
+  mi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : best_filter;
   rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
 
@@ -2683,7 +2705,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (!is_comp_pred)
-    single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+    single_filter[this_mode][refs[0]] = mi->interp_filter;
 
   if (cpi->sf.adaptive_mode_search)
     if (is_comp_pred)
@@ -2770,8 +2792,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   TX_SIZE max_uv_tx_size;
   x->skip_encode = 0;
   ctx->skip = 0;
-  xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
-  xd->mi[0]->mbmi.ref_frame[1] = NONE;
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+  xd->mi[0]->ref_frame[1] = NONE;
 
   if (bsize >= BLOCK_8X8) {
     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -2788,7 +2810,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       return;
     }
   }
-  max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
+  max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->tx_size, bsize,
                                        pd[1].subsampling_x,
                                        pd[1].subsampling_y);
   rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
@@ -2848,9 +2870,9 @@ static void rd_variance_adjustment(VP9_COMP *cpi,
       ? (source_variance - recon_variance)
       : (recon_variance - source_variance);
 
-    var_error = (200 * source_variance * recon_variance) /
-      ((source_variance * source_variance) +
-       (recon_variance * recon_variance));
+    var_error = ((int64_t)200 * source_variance * recon_variance) /
+      (((int64_t)source_variance * source_variance) +
+       ((int64_t)recon_variance * recon_variance));
     var_error = 100 - var_error;
   }
 
@@ -2952,12 +2974,12 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
-  unsigned char segment_id = mbmi->segment_id;
+  unsigned char segment_id = mi->segment_id;
   int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -2971,7 +2993,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  MB_MODE_INFO best_mbmode;
+  MODE_INFO best_mbmode;
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -3038,7 +3060,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
       // Skip checking missing references in both single and compound reference
-      // modes. Note that a mode will be skipped iff both reference frames
+      // modes. Note that a mode will be skipped if both reference frames
       // are masked out.
       ref_frame_skip_mask[0] |= (1 << ref_frame);
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3189,7 +3211,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       const int bsl = mi_width_log2_lookup[bsize];
       int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
           + get_chessboard_index(cm->current_video_frame)) & 0x1;
-      MB_MODE_INFO *ref_mbmi;
+      MODE_INFO *ref_mi;
       int const_motion = 1;
       int skip_ref_frame = !cb_partition_search_ctrl;
       MV_REFERENCE_FRAME rf = NONE;
@@ -3197,26 +3219,26 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       ref_mv.as_int = INVALID_MV;
 
       if ((mi_row - 1) >= tile_info->mi_row_start) {
-        ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
-        rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
+        ref_mv = xd->mi[-xd->mi_stride]->mv[0];
+        rf = xd->mi[-xd->mi_stride]->ref_frame[0];
         for (i = 0; i < mi_width; ++i) {
-          ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
-          const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
-                          (ref_frame == ref_mbmi->ref_frame[0]);
-          skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+          ref_mi = xd->mi[-xd->mi_stride + i];
+          const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+                          (ref_frame == ref_mi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
         }
       }
 
       if ((mi_col - 1) >= tile_info->mi_col_start) {
         if (ref_mv.as_int == INVALID_MV)
-          ref_mv = xd->mi[-1]->mbmi.mv[0];
+          ref_mv = xd->mi[-1]->mv[0];
         if (rf == NONE)
-          rf = xd->mi[-1]->mbmi.ref_frame[0];
+          rf = xd->mi[-1]->ref_frame[0];
         for (i = 0; i < mi_height; ++i) {
-          ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
-          const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
-                          (ref_frame == ref_mbmi->ref_frame[0]);
-          skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+          ref_mi = xd->mi[i * xd->mi_stride - 1];
+          const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+                          (ref_frame == ref_mi->ref_frame[0]);
+          skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
         }
       }
 
@@ -3287,15 +3309,15 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
         continue;
     }
 
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
+    mi->mode = this_mode;
+    mi->uv_mode = DC_PRED;
+    mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = second_ref_frame;
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+    mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
                                                           : cm->interp_filter;
-    mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+    mi->mv[0].as_int = mi->mv[1].as_int = 0;
 
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -3316,7 +3338,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       if (rate_y == INT_MAX)
         continue;
 
-      uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
+      uv_tx = get_uv_tx_size_impl(mi->tx_size, bsize, pd->subsampling_x,
                                   pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
@@ -3327,9 +3349,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
-      mbmi->uv_mode = mode_uv[uv_tx];
+      mi->uv_mode = mode_uv[uv_tx];
 
-      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + cpi->mbmode_cost[mi->mode] + rate_uv_intra[uv_tx];
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
@@ -3360,28 +3382,34 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
     }
 
     if (!disable_skip) {
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
       if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
 
         // Cost the skip mb case
-        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+        rate2 += skip_cost1;
       } else if (ref_frame != INTRA_FRAME && !xd->lossless) {
-        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
-            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+        if (RDCOST(x->rdmult, x->rddiv,
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
           // Add in the cost of the no skip flag.
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+          rate2 += skip_cost0;
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
-          distortion2 = total_sse;
           assert(total_sse >= 0);
+
+          rate2 += skip_cost1;
+          distortion2 = total_sse;
           rate2 -= (rate_y + rate_uv);
           this_skip2 = 1;
         }
       } else {
         // Add in the cost of the no skip flag.
-        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        rate2 += skip_cost0;
       }
 
       // Calculate the final RD estimate for this mode.
@@ -3397,7 +3425,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
     // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
         best_intra_rd = this_rd;
-        best_intra_mode = mbmi->mode;
+        best_intra_mode = mi->mode;
       }
     }
 
@@ -3417,7 +3445,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
+          mi->mv[0].as_int = 0;
           max_plane = 1;
         } else {
           best_pred_sse = x->pred_sse[ref_frame];
@@ -3427,13 +3455,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
-        best_mbmode = *mbmi;
+        best_mbmode = *mi;
         best_skip2 = this_skip2;
         best_mode_skippable = skippable;
 
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
-        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+        memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
         // TODO(debargha): enhance this test with a better distortion prediction
@@ -3549,8 +3577,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       TX_SIZE uv_tx_size;
-      *mbmi = best_mbmode;
-      uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
+      *mi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
@@ -3569,7 +3597,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
                               sf->adaptive_rd_thresh, bsize, best_mode_index);
 
   // macroblock modes
-  *mbmi = best_mbmode;
+  *mi = best_mbmode;
   x->skip |= best_skip2;
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
@@ -3599,7 +3627,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   if (!x->skip && !x->select_tx_size) {
     int has_high_freq_coeff = 0;
     int plane;
-    int max_plane = is_inter_block(&xd->mi[0]->mbmi)
+    int max_plane = is_inter_block(xd->mi[0])
                         ? MAX_MB_PLANE : 1;
     for (plane = 0; plane < max_plane; ++plane) {
       x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
@@ -3629,8 +3657,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
                                         int64_t best_rd_so_far) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  unsigned char segment_id = mbmi->segment_id;
+  MODE_INFO *const mi = xd->mi[0];
+  unsigned char segment_id = mi->segment_id;
   const int comp_pred = 0;
   int i;
   int64_t best_pred_diff[REFERENCE_MODES];
@@ -3656,11 +3684,11 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
 
   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
-  mbmi->mode = ZEROMV;
-  mbmi->uv_mode = DC_PRED;
-  mbmi->ref_frame[0] = LAST_FRAME;
-  mbmi->ref_frame[1] = NONE;
-  mbmi->mv[0].as_int = 0;
+  mi->mode = ZEROMV;
+  mi->uv_mode = DC_PRED;
+  mi->ref_frame[0] = LAST_FRAME;
+  mi->ref_frame[1] = NONE;
+  mi->mv[0].as_int = 0;
   x->skip = 1;
 
   if (cm->interp_filter != BILINEAR) {
@@ -3670,21 +3698,21 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
       int rs;
       int best_rs = INT_MAX;
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-        mbmi->interp_filter = i;
+        mi->interp_filter = i;
         rs = vp9_get_switchable_rate(cpi, xd);
         if (rs < best_rs) {
           best_rs = rs;
-          best_filter = mbmi->interp_filter;
+          best_filter = mi->interp_filter;
         }
       }
     }
   }
   // Set the appropriate filter
   if (cm->interp_filter == SWITCHABLE) {
-    mbmi->interp_filter = best_filter;
+    mi->interp_filter = best_filter;
     rate2 += vp9_get_switchable_rate(cpi, xd);
   } else {
-    mbmi->interp_filter = cm->interp_filter;
+    mi->interp_filter = cm->interp_filter;
   }
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -3706,7 +3734,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
   }
 
   assert((cm->interp_filter == SWITCHABLE) ||
-         (cm->interp_filter == mbmi->interp_filter));
+         (cm->interp_filter == mi->interp_filter));
 
   vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
@@ -3732,10 +3760,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   const struct segmentation *const seg = &cm->seg;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
-  unsigned char segment_id = mbmi->segment_id;
+  unsigned char segment_id = mi->segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -3747,7 +3775,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
   int64_t best_pred_rd[REFERENCE_MODES];
   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-  MB_MODE_INFO best_mbmode;
+  MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
@@ -3821,6 +3849,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     ref_frame = vp9_ref_order[ref_index].ref_frame[0];
     second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
 
+#if CONFIG_BETTER_HW_COMPATIBILITY
+    // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
+    if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) {
+      int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf);
+      if (second_ref_frame > INTRA_FRAME)
+        ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf);
+      if (ref_scaled)
+        continue;
+    }
+#endif
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
@@ -3896,14 +3934,14 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         continue;
     }
 
-    mbmi->tx_size = TX_4X4;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame[0] = ref_frame;
-    mbmi->ref_frame[1] = second_ref_frame;
+    mi->tx_size = TX_4X4;
+    mi->uv_mode = DC_PRED;
+    mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = second_ref_frame;
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
-                                                          : cm->interp_filter;
+    mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+                                                        : cm->interp_filter;
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -3934,7 +3972,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       rate_uv = rate_uv_tokenonly;
       distortion2 += dist_uv;
       distortion_uv = dist_uv;
-      mbmi->uv_mode = mode_uv;
+      mi->uv_mode = mode_uv;
     } else {
       int rate;
       int64_t distortion;
@@ -3947,7 +3985,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       int_mv *second_ref = comp_pred ?
                              &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
       b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
+      MODE_INFO tmp_best_mbmode;
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
       int pred_exists = 0;
       int uv_skippable;
@@ -3956,8 +3994,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       int ref;
 
       for (ref = 0; ref < 2; ++ref) {
-        scaled_ref_frame[ref] = mbmi->ref_frame[ref] > INTRA_FRAME ?
-            vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[ref]) : NULL;
+        scaled_ref_frame[ref] = mi->ref_frame[ref] > INTRA_FRAME ?
+            vp9_get_scaled_ref_frame(cpi, mi->ref_frame[ref]) : NULL;
 
         if (scaled_ref_frame[ref]) {
           int i;
@@ -3996,7 +4034,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
             int newbest, rs;
             int64_t rs_rd;
             MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
-            mbmi->interp_filter = switchable_filter_index;
+            mi->interp_filter = switchable_filter_index;
             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                               &mbmi_ext->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
@@ -4020,11 +4058,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
 
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
-              tmp_best_filter = mbmi->interp_filter;
+              tmp_best_filter = mi->interp_filter;
               tmp_best_rd = tmp_rd;
             }
             if ((newbest && cm->interp_filter == SWITCHABLE) ||
-                (mbmi->interp_filter == cm->interp_filter &&
+                (mi->interp_filter == cm->interp_filter &&
                  cm->interp_filter != SWITCHABLE)) {
               tmp_best_rdu = tmp_rd;
               tmp_best_rate = rate;
@@ -4032,7 +4070,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
               tmp_best_distortion = distortion;
               tmp_best_sse = total_sse;
               tmp_best_skippable = skippable;
-              tmp_best_mbmode = *mbmi;
+              tmp_best_mbmode = *mi;
               for (i = 0; i < 4; i++) {
                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
@@ -4044,7 +4082,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
                 if (tmp_best_rdu / 2 > best_rd) {
                   // skip searching the other filters if the first is
                   // already substantially larger than the best so far
-                  tmp_best_filter = mbmi->interp_filter;
+                  tmp_best_filter = mi->interp_filter;
                   tmp_best_rdu = INT64_MAX;
                   break;
                 }
@@ -4057,8 +4095,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       if (tmp_best_rdu == INT64_MAX && pred_exists)
         continue;
 
-      mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
-                             tmp_best_filter : cm->interp_filter);
+      mi->interp_filter = (cm->interp_filter == SWITCHABLE ?
+                           tmp_best_filter : cm->interp_filter);
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
         // switchable list (bilinear, 6-tap) is indicated at the frame level
@@ -4076,7 +4114,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         rate_y = tmp_best_ratey;
         distortion = tmp_best_distortion;
         skippable = tmp_best_skippable;
-        *mbmi = tmp_best_mbmode;
+        *mi = tmp_best_mbmode;
         for (i = 0; i < 4; i++)
           xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
       }
@@ -4143,17 +4181,21 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     }
 
     if (!disable_skip) {
+      const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+      const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+      const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
       // Skip is never coded at the segment level for sub8x8 blocks and instead
       // always coded in the bitstream at the mode info level.
-
       if (ref_frame != INTRA_FRAME && !xd->lossless) {
-        if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
-            RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+        if (RDCOST(x->rdmult, x->rddiv,
+                   rate_y + rate_uv + skip_cost0, distortion2) <
+            RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
           // Add in the cost of the no skip flag.
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+          rate2 += skip_cost0;
         } else {
           // FIXME(rbultje) make this work for splitmv also
-          rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+          rate2 += skip_cost1;
           distortion2 = total_sse;
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
@@ -4163,7 +4205,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         }
       } else {
         // Add in the cost of the no skip flag.
-        rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+        rate2 += skip_cost0;
       }
 
       // Calculate the final RD estimate for this mode.
@@ -4186,7 +4228,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
+          mi->mv[0].as_int = 0;
           max_plane = 1;
         }
 
@@ -4196,7 +4238,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         best_rd = this_rd;
         best_yrd = best_rd -
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
-        best_mbmode = *mbmi;
+        best_mbmode = *mi;
         best_skip2 = this_skip2;
         if (!x->select_tx_size)
           swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
@@ -4294,7 +4336,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
   if (sf->use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
-      *mbmi = best_mbmode;
+      *mi = best_mbmode;
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
                               &rate_uv_tokenonly,
                               &dist_uv,
@@ -4318,7 +4360,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
                             sf->adaptive_rd_thresh, bsize, best_ref_index);
 
   // macroblock modes
-  *mbmi = best_mbmode;
+  *mi = best_mbmode;
   x->skip |= best_skip2;
   if (!is_inter_block(&best_mbmode)) {
     for (i = 0; i < 4; i++)
@@ -4327,8 +4369,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
     for (i = 0; i < 4; ++i)
       memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
 
-    mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
-    mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+    mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+    mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 00ee55c..253e4a0 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -29,15 +29,6 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
                                struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
 
-unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
-                                           const struct buf_2d *ref,
-                                           BLOCK_SIZE bs);
-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
-                                                const struct buf_2d *ref,
-                                                BLOCK_SIZE bs, int bd);
-#endif
-
 void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
                                struct TileDataEnc *tile_data,
                                struct macroblock *x,
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 59c7478..307a112 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "./vpx_config.h"
 #if CONFIG_VP9_HIGHBITDEPTH
 #include "vpx_dsp/vpx_dsp_common.h"
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -445,7 +446,7 @@ static void resize_multistep(const uint8_t *const input,
                              int length,
                              uint8_t *output,
                              int olength,
-                             uint8_t *buf) {
+                             uint8_t *otmp) {
   int steps;
   if (length == olength) {
     memcpy(output, input, sizeof(output[0]) * length);
@@ -456,15 +457,10 @@ static void resize_multistep(const uint8_t *const input,
   if (steps > 0) {
     int s;
     uint8_t *out = NULL;
-    uint8_t *tmpbuf = NULL;
-    uint8_t *otmp, *otmp2;
+    uint8_t *otmp2;
     int filteredlength = length;
-    if (!tmpbuf) {
-      tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length);
-      otmp = tmpbuf;
-    } else {
-      otmp = buf;
-    }
+
+    assert(otmp != NULL);
     otmp2 = otmp + get_down2_length(length, 1);
     for (s = 0; s < steps; ++s) {
       const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -482,8 +478,6 @@ static void resize_multistep(const uint8_t *const input,
     if (filteredlength != olength) {
       interpolate(out, filteredlength, output, olength);
     }
-    if (tmpbuf)
-      free(tmpbuf);
   } else {
     interpolate(input, length, output, olength);
   }
@@ -519,22 +513,29 @@ void vp9_resize_plane(const uint8_t *const input,
   uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
   uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
                                       (width < height ? height : width));
-  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height);
+  uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2);
+  if (intbuf == NULL || tmpbuf == NULL ||
+      arrbuf == NULL || arrbuf2 == NULL)
+    goto Error;
   assert(width > 0);
   assert(height > 0);
   assert(width2 > 0);
   assert(height2 > 0);
   for (i = 0; i < height; ++i)
     resize_multistep(input + in_stride * i, width,
-                        intbuf + width2 * i, width2, tmpbuf);
+                     intbuf + width2 * i, width2, tmpbuf);
   for (i = 0; i < width2; ++i) {
     fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
-    fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+    resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+    fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
   }
+
+ Error:
   free(intbuf);
   free(tmpbuf);
   free(arrbuf);
+  free(arrbuf2);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -737,7 +738,7 @@ static void highbd_resize_multistep(const uint16_t *const input,
                                     int length,
                                     uint16_t *output,
                                     int olength,
-                                    uint16_t *buf,
+                                    uint16_t *otmp,
                                     int bd) {
   int steps;
   if (length == olength) {
@@ -749,15 +750,10 @@ static void highbd_resize_multistep(const uint16_t *const input,
   if (steps > 0) {
     int s;
     uint16_t *out = NULL;
-    uint16_t *tmpbuf = NULL;
-    uint16_t *otmp, *otmp2;
+    uint16_t *otmp2;
     int filteredlength = length;
-    if (!tmpbuf) {
-      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
-      otmp = tmpbuf;
-    } else {
-      otmp = buf;
-    }
+
+    assert(otmp != NULL);
     otmp2 = otmp + get_down2_length(length, 1);
     for (s = 0; s < steps; ++s) {
       const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -775,8 +771,6 @@ static void highbd_resize_multistep(const uint16_t *const input,
     if (filteredlength != olength) {
       highbd_interpolate(out, filteredlength, output, olength, bd);
     }
-    if (tmpbuf)
-      free(tmpbuf);
   } else {
     highbd_interpolate(input, length, output, olength, bd);
   }
@@ -815,21 +809,28 @@ void vp9_highbd_resize_plane(const uint8_t *const input,
   uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
   uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
                                         (width < height ? height : width));
-  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height);
+  uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
+  if (intbuf == NULL || tmpbuf == NULL ||
+      arrbuf == NULL || arrbuf2 == NULL)
+    goto Error;
   for (i = 0; i < height; ++i) {
     highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
                             intbuf + width2 * i, width2, tmpbuf, bd);
   }
   for (i = 0; i < width2; ++i) {
     highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+    highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf,
                             bd);
     highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
-                           arrbuf + height);
+                           arrbuf2);
   }
+
+ Error:
   free(intbuf);
   free(tmpbuf);
   free(arrbuf);
+  free(arrbuf2);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index c5c50a2..5a0a23d 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -118,7 +118,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
     return;
 
   xd->mi = mi;
-  segment_id = xd->mi[0]->mbmi.segment_id;
+  segment_id = xd->mi[0]->segment_id;
 
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
 
@@ -127,7 +127,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
 
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
-    const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
     // Test to see if the segment id matches the predicted value.
     const int pred_segment_id = get_segment_id(cm, cm->last_frame_seg_map,
                                                bsize, mi_row, mi_col);
@@ -136,7 +136,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
 
     // Store the prediction status for this mb and update counts
     // as appropriate
-    xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
+    xd->mi[0]->seg_id_predicted = pred_flag;
     temporal_predictor_count[pred_context][pred_flag]++;
 
     // Update the "unpredicted" segment count
@@ -159,8 +159,8 @@ static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
-  bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
+  bw = num_8x8_blocks_wide_lookup[mi[0]->sb_type];
+  bh = num_8x8_blocks_high_lookup[mi[0]->sb_type];
 
   if (bw == bs && bh == bs) {
     count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index c2763b7..23a5fc7 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -15,22 +15,29 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_skin_detection.h"
 
+#define MODEL_MODE 1
+
 // Fixed-point skin color model parameters.
-static const int skin_mean[2] = {7463, 9614};                 // q6
+static const int skin_mean[5][2] = {
+    {7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
 static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157};  // q16
-static const int skin_threshold = 1570636;                    // q18
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+    800000};  // q18
 
 // Thresholds on luminance.
-static const int y_low = 20;
+static const int y_low = 40;
 static const int y_high = 220;
 
 // Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr) {
+static int evaluate_skin_color_difference(int cb, int cr, int idx) {
   const int cb_q6 = cb << 6;
   const int cr_q6 = cr << 6;
-  const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
-  const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
-  const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+  const int cb_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+  const int cbcr_diff_q12 =
+      (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+  const int cr_diff_q12 =
+      (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
   const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
   const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
   const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
@@ -41,13 +48,65 @@ static int evaluate_skin_color_difference(int cb, int cr) {
   return skin_diff;
 }
 
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
-  if (y < y_low || y > y_high)
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+                   int motion) {
+  if (y < y_low || y > y_high) {
     return 0;
-  else
-    return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+  } else {
+    if (MODEL_MODE == 0) {
+      return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+    } else {
+      int i = 0;
+      // Exit on grey.
+      if (cb == 128 && cr == 128)
+        return 0;
+      // Exit on very strong cb.
+      if (cb > 150 && cr < 110)
+        return 0;
+      for (; i < 5; i++) {
+        int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+        if (skin_color_diff < skin_threshold[i + 1]) {
+           if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+             return 0;
+           else if (motion == 0 &&
+                    skin_color_diff > (skin_threshold[i + 1] >> 1))
+             return 0;
+           else
+            return 1;
+        }
+        // Exit if difference is much large than the threshold.
+        if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+          return 0;
+        }
+      }
+      return 0;
+    }
+  }
 }
 
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize,
+                           int consec_zeromv, int curr_motion_magn) {
+  // No skin if block has been zero/small motion for long consecutive time.
+  if (consec_zeromv > 60 && curr_motion_magn == 0) {
+    return 0;
+  } else {
+    int motion = 1;
+    // Take center pixel in block to determine is_skin.
+    const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+    const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+    const int uv_width_shift = y_width_shift >> 1;
+    const int uv_height_shift = y_height_shift >> 1;
+    const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
+    const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
+    const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+    if (consec_zeromv > 25 && curr_motion_magn == 0)
+      motion = 0;
+    return vp9_skin_pixel(ysource, usource, vsource, motion);
+  }
+}
+
+
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
 void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
@@ -67,7 +126,7 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   int shuv = shy - 1;
   int fac = y_bsize / 8;
   // Use center pixel or average of center 2x2 pixels.
-  int mode_filter = 1;
+  int mode_filter = 0;
   YV12_BUFFER_CONFIG skinmap;
   memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
   if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
@@ -84,27 +143,46 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
   for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
     num_bl = 0;
     for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
-      // Select pixel for each block for skin detection.
-      // Use center pixel, or 2x2 average at center.
-      uint8_t ysource = src_y[ypos * src_ystride + ypos];
-      uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
-      uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
-      uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
-      uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
-      uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
-      uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
-      uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos  + 1)];
-      uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos +  1)];
-      uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
-      uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
-      uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
       int is_skin = 0;
       if (mode_filter == 1) {
+        // Use 2x2 average at center.
+        uint8_t ysource = src_y[ypos * src_ystride + ypos];
+        uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
+        uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
+        uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
+        uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
+        uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
+        uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
+        uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos  + 1)];
+        uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos +  1)];
+        uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
+        uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos  + 1)];
+        uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos +  1)];
         ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
         usource = (usource + usource2 + usource3 + usource4) >> 2;
         vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
+        is_skin = vp9_skin_pixel(ysource, usource, vsource, 1);
+      } else {
+        int block_size = BLOCK_8X8;
+        int consec_zeromv = 0;
+        int bl_index = mi_row * cm->mi_cols + mi_col;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        if (y_bsize == 8)
+          consec_zeromv = cpi->consec_zero_mv[bl_index];
+        else
+          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                 VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                 cpi->consec_zero_mv[bl_index3])));
+        if (y_bsize == 16)
+          block_size = BLOCK_16X16;
+        is_skin  = vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+                                          src_uvstride, block_size,
+                                          consec_zeromv,
+                                          0);
       }
-      is_skin = vp9_skin_pixel(ysource, usource, vsource);
       for (i = 0; i < y_bsize; i++) {
         for (j = 0; j < y_bsize; j++) {
           if (is_skin)
diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 0a87ef9..c77382d 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h
@@ -21,7 +21,12 @@ struct VP9_COMP;
 
 // #define OUTPUT_YUV_SKINMAP
 
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+                   int motion);
+
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+                           int stride, int strideuv, int bsize,
+                           int consec_zeromv, int curr_motion_magn);
 
 #ifdef OUTPUT_YUV_SKINMAP
 // For viewing skin map on input source.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index a539629..e7f04a2 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -15,6 +15,22 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
+// Mesh search patters for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+    {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5  // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+                                              [MAX_MESH_STEP] =
+    {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+     {{64, 8},  {14, 2}, {7, 1},  {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+     {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+    };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+    {50, 25, 15, 5, 1, 1};
 
 // Intra only frames, golden frames (except alt ref overlays) and
 // alt ref frames tend to be coded at a higher than ambient quality
@@ -259,6 +275,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   sf->static_segmentation = 0;
   sf->adaptive_rd_thresh = 1;
   sf->use_fast_coef_costing = 1;
+  sf->allow_exhaustive_searches = 0;
+  sf->exhaustive_searches_thresh = INT_MAX;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -285,12 +303,26 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
                                  FLAG_SKIP_INTRA_LOWVAR;
     sf->adaptive_pred_interp_filter = 2;
 
-    // Disable reference masking if using spatial scaling since
-    // pred_mv_sad will not be set (since vp9_mv_pred will not
-    // be called).
-    // TODO(marpan/agrange): Fix this condition.
-    sf->reference_masking = (cpi->oxcf.resize_mode != RESIZE_DYNAMIC &&
-                             cpi->svc.number_spatial_layers == 1) ? 1 : 0;
+    // Reference masking only enabled for 1 spatial layer, and if none of the
+    // references have been scaled. The latter condition needs to be checked
+    // for external or internal dynamic resize.
+    sf->reference_masking = (cpi->svc.number_spatial_layers == 1);
+    if (sf->reference_masking == 1 &&
+        (cpi->external_resize == 1 ||
+         cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
+      MV_REFERENCE_FRAME ref_frame;
+      static const int flag_list[4] =
+          {0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+          const struct scale_factors *const scale_fac =
+              &cm->frame_refs[ref_frame - 1].sf;
+          if (vp9_is_scaled(scale_fac))
+            sf->reference_masking = 0;
+        }
+      }
+    }
 
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -368,6 +400,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
     sf->simple_model_rd_from_var = 1;
+    if (cpi->oxcf.rc_mode == VPX_VBR)
+      sf->mv.search_method = NSTEP;
 
     if (!is_keyframe) {
       int i;
@@ -376,13 +410,16 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
           sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
       } else {
         for (i = 0; i < BLOCK_SIZES; ++i)
-          if (i >= BLOCK_16X16)
+          if (i > BLOCK_16X16)
             sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
           else
             // Use H and V intra mode for block sizes <= 16X16.
             sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
       }
     }
+    if (content == VP9E_CONTENT_SCREEN) {
+      sf->short_circuit_flat_blocks = 1;
+    }
   }
 
   if (speed >= 6) {
@@ -392,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // Enable short circuit for low temporal variance.
+      sf->short_circuit_low_temp_var = 1;
+    }
   }
 
   if (speed >= 7) {
@@ -406,8 +448,19 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
   }
   if (speed >= 8) {
     sf->adaptive_rd_thresh = 4;
-    sf->mv.subpel_force_stop = 2;
+    sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+    // Only keep INTRA_DC mode for speed 8.
+    if (!is_keyframe) {
+      int i = 0;
+      for (i = 0; i < BLOCK_SIZES; ++i)
+        sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+    }
+    if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+        content != VP9E_CONTENT_SCREEN) {
+      // More aggressive short circuit for speed 8.
+      sf->short_circuit_low_temp_var = 2;
+    }
   }
 }
 
@@ -460,7 +513,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->mv.auto_mv_step_size = 0;
   sf->mv.fullpel_search_step_param = 6;
   sf->comp_inter_joint_search_thresh = BLOCK_4X4;
-  sf->adaptive_rd_thresh = 0;
   sf->tx_size_search_method = USE_FULL_RD;
   sf->use_lp32x32fdct = 0;
   sf->adaptive_motion_search = 0;
@@ -516,10 +568,15 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   // Recode loop tolerance %.
   sf->recode_tolerance = 25;
   sf->default_interp_filter = SWITCHABLE;
-  sf->tx_size_search_breakout = 0;
-  sf->partition_search_breakout_dist_thr = 0;
-  sf->partition_search_breakout_rate_thr = 0;
   sf->simple_model_rd_from_var = 0;
+  sf->short_circuit_flat_blocks = 0;
+  sf->short_circuit_low_temp_var = 0;
+
+  // Some speed-up features even for best quality as minimal impact on quality.
+  sf->adaptive_rd_thresh = 1;
+  sf->tx_size_search_breakout = 1;
+  sf->partition_search_breakout_dist_thr = (1 << 19);
+  sf->partition_search_breakout_rate_thr = 80;
 
   if (oxcf->mode == REALTIME)
     set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
@@ -527,8 +584,36 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
   cpi->full_search_sad = vp9_full_search_sad;
-  cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
-                                               : vp9_diamond_search_sad;
+  cpi->diamond_search_sad = vp9_diamond_search_sad;
+
+  sf->allow_exhaustive_searches = 1;
+  if (oxcf->mode == BEST) {
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 20);
+    else
+      sf->exhaustive_searches_thresh = (1 << 21);
+    sf->max_exaustive_pct = 100;
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+      sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+    }
+  } else {
+    int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+    if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+      sf->exhaustive_searches_thresh = (1 << 22);
+    else
+      sf->exhaustive_searches_thresh = (1 << 23);
+    sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+    if (speed > 0)
+      sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+    for (i = 0; i < MAX_MESH_STEP; ++i) {
+      sf->mesh_patterns[i].range =
+          good_quality_mesh_patterns[speed][i].range;
+      sf->mesh_patterns[i].interval =
+          good_quality_mesh_patterns[speed][i].interval;
+    }
+  }
 
   // Slow quant, dct and trellis not worthwhile for first pass
   // so make sure they are always turned off.
@@ -541,7 +626,10 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
     sf->optimize_coefficients = 0;
   }
 
-  if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+  if (sf->mv.subpel_force_stop == 3) {
+    // Whole pel only
+    cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree;
+  } else if (sf->mv.subpel_search_method == SUBPEL_TREE) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
   } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
     cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 575e98c..e88a7df 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -188,13 +188,24 @@ typedef struct MV_SPEED_FEATURES {
   // Maximum number of steps in logarithmic subpel search before giving up.
   int subpel_iters_per_step;
 
-  // Control when to stop subpel search
+  // Control when to stop subpel search:
+  // 0: Full subpel search.
+  // 1: Stop at quarter pixel.
+  // 2: Stop at half pixel.
+  // 3: Stop at full pixel.
   int subpel_force_stop;
 
   // This variable sets the step_param used in full pel motion search.
   int fullpel_search_step_param;
 } MV_SPEED_FEATURES;
 
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;
+  int interval;
+} MESH_PATTERN;
+
 typedef struct SPEED_FEATURES {
   MV_SPEED_FEATURES mv;
 
@@ -299,6 +310,18 @@ typedef struct SPEED_FEATURES {
   // point for this motion search and limits the search range around it.
   int adaptive_motion_search;
 
+  // Flag for allowing some use of exhaustive searches;
+  int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaistive motion search.
+  int exhaustive_searches_thresh;
+
+  // Maximum number of exhaustive searches for a frame.
+  int max_exaustive_pct;
+
+  // Pattern to be used for any exhaustive mesh searches.
+  MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
   int schedule_mode_search;
 
   // Allows sub 8x8 modes to use the prediction filter that was determined
@@ -419,6 +442,18 @@ typedef struct SPEED_FEATURES {
 
   // Fast approximation of vp9_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
+
+  // Skip a number of expensive mode evaluations for blocks with zero source
+  // variance.
+  int short_circuit_flat_blocks;
+
+  // Skip a number of expensive mode evaluations for blocks with very low
+  // temporal variance.
+  // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32.
+  // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
+  // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
+  // 32x16.
+  int short_circuit_low_temp_var;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 799f179..29db015 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -14,9 +14,7 @@
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_subexp.h"
 
-#define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
-
-static const int update_bits[255] = {
+static const uint8_t update_bits[255] = {
    5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
    6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
@@ -34,6 +32,7 @@ static const int update_bits[255] = {
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  0,
 };
+#define MIN_DELP_BITS 5
 
 static int recenter_nonneg(int v, int m) {
   if (v > (m << 1))
@@ -46,7 +45,7 @@ static int recenter_nonneg(int v, int m) {
 
 static int remap_prob(int v, int m) {
   int i;
-  static const int map_table[MAX_PROB - 1] = {
+  static const uint8_t map_table[MAX_PROB - 1] = {
     // generated by:
     //   map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
      20,  21,  22,  23,  24,  25,   0,  26,  27,  28,  29,  30,  31,  32,  33,
@@ -80,7 +79,7 @@ static int remap_prob(int v, int m) {
 
 static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
   int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
+  return update_bits[delp] << VP9_PROB_COST_SHIFT;
 }
 
 static void encode_uniform(vpx_writer *w, int v) {
@@ -123,14 +122,17 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
   int bestsavings = 0;
   vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
+  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
 
-  for (newp = *bestp; newp != oldp; newp += step) {
-    const int new_b = cost_branch256(ct, newp);
-    const int update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256;
-    const int savings = old_b - new_b - update_b;
-    if (savings > bestsavings) {
-      bestsavings = savings;
-      bestnewp = newp;
+  if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+    for (newp = *bestp; newp != oldp; newp += step) {
+      const int new_b = cost_branch256(ct, newp);
+      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int savings = old_b - new_b - update_b;
+      if (savings > bestsavings) {
+        bestsavings = savings;
+        bestnewp = newp;
+      }
     }
   }
   *bestp = bestnewp;
@@ -138,52 +140,35 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 }
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob *oldp,
+                                              const vpx_prob oldp,
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings, step;
+  int i, old_b, new_b, update_b, savings, bestsavings;
   int newp;
-  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
-  vp9_model_to_full_probs(oldp, oldplist);
-  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
-  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
-  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
+  const int step_sign = *bestp > oldp ? -1 : 1;
+  const int step = stepsize * step_sign;
+  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+  const vpx_prob *newplist, *oldplist;
+  vpx_prob bestnewp;
+  oldplist = vp9_pareto8_full[oldp - 1];
+  old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+  for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
 
   bestsavings = 0;
-  bestnewp = oldp[PIVOT_NODE];
-
-  if (*bestp > oldp[PIVOT_NODE]) {
-    step = -stepsize;
-    for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
-      if (newp < 1 || newp > 255)
-        continue;
-      newplist[PIVOT_NODE] = newp;
-      vp9_model_to_full_probs(newplist, newplist);
-      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
-        new_b += cost_branch256(ct + 2 * i, newplist[i]);
-      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
-      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
-          vp9_cost_upd256;
-      savings = old_b - new_b - update_b;
-      if (savings > bestsavings) {
-        bestsavings = savings;
-        bestnewp = newp;
-      }
-    }
-  } else {
-    step = stepsize;
-    for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
-      if (newp < 1 || newp > 255)
-        continue;
-      newplist[PIVOT_NODE] = newp;
-      vp9_model_to_full_probs(newplist, newplist);
-      for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
-        new_b += cost_branch256(ct + 2 * i, newplist[i]);
-      new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
-      update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
-          vp9_cost_upd256;
+  bestnewp = oldp;
+
+  assert(stepsize > 0);
+
+  if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+    for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+      if (newp < 1 || newp > 255) continue;
+      newplist = vp9_pareto8_full[newp - 1];
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+        new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
       savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index b968232..efe62c0 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -32,7 +32,7 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 
 
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob *oldp,
+                                              const vpx_prob oldp,
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize);
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 8a6818c..1814a32 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -16,7 +16,6 @@
 #include "vp9/encoder/vp9_extend.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-#define SMALL_FRAME_FB_IDX 7
 #define SMALL_FRAME_WIDTH  32
 #define SMALL_FRAME_HEIGHT 16
 
@@ -25,12 +24,44 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int mi_rows = cpi->common.mi_rows;
   int mi_cols = cpi->common.mi_cols;
-  int sl, tl;
+  int sl, tl, i;
   int alt_ref_idx = svc->number_spatial_layers;
 
   svc->spatial_layer_id = 0;
   svc->temporal_layer_id = 0;
   svc->first_spatial_layer_to_encode = 0;
+  svc->rc_drop_superframe = 0;
+  svc->force_zero_mode_spatial_ref = 0;
+  svc->use_base_mv = 0;
+  svc->current_superframe = 0;
+  for (i = 0; i < REF_FRAMES; ++i)
+    svc->ref_frame_index[i] = -1;
+  for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+    cpi->svc.ext_frame_flags[sl] = 0;
+    cpi->svc.ext_lst_fb_idx[sl] = 0;
+    cpi->svc.ext_gld_fb_idx[sl] = 1;
+    cpi->svc.ext_alt_fb_idx[sl] = 2;
+  }
+
+  // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
+  // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
+  // target of 1/4x1/4.
+  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+    if (vpx_realloc_frame_buffer(&cpi->svc.scaled_temp,
+                                 cpi->common.width >> 1,
+                                 cpi->common.height >> 1,
+                                 cpi->common.subsampling_x,
+                                 cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 cpi->common.use_highbitdepth,
+#endif
+                                 VP9_ENC_BORDER_IN_PIXELS,
+                                 cpi->common.byte_alignment,
+                                 NULL, NULL, NULL))
+      vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                         "Failed to allocate scaled_frame for svc ");
+  }
+
 
   if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
     if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
@@ -107,15 +138,20 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
           tl == 0) {
         size_t last_coded_q_map_size;
         size_t consec_zero_mv_size;
+        VP9_COMMON *const cm = &cpi->common;
         lc->sb_index = 0;
-        lc->map = vpx_malloc(mi_rows * mi_cols * sizeof(signed char));
+        CHECK_MEM_ERROR(cm, lc->map,
+                        vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
-        last_coded_q_map_size = mi_rows * mi_cols * sizeof(uint8_t);
-        lc->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+        last_coded_q_map_size = mi_rows * mi_cols *
+                                sizeof(*lc->last_coded_q_map);
+        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+                        vpx_malloc(last_coded_q_map_size));
         assert(MAXQ <= 255);
         memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
-        consec_zero_mv_size = mi_rows * mi_cols * sizeof(uint8_t);
-        lc->consec_zero_mv = vpx_malloc(consec_zero_mv_size);
+        consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
+        CHECK_MEM_ERROR(cm, lc->consec_zero_mv,
+                        vpx_malloc(consec_zero_mv_size));
         memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
        }
     }
@@ -277,7 +313,8 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
   cpi->alt_ref_source = lc->alt_ref_source;
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
-  if (cpi->svc.number_temporal_layers > 1) {
+  if (cpi->svc.number_temporal_layers > 1 ||
+      (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
@@ -290,12 +327,12 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = cr->map;
     uint8_t *temp2 = cr->last_coded_q_map;
-    uint8_t *temp3 = cr->consec_zero_mv;
+    uint8_t *temp3 = cpi->consec_zero_mv;
     cr->map = lc->map;
     lc->map = temp;
     cr->last_coded_q_map = lc->last_coded_q_map;
     lc->last_coded_q_map = temp2;
-    cr->consec_zero_mv = lc->consec_zero_mv;
+    cpi->consec_zero_mv = lc->consec_zero_mv;
     lc->consec_zero_mv = temp3;
     cr->sb_index = lc->sb_index;
   }
@@ -323,8 +360,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) {
     cr->map = temp;
     lc->last_coded_q_map = cr->last_coded_q_map;
     cr->last_coded_q_map = temp2;
-    lc->consec_zero_mv = cr->consec_zero_mv;
-    cr->consec_zero_mv = temp3;
+    lc->consec_zero_mv = cpi->consec_zero_mv;
+    cpi->consec_zero_mv = temp3;
     lc->sb_index = cr->sb_index;
   }
 }
@@ -351,6 +388,8 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
                               cpi->svc.number_temporal_layers];
   ++lc->current_video_frame_in_layer;
   ++lc->frames_from_key_frame;
+  if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+    ++cpi->svc.current_superframe;
 }
 
 int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
@@ -402,7 +441,9 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
       cpi->ref_frame_flags = VP9_LAST_FLAG;
     } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
       // base layer is a key frame.
-      cpi->ref_frame_flags = VP9_GOLD_FLAG;
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 1;
     } else {
       cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
     }
@@ -417,7 +458,13 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
   } else {
     if (frame_num_within_temporal_struct == 1) {
       // the first tl2 picture
-      if (!spatial_id) {
+      if (spatial_id == cpi->svc.number_spatial_layers - 1) {  // top layer
+        cpi->ext_refresh_frame_flags_pending = 1;
+        if (!spatial_id)
+          cpi->ref_frame_flags = VP9_LAST_FLAG;
+        else
+          cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      } else if (!spatial_id) {
         cpi->ext_refresh_frame_flags_pending = 1;
         cpi->ext_refresh_alt_ref_frame = 1;
         cpi->ref_frame_flags = VP9_LAST_FLAG;
@@ -425,32 +472,38 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
         cpi->ext_refresh_frame_flags_pending = 1;
         cpi->ext_refresh_alt_ref_frame = 1;
         cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
-      } else {  // Top layer
-        cpi->ext_refresh_frame_flags_pending = 0;
-        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
       }
     } else {
       //  The second tl2 picture
-      if (!spatial_id) {
+      if (spatial_id == cpi->svc.number_spatial_layers - 1) {  // top layer
         cpi->ext_refresh_frame_flags_pending = 1;
+        if (!spatial_id)
         cpi->ref_frame_flags = VP9_LAST_FLAG;
-        cpi->ext_refresh_last_frame = 1;
-      } else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
+        else
+          cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      } else if (!spatial_id) {
         cpi->ext_refresh_frame_flags_pending = 1;
-        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
-        cpi->ext_refresh_last_frame = 1;
+        cpi->ref_frame_flags = VP9_LAST_FLAG;
+        cpi->ext_refresh_alt_ref_frame = 1;
       } else {  // top layer
-        cpi->ext_refresh_frame_flags_pending = 0;
+        cpi->ext_refresh_frame_flags_pending = 1;
         cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+        cpi->ext_refresh_alt_ref_frame = 1;
       }
     }
   }
   if (temporal_id == 0) {
     cpi->lst_fb_idx = spatial_id;
-    if (spatial_id)
+    if (spatial_id) {
+      if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+        cpi->lst_fb_idx = spatial_id - 1;
+        cpi->gld_fb_idx = spatial_id;
+      } else {
       cpi->gld_fb_idx = spatial_id - 1;
-    else
+      }
+    } else {
       cpi->gld_fb_idx = 0;
+    }
     cpi->alt_fb_idx = 0;
   } else if (temporal_id == 1) {
     cpi->lst_fb_idx = spatial_id;
@@ -463,7 +516,7 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
   } else {
     cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
     cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
-    cpi->alt_fb_idx = 0;
+    cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
   }
 }
 
@@ -485,7 +538,9 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
       cpi->ref_frame_flags = VP9_LAST_FLAG;
     } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
       // base layer is a key frame.
-      cpi->ref_frame_flags = VP9_GOLD_FLAG;
+      cpi->ref_frame_flags = VP9_LAST_FLAG;
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 1;
     } else {
       cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
     }
@@ -501,10 +556,16 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
 
   if (temporal_id == 0) {
     cpi->lst_fb_idx = spatial_id;
-    if (spatial_id)
+    if (spatial_id) {
+      if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+        cpi->lst_fb_idx = spatial_id - 1;
+        cpi->gld_fb_idx = spatial_id;
+      } else {
       cpi->gld_fb_idx = spatial_id - 1;
-    else
+      }
+    } else {
       cpi->gld_fb_idx = 0;
+    }
     cpi->alt_fb_idx = 0;
   } else if (temporal_id == 1) {
     cpi->lst_fb_idx = spatial_id;
@@ -526,20 +587,31 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
   if (!spatial_id) {
     cpi->ref_frame_flags = VP9_LAST_FLAG;
   } else if (cpi->svc.layer_context[0].is_key_frame) {
-    cpi->ref_frame_flags = VP9_GOLD_FLAG;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+    cpi->ext_refresh_last_frame = 0;
+    cpi->ext_refresh_golden_frame = 1;
   } else {
     cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
   }
   cpi->lst_fb_idx = spatial_id;
-  if (spatial_id)
+  if (spatial_id) {
+    if (cpi->svc.layer_context[0].is_key_frame) {
+      cpi->lst_fb_idx = spatial_id - 1;
+      cpi->gld_fb_idx = spatial_id;
+    } else {
     cpi->gld_fb_idx = spatial_id - 1;
-  else
+    }
+  } else {
     cpi->gld_fb_idx = 0;
+  }
 }
 
 int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   LAYER_CONTEXT *lc = NULL;
+  if (cpi->svc.number_spatial_layers > 1)
+    cpi->svc.use_base_mv = 1;
+  cpi->svc.force_zero_mode_spatial_ref = 1;
 
   if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
     set_flags_and_fb_idx_for_temporal_mode3(cpi);
@@ -557,6 +629,8 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
     // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is
     // needed to support the case where the frame flags may be passed in via
     // vpx_codec_encode(), which can be used for the temporal-only svc case.
+    // TODO(marpan): Consider adding an enc_config parameter to better handle
+    // this case.
     if (cpi->ext_refresh_frame_flags_pending == 0) {
       int sl;
       cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
@@ -568,6 +642,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
     }
   }
 
+  if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode)
+    cpi->svc.rc_drop_superframe = 0;
+
   lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
                                cpi->svc.number_temporal_layers +
                                cpi->svc.temporal_layer_id];
@@ -591,6 +668,8 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
 }
 
 #if CONFIG_SPATIAL_SVC
+#define SMALL_FRAME_FB_IDX 7
+
 int vp9_svc_start_frame(VP9_COMP *const cpi) {
   int width = 0, height = 0;
   LAYER_CONTEXT *lc;
@@ -701,7 +780,8 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
   return 0;
 }
 
-#endif
+#undef SMALL_FRAME_FB_IDX
+#endif  // CONFIG_SPATIAL_SVC
 
 struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
                                               struct lookahead_ctx *ctx,
@@ -736,3 +816,27 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
     }
   }
 }
+
+// Reset on key frame: reset counters, references and buffer updates.
+void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
+  int sl, tl;
+  SVC *const svc = &cpi->svc;
+  LAYER_CONTEXT *lc = NULL;
+  for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+      lc->current_video_frame_in_layer = 0;
+      lc->frames_from_key_frame = 0;
+    }
+  }
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+    set_flags_and_fb_idx_for_temporal_mode3(cpi);
+  } else if (svc->temporal_layering_mode ==
+             VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+     set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+  } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+     set_flags_and_fb_idx_for_temporal_mode2(cpi);
+  }
+  vp9_update_temporal_layer_framerate(cpi);
+  vp9_restore_layer_context(cpi);
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 694b5ab..9f386fb 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -56,6 +56,7 @@ typedef struct {
 
   int spatial_layer_to_encode;
   int first_spatial_layer_to_encode;
+  int rc_drop_superframe;
 
   // Workaround for multiple frame contexts
   enum {
@@ -69,6 +70,8 @@ typedef struct {
   // Store scaled source frames to be used for temporal filter to generate
   // a alt ref frame.
   YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+  // Temp buffer used for 2-stage down-sampling, for real-time mode.
+  YV12_BUFFER_CONFIG scaled_temp;
 
   // Layer context used for rate control in one pass temporal CBR mode or
   // two pass spatial mode.
@@ -82,6 +85,10 @@ typedef struct {
   int ext_lst_fb_idx[VPX_MAX_LAYERS];
   int ext_gld_fb_idx[VPX_MAX_LAYERS];
   int ext_alt_fb_idx[VPX_MAX_LAYERS];
+  int ref_frame_index[REF_FRAMES];
+  int force_zero_mode_spatial_ref;
+  int current_superframe;
+  int use_base_mv;
 } SVC;
 
 struct VP9_COMP;
@@ -129,6 +136,8 @@ int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
 
 void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
 
+void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c85..b6323e0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -45,8 +45,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             int x, int y) {
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
-  const InterpKernel *const kernel =
-    vp9_filter_kernels[xd->mi[0]->mbmi.interp_filter];
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
@@ -86,6 +85,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  (void)xd;
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
                             &mv,
@@ -135,15 +135,38 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
-
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      int pixel_value = *frame2;
+
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = (int)i + idy;
+          int col = (int)j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
@@ -182,15 +205,34 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
-
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      int pixel_value = *frame2;
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = (int)i + idy;
+          int col = (int)j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
       modifier += rounding;
       modifier >>= strength;
 
@@ -222,8 +264,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
-  int distortion;
-  unsigned int sse;
+  uint32_t distortion;
+  uint32_t sse;
   int cost_list[5];
 
   MV best_ref_mv1 = {0, 0};
@@ -383,55 +425,57 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
-            vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
-            vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
-            vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+            vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+                                               f->y_stride,
+                                               predictor, 16, 16, adj_strength,
+                                               filter_weight,
+                                               accumulator, count);
+            vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 256,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 256, count + 256);
+            vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                               f->uv_stride, predictor + 512,
+                                               mb_uv_width, mb_uv_height,
+                                               adj_strength, filter_weight,
+                                               accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+            vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                        predictor, 16, 16,
+                                        strength, filter_weight,
+                                        accumulator, count);
+            vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 256,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 256,
+                                        count + 256);
+            vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                        f->uv_stride,
+                                        predictor + 512,
+                                        mb_uv_width, mb_uv_height, strength,
+                                        filter_weight, accumulator + 512,
+                                        count + 512);
+          }
+#else
+          // Apply the filter (YUV)
+          // TODO(jingning): Need SIMD optimization for this.
+          vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                       predictor, 16, 16,
                                       strength, filter_weight,
                                       accumulator, count);
-            vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 256,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 256,
                                       count + 256);
-            vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                       predictor + 512,
                                       mb_uv_width, mb_uv_height, strength,
                                       filter_weight, accumulator + 512,
                                       count + 512);
-          }
-#else
-          // Apply the filter (YUV)
-          vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, 16,
-                                    strength, filter_weight,
-                                    accumulator, count);
-          vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 256,
-                                    count + 256);
-          vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512,
-                                    mb_uv_width, mb_uv_height, strength,
-                                    filter_weight, accumulator + 512,
-                                    count + 512);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         }
       }
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 6076e2a..edec755 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -18,7 +18,6 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_scan.h"
-#include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -50,6 +49,35 @@ static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
 const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
     (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
     / 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element vp9_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+  3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+  3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+  2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+  3197, 3116, 3058, 2977, 2881, 2800,
+  2742, 2661, 2615, 2534, 2476, 2395,
+  2299, 2218, 2160, 2079,
+  2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+  1893, 1696, 1453, 1256, 1229, 864,
+  512, 512, 512, 512, 0,
+  512, 512, 512, 512,
+  864, 1229, 1256, 1453, 1696, 1893,
+  1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+  2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+  2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+  2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+  3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp9_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+    / 2;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -67,303 +95,178 @@ const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
 };
 
 static const int16_t zero_cost[] = {0};
-static const int16_t one_cost[] = {255, 257};
-static const int16_t two_cost[] = {255, 257};
-static const int16_t three_cost[] = {255, 257};
-static const int16_t four_cost[] = {255, 257};
-static const int16_t cat1_cost[] = {429, 431, 616, 618};
-static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
-static const int16_t cat3_cost[] = {
-  820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
-  1289, 1291
-};
-static const int16_t cat4_cost[] = {
-  1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
-  1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
-  1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
-};
-static const int16_t cat5_cost[] = {
-  1269, 1271, 1283, 1285, 1306, 1308, 1320,
-  1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
-  1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
-  1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
-  1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
-  1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
-};
+static const int16_t sign_cost[1] = {512};
+static const int16_t cat1_cost[1 << 1] = {864, 1229};
+static const int16_t cat2_cost[1 << 2] = {1256, 1453, 1696, 1893};
+static const int16_t cat3_cost[1 << 3] = {1652, 1791, 1884, 2023,
+                                          2195, 2334, 2427, 2566};
+static const int16_t cat4_cost[1 << 4] = {2079, 2160, 2218, 2299, 2395, 2476,
+                                          2534, 2615, 2661, 2742, 2800, 2881,
+                                          2977, 3058, 3116, 3197};
+static const int16_t cat5_cost[1 << 5] = {
+    2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+    2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+    3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773};
 const int16_t vp9_cat6_low_cost[256] = {
-  1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
-  1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
-  1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
-  1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
-  1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
-  1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
-  1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
-  2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
-  2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
-  2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
-  2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
-  2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
-  2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
-  2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
-  2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
-  2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
-  2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
-  2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
-  2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
-  2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
-  2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
-  2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
-};
-const int16_t vp9_cat6_high_cost[128] = {
-  72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
-  1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
-  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
-  2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
-  5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
-  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
-  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
-  4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
-  7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
-  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
-  7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
-};
+    3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552,
+    3574, 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763,
+    3810, 3822, 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008,
+    4030, 4042, 4053, 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204,
+    4266, 4278, 4289, 4301, 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440,
+    4462, 4474, 4485, 4497, 4253, 4265, 4276, 4288, 4310, 4322, 4333, 4345,
+    4392, 4404, 4415, 4427, 4449, 4461, 4472, 4484, 4546, 4558, 4569, 4581,
+    4603, 4615, 4626, 4638, 4685, 4697, 4708, 4720, 4742, 4754, 4765, 4777,
+    4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, 4987, 4999, 5010, 5022,
+    5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, 5210, 5221, 5233,
+    5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, 5011, 5023,
+    5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, 5219,
+    5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+    5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675,
+    5722, 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911,
+    5933, 5945, 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107,
+    5863, 5875, 5886, 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037,
+    6059, 6071, 6082, 6094, 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248,
+    6295, 6307, 6318, 6330, 6352, 6364, 6375, 6387, 6458, 6470, 6481, 6493,
+    6515, 6527, 6538, 6550, 6597, 6609, 6620, 6632, 6654, 6666, 6677, 6689,
+    6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925,
+    6947, 6959, 6970, 6982};
+const int vp9_cat6_high_cost[64] = {
+    88,    2251,  2727,  4890,  3148,  5311,  5787,  7950,  3666,  5829,  6305,
+    8468,  6726,  8889,  9365,  11528, 3666,  5829,  6305,  8468,  6726,  8889,
+    9365,  11528, 7244,  9407,  9883,  12046, 10304, 12467, 12943, 15106, 3666,
+    5829,  6305,  8468,  6726,  8889,  9365,  11528, 7244,  9407,  9883,  12046,
+    10304, 12467, 12943, 15106, 7244,  9407,  9883,  12046, 10304, 12467, 12943,
+    15106, 10822, 12985, 13461, 15624, 13882, 16045, 16521, 18684};
 
 #if CONFIG_VP9_HIGHBITDEPTH
-const int16_t vp9_cat6_high10_high_cost[512] = {
-  74, 894, 1185, 2005, 1450, 2270, 2561,
-  3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
-  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
-  7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
-  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
-  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
-  7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
-  7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
-  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
-  11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
-  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
-  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
-  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
-  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
-  11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
-  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
-  2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
-  5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
-  6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
-  6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
-  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
-  10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
-  6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
-  8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
-  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
-  10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
-  9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
-  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
-  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
-  9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
-  10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
-  12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
-  8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
-  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
-  9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
-  11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
-  13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
-};
-const int16_t vp9_cat6_high12_high_cost[2048] = {
-  76, 896, 1187, 2007, 1452, 2272, 2563,
-  3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
-  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
-  7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
-  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
-  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
-  7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
-  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
-  2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
-  5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
-  6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
-  6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
-  6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
-  8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
-  3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
-  6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
-  6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
-  5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
-  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
-  9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
-  7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
-  5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
-  8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
-  8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
-  9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
-  12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
-  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
-  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
-  12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
-  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
-  15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
-  5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
-  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
-  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
-  13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
-  5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
-  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
-  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
-  9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
-  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
-  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
-  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
-  17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
-  11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
-  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
-  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
-  13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
-  11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
-  13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
-  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
-  13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
-  13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
-  16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
-  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
-  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
-  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
-  17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
-  14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
-  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
-  15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
-  16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
-};
+const int vp9_cat6_high10_high_cost[256] = {
+    94,    2257,  2733,  4896,  3154,  5317,  5793,  7956,  3672,  5835,  6311,
+    8474,  6732,  8895,  9371,  11534, 3672,  5835,  6311,  8474,  6732,  8895,
+    9371,  11534, 7250,  9413,  9889,  12052, 10310, 12473, 12949, 15112, 3672,
+    5835,  6311,  8474,  6732,  8895,  9371,  11534, 7250,  9413,  9889,  12052,
+    10310, 12473, 12949, 15112, 7250,  9413,  9889,  12052, 10310, 12473, 12949,
+    15112, 10828, 12991, 13467, 15630, 13888, 16051, 16527, 18690, 4187,  6350,
+    6826,  8989,  7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404,
+    12567, 10825, 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566,
+    17042, 19205, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921,
+    17084, 17560, 19723, 17981, 20144, 20620, 22783, 4187,  6350,  6826,  8989,
+    7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825, 12988, 13464,
+    15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627, 11343, 13506,
+    13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, 17084, 17560,
+    19723, 17981, 20144, 20620, 22783, 8280,  10443, 10919, 13082, 11340, 13503,
+    13979, 16142, 11858, 14021, 14497, 16660, 14918, 17081, 17557, 19720, 11858,
+    14021, 14497, 16660, 14918, 17081, 17557, 19720, 15436, 17599, 18075, 20238,
+    18496, 20659, 21135, 23298, 11858, 14021, 14497, 16660, 14918, 17081, 17557,
+    19720, 15436, 17599, 18075, 20238, 18496, 20659, 21135, 23298, 15436, 17599,
+    18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074,
+    24237, 24713, 26876};
+const int vp9_cat6_high12_high_cost[1024] = {
+    100,   2263,  2739,  4902,  3160,  5323,  5799,  7962,  3678,  5841,  6317,
+    8480,  6738,  8901,  9377,  11540, 3678,  5841,  6317,  8480,  6738,  8901,
+    9377,  11540, 7256,  9419,  9895,  12058, 10316, 12479, 12955, 15118, 3678,
+    5841,  6317,  8480,  6738,  8901,  9377,  11540, 7256,  9419,  9895,  12058,
+    10316, 12479, 12955, 15118, 7256,  9419,  9895,  12058, 10316, 12479, 12955,
+    15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, 4193,  6356,
+    6832,  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+    17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193,  6356,  6832,  8995,
+    7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470,
+    15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512,
+    13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566,
+    19729, 17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509,
+    13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+    18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080,
+    24243, 24719, 26882, 4193,  6356,  6832,  8995,  7253,  9416,  9892,  12055,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151,
+    14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626,
+    22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 8286,
+    10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018,
+    17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535,
+    21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759,
+    19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276,
+    25752, 27915, 26173, 28336, 28812, 30975, 4193,  6356,  6832,  8995,  7253,
+    9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988,
+    16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994,
+    13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729,
+    17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985,
+    16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+    24719, 26882, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+    23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882,
+    12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596,
+    20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 8286,  10449,
+    10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503,
+    16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020,
+    21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, 17181,
+    15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+    21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+    19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+    27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, 15439, 17602,
+    18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173,
+    28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, 22171, 24334,
+    20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, 22213, 22689,
+    24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, 26688, 28851,
+    29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628,
+    25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, 26267, 28430,
+    26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, 32429, 32905,
+    35068};
 #endif
 
 const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
   {0, 0, 0, zero_cost},                          // ZERO_TOKEN
-  {0, 0, 1, one_cost},                           // ONE_TOKEN
-  {0, 0, 2, two_cost},                           // TWO_TOKEN
-  {0, 0, 3, three_cost},                         // THREE_TOKEN
-  {0, 0, 4, four_cost},                          // FOUR_TOKEN
+  {0, 0, 1, sign_cost},                          // ONE_TOKEN
+  {0, 0, 2, sign_cost},                          // TWO_TOKEN
+  {0, 0, 3, sign_cost},                          // THREE_TOKEN
+  {0, 0, 4, sign_cost},                          // FOUR_TOKEN
   {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CATEGORY1_TOKEN
   {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CATEGORY2_TOKEN
   {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CATEGORY3_TOKEN
@@ -375,32 +278,32 @@ const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
 
 #if CONFIG_VP9_HIGHBITDEPTH
 const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
-  {0, 0, 0, zero_cost},                                 // ZERO
-  {0, 0, 1, one_cost},                                  // ONE
-  {0, 0, 2, two_cost},                                  // TWO
-  {0, 0, 3, three_cost},                                // THREE
-  {0, 0, 4, four_cost},                                 // FOUR
-  {vp9_cat1_prob_high10, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
-  {vp9_cat2_prob_high10, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
-  {vp9_cat3_prob_high10, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
-  {vp9_cat4_prob_high10, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
-  {vp9_cat5_prob_high10, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
-  {vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0},          // CAT6
-  {0, 0, 0, zero_cost}                                  // EOB
+  {0, 0, 0, zero_cost},                             // ZERO
+  {0, 0, 1, sign_cost},                             // ONE
+  {0, 0, 2, sign_cost},                             // TWO
+  {0, 0, 3, sign_cost},                             // THREE
+  {0, 0, 4, sign_cost},                             // FOUR
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},     // CAT1
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},     // CAT2
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},     // CAT3
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},     // CAT4
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},     // CAT5
+  {vp9_cat6_prob_high12 + 2, 16, CAT6_MIN_VAL, 0},  // CAT6
+  {0, 0, 0, zero_cost}                              // EOB
 };
 const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
-  {0, 0, 0, zero_cost},                                 // ZERO
-  {0, 0, 1, one_cost},                                  // ONE
-  {0, 0, 2, two_cost},                                  // TWO
-  {0, 0, 3, three_cost},                                // THREE
-  {0, 0, 4, four_cost},                                 // FOUR
-  {vp9_cat1_prob_high12, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
-  {vp9_cat2_prob_high12, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
-  {vp9_cat3_prob_high12, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
-  {vp9_cat4_prob_high12, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
-  {vp9_cat5_prob_high12, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
-  {vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},          // CAT6
-  {0, 0, 0, zero_cost}                                  // EOB
+  {0, 0, 0, zero_cost},                          // ZERO
+  {0, 0, 1, sign_cost},                          // ONE
+  {0, 0, 2, sign_cost},                          // TWO
+  {0, 0, 3, sign_cost},                          // THREE
+  {0, 0, 4, sign_cost},                          // FOUR
+  {vp9_cat1_prob, 1,  CAT1_MIN_VAL, cat1_cost},  // CAT1
+  {vp9_cat2_prob, 2,  CAT2_MIN_VAL, cat2_cost},  // CAT2
+  {vp9_cat3_prob, 3,  CAT3_MIN_VAL, cat3_cost},  // CAT3
+  {vp9_cat4_prob, 4,  CAT4_MIN_VAL, cat4_cost},  // CAT4
+  {vp9_cat5_prob, 5,  CAT5_MIN_VAL, cat5_cost},  // CAT5
+  {vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0},   // CAT6
+  {0, 0, 0, zero_cost}                           // EOB
 };
 #endif
 
@@ -431,35 +334,25 @@ static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
-                             int32_t extra, uint8_t token,
-                             uint8_t skip_eob_node,
+                             int16_t token, EXTRABIT extra,
                              unsigned int *counts) {
+  (*t)->context_tree = context_tree;
   (*t)->token = token;
   (*t)->extra = extra;
-  (*t)->context_tree = context_tree;
-  (*t)->skip_eob_node = skip_eob_node;
   (*t)++;
   ++counts[token];
 }
 
 static INLINE void add_token_no_extra(TOKENEXTRA **t,
                                       const vpx_prob *context_tree,
-                                      uint8_t token,
-                                      uint8_t skip_eob_node,
+                                      int16_t token,
                                       unsigned int *counts) {
-  (*t)->token = token;
   (*t)->context_tree = context_tree;
-  (*t)->skip_eob_node = skip_eob_node;
+  (*t)->token = token;
   (*t)++;
   ++counts[token];
 }
 
-static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
-                             TX_SIZE tx_size) {
-  const int eob_max = 16 << (tx_size << 1);
-  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
@@ -471,17 +364,16 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   uint8_t token_cache[32 * 32];
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi = xd->mi[0];
   int pt; /* near block/prev token context index */
   int c;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   int eob = p->eobs[block];
   const PLANE_TYPE type = get_plane_type(plane);
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;
   const scan_order *so;
-  const int ref = is_inter_block(mbmi);
+  const int ref = is_inter_block(mi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
   vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
@@ -489,7 +381,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
-  const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  const int tx_eob = 16 << (tx_size << 1);
   int16_t token;
   EXTRABIT extra;
   int aoff, loff;
@@ -504,15 +396,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 
   while (c < eob) {
     int v = 0;
-    int skip_eob = 0;
     v = qcoeff[scan[c]];
+    ++eob_branch[band[c]][pt];
 
     while (!v) {
-      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
+      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN,
                          counts[band[c]][pt]);
-      eob_branch[band[c]][pt] += !skip_eob;
 
-      skip_eob = 1;
       token_cache[scan[c]] = 0;
       ++c;
       pt = get_coef_context(nb, token_cache, c);
@@ -521,18 +411,17 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
 
     vp9_get_token_extra(v, &token, &extra);
 
-    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
-              (uint8_t)skip_eob, counts[band[c]][pt]);
-    eob_branch[band[c]][pt] += !skip_eob;
+    add_token(&t, coef_probs[band[c]][pt], token, extra,
+              counts[band[c]][pt]);
 
     token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++c;
     pt = get_coef_context(nb, token_cache, c);
   }
-  if (c < seg_eob) {
-    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
-                       counts[band[c]][pt]);
+  if (c < tx_eob) {
     ++eob_branch[band[c]][pt];
+    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN,
+                       counts[band[c]][pt]);
   }
 
   *tp = t;
@@ -584,24 +473,26 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
 }
 
 void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                     int dry_run, BLOCK_SIZE bsize) {
-  VP9_COMMON *const cm = &cpi->common;
+                     int dry_run, int seg_skip, BLOCK_SIZE bsize) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *const mi = xd->mi[0];
   const int ctx = vp9_get_skip_context(xd);
-  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
-                                          SEG_LVL_SKIP);
   struct tokenize_b_args arg = {cpi, td, t};
-  if (mbmi->skip) {
-    if (!dry_run)
-      td->counts->skip[ctx][1] += skip_inc;
+
+  if (seg_skip) {
+    assert(mi->skip);
+  }
+
+  if (mi->skip) {
+    if (!dry_run && !seg_skip)
+      ++td->counts->skip[ctx][1];
     reset_skip_context(xd, bsize);
     return;
   }
 
   if (!dry_run) {
-    td->counts->skip[ctx][0] += skip_inc;
+    ++td->counts->skip[ctx][0];
     vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
   } else {
     vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index c0f09c7..1caab2a 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -36,9 +36,8 @@ typedef struct {
 
 typedef struct {
   const vpx_prob *context_tree;
+  int16_t token;
   EXTRABIT extra;
-  uint8_t token;
-  uint8_t skip_eob_node;
 } TOKENEXTRA;
 
 extern const vpx_tree_index vp9_coef_tree[];
@@ -52,7 +51,8 @@ struct VP9_COMP;
 struct ThreadData;
 
 void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
-                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+                     TOKENEXTRA **t, int dry_run, int seg_skip,
+                     BLOCK_SIZE bsize);
 
 typedef struct {
   const vpx_prob *prob;
@@ -75,26 +75,27 @@ extern const int16_t *vp9_dct_value_cost_ptr;
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int *vp9_dct_cat_lt_10_value_cost;
 extern const int16_t vp9_cat6_low_cost[256];
-extern const int16_t vp9_cat6_high_cost[128];
-extern const int16_t vp9_cat6_high10_high_cost[512];
-extern const int16_t vp9_cat6_high12_high_cost[2048];
-static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
-                                   const int16_t *cat6_high_table) {
+extern const int vp9_cat6_high_cost[64];
+extern const int vp9_cat6_high10_high_cost[256];
+extern const int vp9_cat6_high12_high_cost[1024];
+static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits,
+                               const int *cat6_high_table) {
   if (token != CATEGORY6_TOKEN)
-    return vp9_extra_bits[token].cost[extrabits];
-  return vp9_cat6_low_cost[extrabits & 0xff]
-      + cat6_high_table[extrabits >> 8];
+    return vp9_extra_bits[token].cost[extrabits >> 1];
+  return vp9_cat6_low_cost[(extrabits >> 1) & 0xff]
+      + cat6_high_table[extrabits >> 9];
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+static INLINE const int* vp9_get_high_cost_table(int bit_depth) {
   return bit_depth == 8 ? vp9_cat6_high_cost
       : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
          vp9_cat6_high12_high_cost);
 }
 #else
-static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+static INLINE const int* vp9_get_high_cost_table(int bit_depth) {
   (void) bit_depth;
   return vp9_cat6_high_cost;
 }
@@ -118,6 +119,18 @@ static INLINE int16_t vp9_get_token(int v) {
   return vp9_dct_cat_lt_10_value_tokens[v].token;
 }
 
+static INLINE int vp9_get_token_cost(int v, int16_t *token,
+                                     const int *cat6_high_table) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+    EXTRABIT extrabits;
+    *token = CATEGORY6_TOKEN;
+    extrabits = abs(v) - CAT6_MIN_VAL;
+    return vp9_cat6_low_cost[extrabits & 0xff] +
+           cat6_high_table[extrabits >> 8];
+  }
+  *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+  return vp9_dct_cat_lt_10_value_cost[v];
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
similarity index 100%
rename from vp9/encoder/x86/vp9_dct_sse2.c
rename to vp9/encoder/x86/vp9_dct_intrin_sse2.c
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 7a7a6b6..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
-  paddw           m0,        m1
-  movq            m4,        m0
-  psubw           m3,        m2
-  psubw           m4,        m3
-  psraw           m4,        1
-  movq            m5,        m4
-  psubw           m5,        m1 ;b1
-  psubw           m4,        m2 ;c1
-  psubw           m0,        m4
-  paddw           m3,        m5
-                                ; m0 a0
-  SWAP            1,         4  ; m1 c1
-  SWAP            2,         3  ; m2 d1
-  SWAP            3,         5  ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  movq            m4,        m0
-  movq            m5,        m2
-  punpcklwd       m4,        m1
-  punpckhwd       m0,        m1
-  punpcklwd       m5,        m3
-  punpckhwd       m2,        m3
-  movq            m1,        m4
-  movq            m3,        m0
-  punpckldq       m1,        m5
-  punpckhdq       m4,        m5
-  punpckldq       m3,        m2
-  punpckhdq       m0,        m2
-  SWAP            2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
-  lea             r3q,       [inputq + strideq*4]
-  movq            m0,        [inputq] ;a1
-  movq            m1,        [inputq + strideq*2] ;b1
-  movq            m2,        [r3q] ;c1
-  movq            m3,        [r3q + strideq*2] ;d1
-
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-
-  psllw           m0,        2
-  psllw           m1,        2
-  psllw           m2,        2
-  psllw           m3,        2
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m0
-  pcmpgtw         m5,             m1
-  movq            m6,             m0
-  movq            m7,             m1
-  punpcklwd       m0,             m4
-  punpcklwd       m1,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m6
-  movq            [outputq + 16], m1
-  movq            [outputq + 24], m7
-  pxor            m4,             m4
-  pxor            m5,             m5
-  pcmpgtw         m4,             m2
-  pcmpgtw         m5,             m3
-  movq            m6,             m2
-  movq            m7,             m3
-  punpcklwd       m2,             m4
-  punpcklwd       m3,             m5
-  punpckhwd       m6,             m4
-  punpckhwd       m7,             m5
-  movq            [outputq + 32], m2
-  movq            [outputq + 40], m6
-  movq            [outputq + 48], m3
-  movq            [outputq + 56], m7
-%else
-  movq            [outputq],      m0
-  movq            [outputq + 8],  m1
-  movq            [outputq + 16], m2
-  movq            [outputq + 24], m3
-%endif
-
-  RET
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 0000000..ced37bd
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,86 @@
+;
+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0,        m1
+  movq            m4,        m0
+  psubw           m3,        m2
+  psubw           m4,        m3
+  psraw           m4,        1
+  movq            m5,        m4
+  psubw           m5,        m1 ;b1
+  psubw           m4,        m2 ;c1
+  psubw           m0,        m4
+  paddw           m3,        m5
+                                ; m0 a0
+  SWAP            1,         4  ; m1 c1
+  SWAP            2,         3  ; m2 d1
+  SWAP            3,         5  ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+                                ; 00 01 02 03
+                                ; 10 11 12 13
+                                ; 20 21 22 23
+                                ; 30 31 32 33
+  punpcklwd       m0,        m1 ; 00 10 01 11  02 12 03 13
+  punpcklwd       m2,        m3 ; 20 30 21 31  22 32 23 33
+  mova            m1,        m0
+  punpckldq       m0,        m2 ; 00 10 20 30  01 11 21 31
+  punpckhdq       m1,        m2 ; 02 12 22 32  03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q,       [inputq + strideq*4]
+  movq            m0,        [inputq] ;a1
+  movq            m1,        [inputq + strideq*2] ;b1
+  movq            m2,        [r3q] ;c1
+  movq            m3,        [r3q + strideq*2] ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  SWAP            1,         2
+  psrldq          m1,        m0, 8
+  psrldq          m3,        m2, 8
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0,        2
+  psllw           m1,        2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  ; sign extension
+  mova            m2,             m0
+  mova            m3,             m1
+  punpcklwd       m0,             m0
+  punpcklwd       m1,             m1
+  punpckhwd       m2,             m2
+  punpckhwd       m3,             m3
+  psrad           m0,             16
+  psrad           m1,             16
+  psrad           m2,             16
+  psrad           m3,             16
+  mova            [outputq],      m0
+  mova            [outputq + 16], m2
+  mova            [outputq + 32], m1
+  mova            [outputq + 48], m3
+%else
+  mova            [outputq],      m0
+  mova            [outputq + 16], m1
+%endif
+
+  RET
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index bf7c7af..883507a 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -125,7 +125,7 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
   return acc_diff;
 }
 
-// Denoiser for 4xM and 8xM blocks.
+// Denoise 8x8 and 8x16 blocks.
 static int vp9_denoiser_NxM_sse2_small(
     const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
     int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
@@ -147,9 +147,9 @@ static int vp9_denoiser_NxM_sse2_small(
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
-  const uint8_t shift = (width == 4) ? 2 : 1;
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
 
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+  for (r = 0; r < b_height; ++r) {
     memcpy(sig_buffer[r], sig, width);
     memcpy(sig_buffer[r] + width, sig + sig_stride, width);
     memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -157,18 +157,6 @@ static int vp9_denoiser_NxM_sse2_small(
            mc_running_avg_y + mc_avg_y_stride, width);
     memcpy(running_buffer[r], running_avg_y, width);
     memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
-    if (width == 4) {
-      memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
-      memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
-      memcpy(mc_running_buffer[r] + width * 2,
-             mc_running_avg_y + mc_avg_y_stride * 2, width);
-      memcpy(mc_running_buffer[r] + width * 3,
-             mc_running_avg_y + mc_avg_y_stride * 3, width);
-      memcpy(running_buffer[r] + width * 2,
-             running_avg_y + avg_y_stride * 2, width);
-      memcpy(running_buffer[r] + width * 3,
-             running_avg_y + avg_y_stride * 3, width);
-    }
     acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
                                       mc_running_buffer[r],
                                       running_buffer[r],
@@ -176,16 +164,10 @@ static int vp9_denoiser_NxM_sse2_small(
                                       &l3, &l32, &l21, acc_diff);
     memcpy(running_avg_y, running_buffer[r], width);
     memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
-    if (width == 4) {
-      memcpy(running_avg_y + avg_y_stride * 2,
-             running_buffer[r] + width * 2, width);
-      memcpy(running_avg_y + avg_y_stride * 3,
-             running_buffer[r] + width * 3, width);
-    }
     // Update pointers for next iteration.
-    sig += (sig_stride << shift);
-    mc_running_avg_y += (mc_avg_y_stride << shift);
-    running_avg_y += (avg_y_stride << shift);
+    sig += (sig_stride << 1);
+    mc_running_avg_y += (mc_avg_y_stride << 1);
+    running_avg_y += (avg_y_stride << 1);
   }
 
   {
@@ -207,22 +189,16 @@ static int vp9_denoiser_NxM_sse2_small(
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
               sig_buffer[r], mc_running_buffer[r], running_buffer[r],
               k_0, k_delta, acc_diff);
           memcpy(running_avg_y, running_buffer[r], width);
           memcpy(running_avg_y + avg_y_stride,
                  running_buffer[r] + width, width);
-          if (width == 4) {
-            memcpy(running_avg_y + avg_y_stride * 2,
-                   running_buffer[r] + width * 2, width);
-            memcpy(running_avg_y + avg_y_stride * 3,
-                   running_buffer[r] + width * 3, width);
-          }
           // Update pointers for next iteration.
-          running_avg_y += (avg_y_stride << shift);
+          running_avg_y += (avg_y_stride << 1);
         }
         sum_diff = sum_diff_16x1(acc_diff);
         if (abs(sum_diff) > sum_diff_thresh) {
@@ -236,7 +212,7 @@ static int vp9_denoiser_NxM_sse2_small(
   return FILTER_BLOCK;
 }
 
-// Denoiser for 16xM, 32xM and 64xM blocks
+// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
 static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
                                      const uint8_t *mc_running_avg_y,
                                      int mc_avg_y_stride,
@@ -260,38 +236,37 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
 
-  for (c = 0; c < 4; ++c) {
-    for (r = 0; r < 4; ++r) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
       acc_diff[c][r] = _mm_setzero_si128();
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
           sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
-          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
       running_avg_y += 16;
     }
 
-    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
       }
     }
 
     // Update pointers for next iteration.
-    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-    mc_running_avg_y = mc_running_avg_y -
-                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                       mc_avg_y_stride;
-    running_avg_y = running_avg_y -
-                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                    avg_y_stride;
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
   }
 
   {
@@ -303,33 +278,29 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
-        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
         sum_diff = 0;
-        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
                 sig, mc_running_avg_y, running_avg_y, k_0,
-                k_delta, acc_diff[c>>4][r>>4]);
+                k_delta, acc_diff[c][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
             running_avg_y += 16;
           }
 
-          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
             }
           }
-          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-          mc_running_avg_y = mc_running_avg_y -
-                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                             mc_avg_y_stride;
-          running_avg_y = running_avg_y -
-                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                          avg_y_stride;
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
         }
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;
@@ -349,26 +320,21 @@ int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
                              int increase_denoising,
                              BLOCK_SIZE bs,
                              int motion_magnitude) {
-  if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
-    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
-                                       mc_avg, mc_avg_stride,
-                                       avg, avg_stride,
-                                       increase_denoising,
-                                       bs, motion_magnitude, 4);
-  } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
-    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
-                                       mc_avg, mc_avg_stride,
-                                       avg, avg_stride,
-                                       increase_denoising,
-                                       bs, motion_magnitude, 8);
-  } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
-             bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
-             bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+  // Rank by frequency of the block type to have an early termination.
+  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
     return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
                                      mc_avg, mc_avg_stride,
                                      avg, avg_stride,
                                      increase_denoising,
                                      bs, motion_magnitude);
+  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+    return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+                                       mc_avg, mc_avg_stride,
+                                       avg, avg_stride,
+                                       increase_denoising,
+                                       bs, motion_magnitude, 8);
   } else {
     return COPY_BLOCK;
   }
diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
new file mode 100644
index 0000000..cd3e87e
--- /dev/null
+++ b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -0,0 +1,314 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER)
+# include <intrin.h>
+#endif
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+# define LIKELY(v)    __builtin_expect(v, 1)
+# define UNLIKELY(v)  __builtin_expect(v, 0)
+#else
+# define LIKELY(v)    (v)
+# define UNLIKELY(v)  (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+  int_mv result;
+  result.as_mv.row = row;
+  result.as_mv.col = col;
+  return result;
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+  // This is simplified from the C implementation to utilise that
+  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[2]  and
+  //  x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
+  return mv.as_int == 0 ? 0 : 1;
+}
+
+static INLINE int mv_cost(const int_mv mv,
+                          const int *joint_cost, int *const comp_cost[2]) {
+  return joint_cost[get_mv_joint(mv)] +
+         comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+                          int sad_per_bit) {
+  const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row,
+                                  mv.as_mv.col - ref->col);
+  return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost,
+                                              x->nmvsadcost) *
+                                              sad_per_bit, VP9_PROB_COST_SHIFT);
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables,   *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
+ * vp9_encoder.c.                                                            *
+ * For the joint cost:                                                       *
+ *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
+ * For the component costs:                                                  *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
+ *         (Equal costs for both components)                                 *
+ *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
+ *         (Cost function is even)                                           *
+ * If these do not hold, then this function cannot be used without           *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties.                                  *
+ *****************************************************************************/
+int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
+                               const search_site_config *cfg,
+                               MV *ref_mv, MV *best_mv, int search_param,
+                               int sad_per_bit, int *num00,
+                               const vp9_variance_fn_ptr_t *fn_ptr,
+                               const MV *center_mv) {
+  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
+  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
+  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
+  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
+
+  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
+
+  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
+  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+  const int tot_steps = cfg->total_steps - search_param;
+
+  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
+                                        center_mv->col >> 3);
+  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
+
+  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
+  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+
+  int_mv bmv = pack_int_mv(ref_row, ref_col);
+  int_mv new_bmv = bmv;
+  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
+
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
+                                 ref_row * in_what_stride + ref_col;
+
+  // Work out the start point for the search
+  const uint8_t *best_address = in_what;
+  const uint8_t *new_best_address = best_address;
+#if ARCH_X86_64
+  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+  unsigned int best_sad;
+  int i, j, step;
+
+  // Check the prerequisite cost function properties that are easy to check
+  // in an assert. See the function-level documentation for details on all
+  // prerequisites.
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+  // Check the starting position
+  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
+
+  *num00 = 0;
+
+  for (i = 0, step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+      __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
+#if ARCH_X86_64
+      __m128i v_blocka[2];
+#else
+      __m128i v_blocka[1];
+#endif
+
+      // Compute the candidate motion vectors
+      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
+      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
+      // Clamp them to the search bounds
+      __m128i v_these_mv_clamp_w = v_these_mv_w;
+      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
+      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
+      // The ones that did not change are inside the search area
+      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
+
+      // If none of them are inside, then move on
+      if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
+        continue;
+      }
+
+      // The inverse mask indicates which of the MVs are outside
+      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
+      // Shift right to keep the sign bit clear, we will use this later
+      // to set the cost to the maximum value.
+      v_outside_d = _mm_srli_epi32(v_outside_d, 1);
+
+      // Compute the difference MV
+      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only a half of the table
+      // is ever referenced.
+      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
+
+      // Compute the SIMD pointer offsets.
+      {
+#if ARCH_X86_64  //  sizeof(intptr_t) == 8
+        // Load the offsets
+        __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
+        __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
+        // Set the ones falling outside to zero
+        v_bo10_q = _mm_and_si128(v_bo10_q,
+                                 _mm_cvtepi32_epi64(v_inside_d));
+        v_bo32_q = _mm_and_si128(v_bo32_q,
+                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+        // Compute the candidate addresses
+        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
+        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
+#else  // ARCH_X86 //  sizeof(intptr_t) == 4
+        __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
+        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
+        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
+#endif
+      }
+
+      fn_ptr->sdx4df(what, what_stride,
+                     (const uint8_t **)&v_blocka[0], in_what_stride,
+                     (uint32_t*)&v_sad_d);
+
+      // Look up the component cost of the residual motion vector
+      {
+        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
+        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
+        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
+        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
+        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
+        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
+        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
+        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
+
+        // Note: This is a use case for vpgather in AVX2
+        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
+        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
+        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
+        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
+
+        __m128i v_cost_10_d, v_cost_32_d;
+        v_cost_10_d = _mm_cvtsi32_si128(cost0);
+        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
+        v_cost_32_d = _mm_cvtsi32_si128(cost2);
+        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
+        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
+      }
+
+      // Now add in the joint cost
+      {
+        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
+                                                _mm_setzero_si128());
+        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
+                                                       v_joint_cost_0_d,
+                                                       v_sel_d);
+        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
+      }
+
+      // Multiply by sad_per_bit
+      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
+      // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+      v_cost_d = _mm_add_epi32(v_cost_d,
+                               _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
+      v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
+      // Add the cost to the sad
+      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask, this way the minimum search won't
+      // pick them.
+      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
+
+      // Find the minimum value and index horizontally in v_sad_d
+      {
+        // Try speculatively on 16 bits, so we can use the minpos intrinsic
+        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
+        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
+
+        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
+        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
+
+        // If the local best value is not saturated, just use it, otherwise
+        // find the horizontal minimum again the hard way on 32 bits.
+        // This is executed rarely.
+        if (UNLIKELY(local_best_sad == 0xffff)) {
+          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+
+          v_loval_d = v_sad_d;
+          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
+          v_hival_d = _mm_srli_si128(v_loval_d, 8);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+          v_hival_d = _mm_srli_si128(v_loval_d, 4);
+          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+
+          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+
+          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
+          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
+        }
+
+        // Update the global minimum if the local minimum is smaller
+        if (LIKELY(local_best_sad < best_sad)) {
+          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+          best_sad = local_best_sad;
+        }
+      }
+    }
+
+    bmv = new_bmv;
+    best_address = new_best_address;
+
+    v_bmv_w = _mm_set1_epi32(bmv.as_int);
+#if ARCH_X86_64
+    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+    if (UNLIKELY(best_address == in_what)) {
+      (*num00)++;
+    }
+  }
+
+  *best_mv = bmv.as_mv;
+  return best_sad;
+}
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
new file mode 100644
index 0000000..38af3b1
--- /dev/null
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+                                         YV12_BUFFER_CONFIG *dst);
+
+static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    int w, int h) {
+  const __m128i mask = _mm_set1_epi16(0x00FF);
+  const int max_width = w & ~15;
+  int y;
+  for (y = 0; y < h; ++y) {
+    int x;
+    for (x = 0; x < max_width; x += 16) {
+      const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 +  0));
+      const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16));
+      const __m128i a_and = _mm_and_si128(a, mask);
+      const __m128i b_and = _mm_and_si128(b, mask);
+      const __m128i c = _mm_packus_epi16(a_and, b_and);
+      _mm_storeu_si128((__m128i *)(dst + x), c);
+    }
+    for (; x < w; ++x)
+      dst[x] = src[x * 2];
+    src += src_stride * 2;
+    dst += dst_stride;
+  }
+}
+
+static INLINE __m128i filter(const __m128i *const a, const __m128i *const b,
+                             const __m128i *const c, const __m128i *const d,
+                             const __m128i *const e, const __m128i *const f,
+                             const __m128i *const g, const __m128i *const h) {
+  const __m128i coeffs_ab =
+      _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1);
+  const __m128i coeffs_cd =
+      _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, -19, 78, -19,
+                   78, -19, 78, -19);
+  const __m128i const64_x16 = _mm_set1_epi16(64);
+  const __m128i ab = _mm_unpacklo_epi8(*a, *b);
+  const __m128i cd = _mm_unpacklo_epi8(*c, *d);
+  const __m128i fe = _mm_unpacklo_epi8(*f, *e);
+  const __m128i hg = _mm_unpacklo_epi8(*h, *g);
+  const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab);
+  const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd);
+  const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd);
+  const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab);
+  // can not overflow
+  const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms);
+  // can not overflow
+  const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms);
+  // can overflow, use saturating add
+  const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms);
+  const __m128i round = _mm_adds_epi16(terms, const64_x16);
+  const __m128i shift = _mm_srai_epi16(round, 7);
+  return _mm_packus_epi16(shift, shift);
+}
+
+static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) {
+  const int max_width = w & ~7;
+  int x = 0;
+  for (; x < max_width; x += 8) {
+    const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0));
+    const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1));
+    const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2));
+    const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3));
+    const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4));
+    const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5));
+    const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6));
+    const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7));
+    const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h);
+    _mm_storel_epi64((__m128i *)(dst + x), pack);
+  }
+}
+
+static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  int dst_w, int dst_h) {
+  dst_w /= 2;
+  dst_h /= 2;
+  {
+    DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]);
+    uint8_t *tmp0 = tmp + dst_w * 0;
+    uint8_t *tmp1 = tmp + dst_w * 1;
+    uint8_t *tmp2 = tmp + dst_w * 2;
+    uint8_t *tmp3 = tmp + dst_w * 3;
+    uint8_t *tmp4 = tmp + dst_w * 4;
+    uint8_t *tmp5 = tmp + dst_w * 5;
+    uint8_t *tmp6 = tmp + dst_w * 6;
+    uint8_t *tmp7 = tmp + dst_w * 7;
+    uint8_t *tmp8 = NULL;
+    const int max_width = dst_w & ~7;
+    int y;
+    eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w);
+    eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w);
+    eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w);
+    eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w);
+    for (y = 0; y < dst_h; y++) {
+      int x;
+      eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w);
+      for (x = 0; x < max_width; x += 8) {
+        const __m128i A = _mm_loadl_epi64((const __m128i *)(src  + x));
+        const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
+        const __m128i AB = _mm_unpacklo_epi8(A, B);
+        __m128i C, D, CD;
+        _mm_storeu_si128((__m128i *)(dst + x * 2), AB);
+        {
+          const __m128i a =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3));
+          const __m128i b =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2));
+          const __m128i c =
+              _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1));
+          const __m128i d =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0));
+          const __m128i e =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1));
+          const __m128i f =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2));
+          const __m128i g =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3));
+          const __m128i h =
+              _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4));
+          C = filter(&a, &b, &c, &d, &e, &f, &g, &h);
+        }
+        {
+          const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x));
+          const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x));
+          const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x));
+          const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x));
+          const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x));
+          const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x));
+          const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x));
+          const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x));
+          D = filter(&a, &b, &c, &d, &e, &f, &g, &h);
+        }
+        CD = _mm_unpacklo_epi8(C, D);
+        _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD);
+      }
+      src += src_stride;
+      dst += dst_stride * 2;
+      tmp8 = tmp0;
+      tmp0 = tmp1;
+      tmp1 = tmp2;
+      tmp2 = tmp3;
+      tmp3 = tmp4;
+      tmp4 = tmp5;
+      tmp5 = tmp6;
+      tmp6 = tmp7;
+      tmp7 = tmp8;
+    }
+  }
+}
+
+void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+                                      YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const int dst_uv_w = dst_w / 2;
+  const int dst_uv_h = dst_h / 2;
+
+  if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
+    downsample_2_to_1_ssse3(src->y_buffer, src->y_stride,
+                            dst->y_buffer, dst->y_stride, dst_w, dst_h);
+    downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride,
+                            dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+    downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride,
+                            dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+    vpx_extend_frame_borders(dst);
+  } else if (dst_w == src_w * 2 && dst_h == src_h * 2) {
+    // The upsample() supports widths up to 1920 * 2.  If greater, fall back
+    // to vp9_scale_and_extend_frame_c().
+    if (dst_w/2 <= 1920) {
+      upsample_1_to_2_ssse3(src->y_buffer, src->y_stride,
+                            dst->y_buffer, dst->y_stride, dst_w, dst_h);
+      upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride,
+                            dst->u_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+      upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride,
+                            dst->v_buffer, dst->uv_stride, dst_uv_w, dst_uv_h);
+      vpx_extend_frame_borders(dst);
+    } else {
+      vp9_scale_and_extend_frame_c(src, dst);
+    }
+  } else {
+    vp9_scale_and_extend_frame_c(src, dst);
+  }
+}
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 6ccba0f..10d6893 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_encoder.h"
 #include "vpx_ports/vpx_once.h"
+#include "vpx_ports/system_state.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -39,6 +40,7 @@ struct vp9_extracfg {
   unsigned int                rc_max_inter_bitrate_pct;
   unsigned int                gf_cbr_boost_pct;
   unsigned int                lossless;
+  unsigned int                target_level;
   unsigned int                frame_parallel_decoding_mode;
   AQ_MODE                     aq_mode;
   unsigned int                frame_periodic_boost;
@@ -68,6 +70,7 @@ static struct vp9_extracfg default_extra_cfg = {
   0,                          // rc_max_inter_bitrate_pct
   0,                          // gf_cbr_boost_pct
   0,                          // lossless
+  255,                        // target_level
   1,                          // frame_parallel_decoding_mode
   NO_AQ,                      // aq_mode
   0,                          // frame_periodic_delta_q
@@ -157,7 +160,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, g_w,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_h,                   1, 65535);  // 16 bits available
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
-  RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
+  RANGE_CHECK(cfg, g_timebase.num,        1, 1000000000);
   RANGE_CHECK_HI(cfg, g_profile,          3);
 
   RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
@@ -195,6 +198,17 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
   RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
 
+  {
+    unsigned int level = extra_cfg->target_level;
+    if (level != LEVEL_1 && level != LEVEL_1_1 && level != LEVEL_2 &&
+        level != LEVEL_2_1 && level != LEVEL_3 && level != LEVEL_3_1 &&
+        level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 &&
+        level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 &&
+        level != LEVEL_6_1 && level != LEVEL_6_2 &&
+        level != LEVEL_UNKNOWN && level != LEVEL_MAX)
+    ERROR("target_level is invalid");
+  }
+
   if (cfg->ss_number_layers * cfg->ts_number_layers > VPX_MAX_LAYERS)
     ERROR("ss_number_layers * ts_number_layers is out of range");
   if (cfg->ts_number_layers > 1) {
@@ -485,7 +499,16 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->content = extra_cfg->content;
 
   oxcf->tile_columns = extra_cfg->tile_columns;
-  oxcf->tile_rows    = extra_cfg->tile_rows;
+
+  // TODO(yunqing): The dependencies between row tiles cause error in multi-
+  // threaded encoding. For now, tile_rows is forced to be 0 in this case.
+  // The further fix can be done by adding synchronizations after a tile row
+  // is encoded. But this will hurt multi-threaded encoder performance. So,
+  // it is recommended to use tile-rows=0 while encoding with threads > 1.
+  if (oxcf->max_threads > 1 && oxcf->tile_columns > 0)
+    oxcf->tile_rows  = 0;
+  else
+    oxcf->tile_rows  = extra_cfg->tile_rows;
 
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
@@ -499,6 +522,8 @@ static vpx_codec_err_t set_encoder_config(
   oxcf->temporal_layering_mode = (enum vp9e_temporal_layering_mode)
       cfg->temporal_layering_mode;
 
+  oxcf->target_level = extra_cfg->target_level;
+
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
 #if CONFIG_SPATIAL_SVC
     oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -525,6 +550,7 @@ static vpx_codec_err_t set_encoder_config(
   /*
   printf("Current VP9 Settings: \n");
   printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+  printf("target_level: %d\n", oxcf->target_level);
   printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
   printf("sharpness: %d\n",    oxcf->sharpness);
   printf("cpu_used: %d\n",  oxcf->cpu_used);
@@ -774,6 +800,20 @@ static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.target_level = CAST(VP9E_SET_TARGET_LEVEL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
+  *arg = (int)vp9_get_level(&ctx->cpi->level_info.level_spec);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
                                     vpx_codec_priv_enc_mr_cfg_t *data) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -865,6 +905,11 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
       break;
   }
 
+  if (deadline == VPX_DL_REALTIME) {
+    ctx->oxcf.pass = 0;
+    new_mode = REALTIME;
+  }
+
   if (ctx->oxcf.mode != new_mode) {
     ctx->oxcf.mode = new_mode;
     vp9_change_config(ctx->cpi, &ctx->oxcf);
@@ -931,9 +976,6 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
   return index_sz;
 }
 
-// vp9 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
 static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
                                        int64_t n) {
   return n * TICKS_PER_SEC * timebase->num / timebase->den;
@@ -941,7 +983,7 @@ static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
 
 static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
                                        int64_t n) {
-  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1;
   return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
 }
 
@@ -963,28 +1005,30 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
   return flags;
 }
 
+const size_t kMinCompressedSize = 8192;
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
                                       const vpx_image_t *img,
                                       vpx_codec_pts_t pts,
                                       unsigned long duration,
-                                      vpx_enc_frame_flags_t flags,
+                                      vpx_enc_frame_flags_t enc_flags,
                                       unsigned long deadline) {
-  vpx_codec_err_t res = VPX_CODEC_OK;
+  volatile vpx_codec_err_t res = VPX_CODEC_OK;
+  volatile vpx_enc_frame_flags_t flags = enc_flags;
   VP9_COMP *const cpi = ctx->cpi;
   const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
   size_t data_sz;
 
+  if (cpi == NULL) return VPX_CODEC_INVALID_PARAM;
+
   if (img != NULL) {
     res = validate_img(ctx, img);
-    // TODO(jzern) the checks related to cpi's validity should be treated as a
-    // failure condition, encoder setup is done fully in init() currently.
-    if (res == VPX_CODEC_OK && cpi != NULL) {
+    if (res == VPX_CODEC_OK) {
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
       data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
                 (cpi->multi_arf_allowed ? 8 : 2);
-      if (data_sz < 4096)
-        data_sz = 4096;
+      if (data_sz < kMinCompressedSize)
+        data_sz = kMinCompressedSize;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
         ctx->cx_data_sz = data_sz;
         free(ctx->cx_data);
@@ -1006,6 +1050,14 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
     return VPX_CODEC_INVALID_PARAM;
   }
 
+  if (setjmp(cpi->common.error.jmp)) {
+    cpi->common.error.setjmp = 0;
+    res = update_error_state(ctx, &cpi->common.error);
+    vpx_clear_system_state();
+    return res;
+  }
+  cpi->common.error.setjmp = 1;
+
   vp9_apply_encoding_flags(cpi, flags);
 
   // Handle fixed keyframe intervals
@@ -1017,8 +1069,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
     }
   }
 
-  // Initialize the encoder instance on the first frame.
-  if (res == VPX_CODEC_OK && cpi != NULL) {
+  if (res == VPX_CODEC_OK) {
     unsigned int lib_flags = 0;
     YV12_BUFFER_CONFIG sd;
     int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -1057,7 +1108,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
        * the buffer size anyway.
        */
       if (cx_data_sz < ctx->cx_data_sz / 2) {
-        ctx->base.err_detail = "Compressed data buffer too small";
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR,
+                           "Compressed data buffer too small");
         return VPX_CODEC_ERROR;
       }
     }
@@ -1175,6 +1227,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
     }
   }
 
+  cpi->common.error.setjmp = 0;
   return res;
 }
 
@@ -1393,8 +1446,8 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
           LAYER_IDS_TO_IDX(sl, tl, cpi->svc.number_temporal_layers);
       LAYER_CONTEXT *lc =
           &cpi->svc.layer_context[layer];
-      lc->max_q = params->max_quantizers[sl];
-      lc->min_q = params->min_quantizers[sl];
+      lc->max_q = params->max_quantizers[layer];
+      lc->min_q = params->min_quantizers[layer];
       lc->scaling_factor_num = params->scaling_factor_num[sl];
       lc->scaling_factor_den = params->scaling_factor_den[sl];
     }
@@ -1496,6 +1549,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
   {VP9E_SET_SVC_REF_FRAME_CONFIG,     ctrl_set_svc_ref_frame_config},
   {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+  {VP9E_SET_TARGET_LEVEL,             ctrl_set_target_level},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
@@ -1503,6 +1557,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
   {VP9E_GET_SVC_LAYER_ID,             ctrl_get_svc_layer_id},
   {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+  {VP9E_GET_LEVEL,                    ctrl_get_level},
 
   { -1, NULL},
 };
@@ -1555,7 +1610,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
       // keyframing settings (kf)
       VPX_KF_AUTO,        // g_kfmode
       0,                  // kf_min_dist
-      9999,               // kf_max_dist
+      128,                // kf_max_dist
 
       VPX_SS_DEFAULT_LAYERS,  // ss_number_layers
       {0},
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index be5d160..6531e2c 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -127,7 +127,7 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
                                                 vpx_decrypt_cb decrypt_cb,
                                                 void *decrypt_state) {
   int intra_only_flag = 0;
-  uint8_t clear_buffer[9];
+  uint8_t clear_buffer[10];
 
   if (data + data_sz <= data)
     return VPX_CODEC_INVALID_PARAM;
@@ -141,6 +141,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
     data = clear_buffer;
   }
 
+  // A maximum of 6 bits are needed to read the frame marker, profile and
+  // show_existing_frame.
+  if (data_sz < 1)
+    return VPX_CODEC_UNSUP_BITSTREAM;
+
   {
     int show_frame;
     int error_resilient;
@@ -154,15 +159,19 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
     if (profile >= MAX_PROFILES)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
-    if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
-      return VPX_CODEC_UNSUP_BITSTREAM;
-
     if (vpx_rb_read_bit(&rb)) {  // show an existing frame
+      // If profile is > 2 and show_existing_frame is true, then at least 1 more
+      // byte (6+3=9 bits) is needed.
+      if (profile > 2 && data_sz < 2)
+        return VPX_CODEC_UNSUP_BITSTREAM;
       vpx_rb_read_literal(&rb, 3);  // Frame buffer to show.
       return VPX_CODEC_OK;
     }
 
-    if (data_sz <= 8)
+    // For the rest of the function, a maximum of 9 more bytes are needed
+    // (computed by taking the maximum possible bits needed in each case). Note
+    // that this has to be updated if we read any more bits in this function.
+    if (data_sz < 10)
       return VPX_CODEC_UNSUP_BITSTREAM;
 
     si->is_kf = !vpx_rb_read_bit(&rb);
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 25a176f..5f3de8f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -17,7 +17,6 @@ VP9_CX_SRCS_REMOVE-no  += $(VP9_COMMON_SRCS_REMOVE-no)
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-VP9_CX_SRCS-yes += encoder/vp9_avg.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -76,12 +75,16 @@ VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
+VP9_CX_SRCS-yes += encoder/vp9_aq_360.c
+VP9_CX_SRCS-yes += encoder/vp9_aq_360.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
 VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
 VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
+VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c
+VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
@@ -91,15 +94,15 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
@@ -111,12 +114,14 @@ endif
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 endif
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+endif
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
@@ -128,10 +133,8 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
 endif
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
 
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
diff --git a/vpx/exports_enc b/vpx/exports_enc
index e4707ba..914e36c 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -7,9 +7,3 @@ text vpx_codec_get_cx_data
 text vpx_codec_get_global_headers
 text vpx_codec_get_preview_frame
 text vpx_codec_set_cx_data_buf
-text vpx_svc_dump_statistics
-text vpx_svc_encode
-text vpx_svc_get_message
-text vpx_svc_init
-text vpx_svc_release
-text vpx_svc_set_options
diff --git a/vpx/exports_spatial_svc b/vpx/exports_spatial_svc
new file mode 100644
index 0000000..d258a1d
--- /dev/null
+++ b/vpx/exports_spatial_svc
@@ -0,0 +1,6 @@
+text vpx_svc_dump_statistics
+text vpx_svc_encode
+text vpx_svc_get_message
+text vpx_svc_init
+text vpx_svc_release
+text vpx_svc_set_options
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index ff60083..ef9b352 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -322,8 +322,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
 
       for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
         if (si->svc_params.scaling_factor_den[sl] > 0) {
-          alloc_ratio[sl] = (float)(si->svc_params.scaling_factor_num[sl] *
-              1.0 / si->svc_params.scaling_factor_den[sl]);
+          alloc_ratio[sl] = (float)( pow(2, sl) );
           total += alloc_ratio[sl];
         }
       }
@@ -334,9 +333,9 @@ void assign_layer_bitrates(const SvcContext *svc_ctx,
                 alloc_ratio[sl] / total);
         if (svc_ctx->temporal_layering_mode == 3) {
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers] =
-              spatial_layer_target >> 1;
+              (spatial_layer_target*6)/10;  // 60%
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 1] =
-              (spatial_layer_target >> 1) + (spatial_layer_target >> 2);
+              (spatial_layer_target*8)/10;  // 80%
           enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + 2] =
               spatial_layer_target;
         } else if (svc_ctx->temporal_layering_mode == 2 ||
@@ -385,7 +384,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i;
+  int i, sl , tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -398,11 +397,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   si->width = enc_cfg->g_w;
   si->height = enc_cfg->g_h;
 
-  if (enc_cfg->kf_max_dist < 2) {
-    svc_log(svc_ctx, SVC_LOG_ERROR, "key frame distance too small: %d\n",
-            enc_cfg->kf_max_dist);
-    return VPX_CODEC_INVALID_PARAM;
-  }
   si->kf_dist = enc_cfg->kf_max_dist;
 
   if (svc_ctx->spatial_layers == 0)
@@ -423,11 +417,16 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     svc_ctx->temporal_layers = 2;
   }
 
-  for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
-    si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
-    si->svc_params.min_quantizers[i] = 0;
-    si->svc_params.scaling_factor_num[i] = DEFAULT_SCALE_FACTORS_NUM[i];
-    si->svc_params.scaling_factor_den[i] = DEFAULT_SCALE_FACTORS_DEN[i];
+  for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) {
+    si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
+    si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
+  }
+  for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
+    for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
+      i = sl * svc_ctx->temporal_layers + tl;
+      si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
+      si->svc_params.min_quantizers[i] = 0;
+    }
   }
 
   // Parse aggregate command line options. Options must start with
@@ -485,6 +484,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
     enc_cfg->rc_buf_initial_sz = 500;
     enc_cfg->rc_buf_optimal_sz = 600;
     enc_cfg->rc_buf_sz = 1000;
+    enc_cfg->rc_dropframe_thresh = 0;
   }
 
   if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
@@ -571,6 +571,27 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
       }
 #endif
 #endif
+      case VPX_CODEC_PSNR_PKT:
+      {
+#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION)
+        int j;
+        svc_log(svc_ctx, SVC_LOG_DEBUG,
+                "frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                si->psnr_pkt_received, 0,
+                cx_pkt->data.layer_psnr[0].psnr[0],
+                cx_pkt->data.layer_psnr[0].psnr[1],
+                cx_pkt->data.layer_psnr[0].psnr[2],
+                cx_pkt->data.layer_psnr[0].psnr[3]);
+        for (j = 0; j < COMPONENTS; ++j) {
+          si->psnr_sum[0][j] +=
+              cx_pkt->data.layer_psnr[0].psnr[j];
+          si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j];
+        }
+#endif
+      }
+      ++si->psnr_pkt_received;
+      break;
       default: {
         break;
       }
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index bd99c6d..61882e6 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -45,15 +45,6 @@ extern vpx_codec_iface_t  vpx_codec_vp9_cx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
 /*!@} - end algorithm interface member group*/
 
-/*!\name Algorithm interface for VP10
- *
- * This interface provides the capability to encode raw VP9 streams.
- * @{
- */
-extern vpx_codec_iface_t  vpx_codec_vp10_cx_algo;
-extern vpx_codec_iface_t *vpx_codec_vp10_cx(void);
-/*!@} - end algorithm interface member group*/
-
 /*
  * Algorithm Flags
  */
@@ -554,6 +545,21 @@ enum vp8e_enc_control_id {
    * Supported in codecs: VP9
    */
   VP9E_SET_RENDER_SIZE,
+
+  /*!\brief Codec control function to set target level.
+   *
+   * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
+   * 11: target for level 1.1; ... 62: target for level 6.2
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_TARGET_LEVEL,
+
+  /*!\brief Codec control function to get bitstream level.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_GET_LEVEL
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -809,6 +815,12 @@ VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
 
+VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
+
+VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LEVEL
+
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 1f02fd5..67c97bb 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -46,15 +46,6 @@ extern vpx_codec_iface_t  vpx_codec_vp9_dx_algo;
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
 
-/*!\name Algorithm interface for VP10
- *
- * This interface provides the capability to decode VP10 streams.
- * @{
- */
-extern vpx_codec_iface_t  vpx_codec_vp10_dx_algo;
-extern vpx_codec_iface_t *vpx_codec_vp10_dx(void);
-/*!@} - end algorithm interface member group*/
-
 /*!\enum vp8_dec_control_id
  * \brief VP8 decoder control functions
  *
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index e9e952c..7958c69 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
    * types, removing or reassigning enums, adding/removing/rearranging
    * fields to structures
    */
-#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
 
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format. */
diff --git a/vpx_dsp/add_noise.c b/vpx_dsp/add_noise.c
new file mode 100644
index 0000000..682b444
--- /dev/null
+++ b/vpx_dsp/add_noise.c
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_plane_add_noise_c(uint8_t *start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int width, unsigned int height, int pitch) {
+  unsigned int i, j;
+
+  for (i = 0; i < height; i++) {
+    uint8_t *pos = start + i * pitch;
+    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
+
+    for (j = 0; j < width; j++) {
+      int v = pos[j];
+
+      v = clamp(v - blackclamp[0], 0, 255);
+      v = clamp(v + bothclamp[0], 0, 255);
+      v = clamp(v - whiteclamp[0], 0, 255);
+
+      pos[j] = v + ref[j];
+    }
+  }
+}
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vpx_dsp/arm/avg_neon.c
similarity index 55%
rename from vp9/encoder/arm/neon/vp9_avg_neon.c
rename to vpx_dsp/arm/avg_neon.c
index d569ec9..e52958c 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -11,7 +11,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
@@ -24,7 +24,19 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
   return vget_lane_u32(c, 0);
 }
 
-unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
+unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
   uint8x8_t v_s0 = vld1_u8(s);
   const uint8x8_t v_s1 = vld1_u8(s + p);
   uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
@@ -50,7 +62,34 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
 
-void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
   uint16x8_t vec_sum_lo = vdupq_n_u16(0);
@@ -103,7 +142,7 @@ void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
   vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
 }
 
-int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
   int i;
   uint16x8_t vec_sum = vdupq_n_u16(0);
 
@@ -119,7 +158,7 @@ int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
 
 // ref, src = [0, 510] - max diff = 16-bits
 // bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
   int width = 4 << bwl;
   int32x4_t sse = vdupq_n_s32(0);
   int16x8_t total = vdupq_n_s16(0);
@@ -158,3 +197,60 @@ int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
     return s - ((t * t) >> shift_factor);
   }
 }
+
+void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride,
+                         const uint8_t *b, int b_stride,
+                         int *min, int *max) {
+  // Load and concatenate.
+  const uint8x16_t a01 = vcombine_u8(vld1_u8(a),
+                                     vld1_u8(a + a_stride));
+  const uint8x16_t a23 = vcombine_u8(vld1_u8(a + 2 * a_stride),
+                                     vld1_u8(a + 3 * a_stride));
+  const uint8x16_t a45 = vcombine_u8(vld1_u8(a + 4 * a_stride),
+                                     vld1_u8(a + 5 * a_stride));
+  const uint8x16_t a67 = vcombine_u8(vld1_u8(a + 6 * a_stride),
+                                     vld1_u8(a + 7 * a_stride));
+
+  const uint8x16_t b01 = vcombine_u8(vld1_u8(b),
+                                     vld1_u8(b + b_stride));
+  const uint8x16_t b23 = vcombine_u8(vld1_u8(b + 2 * b_stride),
+                                     vld1_u8(b + 3 * b_stride));
+  const uint8x16_t b45 = vcombine_u8(vld1_u8(b + 4 * b_stride),
+                                     vld1_u8(b + 5 * b_stride));
+  const uint8x16_t b67 = vcombine_u8(vld1_u8(b + 6 * b_stride),
+                                     vld1_u8(b + 7 * b_stride));
+
+  // Absolute difference.
+  const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+  const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+  const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+  const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+  // Max values between the Q vectors.
+  const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+  const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+  const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+  const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+  const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+  // Split to D and start doing pairwise.
+  uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+  uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Enough runs of vpmax/min propogate the max/min values to every position.
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  ab_max = vpmax_u8(ab_max, ab_max);
+  ab_min = vpmin_u8(ab_min, ab_min);
+
+  *min = *max = 0;  // Clear high bits
+  // Store directly to avoid costly neon->gpr transfer.
+  vst1_lane_u8((uint8_t *)max, ab_max, 0);
+  vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000..21e3e3d
--- /dev/null
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1,
+                                 int16x8_t *a2, int16x8_t *a3,
+                                 int16x8_t *a4, int16x8_t *a5,
+                                 int16x8_t *a6, int16x8_t *a7) {
+  const int16x8_t b0 = vaddq_s16(*a0, *a1);
+  const int16x8_t b1 = vsubq_s16(*a0, *a1);
+  const int16x8_t b2 = vaddq_s16(*a2, *a3);
+  const int16x8_t b3 = vsubq_s16(*a2, *a3);
+  const int16x8_t b4 = vaddq_s16(*a4, *a5);
+  const int16x8_t b5 = vsubq_s16(*a4, *a5);
+  const int16x8_t b6 = vaddq_s16(*a6, *a7);
+  const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+  const int16x8_t c0 = vaddq_s16(b0, b2);
+  const int16x8_t c1 = vaddq_s16(b1, b3);
+  const int16x8_t c2 = vsubq_s16(b0, b2);
+  const int16x8_t c3 = vsubq_s16(b1, b3);
+  const int16x8_t c4 = vaddq_s16(b4, b6);
+  const int16x8_t c5 = vaddq_s16(b5, b7);
+  const int16x8_t c6 = vsubq_s16(b4, b6);
+  const int16x8_t c7 = vsubq_s16(b5, b7);
+
+  *a0 = vaddq_s16(c0, c4);
+  *a1 = vsubq_s16(c2, c6);
+  *a2 = vsubq_s16(c0, c4);
+  *a3 = vaddq_s16(c2, c6);
+  *a4 = vaddq_s16(c3, c7);
+  *a5 = vsubq_s16(c3, c7);
+  *a6 = vsubq_s16(c1, c5);
+  *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1,
+                         int16x8_t *a2, int16x8_t *a3,
+                         int16x8_t *a4, int16x8_t *a5,
+                         int16x8_t *a6, int16x8_t *a7) {
+  // Swap 64 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 08 09 10 11 12 13 14 15
+  // a2: 16 17 18 19 20 21 22 23
+  // a3: 24 25 26 27 28 29 30 31
+  // a4: 32 33 34 35 36 37 38 39
+  // a5: 40 41 42 43 44 45 46 47
+  // a6: 48 49 50 51 52 53 54 55
+  // a7: 56 57 58 59 60 61 62 63
+  // to:
+  // a04_lo: 00 01 02 03 32 33 34 35
+  // a15_lo: 08 09 10 11 40 41 42 43
+  // a26_lo: 16 17 18 19 48 49 50 51
+  // a37_lo: 24 25 26 27 56 57 58 59
+  // a04_hi: 04 05 06 07 36 37 38 39
+  // a15_hi: 12 13 14 15 44 45 46 47
+  // a26_hi: 20 21 22 23 52 53 54 55
+  // a37_hi: 28 29 30 31 60 61 62 63
+  const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+  const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+  const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+  const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+  const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+  const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+  const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+  const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+  // Swap 32 bit elements resulting in:
+  // a0246_lo:
+  // 00 01 16 17 32 33 48 49
+  // 02 03 18 19 34 35 50 51
+  // a1357_lo:
+  // 08 09 24 25 40 41 56 57
+  // 10 11 26 27 42 43 58 59
+  // a0246_hi:
+  // 04 05 20 21 36 37 52 53
+  // 06 07 22 23 38 39 54 55
+  // a1657_hi:
+  // 12 13 28 29 44 45 60 61
+  // 14 15 30 31 46 47 62 63
+  const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo),
+                                         vreinterpretq_s32_s16(a26_lo));
+  const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo),
+                                         vreinterpretq_s32_s16(a37_lo));
+  const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi),
+                                         vreinterpretq_s32_s16(a26_hi));
+  const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi),
+                                         vreinterpretq_s32_s16(a37_hi));
+
+  // Swap 16 bit elements resulting in:
+  // b0:
+  // 00 08 16 24 32 40 48 56
+  // 01 09 17 25 33 41 49 57
+  // b1:
+  // 02 10 18 26 34 42 50 58
+  // 03 11 19 27 35 43 51 59
+  // b2:
+  // 04 12 20 28 36 44 52 60
+  // 05 13 21 29 37 45 53 61
+  // b3:
+  // 06 14 22 30 38 46 54 62
+  // 07 15 23 31 39 47 55 63
+  const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[0]));
+  const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+                                   vreinterpretq_s16_s32(a1357_lo.val[1]));
+  const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[0]));
+  const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+                                   vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+  *a0 = b0.val[0];
+  *a1 = b0.val[1];
+  *a2 = b1.val[0];
+  *a3 = b1.val[1];
+  *a4 = b2.val[0];
+  *a5 = b2.val[1];
+  *a6 = b3.val[0];
+  *a7 = b3.val[1];
+}
+
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+                           int16_t *coeff) {
+  int16x8_t a0 = vld1q_s16(src_diff);
+  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  // Skip the second transpose because it is not required.
+
+  vst1q_s16(coeff + 0, a0);
+  vst1q_s16(coeff + 8, a1);
+  vst1q_s16(coeff + 16, a2);
+  vst1q_s16(coeff + 24, a3);
+  vst1q_s16(coeff + 32, a4);
+  vst1q_s16(coeff + 40, a5);
+  vst1q_s16(coeff + 48, a6);
+  vst1q_s16(coeff + 56, a7);
+}
+
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int i;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = vld1q_s16(coeff + 0);
+    const int16x8_t a1 = vld1q_s16(coeff + 64);
+    const int16x8_t a2 = vld1q_s16(coeff + 128);
+    const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+    const int16x8_t b0 = vhaddq_s16(a0, a1);
+    const int16x8_t b1 = vhsubq_s16(a0, a1);
+    const int16x8_t b2 = vhaddq_s16(a2, a3);
+    const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+    const int16x8_t c0 = vaddq_s16(b0, b2);
+    const int16x8_t c1 = vaddq_s16(b1, b3);
+    const int16x8_t c2 = vsubq_s16(b0, b2);
+    const int16x8_t c3 = vsubq_s16(b1, b3);
+
+    vst1q_s16(coeff + 0, c0);
+    vst1q_s16(coeff + 64, c1);
+    vst1q_s16(coeff + 128, c2);
+    vst1q_s16(coeff + 192, c3);
+
+    coeff += 8;
+  }
+}
diff --git a/vpx_dsp/arm/loopfilter_4_neon.asm b/vpx_dsp/arm/loopfilter_4_neon.asm
index e45e34c..9371158 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.asm
+++ b/vpx_dsp/arm/loopfilter_4_neon.asm
@@ -16,37 +16,28 @@
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_horizontal_4_neon(uint8_t *s,
 ;                                int p /* pitch */,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
+;                                const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_horizontal_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #8]              ; load count
     ldr         r2, [sp, #4]               ; load thresh
     add         r1, r1, r1                 ; double pitch
 
-    cmp         r12, #0
-    beq         end_vpx_lf_h_edge
-
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
-count_lf_h_loop
     sub         r2, r0, r1, lsl #1         ; move src pointer down by 4 lines
     add         r3, r2, r1, lsr #1         ; set to 3 lines down
 
@@ -69,47 +60,34 @@ count_lf_h_loop
     vst1.u8     {d6}, [r2 at 64], r1          ; store oq0
     vst1.u8     {d7}, [r3 at 64], r1          ; store oq1
 
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_lf_h_loop
-
-end_vpx_lf_h_edge
     pop         {pc}
     ENDP        ; |vpx_lpf_horizontal_4_neon|
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_vertical_4_neon(uint8_t *s,
 ;                              int p /* pitch */,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
+;                              const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_vertical_4_neon| PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #8]             ; load count
     vld1.8      {d1[]}, [r3]              ; duplicate *limit
 
     ldr         r3, [sp, #4]              ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vpx_lf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
-count_lf_v_loop
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -149,12 +127,6 @@ count_lf_v_loop
     vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
     vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
 
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_lf_v_loop
-
-end_vpx_lf_v_edge
     pop         {pc}
     ENDP        ; |vpx_lpf_vertical_4_neon|
 
diff --git a/vpx_dsp/arm/loopfilter_4_neon.c b/vpx_dsp/arm/loopfilter_4_neon.c
index 7ad411a..7f3ee70 100644
--- a/vpx_dsp/arm/loopfilter_4_neon.c
+++ b/vpx_dsp/arm/loopfilter_4_neon.c
@@ -115,22 +115,18 @@ void vpx_lpf_horizontal_4_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vpx_lf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = psrc + i * 8;
 
         d3u8 = vld1_u8(s);
@@ -170,8 +166,7 @@ void vpx_lpf_vertical_4_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i, pitch8;
     uint8_t *s;
     uint8x8_t dblimit, dlimit, dthresh;
@@ -181,15 +176,12 @@ void vpx_lpf_vertical_4_neon(
     uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
     uint8x8x4_t d4Result;
 
-    if (count == 0)  // end_vpx_lf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     pitch8 = pitch * 8;
-    for (i = 0; i < count; i++, src += pitch8) {
+    for (i = 0; i < 1; i++, src += pitch8) {
         s = src - (i + 1) * 4;
 
         d3u8 = vld1_u8(s);
diff --git a/vpx_dsp/arm/loopfilter_8_neon.asm b/vpx_dsp/arm/loopfilter_8_neon.asm
index e81734c..a2f20e1 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.asm
+++ b/vpx_dsp/arm/loopfilter_8_neon.asm
@@ -16,35 +16,26 @@
 
 ; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
 ; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
 ;
 ; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
 ;                                const uint8_t *blimit,
 ;                                const uint8_t *limit,
-;                                const uint8_t *thresh,
-;                                int count)
+;                                const uint8_t *thresh)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_horizontal_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               ; duplicate *blimit
-    ldr         r12, [sp, #16]             ; load count
     ldr         r2, [sp, #12]              ; load thresh
     add         r1, r1, r1                 ; double pitch
 
-    cmp         r12, #0
-    beq         end_vpx_mblf_h_edge
-
     vld1.8      {d1[]}, [r3]               ; duplicate *limit
     vld1.8      {d2[]}, [r2]               ; duplicate *thresh
 
-count_mblf_h_loop
     sub         r3, r0, r1, lsl #1         ; move src pointer down by 4 lines
     add         r2, r3, r1, lsr #1         ; set to 3 lines down
 
@@ -69,11 +60,6 @@ count_mblf_h_loop
     vst1.u8     {d4}, [r2 at 64], r1          ; store oq1
     vst1.u8     {d5}, [r3 at 64], r1          ; store oq2
 
-    add         r0, r0, #8
-    subs        r12, r12, #1
-    bne         count_mblf_h_loop
-
-end_vpx_mblf_h_edge
     pop         {r4-r5, pc}
 
     ENDP        ; |vpx_lpf_horizontal_8_neon|
@@ -82,30 +68,24 @@ end_vpx_mblf_h_edge
 ;                              int pitch,
 ;                              const uint8_t *blimit,
 ;                              const uint8_t *limit,
-;                              const uint8_t *thresh,
-;                              int count)
+;                              const uint8_t *thresh)
 ;
 ; r0    uint8_t *s,
 ; r1    int pitch,
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-; sp+4  int count
 |vpx_lpf_vertical_8_neon| PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              ; duplicate *blimit
-    ldr         r12, [sp, #16]            ; load count
     vld1.8      {d1[]}, [r3]              ; duplicate *limit
 
     ldr         r3, [sp, #12]             ; load thresh
     sub         r2, r0, #4                ; move s pointer down by 4 columns
-    cmp         r12, #0
-    beq         end_vpx_mblf_v_edge
 
     vld1.8      {d2[]}, [r3]              ; duplicate *thresh
 
-count_mblf_v_loop
     vld1.u8     {d3}, [r2], r1             ; load s data
     vld1.u8     {d4}, [r2], r1
     vld1.u8     {d5}, [r2], r1
@@ -156,12 +136,6 @@ count_mblf_v_loop
     vst2.8      {d4[6], d5[6]}, [r3], r1
     vst2.8      {d4[7], d5[7]}, [r3]
 
-    add         r0, r0, r1, lsl #3         ; s += pitch * 8
-    subs        r12, r12, #1
-    subne       r2, r0, #4                 ; move s pointer down by 4 columns
-    bne         count_mblf_v_loop
-
-end_vpx_mblf_v_edge
     pop         {r4-r5, pc}
     ENDP        ; |vpx_lpf_vertical_8_neon|
 
diff --git a/vpx_dsp/arm/loopfilter_8_neon.c b/vpx_dsp/arm/loopfilter_8_neon.c
index a887e2e..ec37573 100644
--- a/vpx_dsp/arm/loopfilter_8_neon.c
+++ b/vpx_dsp/arm/loopfilter_8_neon.c
@@ -268,23 +268,19 @@ void vpx_lpf_horizontal_8_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s, *psrc;
     uint8x8_t dblimit, dlimit, dthresh;
     uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
     uint8x8_t d16u8, d17u8, d18u8;
 
-    if (count == 0)  // end_vpx_mblf_h_edge
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
     psrc = src - (pitch << 2);
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = psrc + i * 8;
 
         d3u8  = vld1_u8(s);
@@ -328,8 +324,7 @@ void vpx_lpf_vertical_8_neon(
         int pitch,
         const uint8_t *blimit,
         const uint8_t *limit,
-        const uint8_t *thresh,
-        int count) {
+        const uint8_t *thresh) {
     int i;
     uint8_t *s;
     uint8x8_t dblimit, dlimit, dthresh;
@@ -341,14 +336,11 @@ void vpx_lpf_vertical_8_neon(
     uint8x8x4_t d4Result;
     uint8x8x2_t d2Result;
 
-    if (count == 0)
-        return;
-
     dblimit = vld1_u8(blimit);
     dlimit = vld1_u8(limit);
     dthresh = vld1_u8(thresh);
 
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < 1; i++) {
         s = src + (i * (pitch << 3)) - 4;
 
         d3u8 = vld1_u8(s);
diff --git a/vpx_dsp/arm/loopfilter_mb_neon.asm b/vpx_dsp/arm/loopfilter_mb_neon.asm
index 20d9cfb..d5da7a8 100644
--- a/vpx_dsp/arm/loopfilter_mb_neon.asm
+++ b/vpx_dsp/arm/loopfilter_mb_neon.asm
@@ -8,27 +8,28 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vpx_lpf_horizontal_16_neon|
+    EXPORT  |vpx_lpf_horizontal_edge_8_neon|
+    EXPORT  |vpx_lpf_horizontal_edge_16_neon|
     EXPORT  |vpx_lpf_vertical_16_neon|
     ARM
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; void vpx_lpf_horizontal_16_neon(uint8_t *s, int p,
-;                                 const uint8_t *blimit,
-;                                 const uint8_t *limit,
-;                                 const uint8_t *thresh
-;                                 int count)
+; void mb_lpf_horizontal_edge(uint8_t *s, int p,
+;                             const uint8_t *blimit,
+;                             const uint8_t *limit,
+;                             const uint8_t *thresh,
+;                             int count)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vpx_lpf_horizontal_16_neon| PROC
+; r12   int count
+|mb_lpf_horizontal_edge| PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              ; load thresh
-    ldr         r12, [sp, #92]             ; load count
 
 h_count
     vld1.8      {d16[]}, [r2]              ; load *blimit
@@ -115,7 +116,35 @@ h_next
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-    ENDP        ; |vpx_lpf_horizontal_16_neon|
+    ENDP        ; |mb_lpf_horizontal_edge|
+
+; void vpx_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch,
+;                                     const uint8_t *blimit,
+;                                     const uint8_t *limit,
+;                                     const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_8_neon| PROC
+    mov r12, #1
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_8_neon|
+
+; void vpx_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch,
+;                                      const uint8_t *blimit,
+;                                      const uint8_t *limit,
+;                                      const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int pitch,
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh
+|vpx_lpf_horizontal_edge_16_neon| PROC
+    mov r12, #2
+    b mb_lpf_horizontal_edge
+    ENDP        ; |vpx_lpf_horizontal_edge_16_neon|
 
 ; void vpx_lpf_vertical_16_neon(uint8_t *s, int p,
 ;                               const uint8_t *blimit,
diff --git a/vpx_dsp/arm/loopfilter_neon.c b/vpx_dsp/arm/loopfilter_neon.c
index eff87d2..aa31f29 100644
--- a/vpx_dsp/arm/loopfilter_neon.c
+++ b/vpx_dsp/arm/loopfilter_neon.c
@@ -21,8 +21,8 @@ void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 #if HAVE_NEON_ASM
@@ -33,8 +33,8 @@ void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
@@ -44,8 +44,8 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p,
diff --git a/vp9/encoder/vp9_avg.c b/vpx_dsp/avg.c
similarity index 84%
rename from vp9/encoder/vp9_avg.c
rename to vpx_dsp/avg.c
index a9a4c30..a8c9966 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vpx_dsp/avg.c
@@ -7,11 +7,12 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
   int i, j;
   int sum = 0;
   for (i = 0; i < 8; ++i, s+=p)
@@ -20,7 +21,7 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
   return (sum + 32) >> 6;
 }
 
-unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
   int i, j;
   int sum = 0;
   for (i = 0; i < 4; ++i, s+=p)
@@ -61,7 +62,9 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
   coeff[5] = c3 - c7;
 }
 
-void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
+void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
                         int16_t *coeff) {
   int idx;
   int16_t buffer[64];
@@ -84,14 +87,14 @@ void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
 }
 
 // In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
                           int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+    const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
   }
 
   // coeff: 15 bit, dynamic range [-16320, 16320]
@@ -117,19 +120,19 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int16_t vp9_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
     satd += abs(coeff[i]);
 
   // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return (int16_t)satd;
+  return satd;
 }
 
 // Integer projection onto row vectors.
 // height: value range {16, 32, 64}.
-void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
                        const int ref_stride, const int height) {
   int idx;
   const int norm_factor = height >> 1;
@@ -146,7 +149,7 @@ void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
 }
 
 // width: value range {16, 32, 64}.
-int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width) {
   int idx;
   int16_t sum = 0;
   // sum: 14 bit, dynamic range [0, 16320]
@@ -158,7 +161,7 @@ int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
 // ref: [0 - 510]
 // src: [0 - 510]
 // bwl: {2, 3, 4}
-int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src,
                      const int bwl) {
   int i;
   int width = 4 << bwl;
@@ -175,7 +178,7 @@ int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
   return var;
 }
 
-void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                       int *min, int *max) {
   int i, j;
   *min = 255;
@@ -190,7 +193,7 @@ void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
   int sum = 0;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -200,7 +203,7 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   return (sum + 32) >> 6;
 }
 
-unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
   int i, j;
   int sum = 0;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
@@ -210,7 +213,7 @@ unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
   return (sum + 8) >> 4;
 }
 
-void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
                              int dp, int *min, int *max) {
   int i, j;
   const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c
index 6ad806a..8140e78 100644
--- a/vpx_dsp/bitreader.c
+++ b/vpx_dsp/bitreader.c
@@ -69,7 +69,7 @@ void vpx_reader_fill(vpx_reader *r) {
       buffer += (bits >> 3);
       value = r->value | (nv << (shift & 0x7));
   } else {
-    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
     int loop_end = 0;
     if (bits_over >= 0) {
       count += LOTS_OF_BITS;
diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h
index e817c8b..9a441b4 100644
--- a/vpx_dsp/bitreader.h
+++ b/vpx_dsp/bitreader.h
@@ -98,7 +98,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) {
   }
 
   {
-    register unsigned int shift = vpx_norm[range];
+    register int shift = vpx_norm[range];
     range <<= shift;
     value <<= shift;
     count -= shift;
diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c
index bb91726..d7b55cf 100644
--- a/vpx_dsp/bitreader_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c
@@ -45,7 +45,7 @@ int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
                                    int bits) {
 #if CONFIG_MISC_FIXES
   const int nbits = sizeof(unsigned) * 8 - bits - 1;
-  const unsigned value = vpx_rb_read_literal(rb, bits + 1) << nbits;
+  const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
   return ((int) value) >> nbits;
 #else
   return vpx_rb_read_signed_literal(rb, bits);
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index f6ca9b9..d904997 100644
--- a/vpx_dsp/bitwriter.h
+++ b/vpx_dsp/bitwriter.h
@@ -35,7 +35,7 @@ static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   int count = br->count;
   unsigned int range = br->range;
   unsigned int lowvalue = br->lowvalue;
-  register unsigned int shift;
+  register int shift;
 
   split = 1 + (((range - 1) * probability) >> 8);
 
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index 7baaa8b..4c0d5db 100644
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/fwd_txfm.h"
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -85,7 +86,6 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
       sum += input[r * stride + c];
 
   output[0] = sum << 1;
-  output[1] = 0;
 }
 
 void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
@@ -182,7 +182,6 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
       sum += input[r * stride + c];
 
   output[0] = sum;
-  output[1] = 0;
 }
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -367,13 +366,12 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  tran_low_t sum = 0;
+  int sum = 0;
   for (r = 0; r < 16; ++r)
     for (c = 0; c < 16; ++c)
       sum += input[r * stride + c];
 
-  output[0] = sum >> 1;
-  output[1] = 0;
+  output[0] = (tran_low_t)(sum >> 1);
 }
 
 static INLINE tran_high_t dct_32_round(tran_high_t input) {
@@ -771,13 +769,12 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
 
 void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
   int r, c;
-  tran_low_t sum = 0;
+  int sum = 0;
   for (r = 0; r < 32; ++r)
     for (c = 0; c < 32; ++c)
       sum += input[r * stride + c];
 
-  output[0] = sum >> 3;
-  output[1] = 0;
+  output[0] = (tran_low_t)(sum >> 3);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c
index a9669e5..cc4a74b 100644
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c
@@ -44,6 +44,7 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
       dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
 }
 
+#if CONFIG_MISC_FIXES
 static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                    const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -58,6 +59,7 @@ static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
     dst += stride;
   }
 }
+#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
@@ -76,6 +78,7 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+#if CONFIG_MISC_FIXES
 static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -89,6 +92,7 @@ static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
     dst += stride;
   }
 }
+#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
@@ -109,6 +113,7 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+#if CONFIG_MISC_FIXES
 static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -121,6 +126,7 @@ static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
     dst += stride;
   }
 }
+#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
@@ -152,20 +158,29 @@ static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
 
 static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
-  int r, c;
-  dst[0] = AVG3(left[0], above[-1], above[0]);
-  for (c = 1; c < bs; c++)
-    dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+  int i;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+  // silence a spurious -Warray-bounds warning, possibly related to:
+  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+  uint8_t border[69];
+#else
+  uint8_t border[32 + 32 - 1];  // outer border from bottom-left to top-right
+#endif
 
-  dst[stride] = AVG3(above[-1], left[0], left[1]);
-  for (r = 2; r < bs; ++r)
-    dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+  // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
+  for (i = 0; i < bs - 2; ++i) {
+    border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+  }
+  border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+  border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+  border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+  // dst[0][2, size), i.e., remaining top border ascending
+  for (i = 0; i < bs - 2; ++i) {
+    border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+  }
 
-  dst += stride;
-  for (r = 1; r < bs; ++r) {
-    for (c = 1; c < bs; c++)
-      dst[c] = dst[-stride + c - 1];
-    dst += stride;
+  for (i = 0; i < bs; ++i) {
+    memcpy(dst + i * stride, border + bs - 1 - i, bs);
   }
 }
 
@@ -311,6 +326,7 @@ void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
   const int K = above[2];
   const int L = above[3];
   const int M = above[4];
+  (void)left;
 
   dst[0] = AVG3(H, I, J);
   dst[1] = AVG3(I, J, K);
@@ -528,6 +544,7 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
   }
 }
 
+#if CONFIG_MISC_FIXES
 static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bs, const uint16_t *above,
                                           const uint16_t *left, int bd) {
@@ -544,6 +561,7 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
     dst += stride;
   }
 }
+#endif  // CONFIG_MISC_FIXES
 
 static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
@@ -579,6 +597,7 @@ static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
   }
 }
 
+#if CONFIG_MISC_FIXES
 static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
@@ -593,6 +612,7 @@ static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
     dst += stride;
   }
 }
+#endif  // CONFIG_MISC_FIXES
 
 static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 5f3cfdd..e18d31d 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/inv_txfm.h"
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -34,10 +35,10 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, 8);
-    op[1] = WRAPLOW(b1, 8);
-    op[2] = WRAPLOW(c1, 8);
-    op[3] = WRAPLOW(d1, 8);
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
     ip += 4;
     op += 4;
   }
@@ -55,10 +56,10 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
 
     ip++;
     dest++;
@@ -75,8 +76,8 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, 8);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -97,18 +98,18 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], 8);
-  output[1] = WRAPLOW(step[1] + step[2], 8);
-  output[2] = WRAPLOW(step[1] - step[2], 8);
-  output[3] = WRAPLOW(step[0] - step[3], 8);
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
 }
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -140,8 +141,8 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                          int dest_stride) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -163,48 +164,48 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   temp1 = (step1[0] + step1[2]) * cospi_16_64;
   temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], 8);
-  output[1] = WRAPLOW(step1[1] + step1[6], 8);
-  output[2] = WRAPLOW(step1[2] + step1[5], 8);
-  output[3] = WRAPLOW(step1[3] + step1[4], 8);
-  output[4] = WRAPLOW(step1[3] - step1[4], 8);
-  output[5] = WRAPLOW(step1[2] - step1[5], 8);
-  output[6] = WRAPLOW(step1[1] - step1[6], 8);
-  output[7] = WRAPLOW(step1[0] - step1[7], 8);
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -235,8 +236,8 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -265,7 +266,7 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = x0 - x2 + x3;
+  s7 = WRAPLOW(x0 - x2 + x3);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -276,10 +277,10 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
-  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -310,14 +311,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_26_64 * x6 + cospi_6_64  * x7);
   s7 = (int)(cospi_6_64  * x6 - cospi_26_64 * x7);
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
 
   // stage 2
   s0 = (int)x0;
@@ -329,14 +330,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
   s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
 
-  x0 = WRAPLOW(s0 + s2, 8);
-  x1 = WRAPLOW(s1 + s3, 8);
-  x2 = WRAPLOW(s0 - s2, 8);
-  x3 = WRAPLOW(s1 - s3, 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
 
   // stage 3
   s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -344,19 +345,19 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   s6 = (int)(cospi_16_64 * (x6 + x7));
   s7 = (int)(cospi_16_64 * (x6 - x7));
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x4, 8);
-  output[2] = WRAPLOW(x6, 8);
-  output[3] = WRAPLOW(-x2, 8);
-  output[4] = WRAPLOW(x3, 8);
-  output[5] = WRAPLOW(-x7, 8);
-  output[6] = WRAPLOW(x5, 8);
-  output[7] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
 }
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -419,23 +420,23 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 3
   step1[0] = step2[0];
@@ -445,109 +446,109 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], 8);
-  output[1] = WRAPLOW(step2[1] + step2[14], 8);
-  output[2] = WRAPLOW(step2[2] + step2[13], 8);
-  output[3] = WRAPLOW(step2[3] + step2[12], 8);
-  output[4] = WRAPLOW(step2[4] + step2[11], 8);
-  output[5] = WRAPLOW(step2[5] + step2[10], 8);
-  output[6] = WRAPLOW(step2[6] + step2[9], 8);
-  output[7] = WRAPLOW(step2[7] + step2[8], 8);
-  output[8] = WRAPLOW(step2[7] - step2[8], 8);
-  output[9] = WRAPLOW(step2[6] - step2[9], 8);
-  output[10] = WRAPLOW(step2[5] - step2[10], 8);
-  output[11] = WRAPLOW(step2[4] - step2[11], 8);
-  output[12] = WRAPLOW(step2[3] - step2[12], 8);
-  output[13] = WRAPLOW(step2[2] - step2[13], 8);
-  output[14] = WRAPLOW(step2[1] - step2[14], 8);
-  output[15] = WRAPLOW(step2[0] - step2[15], 8);
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -624,22 +625,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
 
   // stage 2
   s0 = x0;
@@ -659,22 +660,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, 8);
-  x1 = WRAPLOW(s1 + s5, 8);
-  x2 = WRAPLOW(s2 + s6, 8);
-  x3 = WRAPLOW(s3 + s7, 8);
-  x4 = WRAPLOW(s0 - s4, 8);
-  x5 = WRAPLOW(s1 - s5, 8);
-  x6 = WRAPLOW(s2 - s6, 8);
-  x7 = WRAPLOW(s3 - s7, 8);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
 
   // stage 3
   s0 = x0;
@@ -694,22 +695,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(check_range(s0 + s2), 8);
-  x1 = WRAPLOW(check_range(s1 + s3), 8);
-  x2 = WRAPLOW(check_range(s0 - s2), 8);
-  x3 = WRAPLOW(check_range(s1 - s3), 8);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-  x8 = WRAPLOW(check_range(s8 + s10), 8);
-  x9 = WRAPLOW(check_range(s9 + s11), 8);
-  x10 = WRAPLOW(check_range(s8 - s10), 8);
-  x11 = WRAPLOW(check_range(s9 - s11), 8);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -721,31 +722,31 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
-  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
-  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
-  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
-  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
-  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
-  x15 = WRAPLOW(dct_const_round_shift(s15), 8);
-
-  output[0] = WRAPLOW(x0, 8);
-  output[1] = WRAPLOW(-x8, 8);
-  output[2] = WRAPLOW(x12, 8);
-  output[3] = WRAPLOW(-x4, 8);
-  output[4] = WRAPLOW(x6, 8);
-  output[5] = WRAPLOW(x14, 8);
-  output[6] = WRAPLOW(x10, 8);
-  output[7] = WRAPLOW(x2, 8);
-  output[8] = WRAPLOW(x3, 8);
-  output[9] = WRAPLOW(x11, 8);
-  output[10] = WRAPLOW(x15, 8);
-  output[11] = WRAPLOW(x7, 8);
-  output[12] = WRAPLOW(x5, 8);
-  output[13] = WRAPLOW(-x13, 8);
-  output[14] = WRAPLOW(x9, 8);
-  output[15] = WRAPLOW(-x1, 8);
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
 }
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -778,8 +779,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -812,43 +813,43 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
 
   // stage 2
   step2[0] = step1[0];
@@ -862,40 +863,40 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
-  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
-  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
-  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
-  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
-  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
-  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
-  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
-  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
-  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
-  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
-  step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
 
   // stage 3
   step1[0] = step2[0];
@@ -905,42 +906,42 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
-  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
-  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
-  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
-  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
-  step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -949,87 +950,87 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
-  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
-  step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
-  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
-  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
-  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
-  step2[23] = WRAPLOW(step1[20] + step1[23], 8);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
-  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
-  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
-  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
-  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
-  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
-  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
-  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
-  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
-  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
-  step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -1038,62 +1039,62 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
-  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
-  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
-  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
-  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
-  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
-  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
-  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
-  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
-  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
-  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
-  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
-  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
-  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
-  step2[23] = WRAPLOW(step1[16] - step1[23], 8);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
-  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
-  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
-  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
-  step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
-  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
-  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
-  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
-  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
-  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
-  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
-  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
-  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
-  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
-  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
-  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
-  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
-  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
-  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
-  step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -1101,58 +1102,58 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], 8);
-  output[1] = WRAPLOW(step1[1] + step1[30], 8);
-  output[2] = WRAPLOW(step1[2] + step1[29], 8);
-  output[3] = WRAPLOW(step1[3] + step1[28], 8);
-  output[4] = WRAPLOW(step1[4] + step1[27], 8);
-  output[5] = WRAPLOW(step1[5] + step1[26], 8);
-  output[6] = WRAPLOW(step1[6] + step1[25], 8);
-  output[7] = WRAPLOW(step1[7] + step1[24], 8);
-  output[8] = WRAPLOW(step1[8] + step1[23], 8);
-  output[9] = WRAPLOW(step1[9] + step1[22], 8);
-  output[10] = WRAPLOW(step1[10] + step1[21], 8);
-  output[11] = WRAPLOW(step1[11] + step1[20], 8);
-  output[12] = WRAPLOW(step1[12] + step1[19], 8);
-  output[13] = WRAPLOW(step1[13] + step1[18], 8);
-  output[14] = WRAPLOW(step1[14] + step1[17], 8);
-  output[15] = WRAPLOW(step1[15] + step1[16], 8);
-  output[16] = WRAPLOW(step1[15] - step1[16], 8);
-  output[17] = WRAPLOW(step1[14] - step1[17], 8);
-  output[18] = WRAPLOW(step1[13] - step1[18], 8);
-  output[19] = WRAPLOW(step1[12] - step1[19], 8);
-  output[20] = WRAPLOW(step1[11] - step1[20], 8);
-  output[21] = WRAPLOW(step1[10] - step1[21], 8);
-  output[22] = WRAPLOW(step1[9] - step1[22], 8);
-  output[23] = WRAPLOW(step1[8] - step1[23], 8);
-  output[24] = WRAPLOW(step1[7] - step1[24], 8);
-  output[25] = WRAPLOW(step1[6] - step1[25], 8);
-  output[26] = WRAPLOW(step1[5] - step1[26], 8);
-  output[27] = WRAPLOW(step1[4] - step1[27], 8);
-  output[28] = WRAPLOW(step1[3] - step1[28], 8);
-  output[29] = WRAPLOW(step1[2] - step1[29], 8);
-  output[30] = WRAPLOW(step1[1] - step1[30], 8);
-  output[31] = WRAPLOW(step1[0] - step1[31], 8);
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
 }
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1194,6 +1195,33 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
   }
 }
 
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+
+  // Rows
+  // only upper-left 16x16 has non-zero coeff
+  for (i = 0; i < 16; ++i) {
+    idct32_c(input, outptr);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    idct32_c(temp_in, temp_out);
+    for (j = 0; j < 32; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+    }
+  }
+}
+
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
   tran_low_t out[32 * 32] = {0};
@@ -1225,8 +1253,8 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
 
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -1260,10 +1288,10 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    op[0] = WRAPLOW(a1, bd);
-    op[1] = WRAPLOW(b1, bd);
-    op[2] = WRAPLOW(c1, bd);
-    op[3] = WRAPLOW(d1, bd);
+    op[0] = HIGHBD_WRAPLOW(a1, bd);
+    op[1] = HIGHBD_WRAPLOW(b1, bd);
+    op[2] = HIGHBD_WRAPLOW(c1, bd);
+    op[3] = HIGHBD_WRAPLOW(d1, bd);
     ip += 4;
     op += 4;
   }
@@ -1281,10 +1309,14 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
-    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
-    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
-    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
+                                             HIGHBD_WRAPLOW(a1, bd), bd);
+    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
+                                             HIGHBD_WRAPLOW(b1, bd), bd);
+    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
+                                             HIGHBD_WRAPLOW(c1, bd), bd);
+    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
+                                             HIGHBD_WRAPLOW(d1, bd), bd);
 
     ip++;
     dest++;
@@ -1304,8 +1336,8 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
   e1 = a1 >> 1;
   a1 -= e1;
-  op[0] = WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+  op[0] = HIGHBD_WRAPLOW(a1, bd);
+  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
 
   ip = tmp;
   for (i = 0; i < 4; i++) {
@@ -1331,18 +1363,18 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
-  output[0] = WRAPLOW(step[0] + step[3], bd);
-  output[1] = WRAPLOW(step[1] + step[2], bd);
-  output[2] = WRAPLOW(step[1] - step[2], bd);
-  output[3] = WRAPLOW(step[0] - step[3], bd);
+  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
 }
 
 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1376,11 +1408,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int dest_stride, int bd) {
   int i;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1402,39 +1434,39 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vpx_highbd_idct4_c(step1, step1, bd);
 
   // stage 2 - odd half
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   // stage 3 - odd half
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = WRAPLOW(step1[0] - step1[7], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
 }
 
 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1468,10 +1500,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
@@ -1501,7 +1533,7 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s4 = sinpi_1_9 * x2;
   s5 = sinpi_2_9 * x3;
   s6 = sinpi_4_9 * x3;
-  s7 = (tran_high_t)(x0 - x2 + x3);
+  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
 
   s0 = s0 + s3 + s5;
   s1 = s1 - s4 - s6;
@@ -1512,10 +1544,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
-  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
-  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1546,14 +1578,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1565,14 +1597,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1580,19 +1612,19 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x4, bd);
-  output[2] = WRAPLOW(x6, bd);
-  output[3] = WRAPLOW(-x2, bd);
-  output[4] = WRAPLOW(x3, bd);
-  output[5] = WRAPLOW(-x7, bd);
-  output[6] = WRAPLOW(x5, bd);
-  output[7] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x4, bd);
+  output[2] = HIGHBD_WRAPLOW(x6, bd);
+  output[3] = HIGHBD_WRAPLOW(-x2, bd);
+  output[4] = HIGHBD_WRAPLOW(x3, bd);
+  output[5] = HIGHBD_WRAPLOW(-x7, bd);
+  output[6] = HIGHBD_WRAPLOW(x5, bd);
+  output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1657,23 +1689,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1683,109 +1715,109 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
   // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = WRAPLOW(step2[0] - step2[15], bd);
+  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 }
 
 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1861,22 +1893,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
-  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
-  x8  = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
-  x9  = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+  x8  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+  x9  = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1896,22 +1928,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
   s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
 
-  x0 = WRAPLOW(s0 + s4, bd);
-  x1 = WRAPLOW(s1 + s5, bd);
-  x2 = WRAPLOW(s2 + s6, bd);
-  x3 = WRAPLOW(s3 + s7, bd);
-  x4 = WRAPLOW(s0 - s4, bd);
-  x5 = WRAPLOW(s1 - s5, bd);
-  x6 = WRAPLOW(s2 - s6, bd);
-  x7 = WRAPLOW(s3 - s7, bd);
-  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
-  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -1931,22 +1963,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = WRAPLOW(s0 + s2, bd);
-  x1 = WRAPLOW(s1 + s3, bd);
-  x2 = WRAPLOW(s0 - s2, bd);
-  x3 = WRAPLOW(s1 - s3, bd);
-  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
-  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-  x8 = WRAPLOW(s8 + s10, bd);
-  x9 = WRAPLOW(s9 + s11, bd);
-  x10 = WRAPLOW(s8 - s10, bd);
-  x11 = WRAPLOW(s9 - s11, bd);
-  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
-  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -1958,31 +1990,31 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
-  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
-  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
-  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
-  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
-  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
-  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
-
-  output[0] = WRAPLOW(x0, bd);
-  output[1] = WRAPLOW(-x8, bd);
-  output[2] = WRAPLOW(x12, bd);
-  output[3] = WRAPLOW(-x4, bd);
-  output[4] = WRAPLOW(x6, bd);
-  output[5] = WRAPLOW(x14, bd);
-  output[6] = WRAPLOW(x10, bd);
-  output[7] = WRAPLOW(x2, bd);
-  output[8] = WRAPLOW(x3, bd);
-  output[9] = WRAPLOW(x11, bd);
-  output[10] = WRAPLOW(x15, bd);
-  output[11] = WRAPLOW(x7, bd);
-  output[12] = WRAPLOW(x5, bd);
-  output[13] = WRAPLOW(-x13, bd);
-  output[14] = WRAPLOW(x9, bd);
-  output[15] = WRAPLOW(-x1, bd);
+  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+
+  output[0] = HIGHBD_WRAPLOW(x0, bd);
+  output[1] = HIGHBD_WRAPLOW(-x8, bd);
+  output[2] = HIGHBD_WRAPLOW(x12, bd);
+  output[3] = HIGHBD_WRAPLOW(-x4, bd);
+  output[4] = HIGHBD_WRAPLOW(x6, bd);
+  output[5] = HIGHBD_WRAPLOW(x14, bd);
+  output[6] = HIGHBD_WRAPLOW(x10, bd);
+  output[7] = HIGHBD_WRAPLOW(x2, bd);
+  output[8] = HIGHBD_WRAPLOW(x3, bd);
+  output[9] = HIGHBD_WRAPLOW(x11, bd);
+  output[10] = HIGHBD_WRAPLOW(x15, bd);
+  output[11] = HIGHBD_WRAPLOW(x7, bd);
+  output[12] = HIGHBD_WRAPLOW(x5, bd);
+  output[13] = HIGHBD_WRAPLOW(-x13, bd);
+  output[14] = HIGHBD_WRAPLOW(x9, bd);
+  output[15] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2017,11 +2049,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
   int i, j;
   tran_high_t a1;
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
@@ -2056,43 +2088,43 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2106,40 +2138,40 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -2149,42 +2181,42 @@ static void highbd_idct32_c(const tran_low_t *input,
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
-  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2193,87 +2225,87 @@ static void highbd_idct32_c(const tran_low_t *input,
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
 
   step2[8] = step1[8];
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
-  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = WRAPLOW(step1[20] + step1[23], bd);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
 
   // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
-  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2282,62 +2314,62 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[31] = step2[31];
 
   // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
   step2[8] = step1[8];
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
-  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = WRAPLOW(step1[16] - step1[23], bd);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
+
+  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
 
   // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
 
   step1[16] = step2[16];
   step1[17] = step2[17];
@@ -2345,58 +2377,58 @@ static void highbd_idct32_c(const tran_low_t *input,
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
-  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
   step1[31] = step2[31];
 
   // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = WRAPLOW(step1[0] - step1[31], bd);
+  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
 }
 
 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2472,9 +2504,9 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t out = WRAPLOW(
-      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
-  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+  tran_low_t out = HIGHBD_WRAPLOW(
+      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 2358813..9cfe1be 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -21,7 +21,7 @@
 extern "C" {
 #endif
 
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid VP9 input streams, intermediate stage coefficients should always
   // stay within the range of a signed 16 bit integer. Coefficients can go out
@@ -32,17 +32,17 @@ static INLINE tran_low_t check_range(tran_high_t input) {
   assert(INT16_MIN <= input);
   assert(input <= INT16_MAX);
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return check_range(rv);
+  return (tran_high_t)rv;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input,
-                                            int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input,
+                                             int bd) {
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
   // For valid highbitdepth VP9 streams, intermediate stage coefficients will
   // stay within the ranges:
@@ -56,13 +56,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input,
   (void) int_min;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
   (void) bd;
-  return (tran_low_t)input;
+  return input;
 }
 
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
-                                                      int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return highbd_check_range(rv, bd);
+  return (tran_high_t)rv;
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -83,9 +82,20 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
 // bd of 10 uses trans_low with 18bits, need to remove 14bits
 // bd of 12 uses trans_low with 20bits, need to remove 12bits
 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#else   // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+    ((int32_t)highbd_check_range((x), bd))
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EMULATE_HARDWARE
 
 void idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -107,14 +117,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
                                              int bd) {
-  trans = WRAPLOW(trans, bd);
-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+  trans = HIGHBD_WRAPLOW(trans, bd);
+  return clip_pixel_highbd(dest + (int)trans, bd);
 }
 #endif
 
 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans, 8);
-  return clip_pixel(WRAPLOW(dest + trans, 8));
+  trans = WRAPLOW(trans);
+  return clip_pixel(dest + (int)trans);
 }
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 66f4d95..645a1ab 100644
--- a/vpx_dsp/loopfilter.c
+++ b/vpx_dsp/loopfilter.c
@@ -11,6 +11,7 @@
 #include <stdlib.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
@@ -119,12 +120,12 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
 
 void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                             const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh, int count) {
+                            const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -138,18 +139,17 @@ void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
+                          const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -163,9 +163,8 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                  thresh1, 1);
+  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
 static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
@@ -190,13 +189,12 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
 }
 
 void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh,
-                            int count) {
+                            const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -213,16 +211,15 @@ void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh,
-                          int count) {
+                          const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask = filter_mask(*limit, *blimit,
@@ -238,9 +235,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                const uint8_t *limit0, const uint8_t *thresh0,
                                const uint8_t *blimit1, const uint8_t *limit1,
                                const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
-                                    thresh1, 1);
+  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
 }
 
 static INLINE void filter16(int8_t mask, uint8_t thresh,
@@ -294,9 +290,9 @@ static INLINE void filter16(int8_t mask, uint8_t thresh,
   }
 }
 
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh,
-                             int count) {
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh, int count) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
@@ -320,6 +316,16 @@ void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
   }
 }
 
+void vpx_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,
+                                 const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
+                                  const uint8_t *limit, const uint8_t *thresh) {
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+}
+
 static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
@@ -450,12 +456,12 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
 
 void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                    const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh, int count, int bd) {
+                                   const uint8_t *thresh, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4 * p];
     const uint16_t p2 = s[-3 * p];
     const uint16_t p1 = s[-2 * p];
@@ -479,18 +485,18 @@ void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int p,
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
-                                 int count, int bd) {
+                                 int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
     const int8_t mask = highbd_filter_mask(*limit, *blimit,
@@ -508,9 +514,9 @@ void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
   vpx_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1,
-                              thresh1, 1, bd);
+                              thresh1, bd);
 }
 
 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
@@ -536,12 +542,12 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
 
 void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
-                                   int count, int bd) {
+                                   int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
     const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
@@ -564,16 +570,16 @@ void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int p,
                                         const uint8_t *limit1,
                                         const uint8_t *thresh1,
                                         int bd) {
-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
-                                 int count, int bd) {
+                                 int bd) {
   int i;
 
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < 8; ++i) {
     const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
     const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
     const int8_t mask = highbd_filter_mask(*limit, *blimit,
@@ -596,9 +602,9 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *limit1,
                                       const uint8_t *thresh1,
                                       int bd) {
-  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1, bd);
+  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
   vpx_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1,
-                              thresh1, 1, bd);
+                              thresh1, bd);
 }
 
 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
@@ -664,9 +670,11 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh,
   }
 }
 
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int count, int bd) {
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh,
+                                            int count, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
@@ -698,6 +706,20 @@ void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
   }
 }
 
+void vpx_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+}
+
+void vpx_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+}
+
 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
diff --git a/vpx_dsp/mips/add_noise_msa.c b/vpx_dsp/mips/add_noise_msa.c
new file mode 100644
index 0000000..366770c
--- /dev/null
+++ b/vpx_dsp/mips/add_noise_msa.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void vpx_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+                             char blackclamp[16], char whiteclamp[16],
+                             char bothclamp[16], uint32_t width,
+                             uint32_t height, int32_t pitch) {
+  uint32_t i, j;
+
+  for (i = 0; i < height / 2; ++i) {
+    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+    for (j = width / 16; j--;) {
+      v16i8 temp00_s, temp01_s;
+      v16u8 temp00, temp01, black_clamp, white_clamp;
+      v16u8 pos0, ref0, pos1, ref1;
+      v16i8 const127 = __msa_ldi_b(127);
+
+      pos0 = LD_UB(pos0_ptr);
+      ref0 = LD_UB(ref0_ptr);
+      pos1 = LD_UB(pos1_ptr);
+      ref1 = LD_UB(ref1_ptr);
+      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+      temp00 = (pos0 < black_clamp);
+      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+      temp01 = (pos1 < black_clamp);
+      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp00 = (v16u8)(temp00_s < pos0);
+      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+      temp01 = (temp01_s < pos1);
+      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+      XORI_B2_128_UB(pos0, pos1);
+      pos0 += ref0;
+      ST_UB(pos0, pos0_ptr);
+      pos1 += ref1;
+      ST_UB(pos1, pos1_ptr);
+      pos0_ptr += 16;
+      pos1_ptr += 16;
+      ref0_ptr += 16;
+      ref1_ptr += 16;
+    }
+  }
+}
diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vpx_dsp/mips/avg_msa.c
similarity index 91%
rename from vp9/encoder/mips/msa/vp9_avg_msa.c
rename to vpx_dsp/mips/avg_msa.c
index 611adb1..52a24ed 100644
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ b/vpx_dsp/mips/avg_msa.c
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
-uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
   uint32_t sum_out;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
@@ -33,7 +33,7 @@ uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
   return sum_out;
 }
 
-uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
   uint32_t sum_out;
   uint32_t src0, src1, src2, src3;
   v16u8 vec = { 0 };
diff --git a/vpx_dsp/mips/fwd_dct32x32_msa.c b/vpx_dsp/mips/fwd_dct32x32_msa.c
index 2115a34..f29c14b 100644
--- a/vpx_dsp/mips/fwd_dct32x32_msa.c
+++ b/vpx_dsp/mips/fwd_dct32x32_msa.c
@@ -933,23 +933,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
 }
 
 void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[1] = 0;
-
-  out[0] = LD_HADD(input, stride);
-  out[0] += LD_HADD(input + 8, stride);
-  out[0] += LD_HADD(input + 16, stride);
-  out[0] += LD_HADD(input + 24, stride);
-  out[0] += LD_HADD(input + 32 * 8, stride);
-  out[0] += LD_HADD(input + 32 * 8 + 8, stride);
-  out[0] += LD_HADD(input + 32 * 8 + 16, stride);
-  out[0] += LD_HADD(input + 32 * 8 + 24, stride);
-  out[0] += LD_HADD(input + 32 * 16, stride);
-  out[0] += LD_HADD(input + 32 * 16 + 8, stride);
-  out[0] += LD_HADD(input + 32 * 16 + 16, stride);
-  out[0] += LD_HADD(input + 32 * 16 + 24, stride);
-  out[0] += LD_HADD(input + 32 * 24, stride);
-  out[0] += LD_HADD(input + 32 * 24 + 8, stride);
-  out[0] += LD_HADD(input + 32 * 24 + 16, stride);
-  out[0] += LD_HADD(input + 32 * 24 + 24, stride);
-  out[0] >>= 3;
+  int sum = LD_HADD(input, stride);
+  sum += LD_HADD(input + 8, stride);
+  sum += LD_HADD(input + 16, stride);
+  sum += LD_HADD(input + 24, stride);
+  sum += LD_HADD(input + 32 * 8, stride);
+  sum += LD_HADD(input + 32 * 8 + 8, stride);
+  sum += LD_HADD(input + 32 * 8 + 16, stride);
+  sum += LD_HADD(input + 32 * 8 + 24, stride);
+  sum += LD_HADD(input + 32 * 16, stride);
+  sum += LD_HADD(input + 32 * 16 + 8, stride);
+  sum += LD_HADD(input + 32 * 16 + 16, stride);
+  sum += LD_HADD(input + 32 * 16 + 24, stride);
+  sum += LD_HADD(input + 32 * 24, stride);
+  sum += LD_HADD(input + 32 * 24 + 8, stride);
+  sum += LD_HADD(input + 32 * 24 + 16, stride);
+  sum += LD_HADD(input + 32 * 24 + 24, stride);
+  out[0] = (int16_t)(sum >> 3);
 }
diff --git a/vpx_dsp/mips/fwd_txfm_msa.c b/vpx_dsp/mips/fwd_txfm_msa.c
index f66dd5f..0dd141f 100644
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -237,11 +237,9 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
 }
 
 void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
-  out[1] = 0;
-
-  out[0] = LD_HADD(input, stride);
-  out[0] += LD_HADD(input + 8, stride);
-  out[0] += LD_HADD(input + 16 * 8, stride);
-  out[0] += LD_HADD(input + 16 * 8 + 8, stride);
-  out[0] >>= 1;
+  int sum = LD_HADD(input, stride);
+  sum += LD_HADD(input + 8, stride);
+  sum += LD_HADD(input + 16 * 8, stride);
+  sum += LD_HADD(input + 16 * 8 + 8, stride);
+  out[0] = (int16_t)(sum >> 1);
 }
diff --git a/vpx_dsp/mips/loopfilter_16_msa.c b/vpx_dsp/mips/loopfilter_16_msa.c
index b7c9f7b..a6c581d 100644
--- a/vpx_dsp/mips/loopfilter_16_msa.c
+++ b/vpx_dsp/mips/loopfilter_16_msa.c
@@ -423,11 +423,11 @@ void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
   }
 }
 
-void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
-                               const uint8_t *b_limit_ptr,
-                               const uint8_t *limit_ptr,
-                               const uint8_t *thresh_ptr,
-                               int32_t count) {
+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr,
+                                   int32_t count) {
   if (1 == count) {
     uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
     uint64_t dword0, dword1;
@@ -648,6 +648,20 @@ void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
   }
 }
 
+void vpx_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                    uint8_t *output, int32_t out_pitch) {
   v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
diff --git a/vpx_dsp/mips/loopfilter_4_msa.c b/vpx_dsp/mips/loopfilter_4_msa.c
index daf5f38..9363470 100644
--- a/vpx_dsp/mips/loopfilter_4_msa.c
+++ b/vpx_dsp/mips/loopfilter_4_msa.c
@@ -13,14 +13,11 @@
 void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr,
-                              int32_t count) {
+                              const uint8_t *thresh_ptr) {
   uint64_t p1_d, p0_d, q0_d, q1_d;
   v16u8 mask, hev, flat, thresh, b_limit, limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
@@ -74,14 +71,11 @@ void vpx_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
 void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr,
-                            int32_t count) {
+                            const uint8_t *thresh_ptr) {
   v16u8 mask, hev, flat, limit, thresh, b_limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v8i16 vec0, vec1, vec2, vec3;
 
-  (void)count;
-
   LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
   thresh = (v16u8)__msa_fill_b(*thresh_ptr);
diff --git a/vpx_dsp/mips/loopfilter_8_msa.c b/vpx_dsp/mips/loopfilter_8_msa.c
index 00b6db5..5b22bd0 100644
--- a/vpx_dsp/mips/loopfilter_8_msa.c
+++ b/vpx_dsp/mips/loopfilter_8_msa.c
@@ -13,8 +13,7 @@
 void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr,
-                              int32_t count) {
+                              const uint8_t *thresh_ptr) {
   uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
   v16u8 mask, hev, flat, thresh, b_limit, limit;
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
@@ -23,8 +22,6 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
   v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
   v16i8 zero = { 0 };
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
@@ -161,8 +158,7 @@ void vpx_lpf_horizontal_8_dual_msa(uint8_t *src, int32_t pitch,
 void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr,
-                            int32_t count) {
+                            const uint8_t *thresh_ptr) {
   v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
   v16u8 p1_out, p0_out, q0_out, q1_out;
   v16u8 flat, mask, hev, thresh, b_limit, limit;
@@ -171,8 +167,6 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
   v16u8 zero = { 0 };
   v8i16 vec0, vec1, vec2, vec3, vec4;
 
-  (void)count;
-
   /* load vector elements */
   LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
 
diff --git a/vpx_dsp/mips/loopfilter_filters_dspr2.c b/vpx_dsp/mips/loopfilter_filters_dspr2.c
index 99a96d8..8414b9e 100644
--- a/vpx_dsp/mips/loopfilter_filters_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_filters_dspr2.c
@@ -23,8 +23,7 @@ void vpx_lpf_horizontal_4_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
-                                const uint8_t *thresh,
-                                int count) {
+                                const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask;
   uint32_t  hev;
@@ -117,8 +116,7 @@ void vpx_lpf_vertical_4_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
-                              const uint8_t *thresh,
-                              int count) {
+                              const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask, hev;
   uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
@@ -313,8 +311,8 @@ void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
@@ -324,8 +322,8 @@ void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                      const uint8_t *blimit1,
                                      const uint8_t *limit1,
                                      const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
@@ -335,8 +333,8 @@ void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
@@ -346,9 +344,8 @@ void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit1,
                                    const uint8_t *limit1,
                                    const uint8_t *thresh1) {
-  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
-  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
-                                       1);
+  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
+  vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
diff --git a/vpx_dsp/mips/loopfilter_mb_dspr2.c b/vpx_dsp/mips/loopfilter_mb_dspr2.c
index 4138f56..dd0545e 100644
--- a/vpx_dsp/mips/loopfilter_mb_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_dspr2.c
@@ -23,8 +23,7 @@ void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                 int pitch,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
-                                const uint8_t *thresh,
-                                int count) {
+                                const uint8_t *thresh) {
   uint32_t  mask;
   uint32_t  hev, flat;
   uint8_t   i;
@@ -322,8 +321,7 @@ void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                               int pitch,
                               const uint8_t *blimit,
                               const uint8_t *limit,
-                              const uint8_t *thresh,
-                              int count) {
+                              const uint8_t *thresh) {
   uint8_t   i;
   uint32_t  mask, hev, flat;
   uint8_t   *s1, *s2, *s3, *s4;
diff --git a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
index 8a48650..85e167c 100644
--- a/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ b/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -19,12 +19,12 @@
 #include "vpx_mem/vpx_mem.h"
 
 #if HAVE_DSPR2
-void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
-                                 int pitch,
-                                 const uint8_t *blimit,
-                                 const uint8_t *limit,
-                                 const uint8_t *thresh,
-                                 int count) {
+static void mb_lpf_horizontal_edge(unsigned char *s,
+                                   int pitch,
+                                   const uint8_t *blimit,
+                                   const uint8_t *limit,
+                                   const uint8_t *thresh,
+                                   int count) {
   uint32_t  mask;
   uint32_t  hev, flat, flat2;
   uint8_t   i;
@@ -791,4 +791,18 @@ void vpx_lpf_horizontal_16_dspr2(unsigned char *s,
     s = s + 4;
   }
 }
+
+void vpx_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh) {
+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
+}
+
+void vpx_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh) {
+  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
+}
 #endif  // #if HAVE_DSPR2
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c
index 3001705..0ffa1b2 100644
--- a/vpx_dsp/psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -200,6 +200,8 @@ static double calc_psnrhvs(const unsigned char *_src, int _systride,
       }
     }
   }
+  if (pixels <=0)
+      return 0;
   ret /= pixels;
   return ret;
 }
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e4e741a..80fcd66 100644
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -52,7 +53,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr,
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + round_ptr[0];
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16);
+    const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
     if (abs_qcoeff)
@@ -108,7 +109,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
-    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15);
+    const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
     if (abs_qcoeff)
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index c0c3ff9..f1f951f 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -33,47 +33,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
   return sad;
 }
 
-// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
-/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
- * The function averages every corresponding element of the buffers and stores
- * the value in a third buffer, comp_pred.
- * pred and comp_pred are assumed to have stride = width
- * In the usage below comp_pred is a local array.
- */
-static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                            int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                                   int width, int height, const uint8_t *ref8,
-                                   int ref_stride) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 #define sadMxN(m, n) \
 unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                   const uint8_t *ref, int ref_stride) { \
@@ -83,7 +42,7 @@ unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       const uint8_t *second_pred) { \
   uint8_t comp_pred[m * n]; \
-  avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
   return sad(src, src_stride, comp_pred, m, m, n); \
 }
 
@@ -221,7 +180,7 @@ unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
                                              int ref_stride, \
                                              const uint8_t *second_pred) { \
   uint16_t comp_pred[m * n]; \
-  highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
   return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
 }
 
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index e8bddb0..d960c54 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -275,7 +275,7 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
-                              int w, int h, uint64_t *sse, uint64_t *sum) {
+                              int w, int h, uint64_t *sse, int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -298,7 +298,7 @@ static void highbd_8_variance(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
                               int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)sse_long;
   *sum = (int)sum_long;
@@ -308,7 +308,7 @@ static void highbd_10_variance(const uint8_t *a8, int  a_stride,
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
@@ -318,7 +318,7 @@ static void highbd_12_variance(const uint8_t *a8, int  a_stride,
                                const uint8_t *b8, int  b_stride,
                                int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
+  int64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
@@ -341,8 +341,10 @@ uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 } \
 \
 uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
@@ -351,8 +353,10 @@ uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
                                              int b_stride, \
                                              uint32_t *sse) { \
   int sum; \
+  int64_t var; \
   highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
+  var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+  return (var >= 0) ? (uint32_t)var : 0; \
 }
 
 #define HIGHBD_GET_VAR(S) \
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index cd0fd98..c18d9b4 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -74,7 +74,7 @@ typedef struct variance_vtable {
 } vp8_variance_fn_ptr_t;
 #endif  // CONFIG_VP8
 
-#if CONFIG_VP9 || CONFIG_VP10
+#if CONFIG_VP9
 typedef struct vp9_variance_vtable {
   vpx_sad_fn_t               sdf;
   vpx_sad_avg_fn_t           sdaf;
@@ -85,7 +85,7 @@ typedef struct vp9_variance_vtable {
   vpx_sad_multi_fn_t         sdx8f;
   vpx_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
-#endif  // CONFIG_VP9 || CONFIG_VP10
+#endif  // CONFIG_VP9
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 9620eaa..84b5291 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -52,6 +52,12 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
 endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
+ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes += add_noise.c
+DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
+DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
+endif # CONFIG_POSTPROC
+
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
@@ -128,7 +134,6 @@ DSP_SRCS-yes += loopfilter.c
 
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/loopfilter_sse2.c
 DSP_SRCS-$(HAVE_AVX2)                += x86/loopfilter_avx2.c
-DSP_SRCS-$(HAVE_MMX)                 += x86/loopfilter_mmx.asm
 
 DSP_SRCS-$(HAVE_NEON)   += arm/loopfilter_neon.c
 ifeq ($(HAVE_NEON_ASM),yes)
@@ -164,7 +169,7 @@ DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
 # forward transform
-ifneq ($(filter yes,$(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += fwd_txfm.c
 DSP_SRCS-yes            += fwd_txfm.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/fwd_txfm_sse2.h
@@ -182,10 +187,10 @@ DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
-endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+endif  # CONFIG_VP9_ENCODER
 
 # inverse transform
-ifneq ($(filter yes,$(CONFIG_VP9) $(CONFIG_VP10)),)
+ifeq ($(CONFIG_VP9),yes)
 DSP_SRCS-yes            += inv_txfm.h
 DSP_SRCS-yes            += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
@@ -235,10 +240,10 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
-endif  # CONFIG_VP9 || CONFIG_VP10
+endif  # CONFIG_VP9
 
 # quantization
-ifneq ($(filter yes, $(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)),)
+ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 
@@ -252,7 +257,20 @@ DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3_x86_64.asm
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
 endif
-endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
+# avg
+DSP_SRCS-yes           += avg.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+endif
+
+endif  # CONFIG_VP9_ENCODER
 
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
@@ -266,7 +284,6 @@ DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
 DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
 DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
@@ -304,8 +321,6 @@ DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
-DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_SSE2)   += x86/halfpix_variance_sse2.c
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index a9e180e..a1d0a51 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -8,12 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_DSP_COMMON_H_
-#define VPX_DSP_COMMON_H_
+#ifndef VPX_DSP_VPX_DSP_COMMON_H_
+#define VPX_DSP_VPX_DSP_COMMON_H_
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
@@ -67,4 +66,4 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
 }  // extern "C"
 #endif
 
-#endif  // VPX_DSP_COMMON_H_
+#endif  // VPX_DSP_VPX_DSP_COMMON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index b369b05..37239a1 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -55,13 +55,13 @@ if ($opts{arch} eq "x86_64") {
 #
 
 add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207e_predictor_4x4/;
 
 add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_4x4/;
@@ -76,7 +76,7 @@ add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d63f_predictor_4x4/;
 
 add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_he_predictor_4x4/;
@@ -91,25 +91,25 @@ add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
 
 add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_ve_predictor_4x4/;
 
 add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -118,7 +118,7 @@ add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, c
 specialize qw/vpx_d207e_predictor_8x8/;
 
 add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc";
+specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45e_predictor_8x8/;
@@ -130,7 +130,7 @@ add_proto qw/void vpx_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d63e_predictor_8x8/;
 
 add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_8x8/;
@@ -142,22 +142,22 @@ add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
 specialize qw/vpx_d153_predictor_8x8/, "$ssse3_x86inc";
 
 add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_v_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_top_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_left_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse_x86inc";
+specialize qw/vpx_dc_128_predictor_8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -178,7 +178,7 @@ add_proto qw/void vpx_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d63e_predictor_16x16/;
 
 add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_16x16 neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_16x16/;
@@ -226,7 +226,7 @@ add_proto qw/void vpx_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
 specialize qw/vpx_d63e_predictor_32x32/;
 
 add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_h_predictor_32x32 neon msa/, "$ssse3_x86inc";
+specialize qw/vpx_h_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d117_predictor_32x32/;
@@ -241,7 +241,7 @@ add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, con
 specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc";
+specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
@@ -288,13 +288,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_d153_predictor_4x4/;
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4/;
@@ -387,7 +387,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_v_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_16x16/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16/, "$sse2_x86inc";
@@ -435,10 +435,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_v_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc";
+  specialize qw/vpx_highbd_dc_predictor_32x32/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32/;
@@ -535,32 +535,36 @@ add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8
 specialize qw/vpx_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_16_dual_neon_asm=vpx_lpf_vertical_16_dual_neon;
 
-add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
-add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
+add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
 
-add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon_asm dspr2 msa/;
-$vpx_lpf_horizontal_16_neon_asm=vpx_lpf_horizontal_16_neon;
+add_proto qw/void vpx_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_edge_8_neon_asm=vpx_lpf_horizontal_edge_8_neon;
 
-add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+add_proto qw/void vpx_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/;
+$vpx_lpf_horizontal_edge_16_neon_asm=vpx_lpf_horizontal_edge_16_neon;
+
+add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
-add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
+add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
@@ -572,28 +576,31 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_8 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_vertical_4 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
-  specialize qw/vpx_highbd_lpf_horizontal_16 sse2/;
+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_edge_8 sse2/;
+
+  add_proto qw/void vpx_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+  specialize qw/vpx_highbd_lpf_horizontal_edge_16 sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_8 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/;
 
-  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, int bd";
+  add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
   specialize qw/vpx_highbd_lpf_horizontal_4 sse2/;
 
   add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
@@ -607,7 +614,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # Forward transform
 #
-if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct4x4 sse2/;
@@ -687,11 +694,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_1 sse2 msa/;
 }  # CONFIG_VP9_HIGHBITDEPTH
-}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+}  # CONFIG_VP9_ENCODER
 
 #
 # Inverse transform
-if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+if (vpx_config("CONFIG_VP9") eq "yes") {
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
@@ -699,7 +706,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_iwht4x4_1_add/;
 
   add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_iwht4x4_16_add/;
+  specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
   specialize qw/vpx_highbd_idct4x4_1_add/;
@@ -754,12 +761,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add/;
 
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add/;
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add/;
-    
+
     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct4x4_16_add/;
 
@@ -782,10 +792,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct4x4_1_add sse2/;
 
     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_64_add sse2/;
+    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_12_add sse2/;
+    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct8x8_1_add sse2/;
@@ -800,10 +810,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_1_add sse2/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2/;
+    specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
+    # Need to add 135 eob idct32x32 implementations.
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2/;
+    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add sse2/;
@@ -853,6 +868,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add/;
 
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add/;
+
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add/;
 
@@ -890,12 +908,20 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+
+    add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+    specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
+    # Need to add 135 eob idct32x32 implementations.
+    $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
+    $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
+    $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
+    $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/;
+    specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
     # Need to add 34 eob idct32x32 neon implementation.
-    $vpx_idct32x32_34_add_neon_asm=vpx_idct32x32_1024_add_neon;
+    $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
@@ -907,12 +933,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_iwht4x4_16_add msa/, "$sse2_x86inc";
   }  # CONFIG_EMULATE_HARDWARE
 }  # CONFIG_VP9_HIGHBITDEPTH
-}  # CONFIG_VP9 || CONFIG_VP10
+}  # CONFIG_VP9
 
 #
 # Quantization
 #
-if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
 
@@ -926,7 +952,7 @@ if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCO
     add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
   }  # CONFIG_VP9_HIGHBITDEPTH
-}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+}  # CONFIG_VP9_ENCODER
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
@@ -957,29 +983,58 @@ add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride
 specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 media neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x8 msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4 neon msa/, "$sse2_x86inc";
 
 #
 # Avg
 #
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+  add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_8x8 sse2 neon msa/;
+
+  add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_4x4 sse2 neon msa/;
+
+  add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_minmax_8x8 sse2 neon/;
+
+  add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_16x16 sse2 neon/;
+
+  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+  specialize qw/vpx_satd sse2 neon/;
+
+  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row sse2 neon/;
+
+  add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
+  specialize qw/vpx_int_pro_col sse2 neon/;
+
+  add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  specialize qw/vpx_vector_var neon sse2/;
+}  # CONFIG_VP9_ENCODER
+
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
 
@@ -1014,10 +1069,10 @@ add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stri
 specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
@@ -1109,10 +1164,10 @@ add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const
 specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
 
 add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
+specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
 
 #
 # Structured Similarity (SSIM)
@@ -1177,6 +1232,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Avg
   #
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_8x8/;
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_4x4/;
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8/;
+
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
 
@@ -1345,16 +1407,16 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc
   specialize qw/vpx_variance16x32 sse2 msa/;
 
 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
 
 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+  specialize qw/vpx_variance16x8 sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+  specialize qw/vpx_variance8x16 sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+  specialize qw/vpx_variance8x8 sse2 media neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance8x4 sse2 msa/;
@@ -1363,7 +1425,7 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_
   specialize qw/vpx_variance4x8 sse2 msa/;
 
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 mmx sse2 msa/;
+  specialize qw/vpx_variance4x4 sse2 msa/;
 
 #
 # Specialty Variance
@@ -1372,10 +1434,10 @@ add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride,
   specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
 
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+  specialize qw/vpx_get8x8var sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vpx_mse16x8 sse2 msa/;
@@ -1387,7 +1449,7 @@ add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stri
   specialize qw/vpx_mse8x8 sse2 msa/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+  specialize qw/vpx_get_mb_ss sse2 msa/;
 
 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
   specialize qw/vpx_get4x4sse_cs neon msa/;
@@ -1416,25 +1478,25 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int
   specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1470,22 +1532,22 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, i
   specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
+  specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
 
 #
 # Specialty Subpixel
 #
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/;
 
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
-  specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/;
+  specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
@@ -1845,6 +1907,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
 
 }  # CONFIG_VP9_HIGHBITDEPTH
+
+#
+# Post Processing
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+    add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+    specialize qw/vpx_plane_add_noise sse2 msa/;
+}
+
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 1;
diff --git a/vpx_dsp/x86/add_noise_sse2.asm b/vpx_dsp/x86/add_noise_sse2.asm
new file mode 100644
index 0000000..ff61b19
--- /dev/null
+++ b/vpx_dsp/x86/add_noise_sse2.asm
@@ -0,0 +1,83 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vpx_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
+;                              unsigned char blackclamp[16],
+;                              unsigned char whiteclamp[16],
+;                              unsigned char bothclamp[16],
+;                              unsigned int width, unsigned int height,
+;                              int pitch)
+global sym(vpx_plane_add_noise_sse2) PRIVATE
+sym(vpx_plane_add_noise_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ; get the clamps in registers
+    mov     rdx, arg(2) ; blackclamp
+    movdqu  xmm3, [rdx]
+    mov     rdx, arg(3) ; whiteclamp
+    movdqu  xmm4, [rdx]
+    mov     rdx, arg(4) ; bothclamp
+    movdqu  xmm5, [rdx]
+
+.addnoise_loop:
+    call sym(LIBVPX_RAND) WRT_PLT
+    mov     rcx, arg(1) ;noise
+    and     rax, 0xff
+    add     rcx, rax
+
+    mov     rdi, rcx
+    movsxd  rcx, dword arg(5) ;[Width]
+    mov     rsi, arg(0) ;Pos
+    xor         rax,rax
+
+.addnoise_nextset:
+      movdqu      xmm1,[rsi+rax]         ; get the source
+
+      psubusb     xmm1, xmm3 ; subtract black clamp
+      paddusb     xmm1, xmm5 ; add both clamp
+      psubusb     xmm1, xmm4 ; subtract whiteclamp
+
+      movdqu      xmm2,[rdi+rax]         ; get the noise for this line
+      paddb       xmm1,xmm2              ; add it in
+      movdqu      [rsi+rax],xmm1         ; store the result
+
+      add         rax,16                 ; move to the next line
+
+      cmp         rax, rcx
+      jl          .addnoise_nextset
+
+    movsxd  rax, dword arg(7) ; Pitch
+    add     arg(0), rax ; Start += Pitch
+    sub     dword arg(6), 1   ; Height -= 1
+    jg      .addnoise_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+rd42:
+    times 8 dw 0x04
+four8s:
+    times 4 dd 8
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
similarity index 90%
rename from vp9/encoder/x86/vp9_avg_intrin_sse2.c
rename to vpx_dsp/x86/avg_intrin_sse2.c
index 4531d79..f9af6cf 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -10,10 +10,10 @@
 
 #include <emmintrin.h>
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                          int *min, int *max) {
   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
   u0  = _mm_setzero_si128();
@@ -91,7 +91,7 @@ void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
   *min = _mm_extract_epi16(minabsdiff, 0);
 }
 
-unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
   u0  = _mm_setzero_si128();
@@ -118,7 +118,7 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
   return (avg + 32) >> 6;
 }
 
-unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
   u0  = _mm_setzero_si128();
@@ -212,7 +212,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
   }
 }
 
-void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                            int16_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
@@ -244,13 +244,13 @@ void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
   _mm_store_si128((__m128i *)coeff, src[7]);
 }
 
-void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                              int16_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                 + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
   }
 
   for (idx = 0; idx < 64; idx += 8) {
@@ -283,34 +283,33 @@ void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
   }
 }
 
-int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+int vpx_satd_sse2(const int16_t *coeff, int length) {
   int i;
-  __m128i sum = _mm_load_si128((const __m128i *)coeff);
-  __m128i sign = _mm_srai_epi16(sum, 15);
-  __m128i val = _mm_xor_si128(sum, sign);
-  sum = _mm_sub_epi16(val, sign);
-  coeff += 8;
-
-  for (i = 8; i < length; i += 8) {
-    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    sign = _mm_srai_epi16(src_line, 15);
-    val = _mm_xor_si128(src_line, sign);
-    val = _mm_sub_epi16(val, sign);
-    sum = _mm_add_epi16(sum, val);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum = zero;
+
+  for (i = 0; i < length; i += 8) {
+    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i inv = _mm_sub_epi16(zero, src_line);
+    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
+    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+    accum = _mm_add_epi32(accum, sum);
     coeff += 8;
   }
 
-  val = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, val);
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
 
-  return _mm_extract_epi16(sum, 0);
+  return _mm_cvtsi128_si32(accum);
 }
 
-void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
@@ -359,7 +358,7 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
   _mm_storeu_si128((__m128i *)hbuf, s1);
 }
 
-int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
   __m128i zero = _mm_setzero_si128();
   __m128i src_line = _mm_load_si128((const __m128i *)ref);
   __m128i s0 = _mm_sad_epu8(src_line, zero);
@@ -379,7 +378,7 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
   return _mm_extract_epi16(s0, 0);
 }
 
-int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
                         const int bwl) {
   int idx;
   int width = 4 << bwl;
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vpx_dsp/x86/avg_ssse3_x86_64.asm
similarity index 97%
rename from vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
rename to vpx_dsp/x86/avg_ssse3_x86_64.asm
index 74c52df..26412e8 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,11 +8,11 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-%define private_prefix vp9
+%define private_prefix vpx
 
 %include "third_party/x86inc/x86inc.asm"
 
-; This file provides SSSE3 version of the forward transformation. Part
+; This file provides SSSE3 version of the hadamard transformation. Part
 ; of the macro definitions are originally derived from the ffmpeg project.
 ; The current version applies to x86 64-bit only.
 
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index b6fbfcf..7e43eb7 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -33,7 +33,7 @@ typedef void filter8_1dfunction (
                                     int w, int h) { \
   assert(filter[3] != 128); \
   assert(step_q4 == 16); \
-  if (filter[0] || filter[1] || filter[2]) { \
+  if (filter[0] | filter[1] | filter[2]) { \
     while (w >= 16) { \
       vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                src_stride, \
@@ -45,27 +45,20 @@ typedef void filter8_1dfunction (
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } else { \
     while (w >= 16) { \
@@ -79,27 +72,20 @@ typedef void filter8_1dfunction (
       dst += 16; \
       w -= 16; \
     } \
-    while (w >= 8) { \
+    if (w == 8) { \
       vpx_filter_block1d8_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 8; \
-      dst += 8; \
-      w -= 8; \
-    } \
-    while (w >= 4) { \
+    } else if (w == 4) { \
       vpx_filter_block1d4_##dir##2_##avg##opt(src, \
                                               src_stride, \
                                               dst, \
                                               dst_stride, \
                                               h, \
                                               filter); \
-      src += 4; \
-      dst += 4; \
-      w -= 4; \
     } \
   } \
 }
@@ -116,8 +102,7 @@ void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
   assert(h <= 64); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
-  if (filter_x[0] || filter_x[1] || filter_x[2]|| \
-      filter_y[0] || filter_y[1] || filter_y[2]) { \
+  if (filter_x[0] | filter_x[1] | filter_x[2]) { \
     DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
     vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
@@ -161,7 +146,7 @@ typedef void highbd_filter8_1dfunction (
   if (step_q4 == 16 && filter[3] != 128) { \
     uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-    if (filter[0] || filter[1] || filter[2]) { \
+    if (filter[0] | filter[1] | filter[2]) { \
       while (w >= 16) { \
         vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
                                                         src_stride, \
@@ -253,8 +238,7 @@ void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
   assert(w <= 64); \
   assert(h <= 64); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
-        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+    if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
       DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
       vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                        CONVERT_TO_BYTEPTR(fdata2), 64, \
diff --git a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 4df39df..951af3a 100644
--- a/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -10,6 +10,7 @@
 
 #include <immintrin.h>  // AVX2
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/txfm_common.h"
 
 #define pair256_set_epi16(a, b) \
diff --git a/vpx_dsp/x86/fwd_txfm_sse2.c b/vpx_dsp/x86/fwd_txfm_sse2.c
index bca72e8..3e4f49b 100644
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -11,6 +11,7 @@
 #include <emmintrin.h>  // SSE2
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
 
@@ -40,7 +41,7 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
 
   in1 = _mm_add_epi32(tmp, in0);
   in0 = _mm_slli_epi32(in1, 1);
-  store_output(&in0, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
 }
 
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
@@ -80,7 +81,7 @@ void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   in0 = _mm_srli_si128(sum, 8);
 
   in1 = _mm_add_epi32(sum, in0);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
@@ -91,40 +92,39 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
   int i;
 
   for (i = 0; i < 2; ++i) {
-    input += 8 * i;
-    in0  = _mm_load_si128((const __m128i *)(input +  0 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  1 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  2 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  3 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
 
     u0 = _mm_add_epi16(in0, in1);
     u1 = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input +  4 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  5 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input +  6 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input +  7 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
     u1  = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input +  8 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input +  9 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 10 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 11 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
     u1  = _mm_add_epi16(in2, in3);
     sum = _mm_add_epi16(sum, u0);
 
-    in0  = _mm_load_si128((const __m128i *)(input + 12 * stride));
-    in1  = _mm_load_si128((const __m128i *)(input + 13 * stride));
-    in2  = _mm_load_si128((const __m128i *)(input + 14 * stride));
-    in3  = _mm_load_si128((const __m128i *)(input + 15 * stride));
+    in0  = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+    in1  = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+    in2  = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+    in3  = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
 
     sum = _mm_add_epi16(sum, u1);
     u0  = _mm_add_epi16(in0, in1);
@@ -132,6 +132,7 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
     sum = _mm_add_epi16(sum, u0);
 
     sum = _mm_add_epi16(sum, u1);
+    input += 8 * stride;
   }
 
   u0  = _mm_setzero_si128();
@@ -149,7 +150,7 @@ void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 1);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
@@ -221,7 +222,7 @@ void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 3);
-  store_output(&in1, output);
+  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
 }
 
 #define DCT_HIGH_BIT_DEPTH 0
diff --git a/vpx_dsp/x86/halfpix_variance_sse2.c b/vpx_dsp/x86/halfpix_variance_sse2.c
index 5782155..4a8fb6d 100644
--- a/vpx_dsp/x86/halfpix_variance_sse2.c
+++ b/vpx_dsp/x86/halfpix_variance_sse2.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
@@ -40,7 +42,9 @@ uint32_t vpx_variance_halfpixvar16x16_h_sse2(const unsigned char *src,
                                     &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
 
 uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
@@ -54,7 +58,9 @@ uint32_t vpx_variance_halfpixvar16x16_v_sse2(const unsigned char *src,
                                    &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
 
 
@@ -70,5 +76,7 @@ uint32_t vpx_variance_halfpixvar16x16_hv_sse2(const unsigned char *src,
                                          &xsum0, &xxsum0);
 
   *sse = xxsum0;
-  return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+  assert(xsum0 <= 255 * 16 * 16);
+  assert(xsum0 >= -255 * 16 * 16);
+  return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8));
 }
diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm
index b12d29c..c61b621 100644
--- a/vpx_dsp/x86/highbd_intrapred_sse2.asm
+++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm
@@ -17,24 +17,20 @@ pw_16: times 4 dd 16
 pw_32: times 4 dd 32
 
 SECTION .text
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   movq                  m0, [aboveq]
   movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, one
-  mov                 oned, 0x0001
-  pxor                  m1, m1
-  movd                  m3, oned
-  pshufw                m3, m3, 0x0
   paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
+  pshuflw               m1, m0, 0xe
+  paddw                 m0, m1
+  pshuflw               m1, m0, 0x1
+  paddw                 m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq*2], m0
   lea                 dstq, [dstq+strideq*4]
@@ -122,30 +118,29 @@ cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
   RESTORE_GOT
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
-  pxor                  m1, m1
   mova                  m0, [aboveq]
   mova                  m2, [aboveq+16]
   mova                  m3, [aboveq+32]
   mova                  m4, [aboveq+48]
-  mova                  m5, [leftq]
-  mova                  m6, [leftq+16]
-  mova                  m7, [leftq+32]
-  mova                  m8, [leftq+48]
+  paddw                 m0, m2
+  paddw                 m3, m4
+  mova                  m2, [leftq]
+  mova                  m4, [leftq+16]
+  mova                  m5, [leftq+32]
+  mova                  m6, [leftq+48]
+  paddw                 m2, m4
+  paddw                 m5, m6
+  paddw                 m0, m3
+  paddw                 m2, m5
+  pxor                  m1, m1
+  paddw                 m0, m2
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 8
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  paddw                 m0, m5
-  paddw                 m0, m6
-  paddw                 m0, m7
-  paddw                 m0, m8
   movhlps               m2, m0
   paddw                 m0, m2
   punpcklwd             m0, m1
@@ -181,9 +176,8 @@ cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
 
   RESTORE_GOT
   REP_RET
-%endif
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   movq    [dstq          ], m0
@@ -261,43 +255,44 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
-  pshufw                m1, m1, 0x0
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  movd                  m3, oned
+  pcmpeqw               m3, m3
   movd                  m4, bpsd
-  pshufw                m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  mova                  m2, m3
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
-  add                leftq, 8
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movq                  m1, [leftq+lineq*4]
-  movq                  m2, [leftq+lineq*4+2]
-  pshufw                m1, m1, 0x0
-  pshufw                m2, m2, 0x0
-  paddw                 m1, m0
+  pcmpeqw               m2, m2
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value
+  mova                  m1, [leftq]
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
   paddw                 m2, m0
   ;Clamp to the bit-depth
-  pminsw                m1, m3
   pminsw                m2, m3
-  pmaxsw                m1, m4
   pmaxsw                m2, m4
   ;Store the values
-  movq    [dstq          ], m1
-  movq    [dstq+strideq*2], m2
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
   lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
+  RET
 
 INIT_XMM sse2
 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
@@ -343,63 +338,55 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                  m7, m7
-  pxor                  m8, m8
-  pinsrw                m7, oned, 0
-  pinsrw                m8, bpsd, 0
-  pshuflw               m7, m7, 0x0
+  pcmpeqw               m3, m3
+  movd                  m4, bpsd
+  punpcklqdq            m2, m2
+  psllw                 m3, m4
+  pcmpeqw               m5, m5
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m5         ; max possible value
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq            m7, m7
   mov                lineq, -8
-  mova                  m5, m7
-  punpcklqdq            m2, m2
-  psllw                 m7, m8
-  add                leftq, 32
-  psubw                 m7, m5 ; max possible value
-  pxor                  m8, m8 ; min possible value
   psubw                 m0, m2
   psubw                 m1, m2
 .loop:
-  movd                  m2, [leftq+lineq*4]
-  movd                  m3, [leftq+lineq*4+2]
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m4, m2, m0
-  paddw                 m5, m3, m0
+  movd                  m7, [leftq]
+  pshuflw               m5, m7, 0x0
+  pshuflw               m2, m7, 0x55
+  punpcklqdq            m5, m5         ; l1 l1 l1 l1 l1 l1 l1 l1
+  punpcklqdq            m2, m2         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m6, m5, m0     ; t1-tl+l1 to t4-tl+l1
+  paddw                 m5, m1         ; t5-tl+l1 to t8-tl+l1
+  pminsw                m6, m3
+  pminsw                m5, m3
+  pmaxsw                m6, m4         ; Clamp to the bit-depth
+  pmaxsw                m5, m4
+  mova   [dstq           ], m6
+  mova   [dstq        +16], m5
+  paddw                 m6, m2, m0
   paddw                 m2, m1
-  paddw                 m3, m1
-  ;Clamp to the bit-depth
-  pminsw                m4, m7
-  pminsw                m5, m7
-  pminsw                m2, m7
-  pminsw                m3, m7
-  pmaxsw                m4, m8
-  pmaxsw                m5, m8
-  pmaxsw                m2, m8
-  pmaxsw                m3, m8
-  ;Store the values
-  mova   [dstq             ], m4
-  mova   [dstq+strideq*2   ], m5
-  mova   [dstq          +16], m2
-  mova   [dstq+strideq*2+16], m3
+  pminsw                m6, m3
+  pminsw                m2, m3
+  pmaxsw                m6, m4
+  pmaxsw                m2, m4
+  mova   [dstq+strideq*2 ], m6
+  mova [dstq+strideq*2+16], m2
   lea                 dstq, [dstq+strideq*4]
   inc                lineq
+  lea                leftq, [leftq+4]
+
   jnz .loop
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -407,70 +394,60 @@ cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
   mova                  m4, [aboveq+48]
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  pxor                 m10, m10
-  pxor                 m11, m11
-  pinsrw               m10, oned, 0
-  pinsrw               m11, bpsd, 0
-  pshuflw              m10, m10, 0x0
+  pcmpeqw               m5, m5
+  movd                  m6, bpsd
+  psllw                 m5, m6
+  pcmpeqw               m7, m7
+  pxor                  m6, m6         ; min possible value
+  pxor                  m5, m7         ; max possible value
+  punpcklqdq            m0, m0
   DEFINE_ARGS dst, stride, line, left
-  punpcklqdq           m10, m10
   mov                lineq, -16
-  mova                  m5, m10
-  punpcklqdq            m0, m0
-  psllw                m10, m11
-  add                leftq, 64
-  psubw                m10, m5 ; max possible value
-  pxor                 m11, m11 ; min possible value
   psubw                 m1, m0
   psubw                 m2, m0
   psubw                 m3, m0
   psubw                 m4, m0
 .loop:
-  movd                  m5, [leftq+lineq*4]
-  movd                  m6, [leftq+lineq*4+2]
-  pshuflw               m5, m5, 0x0
-  pshuflw               m6, m6, 0x0
-  punpcklqdq            m5, m5
-  punpcklqdq            m6, m6
-  paddw                 m7, m5, m1
-  paddw                 m8, m5, m2
-  paddw                 m9, m5, m3
-  paddw                 m5, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m5, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m5, m11
-  ;Store these values
-  mova   [dstq           ], m7
-  mova   [dstq        +16], m8
-  mova   [dstq        +32], m9
-  mova   [dstq        +48], m5
-  paddw                 m7, m6, m1
-  paddw                 m8, m6, m2
-  paddw                 m9, m6, m3
-  paddw                 m6, m4
-  ;Clamp these values to the bit-depth
-  pminsw                m7, m10
-  pminsw                m8, m10
-  pminsw                m9, m10
-  pminsw                m6, m10
-  pmaxsw                m7, m11
-  pmaxsw                m8, m11
-  pmaxsw                m9, m11
-  pmaxsw                m6, m11
-  ;Store these values
-  mova   [dstq+strideq*2   ], m7
-  mova   [dstq+strideq*2+16], m8
-  mova   [dstq+strideq*2+32], m9
-  mova   [dstq+strideq*2+48], m6
+  movd                  m7, [leftq]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l1 l1 l1 l1 l1 l1 l1 l1
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq           ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq        +48], m0
+  movd                  m7, [leftq+2]
+  pshuflw               m7, m7, 0x0
+  punpcklqdq            m7, m7         ; l2 l2 l2 l2 l2 l2 l2 l2
+  paddw                 m0, m7, m1
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2 ], m0
+  paddw                 m0, m7, m2
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+16], m0
+  paddw                 m0, m7, m3
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+32], m0
+  paddw                 m0, m7, m4
+  pminsw                m0, m5
+  pmaxsw                m0, m6
+  mova   [dstq+strideq*2+48], m0
   lea                 dstq, [dstq+strideq*4]
+  lea                leftq, [leftq+4]
   inc                lineq
   jnz .loop
   REP_RET
-%endif
diff --git a/vpx_dsp/x86/highbd_loopfilter_sse2.c b/vpx_dsp/x86/highbd_loopfilter_sse2.c
index c4fd5e1..72e42ad 100644
--- a/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -51,12 +51,10 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
 
 // TODO(debargha, peter): Break up large functions into smaller ones
 // in this file.
-static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
-                                                   int p,
-                                                   const uint8_t *_blimit,
-                                                   const uint8_t *_limit,
-                                                   const uint8_t *_thresh,
-                                                   int bd) {
+void vpx_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
+                                           const uint8_t *_blimit,
+                                           const uint8_t *_limit,
+                                           const uint8_t *_thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi16(1);
   __m128i blimit, limit, thresh;
@@ -496,34 +494,19 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
   _mm_store_si128((__m128i *)(s - 0 * p), q0);
 }
 
-static void highbd_mb_lpf_horizontal_edge_w_sse2_16(uint16_t *s,
-                                                    int p,
-                                                    const uint8_t *_blimit,
-                                                    const uint8_t *_limit,
-                                                    const uint8_t *_thresh,
-                                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(s + 8, p, _blimit, _limit, _thresh,
-                                         bd);
-}
-
-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
-                                       const uint8_t *_blimit,
-                                       const uint8_t *_limit,
-                                       const uint8_t *_thresh,
-                                       int count, int bd) {
-  if (count == 1)
-    highbd_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh, bd);
-  else
-    highbd_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
+                                            const uint8_t *_blimit,
+                                            const uint8_t *_limit,
+                                            const uint8_t *_thresh, int bd) {
+  vpx_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
 }
 
 void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
-                                      int count, int bd) {
+                                      int bd) {
   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
@@ -556,8 +539,6 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   __m128i work_a;
   __m128i filter1, filter2;
 
-  (void)count;
-
   if (bd == 8) {
     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
@@ -764,16 +745,15 @@ void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1,
-                                   1, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
 }
 
 void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh,
-                                      int count, int bd) {
+                                      int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   __m128i blimit, limit, thresh;
   __m128i mask, hev, flat;
@@ -813,8 +793,6 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i work_a;
   __m128i filter1, filter2;
 
-  (void)count;
-
   if (bd == 8) {
     blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
     limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
@@ -944,9 +922,8 @@ void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_limit1,
                                            const uint8_t *_thresh1,
                                            int bd) {
-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, 1, bd);
-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
 }
 
 static INLINE void highbd_transpose(uint16_t *src[], int in_p,
@@ -1058,11 +1035,10 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
-                                    int count, int bd) {
+                                    int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;
@@ -1071,8 +1047,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1112,11 +1087,10 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh,
-                                    int count, int bd) {
+                                    int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;
@@ -1125,8 +1099,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1,
-                                   bd);
+  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1181,8 +1154,8 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
   highbd_transpose(src, p, dst, 8, 2);
 
   // Loop filtering
-  highbd_mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit,
-                                         thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit,
+                                        thresh, bd);
   src[0] = t_dst;
   src[1] = t_dst + 8 * 8;
   dst[0] = s - 8;
@@ -1205,8 +1178,8 @@ void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
   highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
 
   //  Loop filtering
-  highbd_mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
-                                          thresh, bd);
+  vpx_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
+                                         thresh, bd);
 
   //  Transpose back
   highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 93df92a..30ee81b 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@ SECTION .text
 
 %macro INC_SRC_BY_SRC_STRIDE  0
 %if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*2]
+  add                srcq, src_stridemp
+  add                srcq, src_stridemp
 %else
   lea                srcq, [srcq + src_strideq*2]
 %endif
 %endmacro
 
-%macro INC_SRC_BY_SRC_2STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  lea                srcq, [srcq + src_stridemp*4]
-%else
-  lea                srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
@@ -123,8 +116,10 @@ SECTION .text
       %define sec_str sec_stridemp
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -140,8 +135,10 @@ SECTION .text
       %define block_height heightd
 
       ; Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -980,8 +977,9 @@ SECTION .text
 .x_other_y_other_loop:
   movu                 m2, [srcq]
   movu                 m4, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu                 m3, [srcq]
+  movu                 m5, [srcq+2]
   pmullw               m2, filter_x_a
   pmullw               m4, filter_x_b
   paddw                m2, filter_rnd
@@ -1014,7 +1012,7 @@ SECTION .text
   SUM_SSE              m0, m2, m4, m3, m6, m7
   mova                 m0, m5
 
-  INC_SRC_BY_SRC_2STRIDE
+  INC_SRC_BY_SRC_STRIDE
   lea                dstq, [dstq + dst_strideq * 4]
 %if %2 == 1 ; avg
   add                secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331c..14d029c 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,20 +243,24 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
 }
 
 #if CONFIG_USE_X86INC
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
 #define DECL(w, opt) \
   int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                  ptrdiff_t src_stride, \
                                                  int x_offset, int y_offset, \
                                                  const uint16_t *dst, \
                                                  ptrdiff_t dst_stride, \
-                                                 int height, unsigned int *sse);
-#define DECLS(opt1, opt2) \
-  DECL(8, opt1); \
-  DECL(16, opt1)
-
-DECLS(sse2, sse);
-// TODO(johannkoenig): enable the ssse3 or delete
-// DECLS(ssse3, ssse3);
+                                                 int height, \
+                                                 unsigned int *sse, \
+                                                 void *unused0, void *unused);
+#define DECLS(opt) \
+  DECL(8, opt); \
+  DECL(16, opt)
+
+DECLS(sse2);
+
 #undef DECLS
 #undef DECL
 
@@ -274,7 +278,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, h, \
-                                                       &sse); \
+                                                       &sse, NULL, NULL); \
   if (w > wf) { \
     unsigned int sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +286,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
+          dst + 48, dst_stride, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -312,7 +317,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, \
-                                                       h, &sse); \
+                                                       h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +325,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 48, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -359,27 +365,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, \
         x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
+        dst_stride, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
+          dst_stride, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 48 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       }\
@@ -391,25 +397,26 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
   return sse - ((cast se * se) >> (wlog2 + hlog2)); \
 }
 
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt) \
+FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt, (int64_t));
 
 
-FNS(sse2, sse);
+FNS(sse2);
 
 #undef FNS
 #undef FN
 
+// The 2 unused parameters are place holders for PIC enabled build.
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    ptrdiff_t src_stride, \
@@ -419,7 +426,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    const uint16_t *sec, \
                                                    ptrdiff_t sec_stride, \
                                                    int height, \
-                                                   unsigned int *sse);
+                                                   unsigned int *sse, \
+                                                   void *unused0, void *unused);
 #define DECLS(opt1) \
 DECL(16, opt1) \
 DECL(8, opt1)
@@ -439,23 +447,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
+               y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -475,14 +483,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src, src_stride, x_offset, \
                                             y_offset, dst, dst_stride, \
-                                            sec, w, h, &sse); \
+                                            sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 16, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 16, dst_stride, \
-                                            sec + 16, w, h, &sse2); \
+                                            sec + 16, w, h, &sse2, \
+                                            NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
@@ -490,14 +499,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
                                             src + 32, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 32, dst_stride, \
-                                            sec + 32, w, h, &sse2); \
+                                            sec + 32, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                                             src + 48, src_stride, \
                                             x_offset, y_offset, \
                                             dst + 48, dst_stride, \
-                                            sec + 48, w, h, &sse2); \
+                                            sec + 48, w, h, &sse2, \
+                                            NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -525,7 +536,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + (start_row * src_stride), src_stride, x_offset, \
                 y_offset, dst + (start_row * dst_stride), dst_stride, \
-                sec + (start_row * w), w, height, &sse2); \
+                sec + (start_row * w), w, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
@@ -533,7 +544,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
                 src + 16 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 16 + (start_row * dst_stride), dst_stride, \
-                sec + 16 + (start_row * w), w, height, &sse2); \
+                sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
@@ -541,14 +552,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
                 src + 32 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 32 + (start_row * dst_stride), dst_stride, \
-                sec + 32 + (start_row * w), w, height, &sse2); \
+                sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48 + (start_row * src_stride), src_stride, \
                 x_offset, y_offset, \
                 dst + 48 + (start_row * dst_stride), dst_stride, \
-                sec + 48 + (start_row * w), w, height, &sse2); \
+                sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       } \
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 22b5731..cd6a6ae 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -11,6 +11,7 @@
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
+pb_1: times 16 db 1
 pw_4:  times 8 dw 4
 pw_8:  times 8 dw 8
 pw_16: times 8 dw 16
@@ -23,17 +24,127 @@ pw2_32:  times 8 dw 16
 
 SECTION .text
 
-INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+  pavgb               %4, %1, %3
+  pxor                %3, %1
+  pand                %3, [GLOBAL(pb_1)]
+  psubb               %4, %3
+  pavgb               %4, %2
+%endmacro
+
+INIT_XMM sse2
+cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
   GET_GOT     goffsetq
 
-  pxor                  m1, m1
+  movq                 m0, [aboveq]
+  DEFINE_ARGS dst, stride, temp
+  psrldq               m1, m0, 1
+  psrldq               m2, m0, 2
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+
+  ; store 4 lines
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  lea                dstq, [dstq+strideq*2]
+  psrlq                m3, 8
+  movd   [dstq          ], m3
+  psrlq                m3, 8
+  movd   [dstq+strideq  ], m3
+  psrlq                m0, 56
+  movd              tempq, m0
+  mov    [dstq+strideq+3], tempb
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
+  GET_GOT     goffsetq
+
+  movu                m1, [aboveq]
+  pslldq              m0, m1, 1
+  psrldq              m2, m1, 1
+  DEFINE_ARGS dst, stride, stride3
+  lea           stride3q, [strideq*3]
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
+  punpckhbw           m0, m0 ; 7 7
+  punpcklwd           m0, m0 ; 7 7 7 7
+  punpckldq           m0, m0 ; 7 7 7 7 7 7 7 7
+  punpcklqdq          m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7
+
+ ; store 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+  lea                 dstq, [dstq+strideq*4]
+
+  ; store next 4 lines
+  psrldq                m3, 1
+  movq    [dstq          ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq  ], m3
+  psrldq                m3, 1
+  movq    [dstq+strideq*2], m3
+  psrldq                m3, 1
+  movq    [dstq+stride3q ], m3
+
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
+  GET_GOT     goffsetq
+
+  movd                m0, [leftq]                ; abcd [byte]
+  punpcklbw           m4, m0, m0                 ; aabb ccdd
+  punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
+  psrldq              m4, 12                     ; dddd
+  punpckldq           m0, m4                     ; abcd dddd
+  psrldq              m1, m0, 1                  ; bcdd
+  psrldq              m2, m0, 2                  ; cddd
+
+  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
+  pavgb               m1, m0                     ; ab, bc, cd, d [byte]
+
+  punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+  movd    [dstq        ], m1
+  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
+  movd    [dstq+strideq], m1
+
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m1, 16             ; cd, c3d, d, d
+  movd    [dstq        ], m1
+  movd    [dstq+strideq], m4             ; d, d, d, d
+  RESTORE_GOT
+  RET
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+  GET_GOT     goffsetq
+
+  movd                  m2, [leftq]
   movd                  m0, [aboveq]
-  punpckldq             m0, [leftq]
+  pxor                  m1, m1
+  punpckldq             m0, m2
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw_4)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -44,8 +155,9 @@ cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -53,7 +165,7 @@ cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_4)]
   psraw                 m0, 2
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -64,8 +176,8 @@ cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -73,7 +185,7 @@ cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_4)]
   psraw                 m0, 2
-  pshufw                m0, m0, 0x0
+  pshuflw               m0, m0, 0x0
   packuswb              m0, m0
   movd      [dstq        ], m0
   movd      [dstq+strideq], m0
@@ -84,7 +196,7 @@ cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
@@ -98,8 +210,8 @@ cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw_8)]
   psraw                 m0, 4
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -113,8 +225,8 @@ cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -124,8 +236,8 @@ cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_8)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -139,8 +251,9 @@ cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+  movifnidn          leftq, leftmp
   GET_GOT     goffsetq
 
   pxor                  m1, m1
@@ -150,8 +263,8 @@ cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   psadbw                m0, m1
   paddw                 m0, [GLOBAL(pw2_8)]
   psraw                 m0, 3
-  pshufw                m0, m0, 0x0
-  packuswb              m0, m0
+  punpcklbw             m0, m0
+  pshuflw               m0, m0, 0x0
   movq    [dstq          ], m0
   movq    [dstq+strideq  ], m0
   movq    [dstq+strideq*2], m0
@@ -165,8 +278,8 @@ cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   DEFINE_ARGS dst, stride, stride3
@@ -179,8 +292,8 @@ cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
-cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   DEFINE_ARGS dst, stride, stride3
@@ -236,14 +349,11 @@ cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -268,14 +378,11 @@ cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
   GET_GOT     goffsetq
 
   pxor                  m1, m1
-  pxor                  m2, m2
   mova                  m0, [leftq]
   DEFINE_ARGS dst, stride, stride3, lines4
   lea             stride3q, [strideq*3]
   mov              lines4d, 4
   psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
   movhlps               m2, m0
   paddw                 m0, m2
   paddw                 m0, [GLOBAL(pw2_16)]
@@ -452,7 +559,7 @@ cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
   movd                  m0, [aboveq]
   movd      [dstq        ], m0
@@ -462,7 +569,7 @@ cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
   movd      [dstq+strideq], m0
   RET
 
-INIT_MMX sse
+INIT_XMM sse2
 cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
   movq                  m0, [aboveq]
   DEFINE_ARGS dst, stride, stride3
@@ -515,108 +622,196 @@ cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
-  pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  movd                  m0, [aboveq]
-  punpcklbw             m2, m1
-  punpcklbw             m0, m1
-  pshufw                m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  movd                  m0, [leftq]
+  punpcklbw             m0, m0
+  punpcklbw             m0, m0
+  pshufd                m1, m0, 0x1
+  movd      [dstq        ], m0
+  movd      [dstq+strideq], m1
+  pshufd                m2, m0, 0x2
+  lea                 dstq, [dstq+strideq*2]
+  pshufd                m3, m0, 0x3
+  movd      [dstq        ], m2
+  movd      [dstq+strideq], m3
+  RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
   mov                lineq, -2
-  add                leftq, 4
-  psubw                 m0, m2
+  DEFINE_ARGS  dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3]
+  movq                  m0, [leftq    ]
+  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
 .loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
+  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
+  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
+  movq      [dstq        ], m1
+  movq      [dstq+strideq], m2
+  pshuflw               m1, m0, 0xaa
+  pshuflw               m2, m0, 0xff
+  movq    [dstq+strideq*2], m1
+  movq    [dstq+stride3q ], m2
+  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+  inc                lineq
+  lea                 dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+  movifnidn          leftq, leftmp
+  mov                lineq, -4
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea             stride3q, [strideq*3]
+.loop:
+  movd                  m0, [leftq]
+  punpcklbw             m0, m0
+  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
+  mova    [dstq          ], m1
+  mova    [dstq+strideq  ], m2
+  pshufd            m1, m0, 0xaa
+  pshufd            m2, m0, 0xff
+  mova    [dstq+strideq*2], m1
+  mova    [dstq+stride3q ], m2
+  inc                lineq
+  lea                leftq, [leftq+4       ]
+  lea                 dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+  movifnidn              leftq, leftmp
+  mov                    lineq, -8
+  DEFINE_ARGS dst, stride, line, left, stride3
+  lea                 stride3q, [strideq*3]
+.loop:
+  movd                      m0, [leftq]
+  punpcklbw                 m0, m0
+  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
+  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
+  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
+  mova     [dstq             ], m1
+  mova     [dstq+16          ], m1
+  mova     [dstq+strideq     ], m2
+  mova     [dstq+strideq+16  ], m2
+  pshufd                m1, m0, 0xaa
+  pshufd                m2, m0, 0xff
+  mova     [dstq+strideq*2   ], m1
+  mova     [dstq+strideq*2+16], m1
+  mova     [dstq+stride3q    ], m2
+  mova     [dstq+stride3q+16 ], m2
+  inc                    lineq
+  lea                    leftq, [leftq+4       ]
+  lea                     dstq, [dstq+strideq*4]
+  jnz .loop
+  REP_RET
+
+INIT_XMM sse2
+cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
+  pxor                  m1, m1
+  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
+  punpcklbw             m0, m1
+  pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
+  psrldq                m0, 2
+  psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
+  movd                  m2, [leftq]
   punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshufw                m2, m2, 0x0
-  pshufw                m3, m3, 0x0
-  paddw                 m2, m0
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  paddw                 m4, m0
   paddw                 m3, m0
-  packuswb              m2, m2
+  packuswb              m4, m4
   packuswb              m3, m3
-  movd      [dstq        ], m2
+  movd      [dstq        ], m4
   movd      [dstq+strideq], m3
   lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m4, m2, 0xaa
+  pshuflw               m3, m2, 0xff
+  paddw                 m4, m0
+  paddw                 m3, m0
+  packuswb              m4, m4
+  packuswb              m3, m3
+  movd      [dstq        ], m4
+  movd      [dstq+strideq], m3
+  RET
 
 INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   movq                  m0, [aboveq]
   punpcklbw             m2, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
+  punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+  pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
   DEFINE_ARGS dst, stride, line, left
   mov                lineq, -4
-  punpcklqdq            m2, m2
-  add                leftq, 8
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m2, m0
+  punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
+  psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
+  movq                  m2, [leftq]
+  punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+  punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+  paddw                 m4, m0
   paddw                 m3, m0
-  packuswb              m2, m3
-  movq      [dstq        ], m2
-  movhps    [dstq+strideq], m2
+  packuswb              m4, m3
+  movq      [dstq        ], m4
+  movhps    [dstq+strideq], m4
   lea                 dstq, [dstq+strideq*2]
+  psrldq                m2, 4
   inc                lineq
   jnz .loop
   REP_RET
 
 INIT_XMM sse2
-cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
+cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
   pxor                  m1, m1
-  movd                  m2, [aboveq-1]
-  mova                  m0, [aboveq]
-  punpcklbw             m2, m1
+  mova                  m2, [aboveq-16];
+  mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
+  punpckhbw             m2, m1         ; [127:112] tl [word]
   punpckhbw             m4, m0, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
-  DEFINE_ARGS dst, stride, line, left
+  punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
+  DEFINE_ARGS dst, stride, line, left, stride8
   mov                lineq, -8
-  punpcklqdq            m2, m2
-  add                leftq, 16
+  pshufhw               m2, m2, 0xff
+  mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
+  punpckhqdq            m2, m2         ; tl repeated 8 times [word]
   psubw                 m0, m2
-  psubw                 m4, m2
+  psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
+  punpckhbw             m5, m3, m1
+  punpcklbw             m3, m1         ; m3:m5 l1 l2 ... l16 [word]
+  lea             stride8q, [strideq*8]
 .loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m5, m2, m0
-  paddw                 m6, m3, m0
-  paddw                 m2, m4
-  paddw                 m3, m4
-  packuswb              m5, m2
-  packuswb              m6, m3
-  mova      [dstq        ], m5
-  mova      [dstq+strideq], m6
-  lea                 dstq, [dstq+strideq*2]
+  pshuflw               m6, m3, 0x0
+  pshuflw               m7, m5, 0x0
+  punpcklqdq            m6, m6         ; l1 repeated 8 times [word]
+  punpcklqdq            m7, m7         ; l8 repeated 8 times [word]
+  paddw                 m1, m6, m0
+  paddw                 m6, m4         ; m1:m6 ti-tl+l1 [i=1,15] [word]
+  psrldq                m5, 2
+  packuswb              m1, m6
+  mova     [dstq         ], m1
+  paddw                 m1, m7, m0
+  paddw                 m7, m4         ; m1:m7 ti-tl+l8 [i=1,15] [word]
+  psrldq                m3, 2
+  packuswb              m1, m7
+  mova     [dstq+stride8q], m1
   inc                lineq
+  lea                 dstq, [dstq+strideq]
   jnz .loop
   REP_RET
 
-%if ARCH_X86_64
 INIT_XMM sse2
-cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
+cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   mova                  m0, [aboveq]
@@ -637,31 +832,29 @@ cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
   psubw                 m5, m2
 .loop:
   movd                  m2, [leftq+lineq*2]
-  movd                  m6, [leftq+lineq*2+1]
+  pxor                  m1, m1
   punpcklbw             m2, m1
-  punpcklbw             m6, m1
+  pshuflw               m7, m2, 0x55
   pshuflw               m2, m2, 0x0
-  pshuflw               m6, m6, 0x0
   punpcklqdq            m2, m2
-  punpcklqdq            m6, m6
-  paddw                 m7, m2, m0
-  paddw                 m8, m2, m3
-  paddw                 m9, m2, m4
-  paddw                 m2, m5
-  packuswb              m7, m8
-  packuswb              m9, m2
-  paddw                 m2, m6, m0
-  paddw                 m8, m6, m3
-  mova   [dstq           ], m7
-  paddw                 m7, m6, m4
-  paddw                 m6, m5
-  mova   [dstq        +16], m9
-  packuswb              m2, m8
-  packuswb              m7, m6
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m7
+  punpcklqdq            m7, m7
+  paddw                 m6, m2, m3
+  paddw                 m1, m2, m0
+  packuswb              m1, m6
+  mova   [dstq           ], m1
+  paddw                 m6, m2, m5
+  paddw                 m1, m2, m4
+  packuswb              m1, m6
+  mova   [dstq+16        ], m1
+  paddw                 m6, m7, m3
+  paddw                 m1, m7, m0
+  packuswb              m1, m6
+  mova   [dstq+strideq   ], m1
+  paddw                 m6, m7, m5
+  paddw                 m1, m7, m4
+  packuswb              m1, m6
+  mova   [dstq+strideq+16], m1
   lea                 dstq, [dstq+strideq*2]
   inc                lineq
   jnz .loop
   REP_RET
-%endif
diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm
index 88df9b2..5e0139f 100644
--- a/vpx_dsp/x86/intrapred_ssse3.asm
+++ b/vpx_dsp/x86/intrapred_ssse3.asm
@@ -13,7 +13,6 @@
 SECTION_RODATA
 
 pb_1: times 16 db 1
-sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
@@ -28,151 +27,9 @@ sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
 SECTION .text
 
-INIT_MMX ssse3
-cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 4
-  mov                lineq, -2
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  movd      [dstq        ], m1
-  movd      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_MMX ssse3
-cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 8
-  mov                lineq, -4
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  movq      [dstq        ], m1
-  movq      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM ssse3
-cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 16
-  mov                lineq, -8
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  mova      [dstq        ], m1
-  mova      [dstq+strideq], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_XMM ssse3
-cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  add                leftq, 32
-  mov                lineq, -16
-  pxor                  m0, m0
-.loop:
-  movd                  m1, [leftq+lineq*2  ]
-  movd                  m2, [leftq+lineq*2+1]
-  pshufb                m1, m0
-  pshufb                m2, m0
-  mova   [dstq           ], m1
-  mova   [dstq        +16], m1
-  mova   [dstq+strideq   ], m2
-  mova   [dstq+strideq+16], m2
-  lea                 dstq, [dstq+strideq*2]
-  inc                lineq
-  jnz .loop
-  REP_RET
-
-INIT_MMX ssse3
-cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pshufb              m1, m0, [GLOBAL(sh_b01234577)]
-  pshufb              m0, [GLOBAL(sh_b12345677)]
-  pavgb               m3, m2, m1
-  pxor                m2, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m0, 8
-  movd    [dstq        ], m0
-  psrlq               m0, 8
-  movd    [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_MMX ssse3
-cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
-  GET_GOT     goffsetq
-
-  movq                m0, [aboveq]
-  mova                m1, [GLOBAL(sh_b12345677)]
-  DEFINE_ARGS dst, stride, stride3
-  lea           stride3q, [strideq*3]
-  pshufb              m2, m0, [GLOBAL(sh_b23456777)]
-  pavgb               m3, m2, m0
-  pxor                m2, m0
-  pshufb              m0, m1
-  pand                m2, [GLOBAL(pb_1)]
-  psubb               m3, m2
-  pavgb               m0, m3
-
-  ; store 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-  pshufb              m0, m1
-  lea               dstq, [dstq+strideq*4]
-
-  ; store next 4 lines
-  movq  [dstq          ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq  ], m0
-  pshufb              m0, m1
-  movq  [dstq+strideq*2], m0
-  pshufb              m0, m1
-  movq  [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
   GET_GOT     goffsetq
@@ -789,28 +646,6 @@ cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
   RESTORE_GOT
   RET
 
-INIT_MMX ssse3
-cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
-  GET_GOT     goffsetq
-  movd                m0, [leftq]                ; abcd [byte]
-  pshufb              m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
-  pshufb              m3, m0, [GLOBAL(sh_b2333)] ; cddd
-
-  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
-  pavgb               m1, m0             ; ab, bc, cd, d [byte]
-
-  punpcklbw           m1, m2             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
-  movd    [dstq        ], m1
-  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
-  movd    [dstq+strideq], m1
-  lea               dstq, [dstq+strideq*2]
-  psrlq               m1, 16             ; cd, c3d, d, d
-  movd    [dstq        ], m1
-  pshufw              m1, m1, q1111      ; d, d, d, d
-  movd    [dstq+strideq], m1
-  RESTORE_GOT
-  RET
-
 INIT_XMM ssse3
 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
   GET_GOT     goffsetq
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index ae907fd..df5068c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -158,8 +158,8 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 4);
 
   dc_value = _mm_set1_epi16(a);
@@ -527,8 +527,8 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
   const __m128i zero = _mm_setzero_si128();
   int a;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 5);
 
   dc_value = _mm_set1_epi16(a);
@@ -1305,30 +1305,16 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
   const __m128i zero = _mm_setzero_si128();
   int a, i;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);
 
-  for (i = 0; i < 2; ++i) {
-    RECON_AND_STORE(dest +  0 * stride, dc_value);
-    RECON_AND_STORE(dest +  1 * stride, dc_value);
-    RECON_AND_STORE(dest +  2 * stride, dc_value);
-    RECON_AND_STORE(dest +  3 * stride, dc_value);
-    RECON_AND_STORE(dest +  4 * stride, dc_value);
-    RECON_AND_STORE(dest +  5 * stride, dc_value);
-    RECON_AND_STORE(dest +  6 * stride, dc_value);
-    RECON_AND_STORE(dest +  7 * stride, dc_value);
-    RECON_AND_STORE(dest +  8 * stride, dc_value);
-    RECON_AND_STORE(dest +  9 * stride, dc_value);
-    RECON_AND_STORE(dest + 10 * stride, dc_value);
-    RECON_AND_STORE(dest + 11 * stride, dc_value);
-    RECON_AND_STORE(dest + 12 * stride, dc_value);
-    RECON_AND_STORE(dest + 13 * stride, dc_value);
-    RECON_AND_STORE(dest + 14 * stride, dc_value);
-    RECON_AND_STORE(dest + 15 * stride, dc_value);
-    dest += 8;
+  for (i = 0; i < 16; ++i) {
+    RECON_AND_STORE(dest +  0, dc_value);
+    RECON_AND_STORE(dest +  8, dc_value);
+    dest += stride;
   }
 }
 
@@ -3476,8 +3462,8 @@ void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
   const __m128i zero = _mm_setzero_si128();
   int a, j;
 
-  a = dct_const_round_shift(input[0] * cospi_16_64);
-  a = dct_const_round_shift(a * cospi_16_64);
+  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+  a = (int)dct_const_round_shift(a * cospi_16_64);
   a = ROUND_POWER_OF_TWO(a, 6);
 
   dc_value = _mm_set1_epi16(a);
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 68e7fa4..20baf82 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -17,18 +17,70 @@
 SECTION_RODATA
 
 pw_11585x2: times 8 dw 23170
+
+pw_m2404x2:  times 8 dw  -2404*2
+pw_m4756x2:  times 8 dw  -4756*2
+pw_m5520x2:  times 8 dw  -5520*2
+pw_m8423x2:  times 8 dw  -8423*2
+pw_m9102x2:  times 8 dw  -9102*2
+pw_m10394x2: times 8 dw -10394*2
+pw_m11003x2: times 8 dw -11003*2
+
+pw_16364x2: times 8 dw 16364*2
+pw_16305x2: times 8 dw 16305*2
+pw_16207x2: times 8 dw 16207*2
+pw_16069x2: times 8 dw 16069*2
+pw_15893x2: times 8 dw 15893*2
+pw_15679x2: times 8 dw 15679*2
+pw_15426x2: times 8 dw 15426*2
+pw_15137x2: times 8 dw 15137*2
+pw_14811x2: times 8 dw 14811*2
+pw_14449x2: times 8 dw 14449*2
+pw_14053x2: times 8 dw 14053*2
+pw_13623x2: times 8 dw 13623*2
+pw_13160x2: times 8 dw 13160*2
+pw_12665x2: times 8 dw 12665*2
+pw_12140x2: times 8 dw 12140*2
+pw__9760x2: times 8 dw  9760*2
+pw__7723x2: times 8 dw  7723*2
+pw__7005x2: times 8 dw  7005*2
+pw__6270x2: times 8 dw  6270*2
+pw__3981x2: times 8 dw  3981*2
+pw__3196x2: times 8 dw  3196*2
+pw__1606x2: times 8 dw  1606*2
+pw___804x2: times 8 dw   804*2
+
 pd_8192:    times 4 dd 8192
+pw_32:      times 8 dw 32
 pw_16:      times 8 dw 16
 
 %macro TRANSFORM_COEFFS 2
 pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
 pw_m%2_%1:  dw -%2,  %1, -%2,  %1, -%2,  %1, -%2,  %1
+pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
 %endmacro
 
 TRANSFORM_COEFFS    6270, 15137
 TRANSFORM_COEFFS    3196, 16069
 TRANSFORM_COEFFS   13623,  9102
 
+; constants for 32x32_34
+TRANSFORM_COEFFS      804, 16364
+TRANSFORM_COEFFS    15426,  5520
+TRANSFORM_COEFFS     3981, 15893
+TRANSFORM_COEFFS    16207,  2404
+TRANSFORM_COEFFS     1606, 16305
+TRANSFORM_COEFFS    15679,  4756
+TRANSFORM_COEFFS    11585, 11585
+
+; constants for 32x32_1024
+TRANSFORM_COEFFS    12140, 11003
+TRANSFORM_COEFFS     7005, 14811
+TRANSFORM_COEFFS    14053,  8423
+TRANSFORM_COEFFS     9760, 13160
+TRANSFORM_COEFFS    12665, 10394
+TRANSFORM_COEFFS     7723, 14449
+
 %macro PAIR_PP_COEFFS 2
 dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
 %endmacro
@@ -80,6 +132,15 @@ SECTION .text
   packssdw           m%2, m%6
 %endmacro
 
+%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
+  punpckhwd          m%6, m%2, m%1
+  MUL_ADD_2X         %7,  %6,  %6,  %5, [pw_m%4_%3], [pw_m%3_m%4]
+  punpcklwd          m%2, m%1
+  MUL_ADD_2X         %1,  %2,  %2,  %5, [pw_m%4_%3], [pw_m%3_m%4]
+  packssdw           m%1, m%7
+  packssdw           m%2, m%6
+%endmacro
+
 ; matrix transpose
 %macro INTERLEAVE_2X 4
   punpckh%1          m%4, m%2, m%3
@@ -159,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova    m12, [pw_11585x2]
 
   lea      r3, [2 * strideq]
-
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m0, [inputq +   0]
+  packssdw m0, [inputq +  16]
+  mova     m1, [inputq +  32]
+  packssdw m1, [inputq +  48]
+  mova     m2, [inputq +  64]
+  packssdw m2, [inputq +  80]
+  mova     m3, [inputq +  96]
+  packssdw m3, [inputq + 112]
+  mova     m4, [inputq + 128]
+  packssdw m4, [inputq + 144]
+  mova     m5, [inputq + 160]
+  packssdw m5, [inputq + 176]
+  mova     m6, [inputq + 192]
+  packssdw m6, [inputq + 208]
+  mova     m7, [inputq + 224]
+  packssdw m7, [inputq + 240]
+%else
   mova     m0, [inputq +   0]
   mova     m1, [inputq +  16]
   mova     m2, [inputq +  32]
@@ -168,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
   mova     m5, [inputq +  80]
   mova     m6, [inputq +  96]
   mova     m7, [inputq + 112]
-
+%endif
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
   IDCT8_1D
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -193,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
 
   lea        r3, [2 * strideq]
 
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova       m0, [inputq +   0]
+  packssdw   m0, [inputq +  16]
+  mova       m1, [inputq +  32]
+  packssdw   m1, [inputq +  48]
+  mova       m2, [inputq +  64]
+  packssdw   m2, [inputq +  80]
+  mova       m3, [inputq +  96]
+  packssdw   m3, [inputq + 112]
+%else
   mova       m0, [inputq +  0]
   mova       m1, [inputq + 16]
   mova       m2, [inputq + 32]
   mova       m3, [inputq + 48]
+%endif
 
   punpcklwd  m0, m1
   punpcklwd  m2, m3
@@ -298,4 +387,1407 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
 
   RET
 
+%define  idx0 16 * 0
+%define  idx1 16 * 1
+%define  idx2 16 * 2
+%define  idx3 16 * 3
+%define  idx4 16 * 4
+%define  idx5 16 * 5
+%define  idx6 16 * 6
+%define  idx7 16 * 7
+%define  idx8 16 * 0
+%define  idx9 16 * 1
+%define idx10 16 * 2
+%define idx11 16 * 3
+%define idx12 16 * 4
+%define idx13 16 * 5
+%define idx14 16 * 6
+%define idx15 16 * 7
+%define idx16 16 * 0
+%define idx17 16 * 1
+%define idx18 16 * 2
+%define idx19 16 * 3
+%define idx20 16 * 4
+%define idx21 16 * 5
+%define idx22 16 * 6
+%define idx23 16 * 7
+%define idx24 16 * 0
+%define idx25 16 * 1
+%define idx26 16 * 2
+%define idx27 16 * 3
+%define idx28 16 * 4
+%define idx29 16 * 5
+%define idx30 16 * 6
+%define idx31 16 * 7
+
+; FROM idct32x32_add_neon.asm
+;
+; Instead of doing the transforms stage by stage, it is done by loading
+; some input values and doing as many stages as possible to minimize the
+; storing/loading of intermediate results. To fit within registers, the
+; final coefficients are cut into four blocks:
+; BLOCK A: 16-19,28-31
+; BLOCK B: 20-23,24-27
+; BLOCK C: 8-11,12-15
+; BLOCK D: 0-3,4-7
+; Blocks A and C are straight calculation through the various stages. In
+; block B, further calculations are performed using the results from
+; block A. In block D, further calculations are performed using the results
+; from block C and then the final calculations are done using results from
+; block A and B which have been combined at the end of block B.
+;
+
+%macro IDCT32X32_34 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, m1
+  pmulhrsw             m1, [pw___804x2] ; stp1_16
+  mova      [r4 +      0], m0
+  pmulhrsw            m11, [pw_16364x2] ; stp2_31
+  mova      [r4 + 16 * 2], m2
+  mova                m12, m7
+  pmulhrsw             m7, [pw_15426x2] ; stp1_28
+  mova      [r4 + 16 * 4], m4
+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
+  mova      [r4 + 16 * 6], m6
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m2, m1   ; stp1_16
+  mova                 m0, m11  ; stp1_31
+  mova                 m4, m7   ; stp1_28
+  mova                m15, m12  ; stp1_19
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,    15,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,    15,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m6, m5
+  pmulhrsw             m5, [pw__3981x2] ; stp1_20
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m15
+  pmulhrsw             m6, [pw_15893x2] ; stp2_27
+  mova [stp + %4 + idx30], m2
+  mova                 m2, m3
+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
+  mova [stp + %4 + idx31], m11
+  pmulhrsw             m2, [pw_16207x2] ; stp2_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m13, m5 ; stp1_20
+  mova                m14, m6 ; stp1_27
+  mova                m15, m3 ; stp1_23
+  mova                m11, m2 ; stp1_24
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm       11,    15,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB              15, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB              11, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               0, 15, 9 ; stp2_17, stp2_22
+  SUM_SUB               4, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB               7,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m10, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              10, 11, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m10
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5, 9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB              11, 15,  9
+  pmulhrsw            m11, m10  ; stp1_25
+  pmulhrsw            m15, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_24
+  pmulhrsw             m3, m10  ; stp1_23
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP 6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X         11,    15,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP 11, 15
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP 2, 3
+%endif
+
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m11
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__1606x2] ; stp1_8
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  pmulhrsw             m1, [pw_16305x2] ; stp2_15
+  mova [stp + %3 + idx22], m15
+  mova                 m7, m6
+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
+  mova [stp + %3 + idx23], m3
+  pmulhrsw             m6, [pw_15679x2] ; stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m3, m0 ; stp1_8
+  mova                 m2, m1 ; stp1_15
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  mova                 m4, m7 ; stp1_11
+  mova                 m5, m6 ; stp1_12
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,  4, 9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,  7, 9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP 5, 4
+  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP 6, 7
+%endif
+
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, m11
+  pmulhrsw            m11, [pw__3196x2] ; stp1_4
+  pmulhrsw            m12, [pw_16069x2] ; stp1_7
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                m10, [pw_11585x2]
+  pmulhrsw             m0, m10  ; stp1_1
+
+  mova                m14, m11 ; stp1_4
+  mova                m13, m12 ; stp1_7
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  mova                 m7, m0 ; stp1_0 = stp1_1
+  mova                 m4, m0 ; stp1_1
+  mova                 m2, m7 ; stp1_0
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               7, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               4, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  1, 9 ;  stp1_0, stp1_15
+  SUM_SUB               7,  3, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               4,  6, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m15, [stp + %4 + idx30]
+  mova                m10, [stp + %4 + idx31]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               7, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m7
+  mova [stp + %4 + idx30], m15
+  mova [stp + %4 + idx31], m10
+  mova                 m7, [stp + %4 + idx28]
+  mova                 m0, [stp + %4 + idx29]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               4,  7, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m4
+  mova [stp + %4 + idx28], m7
+  mova [stp + %4 + idx29], m0
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m4, [stp + %3 + idx19]
+  SUM_SUB               1,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               3,  7, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               6,  4, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m4
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m0, [stp + %4 + idx27]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m3, [stp + %4 + idx24]
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  mova [stp + %4 + idx27], m0
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx24], m3
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+%macro RECON_AND_STORE 1
+  mova            m11, [pw_32]
+  lea             stp, [rsp + %1]
+  mov              r6, 32
+  pxor             m8, m8
+%%recon_and_store:
+  mova             m0, [stp + 16 * 32 * 0]
+  mova             m1, [stp + 16 * 32 * 1]
+  mova             m2, [stp + 16 * 32 * 2]
+  mova             m3, [stp + 16 * 32 * 3]
+  add             stp, 16
+
+  paddw            m0, m11
+  paddw            m1, m11
+  paddw            m2, m11
+  paddw            m3, m11
+  psraw            m0, 6
+  psraw            m1, 6
+  psraw            m2, 6
+  psraw            m3, 6
+  movh             m4, [outputq +  0]
+  movh             m5, [outputq +  8]
+  movh             m6, [outputq + 16]
+  movh             m7, [outputq + 24]
+  punpcklbw        m4, m8
+  punpcklbw        m5, m8
+  punpcklbw        m6, m8
+  punpcklbw        m7, m8
+  paddw            m0, m4
+  paddw            m1, m5
+  paddw            m2, m6
+  paddw            m3, m7
+  packuswb         m0, m1
+  packuswb         m2, m3
+  mova [outputq +  0], m0
+  mova [outputq + 16], m2
+  lea         outputq, [outputq + strideq]
+  dec              r6
+  jnz %%recon_and_store
+%endmacro
+
+%define i32x32_size     16*32*5
+%define pass_two_start  16*32*0
+%define transposed_in   16*32*4
+%define pass_one_start  16*32*0
+%define stp r8
+
+INIT_XMM ssse3
+cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_34:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+
+idct32x32_34_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova            m0, [r3 +       0]
+  packssdw        m0, [r3 +      16]
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  IDCT32X32_34  16*0, 16*32, 16*64, 16*96
+  lea            stp, [stp + 16 * 8]
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_34_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+
+idct32x32_34_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  IDCT32X32_34  16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_34_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
+
+%macro IDCT32X32_135 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, m1
+  pmulhrsw             m1, [pw___804x2] ; stp1_16
+  pmulhrsw            m11, [pw_16364x2] ; stp2_31
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, m7
+  pmulhrsw             m7, [pw_15426x2] ; stp1_28
+  pmulhrsw            m12, [pw_m5520x2] ; stp2_19
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, m3
+  pmulhrsw             m3, [pw__7005x2] ; stp1_18
+  pmulhrsw             m4, [pw_14811x2] ; stp2_29
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, m0
+  pmulhrsw             m0, [pw_12140x2]  ; stp1_30
+  pmulhrsw             m2, [pw_m11003x2] ; stp2_17
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m2404x2] ; stp1_23
+  pmulhrsw             m2, [pw_16207x2] ; stp2_24
+
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, m5
+  pmulhrsw             m5, [pw__3981x2] ; stp1_20
+  pmulhrsw             m6, [pw_15893x2] ; stp2_27
+
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  mova                m13, m14
+  pmulhrsw            m13, [pw_m8423x2] ; stp1_21
+  pmulhrsw            m14, [pw_14053x2] ; stp2_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__9760x2] ; stp1_22
+  pmulhrsw             m1, [pw_13160x2] ; stp2_25
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_25
+  pmulhrsw             m3, m10  ; stp1_22
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, m0
+  pmulhrsw             m0, [pw__1606x2] ; stp1_8
+  pmulhrsw             m1, [pw_16305x2] ; stp2_15
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, m6
+  pmulhrsw             m7, [pw_m4756x2] ; stp2_11
+  pmulhrsw             m6, [pw_15679x2] ; stp1_12
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, m4
+  pmulhrsw             m4, [pw__7723x2] ; stp1_10
+  pmulhrsw             m5, [pw_14449x2] ; stp2_13
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, m2
+  pmulhrsw             m3, [pw_m10394x2] ; stp1_9
+  pmulhrsw             m2, [pw_12665x2] ; stp2_14
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, m11
+  pmulhrsw            m11, [pw__3196x2] ; stp1_4
+  pmulhrsw            m12, [pw_16069x2] ; stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, m13
+  pmulhrsw            m13, [pw_13623x2] ; stp1_6
+  pmulhrsw            m14, [pw_m9102x2] ; stp1_5
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  pmulhrsw             m0, [pw_11585x2]  ; stp1_1
+  mova                 m3, m2
+  pmulhrsw             m2, [pw__6270x2]  ; stp1_2
+  pmulhrsw             m3, [pw_15137x2]  ; stp1_3
+
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  mova                 m1, m0    ; stp1_0 = stp1_1
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  mov             r6, 2
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_135:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 2
+
+idct32x32_135_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova            m0, [r3 +       0]
+  packssdw        m0, [r3 +      16]
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+%if CONFIG_VP9_HIGHBITDEPTH
+  add             r3, 32
+%else
+  add             r3, 16
+%endif
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_135_transpose
+
+  IDCT32X32_135 16*0, 16*32, 16*64, 16*96
+  lea            stp, [stp + 16 * 8]
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]
+%else
+  lea         inputq, [inputq + 16 * 32]
+%endif
+  dec             r6
+  jnz idct32x32_135
+
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_135_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 2
+
+idct32x32_135_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_135_transpose_2
+
+  IDCT32X32_135 16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_135_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
+
+%macro IDCT32X32_1024 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, [rsp + transposed_in + 16 * 31]
+  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, [rsp + transposed_in + 16 * 17]
+  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, [rsp + transposed_in + 16 * 25]
+  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, [rsp + transposed_in + 16 * 23]
+  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, [rsp + transposed_in + 16 * 27]
+  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
+
+  mova                m13, [rsp + transposed_in + 16 * 21]
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, [rsp + transposed_in + 16 * 19]
+  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
+
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, [rsp + transposed_in + 16 * 29]
+  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_25
+  pmulhrsw             m3, m10  ; stp1_22
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, [rsp + transposed_in + 16 * 30]
+  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, [rsp + transposed_in + 16 * 18]
+  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, [rsp + transposed_in + 16 * 22]
+  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, [rsp + transposed_in + 16 * 26]
+  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X       5,     4,  11585,  11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X       6,     7,  11585,  11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, [rsp + transposed_in + 16 * 28]
+  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, [rsp + transposed_in + 16 * 20]
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m1, [rsp + transposed_in + 16 * 16]
+
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               0,    1,  9
+  pmulhrsw             m0, m10  ; stp1_1
+  pmulhrsw             m1, m10  ; stp1_0
+%else
+  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
+  SWAP  0, 1
+%endif
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  mova                 m3, [rsp + transposed_in + 16 * 24]
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
+
+  mova                m10, [pw_11585x2]
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_1024:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 4
+
+idct32x32_1024_transpose:
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova            m0, [r3 +       0]
+  packssdw        m0, [r3 +      16]
+  mova            m1, [r3 + 32 *  4]
+  packssdw        m1, [r3 + 32 *  4 + 16]
+  mova            m2, [r3 + 32 *  8]
+  packssdw        m2, [r3 + 32 *  8 + 16]
+  mova            m3, [r3 + 32 * 12]
+  packssdw        m3, [r3 + 32 * 12 + 16]
+  mova            m4, [r3 + 32 * 16]
+  packssdw        m4, [r3 + 32 * 16 + 16]
+  mova            m5, [r3 + 32 * 20]
+  packssdw        m5, [r3 + 32 * 20 + 16]
+  mova            m6, [r3 + 32 * 24]
+  packssdw        m6, [r3 + 32 * 24 + 16]
+  mova            m7, [r3 + 32 * 28]
+  packssdw        m7, [r3 + 32 * 28 + 16]
+%else
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+%endif
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+%if CONFIG_VP9_HIGHBITDEPTH
+  add             r3, 32
+%else
+  add             r3, 16
+%endif
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose
+
+  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
+
+  lea            stp, [stp + 16 * 8]
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea         inputq, [inputq + 32 * 32]
+%else
+  lea         inputq, [inputq + 16 * 32]
+%endif
+  dec             r6
+  jnz idct32x32_1024
+
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_1024_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 4
+
+idct32x32_1024_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose_2
+
+  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_1024_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
 %endif
diff --git a/vpx_dsp/x86/inv_wht_sse2.asm b/vpx_dsp/x86/inv_wht_sse2.asm
index df6f469..fbbcd76 100644
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -82,9 +82,15 @@ SECTION .text
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova            m0,        [inputq +  0]
+  packssdw        m0,        [inputq + 16]
+  mova            m1,        [inputq + 32]
+  packssdw        m1,        [inputq + 48]
+%else
   mova            m0,        [inputq +  0]
   mova            m1,        [inputq + 16]
-
+%endif
   psraw           m0,        2
   psraw           m1,        2
 
diff --git a/vpx_dsp/x86/loopfilter_avx2.c b/vpx_dsp/x86/loopfilter_avx2.c
index 23a97dd..be1087c 100644
--- a/vpx_dsp/x86/loopfilter_avx2.c
+++ b/vpx_dsp/x86/loopfilter_avx2.c
@@ -13,9 +13,10 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_8_avx2(unsigned char *s, int p,
+                                    const unsigned char *_blimit,
+                                    const unsigned char *_limit,
+                                    const unsigned char *_thresh) {
     __m128i mask, hev, flat, flat2;
     const __m128i zero = _mm_set1_epi16(0);
     const __m128i one = _mm_set1_epi8(1);
@@ -400,9 +401,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_16_avx2(unsigned char *s, int p,
+                                     const unsigned char *_blimit,
+                                     const unsigned char *_limit,
+                                     const unsigned char *_thresh) {
     __m128i mask, hev, flat, flat2;
     const __m128i zero = _mm_set1_epi16(0);
     const __m128i one = _mm_set1_epi8(1);
@@ -975,12 +977,3 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
         _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
     }
 }
-
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
-        const unsigned char *_blimit, const unsigned char *_limit,
-        const unsigned char *_thresh, int count) {
-    if (count == 1)
-        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
-    else
-        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
-}
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
deleted file mode 100644
index b9c18b6..0000000
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ /dev/null
@@ -1,611 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vpx_lpf_horizontal_4_mmx
-;(
-;    unsigned char *src_ptr,
-;    int src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int  count
-;)
-global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
-sym(vpx_lpf_horizontal_4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        movsxd      rcx, dword ptr arg(5) ;count
-.next8_h:
-        mov         rdx, arg(3) ;limit
-        movq        mm7, [rdx]
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-
-        ; calculate breakout conditions
-        movq        mm2, [rdi+2*rax]      ; q3
-        movq        mm1, [rsi+2*rax]      ; q2
-        movq        mm6, mm1              ; q2
-        psubusb     mm1, mm2              ; q2-=q3
-        psubusb     mm2, mm6              ; q3-=q2
-        por         mm1, mm2              ; abs(q3-q2)
-        psubusb     mm1, mm7              ;
-
-
-        movq        mm4, [rsi+rax]        ; q1
-        movq        mm3, mm4              ; q1
-        psubusb     mm4, mm6              ; q1-=q2
-        psubusb     mm6, mm3              ; q2-=q1
-        por         mm4, mm6              ; abs(q2-q1)
-
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        psubusb     mm4, mm3              ; q0-=q1
-        psubusb     mm3, mm0              ; q1-=q0
-        por         mm4, mm3              ; abs(q0-q1)
-        movq        t0, mm4               ; save to t0
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        neg         rax                   ; negate pitch to deal with above border
-
-        movq        mm2, [rsi+4*rax]      ; p3
-        movq        mm4, [rdi+4*rax]      ; p2
-        movq        mm5, mm4              ; p2
-        psubusb     mm4, mm2              ; p2-=p3
-        psubusb     mm2, mm5              ; p3-=p2
-        por         mm4, mm2              ; abs(p3 - p2)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-
-        movq        mm4, [rsi+2*rax]      ; p1
-        movq        mm3, mm4              ; p1
-        psubusb     mm4, mm5              ; p1-=p2
-        psubusb     mm5, mm3              ; p2-=p1
-        por         mm4, mm5              ; abs(p2 - p1)
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm2, mm3              ; p1
-
-        movq        mm4, [rsi+rax]        ; p0
-        movq        mm5, mm4              ; p0
-        psubusb     mm4, mm3              ; p0-=p1
-        psubusb     mm3, mm5              ; p1-=p0
-        por         mm4, mm3              ; abs(p1 - p0)
-        movq        t1, mm4               ; save to t1
-        psubusb     mm4, mm7
-        por        mm1, mm4
-
-        movq        mm3, [rdi]            ; q1
-        movq        mm4, mm3              ; q1
-        psubusb     mm3, mm2              ; q1-=p1
-        psubusb     mm2, mm4              ; p1-=q1
-        por         mm2, mm3              ; abs(p1-q1)
-        pand        mm2, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm2, 1                ; abs(p1-q1)/2
-
-        movq        mm6, mm5              ; p0
-        movq        mm3, [rsi]            ; q0
-        psubusb     mm5, mm3              ; p0-=q0
-        psubusb     mm3, mm6              ; q0-=p0
-        por         mm5, mm3              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm7, [rdx]            ; blimit
-
-        psubusb     mm5,    mm7           ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,    mm5
-        pxor        mm5,    mm5
-        pcmpeqb     mm1,    mm5           ; mask mm1
-
-        ; calculate high edge variance
-        mov         rdx, arg(4) ;thresh           ; get thresh
-        movq        mm7, [rdx]            ;
-        movq        mm4, t0               ; get abs (q1 - q0)
-        psubusb     mm4, mm7
-        movq        mm3, t1               ; get abs (p1 - p0)
-        psubusb     mm3, mm7
-        paddb       mm4, mm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     mm4,        mm5
-
-        pcmpeqb     mm5,        mm5
-        pxor        mm4,        mm5
-
-
-        ; start work on filters
-        movq        mm2, [rsi+2*rax]      ; p1
-        movq        mm7, [rdi]            ; q1
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-        pand        mm2, mm4              ; high var mask (hvm)(p1 - q1)
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2, mm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand        mm1, mm2                  ; mask filter values we don't care about
-        movq        mm2, mm1
-        paddsb      mm1, [GLOBAL(t4)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      mm2, [GLOBAL(t3)]     ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        pxor        mm0, mm0             ;
-        pxor        mm5, mm5
-        punpcklbw   mm0, mm2            ;
-        punpckhbw   mm5, mm2            ;
-        psraw       mm0, 11             ;
-        psraw       mm5, 11
-        packsswb    mm0, mm5
-        movq        mm2, mm0            ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0, mm0              ; 0
-        movq        mm5, mm1              ; abcdefgh
-        punpcklbw   mm0, mm1              ; e0f0g0h0
-        psraw       mm0, 11               ; sign extended shift right by 3
-        pxor        mm1, mm1              ; 0
-        punpckhbw   mm1, mm5              ; a0b0c0d0
-        psraw       mm1, 11               ; sign extended shift right by 3
-        movq        mm5, mm0              ; save results
-
-        packsswb    mm0, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5, [GLOBAL(ones)]
-        paddsw      mm1, [GLOBAL(ones)]
-        psraw       mm5, 1                ; partial shifted one more time for 2nd tap
-        psraw       mm1, 1                ; partial shifted one more time for 2nd tap
-        packsswb    mm5, mm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-        pandn       mm4, mm5              ; high edge variance additive
-
-        paddsb      mm6, mm2              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        movq        mm6, [rsi+2*rax]      ; p1
-        pxor        mm6, [GLOBAL(t80)]    ; reoffset
-        paddsb      mm6, mm4              ; p1+= p1 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+2*rax], mm6      ; write back
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-        psubsb      mm7, mm4              ; q1-= q1 add
-        pxor        mm7, [GLOBAL(t80)]    ; unoffset
-        movq        [rdi], mm7            ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .next8_h
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_lpf_vertical_4_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;    const char *limit,
-;    const char *thresh,
-;    int count
-;)
-global sym(vpx_lpf_vertical_4_mmx) PRIVATE
-sym(vpx_lpf_vertical_4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 64      ; reserve 64 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rax,        dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi + rax*4 - 4]
-
-        movsxd      rcx,        dword ptr arg(5) ;count
-.next8_v:
-        mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
-        add         rdi,        rax
-
-
-        ;transpose
-        movq        mm6,        [rsi+2*rax]                 ; 67 66 65 64 63 62 61 60
-        movq        mm7,        mm6                         ; 77 76 75 74 73 72 71 70
-
-        punpckhbw   mm7,        [rdi+2*rax]                 ; 77 67 76 66 75 65 74 64
-        punpcklbw   mm6,        [rdi+2*rax]                 ; 73 63 72 62 71 61 70 60
-
-        movq        mm4,        [rsi]                       ; 47 46 45 44 43 42 41 40
-        movq        mm5,        mm4                         ; 47 46 45 44 43 42 41 40
-
-        punpckhbw   mm5,        [rsi+rax]                   ; 57 47 56 46 55 45 54 44
-        punpcklbw   mm4,        [rsi+rax]                   ; 53 43 52 42 51 41 50 40
-
-        movq        mm3,        mm5                         ; 57 47 56 46 55 45 54 44
-        punpckhwd   mm5,        mm7                         ; 77 67 57 47 76 66 56 46
-
-        punpcklwd   mm3,        mm7                         ; 75 65 55 45 74 64 54 44
-        movq        mm2,        mm4                         ; 53 43 52 42 51 41 50 40
-
-        punpckhwd   mm4,        mm6                         ; 73 63 53 43 72 62 52 42
-        punpcklwd   mm2,        mm6                         ; 71 61 51 41 70 60 50 40
-
-        neg         rax
-        movq        mm6,        [rsi+rax*2]                 ; 27 26 25 24 23 22 21 20
-
-        movq        mm1,        mm6                         ; 27 26 25 24 23 22 21 20
-        punpckhbw   mm6,        [rsi+rax]                   ; 37 27 36 36 35 25 34 24
-
-        punpcklbw   mm1,        [rsi+rax]                   ; 33 23 32 22 31 21 30 20
-        movq        mm7,        [rsi+rax*4];                ; 07 06 05 04 03 02 01 00
-
-        punpckhbw   mm7,        [rdi+rax*4]                 ; 17 07 16 06 15 05 14 04
-        movq        mm0,        mm7                         ; 17 07 16 06 15 05 14 04
-
-        punpckhwd   mm7,        mm6                         ; 37 27 17 07 36 26 16 06
-        punpcklwd   mm0,        mm6                         ; 35 25 15 05 34 24 14 04
-
-        movq        mm6,        mm7                         ; 37 27 17 07 36 26 16 06
-        punpckhdq   mm7,        mm5                         ; 77 67 57 47 37 27 17 07  = q3
-
-        punpckldq   mm6,        mm5                         ; 76 66 56 46 36 26 16 06  = q2
-
-        movq        mm5,        mm6                         ; 76 66 56 46 36 26 16 06
-        psubusb     mm5,        mm7                         ; q2-q3
-
-        psubusb     mm7,        mm6                         ; q3-q2
-        por         mm7,        mm5;                        ; mm7=abs (q3-q2)
-
-        movq        mm5,        mm0                         ; 35 25 15 05 34 24 14 04
-        punpckhdq   mm5,        mm3                         ; 75 65 55 45 35 25 15 05 = q1
-
-        punpckldq   mm0,        mm3                         ; 74 64 54 44 34 24 15 04 = q0
-        movq        mm3,        mm5                         ; 75 65 55 45 35 25 15 05 = q1
-
-        psubusb     mm3,        mm6                         ; q1-q2
-        psubusb     mm6,        mm5                         ; q2-q1
-
-        por         mm6,        mm3                         ; mm6=abs(q2-q1)
-        lea         rdx,        srct
-
-        movq        [rdx+24],   mm5                         ; save q1
-        movq        [rdx+16],   mm0                         ; save q0
-
-        movq        mm3,        [rsi+rax*4]                 ; 07 06 05 04 03 02 01 00
-        punpcklbw   mm3,        [rdi+rax*4]                 ; 13 03 12 02 11 01 10 00
-
-        movq        mm0,        mm3                         ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm1                         ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm3,        mm1                         ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                         ; 31 21 11 01 30 20 10 00
-
-        punpckldq   mm0,        mm2                         ; 70 60 50 40 30 20 10 00  =p3
-        punpckhdq   mm1,        mm2                         ; 71 61 51 41 31 21 11 01  =p2
-
-        movq        mm2,        mm1                         ; 71 61 51 41 31 21 11 01  =p2
-        psubusb     mm2,        mm0                         ; p2-p3
-
-        psubusb     mm0,        mm1                         ; p3-p2
-        por         mm0,        mm2                         ; mm0=abs(p3-p2)
-
-        movq        mm2,        mm3                         ; 33 23 13 03 32 22 12 02
-        punpckldq   mm2,        mm4                         ; 72 62 52 42 32 22 12 02 = p1
-
-        punpckhdq   mm3,        mm4                         ; 73 63 53 43 33 23 13 03 = p0
-        movq        [rdx+8],    mm3                         ; save p0
-
-        movq        [rdx],      mm2                         ; save p1
-        movq        mm5,        mm2                         ; mm5 = p1
-
-        psubusb     mm2,        mm1                         ; p1-p2
-        psubusb     mm1,        mm5                         ; p2-p1
-
-        por         mm1,        mm2                         ; mm1=abs(p2-p1)
-        mov         rdx,        arg(3) ;limit
-
-        movq        mm4,        [rdx]                       ; mm4 = limit
-        psubusb     mm7,        mm4
-
-        psubusb     mm0,        mm4
-        psubusb     mm1,        mm4
-
-        psubusb     mm6,        mm4
-        por         mm7,        mm6
-
-        por         mm0,        mm1
-        por         mm0,        mm7                         ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
-        movq        mm1,        mm5                         ; p1
-
-        movq        mm7,        mm3                         ; mm3=mm7=p0
-        psubusb     mm7,        mm5                         ; p0 - p1
-
-        psubusb     mm5,        mm3                         ; p1 - p0
-        por         mm5,        mm7                         ; abs(p1-p0)
-
-        movq        t0,         mm5                         ; save abs(p1-p0)
-        lea         rdx,        srct
-
-        psubusb     mm5,        mm4
-        por         mm0,        mm5                         ; mm0=mask
-
-        movq        mm5,        [rdx+16]                    ; mm5=q0
-        movq        mm7,        [rdx+24]                    ; mm7=q1
-
-        movq        mm6,        mm5                         ; mm6=q0
-        movq        mm2,        mm7                         ; q1
-        psubusb     mm5,        mm7                         ; q0-q1
-
-        psubusb     mm7,        mm6                         ; q1-q0
-        por         mm7,        mm5                         ; abs(q1-q0)
-
-        movq        t1,         mm7                         ; save abs(q1-q0)
-        psubusb     mm7,        mm4
-
-        por         mm0,        mm7                         ; mask
-
-        movq        mm5,        mm2                         ; q1
-        psubusb     mm5,        mm1                         ; q1-=p1
-        psubusb     mm1,        mm2                         ; p1-=q1
-        por         mm5,        mm1                         ; abs(p1-q1)
-        pand        mm5,        [GLOBAL(tfe)]               ; set lsb of each byte to zero
-        psrlw       mm5,        1                           ; abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                      ;
-
-        movq        mm4,        [rdx]                       ;blimit
-        movq        mm1,        mm3                         ; mm1=mm3=p0
-
-        movq        mm7,        mm6                         ; mm7=mm6=q0
-        psubusb     mm1,        mm7                         ; p0-q0
-
-        psubusb     mm7,        mm3                         ; q0-p0
-        por         mm1,        mm7                         ; abs(q0-p0)
-        paddusb     mm1,        mm1                         ; abs(q0-p0)*2
-        paddusb     mm1,        mm5                         ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm1,        mm4                         ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         mm1,        mm0;                        ; mask
-
-        pxor        mm0,        mm0
-        pcmpeqb     mm1,        mm0
-
-        ; calculate high edge variance
-        mov         rdx,        arg(4) ;thresh            ; get thresh
-        movq        mm7,        [rdx]
-        ;
-        movq        mm4,        t0              ; get abs (q1 - q0)
-        psubusb     mm4,        mm7
-
-        movq        mm3,        t1              ; get abs (p1 - p0)
-        psubusb     mm3,        mm7
-
-        por         mm4,        mm3             ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-        pcmpeqb     mm4,        mm0
-
-        pcmpeqb     mm0,        mm0
-        pxor        mm4,        mm0
-
-
-
-        ; start work on filters
-        lea         rdx,        srct
-
-        movq        mm2,        [rdx]           ; p1
-        movq        mm7,        [rdx+24]        ; q1
-
-        movq        mm6,        [rdx+8]         ; p0
-        movq        mm0,        [rdx+16]        ; q0
-
-        pxor        mm2,        [GLOBAL(t80)]   ; p1 offset to convert to signed values
-        pxor        mm7,        [GLOBAL(t80)]   ; q1 offset to convert to signed values
-
-        psubsb      mm2,        mm7             ; p1 - q1
-        pand        mm2,        mm4             ; high var mask (hvm)(p1 - q1)
-
-        pxor        mm6,        [GLOBAL(t80)]   ; offset to convert to signed values
-        pxor        mm0,        [GLOBAL(t80)]   ; offset to convert to signed values
-
-        movq        mm3,        mm0             ; q0
-        psubsb      mm0,        mm6             ; q0 - p0
-
-        paddsb      mm2,        mm0             ; 1 * (q0 - p0) + hvm(p1 - q1)
-        paddsb      mm2,        mm0             ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      mm2,        mm0             ; 3 * (q0 - p0) + hvm(p1 - q1)
-        pand       mm1,        mm2              ; mask filter values we don't care about
-
-        movq        mm2,        mm1
-        paddsb      mm1,        [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
-        paddsb      mm2,        [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-        pxor        mm0,        mm0          ;
-
-        pxor        mm5,        mm5
-        punpcklbw   mm0,        mm2         ;
-
-        punpckhbw   mm5,        mm2         ;
-        psraw       mm0,        11              ;
-
-        psraw       mm5,        11
-        packsswb    mm0,        mm5
-
-        movq        mm2,        mm0         ;  (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
-        pxor        mm0,        mm0           ; 0
-        movq        mm5,        mm1           ; abcdefgh
-
-        punpcklbw   mm0,        mm1           ; e0f0g0h0
-        psraw       mm0,        11                ; sign extended shift right by 3
-
-        pxor        mm1,        mm1           ; 0
-        punpckhbw   mm1,        mm5           ; a0b0c0d0
-
-        psraw       mm1,        11                ; sign extended shift right by 3
-        movq        mm5,        mm0              ; save results
-
-        packsswb    mm0,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      mm5,        [GLOBAL(ones)]
-
-        paddsw      mm1,        [GLOBAL(ones)]
-        psraw       mm5,        1                 ; partial shifted one more time for 2nd tap
-
-        psraw       mm1,        1                 ; partial shifted one more time for 2nd tap
-        packsswb    mm5,        mm1           ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-        pandn       mm4,        mm5             ; high edge variance additive
-
-        paddsb      mm6,        mm2             ; p0+= p0 add
-        pxor        mm6,        [GLOBAL(t80)]   ; unoffset
-
-        ; mm6=p0                               ;
-        movq        mm1,        [rdx]           ; p1
-        pxor        mm1,        [GLOBAL(t80)]   ; reoffset
-
-        paddsb      mm1,        mm4                 ; p1+= p1 add
-        pxor        mm1,        [GLOBAL(t80)]       ; unoffset
-        ; mm6 = p0 mm1 = p1
-
-        psubsb      mm3,        mm0                 ; q0-= q0 add
-        pxor        mm3,        [GLOBAL(t80)]       ; unoffset
-
-        ; mm3 = q0
-        psubsb      mm7,        mm4                 ; q1-= q1 add
-        pxor        mm7,        [GLOBAL(t80)]       ; unoffset
-        ; mm7 = q1
-
-        ; transpose and write back
-        ; mm1 =    72 62 52 42 32 22 12 02
-        ; mm6 =    73 63 53 43 33 23 13 03
-        ; mm3 =    74 64 54 44 34 24 14 04
-        ; mm7 =    75 65 55 45 35 25 15 05
-
-        movq        mm2,        mm1             ; 72 62 52 42 32 22 12 02
-        punpcklbw   mm2,        mm6             ; 33 32 23 22 13 12 03 02
-
-        movq        mm4,        mm3             ; 74 64 54 44 34 24 14 04
-        punpckhbw   mm1,        mm6             ; 73 72 63 62 53 52 43 42
-
-        punpcklbw   mm4,        mm7             ; 35 34 25 24 15 14 05 04
-        punpckhbw   mm3,        mm7             ; 75 74 65 64 55 54 45 44
-
-        movq        mm6,        mm2             ; 33 32 23 22 13 12 03 02
-        punpcklwd   mm2,        mm4             ; 15 14 13 12 05 04 03 02
-
-        punpckhwd   mm6,        mm4             ; 35 34 33 32 25 24 23 22
-        movq        mm5,        mm1             ; 73 72 63 62 53 52 43 42
-
-        punpcklwd   mm1,        mm3             ; 55 54 53 52 45 44 43 42
-        punpckhwd   mm5,        mm3             ; 75 74 73 72 65 64 63 62
-
-
-        ; mm2 = 15 14 13 12 05 04 03 02
-        ; mm6 = 35 34 33 32 25 24 23 22
-        ; mm5 = 55 54 53 52 45 44 43 42
-        ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
-        movd        [rsi+rax*4+2], mm2
-        psrlq       mm2,        32
-
-        movd        [rdi+rax*4+2], mm2
-        movd        [rsi+rax*2+2], mm6
-
-        psrlq       mm6,        32
-        movd        [rsi+rax+2],mm6
-
-        movd        [rsi+2],    mm1
-        psrlq       mm1,        32
-
-        movd        [rdi+2],    mm1
-        neg         rax
-
-        movd        [rdi+rax+2],mm5
-        psrlq       mm5,        32
-
-        movd        [rdi+rax*2+2], mm5
-
-        lea         rsi,        [rsi+rax*8]
-        dec         rcx
-        jnz         .next8_v
-
-    add rsp, 64
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-tfe:
-    times 8 db 0xfe
-align 16
-t80:
-    times 8 db 0x80
-align 16
-t3:
-    times 8 db 0x03
-align 16
-t4:
-    times 8 db 0x04
-align 16
-ones:
-    times 4 dw 0x0001
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index ed10127..739adf3 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -18,11 +18,216 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
-static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
-                                            int p,
-                                            const unsigned char *_blimit,
-                                            const unsigned char *_limit,
-                                            const unsigned char *_thresh) {
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK do {                                                   \
+  /* (abs(q1 - q0), abs(p1 - p0) */                                            \
+  __m128i flat = abs_diff(q1p1, q0p0);                                         \
+  /* abs(p1 - q1), abs(p0 - q0) */                                             \
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                           \
+  __m128i abs_p0q0, abs_p1q1, work;                                            \
+                                                                               \
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */          \
+  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero);  \
+  hev = _mm_cmpgt_epi16(hev, thresh);                                          \
+  hev = _mm_packs_epi16(hev, hev);                                             \
+                                                                               \
+  /* const int8_t mask = filter_mask(*limit, *blimit, */                       \
+  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */       \
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p0 - q0) * 2 */\
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p1 - q1) */\
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                      \
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1);  /* abs(p1 - q1) / 2 */      \
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                    \
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                    \
+  /* abs(p3 - p2), abs(p2 - p1) */                                             \
+  work = abs_diff(p3p2, p2p1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  /* abs(q3 - q2), abs(q2 - q1) */                                             \
+  work = abs_diff(q3q2, q2q1);                                                 \
+  flat = _mm_max_epu8(work, flat);                                             \
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                          \
+  mask = _mm_unpacklo_epi64(mask, flat);                                       \
+  mask = _mm_subs_epu8(mask, limit);                                           \
+  mask = _mm_cmpeq_epi8(mask, zero);                                           \
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                         \
+} while (0)
+
+#define FILTER4 do {                                                           \
+  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3,                    \
+                                    4, 4, 4, 4, 4, 4, 4, 4);                   \
+  const __m128i t80 = _mm_set1_epi8(0x80);                                     \
+  __m128i filter, filter2filter1, work;                                        \
+                                                                               \
+  ps1ps0 = _mm_xor_si128(p1p0, t80);  /* ^ 0x80 */                             \
+  qs1qs0 = _mm_xor_si128(q1q0, t80);                                           \
+                                                                               \
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                    \
+  work = _mm_subs_epi8(ps1ps0, qs1qs0);                                        \
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                        \
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */           \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);                                        \
+  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */               \
+  filter = _mm_and_si128(filter, mask);  /* & mask */                          \
+  filter = _mm_unpacklo_epi64(filter, filter);                                 \
+                                                                               \
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */                          \
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */                          \
+  filter2filter1 = _mm_adds_epi8(filter, t3t4);  /* signed_char_clamp */       \
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);                  \
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);          \
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);  /* >> 3 */             \
+  filter = _mm_srai_epi16(filter, 11);  /* >> 3 */                             \
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                    \
+                                                                               \
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                        \
+  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */                       \
+  filter = _mm_unpacklo_epi8(filter, filter);                                  \
+  filter = _mm_srai_epi16(filter, 9);  /* round */                             \
+  filter = _mm_packs_epi16(filter, filter);                                    \
+  filter = _mm_andnot_si128(hev, filter);                                      \
+                                                                               \
+  hev = _mm_unpackhi_epi64(filter2filter1, filter);                            \
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);                 \
+                                                                               \
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */      \
+  qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                              \
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */      \
+  ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                         \
+  qs1qs0 = _mm_xor_si128(qs1qs0, t80);  /* ^ 0x80 */                           \
+  ps1ps0 = _mm_xor_si128(ps1ps0, t80);  /* ^ 0x80 */                           \
+} while (0)
+
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+                               const uint8_t *_blimit, const uint8_t *_limit,
+                               const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);  // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);  // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+                             const uint8_t *_blimit, const uint8_t *_limit,
+                             const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i x0, x1, x2, x3;
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+
+  // Transpose 8x8
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+  x0 = _mm_unpacklo_epi16(x2, x3);
+  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
+
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+  x2 = _mm_unpackhi_epi16(x2, x3);
+  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
+  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
+  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
+  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+
+  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+}
+
+void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
+                                    const unsigned char *_blimit,
+                                    const unsigned char *_limit,
+                                    const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -383,11 +588,10 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
-                                             int p,
-                                             const unsigned char *_blimit,
-                                             const unsigned char *_limit,
-                                             const unsigned char *_thresh) {
+void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
+                                     const unsigned char *_blimit,
+                                     const unsigned char *_limit,
+                                     const unsigned char *_thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
@@ -716,21 +920,10 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
   }
 }
 
-// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh, int count) {
-  if (count == 1)
-    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
-  else
-    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
-}
-
 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
-                               const unsigned char *_thresh, int count) {
+                               const unsigned char *_thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -745,8 +938,6 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
-  (void)count;
-
   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
@@ -1492,11 +1683,10 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
 void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
-                             const unsigned char *thresh, int count) {
+                             const unsigned char *thresh) {
   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
   unsigned char *src[1];
   unsigned char *dst[1];
-  (void)count;
 
   // Transpose 8x8
   src[0] = s - 4;
@@ -1505,7 +1695,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   transpose(src, p, dst, 8, 1);
 
   // Loop filtering
-  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
+  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   dst[0] = s - 4;
@@ -1557,7 +1747,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   transpose(src, p, dst, 8, 2);
 
   // Loop filtering
-  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
+  vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
 
   src[0] = t_dst;
   src[1] = t_dst + 8 * 8;
@@ -1578,8 +1768,7 @@ void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
 
   // Loop filtering
-  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
-                                   thresh);
+  vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
 
   // Transpose back
   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index a2f0ae7..3f6e55c 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -20,33 +20,41 @@ SECTION .text
   movd                  m4, [ref2q+%3]
   movd                  m7, [ref3q+%3]
   movd                  m5, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m6, [ref1q+%5]
-  punpckldq             m4, [ref2q+%5]
-  punpckldq             m7, [ref3q+%5]
-  punpckldq             m5, [ref4q+%5]
+  movd                  m1, [srcq +%4]
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1
+  punpckldq             m6, m2
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0
+  movlhps               m6, m4
+  movlhps               m7, m5
   psadbw                m6, m0
-  psadbw                m4, m0
   psadbw                m7, m0
-  psadbw                m5, m0
-  punpckldq             m6, m4
-  punpckldq             m7, m5
 %else
   movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
   movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
   movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
   movd                  m4, [ref4q+%3]
-  punpckldq             m0, [srcq +%4]
-  punpckldq             m1, [ref1q+%5]
-  punpckldq             m2, [ref2q+%5]
-  punpckldq             m3, [ref3q+%5]
-  punpckldq             m4, [ref4q+%5]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq +%4]
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2
+  movlhps               m3, m4
   psadbw                m1, m0
-  psadbw                m2, m0
   psadbw                m3, m0
-  psadbw                m4, m0
-  punpckldq             m1, m2
-  punpckldq             m3, m4
   paddd                 m6, m1
   paddd                 m7, m3
 %endif
@@ -170,7 +178,7 @@ SECTION .text
 ; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
 %macro SADNXN4D 2
 %if UNIX64
 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
@@ -192,7 +200,7 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
 %endrep
   PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
 
-%if mmsize == 16
+%if %1 > 4
   pslldq                m5, 4
   pslldq                m7, 4
   por                   m4, m5
@@ -207,8 +215,10 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
   RET
 %else
   movifnidn             r4, r4mp
-  movq               [r4+0], m6
-  movq               [r4+8], m7
+  pshufd            m6, m6, 0x08
+  pshufd            m7, m7, 0x08
+  movq              [r4+0], m6
+  movq              [r4+8], m7
   RET
 %endif
 %endmacro
@@ -225,7 +235,5 @@ SADNXN4D 16,  8
 SADNXN4D  8, 16
 SADNXN4D  8,  8
 SADNXN4D  8,  4
-
-INIT_MMX sse
 SADNXN4D  4,  8
 SADNXN4D  4,  4
diff --git a/vpx_dsp/x86/sad_mmx.asm b/vpx_dsp/x86/sad_mmx.asm
deleted file mode 100644
index 9968992..0000000
--- a/vpx_dsp/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vpx_sad16x16_mmx) PRIVATE
-global sym(vpx_sad8x16_mmx) PRIVATE
-global sym(vpx_sad8x8_mmx) PRIVATE
-global sym(vpx_sad4x4_mmx) PRIVATE
-global sym(vpx_sad16x8_mmx) PRIVATE
-
-;unsigned int vpx_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index 0defe1b..1ec906c 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -17,7 +17,7 @@ SECTION .text
 %if %3 == 5
 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %else ; avg
@@ -25,7 +25,7 @@ cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -222,8 +222,8 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2
 SAD8XN  8, 1 ; sad8x8_avg_sse2
 SAD8XN  4, 1 ; sad8x4_avg_sse2
 
-; unsigned int vpx_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
+; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+;                                   uint8_t *ref, int ref_stride);
 %macro SAD4XN 1-2 0
   SAD_FN 4, %1, 7, %2
   mov              n_rowsd, %1/4
@@ -236,31 +236,32 @@ SAD8XN  4, 1 ; sad8x4_avg_sse2
   movd                  m4, [refq+ref_stride3q]
   punpckldq             m1, m2
   punpckldq             m3, m4
+  movlhps               m1, m3
 %if %2 == 1
   pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m3, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
+  lea         second_predq, [second_predq+mmsize*1]
 %endif
   movd                  m2, [srcq]
   movd                  m5, [srcq+src_strideq]
   movd                  m4, [srcq+src_strideq*2]
-  movd                  m6, [srcq+src_stride3q]
+  movd                  m3, [srcq+src_stride3q]
   punpckldq             m2, m5
-  punpckldq             m4, m6
+  punpckldq             m4, m3
+  movlhps               m2, m4
   psadbw                m1, m2
-  psadbw                m3, m4
   lea                 refq, [refq+ref_strideq*4]
   paddd                 m0, m1
   lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
   dec              n_rowsd
   jg .loop
 
+  movhlps               m1, m0
+  paddd                 m0, m1
   movd                 eax, m0
   RET
 %endmacro
 
-INIT_MMX sse
+INIT_XMM sse2
 SAD4XN  8 ; sad4x8_sse
 SAD4XN  4 ; sad4x4_sse
 SAD4XN  8, 1 ; sad4x8_avg_sse
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 05dcff7..cee4468 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -57,8 +57,8 @@ SECTION .text
   paddd                %6, %1
 %endmacro
 
-%macro STORE_AND_RET 0
-%if mmsize == 16
+%macro STORE_AND_RET 1
+%if %1 > 4
   ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
   ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
   ; We have to sign-extend it before adding the words within the register
@@ -78,16 +78,16 @@ SECTION .text
   movd               [r1], m7           ; store sse
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
-%else ; mmsize == 8
-  pshufw               m4, m6, 0xe
-  pshufw               m3, m7, 0xe
+%else ; 4xh
+  pshuflw              m4, m6, 0xe
+  pshuflw              m3, m7, 0xe
   paddw                m6, m4
   paddd                m7, m3
   pcmpgtw              m5, m6           ; mask for 0 > x
   mov                  r1, ssem         ; r1 = unsigned int *sse
   punpcklwd            m6, m5           ; sign-extend m6 word->dword
   movd               [r1], m7           ; store sse
-  pshufw               m4, m6, 0xe
+  pshuflw              m4, m6, 0xe
   paddd                m6, m4
   movd               raxd, m6           ; store sum as return value
 %endif
@@ -139,8 +139,10 @@ SECTION .text
       %define sec_str sec_stridemp
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -156,8 +158,10 @@ SECTION .text
       %define block_height heightd
 
       ;Store bilin_filter and pw_8 location in stack
-      GET_GOT eax
-      add esp, 4                ; restore esp
+      %if GET_GOT_DEFINED == 1
+        GET_GOT eax
+        add esp, 4                ; restore esp
+      %endif
 
       lea ecx, [GLOBAL(bilin_filter_m)]
       mov g_bilin_filterm, ecx
@@ -192,6 +196,12 @@ SECTION .text
   %endif
 %endif
 
+%if %1 == 4
+  %define movx movd
+%else
+  %define movx movh
+%endif
+
   ASSERT               %1 <= 16         ; m6 overflows if w > 16
   pxor                 m6, m6           ; sum
   pxor                 m7, m7           ; sse
@@ -224,6 +234,7 @@ SECTION .text
 %endif
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+
 %if %2 == 0 ; !avg
   punpckhbw            m3, m1, m5
   punpcklbw            m1, m5
@@ -233,24 +244,37 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
+  movx                 m0, [srcq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
 %endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
+  movx                 m2, [srcq+src_strideq]
 %endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+
 %if %2 == 1 ; avg
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -267,10 +291,10 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_zero_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_zero_y_nonhalf
 
   ; x_offset == 0 && y_offset == 0.5
@@ -292,37 +316,41 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq*2]
-%else ; mmsize == 8
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq*2]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq*2]
   punpckldq            m2, m1
-%else
-  punpckldq            m2, [srcq+src_strideq*2]
-%endif
 %endif
-  movh                 m1, [dstq]
-%if mmsize == 16
+  movx                 m1, [dstq]
+%if %1 > 4
   movlhps              m0, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
 %endif
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m2
   punpcklbw            m1, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m3, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m4, [secq]
+  pavgb                m0, m4
+  punpcklbw            m3, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m1, [dstq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -339,7 +367,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_zero_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_zero_y_nonhalf:
   ; x_offset == 0 && y_offset == bilin interpolation
@@ -347,7 +375,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -420,12 +448,12 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq*2]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq*2]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -445,17 +473,27 @@ SECTION .text
   pmullw               m4, filter_y_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -471,10 +509,10 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonzero:
-  cmp           x_offsetd, 8
+  cmp           x_offsetd, 4
   jne .x_nonhalf
   ; x_offset == 0.5
   test          y_offsetd, y_offsetd
@@ -499,30 +537,40 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m4, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m4, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m0, [srcq+src_strideq]
   movhps               m4, [srcq+src_strideq+1]
-%else ; mmsize == 8
-  punpckldq            m0, [srcq+src_strideq]
-  punpckldq            m4, [srcq+src_strideq+1]
-%endif
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+%else ; 4xh
+  movx                 m1, [srcq+src_strideq]
+  punpckldq            m0, m1
+  movx                 m2, [srcq+src_strideq+1]
+  punpckldq            m4, m2
+%endif
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m0, m4
   punpcklbw            m3, m5
+%if %1 > 4
   pavgb                m0, [secq]
   punpcklbw            m1, m5
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else ; 4xh
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m1, m5
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m2, [srcq+src_strideq]
-  movh                 m1, [dstq]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m1, [dstq]
   pavgb                m0, m4
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
   pavgb                m2, m4
   punpcklbw            m0, m5
   punpcklbw            m2, m5
@@ -539,10 +587,10 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_half_y_zero_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_half_y_nonhalf
 
   ; x_offset == 0.5 && y_offset == 0.5
@@ -574,53 +622,58 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 .x_half_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m3, [srcq+1]
 %if %2 == 1 ; avg
-%if mmsize == 16
+%if %1 > 4
   movhps               m2, [srcq+src_strideq]
   movhps               m3, [srcq+src_strideq+1]
 %else
-%if %1 == 4
-  movh                 m1, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq]
   punpckldq            m2, m1
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m1, [srcq+src_strideq+1]
   punpckldq            m3, m1
-%else
-  punpckldq            m2, [srcq+src_strideq]
-  punpckldq            m3, [srcq+src_strideq+1]
-%endif
 %endif
   pavgb                m2, m3
-%if mmsize == 16
+%if %1 > 4
   movlhps              m0, m2
   movhlps              m4, m2
-%else ; mmsize == 8
+%else ; 4xh
   punpckldq            m0, m2
-  pshufw               m4, m2, 0xe
+  pshuflw              m4, m2, 0xe
 %endif
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   pavgb                m0, m2
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
+%if %1 > 4
   pavgb                m0, [secq]
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+%endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
+%if %1 > 4
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %else ; !avg
-  movh                 m4, [srcq+src_strideq]
-  movh                 m1, [srcq+src_strideq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m1, [srcq+src_strideq+1]
   pavgb                m2, m3
   pavgb                m4, m1
   pavgb                m0, m2
   pavgb                m2, m4
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   punpcklbw            m0, m5
   punpcklbw            m2, m5
   punpcklbw            m3, m5
@@ -637,7 +690,7 @@ SECTION .text
 %endif
   dec                   block_height
   jg .x_half_y_half_loop
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_half_y_nonhalf:
   ; x_offset == 0.5 && y_offset == bilin interpolation
@@ -645,7 +698,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+y_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+y_offsetq+16]
@@ -720,23 +773,23 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m3, [srcq+1]
   add                srcq, src_strideq
   pavgb                m0, m3
 %if notcpuflag(ssse3)
   punpcklbw            m0, m5
 %endif
 .x_half_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
   pavgb                m2, m1
   pavgb                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m0, m2
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_y_a
@@ -756,16 +809,26 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -782,7 +845,7 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf:
   test          y_offsetd, y_offsetd
@@ -793,7 +856,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -861,14 +924,14 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m2, [srcq+src_strideq]
-  movh                 m4, [srcq+src_strideq+1]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m2, [srcq+src_strideq]
+  movx                 m4, [srcq+src_strideq+1]
+  movx                 m3, [dstq+dst_strideq]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   punpcklbw            m2, m4
   pmaddubsw            m0, filter_x_a
   pmaddubsw            m2, filter_x_a
@@ -888,17 +951,27 @@ SECTION .text
   pmullw               m4, filter_x_b
   paddw                m0, m1
   paddw                m2, filter_rnd
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m2, m4
 %endif
   psraw                m0, 4
   psraw                m2, 4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m1, m5
   SUM_SSE              m0, m1, m2, m3, m6, m7
@@ -914,10 +987,10 @@ SECTION .text
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 8
+  cmp           y_offsetd, 4
   jne .x_nonhalf_y_nonhalf
 
   ; x_offset == bilin interpolation && y_offset == 0.5
@@ -925,7 +998,7 @@ SECTION .text
   lea        bilin_filter, [bilin_filter_m]
 %endif
   shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1033,8 +1106,8 @@ SECTION .text
   add                srcq, src_strideq
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1050,17 +1123,17 @@ SECTION .text
   add                srcq, src_strideq
   psraw                m0, 4
 .x_other_y_half_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
-  movh                 m4, [srcq+src_strideq]
-  movh                 m3, [srcq+src_strideq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
+  movx                 m4, [srcq+src_strideq]
+  movx                 m3, [srcq+src_strideq+1]
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m1, [dstq]
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
 %else
@@ -1075,9 +1148,9 @@ SECTION .text
   pmullw               m3, filter_x_b
   paddw                m4, filter_rnd
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   paddw                m4, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
 %endif
   psraw                m2, 4
   psraw                m4, 4
@@ -1085,10 +1158,20 @@ SECTION .text
   pavgw                m2, m4
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   punpcklbw            m3, m5
   punpcklbw            m1, m5
@@ -1106,7 +1189,7 @@ SECTION .text
 %undef filter_x_a
 %undef filter_x_b
 %undef filter_rnd
-  STORE_AND_RET
+  STORE_AND_RET %1
 
 .x_nonhalf_y_nonhalf:
 %ifdef PIC
@@ -1114,7 +1197,7 @@ SECTION .text
 %endif
   shl           x_offsetd, filter_idx_shift
   shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
+%if ARCH_X86_64 && %1 > 4
   mova                 m8, [bilin_filter+x_offsetq]
 %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
   mova                 m9, [bilin_filter+x_offsetq+16]
@@ -1257,8 +1340,8 @@ SECTION .text
   INC_SRC_BY_SRC_STRIDE
   add                dstq, dst_strideq
 %else ; %1 < 16
-  movh                 m0, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m0, [srcq]
+  movx                 m1, [srcq+1]
 %if cpuflag(ssse3)
   punpcklbw            m0, m1
   pmaddubsw            m0, filter_x_a
@@ -1279,20 +1362,20 @@ SECTION .text
   INC_SRC_BY_SRC_STRIDE
 
 .x_other_y_other_loop:
-  movh                 m2, [srcq]
-  movh                 m1, [srcq+1]
+  movx                 m2, [srcq]
+  movx                 m1, [srcq+1]
 
   INC_SRC_BY_SRC_STRIDE
-  movh                 m4, [srcq]
-  movh                 m3, [srcq+1]
+  movx                 m4, [srcq]
+  movx                 m3, [srcq+1]
 
 %if cpuflag(ssse3)
   punpcklbw            m2, m1
   punpcklbw            m4, m3
   pmaddubsw            m2, filter_x_a
   pmaddubsw            m4, filter_x_a
-  movh                 m3, [dstq+dst_strideq]
-  movh                 m1, [dstq]
+  movx                 m3, [dstq+dst_strideq]
+  movx                 m1, [dstq]
   paddw                m2, filter_rnd
   paddw                m4, filter_rnd
   psraw                m2, 4
@@ -1331,9 +1414,9 @@ SECTION .text
   pmullw               m1, m4, filter_y_b
   paddw                m2, filter_rnd
   paddw                m0, m3
-  movh                 m3, [dstq+dst_strideq]
+  movx                 m3, [dstq+dst_strideq]
   paddw                m2, m1
-  movh                 m1, [dstq]
+  movx                 m1, [dstq]
   psraw                m0, 4
   psraw                m2, 4
   punpcklbw            m3, m5
@@ -1341,10 +1424,20 @@ SECTION .text
 %endif
 %if %2 == 1 ; avg
   ; FIXME(rbultje) pipeline
+%if %1 == 4
+  movlhps              m0, m2
+%endif
   packuswb             m0, m2
+%if %1 > 4
   pavgb                m0, [secq]
   punpckhbw            m2, m0, m5
   punpcklbw            m0, m5
+%else
+  movh                 m2, [secq]
+  pavgb                m0, m2
+  punpcklbw            m0, m5
+  movhlps              m2, m0
+%endif
 %endif
   SUM_SSE              m0, m1, m2, m3, m6, m7
   mova                 m0, m4
@@ -1362,7 +1455,8 @@ SECTION .text
 %undef filter_y_a
 %undef filter_y_b
 %undef filter_rnd
-  STORE_AND_RET
+%undef movx
+  STORE_AND_RET %1
 %endmacro
 
 ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
@@ -1371,26 +1465,22 @@ SECTION .text
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4
 INIT_XMM sse2
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_MMX sse
-SUBPEL_VARIANCE  4, 1
 INIT_XMM sse2
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
 
-INIT_MMX ssse3
-SUBPEL_VARIANCE  4, 1
 INIT_XMM ssse3
+SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
 SUBPEL_VARIANCE 16, 1
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 7851a98..f8c9711 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -45,7 +45,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
   int sum;
   variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                 sse, &sum, vpx_get16x16var_avx2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
deleted file mode 100644
index b8ba79b..0000000
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ /dev/null
@@ -1,744 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift            7
-
-;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
-global sym(vpx_get_mb_ss_mmx) PRIVATE
-sym(vpx_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get8x8var_mmx) PRIVATE
-sym(vpx_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void
-;vpx_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get4x4var_mmx) PRIVATE
-sym(vpx_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
-sym(vpx_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
deleted file mode 100644
index f04f4e2..0000000
--- a/vpx_dsp/x86/variance_mmx.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-
-DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
-  { 128, 128, 128, 128,   0,   0,   0,   0 },
-  { 112, 112, 112, 112,  16,  16,  16,  16 },
-  {  96,  96,  96,  96,  32,  32,  32,  32 },
-  {  80,  80,  80,  80,  48,  48,  48,  48 },
-  {  64,  64,  64,  64,  64,  64,  64,  64 },
-  {  48,  48,  48,  48,  80,  80,  80,  80 },
-  {  32,  32,  32,  32,  96,  96,  96,  96 },
-  {  16,  16,  16,  16, 112, 112, 112, 112 }
-};
-
-extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              unsigned int *sse, int *sum);
-
-extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
-                                              int ref_pixels_per_line,
-                                              const unsigned char *src_ptr,
-                                              int src_pixels_per_line,
-                                              const int16_t *HFilter,
-                                              const int16_t *VFilter,
-                                              int *sum,
-                                              unsigned int *sumsquared);
-
-extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
-                                           int ref_pixels_per_line,
-                                           const unsigned char *src_ptr,
-                                           int src_pixels_per_line,
-                                           unsigned int Height,
-                                           const int16_t *HFilter,
-                                           const int16_t *VFilter,
-                                           int *sum,
-                                           unsigned int *sumsquared);
-
-
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
-                              const unsigned char *b, int b_stride,
-                              unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    *sse = var;
-    return var;
-}
-
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
-                                   const unsigned char *b, int b_stride,
-                                   unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
-                                      bilinear_filters_mmx[xoffset],
-                                      bilinear_filters_mmx[yoffset],
-                                      &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
-                                       int xoffset, int yoffset,
-                                       const uint8_t *b, int b_stride,
-                                       uint32_t *sse) {
-    int xsum;
-    uint32_t xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 6));
-}
-
-uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
-                                         int xoffset, int yoffset,
-                                         const uint8_t *b, int b_stride,
-                                         uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
-}
-
-uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum0, &xxsum0);
-
-    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
-}
-
-uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
-                                        int xoffset, int yoffset,
-                                        const uint8_t *b, int b_stride,
-                                        uint32_t *sse) {
-    int xsum;
-    unsigned int xxsum;
-    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
-                                   bilinear_filters_mmx[xoffset],
-                                   bilinear_filters_mmx[yoffset],
-                                   &xsum, &xxsum);
-    *sse = xxsum;
-    return (xxsum - (((uint32_t)xsum * xsum) >> 7));
-}
-
-uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
-                                            const uint8_t *b, int b_stride,
-                                            uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
-}
-
-uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
-                                             const uint8_t *b, int b_stride,
-                                             uint32_t *sse) {
-  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
-}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index e6c9365..6987c2e 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -171,7 +171,7 @@ unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                   unsigned int *sse) {
   int sum;
   get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
+  return *sse - ((sum * sum) >> 4);
 }
 
 unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
@@ -180,7 +180,7 @@ unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
@@ -189,7 +189,7 @@ unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                 sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
+  return *sse - ((sum * sum) >> 5);
 }
 
 unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
@@ -197,7 +197,7 @@ unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                   unsigned int *sse) {
   int sum;
   vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
+  return *sse - ((sum * sum) >> 6);
 }
 
 unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
@@ -206,7 +206,7 @@ unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
@@ -215,7 +215,7 @@ unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
   int sum;
   variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                 sse, &sum, vpx_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
+  return *sse - ((sum * sum) >> 7);
 }
 
 unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
@@ -223,7 +223,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                     unsigned int *sse) {
   int sum;
   vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 8);
+  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
 }
 
 unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -320,16 +320,16 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                                           int height, unsigned int *sse, \
                                           void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-  DECL(4, opt2); \
+  DECL(4, opt1); \
   DECL(8, opt1); \
   DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECLS
 #undef DECL
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                      int src_stride, \
                                                      int x_offset, \
@@ -365,25 +365,25 @@ unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
     } \
   } \
   *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
-
-FNS(sse2, sse);
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (int32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (int32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (int32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (int32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt1, (int32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (int32_t), (int32_t))
+
+FNS(sse2, sse2);
 FNS(ssse3, ssse3);
 
 #undef FNS
@@ -401,16 +401,16 @@ int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                             int height, unsigned int *sse, \
                                             void *unused0, void *unused)
 #define DECLS(opt1, opt2) \
-DECL(4, opt2); \
+DECL(4, opt1); \
 DECL(8, opt1); \
 DECL(16, opt1)
 
-DECLS(sse2, sse);
+DECLS(sse2, sse2);
 DECLS(ssse3, ssse3);
 #undef DECL
 #undef DECLS
 
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
 unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                          int src_stride, \
                                                          int x_offset, \
@@ -451,23 +451,23 @@ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
     } \
   } \
   *sseptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+  return sse - (cast_prod (cast se * se) >> (wlog2 + hlog2)); \
 }
 
 #define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
-FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
-FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
-FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
-FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
-FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
-FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t), (int32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t), (int32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t), (int32_t)); \
+FN(4,   8,  4, 2, 3, opt1, (uint32_t), (int32_t)); \
+FN(4,   4,  4, 2, 2, opt1, (uint32_t), (int32_t))
 
 FNS(sse2, sse);
 FNS(ssse3, ssse3);
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 9c5b414..abc0270 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -13,15 +13,21 @@
 SECTION .text
 
 %macro convolve_fn 1-2
-INIT_XMM sse2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                                 fx, fxs, fy, fys, w, h, bd
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                              dst, dst_stride, \
+                                              fx, fxs, fy, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                              fx, fxs, fy, fys, w, h
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                           dst, dst_stride, \
+                                           fx, fxs, fy, fys, w, h
 %endif
   mov r4d, dword wm
 %ifidn %2, highbd
@@ -152,27 +158,30 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   jnz .loop16
   RET
 
-INIT_MMX sse
 .w8:
   mov                    r4d, dword hm
   lea                    r5q, [src_strideq*3]
   lea                    r6q, [dst_strideq*3]
 .loop8:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
+  movh                    m0, [srcq]
+  movh                    m1, [srcq+src_strideq]
+  movh                    m2, [srcq+src_strideq*2]
+  movh                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
+  movh                    m4, [dstq]
+  movh                    m5, [dstq+dst_strideq]
+  movh                    m6, [dstq+dst_strideq*2]
+  movh                    m7, [dstq+r6q]
+  pavg                    m0, m4
+  pavg                    m1, m5
+  pavg                    m2, m6
+  pavg                    m3, m7
 %endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
+  movh  [dstq              ], m0
+  movh  [dstq+dst_strideq  ], m1
+  movh  [dstq+dst_strideq*2], m2
+  movh  [dstq+r6q          ], m3
   lea                   dstq, [dstq+dst_strideq*4]
   sub                    r4d, 4
   jnz .loop8
@@ -184,25 +193,25 @@ INIT_MMX sse
   lea                    r5q, [src_strideq*3]
   lea                    r6q, [dst_strideq*3]
 .loop4:
-  movh                    m0, [srcq]
-  movh                    m1, [srcq+src_strideq]
-  movh                    m2, [srcq+src_strideq*2]
-  movh                    m3, [srcq+r5q]
+  movd                    m0, [srcq]
+  movd                    m1, [srcq+src_strideq]
+  movd                    m2, [srcq+src_strideq*2]
+  movd                    m3, [srcq+r5q]
   lea                   srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh                    m4, [dstq]
-  movh                    m5, [dstq+dst_strideq]
-  movh                    m6, [dstq+dst_strideq*2]
-  movh                    m7, [dstq+r6q]
+  movd                    m4, [dstq]
+  movd                    m5, [dstq+dst_strideq]
+  movd                    m6, [dstq+dst_strideq*2]
+  movd                    m7, [dstq+r6q]
   pavg                    m0, m4
   pavg                    m1, m5
   pavg                    m2, m6
   pavg                    m3, m7
 %endif
-  movh  [dstq              ], m0
-  movh  [dstq+dst_strideq  ], m1
-  movh  [dstq+dst_strideq*2], m2
-  movh  [dstq+r6q          ], m3
+  movd  [dstq              ], m0
+  movd  [dstq+dst_strideq  ], m1
+  movd  [dstq+dst_strideq*2], m2
+  movd  [dstq+r6q          ], m3
   lea                   dstq, [dstq+dst_strideq*4]
   sub                    r4d, 4
   jnz .loop4
@@ -210,6 +219,7 @@ INIT_MMX sse
 %endif
 %endmacro
 
+INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
 %if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 3fbaa27..d2cb8ea 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -16,6 +16,11 @@ pw_64:    times 8 dw 64
 ; %define USE_PMULHRSW
 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
 ; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to prevent outranges.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
 
 SECTION .text
 %if ARCH_X86_64
@@ -77,17 +82,12 @@ SECTION .text
 
     pmaddubsw %2, k0k1k4k5
     pmaddubsw m3, k2k3k6k7
-
-    mova      m4, %2
-    mova      m5, m3
-    psrldq    %2, 8
-    psrldq    m3, 8
-    mova      m6, m5
-
-    paddsw    m4, m3
-    pmaxsw    m5, %2
-    pminsw    %2, m6
+    mova      m4, %2        ;k0k1
+    mova      m5, m3        ;k2k3
+    psrldq    %2, 8         ;k4k5
+    psrldq    m3, 8         ;k6k7
     paddsw    %2, m4
+    paddsw    m5, m3
     paddsw    %2, m5
     paddsw    %2, krd
     psraw     %2, 7
@@ -157,27 +157,20 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
     pmaddubsw           m7, k0k1k4k5
     palignr             m3, m2,  5
     pmaddubsw           m3, k2k3k6k7
-    mova                m0, m4
-    mova                m5, m1
-    mova                m2, m7
-    psrldq              m4, 8
-    psrldq              m1, 8
-    mova                m6, m5
-    paddsw              m0, m1
-    mova                m1, m3
-    psrldq              m7, 8
-    psrldq              m3, 8
-    paddsw              m2, m3
-    mova                m3, m1
-    pmaxsw              m5, m4
-    pminsw              m4, m6
+    mova                m0, m4                  ;k0k1
+    mova                m5, m1                  ;k2k3
+    mova                m2, m7                  ;k0k1 upper
+    psrldq              m4, 8                   ;k4k5
+    psrldq              m1, 8                   ;k6k7
     paddsw              m4, m0
-    paddsw              m4, m5
-    pmaxsw              m1, m7
-    pminsw              m7, m3
+    paddsw              m5, m1
+    mova                m1, m3                  ;k2k3 upper
+    psrldq              m7, 8                   ;k4k5 upper
+    psrldq              m3, 8                   ;k6k7 upper
     paddsw              m7, m2
+    paddsw              m4, m5
+    paddsw              m1, m3
     paddsw              m7, m1
-
     paddsw              m4, krd
     psraw               m4, 7
     packuswb            m4, m4
@@ -240,16 +233,13 @@ cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
     pmaddubsw   %3, k2k3
     pmaddubsw   %4, k4k5
     pmaddubsw   %5, k6k7
-
+    paddsw      %2, %4
+    paddsw      %5, %3
     paddsw      %2, %5
-    mova        %1, %3
-    pminsw      %3, %4
-    pmaxsw      %1, %4
-    paddsw      %2, %3
-    paddsw      %1, %2
-    paddsw      %1, krd
-    psraw       %1, 7
-    packuswb    %1, %1
+    paddsw      %2, krd
+    psraw       %2, 7
+    packuswb    %2, %2
+    SWAP        %1, %2
 %endm
 
 ;-------------------------------------------------------------------------------
@@ -293,39 +283,33 @@ cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
     pmaddubsw            m3, k4k5
 
     palignr              m7, m4, 13
-    paddsw               m1, m5
-    mova                 m5, m6
-    mova                 m0, m2
-    palignr              m5, m4, 5
-    pminsw               m2, m3
+    mova                 m0, m6
+    palignr              m0, m4, 5
     pmaddubsw            m7, k6k7
-    pmaxsw               m3, m0
+    paddsw               m1, m3
+    paddsw               m2, m5
     paddsw               m1, m2
-    mova                 m0, m6
+    mova                 m5, m6
     palignr              m6, m4, 1
-    pmaddubsw            m5, k2k3
-    paddsw               m1, m3
+    pmaddubsw            m0, k2k3
     pmaddubsw            m6, k0k1
-    palignr              m0, m4, 9
+    palignr              m5, m4, 9
     paddsw               m1, krd
-    pmaddubsw            m0, k4k5
-    mova                 m4, m5
+    pmaddubsw            m5, k4k5
     psraw                m1, 7
-    pminsw               m5, m0
-    paddsw               m6, m7
+    paddsw               m0, m7
+%ifidn %1, h8_avg
+    movh                 m7, [dstq]
+    movh                 m2, [dstq + dstrideq]
+%endif
     packuswb             m1, m1
-
     paddsw               m6, m5
-    pmaxsw               m0, m4
     paddsw               m6, m0
     paddsw               m6, krd
     psraw                m6, 7
     packuswb             m6, m6
-
 %ifidn %1, h8_avg
-    movh                 m0, [dstq]
-    movh                 m2, [dstq + dstrideq]
-    pavgb                m1, m0
+    pavgb                m1, m7
     pavgb                m6, m2
 %endif
     movh             [dstq], m1
@@ -388,7 +372,7 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
     pmaddubsw     m1, k2k3
     palignr       m2, m7, 9
     pmaddubsw     m2, k4k5
-    paddsw        m0, m3
+    paddsw        m1, m3
     mova          m3, m4
     punpckhbw     m4, m4
     mova          m5, m4
@@ -403,17 +387,13 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
     pmaddubsw     m6, k4k5
     palignr       m7, m3, 13
     pmaddubsw     m7, k6k7
-
-    mova          m3, m1
-    pmaxsw        m1, m2
-    pminsw        m2, m3
     paddsw        m0, m2
     paddsw        m0, m1
-    paddsw        m4, m7
-    mova          m7, m5
-    pmaxsw        m5, m6
-    pminsw        m6, m7
+%ifidn %1, h8_avg
+    mova          m1, [dstq]
+%endif
     paddsw        m4, m6
+    paddsw        m5, m7
     paddsw        m4, m5
     paddsw        m0, krd
     paddsw        m4, krd
@@ -421,7 +401,6 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
     psraw         m4, 7
     packuswb      m0, m4
 %ifidn %1, h8_avg
-    mova          m1, [dstq]
     pavgb         m0, m1
 %endif
     lea         srcq, [srcq + sstrideq]
@@ -488,27 +467,21 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
     movx         m7, [src1q + sstride6q   ]     ;H
     punpcklbw    m6, m7                         ;G H
     pmaddubsw    m6, k6k7
-    mova        tmp, m2
     pmaddubsw    m3, k2k3
     pmaddubsw    m1, k0k1
-    pmaxsw       m2, m4
-    paddsw       m0, m6
+    paddsw       m0, m4
+    paddsw       m2, m6
     movx         m6, [srcq + sstrideq * 8 ]     ;H next iter
     punpcklbw    m7, m6
     pmaddubsw    m7, k6k7
-    pminsw       m4, tmp
-    paddsw       m0, m4
-    mova         m4, m3
     paddsw       m0, m2
-    pminsw       m3, m5
-    pmaxsw       m5, m4
     paddsw       m0, krd
     psraw        m0, 7
-    paddsw       m1, m7
+    paddsw       m1, m5
     packuswb     m0, m0
 
+    paddsw       m3, m7
     paddsw       m1, m3
-    paddsw       m1, m5
     paddsw       m1, krd
     psraw        m1, 7
     lea        srcq, [srcq + sstrideq * 2 ]
@@ -538,11 +511,11 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
     movx         m1, [srcq + sstrideq     ]     ;B
     movx         m6, [srcq + sstride6q    ]     ;G
     punpcklbw    m0, m1                         ;A B
-    movx         m7, [rax + sstride6q     ]     ;H
+    movx         m7, [src1q + sstride6q   ]     ;H
     pmaddubsw    m0, k0k1
     movx         m2, [srcq + sstrideq * 2 ]     ;C
     punpcklbw    m6, m7                         ;G H
-    movx         m3, [rax + sstrideq * 2  ]     ;D
+    movx         m3, [src1q + sstrideq * 2]     ;D
     pmaddubsw    m6, k6k7
     movx         m4, [srcq + sstrideq * 4 ]     ;E
     punpcklbw    m2, m3                         ;C D
@@ -550,10 +523,7 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
     punpcklbw    m4, m5                         ;E F
     pmaddubsw    m2, k2k3
     pmaddubsw    m4, k4k5
-    paddsw       m0, m6
-    mova         m1, m2
-    pmaxsw       m2, m4
-    pminsw       m4, m1
+    paddsw       m2, m6
     paddsw       m0, m4
     paddsw       m0, m2
     paddsw       m0, krd
@@ -572,7 +542,6 @@ cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
 %macro SUBPIX_VFILTER16 1
 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
                              src, sstride, dst, dstride, height, filter
-
     mova          m4, [filterq]
     SETUP_LOCAL_VARS
 %if ARCH_X86_64
@@ -611,12 +580,9 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
     punpcklbw     m3, m5                         ;A B
     movh          m7, [srcq + sstrideq * 2 + 8]  ;C
     pmaddubsw     m6, k6k7
-    mova          m1, m2
     movh          m5, [src1q + sstrideq * 2 + 8] ;D
-    pmaxsw        m2, m4
     punpcklbw     m7, m5                         ;C D
-    pminsw        m4, m1
-    paddsw        m0, m6
+    paddsw        m2, m6
     pmaddubsw     m3, k0k1
     movh          m1, [srcq + sstrideq * 4 + 8]  ;E
     paddsw        m0, m4
@@ -630,30 +596,24 @@ cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
     movh          m5, [src1q + sstride6q + 8]    ;H
     psraw         m0, 7
     punpcklbw     m2, m5                         ;G H
-    packuswb      m0, m0
     pmaddubsw     m2, k6k7
 %ifidn %1, v8_avg
-    movh          m4, [dstq]
-    pavgb         m0, m4
+    mova          m4, [dstq]
 %endif
     movh      [dstq], m0
-    mova          m6, m7
-    pmaxsw        m7, m1
-    pminsw        m1, m6
-    paddsw        m3, m2
+    paddsw        m7, m2
     paddsw        m3, m1
     paddsw        m3, m7
     paddsw        m3, krd
     psraw         m3, 7
-    packuswb      m3, m3
+    packuswb      m0, m3
 
     add         srcq, sstrideq
     add        src1q, sstrideq
 %ifidn %1, v8_avg
-    movh          m1, [dstq + 8]
-    pavgb         m3, m1
+    pavgb         m0, m4
 %endif
-    movh  [dstq + 8], m3
+    mova      [dstq], m0
     add         dstq, dst_stride
     dec      heightd
     jnz        .loop
diff --git a/vpx_mem/vpx_mem.c b/vpx_mem/vpx_mem.c
index b98fe83..b261fc0 100644
--- a/vpx_mem/vpx_mem.c
+++ b/vpx_mem/vpx_mem.c
@@ -9,8 +9,6 @@
  */
 
 
-#define __VPX_MEM_C__
-
 #include "vpx_mem.h"
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h
index d4a3d77..620df31 100644
--- a/vpx_ports/mem_ops.h
+++ b/vpx_ports/mem_ops.h
@@ -89,7 +89,7 @@ static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) {
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[0] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[0]) << 24;
   val |= mem[1] << 16;
   val |= mem[2] << 8;
   val |= mem[3];
@@ -125,7 +125,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
   unsigned MEM_VALUE_T  val;
   const MAU_T          *mem = (const MAU_T *)vmem;
 
-  val = mem[3] << 24;
+  val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
   val |= mem[2] << 16;
   val |= mem[1] << 8;
   val |= mem[0];
@@ -168,8 +168,8 @@ mem_get_s_generic(le, 32)
 static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >> 8) & 0xff;
-  mem[1] = (val >> 0) & 0xff;
+  mem[0] = (MAU_T)((val >> 8) & 0xff);
+  mem[1] = (MAU_T)((val >> 0) & 0xff);
 }
 
 #undef  mem_put_be24
@@ -177,9 +177,9 @@ static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
 static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >> 16) & 0xff;
-  mem[1] = (val >>  8) & 0xff;
-  mem[2] = (val >>  0) & 0xff;
+  mem[0] = (MAU_T)((val >> 16) & 0xff);
+  mem[1] = (MAU_T)((val >>  8) & 0xff);
+  mem[2] = (MAU_T)((val >>  0) & 0xff);
 }
 
 #undef  mem_put_be32
@@ -187,10 +187,10 @@ static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
 static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >> 24) & 0xff;
-  mem[1] = (val >> 16) & 0xff;
-  mem[2] = (val >>  8) & 0xff;
-  mem[3] = (val >>  0) & 0xff;
+  mem[0] = (MAU_T)((val >> 24) & 0xff);
+  mem[1] = (MAU_T)((val >> 16) & 0xff);
+  mem[2] = (MAU_T)((val >>  8) & 0xff);
+  mem[3] = (MAU_T)((val >>  0) & 0xff);
 }
 
 #undef  mem_put_le16
@@ -198,8 +198,8 @@ static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
 static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >>  0) & 0xff;
-  mem[1] = (val >>  8) & 0xff;
+  mem[0] = (MAU_T)((val >> 0) & 0xff);
+  mem[1] = (MAU_T)((val >> 8) & 0xff);
 }
 
 #undef  mem_put_le24
@@ -207,9 +207,9 @@ static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
 static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >>  0) & 0xff;
-  mem[1] = (val >>  8) & 0xff;
-  mem[2] = (val >> 16) & 0xff;
+  mem[0] = (MAU_T)((val >>  0) & 0xff);
+  mem[1] = (MAU_T)((val >>  8) & 0xff);
+  mem[2] = (MAU_T)((val >> 16) & 0xff);
 }
 
 #undef  mem_put_le32
@@ -217,10 +217,10 @@ static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
 static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
   MAU_T *mem = (MAU_T *)vmem;
 
-  mem[0] = (val >>  0) & 0xff;
-  mem[1] = (val >>  8) & 0xff;
-  mem[2] = (val >> 16) & 0xff;
-  mem[3] = (val >> 24) & 0xff;
+  mem[0] = (MAU_T)((val >>  0) & 0xff);
+  mem[1] = (MAU_T)((val >>  8) & 0xff);
+  mem[2] = (MAU_T)((val >> 16) & 0xff);
+  mem[3] = (MAU_T)((val >> 24) & 0xff);
 }
 
 #endif  // VPX_PORTS_MEM_OPS_H_
diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h
index c16111f..46f6173 100644
--- a/vpx_ports/mem_ops_aligned.h
+++ b/vpx_ports/mem_ops_aligned.h
@@ -28,8 +28,8 @@
  * could redefine these macros.
  */
 #define swap_endian_16(val,raw) do {\
-    val = ((raw>>8) & 0x00ff) \
-          | ((raw<<8) & 0xff00);\
+    val = (uint16_t)(((raw>>8) & 0x00ff) \
+          | ((raw<<8) & 0xff00));\
   } while(0)
 #define swap_endian_32(val,raw) do {\
     val = ((raw>>24) & 0x000000ff) \
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index f1df394..da04db4 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
@@ -13,63 +13,83 @@
 
 #include "vpx_config.h"
 
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: These functions use static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   ...
+ *   vpx_once(foo);
+ *
+ *   file2.c:
+ *     vpx_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ *   vpx_once(foo);
+ *   vpx_once(bar):
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
 #if CONFIG_MULTITHREAD && defined(_WIN32)
 #include <windows.h>
 #include <stdlib.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ *
+ * As a static, once_state will be zero-initialized as program start.
+ */
+static LONG once_state;
 static void once(void (*func)(void))
 {
-    static CRITICAL_SECTION *lock;
-    static LONG waiters;
-    static int done;
-    void *lock_ptr = &lock;
-
-    /* If the initialization is complete, return early. This isn't just an
-     * optimization, it prevents races on the destruction of the global
-     * lock.
+    /* Try to advance once_state from its initial value of 0 to 1.
+     * Only one thread can succeed in doing so.
      */
-    if(done)
+    if (InterlockedCompareExchange(&once_state, 1, 0) == 0) {
+        /* We're the winning thread, having set once_state to 1.
+         * Call our function. */
+        func();
+        /* Now advance once_state to 2, unblocking any other threads. */
+        InterlockedIncrement(&once_state);
         return;
-
-    InterlockedIncrement(&waiters);
-
-    /* Get a lock. We create one and try to make it the one-true-lock,
-     * throwing it away if we lost the race.
-     */
-
-    {
-        /* Scope to protect access to new_lock */
-        CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
-        InitializeCriticalSection(new_lock);
-        if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
-        {
-            DeleteCriticalSection(new_lock);
-            free(new_lock);
-        }
     }
 
-    /* At this point, we have a lock that can be synchronized on. We don't
-     * care which thread actually performed the allocation.
+    /* We weren't the winning thread, but we want to block on
+     * the state variable so we don't return before func()
+     * has finished executing elsewhere.
+     *
+     * Try to advance once_state from 2 to 2, which is only possible
+     * after the winning thead advances it from 1 to 2.
      */
-
-    EnterCriticalSection(lock);
-
-    if (!done)
-    {
-        func();
-        done = 1;
+    while (InterlockedCompareExchange(&once_state, 2, 2) != 2) {
+        /* State isn't yet 2. Try again.
+         *
+         * We are used for singleton initialization functions,
+         * which should complete quickly. Contention will likewise
+         * be rare, so it's worthwhile to use a simple but cpu-
+         * intensive busy-wait instead of successive backoff,
+         * waiting on a kernel object, or another heavier-weight scheme.
+         *
+         * We can at least yield our timeslice.
+         */
+        Sleep(0);
     }
 
-    LeaveCriticalSection(lock);
-
-    /* Last one out should free resources. The destructed objects are
-     * protected by checking if(done) above.
+    /* We've seen once_state advance to 2, so we know func()
+     * has been called. And we've left once_state as we found it,
+     * so other threads will have the same experience.
+     *
+     * It's safe to return now.
      */
-    if(!InterlockedDecrement(&waiters))
-    {
-        DeleteCriticalSection(lock);
-        free(lock);
-        lock = NULL;
-    }
+    return;
 }
 
 
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 5da346e..bae25ac 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -12,6 +12,11 @@
 #ifndef VPX_PORTS_X86_H_
 #define VPX_PORTS_X86_H_
 #include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h>  /* For __cpuidex, __rdtsc */
+#endif
+
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"
 
@@ -77,16 +82,12 @@ typedef enum {
 #else /* end __SUNPRO__ */
 #if ARCH_X86_64
 #if defined(_MSC_VER) && _MSC_VER > 1500
-void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
-#pragma intrinsic(__cpuidex)
 #define cpuid(func, func2, a, b, c, d) do {\
     int regs[4];\
     __cpuidex(regs, func, func2); \
     a = regs[0];  b = regs[1];  c = regs[2];  d = regs[3];\
   } while(0)
 #else
-void __cpuid(int CPUInfo[4], int info_type);
-#pragma intrinsic(__cpuid)
 #define cpuid(func, func2, a, b, c, d) do {\
     int regs[4];\
     __cpuid(regs, func); \
@@ -172,7 +173,7 @@ x86_simd_caps(void) {
   env = getenv("VPX_SIMD_CAPS_MASK");
 
   if (env && *env)
-    mask = strtol(env, NULL, 0);
+    mask = (unsigned int)strtoul(env, NULL, 0);
 
   /* Ensure that the CPUID instruction supports extended features */
   cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
@@ -212,10 +213,11 @@ x86_simd_caps(void) {
   return flags & mask;
 }
 
-#if ARCH_X86_64 && defined(_MSC_VER)
-unsigned __int64 __rdtsc(void);
-#pragma intrinsic(__rdtsc)
-#endif
+// Note:
+//  32-bit CPU cycle counter is light-weighted for most function performance
+//  measurement. For large function (CPU time > a couple of seconds), 64-bit
+//  counter should be used.
+// 32-bit CPU cycle counter
 static INLINE unsigned int
 x86_readtsc(void) {
 #if defined(__GNUC__) && __GNUC__
@@ -234,7 +236,25 @@ x86_readtsc(void) {
 #endif
 #endif
 }
-
+// 64-bit CPU cycle counter
+static INLINE uint64_t
+x86_readtsc64(void) {
+#if defined(__GNUC__) && __GNUC__
+  uint32_t hi, lo;
+  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+  return ((uint64_t)hi << 32) | lo;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  uint_t hi, lo;
+  asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
+  return ((uint64_t)hi << 32) | lo;
+#else
+#if ARCH_X86_64
+  return (uint64_t)__rdtsc();
+#else
+  __asm  rdtsc;
+#endif
+#endif
+}
 
 #if defined(__GNUC__) && __GNUC__
 #define x86_pause_hint()\
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
 %if ABI_IS_32BIT
   %if CONFIG_PIC=1
   %ifidn __OUTPUT_FORMAT__,elf32
-    %define GET_GOT_SAVE_ARG 1
     %define WRT_PLT wrt ..plt
     %macro GET_GOT 1
       extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
       %define RESTORE_GOT pop %1
     %endmacro
   %elifidn __OUTPUT_FORMAT__,macho32
-    %define GET_GOT_SAVE_ARG 1
     %macro GET_GOT 1
       push %1
       call %%get_got
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 7739218..6bbb6d8 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -114,7 +114,7 @@ int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
   return -2;
 }
 
-#if CONFIG_VP9 || CONFIG_VP10
+#if CONFIG_VP9
 // TODO(jkoleszar): Maybe replace this with struct vpx_image
 
 int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
@@ -160,29 +160,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
     const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
                                   (uint64_t)uv_stride + byte_alignment;
 
-#if CONFIG_ALPHA
-    const int alpha_width = aligned_width;
-    const int alpha_height = aligned_height;
-    const int alpha_stride = y_stride;
-    const int alpha_border_w = border;
-    const int alpha_border_h = border;
-    const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
-                                      (uint64_t)alpha_stride + byte_alignment;
-#if CONFIG_VP9_HIGHBITDEPTH
-    const uint64_t frame_size = (1 + use_highbitdepth) *
-        (yplane_size + 2 * uvplane_size + alpha_plane_size);
-#else
-    const uint64_t frame_size = yplane_size + 2 * uvplane_size +
-                                alpha_plane_size;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#else
 #if CONFIG_VP9_HIGHBITDEPTH
     const uint64_t frame_size =
         (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
 #else
     const uint64_t frame_size = yplane_size + 2 * uvplane_size;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_ALPHA
 
     uint8_t *buf = NULL;
 
@@ -203,6 +186,15 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
         return -1;
 
       ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test. It will cause a perf loss, so only do this for
+      // msan test.
+      memset(ybf->buffer_alloc, 0, (int)frame_size);
+#endif
+#endif
     } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
       // Allocation to hold larger frame, or first allocation.
       vpx_free(ybf->buffer_alloc);
@@ -268,14 +260,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
         buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) +
         uv_border_w, vp9_byte_align);
 
-#if CONFIG_ALPHA
-    ybf->alpha_width = alpha_width;
-    ybf->alpha_height = alpha_height;
-    ybf->alpha_stride = alpha_stride;
-    ybf->alpha_buffer = (uint8_t *)yv12_align_addr(
-        buf + yplane_size + 2 * uvplane_size +
-        (alpha_border_h * alpha_stride) + alpha_border_w, vp9_byte_align);
-#endif
     ybf->corrupted = 0; /* assume not corrupted by errors */
     return 0;
   }
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 670144b..52f0aff 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -157,7 +157,7 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
                uv_border + ybf->uv_width - ybf->uv_crop_width);
 }
 
-#if CONFIG_VP9 || CONFIG_VP10
+#if CONFIG_VP9
 static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   const int c_w = ybf->uv_crop_width;
   const int c_h = ybf->uv_crop_height;
@@ -211,13 +211,13 @@ void vpx_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   memcpy(dst, src, num * sizeof(uint16_t));
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_VP9 || CONFIG_VP10
+#endif  // CONFIG_VP9
 
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 56b952b..44b115c 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -22,7 +22,7 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb
 
 add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
 
-if ((vpx_config("CONFIG_VP9") eq "yes") || (vpx_config("CONFIG_VP10") eq "yes")) {
+if (vpx_config("CONFIG_VP9") eq "yes") {
     add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
     specialize qw/vpx_extend_frame_borders dspr2/;
 
diff --git a/vpx_util/vpx_thread.h b/vpx_util/vpx_thread.h
index de63c4d..2062abd 100644
--- a/vpx_util/vpx_thread.h
+++ b/vpx_util/vpx_thread.h
@@ -147,6 +147,152 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
   pthread_mutex_lock(mutex);
   return !ok;
 }
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>    // NOLINT
+
+#include <errno.h>  // NOLINT
+#include <stdlib.h> // NOLINT
+#include <sys/builtin.h> // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+  HEV event_sem_;
+  HEV ack_sem_;
+  volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+  void* (*start_)(void*);
+  void* arg_;
+} thread_arg;
+
+static void thread_start(void* arg) {
+  thread_arg targ = *(thread_arg *)arg;
+  free(arg);
+
+  targ.start_(targ.arg_);
+}
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 void* (*start)(void*),
+                                 void* arg) {
+  int tid;
+  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+  if (targ == NULL) return 1;
+
+  (void)attr;
+
+  targ->start_ = start;
+  targ->arg_ = arg;
+  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+  if (tid == -1) {
+    free(targ);
+    return 1;
+  }
+
+  *thread = tid;
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;
+  return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;
+  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= DosCloseEventSem(condition->event_sem_) == 0;
+  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  int ok = 1;
+  (void)cond_attr;
+
+  ok &= DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE)
+          == 0;
+  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+  if (!ok) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  condition->wait_count_ = 0;
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+    ok &= DosPostEventSem(condition->event_sem_) == 0;
+    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+  }
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+      ok &= pthread_cond_signal(condition) == 0;
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok = 1;
+
+  __atomic_increment(&condition->wait_count_);
+
+  ok &= pthread_mutex_unlock(mutex) == 0;
+
+  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+  __atomic_decrement(&condition->wait_count_);
+
+  ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+  pthread_mutex_lock(mutex);
+
+  return !ok;
+}
 #else  // _WIN32
 #include <pthread.h> // NOLINT
 # define THREADFN void*
diff --git a/vpxdec.c b/vpxdec.c
index 285d58e..dbe64aa 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -28,7 +28,7 @@
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
 
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -257,8 +257,7 @@ static int read_frame(struct VpxDecInputContext *input, uint8_t **buf,
   switch (input->vpx_input_ctx->file_type) {
 #if CONFIG_WEBM_IO
     case FILE_TYPE_WEBM:
-      return webm_read_frame(input->webm_ctx,
-                             buf, bytes_in_buffer, buffer_size);
+      return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer);
 #endif
     case FILE_TYPE_RAW:
       return raw_read_frame(input->vpx_input_ctx->file,
@@ -642,7 +641,7 @@ static int main_loop(int argc, const char **argv_) {
       summary = 1;
     else if (arg_match(&arg, &threadsarg, argi))
       cfg.threads = arg_parse_uint(&arg);
-#if CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP9_DECODER
     else if (arg_match(&arg, &frameparallelarg, argi))
       frame_parallel = 1;
 #endif
diff --git a/vpxenc.c b/vpxenc.c
index cb78226..efcf064 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -32,10 +32,10 @@
 #include "./ivfenc.h"
 #include "./tools_common.h"
 
-#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
-#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER || CONFIG_VP10_DECODER
+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
 
@@ -374,21 +374,22 @@ static const int vp8_arg_ctrl_map[] = {
 };
 #endif
 
-#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP9_ENCODER
 static const arg_def_t cpu_used_vp9 = ARG_DEF(
     NULL, "cpu-used", 1, "CPU Used (-8..8)");
 static const arg_def_t tile_cols = ARG_DEF(
     NULL, "tile-columns", 1, "Number of tile columns to use, log2");
 static const arg_def_t tile_rows = ARG_DEF(
-    NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+    NULL, "tile-rows", 1,
+    "Number of tile rows to use, log2 (set to 0 while threads > 1)");
 static const arg_def_t lossless = ARG_DEF(
-    NULL, "lossless", 1, "Lossless mode");
+    NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
 static const arg_def_t frame_parallel_decoding = ARG_DEF(
     NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
 static const arg_def_t aq_mode = ARG_DEF(
     NULL, "aq-mode", 1,
     "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
-    "3: cyclic refresh)");
+    "3: cyclic refresh, 4: equator360)");
 static const arg_def_t frame_periodic_boost = ARG_DEF(
     NULL, "frame-boost", 1,
     "Enable frame periodic boost (0: off (default), 1: on)");
@@ -443,6 +444,11 @@ static const struct arg_enum_list tune_content_enum[] = {
 
 static const arg_def_t tune_content = ARG_DEF_ENUM(
     NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+
+static const arg_def_t target_level = ARG_DEF(
+    NULL, "target-level", 1,
+    "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
+    " 11: level 1.1; ... 62: level 6.2)");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -453,7 +459,10 @@ static const arg_def_t *vp9_args[] = {
   &gf_cbr_boost_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   &noise_sens, &tune_content, &input_color_space,
-  &min_gf_interval, &max_gf_interval,
+  &min_gf_interval, &max_gf_interval, &target_level,
+#if CONFIG_VP9_HIGHBITDEPTH
+  &bitdeptharg, &inbitdeptharg,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   NULL
 };
 static const int vp9_arg_ctrl_map[] = {
@@ -466,33 +475,7 @@ static const int vp9_arg_ctrl_map[] = {
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
   VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
-  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
-  0
-};
-#endif
-
-#if CONFIG_VP10_ENCODER
-static const arg_def_t *vp10_args[] = {
-  &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
-  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
-  &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
-  &gf_cbr_boost_pct, &lossless,
-  &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
-  &noise_sens, &tune_content, &input_color_space,
-  &min_gf_interval, &max_gf_interval,
-  NULL
-};
-static const int vp10_arg_ctrl_map[] = {
-  VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,
-  VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,
-  VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
-  VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
-  VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
-  VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
-  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
-  VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
-  VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
-  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL,
   0
 };
 #endif
@@ -524,10 +507,6 @@ void usage_exit(void) {
   fprintf(stderr, "\nVP9 Specific Options:\n");
   arg_show_usage(stderr, vp9_args);
 #endif
-#if CONFIG_VP10_ENCODER
-  fprintf(stderr, "\nVP10 Specific Options:\n");
-  arg_show_usage(stderr, vp10_args);
-#endif
   fprintf(stderr, "\nStream timebase (--timebase):\n"
           "  The desired precision of timestamps in the output, expressed\n"
           "  in fractional seconds. Default is 1/1000.\n");
@@ -773,9 +752,7 @@ static int compare_img(const vpx_image_t *const img1,
 
 
 #define NELEMENTS(x) (sizeof(x)/sizeof(x[0]))
-#if CONFIG_VP10_ENCODER
-#define ARG_CTRL_CNT_MAX NELEMENTS(vp10_arg_ctrl_map)
-#elif CONFIG_VP9_ENCODER
+#if CONFIG_VP9_ENCODER
 #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map)
 #else
 #define ARG_CTRL_CNT_MAX NELEMENTS(vp8_arg_ctrl_map)
@@ -783,7 +760,7 @@ static int compare_img(const vpx_image_t *const img1,
 
 #if !CONFIG_WEBM_IO
 typedef int stereo_format_t;
-struct EbmlGlobal { int debug; };
+struct WebmOutputContext { int debug; };
 #endif
 
 /* Per-stream configuration */
@@ -798,7 +775,6 @@ struct stream_config {
   int                       arg_ctrls[ARG_CTRL_CNT_MAX][2];
   int                       arg_ctrl_cnt;
   int                       write_webm;
-  int                       have_kf_max_dist;
 #if CONFIG_VP9_HIGHBITDEPTH
   // whether to use 16bit internal buffers
   int                       use_16bit_internal;
@@ -812,7 +788,7 @@ struct stream_state {
   struct stream_config      config;
   FILE                     *file;
   struct rate_hist         *rate_hist;
-  struct EbmlGlobal         ebml;
+  struct WebmOutputContext  webm_ctx;
   uint64_t                  psnr_sse_total;
   uint64_t                  psnr_samples_total;
   double                    psnr_totals[4];
@@ -943,7 +919,7 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
   }
   /* Validate global config */
   if (global->passes == 0) {
-#if CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+#if CONFIG_VP9_ENCODER
     // Make default VP9 passes = 2 until there is a better quality 1-pass
     // encoder
     if (global->codec != NULL && global->codec->name != NULL)
@@ -1055,13 +1031,13 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global,
     stream->config.write_webm = 1;
 #if CONFIG_WEBM_IO
     stream->config.stereo_fmt = STEREO_FORMAT_MONO;
-    stream->ebml.last_pts_ns = -1;
-    stream->ebml.writer = NULL;
-    stream->ebml.segment = NULL;
+    stream->webm_ctx.last_pts_ns = -1;
+    stream->webm_ctx.writer = NULL;
+    stream->webm_ctx.segment = NULL;
 #endif
 
     /* Allows removal of the application version from the EBML tags */
-    stream->ebml.debug = global->debug;
+    stream->webm_ctx.debug = global->debug;
 
     /* Default lag_in_frames is 0 in realtime mode */
     if (global->deadline == VPX_DL_REALTIME)
@@ -1101,13 +1077,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
     ctrl_args = vp9_args;
     ctrl_args_map = vp9_arg_ctrl_map;
 #endif
-#if CONFIG_VP10_ENCODER
-  } else if (strcmp(global->codec->name, "vp10") == 0) {
-    // TODO(jingning): Reuse VP9 specific encoder configuration parameters.
-    // Consider to expand this set for VP10 encoder control.
-    ctrl_args = vp10_args;
-    ctrl_args_map = vp10_arg_ctrl_map;
-#endif
   }
 
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
@@ -1218,13 +1187,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
       config->cfg.kf_min_dist = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &kf_max_dist, argi)) {
       config->cfg.kf_max_dist = arg_parse_uint(&arg);
-      config->have_kf_max_dist = 1;
     } else if (arg_match(&arg, &kf_disabled, argi)) {
       config->cfg.kf_mode = VPX_KF_DISABLED;
 #if CONFIG_VP9_HIGHBITDEPTH
     } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
-      if (strcmp(global->codec->name, "vp9") == 0 ||
-          strcmp(global->codec->name, "vp10") == 0) {
+      if (strcmp(global->codec->name, "vp9") == 0) {
         test_16bit_internal = 1;
       }
 #endif
@@ -1258,8 +1225,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
     }
   }
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (strcmp(global->codec->name, "vp9") == 0 ||
-      strcmp(global->codec->name, "vp10") == 0) {
+  if (strcmp(global->codec->name, "vp9") == 0) {
     config->use_16bit_internal = test_16bit_internal |
                                  (config->cfg.g_profile > 1);
   }
@@ -1346,19 +1312,6 @@ static void set_stream_dimensions(struct stream_state *stream,
   }
 }
 
-
-static void set_default_kf_interval(struct stream_state *stream,
-                                    struct VpxEncoderConfig *global) {
-  /* Use a max keyframe interval of 5 seconds, if none was
-   * specified on the command line.
-   */
-  if (!stream->config.have_kf_max_dist) {
-    double framerate = (double)global->framerate.num / global->framerate.den;
-    if (framerate > 0.0)
-      stream->config.cfg.kf_max_dist = (unsigned int)(5.0 * framerate);
-  }
-}
-
 static const char* file_type_to_string(enum VideoFileType t) {
   switch (t) {
     case FILE_TYPE_RAW: return "RAW";
@@ -1457,13 +1410,15 @@ static void open_output_file(struct stream_state *stream,
 
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
-    stream->ebml.stream = stream->file;
-    write_webm_file_header(&stream->ebml, cfg,
+    stream->webm_ctx.stream = stream->file;
+    write_webm_file_header(&stream->webm_ctx, cfg,
                            &global->framerate,
                            stream->config.stereo_fmt,
                            global->codec->fourcc,
                            pixel_aspect_ratio);
   }
+#else
+  (void)pixel_aspect_ratio;
 #endif
 
   if (!stream->config.write_webm) {
@@ -1481,7 +1436,7 @@ static void close_output_file(struct stream_state *stream,
 
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
-    write_webm_file_footer(&stream->ebml);
+    write_webm_file_footer(&stream->webm_ctx);
   }
 #endif
 
@@ -1708,7 +1663,7 @@ static void get_cx_data(struct stream_state *stream,
         update_rate_histogram(stream->rate_hist, cfg, pkt);
 #if CONFIG_WEBM_IO
         if (stream->config.write_webm) {
-          write_webm_block(&stream->ebml, cfg, pkt);
+          write_webm_block(&stream->webm_ctx, cfg, pkt);
         }
 #endif
         if (!stream->config.write_webm) {
@@ -1996,7 +1951,7 @@ int main(int argc, const char **argv_) {
     usage_exit();
 
   /* Decide if other chroma subsamplings than 4:2:0 are supported */
-  if (global.codec->fourcc == VP9_FOURCC || global.codec->fourcc == VP10_FOURCC)
+  if (global.codec->fourcc == VP9_FOURCC)
     input.only_i420 = 0;
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
@@ -2060,9 +2015,11 @@ int main(int argc, const char **argv_) {
 
 #if !CONFIG_WEBM_IO
     FOREACH_STREAM({
-      stream->config.write_webm = 0;
-      warn("vpxenc was compiled without WebM container support."
-           "Producing IVF output");
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        warn("vpxenc was compiled without WebM container support."
+             "Producing IVF output");
+      }
     });
 #endif
 
@@ -2072,10 +2029,10 @@ int main(int argc, const char **argv_) {
     if (!global.have_framerate) {
       global.framerate.num = input.framerate.numerator;
       global.framerate.den = input.framerate.denominator;
+      FOREACH_STREAM(stream->config.cfg.g_timebase.den = global.framerate.num;
+                     stream->config.cfg.g_timebase.num = global.framerate.den);
     }
 
-    FOREACH_STREAM(set_default_kf_interval(stream, &global));
-
     /* Show configuration */
     if (global.verbose && pass == 0)
       FOREACH_STREAM(show_stream_config(stream, &global, &input));
@@ -2100,8 +2057,7 @@ int main(int argc, const char **argv_) {
     FOREACH_STREAM(initialize_encoder(stream, &global));
 
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (strcmp(global.codec->name, "vp9") == 0 ||
-        strcmp(global.codec->name, "vp10") == 0) {
+    if (strcmp(global.codec->name, "vp9") == 0) {
       // Check to see if at least one stream uses 16 bit internal.
       // Currently assume that the bit_depths for all streams using
       // highbitdepth are the same.
diff --git a/vpxstats.c b/vpxstats.c
index 172d893..16728ce 100644
--- a/vpxstats.c
+++ b/vpxstats.c
@@ -26,17 +26,6 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
     stats->buf.buf = NULL;
     res = (stats->file != NULL);
   } else {
-#if USE_POSIX_MMAP
-    struct stat stat_buf;
-    int fd;
-
-    fd = open(fpf, O_RDONLY);
-    stats->file = fdopen(fd, "rb");
-    fstat(fd, &stat_buf);
-    stats->buf.sz = stat_buf.st_size;
-    stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE, fd, 0);
-    res = (stats->buf.buf != NULL);
-#else
     size_t nbytes;
 
     stats->file = fopen(fpf, "rb");
@@ -58,7 +47,6 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
 
     nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
     res = (nbytes == stats->buf.sz);
-#endif  /* USE_POSIX_MMAP */
   }
 
   return res;
@@ -82,11 +70,7 @@ int stats_open_mem(stats_io_t *stats, int pass) {
 void stats_close(stats_io_t *stats, int last_pass) {
   if (stats->file) {
     if (stats->pass == last_pass) {
-#if USE_POSIX_MMAP
-      munmap(stats->buf.buf, stats->buf.sz);
-#else
       free(stats->buf.buf);
-#endif  /* USE_POSIX_MMAP */
     }
 
     fclose(stats->file);
diff --git a/webmdec.cc b/webmdec.cc
index f541cfe..36dbd92 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -13,8 +13,8 @@
 #include <cstring>
 #include <cstdio>
 
-#include "third_party/libwebm/mkvparser.hpp"
-#include "third_party/libwebm/mkvreader.hpp"
+#include "third_party/libwebm/mkvparser/mkvparser.h"
+#include "third_party/libwebm/mkvparser/mkvreader.h"
 
 namespace {
 
@@ -103,8 +103,6 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
     vpx_ctx->fourcc = VP8_FOURCC;
   } else if (!strncmp(video_track->GetCodecId(), "V_VP9", 5)) {
     vpx_ctx->fourcc = VP9_FOURCC;
-  } else if (!strncmp(video_track->GetCodecId(), "V_VP10", 6)) {
-    vpx_ctx->fourcc = VP10_FOURCC;
   } else {
     rewind_and_reset(webm_ctx, vpx_ctx);
     return 0;
@@ -122,7 +120,6 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
 
 int webm_read_frame(struct WebmInputContext *webm_ctx,
                     uint8_t **buffer,
-                    size_t *bytes_in_buffer,
                     size_t *buffer_size) {
   // This check is needed for frame parallel decoding, in which case this
   // function could be called even after it has reached end of input stream.
@@ -147,7 +144,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
     } else if (block_entry_eos || block_entry->EOS()) {
       cluster = segment->GetNext(cluster);
       if (cluster == NULL || cluster->EOS()) {
-        *bytes_in_buffer = 0;
+        *buffer_size = 0;
         webm_ctx->reached_eos = 1;
         return 1;
       }
@@ -164,7 +161,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
       }
       get_new_block = true;
     }
-    if (status) {
+    if (status || block_entry == NULL) {
       return -1;
     }
     if (get_new_block) {
@@ -187,10 +184,9 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
     if (*buffer == NULL) {
       return -1;
     }
-    *buffer_size = frame.len;
     webm_ctx->buffer = *buffer;
   }
-  *bytes_in_buffer = frame.len;
+  *buffer_size = frame.len;
   webm_ctx->timestamp_ns = block->GetTime(cluster);
   webm_ctx->is_key_frame = block->IsKey();
 
@@ -203,10 +199,9 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
                          struct VpxInputContext *vpx_ctx) {
   uint32_t i = 0;
   uint8_t *buffer = NULL;
-  size_t bytes_in_buffer = 0;
   size_t buffer_size = 0;
   while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
-    if (webm_read_frame(webm_ctx, &buffer, &bytes_in_buffer, &buffer_size)) {
+    if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) {
       break;
     }
     ++i;
diff --git a/webmdec.h b/webmdec.h
index 7d16380..aa371f3 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -42,22 +42,18 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
 
 // Reads a WebM Video Frame. Memory for the buffer is created, owned and managed
 // by this function. For the first call, |buffer| should be NULL and
-// |*bytes_in_buffer| should be 0. Once all the frames are read and used,
+// |*buffer_size| should be 0. Once all the frames are read and used,
 // webm_free() should be called, otherwise there will be a leak.
 // Parameters:
 //      webm_ctx - WebmInputContext object
 //      buffer - pointer where the frame data will be filled.
-//      bytes_in_buffer - pointer to buffer size.
-//      buffer_size - unused TODO(vigneshv): remove this
+//      buffer_size - pointer to buffer size.
 // Return values:
 //      0 - Success
 //      1 - End of Stream
 //     -1 - Error
-// TODO(vigneshv): Make the return values consistent across all functions in
-// this file.
 int webm_read_frame(struct WebmInputContext *webm_ctx,
                     uint8_t **buffer,
-                    size_t *bytes_in_buffer,
                     size_t *buffer_size);
 
 // Guesses the frame rate of the input file based on the container timestamps.
diff --git a/webmenc.cc b/webmenc.cc
index d41e700..9929969 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -11,22 +11,22 @@
 
 #include <string>
 
-#include "third_party/libwebm/mkvmuxer.hpp"
-#include "third_party/libwebm/mkvmuxerutil.hpp"
-#include "third_party/libwebm/mkvwriter.hpp"
+#include "third_party/libwebm/mkvmuxer/mkvmuxer.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+#include "third_party/libwebm/mkvmuxer/mkvwriter.h"
 
 namespace {
 const uint64_t kDebugTrackUid = 0xDEADBEEF;
 const int kVideoTrackNumber = 1;
 }  // namespace
 
-void write_webm_file_header(struct EbmlGlobal *glob,
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
                             unsigned int fourcc,
                             const struct VpxRational *par) {
-  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(glob->stream);
+  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
   mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
   segment->Init(writer);
   segment->set_mode(mkvmuxer::Segment::kFile);
@@ -36,7 +36,7 @@ void write_webm_file_header(struct EbmlGlobal *glob,
   const uint64_t kTimecodeScale = 1000000;
   info->set_timecode_scale(kTimecodeScale);
   std::string version = "vpxenc";
-  if (!glob->debug) {
+  if (!webm_ctx->debug) {
     version.append(std::string(" ") + vpx_codec_version_str());
   }
   info->set_writing_app(version.c_str());
@@ -55,13 +55,8 @@ void write_webm_file_header(struct EbmlGlobal *glob,
     codec_id = "V_VP8";
     break;
   case VP9_FOURCC:
-    codec_id = "V_VP9";
-    break;
-  case VP10_FOURCC:
-    codec_id = "V_VP10";
-    break;
   default:
-    codec_id = "V_VP10";
+    codec_id = "V_VP9";
     break;
   }
   video_track->set_codec_id(codec_id);
@@ -74,23 +69,23 @@ void write_webm_file_header(struct EbmlGlobal *glob,
     video_track->set_display_width(display_width);
     video_track->set_display_height(cfg->g_h);
   }
-  if (glob->debug) {
+  if (webm_ctx->debug) {
     video_track->set_uid(kDebugTrackUid);
   }
-  glob->writer = writer;
-  glob->segment = segment;
+  webm_ctx->writer = writer;
+  webm_ctx->segment = segment;
 }
 
-void write_webm_block(struct EbmlGlobal *glob,
+void write_webm_block(struct WebmOutputContext *webm_ctx,
                       const vpx_codec_enc_cfg_t *cfg,
                       const vpx_codec_cx_pkt_t *pkt) {
   mkvmuxer::Segment *const segment =
-      reinterpret_cast<mkvmuxer::Segment*>(glob->segment);
+      reinterpret_cast<mkvmuxer::Segment*>(webm_ctx->segment);
   int64_t pts_ns = pkt->data.frame.pts * 1000000000ll *
                    cfg->g_timebase.num / cfg->g_timebase.den;
-  if (pts_ns <= glob->last_pts_ns)
-    pts_ns = glob->last_pts_ns + 1000000;
-  glob->last_pts_ns = pts_ns;
+  if (pts_ns <= webm_ctx->last_pts_ns)
+    pts_ns = webm_ctx->last_pts_ns + 1000000;
+  webm_ctx->last_pts_ns = pts_ns;
 
   segment->AddFrame(static_cast<uint8_t*>(pkt->data.frame.buf),
                     pkt->data.frame.sz,
@@ -99,14 +94,14 @@ void write_webm_block(struct EbmlGlobal *glob,
                     pkt->data.frame.flags & VPX_FRAME_IS_KEY);
 }
 
-void write_webm_file_footer(struct EbmlGlobal *glob) {
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
   mkvmuxer::MkvWriter *const writer =
-      reinterpret_cast<mkvmuxer::MkvWriter*>(glob->writer);
+      reinterpret_cast<mkvmuxer::MkvWriter*>(webm_ctx->writer);
   mkvmuxer::Segment *const segment =
-      reinterpret_cast<mkvmuxer::Segment*>(glob->segment);
+      reinterpret_cast<mkvmuxer::Segment*>(webm_ctx->segment);
   segment->Finalize();
   delete segment;
   delete writer;
-  glob->writer = NULL;
-  glob->segment = NULL;
+  webm_ctx->writer = NULL;
+  webm_ctx->segment = NULL;
 }
diff --git a/webmenc.h b/webmenc.h
index c255d3d..ad30664 100644
--- a/webmenc.h
+++ b/webmenc.h
@@ -20,8 +20,7 @@
 extern "C" {
 #endif
 
-/* TODO(vigneshv): Rename this struct */
-struct EbmlGlobal {
+struct WebmOutputContext {
   int debug;
   FILE *stream;
   int64_t last_pts_ns;
@@ -38,18 +37,18 @@ typedef enum stereo_format {
   STEREO_FORMAT_RIGHT_LEFT = 11
 } stereo_format_t;
 
-void write_webm_file_header(struct EbmlGlobal *glob,
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             stereo_format_t stereo_fmt,
                             unsigned int fourcc,
                             const struct VpxRational *par);
 
-void write_webm_block(struct EbmlGlobal *glob,
+void write_webm_block(struct WebmOutputContext *webm_ctx,
                       const vpx_codec_enc_cfg_t *cfg,
                       const vpx_codec_cx_pkt_t *pkt);
 
-void write_webm_file_footer(struct EbmlGlobal *glob);
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx);
 
 #ifdef __cplusplus
 }  // extern "C"

-- 
libvpx packaging