[SCM] libde265/upstream: Imported Upstream version 1.0.2

Thu Jul 16 09:09:39 UTC 2015

The following commit has been merged in the upstream branch:
commit 9df95765368e7863e836348714709c09a0c5757c
Author: Joachim Bauch <bauch at struktur.de>
Date:   Wed Jul 15 18:08:18 2015 +0200

    Imported Upstream version 1.0.2

diff --git a/.travis.yml b/.travis.yml
index 8079d20..99dd894 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,14 +5,24 @@ compiler:
   - gcc
 env:
   - HOST= WINE= DECODESTREAMS=
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-nolf
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-sao
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles-nolf
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-weighted
-  - HOST= WINE= DECODESTREAMS=libde265-teststreams-wpp-nolf
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-fuzzing THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-fuzzing THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-nolf THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-nolf THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-sao THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-sao THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles-nolf THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-tiles-nolf THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-weighted THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-weighted THREADING=--single-threaded
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-wpp-nolf THREADING=
+  - HOST= WINE= DECODESTREAMS=libde265-teststreams-wpp-nolf THREADING=--single-threaded
   - HOST=i686-w64-mingw32 WINE=wine DECODESTREAMS=
   - HOST=x86_64-w64-mingw32 WINE=wine64 DECODESTREAMS=
+  - HOST=arm-linux-gnueabihf WINE= DECODESTREAMS=
+  - HOST=cmake WINE= DECODESTREAMS=
 
 matrix:
   include:
@@ -23,28 +33,38 @@ before_install:
   - sh -c "if [ ! -z '$DECODESTREAMS' ]; then sudo add-apt-repository -y ppa:strukturag/libde265; fi"
   - sudo apt-get update -qq
   - sh -c "if [ -z '$HOST' ]; then sudo apt-get install -qq valgrind libsdl-dev libqt4-dev libswscale-dev; fi"
-  - sh -c "if [ ! -z '$HOST' ]; then sudo apt-get install -qq wine; fi"
+  - sh -c "if [ -z '$HOST' ] && [ -z '$DECODESTREAMS' ]; then sudo apt-get install -qq devscripts; fi"
+  - sh -c "if [ ! -z '$WINE' ]; then sudo apt-get install -qq wine; fi"
   - sh -c "if [ '$WINE' = 'wine'   ]; then sudo apt-get install -qq gcc-mingw-w64-i686   g++-mingw-w64-i686   binutils-mingw-w64-i686   mingw-w64-dev; fi"
   - sh -c "if [ '$WINE' = 'wine64' ]; then sudo apt-get install -qq gcc-mingw-w64-x86-64 g++-mingw-w64-x86-64 binutils-mingw-w64-x86-64 mingw-w64-dev; fi"
+  - sh -c "if ( echo '$HOST' | grep -q '^arm' ); then sudo apt-get install -qq g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf qemu-user; fi"
   - sh -c "if [ ! -z '$DECODESTREAMS' ]; then sudo apt-get install $DECODESTREAMS; fi"
 
 install:
   - git clone https://github.com/strukturag/libde265-data.git
 
 before_script:
-  - ./autogen.sh
-  - if [ ! -z "$HOST" ]; then unset CC; fi
-  - if [ ! -z "$HOST" ]; then unset CXX; fi
-  - ./configure --host=$HOST
+  - if [ "$HOST" != "cmake" ]; then ./autogen.sh; fi
+  - if [ ! -z "$HOST" ] && [ "$HOST" != "cmake" ]; then unset CC; fi
+  - if [ ! -z "$HOST" ] && [ "$HOST" != "cmake" ]; then unset CXX; fi
+  - if [ "$HOST" != "cmake" ]; then ./configure --host=$HOST; fi
+  - if [ "$HOST" = "cmake" ]; then cmake .; fi
 
 script:
+  - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then ./scripts/check_licenses.sh; fi"
   - make
+  - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then make dist && mkdir dist-test && cd dist-test && tar xzf ../libde265-*.tar.gz && cd libde265-* && ./configure && make; fi"
+  - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then make dist && mkdir dist-cmake-test && cd dist-cmake-test && tar xzf ../libde265-*.tar.gz && cd libde265-* && cmake . && make; fi"
   - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -q -c -f 100 ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
   - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -t 4 -q -c -f 100 ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
   - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -q -c -f 100 ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
   - sh -c "if [ -z "$HOST" ] && [ -z "$DECODESTREAMS" ]; then LD_LIBRARY_PATH=./libde265/.libs/ valgrind --tool=memcheck --quiet --error-exitcode=1 ./dec265/.libs/dec265 -t 4 -q -c -f 100 ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
-  - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
-  - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
-  - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
-  - sh -c "if [ ! -z "$HOST" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
-  - sh -c "if [ ! -z '$DECODESTREAMS' ]; then python scripts/decodestreams.py /var/lib/libde265-teststreams; fi"
+  - sh -c "if [ ! -z "$WINE" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
+  - sh -c "if [ ! -z "$WINE" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
+  - sh -c "if [ ! -z "$WINE" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
+  - sh -c "if [ ! -z "$WINE" ]; then WINEPREFIX=`pwd`/$WINE WINEPATH=/usr/lib/gcc/$HOST/4.6/ $WINE ./dec265/dec265.exe -t 4 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
+  - sh -c "if ( echo '$HOST' | grep -q '^arm' ); then LD_LIBRARY_PATH=`pwd`/libde265/.libs/ qemu-arm -L /usr/$HOST ./dec265/.libs/dec265 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
+  #- sh -c "if ( echo '$HOST' | grep -q '^arm' ); then LD_LIBRARY_PATH=`pwd`/libde265/.libs/ qemu-arm -L /usr/$HOST ./dec265/.libs/dec265 -t 4 -q -c ./libde265-data/IDR-only/paris-352x288-intra.bin; fi"
+  - sh -c "if ( echo '$HOST' | grep -q '^arm' ); then LD_LIBRARY_PATH=`pwd`/libde265/.libs/ qemu-arm -L /usr/$HOST ./dec265/.libs/dec265 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
+  #- sh -c "if ( echo '$HOST' | grep -q '^arm' ); then LD_LIBRARY_PATH=`pwd`/libde265/.libs/ qemu-arm -L /usr/$HOST ./dec265/.libs/dec265 -t 4 -q -c ./libde265-data/RandomAccess/paris-ra-wpp.bin; fi"
+  - sh -c "if [ ! -z '$DECODESTREAMS' ]; then python scripts/decodestreams.py $THREADING /var/lib/libde265-teststreams; fi"
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..6ac8563
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,68 @@
+project (libde265)
+cmake_minimum_required (VERSION 2.8)
+
+# The version number; keep in sync with configure.ac (A.B.C[.D] = 0xAABBCCDD).
+set (NUMERIC_VERSION 0x01000200)
+set (PACKAGE_VERSION 1.0.2)
+
+include (${CMAKE_ROOT}/Modules/CheckCCompilerFlag.cmake)
+include (${CMAKE_ROOT}/Modules/CheckIncludeFile.cmake)
+include (${CMAKE_ROOT}/Modules/FindSDL.cmake)
+include (${CMAKE_ROOT}/Modules/FindThreads.cmake)
+
+CHECK_INCLUDE_FILE(malloc.h HAVE_MALLOC_H)
+CHECK_INCLUDE_FILE(stdint.h HAVE_STDINT_H)
+CHECK_INCLUDE_FILE(stdbool.h HAVE_STDBOOL_H)
+
+if (HAVE_MALLOC_H)
+  add_definitions(-DHAVE_MALLOC_H)
+endif()
+if (HAVE_STDINT_H)
+  add_definitions(-DHAVE_STDINT_H)
+endif()
+if (HAVE_STDBOOL_H)
+  add_definitions(-DHAVE_STDBOOL_H)
+endif()
+
+configure_file (
+  "${PROJECT_SOURCE_DIR}/libde265/de265-version.h.in"
+  "${PROJECT_BINARY_DIR}/libde265/de265-version.h"
+)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+  set(GCC 1)
+  add_definitions(-Wall)
+  set(CMAKE_CXX_FLAGS "-std=gnu++0x ${CMAKE_CXX_FLAGS}")
+elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  add_definitions(-Wall)
+  set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
+endif()
+
+option(DISABLE_SSE "Disable SSE optimizations")
+if(NOT DISABLE_SSE)  # option() defaults to OFF: probe SSE 4.1 unless -DDISABLE_SSE=ON is given
+  if(MSVC)
+    set(SUPPORTS_SSE4_1 1)  # no compiler-flag probe is performed for MSVC
+  else()
+    CHECK_C_COMPILER_FLAG(-msse4.1 SUPPORTS_SSE4_1)  # result is cached in SUPPORTS_SSE4_1
+  endif()
+endif()
+
+include_directories ("${PROJECT_SOURCE_DIR}")
+include_directories ("${PROJECT_BINARY_DIR}")
+include_directories ("${PROJECT_SOURCE_DIR}/libde265")
+if(MSVC)
+  include_directories ("${PROJECT_SOURCE_DIR}/extra")
+  add_definitions(-DHAVE_STDINT_H)
+  add_definitions(-DHAVE_STDBOOL_H)
+  add_definitions(-DNOMINMAX)
+endif()
+
+if(UNIX)
+  set(LIBDE265_LIBRARY_NAME de265)
+else()
+  set(LIBDE265_LIBRARY_NAME libde265)
+endif()
+
+add_subdirectory (libde265)
+add_subdirectory (dec265)
+add_subdirectory (enc265)
diff --git a/Makefile.am b/Makefile.am
index bf3ae28..ddda31b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,6 +8,10 @@ if ENABLE_DEC265
 SUBDIRS+=dec265
 endif
 
+SUBDIRS+=enc265
+
+SUBDIRS+=tools
+
 if ENABLE_SHERLOCK265
 SUBDIRS+=sherlock265
 endif
@@ -16,8 +20,8 @@ EXTRA_DIST = .travis.yml \
   autogen.sh \
   build.bat \
   m4/m4_ax_check_compile_flag.m4 \
-  build \
   Makefile.vc7 \
+  CMakeLists.txt \
   README.md \
   libde265.png \
   */COPYING
diff --git a/Makefile.in b/Makefile.in
index 7744e49..4981700 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -85,15 +85,17 @@ subdir = .
 DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
 	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/configure $(am__configure_deps) \
-	$(srcdir)/config.h.in $(srcdir)/libde265.pc.in COPYING compile \
-	config.guess config.sub depcomp install-sh missing ltmain.sh
+	$(srcdir)/config.h.in $(srcdir)/libde265.pc.in COPYING TODO \
+	compile config.guess config.sub depcomp install-sh missing \
+	ltmain.sh
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
@@ -187,7 +189,7 @@ am__define_uniq_tagged_files = \
 ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
-DIST_SUBDIRS = libde265 dec265 sherlock265
+DIST_SUBDIRS = libde265 dec265 enc265 tools sherlock265
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
 top_distdir = $(distdir)
@@ -231,6 +233,7 @@ am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
   | sed 's|^\./|$(prefix)/|' | grep -v '$(infodir)/dir$$'
 distcleancheck_listfiles = find . -type f -print
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -239,9 +242,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -261,7 +266,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -372,14 +377,14 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-SUBDIRS = libde265 $(am__append_1) $(am__append_2)
+SUBDIRS = libde265 $(am__append_1) enc265 tools $(am__append_2)
 ACLOCAL_AMFLAGS = -I m4
 EXTRA_DIST = .travis.yml \
   autogen.sh \
   build.bat \
   m4/m4_ax_check_compile_flag.m4 \
-  build \
   Makefile.vc7 \
+  CMakeLists.txt \
   README.md \
   libde265.png \
   */COPYING
diff --git a/Makefile.vc7 b/Makefile.vc7
index 1353d9a..6d18fb4 100644
--- a/Makefile.vc7
+++ b/Makefile.vc7
@@ -1,3 +1,4 @@
 clean all:
     cd libde265 && $(MAKE) -f Makefile.vc7 $*
     cd dec265 && $(MAKE) -f Makefile.vc7 $*
+    cd enc265 && $(MAKE) -f Makefile.vc7 $*
diff --git a/README.md b/README.md
index 9665305..1cdf06b 100644
--- a/README.md
+++ b/README.md
@@ -77,8 +77,8 @@ or
 
 You can disable building of the example programs by running `./configure` with
 <pre>
-  --disable-dec265        Do not dec265 decoder program.
-  --disable-sherlock265   Do not build sherlock265 visual inspection program.
+  --disable-dec265        Do not build the dec265 decoder program.
+  --disable-sherlock265   Do not build the sherlock265 visual inspection program.
 </pre>
 
 Additional logging information can be turned on and off using these `./configure` flags:
@@ -89,6 +89,23 @@ Additional logging information can be turned on and off using these `./configure
 </pre>
 
 
+Build using cmake
+=================
+
+cmake scripts to build libde265 and the sample programs `dec265` and `enc265` are
+included. The project can be built using these commands:
+
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+See the [cmake documentation](http://www.cmake.org) for further information on
+using cmake on other platforms.
+
+
 Prebuilt binaries
 =================
 
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..b1badec
--- /dev/null
+++ b/TODO
@@ -0,0 +1,13 @@
+/mnt/temp/dirk/yuv/flower_garden_422_720x486_30fps_simple_intra.bin
+
+diff at byte 00196800
+frame 3
+channel: 0
+pixel position: x=384;y=125
+file A: 56
+file B: 86
+
+
+- API: request IDR-frame
+- API: return SPS header infos
+- rate-control: specify bit-rate
diff --git a/aclocal.m4 b/aclocal.m4
index 2b21338..3dfc4e8 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -216,6 +216,26 @@ m4_ifndef([AC_AUTOCONF_VERSION],
   [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
 
+# Figure out how to run the assembler.                      -*- Autoconf -*-
+
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_PROG_AS
+# ----------
+AC_DEFUN([AM_PROG_AS],
+[# By default we simply use the C compiler to build assembly code.
+AC_REQUIRE([AC_PROG_CC])
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+AC_ARG_VAR([CCAS],      [assembler compiler command (defaults to CC)])
+AC_ARG_VAR([CCASFLAGS], [assembler compiler flags (defaults to CFLAGS)])
+_AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
+])
+
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
 # Copyright (C) 2001-2013 Free Software Foundation, Inc.
@@ -1308,10 +1328,10 @@ AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
 
 m4_include([m4/ax_compare_version.m4])
+m4_include([m4/ax_cxx_compile_stdcxx_11.m4])
 m4_include([m4/libtool.m4])
 m4_include([m4/ltoptions.m4])
 m4_include([m4/ltsugar.m4])
 m4_include([m4/ltversion.m4])
 m4_include([m4/lt~obsolete.m4])
 m4_include([m4/m4_ax_check_compile_flag.m4])
-m4_include([m4/visibility.m4])
diff --git a/autogen.sh b/autogen.sh
index 57eeb94..5b79c04 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,4 +1,24 @@
 #!/bin/sh
+set -eu
+#
+# H.265 video codec.
+# Copyright (c) 2013 struktur AG, Joachim Bauch <bauch at struktur.de>
+#
+# This file is part of libde265.
+#
+# libde265 is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# libde265 is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+#
 if [ -x "`which autoreconf 2>/dev/null`" ] ; then
    exec autoreconf -ivf
 fi
diff --git a/build.bat b/build.bat
index 36e77a5..587333c 100755
--- a/build.bat
+++ b/build.bat
@@ -20,6 +20,7 @@ if not exist bin_x86\lib (
     mkdir bin_x86\lib
 )
 copy /y dec265\dec265.exe bin_x86\
+copy /y enc265\enc265.exe bin_x86\
 copy /y libde265\libde265.dll bin_x86\
 copy /y libde265\libde265.lib bin_x86\lib\
 copy /y libde265\libde265.exp bin_x86\lib\
@@ -33,6 +34,7 @@ if not exist bin_x64\lib (
     mkdir bin_x64\lib
 )
 copy /y dec265\dec265.exe bin_x64\
+copy /y enc265\enc265.exe bin_x64\
 copy /y libde265\libde265.dll bin_x64\
 copy /y libde265\libde265.lib bin_x64\lib\
 copy /y libde265\libde265.exp bin_x64\lib\
diff --git a/build/vc9-x86/make-solutions.bat b/build/vc9-x86/make-solutions.bat
deleted file mode 100644
index 2d5e29b..0000000
--- a/build/vc9-x86/make-solutions.bat
+++ /dev/null
@@ -1,6 +0,0 @@
- at echo off
-::
-:: run this batch file to create a Visual Studion solution file for this project.
-:: See the cmake documentation for other generator targets
-::
-cmake -G "Visual Studio 9 2008" ..\..\dec265 && cmake-gui ..\..\dec265
diff --git a/config.h.in b/config.h.in
index de4b739..b12f0be 100644
--- a/config.h.in
+++ b/config.h.in
@@ -1,8 +1,35 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */
 
+/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+   systems. This function is required for `alloca.c' support on those systems.
+   */
+#undef CRAY_STACKSEG_END
+
+/* Define to 1 if using `alloca.c'. */
+#undef C_ALLOCA
+
+/* Define to 1 if you have the `alarm' function. */
+#undef HAVE_ALARM
+
+/* Define to 1 if you have `alloca', as a function or macro. */
+#undef HAVE_ALLOCA
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+   */
+#undef HAVE_ALLOCA_H
+
+/* Support ARM instructions */
+#undef HAVE_ARM
+
+/* define if the compiler supports basic C++11 syntax */
+#undef HAVE_CXX11
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
+/* Define to 1 if you have the `gettimeofday' function. */
+#undef HAVE_GETTIMEOFDAY
+
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H
 
@@ -24,36 +51,66 @@
 /* Define to 1 if you have the `memset' function. */
 #undef HAVE_MEMSET
 
+/* Support ARM NEON instructions */
+#undef HAVE_NEON
+
 /* Define to 1 if you have the `posix_memalign' function. */
 #undef HAVE_POSIX_MEMALIGN
 
+/* Define to 1 if you have the `pow' function. */
+#undef HAVE_POW
+
+/* Define to 1 if the system has the type `ptrdiff_t'. */
+#undef HAVE_PTRDIFF_T
+
 /* Whether libsdl was found. */
 #undef HAVE_SDL
 
+/* Define to 1 if you have the <setjmp.h> header file. */
+#undef HAVE_SETJMP_H
+
+/* Define to 1 if you have the <signal.h> header file. */
+#undef HAVE_SIGNAL_H
+
+/* Define to 1 if you have the `sqrt' function. */
+#undef HAVE_SQRT
+
 /* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
 #undef HAVE_SSE4_1
 
 /* Define to 1 if stdbool.h conforms to C99. */
 #undef HAVE_STDBOOL_H
 
+/* Define to 1 if you have the <stddef.h> header file. */
+#undef HAVE_STDDEF_H
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 
 /* Define to 1 if you have the <stdlib.h> header file. */
 #undef HAVE_STDLIB_H
 
+/* Define to 1 if you have the `strchr' function. */
+#undef HAVE_STRCHR
+
 /* Define to 1 if you have the <strings.h> header file. */
 #undef HAVE_STRINGS_H
 
 /* Define to 1 if you have the <string.h> header file. */
 #undef HAVE_STRING_H
 
+/* Define to 1 if you have the `strrchr' function. */
+#undef HAVE_STRRCHR
+
 /* Whether libswscale was found. */
 #undef HAVE_SWSCALE
 
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #undef HAVE_SYS_STAT_H
 
+/* Define to 1 if you have the <sys/time.h> header file. */
+#undef HAVE_SYS_TIME_H
+
 /* Define to 1 if you have the <sys/types.h> header file. */
 #undef HAVE_SYS_TYPES_H
 
@@ -63,10 +120,6 @@
 /* Whether libvideogfx was found. */
 #undef HAVE_VIDEOGFX
 
-/* Define to 1 or 0, depending whether the compiler supports simple visibility
-   declarations. */
-#undef HAVE_VISIBILITY
-
 /* Define to 1 if the system has the type `_Bool'. */
 #undef HAVE__BOOL
 
@@ -104,9 +157,20 @@
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
+/* If using the C implementation of alloca, define if you know the
+   direction of stack growth for your system; otherwise it will be
+   automatically deduced at runtime.
+	STACK_DIRECTION > 0 => grows toward higher addresses
+	STACK_DIRECTION < 0 => grows toward lower addresses
+	STACK_DIRECTION = 0 => direction of growth unknown */
+#undef STACK_DIRECTION
+
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 
+/* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
+#undef TIME_WITH_SYS_TIME
+
 /* Version number of package */
 #undef VERSION
 
@@ -125,6 +189,12 @@
    #define below would cause a syntax error. */
 #undef _UINT8_T
 
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+#undef inline
+#endif
+
 /* Define to the type of a signed integer type of width exactly 16 bits if
    such a type exists and the standard includes do not define it. */
 #undef int16_t
@@ -133,10 +203,17 @@
    such a type exists and the standard includes do not define it. */
 #undef int32_t
 
+/* Define to the type of a signed integer type of width exactly 64 bits if
+   such a type exists and the standard includes do not define it. */
+#undef int64_t
+
 /* Define to the type of a signed integer type of width exactly 8 bits if such
    a type exists and the standard includes do not define it. */
 #undef int8_t
 
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
+
 /* Define to the type of an unsigned integer type of width exactly 16 bits if
    such a type exists and the standard includes do not define it. */
 #undef uint16_t
diff --git a/configure b/configure
index 6765dde..dcfaf0c 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for libde265 0.9.
+# Generated by GNU Autoconf 2.69 for libde265 1.0.2.
 #
 # Report bugs to <farin at struktur.de>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='libde265'
 PACKAGE_TARNAME='libde265'
-PACKAGE_VERSION='0.9'
-PACKAGE_STRING='libde265 0.9'
+PACKAGE_VERSION='1.0.2'
+PACKAGE_STRING='libde265 1.0.2'
 PACKAGE_BUGREPORT='farin at struktur.de'
 PACKAGE_URL=''
 
@@ -632,10 +632,11 @@ ac_includes_default="\
 # include <unistd.h>
 #endif"
 
+ac_header_list=
+ac_func_list=
 ac_subst_vars='am__EXEEXT_FALSE
 am__EXEEXT_TRUE
 LTLIBOBJS
-LIBOBJS
 ENABLE_SHERLOCK265_FALSE
 ENABLE_SHERLOCK265_TRUE
 ENABLE_DEC265_FALSE
@@ -659,14 +660,21 @@ VIDEOGFX_CFLAGS
 PKG_CONFIG_LIBDIR
 PKG_CONFIG_PATH
 PKG_CONFIG
+ENABLE_ARM_THUMB_FALSE
+ENABLE_ARM_THUMB_TRUE
+ENABLE_NEON_OPT_FALSE
+ENABLE_NEON_OPT_TRUE
+ENABLE_ARM_OPT_FALSE
+ENABLE_ARM_OPT_TRUE
 ENABLE_SSE_OPT_FALSE
 ENABLE_SSE_OPT_TRUE
 MINGW_FALSE
 MINGW_TRUE
+LIBOBJS
+ALLOCA
 HAVE_VISIBILITY_FALSE
 HAVE_VISIBILITY_TRUE
-HAVE_VISIBILITY
-CFLAG_VISIBILITY
+HAVE_CXX11
 AM_BACKSLASH
 AM_DEFAULT_VERBOSITY
 AM_DEFAULT_V
@@ -677,17 +685,9 @@ CXXDEPMODE
 am__fastdepCC_FALSE
 am__fastdepCC_TRUE
 CCDEPMODE
-am__nodep
-AMDEPBACKSLASH
-AMDEP_FALSE
-AMDEP_TRUE
-am__quote
-am__include
-DEPDIR
 am__untar
 am__tar
 AMTAR
-am__leading_dot
 SET_MAKE
 mkdir_p
 MKDIR_P
@@ -709,6 +709,19 @@ CXXCPP
 ac_ct_CXX
 CXXFLAGS
 CXX
+am__fastdepCCAS_FALSE
+am__fastdepCCAS_TRUE
+CCASDEPMODE
+am__nodep
+AMDEPBACKSLASH
+AMDEP_FALSE
+AMDEP_TRUE
+am__quote
+am__include
+DEPDIR
+am__leading_dot
+CCASFLAGS
+CCAS
 CPP
 OTOOL64
 OTOOL
@@ -807,6 +820,8 @@ enable_libtool_lock
 enable_dependency_tracking
 enable_silent_rules
 enable_sse
+enable_arm
+enable_thumb
 enable_log_error
 enable_log_info
 enable_log_debug
@@ -823,6 +838,8 @@ LDFLAGS
 LIBS
 CPPFLAGS
 CPP
+CCAS
+CCASFLAGS
 CXX
 CXXFLAGS
 CCC
@@ -1378,7 +1395,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures libde265 0.9 to adapt to many kinds of systems.
+\`configure' configures libde265 1.0.2 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1449,7 +1466,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of libde265 0.9:";;
+     short | recursive ) echo "Configuration of libde265 1.0.2:";;
    esac
   cat <<\_ACEOF
 
@@ -1469,6 +1486,8 @@ Optional Features:
   --enable-silent-rules   less verbose build output (undo: "make V=1")
   --disable-silent-rules  verbose build output (undo: "make V=0")
   --disable-sse           disable SSE optimizations (default=no)
+  --disable-arm           disable ARM optimizations (default=no)
+  --enable-thumb          enable ARM THUMB instructions (default=no)
   --enable-log-error      turn on logging at error level (default=yes)
   --enable-log-info       turn on logging at info level (default=no)
   --enable-log-debug      turn on logging at debug level (default=no)
@@ -1494,6 +1513,8 @@ Some influential environment variables:
   CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
   CPP         C preprocessor
+  CCAS        assembler compiler command (defaults to CC)
+  CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
   CXX         C++ compiler command
   CXXFLAGS    C++ compiler flags
   CXXCPP      C++ preprocessor
@@ -1581,7 +1602,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-libde265 configure 0.9
+libde265 configure 1.0.2
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2255,7 +2276,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by libde265 $as_me 0.9, which was
+It was created by libde265 $as_me 1.0.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -2535,6 +2556,9 @@ $as_echo "$as_me: creating cache $cache_file" >&6;}
   >$cache_file
 fi
 
+as_fn_append ac_header_list " sys/time.h"
+as_fn_append ac_header_list " unistd.h"
+as_fn_append ac_func_list " alarm"
 # Check that the precious variables saved in the cache have kept the same
 # value.
 ac_cache_corrupted=false
@@ -2607,11 +2631,11 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 ac_config_headers="$ac_config_headers config.h"
 
 
-NUMERIC_VERSION=0x00090000 # Numeric representation of the version
+NUMERIC_VERSION=0x01000200 # Numeric representation of the version (A.B.C[.D] = 0xAABBCCDD)
 
 
 LIBDE265_CURRENT=0
-LIBDE265_REVISION=8
+LIBDE265_REVISION=10
 LIBDE265_AGE=0
 
 # ---------------------------------------------------------------------------
@@ -11183,6 +11207,211 @@ CC="$lt_save_CC"
 
 
 # Checks for programs.
+rm -rf .tst 2>/dev/null
+mkdir .tst 2>/dev/null
+if test -d .tst; then
+  am__leading_dot=.
+else
+  am__leading_dot=_
+fi
+rmdir .tst 2>/dev/null
+
+DEPDIR="${am__leading_dot}deps"
+
+ac_config_commands="$ac_config_commands depfiles"
+
+
+am_make=${MAKE-make}
+cat > confinc << 'END'
+am__doit:
+	@echo this is the am__doit target
+.PHONY: am__doit
+END
+# If we don't find an include directive, just comment out the code.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5
+$as_echo_n "checking for style of include used by $am_make... " >&6; }
+am__include="#"
+am__quote=
+_am_result=none
+# First try GNU make style include.
+echo "include confinc" > confmf
+# Ignore all kinds of additional output from 'make'.
+case `$am_make -s -f confmf 2> /dev/null` in #(
+*the\ am__doit\ target*)
+  am__include=include
+  am__quote=
+  _am_result=GNU
+  ;;
+esac
+# Now try BSD make style include.
+if test "$am__include" = "#"; then
+   echo '.include "confinc"' > confmf
+   case `$am_make -s -f confmf 2> /dev/null` in #(
+   *the\ am__doit\ target*)
+     am__include=.include
+     am__quote="\""
+     _am_result=BSD
+     ;;
+   esac
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5
+$as_echo "$_am_result" >&6; }
+rm -f confinc confmf
+
+# Check whether --enable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then :
+  enableval=$enable_dependency_tracking;
+fi
+
+if test "x$enable_dependency_tracking" != xno; then
+  am_depcomp="$ac_aux_dir/depcomp"
+  AMDEPBACKSLASH='\'
+  am__nodep='_no'
+fi
+ if test "x$enable_dependency_tracking" != xno; then
+  AMDEP_TRUE=
+  AMDEP_FALSE='#'
+else
+  AMDEP_TRUE='#'
+  AMDEP_FALSE=
+fi
+
+
+# By default we simply use the C compiler to build assembly code.
+
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+
+
+
+depcc="$CCAS"   am_compiler_list=
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
+$as_echo_n "checking dependency style of $depcc... " >&6; }
+if ${am_cv_CCAS_dependencies_compiler_type+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then
+  # We make a subdir and do the tests there.  Otherwise we can end up
+  # making bogus files that we don't know about and never remove.  For
+  # instance it was reported that on HP-UX the gcc test will end up
+  # making a dummy file named 'D' -- because '-MD' means "put the output
+  # in D".
+  rm -rf conftest.dir
+  mkdir conftest.dir
+  # Copy depcomp to subdir because otherwise we won't find it if we're
+  # using a relative directory.
+  cp "$am_depcomp" conftest.dir
+  cd conftest.dir
+  # We will build objects and dependencies in a subdirectory because
+  # it helps to detect inapplicable dependency modes.  For instance
+  # both Tru64's cc and ICC support -MD to output dependencies as a
+  # side effect of compilation, but ICC will put the dependencies in
+  # the current directory while Tru64 will put them in the object
+  # directory.
+  mkdir sub
+
+  am_cv_CCAS_dependencies_compiler_type=none
+  if test "$am_compiler_list" = ""; then
+     am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp`
+  fi
+  am__universal=false
+
+
+  for depmode in $am_compiler_list; do
+    # Setup a source with many dependencies, because some compilers
+    # like to wrap large dependency lists on column 80 (with \), and
+    # we should not choose a depcomp mode which is confused by this.
+    #
+    # We need to recreate these files for each test, as the compiler may
+    # overwrite some of them when testing with obscure command lines.
+    # This happens at least with the AIX C compiler.
+    : > sub/conftest.c
+    for i in 1 2 3 4 5 6; do
+      echo '#include "conftst'$i'.h"' >> sub/conftest.c
+      # Using ": > sub/conftst$i.h" creates only sub/conftst1.h with
+      # Solaris 10 /bin/sh.
+      echo '/* dummy */' > sub/conftst$i.h
+    done
+    echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf
+
+    # We check with '-c' and '-o' for the sake of the "dashmstdout"
+    # mode.  It turns out that the SunPro C++ compiler does not properly
+    # handle '-M -o', and we need to detect this.  Also, some Intel
+    # versions had trouble with output in subdirs.
+    am__obj=sub/conftest.${OBJEXT-o}
+    am__minus_obj="-o $am__obj"
+    case $depmode in
+    gcc)
+      # This depmode causes a compiler race in universal mode.
+      test "$am__universal" = false || continue
+      ;;
+    nosideeffect)
+      # After this tag, mechanisms are not by side-effect, so they'll
+      # only be used when explicitly requested.
+      if test "x$enable_dependency_tracking" = xyes; then
+	continue
+      else
+	break
+      fi
+      ;;
+    msvc7 | msvc7msys | msvisualcpp | msvcmsys)
+      # This compiler won't grok '-c -o', but also, the minuso test has
+      # not run yet.  These depmodes are late enough in the game, and
+      # so weak that their functioning should not be impacted.
+      am__obj=conftest.${OBJEXT-o}
+      am__minus_obj=
+      ;;
+    none) break ;;
+    esac
+    if depmode=$depmode \
+       source=sub/conftest.c object=$am__obj \
+       depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \
+       $SHELL ./depcomp $depcc -c $am__minus_obj sub/conftest.c \
+         >/dev/null 2>conftest.err &&
+       grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 &&
+       grep $am__obj sub/conftest.Po > /dev/null 2>&1 &&
+       ${MAKE-make} -s -f confmf > /dev/null 2>&1; then
+      # icc doesn't choke on unknown options, it will just issue warnings
+      # or remarks (even with -Werror).  So we grep stderr for any message
+      # that says an option was ignored or not supported.
+      # When given -MP, icc 7.0 and 7.1 complain thusly:
+      #   icc: Command line warning: ignoring option '-M'; no argument required
+      # The diagnosis changed in icc 8.0:
+      #   icc: Command line remark: option '-MP' not supported
+      if (grep 'ignoring option' conftest.err ||
+          grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else
+        am_cv_CCAS_dependencies_compiler_type=$depmode
+        break
+      fi
+    fi
+  done
+
+  cd ..
+  rm -rf conftest.dir
+else
+  am_cv_CCAS_dependencies_compiler_type=none
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_CCAS_dependencies_compiler_type" >&5
+$as_echo "$am_cv_CCAS_dependencies_compiler_type" >&6; }
+CCASDEPMODE=depmode=$am_cv_CCAS_dependencies_compiler_type
+
+ if
+  test "x$enable_dependency_tracking" != xno \
+  && test "$am_cv_CCAS_dependencies_compiler_type" = gcc3; then
+  am__fastdepCCAS_TRUE=
+  am__fastdepCCAS_FALSE='#'
+else
+  am__fastdepCCAS_TRUE='#'
+  am__fastdepCCAS_FALSE=
+fi
+
+
 ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -15685,98 +15914,26 @@ $as_echo "no" >&6; }
   SET_MAKE="MAKE=${MAKE-make}"
 fi
 
-rm -rf .tst 2>/dev/null
-mkdir .tst 2>/dev/null
-if test -d .tst; then
-  am__leading_dot=.
-else
-  am__leading_dot=_
+# Check whether --enable-silent-rules was given.
+if test "${enable_silent_rules+set}" = set; then :
+  enableval=$enable_silent_rules;
 fi
-rmdir .tst 2>/dev/null
-
-DEPDIR="${am__leading_dot}deps"
-
-ac_config_commands="$ac_config_commands depfiles"
-
 
+case $enable_silent_rules in # (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=1;;
+esac
 am_make=${MAKE-make}
-cat > confinc << 'END'
-am__doit:
-	@echo this is the am__doit target
-.PHONY: am__doit
-END
-# If we don't find an include directive, just comment out the code.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for style of include used by $am_make" >&5
-$as_echo_n "checking for style of include used by $am_make... " >&6; }
-am__include="#"
-am__quote=
-_am_result=none
-# First try GNU make style include.
-echo "include confinc" > confmf
-# Ignore all kinds of additional output from 'make'.
-case `$am_make -s -f confmf 2> /dev/null` in #(
-*the\ am__doit\ target*)
-  am__include=include
-  am__quote=
-  _am_result=GNU
-  ;;
-esac
-# Now try BSD make style include.
-if test "$am__include" = "#"; then
-   echo '.include "confinc"' > confmf
-   case `$am_make -s -f confmf 2> /dev/null` in #(
-   *the\ am__doit\ target*)
-     am__include=.include
-     am__quote="\""
-     _am_result=BSD
-     ;;
-   esac
-fi
-
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_am_result" >&5
-$as_echo "$_am_result" >&6; }
-rm -f confinc confmf
-
-# Check whether --enable-dependency-tracking was given.
-if test "${enable_dependency_tracking+set}" = set; then :
-  enableval=$enable_dependency_tracking;
-fi
-
-if test "x$enable_dependency_tracking" != xno; then
-  am_depcomp="$ac_aux_dir/depcomp"
-  AMDEPBACKSLASH='\'
-  am__nodep='_no'
-fi
- if test "x$enable_dependency_tracking" != xno; then
-  AMDEP_TRUE=
-  AMDEP_FALSE='#'
-else
-  AMDEP_TRUE='#'
-  AMDEP_FALSE=
-fi
-
-
-# Check whether --enable-silent-rules was given.
-if test "${enable_silent_rules+set}" = set; then :
-  enableval=$enable_silent_rules;
-fi
-
-case $enable_silent_rules in # (((
-  yes) AM_DEFAULT_VERBOSITY=0;;
-   no) AM_DEFAULT_VERBOSITY=1;;
-    *) AM_DEFAULT_VERBOSITY=1;;
-esac
-am_make=${MAKE-make}
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
-$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
-if ${am_cv_make_support_nested_variables+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if $as_echo 'TRUE=$(BAR$(V))
-BAR0=false
-BAR1=true
-V=1
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
+$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
+if ${am_cv_make_support_nested_variables+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if $as_echo 'TRUE=$(BAR$(V))
+BAR0=false
+BAR1=true
+V=1
 am__doit:
 	@$(TRUE)
 .PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
@@ -15818,7 +15975,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='libde265'
- VERSION='0.9'
+ VERSION='1.0.2'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16168,65 +16325,210 @@ END
 fi
 
 CFLAGS+=" -std=c99"
+CXXFLAGS="$CXXFLAGS -Werror=return-type -Werror=unused-result -Werror=reorder"  # portable append; 'VAR+=' is not POSIX sh
+    ax_cxx_compile_cxx11_required=true
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+  ac_success=no
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features by default" >&5
+$as_echo_n "checking whether $CXX supports C++11 features by default... " >&6; }
+if ${ax_cv_cxx_compile_cxx11+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-if test "x$GCC" = "xyes"; then
-  case " $CFLAGS " in
-  *[\ \	]-Wall[\ \	]*) ;;
-  *) CFLAGS="$CFLAGS -Wall" ;;
-  esac
-fi
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() {} // DiFa: override {}   # override not supported in gcc 4.6
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
 
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
 
+    auto d = a;
+    auto l = [](){};
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ax_cv_cxx_compile_cxx11=yes
+else
+  ax_cv_cxx_compile_cxx11=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_cxx_compile_cxx11" >&5
+$as_echo "$ax_cv_cxx_compile_cxx11" >&6; }
+  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
+    ac_success=yes
+  fi
 
-  CFLAG_VISIBILITY=
-  HAVE_VISIBILITY=0
-  if test -n "$GCC"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for simple visibility declarations" >&5
-$as_echo_n "checking for simple visibility declarations... " >&6; }
-    if ${gl_cv_cc_visibility+:} false; then :
+    if test x$ac_success = xno; then
+    for switch in -std=gnu++11 -std=gnu++0x; do
+      cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
+$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
+if eval \${$cachevar+:} false; then :
   $as_echo_n "(cached) " >&6
 else
+  ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
 
-      gl_save_CFLAGS="$CFLAGS"
-      CFLAGS="$CFLAGS -fvisibility=hidden"
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() {} // DiFa: override {}   # override not supported in gcc 4.6
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
+
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
+
+    auto d = a;
+    auto l = [](){};
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  eval $cachevar=yes
+else
+  eval $cachevar=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+         CXXFLAGS="$ac_save_CXXFLAGS"
+fi
+eval ac_res=\$$cachevar
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi
+
+    if test x$ac_success = xno; then
+    for switch in -std=c++11 -std=c++0x; do
+      cachevar=`$as_echo "ax_cv_cxx_compile_cxx11_$switch" | $as_tr_sh`
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX supports C++11 features with $switch" >&5
+$as_echo_n "checking whether $CXX supports C++11 features with $switch... " >&6; }
+if eval \${$cachevar+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-extern __attribute__((__visibility__("hidden"))) int hiddenvar;
-         extern __attribute__((__visibility__("default"))) int exportedvar;
-         extern __attribute__((__visibility__("hidden"))) int hiddenfunc (void);
-         extern __attribute__((__visibility__("default"))) int exportedfunc (void);
-int
-main ()
-{
 
-  ;
-  return 0;
-}
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() {} // DiFa: override {}   # override not supported in gcc 4.6
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
+
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
+
+    auto d = a;
+    auto l = [](){};
+
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  gl_cv_cc_visibility=yes
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  eval $cachevar=yes
 else
-  gl_cv_cc_visibility=no
+  eval $cachevar=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-      CFLAGS="$gl_save_CFLAGS"
+         CXXFLAGS="$ac_save_CXXFLAGS"
 fi
+eval ac_res=\$$cachevar
+	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gl_cv_cc_visibility" >&5
-$as_echo "$gl_cv_cc_visibility" >&6; }
-    if test $gl_cv_cc_visibility = yes; then
-      CFLAG_VISIBILITY="-fvisibility=hidden"
-      HAVE_VISIBILITY=1
+  if test x$ax_cxx_compile_cxx11_required = xtrue; then
+    if test x$ac_success = xno; then
+      as_fn_error $? "*** A compiler with support for C++11 language features is required." "$LINENO" 5
     fi
-  fi
+  else
+    if test x$ac_success = xno; then
+      HAVE_CXX11=0
+      { $as_echo "$as_me:${as_lineno-$LINENO}: No compiler with C++11 support was found" >&5
+$as_echo "$as_me: No compiler with C++11 support was found" >&6;}
+    else
+      HAVE_CXX11=1
 
+$as_echo "#define HAVE_CXX11 1" >>confdefs.h
 
+    fi
 
-cat >>confdefs.h <<_ACEOF
-#define HAVE_VISIBILITY $HAVE_VISIBILITY
-_ACEOF
 
+  fi
+
+
+if test "x$GCC" = "xyes"; then
+  case " $CFLAGS " in
+  *[\ \	]-Wall[\ \	]*) ;;
+  *) CFLAGS="$CFLAGS -Wall" ;;
+  esac
+fi
 
+HAVE_VISIBILITY=0
  if test "x$HAVE_VISIBILITY" != "x0"; then
   HAVE_VISIBILITY_TRUE=
   HAVE_VISIBILITY_FALSE='#'
@@ -16237,7 +16539,7 @@ fi
 
 
 # Checks for header files.
-for ac_header in stdint.h stdlib.h string.h malloc.h
+for ac_header in stdint.h stdlib.h string.h malloc.h signal.h setjmp.h stddef.h sys/time.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -16345,6 +16647,28 @@ $as_echo "#define HAVE_STDBOOL_H 1" >>confdefs.h
 
 fi
 
+ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default"
+if test "x$ac_cv_type_size_t" = xyes; then :
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define size_t unsigned int
+_ACEOF
+
+fi
+
+ac_fn_c_find_intX_t "$LINENO" "8" "ac_cv_c_int8_t"
+case $ac_cv_c_int8_t in #(
+  no|yes) ;; #(
+  *)
+
+cat >>confdefs.h <<_ACEOF
+#define int8_t $ac_cv_c_int8_t
+_ACEOF
+;;
+esac
+
 ac_fn_c_find_intX_t "$LINENO" "16" "ac_cv_c_int16_t"
 case $ac_cv_c_int16_t in #(
   no|yes) ;; #(
@@ -16367,17 +16691,31 @@ _ACEOF
 ;;
 esac
 
-ac_fn_c_find_intX_t "$LINENO" "8" "ac_cv_c_int8_t"
-case $ac_cv_c_int8_t in #(
+ac_fn_c_find_intX_t "$LINENO" "64" "ac_cv_c_int64_t"
+case $ac_cv_c_int64_t in #(
   no|yes) ;; #(
   *)
 
 cat >>confdefs.h <<_ACEOF
-#define int8_t $ac_cv_c_int8_t
+#define int64_t $ac_cv_c_int64_t
 _ACEOF
 ;;
 esac
 
+ac_fn_c_find_uintX_t "$LINENO" "8" "ac_cv_c_uint8_t"
+case $ac_cv_c_uint8_t in #(
+  no|yes) ;; #(
+  *)
+
+$as_echo "#define _UINT8_T 1" >>confdefs.h
+
+
+cat >>confdefs.h <<_ACEOF
+#define uint8_t $ac_cv_c_uint8_t
+_ACEOF
+;;
+  esac
+
 ac_fn_c_find_uintX_t "$LINENO" "16" "ac_cv_c_uint16_t"
 case $ac_cv_c_uint16_t in #(
   no|yes) ;; #(
@@ -16418,23 +16756,563 @@ _ACEOF
 ;;
   esac
 
-ac_fn_c_find_uintX_t "$LINENO" "8" "ac_cv_c_uint8_t"
-case $ac_cv_c_uint8_t in #(
-  no|yes) ;; #(
-  *)
+ac_fn_c_check_type "$LINENO" "ptrdiff_t" "ac_cv_type_ptrdiff_t" "$ac_includes_default"
+if test "x$ac_cv_type_ptrdiff_t" = xyes; then :
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_PTRDIFF_T 1
+_ACEOF
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inline" >&5
+$as_echo_n "checking for inline... " >&6; }
+if ${ac_cv_c_inline+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_c_inline=no
+for ac_kw in inline __inline__ __inline; do
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifndef __cplusplus
+typedef int foo_t;
+static $ac_kw foo_t static_foo () {return 0; }
+$ac_kw foo_t foo () {return 0; }
+#endif
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_c_inline=$ac_kw
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  test "$ac_cv_c_inline" != no && break
+done
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_inline" >&5
+$as_echo "$ac_cv_c_inline" >&6; }
+
+case $ac_cv_c_inline in
+  inline | yes) ;;
+  *)
+    case $ac_cv_c_inline in
+      no) ac_val=;;
+      *) ac_val=$ac_cv_c_inline;;
+    esac
+    cat >>confdefs.h <<_ACEOF
+#ifndef __cplusplus
+#define inline $ac_val
+#endif
+_ACEOF
+    ;;
+esac
+
+
+# Checks for library functions.
+for ac_func in malloc memmove memset __malloc_hook memalign posix_memalign __mingw_aligned_malloc __mingw_aligned_free
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing pow" >&5
+$as_echo_n "checking for library containing pow... " >&6; }
+if ${ac_cv_search_pow+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_func_search_save_LIBS=$LIBS
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pow ();
+int
+main ()
+{
+return pow ();
+  ;
+  return 0;
+}
+_ACEOF
+for ac_lib in '' m; do
+  if test -z "$ac_lib"; then
+    ac_res="none required"
+  else
+    ac_res=-l$ac_lib
+    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
+  fi
+  if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_search_pow=$ac_res
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext
+  if ${ac_cv_search_pow+:} false; then :
+  break
+fi
+done
+if ${ac_cv_search_pow+:} false; then :
+
+else
+  ac_cv_search_pow=no
+fi
+rm conftest.$ac_ext
+LIBS=$ac_func_search_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_pow" >&5
+$as_echo "$ac_cv_search_pow" >&6; }
+ac_res=$ac_cv_search_pow
+if test "$ac_res" != no; then :
+  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing sqrt" >&5
+$as_echo_n "checking for library containing sqrt... " >&6; }
+if ${ac_cv_search_sqrt+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_func_search_save_LIBS=$LIBS
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char sqrt ();
+int
+main ()
+{
+return sqrt ();
+  ;
+  return 0;
+}
+_ACEOF
+for ac_lib in '' m; do
+  if test -z "$ac_lib"; then
+    ac_res="none required"
+  else
+    ac_res=-l$ac_lib
+    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
+  fi
+  if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_search_sqrt=$ac_res
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext
+  if ${ac_cv_search_sqrt+:} false; then :
+  break
+fi
+done
+if ${ac_cv_search_sqrt+:} false; then :
+
+else
+  ac_cv_search_sqrt=no
+fi
+rm conftest.$ac_ext
+LIBS=$ac_func_search_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_sqrt" >&5
+$as_echo "$ac_cv_search_sqrt" >&6; }
+ac_res=$ac_cv_search_sqrt
+if test "$ac_res" != no; then :
+  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing pthread_create" >&5
+$as_echo_n "checking for library containing pthread_create... " >&6; }
+if ${ac_cv_search_pthread_create+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_func_search_save_LIBS=$LIBS
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char pthread_create ();
+int
+main ()
+{
+return pthread_create ();
+  ;
+  return 0;
+}
+_ACEOF
+for ac_lib in '' pthread; do
+  if test -z "$ac_lib"; then
+    ac_res="none required"
+  else
+    ac_res=-l$ac_lib
+    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
+  fi
+  if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_search_pthread_create=$ac_res
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext
+  if ${ac_cv_search_pthread_create+:} false; then :
+  break
+fi
+done
+if ${ac_cv_search_pthread_create+:} false; then :
+
+else
+  ac_cv_search_pthread_create=no
+fi
+rm conftest.$ac_ext
+LIBS=$ac_func_search_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_pthread_create" >&5
+$as_echo "$ac_cv_search_pthread_create" >&6; }
+ac_res=$ac_cv_search_pthread_create
+if test "$ac_res" != no; then :
+  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+
+fi
+
+
+for ac_func in gettimeofday
+do :
+  ac_fn_c_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday"
+if test "x$ac_cv_func_gettimeofday" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_GETTIMEOFDAY 1
+_ACEOF
+
+fi
+done
+
+for ac_func in pow sqrt
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+for ac_func in strchr strrchr
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+
+
+# The Ultrix 4.2 mips builtin alloca declared by alloca.h only works
+# for constant arguments.  Useless!
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working alloca.h" >&5
+$as_echo_n "checking for working alloca.h... " >&6; }
+if ${ac_cv_working_alloca_h+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <alloca.h>
+int
+main ()
+{
+char *p = (char *) alloca (2 * sizeof (int));
+			  if (p) return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_working_alloca_h=yes
+else
+  ac_cv_working_alloca_h=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_working_alloca_h" >&5
+$as_echo "$ac_cv_working_alloca_h" >&6; }
+if test $ac_cv_working_alloca_h = yes; then
+
+$as_echo "#define HAVE_ALLOCA_H 1" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for alloca" >&5
+$as_echo_n "checking for alloca... " >&6; }
+if ${ac_cv_func_alloca_works+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifdef __GNUC__
+# define alloca __builtin_alloca
+#else
+# ifdef _MSC_VER
+#  include <malloc.h>
+#  define alloca _alloca
+# else
+#  ifdef HAVE_ALLOCA_H
+#   include <alloca.h>
+#  else
+#   ifdef _AIX
+ #pragma alloca
+#   else
+#    ifndef alloca /* predefined by HP cc +Olibcalls */
+void *alloca (size_t);
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+int
+main ()
+{
+char *p = (char *) alloca (1);
+				    if (p) return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_func_alloca_works=yes
+else
+  ac_cv_func_alloca_works=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_alloca_works" >&5
+$as_echo "$ac_cv_func_alloca_works" >&6; }
+
+if test $ac_cv_func_alloca_works = yes; then
+
+$as_echo "#define HAVE_ALLOCA 1" >>confdefs.h
+
+else
+  # The SVR3 libPW and SVR4 libucb both contain incompatible functions
+# that cause trouble.  Some versions do not even contain alloca or
+# contain a buggy version.  If you still want to use their alloca,
+# use ar to extract alloca.o from them instead of compiling alloca.c.
+
+ALLOCA=\${LIBOBJDIR}alloca.$ac_objext
+
+$as_echo "#define C_ALLOCA 1" >>confdefs.h
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether \`alloca.c' needs Cray hooks" >&5
+$as_echo_n "checking whether \`alloca.c' needs Cray hooks... " >&6; }
+if ${ac_cv_os_cray+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#if defined CRAY && ! defined CRAY2
+webecray
+#else
+wenotbecray
+#endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "webecray" >/dev/null 2>&1; then :
+  ac_cv_os_cray=yes
+else
+  ac_cv_os_cray=no
+fi
+rm -f conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_os_cray" >&5
+$as_echo "$ac_cv_os_cray" >&6; }
+if test $ac_cv_os_cray = yes; then
+  for ac_func in _getb67 GETB67 getb67; do
+    as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define CRAY_STACKSEG_END $ac_func
+_ACEOF
+
+    break
+fi
+
+  done
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking stack direction for C alloca" >&5
+$as_echo_n "checking stack direction for C alloca... " >&6; }
+if ${ac_cv_c_stack_direction+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test "$cross_compiling" = yes; then :
+  ac_cv_c_stack_direction=0
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+find_stack_direction (int *addr, int depth)
+{
+  int dir, dummy = 0;
+  if (! addr)
+    addr = &dummy;
+  *addr = addr < &dummy ? 1 : addr == &dummy ? 0 : -1;
+  dir = depth ? find_stack_direction (addr, depth - 1) : 0;
+  return dir + dummy;
+}
+
+int
+main (int argc, char **argv)
+{
+  return find_stack_direction (0, argc + !argv + 20) < 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  ac_cv_c_stack_direction=1
+else
+  ac_cv_c_stack_direction=-1
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_stack_direction" >&5
+$as_echo "$ac_cv_c_stack_direction" >&6; }
+cat >>confdefs.h <<_ACEOF
+#define STACK_DIRECTION $ac_cv_c_stack_direction
+_ACEOF
+
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for error_at_line" >&5
+$as_echo_n "checking for error_at_line... " >&6; }
+if ${ac_cv_lib_error_at_line+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <error.h>
+int
+main ()
+{
+error_at_line (0, 0, "", 0, "an error occurred");
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  ac_cv_lib_error_at_line=yes
+else
+  ac_cv_lib_error_at_line=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_error_at_line" >&5
+$as_echo "$ac_cv_lib_error_at_line" >&6; }
+if test $ac_cv_lib_error_at_line = no; then
+  case " $LIBOBJS " in
+  *" error.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS error.$ac_objext"
+ ;;
+esac
+
+fi
+
+# Checking for malloc breaks building on ARM for us. A similar issue is described
+# here: http://nerdland.net/unstumping-the-internet/malloc-has-not-been-declared/
+# AC_FUNC_MALLOC
+# AC_FUNC_REALLOC
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether time.h and sys/time.h may both be included" >&5
+$as_echo_n "checking whether time.h and sys/time.h may both be included... " >&6; }
+if ${ac_cv_header_time+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+
+int
+main ()
+{
+if ((struct tm *) 0)
+return 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_cv_header_time=yes
+else
+  ac_cv_header_time=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_time" >&5
+$as_echo "$ac_cv_header_time" >&6; }
+if test $ac_cv_header_time = yes; then
+
+$as_echo "#define TIME_WITH_SYS_TIME 1" >>confdefs.h
+
+fi
+
+
+
+
+  for ac_header in $ac_header_list
+do :
+  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
+"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
 
-$as_echo "#define _UINT8_T 1" >>confdefs.h
 
 
-cat >>confdefs.h <<_ACEOF
-#define uint8_t $ac_cv_c_uint8_t
-_ACEOF
-;;
-  esac
 
 
-# Checks for library functions.
-for ac_func in malloc memmove memset __malloc_hook memalign posix_memalign __mingw_aligned_malloc __mingw_aligned_free
+
+
+  for ac_func in $ac_func_list
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
@@ -16447,115 +17325,229 @@ fi
 done
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing sqrt" >&5
-$as_echo_n "checking for library containing sqrt... " >&6; }
-if ${ac_cv_search_sqrt+:} false; then :
+
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for working mktime" >&5
+$as_echo_n "checking for working mktime... " >&6; }
+if ${ac_cv_func_working_mktime+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  ac_func_search_save_LIBS=$LIBS
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  if test "$cross_compiling" = yes; then :
+  ac_cv_func_working_mktime=no
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
+/* Test program from Paul Eggert and Tony Leneis.  */
+#ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# ifdef HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
 
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
+#include <limits.h>
+#include <stdlib.h>
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
 #endif
-char sqrt ();
-int
-main ()
+
+#ifndef HAVE_ALARM
+# define alarm(X) /* empty */
+#endif
+
+/* Work around redefinition to rpl_putenv by other config tests.  */
+#undef putenv
+
+static time_t time_t_max;
+static time_t time_t_min;
+
+/* Values we'll use to set the TZ environment variable.  */
+static const char *tz_strings[] = {
+  (const char *) 0, "TZ=GMT0", "TZ=JST-9",
+  "TZ=EST+3EDT+2,M10.1.0/00:00:00,M2.3.0/00:00:00"
+};
+#define N_STRINGS (sizeof (tz_strings) / sizeof (tz_strings[0]))
+
+/* Return 0 if mktime fails to convert a date in the spring-forward gap.
+   Based on a problem report from Andreas Jaeger.  */
+static int
+spring_forward_gap ()
 {
-return sqrt ();
-  ;
-  return 0;
+  /* glibc (up to about 1998-10-07) failed this test. */
+  struct tm tm;
+
+  /* Use the portable POSIX.1 specification "TZ=PST8PDT,M4.1.0,M10.5.0"
+     instead of "TZ=America/Vancouver" in order to detect the bug even
+     on systems that don't support the Olson extension, or don't have the
+     full zoneinfo tables installed.  */
+  putenv ((char*) "TZ=PST8PDT,M4.1.0,M10.5.0");
+
+  tm.tm_year = 98;
+  tm.tm_mon = 3;
+  tm.tm_mday = 5;
+  tm.tm_hour = 2;
+  tm.tm_min = 0;
+  tm.tm_sec = 0;
+  tm.tm_isdst = -1;
+  return mktime (&tm) != (time_t) -1;
 }
-_ACEOF
-for ac_lib in '' m; do
-  if test -z "$ac_lib"; then
-    ac_res="none required"
-  else
-    ac_res=-l$ac_lib
-    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
-  fi
-  if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_search_sqrt=$ac_res
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext
-  if ${ac_cv_search_sqrt+:} false; then :
-  break
-fi
-done
-if ${ac_cv_search_sqrt+:} false; then :
 
-else
-  ac_cv_search_sqrt=no
-fi
-rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_sqrt" >&5
-$as_echo "$ac_cv_search_sqrt" >&6; }
-ac_res=$ac_cv_search_sqrt
-if test "$ac_res" != no; then :
-  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+static int
+mktime_test1 (time_t now)
+{
+  struct tm *lt;
+  return ! (lt = localtime (&now)) || mktime (lt) == now;
+}
 
-fi
+static int
+mktime_test (time_t now)
+{
+  return (mktime_test1 (now)
+	  && mktime_test1 ((time_t) (time_t_max - now))
+	  && mktime_test1 ((time_t) (time_t_min + now)));
+}
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing pthread_create" >&5
-$as_echo_n "checking for library containing pthread_create... " >&6; }
-if ${ac_cv_search_pthread_create+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  ac_func_search_save_LIBS=$LIBS
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
+static int
+irix_6_4_bug ()
+{
+  /* Based on code from Ariel Faigon.  */
+  struct tm tm;
+  tm.tm_year = 96;
+  tm.tm_mon = 3;
+  tm.tm_mday = 0;
+  tm.tm_hour = 0;
+  tm.tm_min = 0;
+  tm.tm_sec = 0;
+  tm.tm_isdst = -1;
+  mktime (&tm);
+  return tm.tm_mon == 2 && tm.tm_mday == 31;
+}
+
+static int
+bigtime_test (int j)
+{
+  struct tm tm;
+  time_t now;
+  tm.tm_year = tm.tm_mon = tm.tm_mday = tm.tm_hour = tm.tm_min = tm.tm_sec = j;
+  now = mktime (&tm);
+  if (now != (time_t) -1)
+    {
+      struct tm *lt = localtime (&now);
+      if (! (lt
+	     && lt->tm_year == tm.tm_year
+	     && lt->tm_mon == tm.tm_mon
+	     && lt->tm_mday == tm.tm_mday
+	     && lt->tm_hour == tm.tm_hour
+	     && lt->tm_min == tm.tm_min
+	     && lt->tm_sec == tm.tm_sec
+	     && lt->tm_yday == tm.tm_yday
+	     && lt->tm_wday == tm.tm_wday
+	     && ((lt->tm_isdst < 0 ? -1 : 0 < lt->tm_isdst)
+		  == (tm.tm_isdst < 0 ? -1 : 0 < tm.tm_isdst))))
+	return 0;
+    }
+  return 1;
+}
+
+static int
+year_2050_test ()
+{
+  /* The correct answer for 2050-02-01 00:00:00 in Pacific time,
+     ignoring leap seconds.  */
+  unsigned long int answer = 2527315200UL;
+
+  struct tm tm;
+  time_t t;
+  tm.tm_year = 2050 - 1900;
+  tm.tm_mon = 2 - 1;
+  tm.tm_mday = 1;
+  tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
+  tm.tm_isdst = -1;
+
+  /* Use the portable POSIX.1 specification "TZ=PST8PDT,M4.1.0,M10.5.0"
+     instead of "TZ=America/Vancouver" in order to detect the bug even
+     on systems that don't support the Olson extension, or don't have the
+     full zoneinfo tables installed.  */
+  putenv ((char*) "TZ=PST8PDT,M4.1.0,M10.5.0");
+
+  t = mktime (&tm);
+
+  /* Check that the result is either a failure, or close enough
+     to the correct answer that we can assume the discrepancy is
+     due to leap seconds.  */
+  return (t == (time_t) -1
+	  || (0 < t && answer - 120 <= t && t <= answer + 120));
+}
 
-/* Override any GCC internal prototype to avoid an error.
-   Use char because int might match the return type of a GCC
-   builtin and then its argument prototype would still apply.  */
-#ifdef __cplusplus
-extern "C"
-#endif
-char pthread_create ();
 int
 main ()
 {
-return pthread_create ();
-  ;
-  return 0;
+  time_t t, delta;
+  int i, j;
+
+  /* This test makes some buggy mktime implementations loop.
+     Give up after 60 seconds; a mktime slower than that
+     isn't worth using anyway.  */
+  alarm (60);
+
+  for (;;)
+    {
+      t = (time_t_max << 1) + 1;
+      if (t <= time_t_max)
+	break;
+      time_t_max = t;
+    }
+  time_t_min = - ((time_t) ~ (time_t) 0 == (time_t) -1) - time_t_max;
+
+  delta = time_t_max / 997; /* a suitable prime number */
+  for (i = 0; i < N_STRINGS; i++)
+    {
+      if (tz_strings[i])
+	putenv ((char*) tz_strings[i]);
+
+      for (t = 0; t <= time_t_max - delta; t += delta)
+	if (! mktime_test (t))
+	  return 1;
+      if (! (mktime_test ((time_t) 1)
+	     && mktime_test ((time_t) (60 * 60))
+	     && mktime_test ((time_t) (60 * 60 * 24))))
+	return 1;
+
+      for (j = 1; ; j <<= 1)
+	if (! bigtime_test (j))
+	  return 1;
+	else if (INT_MAX / 2 < j)
+	  break;
+      if (! bigtime_test (INT_MAX))
+	return 1;
+    }
+  return ! (irix_6_4_bug () && spring_forward_gap () && year_2050_test ());
 }
 _ACEOF
-for ac_lib in '' pthread; do
-  if test -z "$ac_lib"; then
-    ac_res="none required"
-  else
-    ac_res=-l$ac_lib
-    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
-  fi
-  if ac_fn_c_try_link "$LINENO"; then :
-  ac_cv_search_pthread_create=$ac_res
+if ac_fn_c_try_run "$LINENO"; then :
+  ac_cv_func_working_mktime=yes
+else
+  ac_cv_func_working_mktime=no
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext
-  if ${ac_cv_search_pthread_create+:} false; then :
-  break
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
 fi
-done
-if ${ac_cv_search_pthread_create+:} false; then :
 
-else
-  ac_cv_search_pthread_create=no
-fi
-rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_pthread_create" >&5
-$as_echo "$ac_cv_search_pthread_create" >&6; }
-ac_res=$ac_cv_search_pthread_create
-if test "$ac_res" != no; then :
-  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_working_mktime" >&5
+$as_echo "$ac_cv_func_working_mktime" >&6; }
+if test $ac_cv_func_working_mktime = no; then
+  case " $LIBOBJS " in
+  *" mktime.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS mktime.$ac_objext"
+ ;;
+esac
 
 fi
 
@@ -16569,6 +17561,46 @@ else
 fi
 
 
+# Check if "__STRICT_ANSI__" is required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking if __STRICT_ANSI__ is required" >&5
+$as_echo_n "checking if __STRICT_ANSI__ is required... " >&6; }
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#include <vector>
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  need_strict_ansi=no
+else
+  need_strict_ansi=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext;
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+# Append with plain assignments: "VAR+=value" is a bash extension, not POSIX sh.
+if test x"$need_strict_ansi" = x"yes"; then
+  CFLAGS="$CFLAGS -D__STRICT_ANSI__"
+  CXXFLAGS="$CXXFLAGS -D__STRICT_ANSI__"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $need_strict_ansi" >&5
+$as_echo "$need_strict_ansi" >&6; }
 
 # --- machine dependent optimizations ---
 
@@ -16649,6 +17681,101 @@ fi
 # CFLAGS+=$SIMD_FLAGS
 # CFLAGS+=" -march=x86-64"
 
+case $target_cpu in
+  arm*)
+    # Check whether --enable-arm was given; only an explicit "no" disables ARM.
+if test "${enable_arm+set}" = set; then :
+  enableval=$enable_arm; if test x"$enableval" = x"no"; then disable_arm=yes; else disable_arm=no; fi
+else
+  disable_arm=no
+fi
+
+
+    if test x"$disable_arm" != x"yes"; then
+
+$as_echo "#define HAVE_ARM 1" >>confdefs.h
+
+
+      { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mfpu=neon" >&5
+$as_echo_n "checking whether C compiler accepts -mfpu=neon... " >&6; }
+if ${ax_cv_check_cflags___mfpu_neon+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+
+  ax_check_save_flags=$CFLAGS
+  CFLAGS="$CFLAGS  -mfpu=neon"
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ax_cv_check_cflags___mfpu_neon=yes
+else
+  ax_cv_check_cflags___mfpu_neon=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  CFLAGS=$ax_check_save_flags
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags___mfpu_neon" >&5
+$as_echo "$ax_cv_check_cflags___mfpu_neon" >&6; }
+if test x"$ax_cv_check_cflags___mfpu_neon" = xyes; then :
+
+
+$as_echo "#define HAVE_NEON 1" >>confdefs.h
+
+          ax_cv_support_neon_ext=yes
+else
+  :
+fi
+
+
+      # Check whether --enable-thumb was given; keep the value the user supplied.
+if test "${enable_thumb+set}" = set; then :
+  enableval=$enable_thumb; enable_thumb=$enableval
+else
+  enable_thumb=no
+fi
+
+    fi
+    ;;
+
+  *)
+    disable_arm=yes
+    ;;
+
+esac
+
+ if test x"$disable_arm" != x"yes"; then
+  ENABLE_ARM_OPT_TRUE=
+  ENABLE_ARM_OPT_FALSE='#'
+else
+  ENABLE_ARM_OPT_TRUE='#'
+  ENABLE_ARM_OPT_FALSE=
+fi
+
+ if test x"$ax_cv_support_neon_ext" = x"yes"; then
+  ENABLE_NEON_OPT_TRUE=
+  ENABLE_NEON_OPT_FALSE='#'
+else
+  ENABLE_NEON_OPT_TRUE='#'
+  ENABLE_NEON_OPT_FALSE=
+fi
+# Thumb applies only when the ARM code is enabled and Thumb was not turned off.
+ if test x"$disable_arm" != x"yes" && test x"$enable_thumb" != x"no"; then
+  ENABLE_ARM_THUMB_TRUE=
+  ENABLE_ARM_THUMB_FALSE='#'
+else
+  ENABLE_ARM_THUMB_TRUE='#'
+  ENABLE_ARM_THUMB_FALSE=
+fi
+
 
 # --- additional logging ---
 
@@ -17450,14 +18577,24 @@ ac_config_files="$ac_config_files Makefile"
 
 ac_config_files="$ac_config_files libde265/Makefile"
 
+ac_config_files="$ac_config_files libde265/arm/Makefile"
+
 ac_config_files="$ac_config_files libde265/x86/Makefile"
 
+ac_config_files="$ac_config_files libde265/encoder/Makefile"
+
+ac_config_files="$ac_config_files libde265/encoder/algo/Makefile"
+
 ac_config_files="$ac_config_files libde265/de265-version.h"
 
 ac_config_files="$ac_config_files dec265/Makefile"
 
+ac_config_files="$ac_config_files enc265/Makefile"
+
 ac_config_files="$ac_config_files sherlock265/Makefile"
 
+ac_config_files="$ac_config_files tools/Makefile"
+
 ac_config_files="$ac_config_files libde265.pc"
 
 cat >confcache <<\_ACEOF
@@ -17569,6 +18706,14 @@ LIBOBJS=$ac_libobjs
 LTLIBOBJS=$ac_ltlibobjs
 
 
+if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
+  as_fn_error $? "conditional \"AMDEP\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${am__fastdepCCAS_TRUE}" && test -z "${am__fastdepCCAS_FALSE}"; then
+  as_fn_error $? "conditional \"am__fastdepCCAS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking that generated files are newer than configure" >&5
 $as_echo_n "checking that generated files are newer than configure... " >&6; }
    if test -n "$am_sleep_pid"; then
@@ -17577,10 +18722,6 @@ $as_echo_n "checking that generated files are newer than configure... " >&6; }
    fi
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: done" >&5
 $as_echo "done" >&6; }
-if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
-  as_fn_error $? "conditional \"AMDEP\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
 if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
   as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -17609,6 +18750,18 @@ if test -z "${ENABLE_SSE_OPT_TRUE}" && test -z "${ENABLE_SSE_OPT_FALSE}"; then
   as_fn_error $? "conditional \"ENABLE_SSE_OPT\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${ENABLE_ARM_OPT_TRUE}" && test -z "${ENABLE_ARM_OPT_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_ARM_OPT\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${ENABLE_NEON_OPT_TRUE}" && test -z "${ENABLE_NEON_OPT_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_NEON_OPT\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${ENABLE_ARM_THUMB_TRUE}" && test -z "${ENABLE_ARM_THUMB_FALSE}"; then
+  as_fn_error $? "conditional \"ENABLE_ARM_THUMB\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${HAVE_VIDEOGFX_TRUE}" && test -z "${HAVE_VIDEOGFX_FALSE}"; then
   as_fn_error $? "conditional \"HAVE_VIDEOGFX\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -18026,7 +19179,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by libde265 $as_me 0.9, which was
+This file was extended by libde265 $as_me 1.0.2, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -18092,7 +19245,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-libde265 config.status 0.9
+libde265 config.status 1.0.2
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
@@ -18601,10 +19754,15 @@ do
     "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;;
     "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
     "libde265/Makefile") CONFIG_FILES="$CONFIG_FILES libde265/Makefile" ;;
+    "libde265/arm/Makefile") CONFIG_FILES="$CONFIG_FILES libde265/arm/Makefile" ;;
     "libde265/x86/Makefile") CONFIG_FILES="$CONFIG_FILES libde265/x86/Makefile" ;;
+    "libde265/encoder/Makefile") CONFIG_FILES="$CONFIG_FILES libde265/encoder/Makefile" ;;
+    "libde265/encoder/algo/Makefile") CONFIG_FILES="$CONFIG_FILES libde265/encoder/algo/Makefile" ;;
     "libde265/de265-version.h") CONFIG_FILES="$CONFIG_FILES libde265/de265-version.h" ;;
     "dec265/Makefile") CONFIG_FILES="$CONFIG_FILES dec265/Makefile" ;;
+    "enc265/Makefile") CONFIG_FILES="$CONFIG_FILES enc265/Makefile" ;;
     "sherlock265/Makefile") CONFIG_FILES="$CONFIG_FILES sherlock265/Makefile" ;;
+    "tools/Makefile") CONFIG_FILES="$CONFIG_FILES tools/Makefile" ;;
     "libde265.pc") CONFIG_FILES="$CONFIG_FILES libde265.pc" ;;
 
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
diff --git a/configure.ac b/configure.ac
index 69f9a49..aa94419 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,15 +2,15 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.68])
-AC_INIT([libde265], [0.9], [farin at struktur.de])
+AC_INIT([libde265], [1.0.2], [farin at struktur.de])
 AC_CONFIG_SRCDIR([libde265/de265.cc])
 AC_CONFIG_HEADERS([config.h])
 
-NUMERIC_VERSION=0x00090000 # Numeric representation of the version
+NUMERIC_VERSION=0x01000200 # Numeric representation of the version (A.B.C[.D] = 0xAABBCCDD)
 AC_SUBST(NUMERIC_VERSION)
 
 LIBDE265_CURRENT=0
-LIBDE265_REVISION=8
+LIBDE265_REVISION=10
 LIBDE265_AGE=0
 
 # ---------------------------------------------------------------------------
@@ -26,6 +26,7 @@ LT_INIT
 AC_CONFIG_MACRO_DIR([m4])
 
 # Checks for programs.
+AM_PROG_AS
 AC_PROG_CXX
 AC_PROG_CC
 AC_PROG_INSTALL
@@ -36,6 +37,8 @@ AC_PROG_GREP
 AM_INIT_AUTOMAKE
 
 CFLAGS+=" -std=c99"
+CXXFLAGS="$CXXFLAGS -Werror=return-type -Werror=unused-result -Werror=reorder"  # plain append; VAR+= is a bash extension
+AX_CXX_COMPILE_STDCXX_11()
 
 dnl Use -Wall if we have gcc.
 changequote(,)dnl
@@ -47,30 +50,61 @@ if test "x$GCC" = "xyes"; then
 fi
 changequote([,])dnl
 
-gl_VISIBILITY
+dnl gl_VISIBILITY
+dnl : In encoder branch, we still export all library symbols :
+HAVE_VISIBILITY=0
 AM_CONDITIONAL([HAVE_VISIBILITY], [test "x$HAVE_VISIBILITY" != "x0"])
 
 # Checks for header files.
-AC_CHECK_HEADERS([stdint.h stdlib.h string.h malloc.h])
+AC_CHECK_HEADERS([stdint.h stdlib.h string.h malloc.h signal.h setjmp.h stddef.h sys/time.h])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_HEADER_STDBOOL
+AC_TYPE_SIZE_T
+AC_TYPE_INT8_T
 AC_TYPE_INT16_T
 AC_TYPE_INT32_T
-AC_TYPE_INT8_T
+AC_TYPE_INT64_T
+AC_TYPE_UINT8_T
 AC_TYPE_UINT16_T
 AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T
-AC_TYPE_UINT8_T
+AC_CHECK_TYPES([ptrdiff_t])
+AC_C_INLINE
 
 # Checks for library functions.
 AC_CHECK_FUNCS([malloc memmove memset __malloc_hook memalign posix_memalign __mingw_aligned_malloc __mingw_aligned_free])
 
+AC_SEARCH_LIBS([pow], [m])
 AC_SEARCH_LIBS([sqrt], [m])
 AC_SEARCH_LIBS([pthread_create], [pthread])
 
+AC_CHECK_FUNCS([gettimeofday])
+AC_CHECK_FUNCS([pow sqrt])
+AC_CHECK_FUNCS([strchr strrchr])
+
+AC_FUNC_ALLOCA
+AC_FUNC_ERROR_AT_LINE
+# Checking for malloc breaks building on ARM for us. A similar issue is described
+# here: http://nerdland.net/unstumping-the-internet/malloc-has-not-been-declared/
+# AC_FUNC_MALLOC
+# AC_FUNC_REALLOC
+AC_FUNC_MKTIME
+
 AM_CONDITIONAL(MINGW, expr $host : '.*-mingw' >/dev/null 2>&1)
 
+# Check if "__STRICT_ANSI__" is required (set when a bare include of <vector> fails to compile).
+AC_MSG_CHECKING([if __STRICT_ANSI__ is required])
+AC_LANG_PUSH(C++)
+AC_TRY_COMPILE([
+#include <vector>
+],[],[need_strict_ansi=no],[need_strict_ansi=yes]);
+AC_LANG_POP(C++)
+if test x"$need_strict_ansi" = x"yes"; then  # plain appends below; VAR+= is a bash extension
+  CFLAGS="$CFLAGS -D__STRICT_ANSI__"
+  CXXFLAGS="$CXXFLAGS -D__STRICT_ANSI__"
+fi
+AC_MSG_RESULT([$need_strict_ansi])
 
 # --- machine dependent optimizations ---
 
@@ -105,6 +139,38 @@ AM_CONDITIONAL([ENABLE_SSE_OPT], [test x"$ax_cv_support_sse41_ext" = x"yes"])
 # CFLAGS+=$SIMD_FLAGS
 # CFLAGS+=" -march=x86-64"
 
+case $target_cpu in
+  arm*)
+    # Only an explicit --disable-arm (enableval = no) turns the ARM code off.
+    AC_ARG_ENABLE(arm,
+                  [AS_HELP_STRING([--disable-arm],
+                                  [disable ARM optimizations (default=no)])],
+      [if test x"$enableval" = x"no"; then disable_arm=yes; else disable_arm=no; fi],
+      [disable_arm=no])
+    if test x"$disable_arm" != x"yes"; then
+      AC_DEFINE(HAVE_ARM, 1, [Support ARM instructions])
+
+      AX_CHECK_COMPILE_FLAG(-mfpu=neon, [
+          AC_DEFINE(HAVE_NEON, 1, [Support ARM NEON instructions])
+          ax_cv_support_neon_ext=yes], [])
+      # Thumb is opt-in; keep the value given on the command line.
+      AC_ARG_ENABLE(thumb,
+                    [AS_HELP_STRING([--enable-thumb],
+                                    [enable ARM THUMB instructions (default=no)])],
+        [enable_thumb=$enableval],
+        [enable_thumb=no])
+    fi
+    ;;
+
+  *)
+    disable_arm=yes
+    ;;
+
+esac
+# Automake conditionals for the ARM code paths; Thumb requires the ARM code to be enabled.
+AM_CONDITIONAL([ENABLE_ARM_OPT], [test x"$disable_arm" != x"yes"])
+AM_CONDITIONAL([ENABLE_NEON_OPT], [test x"$ax_cv_support_neon_ext" = x"yes"])
+AM_CONDITIONAL([ENABLE_ARM_THUMB], [test x"$disable_arm" != x"yes" && test x"$enable_thumb" != x"no"])
 
 # --- additional logging ---
 
@@ -231,9 +297,14 @@ AC_MSG_NOTICE([---------------------------------------])
 
 AC_CONFIG_FILES([Makefile])
 AC_CONFIG_FILES([libde265/Makefile])
+AC_CONFIG_FILES([libde265/arm/Makefile])
 AC_CONFIG_FILES([libde265/x86/Makefile])
+AC_CONFIG_FILES([libde265/encoder/Makefile])
+AC_CONFIG_FILES([libde265/encoder/algo/Makefile])
 AC_CONFIG_FILES([libde265/de265-version.h])
 AC_CONFIG_FILES([dec265/Makefile])
+AC_CONFIG_FILES([enc265/Makefile])
 AC_CONFIG_FILES([sherlock265/Makefile])
+AC_CONFIG_FILES([tools/Makefile])
 AC_CONFIG_FILES([libde265.pc])
 AC_OUTPUT
diff --git a/dec265/CMakeLists.txt b/dec265/CMakeLists.txt
index ce0d333..b57035c 100644
--- a/dec265/CMakeLists.txt
+++ b/dec265/CMakeLists.txt
@@ -1,132 +1,41 @@
-set(CMAKE_CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo" CACHE STRING "Configurations" FORCE)
-
-if(NOT CMAKE_BUILD_TYPE)
-    # default to Release build for GCC builds
-    set(CMAKE_BUILD_TYPE Release CACHE STRING
-        "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release."
-        FORCE)
-endif()
-
-project (dec265)
-cmake_minimum_required (VERSION 2.8)
-
-SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/../cmake" "${CMAKE_MODULE_PATH}")
+set (dec265_sources
+  dec265.cc
+)
 
-if ("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
-    set(X64 1)
-    add_definitions(-DX86_64)
-endif()
+set (hdrcopy_sources
+  hdrcopy.cc
+)
 
-# Enforce coding standards.  Full warnings and warnings as errors
 if(MSVC)
-    add_definitions(/W4 /D_CRT_SECURE_NO_WARNINGS)
-    add_definitions(/Ob2) # always inline
-    add_definitions(/Oi)  # enable intrinsics
-#    add_definitions(/MP)  # multithreaded build
-#    add_definitions(/TP)  # all files as cpp
-
-    include_directories(../extra)
-endif(MSVC)
-
-if("$ENV{CXX}" STREQUAL "icpc")
-    set(GCC 1)
-    add_definitions(-Wall -Wextra -Wunused-variable -Wunused-function -Wshadow -no-vec -msse4)
-elseif(CMAKE_COMPILER_IS_GNUCXX)
-    set(GCC 1)
-    add_definitions(-Wall -Wextra -Wunused-variable -Wunused-function -Wshadow -msse4)
+  set (dec265_sources
+    ${dec265_sources}
+    ../extra/getopt.c
+    ../extra/getopt_long.c
+  )
+  set (hdrcopy_sources
+    ${hdrcopy_sources}
+    ../extra/getopt.c
+    ../extra/getopt_long.c
+  )
 endif()
 
-option(USE_ASM "Use SSE41 SIMD optimize" ON)
-if(USE_ASM)
-    add_definitions(-DUSE_ASM=ASM_SSE4)
-    add_definitions(-DHAVE_SSE4_1)
-else(USE_ASM)
-    add_definitions(-DUSE_ASM=ASM_NONE)
-endif(USE_ASM)
-
-option(USE_VIDEOGFX "Use of VIDEOGFX" OFF)
-if(USE_VIDEOGFX)
-    add_definitions(-D HAVE_VIDEOGFX=1)
-else(USE_VIDEOGFX)
-    add_definitions(-D HAVE_VIDEOGFX=0)
-endif(USE_VIDEOGFX)
-
-option(BUILD_STATIC  "Static Lib" ON)
-if(BUILD_STATIC)
-    add_definitions(-DLIBDE265_STATIC_BUILD)
-endif(BUILD_STATIC)
-
-option(DE265_LOG_ERROR  "Enable DE265_LOG_ERROR" OFF)
-if(DE265_LOG_ERROR)
-    add_definitions(-DDE265_LOG_ERROR)
-endif(DE265_LOG_ERROR)
-
-option(DE265_LOG_INFO   "Enable DE265_LOG_INFO"  OFF)
-if(DE265_LOG_INFO)
-    add_definitions(-DDE265_LOG_INFO)
-endif(DE265_LOG_INFO)
-
-option(DE265_LOG_DEBUG  "Enable DE265_LOG_DEBUG" OFF)
-if(DE265_LOG_DEBUG)
-    add_definitions(-DDE265_LOG_DEBUG)
-endif(DE265_LOG_DEBUG)
-
-option(DE265_LOG_TRACE  "Enable DE265_LOG_TRACE" OFF)
-if(DE265_LOG_TRACE)
-    add_definitions(-DDE265_LOG_TRACE)
-endif(DE265_LOG_TRACE)
-
-include_directories(../ ../libde265)
-
-file(GLOB LIBSRC ../libde265/*.cc ../extra/*.c)
-file(GLOB LIBINC ../libde265/*.h ../extra/*.h)
-file(GLOB APPSRC dec265.cc)
-file(GLOB ASMSRC0 ../libde265/x86/sse.cc ../libde265/x86/sse-dct.cc)
-file(GLOB ASMSRC1 ../libde265/x86/sse-motion.cc)
-file(GLOB ASMINC ../libde265/x86/*.h)
-
-source_group(INC  FILES ${LIBINC})
-source_group(SRC  FILES ${LIBSRC})
-source_group(APP  FILES ${APPSRC})
-source_group(ASM  FILES ${ASMSRC0} ${ASMSRC1} ${ASMINC})
-
-if(USE_ASM)
-    SET(LIBSRC ${LIBSRC} ${ASMSRC0} ${ASMSRC1})
-    SET(LIBINC ${LIBINC} ${ASMINC})
-
-    # disable uninitialize check on VC, because x1 is not assign, remove later
-    #if(MSVC)
-    #    SET_SOURCE_FILES_PROPERTIES(${ASMSRC1} PROPERTIES COMPILE_FLAGS "/RTCs")
-    #endif(MSVC)
-endif(USE_ASM)
-
-if(UNIX)
-    SET(PLATFORM_LIBS pthread rt m)
-    SET_SOURCE_FILES_PROPERTIES(../extra/win32thread.c PROPERTIES HEADER_FILE_ONLY TRUE)
-endif(UNIX)
+if(SDL_FOUND)
+  add_definitions(-DHAVE_SDL)
+  include_directories ("${SDL_INCLUDE_DIR}")
+  set (dec265_sources
+    ${dec265_sources}
+    sdl.cc
+  )
+endif()
 
-# Main CLI application
-if(MSVC)
-    add_definitions(/wd4244) # type conversion, possible loss of data
-    add_definitions(/wd4100) # unreferenced formal parameter
-    add_definitions(/wd4505) # unreferenced local function has been removed
-    add_definitions(/wd4701) # potentially uninitialized local variable
-    add_definitions(/wd4127) # conditional expression is constant
-#    add_definitions(/wd4018) # signed/unsigned mismatch
-    add_definitions(/wd4189) # local variable is initialized but not referenced
-    add_definitions(/wd4715) # not all control paths return a value
-    add_definitions(/wd4324) # structure was padded due to __declspec(align())
+add_executable (dec265 ${dec265_sources})
 
-    set_source_files_properties(${LIBSRC} PROPERTIES LANGUAGE CXX)
-endif(MSVC)
-if(GCC)
-    add_definitions(-Wno-sign-compare)
-    add_definitions(-Wno-unused-parameter)
-endif(GCC)
+target_link_libraries (dec265 ${LIBDE265_LIBRARY_NAME} ${SDL_LIBRARY})
 
-add_executable(dec265
-    ${APPSRC} ${EXTRAS} ${LIBSRC} ${LIBINC}
-)
 
-target_link_libraries(dec265 ${PLATFORM_LIBS})
+if(NOT MSVC)
+  # hdrcopy uses internal APIs that are not available when compiled for Windows
+  add_executable (hdrcopy ${hdrcopy_sources})
 
+  target_link_libraries (hdrcopy ${LIBDE265_LIBRARY_NAME})
+endif()
diff --git a/dec265/Makefile.am b/dec265/Makefile.am
index a997cc6..06f2540 100644
--- a/dec265/Makefile.am
+++ b/dec265/Makefile.am
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = dec265
+bin_PROGRAMS = dec265 hdrcopy
 
 AM_CPPFLAGS = -I../libde265
 
@@ -9,6 +9,12 @@ dec265_LDFLAGS =
 dec265_LDADD = ../libde265/libde265.la -lstdc++
 dec265_SOURCES = dec265.cc
 
+hdrcopy_DEPENDENCIES = ../libde265/libde265.la
+hdrcopy_CXXFLAGS =
+hdrcopy_LDFLAGS =
+hdrcopy_LDADD = ../libde265/libde265.la -lstdc++
+hdrcopy_SOURCES = hdrcopy.cc
+
 if HAVE_VIDEOGFX
   dec265_CXXFLAGS += $(VIDEOGFX_CFLAGS)
   dec265_LDFLAGS += $(VIDEOGFX_LIBS)
diff --git a/dec265/Makefile.in b/dec265/Makefile.in
index 090115b..9cc6415 100644
--- a/dec265/Makefile.in
+++ b/dec265/Makefile.in
@@ -79,7 +79,7 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-bin_PROGRAMS = dec265$(EXEEXT)
+bin_PROGRAMS = dec265$(EXEEXT) hdrcopy$(EXEEXT)
 @HAVE_VIDEOGFX_TRUE at am__append_1 = $(VIDEOGFX_CFLAGS)
 @HAVE_VIDEOGFX_TRUE at am__append_2 = $(VIDEOGFX_LIBS)
 @HAVE_SDL_TRUE at am__append_3 = $(SDL_CFLAGS)
@@ -91,11 +91,12 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp COPYING
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -115,6 +116,11 @@ am__v_lt_1 =
 dec265_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(dec265_CXXFLAGS) \
 	$(CXXFLAGS) $(dec265_LDFLAGS) $(LDFLAGS) -o $@
+am_hdrcopy_OBJECTS = hdrcopy-hdrcopy.$(OBJEXT)
+hdrcopy_OBJECTS = $(am_hdrcopy_OBJECTS)
+hdrcopy_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(hdrcopy_CXXFLAGS) \
+	$(CXXFLAGS) $(hdrcopy_LDFLAGS) $(LDFLAGS) -o $@
 AM_V_P = $(am__v_P_ at AM_V@)
 am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -167,8 +173,8 @@ AM_V_CCLD = $(am__v_CCLD_ at AM_V@)
 am__v_CCLD_ = $(am__v_CCLD_ at AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo "  CCLD    " $@;
 am__v_CCLD_1 = 
-SOURCES = $(dec265_SOURCES)
-DIST_SOURCES = $(am__dec265_SOURCES_DIST)
+SOURCES = $(dec265_SOURCES) $(hdrcopy_SOURCES)
+DIST_SOURCES = $(am__dec265_SOURCES_DIST) $(hdrcopy_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -195,6 +201,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -203,9 +210,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -225,7 +234,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -342,6 +351,11 @@ dec265_CXXFLAGS = $(am__append_1) $(am__append_3)
 dec265_LDFLAGS = $(am__append_2) $(am__append_4) $(am__append_6)
 dec265_LDADD = ../libde265/libde265.la -lstdc++
 dec265_SOURCES = dec265.cc $(am__append_5)
+hdrcopy_DEPENDENCIES = ../libde265/libde265.la
+hdrcopy_CXXFLAGS = 
+hdrcopy_LDFLAGS = 
+hdrcopy_LDADD = ../libde265/libde265.la -lstdc++
+hdrcopy_SOURCES = hdrcopy.cc
 EXTRA_DIST = Makefile.vc7 \
   CMakeLists.txt \
   ../extra/getopt.c \
@@ -436,6 +450,10 @@ dec265$(EXEEXT): $(dec265_OBJECTS) $(dec265_DEPENDENCIES) $(EXTRA_dec265_DEPENDE
 	@rm -f dec265$(EXEEXT)
 	$(AM_V_CXXLD)$(dec265_LINK) $(dec265_OBJECTS) $(dec265_LDADD) $(LIBS)
 
+hdrcopy$(EXEEXT): $(hdrcopy_OBJECTS) $(hdrcopy_DEPENDENCIES) $(EXTRA_hdrcopy_DEPENDENCIES) 
+	@rm -f hdrcopy$(EXEEXT)
+	$(AM_V_CXXLD)$(hdrcopy_LINK) $(hdrcopy_OBJECTS) $(hdrcopy_LDADD) $(LIBS)
+
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
 
@@ -444,6 +462,7 @@ distclean-compile:
 
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/dec265-dec265.Po at am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/dec265-sdl.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/hdrcopy-hdrcopy.Po at am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -494,6 +513,20 @@ dec265-sdl.obj: sdl.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -c -o dec265-sdl.obj `if test -f 'sdl.cc'; then $(CYGPATH_W) 'sdl.cc'; else $(CYGPATH_W) '$(srcdir)/sdl.cc'; fi`
 
+hdrcopy-hdrcopy.o: hdrcopy.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hdrcopy_CXXFLAGS) $(CXXFLAGS) -MT hdrcopy-hdrcopy.o -MD -MP -MF $(DEPDIR)/hdrcopy-hdrcopy.Tpo -c -o hdrcopy-hdrcopy.o `test -f 'hdrcopy.cc' || echo '$(srcdir)/'`hdrcopy.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/hdrcopy-hdrcopy.Tpo $(DEPDIR)/hdrcopy-hdrcopy.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='hdrcopy.cc' object='hdrcopy-hdrcopy.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hdrcopy_CXXFLAGS) $(CXXFLAGS) -c -o hdrcopy-hdrcopy.o `test -f 'hdrcopy.cc' || echo '$(srcdir)/'`hdrcopy.cc
+
+hdrcopy-hdrcopy.obj: hdrcopy.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hdrcopy_CXXFLAGS) $(CXXFLAGS) -MT hdrcopy-hdrcopy.obj -MD -MP -MF $(DEPDIR)/hdrcopy-hdrcopy.Tpo -c -o hdrcopy-hdrcopy.obj `if test -f 'hdrcopy.cc'; then $(CYGPATH_W) 'hdrcopy.cc'; else $(CYGPATH_W) '$(srcdir)/hdrcopy.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/hdrcopy-hdrcopy.Tpo $(DEPDIR)/hdrcopy-hdrcopy.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='hdrcopy.cc' object='hdrcopy-hdrcopy.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(hdrcopy_CXXFLAGS) $(CXXFLAGS) -c -o hdrcopy-hdrcopy.obj `if test -f 'hdrcopy.cc'; then $(CYGPATH_W) 'hdrcopy.cc'; else $(CYGPATH_W) '$(srcdir)/hdrcopy.cc'; fi`
+
 mostlyclean-libtool:
 	-rm -f *.lo
 
diff --git a/dec265/Makefile.vc7 b/dec265/Makefile.vc7
index ff790eb..100fc33 100644
--- a/dec265/Makefile.vc7
+++ b/dec265/Makefile.vc7
@@ -2,13 +2,16 @@
 # Makefile for Microsoft Visual Studio 2003
 #
 CFLAGS=/I.. /I..\libde265 /I..\extra
-CC=cl /nologo 
+CC=cl /nologo
 LINK=link /nologo /subsystem:console
 DEFINES=/DWIN32
 
 CFLAGS=$(CFLAGS) /MT /Ob2 /Oi /W4 /EHsc
 CFLAGS=$(CFLAGS) $(DEFINES)
 
+# unreferenced formal parameter
+CFLAGS=$(CFLAGS) /wd4100
+
 OBJS=\
 	..\extra\getopt_long.obj \
 	..\extra\getopt.obj \
diff --git a/dec265/dec265.cc b/dec265/dec265.cc
index 32ea20c..694c885 100644
--- a/dec265/dec265.cc
+++ b/dec265/dec265.cc
@@ -38,6 +38,8 @@
 #include <unistd.h>
 #endif
 
+#include "libde265/quality.h"
+
 #if HAVE_VIDEOGFX
 #include <libvideogfx.hh>
 using namespace videogfx;
@@ -53,7 +55,7 @@ using namespace videogfx;
 
 int nThreads=0;
 bool nal_input=false;
-bool quiet=false;
+int quiet=0;
 bool check_hash=false;
 bool show_help=false;
 bool dump_headers=false;
@@ -65,6 +67,11 @@ const char *output_filename = "out.yuv";
 uint32_t max_frames=UINT32_MAX;
 bool write_bytestream=false;
 const char *bytestream_filename;
+bool measure_quality=false;
+bool show_ssim_map=false;
+bool show_psnr_map=false;
+const char* reference_filename;
+FILE* reference_file;
 int highestTID = 100;
 int verbosity=0;
 int disable_deblocking=0;
@@ -84,6 +91,9 @@ static struct option long_options[] = {
   {"help",       no_argument,       0, 'h' },
   {"noaccel",    no_argument,       0, '0' },
   {"write-bytestream", required_argument,0, 'B' },
+  {"measure",     required_argument, 0, 'm' },
+  {"ssim",        no_argument,       0, 's' },
+  {"errmap",      no_argument,       0, 'e' },
   {"highest-TID", required_argument, 0, 'T' },
   {"verbose",    no_argument,       0, 'v' },
   {"disable-deblocking", no_argument, &disable_deblocking, 1 },
@@ -98,18 +108,40 @@ static void write_picture(const de265_image* img)
   static FILE* fh = NULL;
   if (fh==NULL) { fh = fopen(output_filename, "wb"); }
 
-  
+
 
   for (int c=0;c<3;c++) {
     int stride;
     const uint8_t* p = de265_get_image_plane(img, c, &stride);
     int width = de265_get_image_width(img,c);
 
-    for (int y=0;y<de265_get_image_height(img,c);y++) {
-      fwrite(p + y*stride, width, 1, fh);
+    if (de265_get_bits_per_pixel(img,c)<=8) {
+      // --- save 8 bit YUV ---
+
+      for (int y=0;y<de265_get_image_height(img,c);y++) {
+        fwrite(p + y*stride, width, 1, fh);
+      }
+    }
+    else {
+      // --- save 16 bit YUV ---
+
+      uint8_t* buf = new uint8_t[width*2];
+      uint16_t* p16 = (uint16_t*)p;
+
+      for (int y=0;y<de265_get_image_height(img,c);y++) {
+        for (int x=0;x<width;x++) {
+          uint16_t pixel_value = (p16+y*stride)[x];
+          buf[2*x+0] = pixel_value & 0xFF;
+          buf[2*x+1] = pixel_value >> 8;
+        }
+
+        fwrite(buf, width*2, 1, fh);
+      }
+
+      delete[] buf;
     }
   }
-  
+
   fflush(fh);
 }
 
@@ -135,11 +167,27 @@ void display_image(const struct de265_image* img)
 
   int width  = de265_get_image_width(img,0);
   int height = de265_get_image_height(img,0);
+  de265_chroma chroma = de265_get_chroma_format(img);
+
+  ChromaFormat vgfx_chroma;
+  Colorspace   vgfx_cs = Colorspace_YUV;
+
+  switch (chroma) {
+  case de265_chroma_420:  vgfx_chroma = Chroma_420; break;
+  case de265_chroma_422:  vgfx_chroma = Chroma_422; break;
+  case de265_chroma_444:  vgfx_chroma = Chroma_444; break;
+  case de265_chroma_mono: vgfx_cs = Colorspace_Greyscale; break;
+  }
 
   Image<Pixel> visu;
-  visu.Create(width, height, Colorspace_YUV, Chroma_420);
+  visu.Create(width, height, vgfx_cs, vgfx_chroma);
+
+  int nChannels = 3;
+  if (chroma == de265_chroma_mono) {
+    nChannels = 1;
+  }
 
-  for (int ch=0;ch<3;ch++) {
+  for (int ch=0;ch<nChannels;ch++) {
     const uint8_t* data;
     int stride;
 
@@ -147,8 +195,20 @@ void display_image(const struct de265_image* img)
     width  = de265_get_image_width(img,ch);
     height = de265_get_image_height(img,ch);
 
-    for (int y=0;y<height;y++) {
-      memcpy(visu.AskFrame((BitmapChannel)ch)[y], data + y*stride, width);
+    int bit_depth = de265_get_bits_per_pixel(img,ch);
+
+    if (bit_depth==8) {
+      for (int y=0;y<height;y++) {
+        memcpy(visu.AskFrame((BitmapChannel)ch)[y], data + y*stride, width);
+      }
+    }
+    else {
+      const uint16_t* data16 = (const uint16_t*)data;
+      for (int y=0;y<height;y++) {
+        for (int x=0;x<width;x++) {
+          visu.AskFrame((BitmapChannel)ch)[y][x] = *(data16 + y*stride +x) >> (bit_depth-8);
+        }
+      }
     }
   }
 
@@ -157,18 +217,46 @@ void display_image(const struct de265_image* img)
 }
 #endif
 
+static uint8_t* convert_to_8bit(const uint8_t* data, int width, int height, int stride, int bit_depth)
+{
+  // Returns a new[]-allocated 8-bit copy of a 16-bit-sample plane; the caller must delete[] it.
+  // NOTE(review): stride is applied in 16-bit sample units here -- confirm it is not a byte count.
+  const int shift = bit_depth-8;
+  uint8_t* result = new uint8_t[stride*height];
+  for (int row=0;row<height;row++) {
+    const uint16_t* src = (const uint16_t*)data + row*stride;
+    uint8_t* dst = result + row*stride;
+    for (int col=0;col<width;col++) { dst[col] = src[col] >> shift; }
+  }
+  return result;
+}
+
+
 #if HAVE_SDL
 SDL_YUV_Display sdlWin;
 bool sdl_active=false;
 
 bool display_sdl(const struct de265_image* img)
 {
-  if (!sdl_active) {
-    int width  = de265_get_image_width(img,0);
-    int height = de265_get_image_height(img,0);
+  int width  = de265_get_image_width(img,0);
+  int height = de265_get_image_height(img,0);
+
+  int chroma_width  = de265_get_image_width(img,1);
+  int chroma_height = de265_get_image_height(img,1);
+
+  de265_chroma chroma = de265_get_chroma_format(img);
 
+  if (!sdl_active) {
     sdl_active=true;
-    sdlWin.init(width,height);
+    enum SDL_YUV_Display::SDL_Chroma sdlChroma;
+    switch (chroma) {
+    case de265_chroma_420:  sdlChroma = SDL_YUV_Display::SDL_CHROMA_420;  break;
+    case de265_chroma_422:  sdlChroma = SDL_YUV_Display::SDL_CHROMA_422;  break;
+    case de265_chroma_444:  sdlChroma = SDL_YUV_Display::SDL_CHROMA_444;  break;
+    case de265_chroma_mono: sdlChroma = SDL_YUV_Display::SDL_CHROMA_MONO; break;
+    }
+
+    sdlWin.init(width,height, sdlChroma);
   }
 
   int stride,chroma_stride;
@@ -176,8 +264,30 @@ bool display_sdl(const struct de265_image* img)
   const uint8_t* cb =de265_get_image_plane(img,1,&chroma_stride);
   const uint8_t* cr =de265_get_image_plane(img,2,NULL);
 
+  uint8_t* y16  = NULL;
+  uint8_t* cb16 = NULL;
+  uint8_t* cr16 = NULL;
+  int bd;
+
+  if ((bd=de265_get_bits_per_pixel(img, 0)) > 8) {
+    y16  = convert_to_8bit(y,  width,height,stride,bd); y=y16;
+  }
+
+  if (chroma != de265_chroma_mono) {
+    if ((bd=de265_get_bits_per_pixel(img, 1)) > 8) {
+      cb16 = convert_to_8bit(cb, chroma_width,chroma_height,chroma_stride,bd); cb=cb16;
+    }
+    if ((bd=de265_get_bits_per_pixel(img, 2)) > 8) {
+      cr16 = convert_to_8bit(cr, chroma_width,chroma_height,chroma_stride,bd); cr=cr16;
+    }
+  }
+
   sdlWin.display(y,cb,cr, stride, chroma_stride);
 
+  delete[] y16;
+  delete[] cb16;
+  delete[] cr16;
+
   return sdlWin.doQuit();
 }
 #endif
@@ -236,6 +346,134 @@ bool output_image(const de265_image* img)
 }
 
 
+static double mse_y=0.0, mse_cb=0.0, mse_cr=0.0;
+static int    mse_frames=0;
+
+static double ssim_y=0.0;
+static int    ssim_frames=0;
+
+/* Compare one decoded picture with the next frame of the reference YUV file.
+ *
+ * Reads one frame (width*height*3/2 bytes, i.e. tightly packed planes with
+ * half-resolution chroma at one byte per sample) from the global
+ * 'reference_file', computes the MSE of all three planes against 'img',
+ * adds the results to the global mse_* and ssim_* accumulators and prints one
+ * line with the frame's PSNR values and its mean SSIM to stdout.
+ *
+ * With HAVE_VIDEOGFX, the luma SSIM map and an error map are computed as well
+ * and shown in X11 windows when show_ssim_map / show_psnr_map are set.
+ * Without it, the SSIM value stays 0.
+ *
+ * The function returns without touching the accumulators when the reference
+ * file is not open, memory cannot be allocated, or no complete frame can be
+ * read.
+ *
+ * NOTE(review): the frame layout above is hard-coded; other chroma formats or
+ * bit depths > 8 would need a different frame size and MSE setup.
+ */
+void measure(const de265_image* img)
+{
+  // The caller opens the reference file without checking the result, so make
+  // sure we never hand a NULL stream to fread().
+  if (reference_file == NULL) {
+    return;
+  }
+
+  // --- compute PSNR ---
+
+  int width  = de265_get_image_width(img,0);
+  int height = de265_get_image_height(img,0);
+
+  // size of one reference frame: luma plane plus two quarter-size chroma planes
+  size_t toread = width*height*3/2;
+
+  uint8_t* p = (uint8_t*)malloc(toread);
+  if (p == NULL) {
+    return;
+  }
+
+  if (fread(p,1,toread,reference_file) != toread) {
+    free(p);
+    return;
+  }
+
+  int stride, cstride;
+  const uint8_t* yptr  = de265_get_image_plane(img,0, &stride);
+  const uint8_t* cbptr = de265_get_image_plane(img,1, &cstride);
+  const uint8_t* crptr = de265_get_image_plane(img,2, &cstride);
+
+  // The reference planes follow each other without padding:
+  // Y at offset 0, Cb at width*height, Cr at width*height*5/4.
+  double img_mse_y  = MSE( yptr,  stride, p, width,   width, height);
+  double img_mse_cb = MSE(cbptr, cstride, p+width*height,      width/2, width/2,height/2);
+  double img_mse_cr = MSE(crptr, cstride, p+width*height*5/4,  width/2, width/2,height/2);
+
+  mse_frames++;
+
+  mse_y  += img_mse_y;
+  mse_cb += img_mse_cb;
+  mse_cr += img_mse_cr;
+
+
+
+  // --- compute SSIM ---
+
+  double ssimSum = 0.0;
+
+#if HAVE_VIDEOGFX
+  Bitmap<Pixel> ref, coded;
+  ref  .Create(width, height); // reference image
+  coded.Create(width, height); // coded image
+
+  const uint8_t* data;
+  data = de265_get_image_plane(img,0,&stride);
+
+  for (int y=0;y<height;y++) {
+    // Decoded rows are 'stride' bytes apart, but the reference frame was read
+    // as a tightly packed buffer, so its rows are 'width' bytes apart.
+    memcpy(coded[y], data + y*stride, width);
+    memcpy(ref[y],   p    + y*width,  width);
+  }
+
+  SSIM ssimAlgo;
+  Bitmap<float> ssim = ssimAlgo.calcSSIM(ref,coded);
+
+  Bitmap<Pixel> ssimMap;
+  ssimMap.Create(width,height);
+
+  for (int y=0;y<height;y++)
+    for (int x=0;x<width;x++)
+      {
+        float v = ssim[y][x];
+        ssimSum += v;
+
+        // squared and scaled by 255 for the displayed map only
+        v = v*v;
+        v = 255*v; //pow(v, 20);
+
+        //assert(v<=255.0);
+        ssimMap[y][x] = v;
+      }
+
+  ssimSum /= width*height;
+
+
+  Bitmap<Pixel> error_map = CalcErrorMap(ref, coded, TransferCurve_Sqrt);
+
+
+  // display PSNR error map
+
+  if (show_psnr_map) {
+    static X11Win win;
+    static bool first=true;
+
+    // the window is created once, when the first frame is shown
+    if (first) {
+      first=false;
+      win.Create(de265_get_image_width(img,0),
+                 de265_get_image_height(img,0),
+                 "psnr output");
+    }
+
+    win.Display(MakeImage(error_map));
+  }
+
+
+  // display SSIM error map
+
+  if (show_ssim_map) {
+    static X11Win win;
+    static bool first=true;
+
+    // the window is created once, when the first frame is shown
+    if (first) {
+      first=false;
+      win.Create(de265_get_image_width(img,0),
+                 de265_get_image_height(img,0),
+                 "ssim output");
+    }
+
+    win.Display(MakeImage(ssimMap));
+  }
+#endif
+
+  ssim_frames++;
+  ssim_y += ssimSum;
+
+  printf("%5d   %6f %6f %6f %6f\n",
+         framecnt,
+         PSNR(img_mse_y), PSNR(img_mse_cb), PSNR(img_mse_cr),
+         ssimSum);
+
+  free(p);
+}
+
+
 #ifdef WIN32
 #include <time.h>
 #define WIN32_LEAN_AND_MEAN
@@ -299,12 +537,13 @@ void (*volatile __malloc_initialize_hook)(void) = init_my_hooks;
 #endif
 #endif
 
+
 int main(int argc, char** argv)
 {
   while (1) {
     int option_index = 0;
 
-    int c = getopt_long(argc, argv, "qt:chf:o:dLB:n0vT:"
+    int c = getopt_long(argc, argv, "qt:chf:o:dLB:n0vT:m:se"
 #if HAVE_VIDEOGFX && HAVE_SDL
                         "V"
 #endif
@@ -313,7 +552,7 @@ int main(int argc, char** argv)
       break;
 
     switch (c) {
-    case 'q': quiet=true; break;
+    case 'q': quiet++; break;
     case 't': nThreads=atoi(optarg); break;
     case 'c': check_hash=true; break;
     case 'f': max_frames=atoi(optarg); break;
@@ -325,6 +564,9 @@ int main(int argc, char** argv)
     case 'L': logging=false; break;
     case '0': no_acceleration=true; break;
     case 'B': write_bytestream=true; bytestream_filename=optarg; break;
+    case 'm': measure_quality=true; reference_filename=optarg; break;
+    case 's': show_ssim_map=true; break;
+    case 'e': show_psnr_map=true; break;
     case 'T': highestTID=atoi(optarg); break;
     case 'v': verbosity++; break;
     }
@@ -348,8 +590,14 @@ int main(int argc, char** argv)
     fprintf(stderr,"  -V, --videogfx    output with videogfx instead of SDL\n");
 #endif
     fprintf(stderr,"  -0, --noaccel     do not use any accelerated code (SSE)\n");
+    fprintf(stderr,"  -v, --verbose     increase verbosity level (up to 3 times)\n");
     fprintf(stderr,"  -L, --no-logging  disable logging\n");
     fprintf(stderr,"  -B, --write-bytestream FILENAME  write raw bytestream (from NAL input)\n");
+    fprintf(stderr,"  -m, --measure YUV compute PSNRs relative to reference YUV\n");
+#if HAVE_VIDEOGFX
+    fprintf(stderr,"  -s, --ssim        show SSIM-map (only when -m active)\n");
+    fprintf(stderr,"  -e, --errmap      show error-map (only when -m active)\n");
+#endif
     fprintf(stderr,"  -T, --highest-TID select highest temporal sublayer to decode\n");
     fprintf(stderr,"      --disable-deblocking   disable deblocking filter\n");
     fprintf(stderr,"      --disable-sao          disable sample-adaptive offset filter\n");
@@ -396,6 +644,10 @@ int main(int argc, char** argv)
   de265_set_limit_TID(ctx, highestTID);
 
 
+  if (measure_quality) {
+    reference_file = fopen(reference_filename, "rb");
+  }
+
 
   FILE* fh = fopen(argv[optind], "rb");
   if (fh==NULL) {
@@ -484,6 +736,8 @@ int main(int argc, char** argv)
 
           err = de265_decode(ctx, &more);
           if (err != DE265_OK) {
+            // if (quiet<=1) fprintf(stderr,"ERROR: %s\n", de265_get_error_text(err));
+
             if (check_hash && err == DE265_ERROR_CHECKSUM_MISMATCH)
               stop = 1;
             more = 0;
@@ -494,6 +748,10 @@ int main(int argc, char** argv)
 
           const de265_image* img = de265_get_next_picture(ctx);
           if (img) {
+            if (measure_quality) {
+              measure(img);
+            }
+
             stop = output_image(img);
             if (stop) more=0;
             else      more=1;
@@ -507,7 +765,7 @@ int main(int argc, char** argv)
               break;
             }
 
-            fprintf(stderr,"WARNING: %s\n", de265_get_error_text(warning));
+            if (quiet<=1) fprintf(stderr,"WARNING: %s\n", de265_get_error_text(warning));
           }
         }
     }
@@ -518,20 +776,30 @@ int main(int argc, char** argv)
     fclose(bytestream_fh);
   }
 
+  if (measure_quality) {
+    printf("#total  %6f %6f %6f %6f\n",
+           PSNR(mse_y /mse_frames),
+           PSNR(mse_cb/mse_frames),
+           PSNR(mse_cr/mse_frames),
+           ssim_y/ssim_frames);
+
+    fclose(reference_file);
+  }
+
   de265_free_decoder(ctx);
 
   struct timeval tv_end;
   gettimeofday(&tv_end, NULL);
 
   if (err != DE265_OK) {
-    fprintf(stderr,"decoding error: %s (code=%d)\n", de265_get_error_text(err), err);
+    if (quiet<=1) fprintf(stderr,"decoding error: %s (code=%d)\n", de265_get_error_text(err), err);
   }
 
   double secs = tv_end.tv_sec-tv_start.tv_sec;
   secs += (tv_end.tv_usec - tv_start.tv_usec)*0.001*0.001;
 
-  fprintf(stderr,"nFrames decoded: %d (%dx%d @ %5.2f fps)\n",framecnt,
-          width,height,framecnt/secs);
+  if (quiet<=1) fprintf(stderr,"nFrames decoded: %d (%dx%d @ %5.2f fps)\n",framecnt,
+                        width,height,framecnt/secs);
 
 
   return err==DE265_OK ? 0 : 10;
diff --git a/dec265/hdrcopy.cc b/dec265/hdrcopy.cc
new file mode 100644
index 0000000..3a901f1
--- /dev/null
+++ b/dec265/hdrcopy.cc
@@ -0,0 +1,118 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include <assert.h>
+
+error_queue errqueue;
+
+video_parameter_set vps;
+seq_parameter_set   sps;
+pic_parameter_set   pps;
+
+CABAC_encoder_bitstream writer;
+
+
+/* Handle one NAL unit taken from the parser queue.
+ *
+ * The NAL header is read from the unit and written to the global 'writer'
+ * behind a start code, and a one-line summary is printed to stdout.
+ * VPS and SPS payloads are then read, dumped and written back through
+ * 'writer'. For all other NAL types only the start code and header are
+ * emitted; their payload handling is not implemented here.
+ */
+void process_nal(NAL_unit* nal)
+{
+  bitreader br;
+  bitreader_init(&br, nal->data(), nal->size());
+
+  nal_header hdr;
+  hdr.read(&br);
+
+  writer.write_startcode();
+  hdr.write(writer);
+
+  printf("NAL: 0x%x 0x%x -  unit type:%s temporal id:%d\n",
+         nal->data()[0], nal->data()[1],
+         get_NAL_name(hdr.nal_unit_type),
+         hdr.nuh_temporal_id);
+
+
+  // NAL types below 32: slice decoding (read_slice_NAL) is not wired up
+  if (hdr.nal_unit_type<32) {
+    return;
+  }
+
+  switch (hdr.nal_unit_type) {
+  case NAL_UNIT_VPS_NUT:
+    // parse, dump and re-encode the video parameter set
+    // NOTE(review): dump(1) presumably selects stdout - confirm
+    vps.read(&errqueue, &br);
+    vps.dump(1);
+    vps.write(&errqueue, writer);
+    writer.flush_VLC();
+    break;
+
+  case NAL_UNIT_SPS_NUT:
+    // same round trip for the sequence parameter set
+    sps.read(&errqueue, &br);
+    sps.dump(1);
+    sps.write(&errqueue, writer);
+    writer.flush_VLC();
+    break;
+
+  case NAL_UNIT_PPS_NUT:
+    // PPS parsing (read_pps_NAL) is not wired up
+    break;
+
+  case NAL_UNIT_PREFIX_SEI_NUT:
+  case NAL_UNIT_SUFFIX_SEI_NUT:
+    // SEI parsing (read_sei_NAL) is not wired up
+    break;
+
+  case NAL_UNIT_EOS_NUT:
+    // end-of-sequence handling (FirstAfterEndOfSequenceNAL) is not wired up
+    break;
+  }
+}
+
+
+/* Copy the headers of an H.265 bitstream.
+ *
+ * Reads the file named by argv[1] in 1024-byte pieces, feeds them to a
+ * NAL_Parser, passes every NAL unit that becomes available to process_nal()
+ * and finally stores everything collected in the global 'writer' as
+ * "out.bin" in the current directory.
+ *
+ * Returns 0 on success and 1 when the input argument is missing or one of
+ * the files cannot be opened.
+ *
+ * NOTE(review): data still buffered inside the parser when the input ends is
+ * not flushed here, so a trailing NAL may be missing from the output -
+ * confirm whether the parser offers a flush call for this.
+ */
+int main(int argc, char** argv)
+{
+  if (argc < 2) {
+    fprintf(stderr,"usage: hdrcopy INPUT_FILE\n");
+    return 1;
+  }
+
+  FILE* fh = fopen(argv[1],"rb");
+  if (fh == NULL) {
+    fprintf(stderr,"cannot open input file %s\n", argv[1]);
+    return 1;
+  }
+
+  NAL_Parser nal_parser;
+  unsigned char buf[1024];
+
+  writer.write_bits(0,8); // because HM has an extra byte at the beginning
+
+  while(!feof(fh))
+    {
+      int n = fread(buf,1,sizeof(buf),fh);
+      if (n>0) {
+        nal_parser.push_data(buf,n, 0);
+      }
+      else if (ferror(fh)) {
+        // a read error never sets the EOF flag; stop instead of spinning
+        break;
+      }
+
+      // One chunk can complete several NAL units; drain the queue so that
+      // none is left behind when the input ends.
+      while (nal_parser.get_NAL_queue_length()>0) {
+        NAL_unit* nal = nal_parser.pop_from_NAL_queue();
+        assert(nal);
+        process_nal(nal);
+        nal_parser.free_NAL_unit(nal);
+      }
+    }
+
+  fclose(fh);
+
+  fh = fopen("out.bin","wb");
+  if (fh == NULL) {
+    fprintf(stderr,"cannot open output file out.bin\n");
+    return 1;
+  }
+
+  fwrite(writer.data(), 1,writer.size(), fh);
+  fclose(fh);
+
+  return 0;
+}
diff --git a/dec265/sdl.cc b/dec265/sdl.cc
index abcfbfe..092478d 100644
--- a/dec265/sdl.cc
+++ b/dec265/sdl.cc
@@ -19,15 +19,17 @@
  */
 
 #include "sdl.hh"
+#include <assert.h>
 
 
-bool SDL_YUV_Display::init(int frame_width, int frame_height)
+bool SDL_YUV_Display::init(int frame_width, int frame_height, enum SDL_Chroma chroma)
 {
   // reduce image size to a multiple of 8 (apparently required by YUV overlay)
 
   frame_width  &= ~7;
   frame_height &= ~7;
 
+  mChroma = chroma;
 
   if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_NOPARACHUTE) < 0 ) {
     printf("SDL_Init() failed: %s\n", SDL_GetError( ) );
@@ -62,7 +64,16 @@ bool SDL_YUV_Display::init(int frame_width, int frame_height)
     return false;
   }
 
-  mYUVOverlay = SDL_CreateYUVOverlay(frame_width, frame_height, SDL_YV12_OVERLAY, mScreen);
+  uint32_t pixelFormat;
+  switch (mChroma) {
+  case SDL_CHROMA_MONO: pixelFormat = SDL_YV12_OVERLAY; break;
+  case SDL_CHROMA_420:  pixelFormat = SDL_YV12_OVERLAY; break;
+  case SDL_CHROMA_422:  pixelFormat = SDL_YUY2_OVERLAY; break;
+    case SDL_CHROMA_444:  pixelFormat = SDL_YV12_OVERLAY; break;
+      //case SDL_CHROMA_444:  pixelFormat = SDL_YUY2_OVERLAY; break;
+  }
+
+  mYUVOverlay = SDL_CreateYUVOverlay(frame_width, frame_height, pixelFormat, mScreen);
   if (mYUVOverlay == NULL ) {
     printf("SDL: Couldn't create SDL YUV overlay: %s",SDL_GetError());
     SDL_Quit();
@@ -87,6 +98,31 @@ void SDL_YUV_Display::display(const unsigned char *Y,
   if (!mWindowOpen) return;
   if (SDL_LockYUVOverlay(mYUVOverlay) < 0) return;
 
+  if (mChroma == SDL_CHROMA_420) {
+    display420(Y,U,V,stride,chroma_stride);
+  }
+  else if (mChroma == SDL_CHROMA_422) {
+    display422(Y,U,V,stride,chroma_stride);
+  }
+  else if (mChroma == SDL_CHROMA_444) {
+    display444as420(Y,U,V,stride,chroma_stride);
+    //display444as422(Y,U,V,stride,chroma_stride);
+  }
+  else if (mChroma == SDL_CHROMA_MONO) {
+    display400(Y,stride);
+  }
+
+  SDL_UnlockYUVOverlay(mYUVOverlay);
+
+  SDL_DisplayYUVOverlay(mYUVOverlay, &rect);
+}
+
+
+void SDL_YUV_Display::display420(const unsigned char *Y,
+                                 const unsigned char *U,
+                                 const unsigned char *V,
+                                 int stride, int chroma_stride)
+{
   if (stride == rect.w && chroma_stride == rect.w/2) {
 
     // fast copy
@@ -109,12 +145,111 @@ void SDL_YUV_Display::display(const unsigned char *Y,
         memcpy(mYUVOverlay->pixels[1]+y*rect.w/2, V+chroma_stride*y, rect.w/2);
       }
   }
+}
 
-  SDL_UnlockYUVOverlay(mYUVOverlay);
 
-  SDL_DisplayYUVOverlay(mYUVOverlay, &rect);
+// Show a monochrome picture: the luma plane is copied into overlay plane 0
+// and both chroma planes (rect.w*rect.h/4 bytes each) are filled with 0x80.
+// Overlay rows are addressed with a pitch of rect.w.
+void SDL_YUV_Display::display400(const unsigned char *Y, int stride)
+{
+  unsigned char* luma = mYUVOverlay->pixels[0];
+
+  if (stride != rect.w) {
+    // source rows are padded: copy the visible part of each row separately
+    for (int row=0; row<rect.h; row++) {
+      memcpy(luma + row*rect.w, Y + stride*row, rect.w);
+    }
+  }
+  else {
+    // source rows are contiguous: one copy for the whole plane
+    memcpy(luma, Y, rect.w * rect.h);
+  }
+
+  // constant chroma for both planes
+  const int chromaBytes = rect.w * rect.h / 4;
+
+  memset(mYUVOverlay->pixels[1], 0x80, chromaBytes);
+  memset(mYUVOverlay->pixels[2], 0x80, chromaBytes);
+}
+
+
+// Interleave planar 4:2:2 input into overlay plane 0.
+// Two horizontally adjacent luma samples share one U and one V sample; the
+// output byte order is Y0 U Y1 V, i.e. two bytes per pixel and rect.w*2
+// bytes per overlay row.
+void SDL_YUV_Display::display422(const unsigned char *Y,
+                                 const unsigned char *U,
+                                 const unsigned char *V,
+                                 int stride, int chroma_stride)
+{
+  for (int row=0; row<rect.h; row++) {
+    const unsigned char* yRow = Y + row*stride;
+    const unsigned char* uRow = U + row*chroma_stride;
+    const unsigned char* vRow = V + row*chroma_stride;
+
+    unsigned char* dst = mYUVOverlay->pixels[0] + row*rect.w *2;
+
+    for (int x=0; x<rect.w; x+=2) {
+      dst[2*x  ] = yRow[x];
+      dst[2*x+1] = uRow[x/2];
+      dst[2*x+2] = yRow[x+1];
+      dst[2*x+3] = vRow[x/2];
+    }
+  }
+}
+
+
+/* Show 4:4:4 input through the interleaved 4:2:2 layout, since SDL offers no
+   4:4:4 pixel format. Chroma is reduced by keeping the U and V samples at
+   even x positions and dropping the odd ones (no filtering). The output
+   byte order per pixel pair is Y0 U Y1 V.
+ */
+void SDL_YUV_Display::display444as422(const unsigned char *Y,
+                                      const unsigned char *U,
+                                      const unsigned char *V,
+                                      int stride, int chroma_stride)
+{
+  for (int line=0; line<rect.h; line++) {
+    const unsigned char* yLine = Y + line*stride;
+    const unsigned char* uLine = U + line*chroma_stride;
+    const unsigned char* vLine = V + line*chroma_stride;
+
+    unsigned char* out = mYUVOverlay->pixels[0] + line*rect.w *2;
+
+    // every iteration consumes two pixels and emits four bytes
+    for (int x=0; x<rect.w; x+=2, out+=4) {
+      out[0] = yLine[x];
+      out[1] = uLine[x];
+      out[2] = yLine[x+1];
+      out[3] = vLine[x];
+    }
+  }
 }
 
+
+/* Show 4:4:4 input on a planar 4:2:0 overlay.
+   Luma is copied row by row into overlay plane 0. Each output chroma sample
+   is the truncated average of the matching 2x2 block of input samples;
+   U is written to overlay plane 2 and V to overlay plane 1.
+ */
+void SDL_YUV_Display::display444as420(const unsigned char *Y,
+                                      const unsigned char *U,
+                                      const unsigned char *V,
+                                      int stride, int chroma_stride)
+{
+  // luma: plain copy of the visible part of every row
+  for (int row=0; row<rect.h; row++) {
+    memcpy(mYUVOverlay->pixels[0] + row*rect.w, Y+row*stride, rect.w);
+  }
+
+  // chroma: average 2x2 input samples into one output sample
+  // (plain subsampling, taking only the top-left sample, would also work)
+  for (int y=0; y<rect.h; y+=2) {
+    const int top    =  y   *chroma_stride;
+    const int bottom = (y+1)*chroma_stride;
+
+    unsigned char* uOut = mYUVOverlay->pixels[2] + y/2*rect.w/2;
+    unsigned char* vOut = mYUVOverlay->pixels[1] + y/2*rect.w/2;
+
+    for (int x=0; x<rect.w; x+=2) {
+      const int uSum = U[top + x] + U[top + x +1] + U[bottom + x] + U[bottom + x +1];
+      const int vSum = V[top + x] + V[top + x +1] + V[bottom + x] + V[bottom + x +1];
+
+      uOut[x/2] = uSum/4;
+      vOut[x/2] = vSum/4;
+    }
+  }
+}
+
+
 bool SDL_YUV_Display::doQuit() const
 {
   SDL_Event event;
diff --git a/dec265/sdl.hh b/dec265/sdl.hh
index 7b6bb4b..a01d267 100644
--- a/dec265/sdl.hh
+++ b/dec265/sdl.hh
@@ -25,7 +25,14 @@ class SDL_YUV_Display
 {
 public:
 
-  bool init(int frame_width, int frame_height);
+  // Chroma layout of the pictures handed to display(); selected once in init().
+  // The numeric values mirror the format names (400 = monochrome, 420, 422, 444).
+  enum SDL_Chroma {
+    SDL_CHROMA_MONO=400,
+    SDL_CHROMA_420 =420,
+    SDL_CHROMA_422 =422,
+    SDL_CHROMA_444 =444
+  };
+
+  bool init(int frame_width, int frame_height, enum SDL_Chroma chroma = SDL_CHROMA_420);
   void display(const unsigned char *Y, const unsigned char *U, const unsigned char *V,
                int stride, int chroma_stride);
   void close();
@@ -39,4 +46,25 @@ private:
   SDL_Overlay *mYUVOverlay;
   SDL_Rect     rect;
   bool         mWindowOpen;
+
+  SDL_Chroma mChroma;
+
+  void display400(const unsigned char *Y,
+                  int stride);
+  void display420(const unsigned char *Y,
+                  const unsigned char *U,
+                  const unsigned char *V,
+                  int stride, int chroma_stride);
+  void display422(const unsigned char *Y,
+                  const unsigned char *U,
+                  const unsigned char *V,
+                  int stride, int chroma_stride);
+  void display444as422(const unsigned char *Y,
+                       const unsigned char *U,
+                       const unsigned char *V,
+                       int stride, int chroma_stride);
+  void display444as420(const unsigned char *Y,
+                       const unsigned char *U,
+                       const unsigned char *V,
+                       int stride, int chroma_stride);
 };
diff --git a/enc265/CMakeLists.txt b/enc265/CMakeLists.txt
new file mode 100644
index 0000000..d451771
--- /dev/null
+++ b/enc265/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Source files of the enc265 command line tool.
+set (enc265_sources
+  enc265.cc
+)
+
+# With MSVC, the getopt sources from ../extra are compiled into the tool.
+if(MSVC)
+  set (enc265_sources
+    ${enc265_sources}
+    ../extra/getopt.c
+    ../extra/getopt_long.c
+  )
+endif()
+
+add_executable (enc265 ${enc265_sources})
+
+# Link against the library named by LIBDE265_LIBRARY_NAME (set outside this file).
+target_link_libraries (enc265 ${LIBDE265_LIBRARY_NAME})
diff --git a/enc265/Makefile.am b/enc265/Makefile.am
new file mode 100644
index 0000000..82f1594
--- /dev/null
+++ b/enc265/Makefile.am
@@ -0,0 +1,19 @@
+
+# Build and install the enc265 command line tool.
+bin_PROGRAMS = enc265
+
+AM_CPPFLAGS = -I../libde265
+
+# The tool is linked against the libtool library built in ../libde265.
+enc265_DEPENDENCIES = ../libde265/libde265.la
+enc265_CXXFLAGS =
+enc265_LDFLAGS =
+enc265_LDADD = ../libde265/libde265.la -lstdc++
+enc265_SOURCES = enc265.cc
+
+# Add the videogfx compiler and linker flags when HAVE_VIDEOGFX is set.
+if HAVE_VIDEOGFX
+  enc265_CXXFLAGS += $(VIDEOGFX_CFLAGS)
+  enc265_LDFLAGS += $(VIDEOGFX_LIBS)
+endif
+
+# Extra build files shipped in the distribution tarball.
+EXTRA_DIST = \
+  CMakeLists.txt \
+  Makefile.vc7
diff --git a/dec265/Makefile.in b/enc265/Makefile.in
similarity index 78%
copy from dec265/Makefile.in
copy to enc265/Makefile.in
index 090115b..5573cdf 100644
--- a/dec265/Makefile.in
+++ b/enc265/Makefile.in
@@ -79,23 +79,20 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-bin_PROGRAMS = dec265$(EXEEXT)
+bin_PROGRAMS = enc265$(EXEEXT)
 @HAVE_VIDEOGFX_TRUE at am__append_1 = $(VIDEOGFX_CFLAGS)
 @HAVE_VIDEOGFX_TRUE at am__append_2 = $(VIDEOGFX_LIBS)
- at HAVE_SDL_TRUE@am__append_3 = $(SDL_CFLAGS)
- at HAVE_SDL_TRUE@am__append_4 = $(SDL_LIBS)
- at HAVE_SDL_TRUE@am__append_5 = sdl.cc sdl.hh
- at MINGW_TRUE@am__append_6 = -static-libgcc -static-libstdc++
-subdir = dec265
+subdir = enc265
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/depcomp COPYING
+	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -104,17 +101,15 @@ CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 am__installdirs = "$(DESTDIR)$(bindir)"
 PROGRAMS = $(bin_PROGRAMS)
-am__dec265_SOURCES_DIST = dec265.cc sdl.cc sdl.hh
- at HAVE_SDL_TRUE@am__objects_1 = dec265-sdl.$(OBJEXT)
-am_dec265_OBJECTS = dec265-dec265.$(OBJEXT) $(am__objects_1)
-dec265_OBJECTS = $(am_dec265_OBJECTS)
+am_enc265_OBJECTS = enc265-enc265.$(OBJEXT)
+enc265_OBJECTS = $(am_enc265_OBJECTS)
 AM_V_lt = $(am__v_lt_ at AM_V@)
 am__v_lt_ = $(am__v_lt_ at AM_DEFAULT_V@)
 am__v_lt_0 = --silent
 am__v_lt_1 = 
-dec265_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(dec265_CXXFLAGS) \
-	$(CXXFLAGS) $(dec265_LDFLAGS) $(LDFLAGS) -o $@
+enc265_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(enc265_CXXFLAGS) \
+	$(CXXFLAGS) $(enc265_LDFLAGS) $(LDFLAGS) -o $@
 AM_V_P = $(am__v_P_ at AM_V@)
 am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -149,26 +144,8 @@ AM_V_CXXLD = $(am__v_CXXLD_ at AM_V@)
 am__v_CXXLD_ = $(am__v_CXXLD_ at AM_DEFAULT_V@)
 am__v_CXXLD_0 = @echo "  CXXLD   " $@;
 am__v_CXXLD_1 = 
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
-	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
-	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
-	$(AM_CFLAGS) $(CFLAGS)
-AM_V_CC = $(am__v_CC_ at AM_V@)
-am__v_CC_ = $(am__v_CC_ at AM_DEFAULT_V@)
-am__v_CC_0 = @echo "  CC      " $@;
-am__v_CC_1 = 
-CCLD = $(CC)
-LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
-	$(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_CCLD = $(am__v_CCLD_ at AM_V@)
-am__v_CCLD_ = $(am__v_CCLD_ at AM_DEFAULT_V@)
-am__v_CCLD_0 = @echo "  CCLD    " $@;
-am__v_CCLD_1 = 
-SOURCES = $(dec265_SOURCES)
-DIST_SOURCES = $(am__dec265_SOURCES_DIST)
+SOURCES = $(enc265_SOURCES)
+DIST_SOURCES = $(enc265_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -195,6 +172,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -203,9 +181,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -225,7 +205,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -337,16 +317,14 @@ top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 AM_CPPFLAGS = -I../libde265
-dec265_DEPENDENCIES = ../libde265/libde265.la
-dec265_CXXFLAGS = $(am__append_1) $(am__append_3)
-dec265_LDFLAGS = $(am__append_2) $(am__append_4) $(am__append_6)
-dec265_LDADD = ../libde265/libde265.la -lstdc++
-dec265_SOURCES = dec265.cc $(am__append_5)
-EXTRA_DIST = Makefile.vc7 \
+enc265_DEPENDENCIES = ../libde265/libde265.la
+enc265_CXXFLAGS = $(am__append_1)
+enc265_LDFLAGS = $(am__append_2)
+enc265_LDADD = ../libde265/libde265.la -lstdc++
+enc265_SOURCES = enc265.cc
+EXTRA_DIST = \
   CMakeLists.txt \
-  ../extra/getopt.c \
-  ../extra/getopt.h \
-  ../extra/getopt_long.c
+  Makefile.vc7
 
 all: all-am
 
@@ -361,9 +339,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dec265/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu enc265/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu dec265/Makefile
+	  $(AUTOMAKE) --gnu enc265/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -432,9 +410,9 @@ clean-binPROGRAMS:
 	echo " rm -f" $$list; \
 	rm -f $$list
 
-dec265$(EXEEXT): $(dec265_OBJECTS) $(dec265_DEPENDENCIES) $(EXTRA_dec265_DEPENDENCIES) 
-	@rm -f dec265$(EXEEXT)
-	$(AM_V_CXXLD)$(dec265_LINK) $(dec265_OBJECTS) $(dec265_LDADD) $(LIBS)
+enc265$(EXEEXT): $(enc265_OBJECTS) $(enc265_DEPENDENCIES) $(EXTRA_enc265_DEPENDENCIES) 
+	@rm -f enc265$(EXEEXT)
+	$(AM_V_CXXLD)$(enc265_LINK) $(enc265_OBJECTS) $(enc265_LDADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -442,8 +420,7 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/dec265-dec265.Po at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/dec265-sdl.Po at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/enc265-enc265.Po at am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -466,33 +443,19 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-dec265-dec265.o: dec265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -MT dec265-dec265.o -MD -MP -MF $(DEPDIR)/dec265-dec265.Tpo -c -o dec265-dec265.o `test -f 'dec265.cc' || echo '$(srcdir)/'`dec265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/dec265-dec265.Tpo $(DEPDIR)/dec265-dec265.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='dec265.cc' object='dec265-dec265.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -c -o dec265-dec265.o `test -f 'dec265.cc' || echo '$(srcdir)/'`dec265.cc
-
-dec265-dec265.obj: dec265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -MT dec265-dec265.obj -MD -MP -MF $(DEPDIR)/dec265-dec265.Tpo -c -o dec265-dec265.obj `if test -f 'dec265.cc'; then $(CYGPATH_W) 'dec265.cc'; else $(CYGPATH_W) '$(srcdir)/dec265.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/dec265-dec265.Tpo $(DEPDIR)/dec265-dec265.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='dec265.cc' object='dec265-dec265.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -c -o dec265-dec265.obj `if test -f 'dec265.cc'; then $(CYGPATH_W) 'dec265.cc'; else $(CYGPATH_W) '$(srcdir)/dec265.cc'; fi`
-
-dec265-sdl.o: sdl.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -MT dec265-sdl.o -MD -MP -MF $(DEPDIR)/dec265-sdl.Tpo -c -o dec265-sdl.o `test -f 'sdl.cc' || echo '$(srcdir)/'`sdl.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/dec265-sdl.Tpo $(DEPDIR)/dec265-sdl.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sdl.cc' object='dec265-sdl.o' libtool=no @AMDEPBACKSLASH@
+enc265-enc265.o: enc265.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(enc265_CXXFLAGS) $(CXXFLAGS) -MT enc265-enc265.o -MD -MP -MF $(DEPDIR)/enc265-enc265.Tpo -c -o enc265-enc265.o `test -f 'enc265.cc' || echo '$(srcdir)/'`enc265.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/enc265-enc265.Tpo $(DEPDIR)/enc265-enc265.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='enc265.cc' object='enc265-enc265.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -c -o dec265-sdl.o `test -f 'sdl.cc' || echo '$(srcdir)/'`sdl.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(enc265_CXXFLAGS) $(CXXFLAGS) -c -o enc265-enc265.o `test -f 'enc265.cc' || echo '$(srcdir)/'`enc265.cc
 
-dec265-sdl.obj: sdl.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -MT dec265-sdl.obj -MD -MP -MF $(DEPDIR)/dec265-sdl.Tpo -c -o dec265-sdl.obj `if test -f 'sdl.cc'; then $(CYGPATH_W) 'sdl.cc'; else $(CYGPATH_W) '$(srcdir)/sdl.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/dec265-sdl.Tpo $(DEPDIR)/dec265-sdl.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sdl.cc' object='dec265-sdl.obj' libtool=no @AMDEPBACKSLASH@
+enc265-enc265.obj: enc265.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(enc265_CXXFLAGS) $(CXXFLAGS) -MT enc265-enc265.obj -MD -MP -MF $(DEPDIR)/enc265-enc265.Tpo -c -o enc265-enc265.obj `if test -f 'enc265.cc'; then $(CYGPATH_W) 'enc265.cc'; else $(CYGPATH_W) '$(srcdir)/enc265.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/enc265-enc265.Tpo $(DEPDIR)/enc265-enc265.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='enc265.cc' object='enc265-enc265.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(dec265_CXXFLAGS) $(CXXFLAGS) -c -o dec265-sdl.obj `if test -f 'sdl.cc'; then $(CYGPATH_W) 'sdl.cc'; else $(CYGPATH_W) '$(srcdir)/sdl.cc'; fi`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(enc265_CXXFLAGS) $(CXXFLAGS) -c -o enc265-enc265.obj `if test -f 'enc265.cc'; then $(CYGPATH_W) 'enc265.cc'; else $(CYGPATH_W) '$(srcdir)/enc265.cc'; fi`
 
 mostlyclean-libtool:
 	-rm -f *.lo
diff --git a/dec265/Makefile.vc7 b/enc265/Makefile.vc7
similarity index 63%
copy from dec265/Makefile.vc7
copy to enc265/Makefile.vc7
index ff790eb..90ddd0b 100644
--- a/dec265/Makefile.vc7
+++ b/enc265/Makefile.vc7
@@ -2,21 +2,25 @@
 # Makefile for Microsoft Visual Studio 2003
 #
 CFLAGS=/I.. /I..\libde265 /I..\extra
-CC=cl /nologo 
+CC=cl /nologo
 LINK=link /nologo /subsystem:console
 DEFINES=/DWIN32
 
 CFLAGS=$(CFLAGS) /MT /Ob2 /Oi /W4 /EHsc
 CFLAGS=$(CFLAGS) $(DEFINES)
 
+# unreferenced formal parameter
+CFLAGS=$(CFLAGS) /wd4100
+
+
 OBJS=\
 	..\extra\getopt_long.obj \
 	..\extra\getopt.obj \
-	dec265.obj
+	enc265.obj
 
-all: dec265.exe
+all: enc265.exe
 
-dec265.obj: dec265.cc
+enc265.obj: enc265.cc
 	$(CC) /c $*.cc /Fo$*.obj /TP $(CFLAGS)
 
 .c.obj:
@@ -25,9 +29,9 @@ dec265.obj: dec265.cc
 .cc.obj:
 	$(CC) /c $*.cc /Fo$*.obj $(CFLAGS)
 
-dec265.exe: $(OBJS) ..\libde265\libde265.lib
-	$(LINK) /out:dec265.exe $** ..\libde265\libde265.lib
+enc265.exe: $(OBJS) ..\libde265\libde265.lib
+	$(LINK) /out:enc265.exe $** ..\libde265\libde265.lib
 
 clean:
-	del dec265.exe
+	del enc265.exe
 	del $(OBJS)
diff --git a/enc265/enc265.cc b/enc265/enc265.cc
new file mode 100644
index 0000000..50d208c
--- /dev/null
+++ b/enc265/enc265.cc
@@ -0,0 +1,322 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/en265.h" //coder-context.h"
+
+#include "libde265/configparam.h"
+#include "libde265/image-io.h"
+#include "libde265/encoder/analyze.h"
+#include "libde265/util.h"
+
+#include <getopt.h>
+
+
+
+#if HAVE_VIDEOGFX
+#include <libvideogfx.hh>
+using namespace videogfx;
+// Debug helper: copies the three planes of `input` into a libvideogfx image and displays it; `slot` is unused.
+void debug_show_image_libvideogfx(const de265_image* input, int slot)
+{
+    static X11Win debugwin;       // one window shared by all calls
+    static bool opened=false;     // window is created lazily on the first call
+    int w = input->get_width();
+    int h = input->get_height();
+    if (!opened) {
+      opened=true;
+      debugwin.Create(w,h, "debug");   // sized from the first image only; never resized afterwards
+    }
+
+    Image<Pixel> img;
+    img.Create(w,h,Colorspace_YUV, Chroma_420);
+
+    for (int y=0;y<h;y++)   // plane 0 -> Y; copies w bytes per row (assumes 1 byte per sample — TODO confirm)
+      memcpy(img.AskFrameY()[y], input->get_image_plane_at_pos(0,0,y), w);
+
+    for (int y=0;y<h/2;y++) {   // planes 1 and 2 -> U and V at half width and half height
+      memcpy(img.AskFrameU()[y], input->get_image_plane_at_pos(1,0,y), w/2);
+      memcpy(img.AskFrameV()[y], input->get_image_plane_at_pos(2,0,y), w/2);
+    }
+
+    debugwin.Display(img);
+    //debugwin.WaitForKeypress();
+}
+#endif
+
+
+// Generic command-line state, filled in by the getopt_long() loop in main().
+int show_help=false;   // getopt_long stores 1 here when --help is seen (flag/val pair in the table below)
+int verbosity=0;       // incremented once for every 'v' returned by getopt_long
+
+static struct option long_options[] = {
+  {"help",       no_argument,       &show_help, 1 },   // long-only: "h" is not in main()'s short-option string
+  {"verbose",    no_argument,       0, 'v' },          // returned to the caller as 'v'
+  {0,            0,                 0,  0 }            // all-zero terminator entry
+};
+
+// Input/output options of the enc265 command-line tool.
+struct inout_params
+{
+  inout_params();   // assigns IDs, short options, defaults and minimums
+
+  // input
+
+  option_int first_frame;            // passed to image_source.skip_frames() in main before encoding starts
+  option_int max_number_of_frames;   // upper bound of main's frame loop; no bound when left undefined
+
+  option_string input_yuv;           // file name handed to ImageSource_YUV in main
+  option_int input_width;
+  option_int input_height;
+
+  // output
+
+  option_string output_filename;     // file name handed to PacketSink_File in main
+
+  // debug
+
+  option_string reconstruction_yuv;  // file name handed to ImageSink_YUV in main
+
+
+  void register_params(config_parameters& config);   // adds the options to `config`
+};
+
+// Gives every in/out option its ID, optional short option, default value and minimum.
+inout_params::inout_params()
+{
+  input_yuv.set_ID("input"); input_yuv.set_short_option('i');
+  input_yuv.set_default("paris_cif.yuv");
+
+  output_filename.set_ID("output"); output_filename.set_short_option('o');
+  output_filename.set_default("out.bin");
+
+  reconstruction_yuv.set_ID("recon");   // was "input", a copy-paste duplicate of input_yuv's ID
+  reconstruction_yuv.set_default("recon.yuv");
+
+  first_frame.set_ID("first-frame");
+  first_frame.set_default(0);
+  first_frame.set_minimum(0);
+
+  max_number_of_frames.set_ID("frames");
+  max_number_of_frames.set_short_option('f');
+  max_number_of_frames.set_minimum(1);   // deliberately no default: main tests is_defined()
+  //max_number_of_frames.set_default(INT_MAX);
+
+  input_width.set_ID("width"); input_width.set_short_option('w');
+  input_width.set_minimum(1);  input_width.set_default(352);
+
+  input_height.set_ID("height"); input_height.set_short_option('h');
+  input_height.set_minimum(1); input_height.set_default(288);
+}
+
+// Adds the in/out options to `config`; note that reconstruction_yuv is not registered here.
+void inout_params::register_params(config_parameters& config)
+{
+  config.add_option(&input_yuv);
+  config.add_option(&output_filename);
+  config.add_option(&first_frame);
+  config.add_option(&max_number_of_frames);
+  config.add_option(&input_width);
+  config.add_option(&input_height);
+}
+
+// Prints one line per encoder parameter: |name| (type) and, for choice parameters, the allowed values.
+void test_parameters_API(en265_encoder_context* ectx)
+{
+  const char** param = en265_list_parameters(ectx);
+  if (param) {
+    for (int i=0; param[i]; i++) {   // the list is walked until its first NULL entry
+      printf("|%s| ",param[i]);
+
+      enum en265_parameter_type type = en265_get_parameter_type(ectx, param[i]);
+      const char* type_name="unknown";   // kept for any type value not handled by the switch
+      switch (type) {
+      case en265_parameter_int: type_name="int"; break;
+      case en265_parameter_bool: type_name="bool"; break;
+      case en265_parameter_string: type_name="string"; break;
+      case en265_parameter_choice: type_name="choice"; break;
+      }
+
+      printf("(%s)",type_name);
+
+      if (type==en265_parameter_choice) {
+        const char** choices = en265_list_parameter_choices(ectx, param[i]);
+        if (choices) {
+          for (int k=0; choices[k]; k++) {   // also walked until the first NULL entry
+            printf(" %s",choices[k]);
+          }
+        }
+      }
+
+      printf("\n");
+    }
+  }
+
+  // en265_set_parameter_int(ectx, "min-tb-size", 8);
+}
+
+
+extern int skipTBSplit, noskipTBSplit;
+extern int zeroBlockCorrelation[6][2][5];
+// enc265 entry point: parses options, then pushes raw YUV frames into the encoder and writes its packets to a file.
+int main(int argc, char** argv)
+{
+  de265_init();
+
+  en265_encoder_context* ectx = en265_new_encoder();
+
+
+  bool cmdline_errors = false;   // set by either parser below; forces the usage text
+
+  // --- in/out parameters ---
+
+  struct inout_params inout_params;
+  config_parameters inout_param_config;
+  inout_params.register_params(inout_param_config);
+
+  int first_idx=1;
+  if (!inout_param_config.parse_command_line_params(&argc,argv, &first_idx, true)) {
+    cmdline_errors = true;
+  }
+
+
+  // --- read encoder parameters ---
+
+  if (en265_parse_command_line_parameters(ectx, &argc, argv) != DE265_OK) {
+    cmdline_errors = true;
+  }
+
+
+  // --- generic options handled through long_options ---
+  while (1) {
+    int option_index = 0;
+
+    int c = getopt_long(argc, argv, "v"
+                        , long_options, &option_index);
+    if (c == -1)
+      break;
+
+    switch (c) {
+    case 'v': verbosity++; break;
+    }
+  }
+
+
+  // --- show usage information ---
+
+  if (optind != argc || cmdline_errors || show_help) {   // leftover arguments, a parse error, or --help
+    fprintf(stderr," enc265  v%s\n", de265_get_version());
+    fprintf(stderr,"--------------\n");
+    fprintf(stderr,"usage: enc265 [options]\n");
+    fprintf(stderr,"The video file must be a raw YUV file\n");
+    fprintf(stderr,"\n");
+    fprintf(stderr,"options:\n");
+    fprintf(stderr,"      --help         show help\n");
+    fprintf(stderr,"  -v, --verbose      increase verbosity level (up to 3 times)\n");
+
+    inout_param_config.print_params();
+    fprintf(stderr,"\n");
+    en265_show_parameters(ectx);
+
+    exit(show_help ? 0 : 5);   // status 0 when help was requested, 5 otherwise
+  }
+
+
+
+  de265_set_verbosity(verbosity);
+#if HAVE_VIDEOGFX
+  //debug_set_image_output(debug_show_image_libvideogfx);
+#endif
+
+  //test_parameters_API(ectx);
+
+  // --- set up reconstruction sink, YUV source and packet sink ---
+  ImageSink_YUV reconstruction_sink;
+  if (strlen(inout_params.reconstruction_yuv.get().c_str()) != 0) {
+    reconstruction_sink.set_filename(inout_params.reconstruction_yuv.get().c_str());
+    //ectx.reconstruction_sink = &reconstruction_sink;
+  }
+
+  ImageSource_YUV image_source;
+  image_source.set_input_file(inout_params.input_yuv.get().c_str(),
+                              inout_params.input_width,
+                              inout_params.input_height);
+
+  PacketSink_File packet_sink;
+  packet_sink.set_filename(inout_params.output_filename.get().c_str());
+
+
+  // --- run encoder ---
+
+  image_source.skip_frames( inout_params.first_frame );
+
+  en265_start_encoder(ectx, 0);
+
+  int maxPoc = INT_MAX;   // no frame limit unless max_number_of_frames was set
+  if (inout_params.max_number_of_frames.is_defined()) {
+    maxPoc = inout_params.max_number_of_frames;
+  }
+  // NOTE(review): when the loop ends because poc reached maxPoc, en265_push_eof() is never called — confirm this is OK
+  bool eof = false;
+  for (int poc=0; poc<maxPoc && !eof ;poc++)
+    {
+      // push one image into the encoder
+
+      de265_image* input_image = image_source.get_image();
+      if (input_image==NULL) {   // the source delivered no further image
+        en265_push_eof(ectx);
+        eof=true;
+      }
+      else {
+        en265_push_image(ectx, input_image);
+      }
+
+
+
+      // encode images while more are available
+
+      en265_encode(ectx);
+
+
+      // write all pending packets
+
+      for (;;) {
+        en265_packet* pck = en265_get_packet(ectx,0);
+        if (pck==NULL)
+          break;
+
+        packet_sink.send_packet(pck->data, pck->length);
+
+        en265_free_packet(ectx,pck);
+      }
+    }
+
+
+
+  // --- print statistics ---
+
+  en265_print_logging((encoder_context*)ectx, "tb-split", NULL);
+
+
+  en265_free_encoder(ectx);
+
+  de265_free();
+
+  return 0;
+}
diff --git a/extra/win32cond.c b/extra/win32cond.c
index 390bb81..46c2198 100644
--- a/extra/win32cond.c
+++ b/extra/win32cond.c
@@ -51,6 +51,8 @@ int win32_cond_destroy(win32_cond_t *cv)
 
 int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex)
 {
+  int last_waiter;
+
   // Avoid race conditions.
   EnterCriticalSection (&cv->waiters_count_lock_);
   cv->waiters_count_++;
@@ -68,7 +70,7 @@ int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex)
   cv->waiters_count_--;
 
   // Check to see if we're the last waiter after <pthread_cond_broadcast>.
-  int last_waiter = cv->was_broadcast_ && cv->waiters_count_ == 0;
+  last_waiter = cv->was_broadcast_ && cv->waiters_count_ == 0;
 
   LeaveCriticalSection (&cv->waiters_count_lock_);
 
@@ -87,8 +89,10 @@ int win32_cond_wait(win32_cond_t *cv, HANDLE *external_mutex)
 
 int win32_cond_signal(win32_cond_t *cv)
 {
+  int have_waiters;
+
   EnterCriticalSection (&cv->waiters_count_lock_);
-  int have_waiters = cv->waiters_count_ > 0;
+  have_waiters = cv->waiters_count_ > 0;
   LeaveCriticalSection (&cv->waiters_count_lock_);
 
   // If there aren't any waiters, then this is a no-op.  
@@ -99,10 +103,11 @@ int win32_cond_signal(win32_cond_t *cv)
 
 int win32_cond_broadcast(win32_cond_t *cv)
 {
+  int have_waiters = 0;
+
   // This is needed to ensure that <waiters_count_> and <was_broadcast_> are
   // consistent relative to each other.
   EnterCriticalSection (&cv->waiters_count_lock_);
-  int have_waiters = 0;
 
   if (cv->waiters_count_ > 0) {
     // We are broadcasting, even if there is just one waiter...
diff --git a/libde265/CMakeLists.txt b/libde265/CMakeLists.txt
new file mode 100644
index 0000000..9a9916f
--- /dev/null
+++ b/libde265/CMakeLists.txt
@@ -0,0 +1,84 @@
+set (libde265_sources 
+  bitstream.cc
+  cabac.cc
+  de265.cc
+  deblock.cc
+  decctx.cc
+  nal-parser.cc
+  nal-parser.h
+  dpb.cc
+  dpb.h
+  image.cc
+  intrapred.cc
+  md5.cc
+  nal.cc
+  pps.cc
+  transform.cc
+  refpic.cc
+  sao.cc
+  scan.cc
+  sei.cc
+  slice.cc
+  sps.cc
+  util.cc
+  vps.cc
+  bitstream.h
+  cabac.h
+  deblock.h
+  decctx.h
+  image.h
+  intrapred.h
+  md5.h
+  nal.h
+  pps.h
+  transform.h
+  refpic.h
+  sao.h
+  scan.h
+  sei.h
+  slice.h
+  sps.h
+  util.h
+  vps.h
+  vui.h vui.cc
+  motion.cc motion.h
+  threads.cc threads.h
+  visualize.cc visualize.h
+  acceleration.h
+  fallback.cc fallback.h fallback-motion.cc fallback-motion.h
+  fallback-dct.h fallback-dct.cc
+  quality.cc quality.h
+  configparam.cc configparam.h
+  image-io.h image-io.cc
+  alloc_pool.h alloc_pool.cc
+  en265.h en265.cc
+  contextmodel.cc
+)
+# MSVC builds additionally compile the bundled win32cond sources.
+if(MSVC)
+  set (libde265_sources
+    ${libde265_sources}
+    ../extra/win32cond.c
+    ../extra/win32cond.h
+  )
+endif()
+
+add_definitions(-DLIBDE265_EXPORTS)
+
+add_library(${LIBDE265_LIBRARY_NAME} SHARED ${libde265_sources})  # always built as a shared library
+
+target_link_libraries(${LIBDE265_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  SET_TARGET_PROPERTIES(${LIBDE265_LIBRARY_NAME} PROPERTIES COMPILE_FLAGS "-fPIC")
+endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+
+add_subdirectory (encoder)
+
+target_link_libraries(${LIBDE265_LIBRARY_NAME} encoder)  # `encoder` is presumably a target defined in encoder/ — confirm
+
+# Optional SSE4.1 code: defined, built and linked only when SUPPORTS_SSE4_1 is set.
+if(SUPPORTS_SSE4_1)
+  add_definitions(-DHAVE_SSE4_1)
+  add_subdirectory (x86)
+  target_link_libraries(${LIBDE265_LIBRARY_NAME} x86)
+endif()
diff --git a/libde265/Makefile.am b/libde265/Makefile.am
index ea40c75..007c7ca 100644
--- a/libde265/Makefile.am
+++ b/libde265/Makefile.am
@@ -22,65 +22,95 @@ libde265_la_LDFLAGS = -version-info $(LIBDE265_CURRENT):$(LIBDE265_REVISION):$(L
 libde265_la_LIBADD = -lstdc++
 
 libde265_la_SOURCES = \
+  acceleration.h \
+  alloc_pool.h \
+  alloc_pool.cc \
   bitstream.cc \
+  bitstream.h \
   cabac.cc \
+  cabac.h \
+  configparam.cc \
+  configparam.h \
+  contextmodel.cc \
+  contextmodel.h \
   de265.cc \
   deblock.cc \
+  deblock.h \
   decctx.cc \
-  nal-parser.cc \
-  nal-parser.h \
+  decctx.h \
+  en265.h \
+  en265.cc \
+  fallback.cc \
+  fallback.h \
+  fallback-dct.h \
+  fallback-dct.cc \
+  fallback-motion.cc \
+  fallback-motion.h \
   dpb.cc \
   dpb.h \
   image.cc \
-  intrapred.cc \
-  md5.cc \
-  nal.cc \
-  pps.cc \
-  transform.cc \
-  refpic.cc \
-  sao.cc \
-  scan.cc \
-  sei.cc \
-  slice.cc \
-  sps.cc \
-  util.cc \
-  vps.cc \
-  bitstream.h \
-  cabac.h \
-  deblock.h \
-  decctx.h \
   image.h \
+  image-io.h \
+  image-io.cc \
+  intrapred.cc \
   intrapred.h \
+  md5.cc \
   md5.h \
+  motion.cc \
+  motion.h \
+  nal.cc \
   nal.h \
+  nal-parser.cc \
+  nal-parser.h \
+  pps.cc \
   pps.h \
-  transform.h \
+  quality.cc \
+  quality.h \
+  refpic.cc \
   refpic.h \
+  sao.cc \
   sao.h \
+  scan.cc \
   scan.h \
+  sei.cc \
   sei.h \
+  slice.cc \
   slice.h \
+  sps.cc \
   sps.h \
+  threads.cc \
+  threads.h \
+  transform.cc \
+  transform.h \
+  util.cc \
   util.h \
+  visualize.cc \
+  visualize.h \
+  vps.cc \
   vps.h \
-  motion.cc motion.h \
-  threads.cc threads.h \
-  visualize.cc visualize.h \
-  acceleration.h \
-  fallback.cc fallback.h fallback-motion.cc fallback-motion.h \
-  fallback-dct.h fallback-dct.cc
+  vui.cc \
+  vui.h
+
+SUBDIRS = encoder
+libde265_la_LIBADD += encoder/libde265_encoder.la
 
 if ENABLE_SSE_OPT
-  SUBDIRS = x86
+  SUBDIRS += x86
   libde265_la_LIBADD += x86/libde265_x86.la
 endif
 
+if ENABLE_ARM_OPT
+  SUBDIRS += arm
+  libde265_la_LIBADD += arm/libde265_arm.la
+endif
+
 if MINGW
   libde265_la_SOURCES += ../extra/win32cond.c ../extra/win32cond.h
   libde265_la_LDFLAGS += -no-undefined -static-libgcc -static-libstdc++
 endif
 
 EXTRA_DIST = Makefile.vc7 \
+  CMakeLists.txt \
   ../extra/stdbool.h \
   ../extra/stdint.h
 
diff --git a/libde265/Makefile.in b/libde265/Makefile.in
index e4f75b6..7b4443f 100644
--- a/libde265/Makefile.in
+++ b/libde265/Makefile.in
@@ -82,20 +82,24 @@ host_triplet = @host@
 target_triplet = @target@
 @HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY
 @HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY
-@ENABLE_SSE_OPT_TRUE@am__append_3 = x86/libde265_x86.la
-@MINGW_TRUE@am__append_4 = ../extra/win32cond.c ../extra/win32cond.h
-@MINGW_TRUE@am__append_5 = -no-undefined -static-libgcc -static-libstdc++
+@ENABLE_SSE_OPT_TRUE@am__append_3 = x86
+@ENABLE_SSE_OPT_TRUE@am__append_4 = x86/libde265_x86.la
+@ENABLE_ARM_OPT_TRUE@am__append_5 = arm
+@ENABLE_ARM_OPT_TRUE@am__append_6 = arm/libde265_arm.la
+@MINGW_TRUE@am__append_7 = ../extra/win32cond.c ../extra/win32cond.h
+@MINGW_TRUE@am__append_8 = -no-undefined -static-libgcc -static-libstdc++
 subdir = libde265
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(srcdir)/de265-version.h.in $(top_srcdir)/depcomp \
 	$(libde265_la_HEADERS) COPYING
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -131,31 +135,40 @@ am__uninstall_files_from_dir = { \
   }
 am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libde265_ladir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
-libde265_la_DEPENDENCIES = $(am__append_3)
-am__libde265_la_SOURCES_DIST = bitstream.cc cabac.cc de265.cc \
-	deblock.cc decctx.cc nal-parser.cc nal-parser.h dpb.cc dpb.h \
-	image.cc intrapred.cc md5.cc nal.cc pps.cc transform.cc \
-	refpic.cc sao.cc scan.cc sei.cc slice.cc sps.cc util.cc vps.cc \
-	bitstream.h cabac.h deblock.h decctx.h image.h intrapred.h \
-	md5.h nal.h pps.h transform.h refpic.h sao.h scan.h sei.h \
-	slice.h sps.h util.h vps.h motion.cc motion.h threads.cc \
-	threads.h visualize.cc visualize.h acceleration.h fallback.cc \
-	fallback.h fallback-motion.cc fallback-motion.h fallback-dct.h \
-	fallback-dct.cc ../extra/win32cond.c ../extra/win32cond.h
+libde265_la_DEPENDENCIES = encoder/libde265_encoder.la $(am__append_4) \
+	$(am__append_6)
+am__libde265_la_SOURCES_DIST = acceleration.h alloc_pool.h \
+	alloc_pool.cc bitstream.cc bitstream.h cabac.cc cabac.h \
+	configparam.cc configparam.h contextmodel.cc contextmodel.h \
+	de265.cc deblock.cc deblock.h decctx.cc decctx.h en265.h \
+	en265.cc fallback.cc fallback.h fallback-dct.h fallback-dct.cc \
+	fallback-motion.cc fallback-motion.h dpb.cc dpb.h image.cc \
+	image.h image-io.h image-io.cc intrapred.cc intrapred.h md5.cc \
+	md5.h motion.cc motion.h nal.cc nal.h nal-parser.cc \
+	nal-parser.h pps.cc pps.h quality.cc quality.h refpic.cc \
+	refpic.h sao.cc sao.h scan.cc scan.h sei.cc sei.h slice.cc \
+	slice.h sps.cc sps.h threads.cc threads.h transform.cc \
+	transform.h util.cc util.h visualize.cc visualize.h vps.cc \
+	vps.h vui.cc vui.h ../extra/win32cond.c ../extra/win32cond.h
 am__dirstamp = $(am__leading_dot)dirstamp
 @MINGW_TRUE@am__objects_1 = ../extra/libde265_la-win32cond.lo
-am_libde265_la_OBJECTS = libde265_la-bitstream.lo libde265_la-cabac.lo \
+am_libde265_la_OBJECTS = libde265_la-alloc_pool.lo \
+	libde265_la-bitstream.lo libde265_la-cabac.lo \
+	libde265_la-configparam.lo libde265_la-contextmodel.lo \
 	libde265_la-de265.lo libde265_la-deblock.lo \
-	libde265_la-decctx.lo libde265_la-nal-parser.lo \
-	libde265_la-dpb.lo libde265_la-image.lo \
-	libde265_la-intrapred.lo libde265_la-md5.lo libde265_la-nal.lo \
-	libde265_la-pps.lo libde265_la-transform.lo \
-	libde265_la-refpic.lo libde265_la-sao.lo libde265_la-scan.lo \
-	libde265_la-sei.lo libde265_la-slice.lo libde265_la-sps.lo \
-	libde265_la-util.lo libde265_la-vps.lo libde265_la-motion.lo \
-	libde265_la-threads.lo libde265_la-visualize.lo \
-	libde265_la-fallback.lo libde265_la-fallback-motion.lo \
-	libde265_la-fallback-dct.lo $(am__objects_1)
+	libde265_la-decctx.lo libde265_la-en265.lo \
+	libde265_la-fallback.lo libde265_la-fallback-dct.lo \
+	libde265_la-fallback-motion.lo libde265_la-dpb.lo \
+	libde265_la-image.lo libde265_la-image-io.lo \
+	libde265_la-intrapred.lo libde265_la-md5.lo \
+	libde265_la-motion.lo libde265_la-nal.lo \
+	libde265_la-nal-parser.lo libde265_la-pps.lo \
+	libde265_la-quality.lo libde265_la-refpic.lo \
+	libde265_la-sao.lo libde265_la-scan.lo libde265_la-sei.lo \
+	libde265_la-slice.lo libde265_la-sps.lo libde265_la-threads.lo \
+	libde265_la-transform.lo libde265_la-util.lo \
+	libde265_la-visualize.lo libde265_la-vps.lo libde265_la-vui.lo \
+	$(am__objects_1)
 libde265_la_OBJECTS = $(am_libde265_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
@@ -259,7 +272,7 @@ am__define_uniq_tagged_files = \
   done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
-DIST_SUBDIRS = x86
+DIST_SUBDIRS = encoder x86 arm
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 am__relativize = \
   dir0=`pwd`; \
@@ -287,6 +300,7 @@ am__relativize = \
   done; \
   reldir="$$dir2"
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -295,9 +309,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -317,7 +333,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -440,20 +456,25 @@ libde265_la_CXXFLAGS = $(CFLAG_VISIBILITY) -DLIBDE265_EXPORTS \
 	$(am__append_2)
 libde265_la_LDFLAGS = -version-info \
 	$(LIBDE265_CURRENT):$(LIBDE265_REVISION):$(LIBDE265_AGE) \
-	$(am__append_5)
-libde265_la_LIBADD = -lstdc++ $(am__append_3)
-libde265_la_SOURCES = bitstream.cc cabac.cc de265.cc deblock.cc \
-	decctx.cc nal-parser.cc nal-parser.h dpb.cc dpb.h image.cc \
-	intrapred.cc md5.cc nal.cc pps.cc transform.cc refpic.cc \
-	sao.cc scan.cc sei.cc slice.cc sps.cc util.cc vps.cc \
-	bitstream.h cabac.h deblock.h decctx.h image.h intrapred.h \
-	md5.h nal.h pps.h transform.h refpic.h sao.h scan.h sei.h \
-	slice.h sps.h util.h vps.h motion.cc motion.h threads.cc \
-	threads.h visualize.cc visualize.h acceleration.h fallback.cc \
-	fallback.h fallback-motion.cc fallback-motion.h fallback-dct.h \
-	fallback-dct.cc $(am__append_4)
-@ENABLE_SSE_OPT_TRUE@SUBDIRS = x86
+	$(am__append_8)
+libde265_la_LIBADD = -lstdc++ encoder/libde265_encoder.la \
+	$(am__append_4) $(am__append_6)
+libde265_la_SOURCES = acceleration.h alloc_pool.h alloc_pool.cc \
+	bitstream.cc bitstream.h cabac.cc cabac.h configparam.cc \
+	configparam.h contextmodel.cc contextmodel.h de265.cc \
+	deblock.cc deblock.h decctx.cc decctx.h en265.h en265.cc \
+	fallback.cc fallback.h fallback-dct.h fallback-dct.cc \
+	fallback-motion.cc fallback-motion.h dpb.cc dpb.h image.cc \
+	image.h image-io.h image-io.cc intrapred.cc intrapred.h md5.cc \
+	md5.h motion.cc motion.h nal.cc nal.h nal-parser.cc \
+	nal-parser.h pps.cc pps.h quality.cc quality.h refpic.cc \
+	refpic.h sao.cc sao.h scan.cc scan.h sei.cc sei.h slice.cc \
+	slice.h sps.cc sps.h threads.cc threads.h transform.cc \
+	transform.h util.cc util.h visualize.cc visualize.h vps.cc \
+	vps.h vui.cc vui.h $(am__append_7)
+SUBDIRS = encoder $(am__append_3) $(am__append_5)
 EXTRA_DIST = Makefile.vc7 \
+  CMakeLists.txt \
   ../extra/stdbool.h \
   ../extra/stdint.h
 
@@ -553,15 +574,20 @@ distclean-compile:
 	-rm -f *.tab.c
 
 @AMDEP_TRUE@@am__include@ @am__quote@../extra/$(DEPDIR)/libde265_la-win32cond.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-alloc_pool.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-bitstream.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-cabac.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-configparam.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-contextmodel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-de265.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-deblock.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-decctx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-dpb.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-en265.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback-dct.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback-motion.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-fallback.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-image-io.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-image.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-intrapred.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-md5.Plo@am__quote@
@@ -569,6 +595,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-nal-parser.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-nal.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-pps.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-quality.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-refpic.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-sao.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-scan.Plo@am__quote@
@@ -580,6 +607,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-util.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-visualize.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-vps.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_la-vui.Plo@am__quote@
 
 .c.o:
 @am__fastdepCC_TRUE@	$(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\
@@ -636,6 +664,13 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
+libde265_la-alloc_pool.lo: alloc_pool.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-alloc_pool.lo -MD -MP -MF $(DEPDIR)/libde265_la-alloc_pool.Tpo -c -o libde265_la-alloc_pool.lo `test -f 'alloc_pool.cc' || echo '$(srcdir)/'`alloc_pool.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-alloc_pool.Tpo $(DEPDIR)/libde265_la-alloc_pool.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='alloc_pool.cc' object='libde265_la-alloc_pool.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-alloc_pool.lo `test -f 'alloc_pool.cc' || echo '$(srcdir)/'`alloc_pool.cc
+
 libde265_la-bitstream.lo: bitstream.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-bitstream.lo -MD -MP -MF $(DEPDIR)/libde265_la-bitstream.Tpo -c -o libde265_la-bitstream.lo `test -f 'bitstream.cc' || echo '$(srcdir)/'`bitstream.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-bitstream.Tpo $(DEPDIR)/libde265_la-bitstream.Plo
@@ -650,6 +685,20 @@ libde265_la-cabac.lo: cabac.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-cabac.lo `test -f 'cabac.cc' || echo '$(srcdir)/'`cabac.cc
 
+libde265_la-configparam.lo: configparam.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-configparam.lo -MD -MP -MF $(DEPDIR)/libde265_la-configparam.Tpo -c -o libde265_la-configparam.lo `test -f 'configparam.cc' || echo '$(srcdir)/'`configparam.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-configparam.Tpo $(DEPDIR)/libde265_la-configparam.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='configparam.cc' object='libde265_la-configparam.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-configparam.lo `test -f 'configparam.cc' || echo '$(srcdir)/'`configparam.cc
+
+libde265_la-contextmodel.lo: contextmodel.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-contextmodel.lo -MD -MP -MF $(DEPDIR)/libde265_la-contextmodel.Tpo -c -o libde265_la-contextmodel.lo `test -f 'contextmodel.cc' || echo '$(srcdir)/'`contextmodel.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-contextmodel.Tpo $(DEPDIR)/libde265_la-contextmodel.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='contextmodel.cc' object='libde265_la-contextmodel.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-contextmodel.lo `test -f 'contextmodel.cc' || echo '$(srcdir)/'`contextmodel.cc
+
 libde265_la-de265.lo: de265.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-de265.lo -MD -MP -MF $(DEPDIR)/libde265_la-de265.Tpo -c -o libde265_la-de265.lo `test -f 'de265.cc' || echo '$(srcdir)/'`de265.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-de265.Tpo $(DEPDIR)/libde265_la-de265.Plo
@@ -671,12 +720,33 @@ libde265_la-decctx.lo: decctx.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-decctx.lo `test -f 'decctx.cc' || echo '$(srcdir)/'`decctx.cc
 
-libde265_la-nal-parser.lo: nal-parser.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-nal-parser.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal-parser.Tpo -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal-parser.Tpo $(DEPDIR)/libde265_la-nal-parser.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='nal-parser.cc' object='libde265_la-nal-parser.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_la-en265.lo: en265.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-en265.lo -MD -MP -MF $(DEPDIR)/libde265_la-en265.Tpo -c -o libde265_la-en265.lo `test -f 'en265.cc' || echo '$(srcdir)/'`en265.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-en265.Tpo $(DEPDIR)/libde265_la-en265.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='en265.cc' object='libde265_la-en265.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-en265.lo `test -f 'en265.cc' || echo '$(srcdir)/'`en265.cc
+
+libde265_la-fallback.lo: fallback.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback.Tpo -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback.Tpo $(DEPDIR)/libde265_la-fallback.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback.cc' object='libde265_la-fallback.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
+
+libde265_la-fallback-dct.lo: fallback-dct.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-dct.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-dct.Tpo -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-dct.Tpo $(DEPDIR)/libde265_la-fallback-dct.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback-dct.cc' object='libde265_la-fallback-dct.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
+
+libde265_la-fallback-motion.lo: fallback-motion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-motion.Tpo -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-motion.Tpo $(DEPDIR)/libde265_la-fallback-motion.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback-motion.cc' object='libde265_la-fallback-motion.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
 
 libde265_la-dpb.lo: dpb.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-dpb.lo -MD -MP -MF $(DEPDIR)/libde265_la-dpb.Tpo -c -o libde265_la-dpb.lo `test -f 'dpb.cc' || echo '$(srcdir)/'`dpb.cc
@@ -692,6 +762,13 @@ libde265_la-image.lo: image.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-image.lo `test -f 'image.cc' || echo '$(srcdir)/'`image.cc
 
+libde265_la-image-io.lo: image-io.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-image-io.lo -MD -MP -MF $(DEPDIR)/libde265_la-image-io.Tpo -c -o libde265_la-image-io.lo `test -f 'image-io.cc' || echo '$(srcdir)/'`image-io.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-image-io.Tpo $(DEPDIR)/libde265_la-image-io.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='image-io.cc' object='libde265_la-image-io.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-image-io.lo `test -f 'image-io.cc' || echo '$(srcdir)/'`image-io.cc
+
 libde265_la-intrapred.lo: intrapred.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-intrapred.lo -MD -MP -MF $(DEPDIR)/libde265_la-intrapred.Tpo -c -o libde265_la-intrapred.lo `test -f 'intrapred.cc' || echo '$(srcdir)/'`intrapred.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-intrapred.Tpo $(DEPDIR)/libde265_la-intrapred.Plo
@@ -706,6 +783,13 @@ libde265_la-md5.lo: md5.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-md5.lo `test -f 'md5.cc' || echo '$(srcdir)/'`md5.cc
 
+libde265_la-motion.lo: motion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-motion.Tpo -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-motion.Tpo $(DEPDIR)/libde265_la-motion.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='motion.cc' object='libde265_la-motion.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
+
 libde265_la-nal.lo: nal.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-nal.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal.Tpo -c -o libde265_la-nal.lo `test -f 'nal.cc' || echo '$(srcdir)/'`nal.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal.Tpo $(DEPDIR)/libde265_la-nal.Plo
@@ -713,6 +797,13 @@ libde265_la-nal.lo: nal.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-nal.lo `test -f 'nal.cc' || echo '$(srcdir)/'`nal.cc
 
+libde265_la-nal-parser.lo: nal-parser.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-nal-parser.lo -MD -MP -MF $(DEPDIR)/libde265_la-nal-parser.Tpo -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-nal-parser.Tpo $(DEPDIR)/libde265_la-nal-parser.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='nal-parser.cc' object='libde265_la-nal-parser.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-nal-parser.lo `test -f 'nal-parser.cc' || echo '$(srcdir)/'`nal-parser.cc
+
 libde265_la-pps.lo: pps.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-pps.lo -MD -MP -MF $(DEPDIR)/libde265_la-pps.Tpo -c -o libde265_la-pps.lo `test -f 'pps.cc' || echo '$(srcdir)/'`pps.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-pps.Tpo $(DEPDIR)/libde265_la-pps.Plo
@@ -720,12 +811,12 @@ libde265_la-pps.lo: pps.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-pps.lo `test -f 'pps.cc' || echo '$(srcdir)/'`pps.cc
 
-libde265_la-transform.lo: transform.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-transform.lo -MD -MP -MF $(DEPDIR)/libde265_la-transform.Tpo -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-transform.Tpo $(DEPDIR)/libde265_la-transform.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='transform.cc' object='libde265_la-transform.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_la-quality.lo: quality.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-quality.lo -MD -MP -MF $(DEPDIR)/libde265_la-quality.Tpo -c -o libde265_la-quality.lo `test -f 'quality.cc' || echo '$(srcdir)/'`quality.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-quality.Tpo $(DEPDIR)/libde265_la-quality.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='quality.cc' object='libde265_la-quality.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-quality.lo `test -f 'quality.cc' || echo '$(srcdir)/'`quality.cc
 
 libde265_la-refpic.lo: refpic.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-refpic.lo -MD -MP -MF $(DEPDIR)/libde265_la-refpic.Tpo -c -o libde265_la-refpic.lo `test -f 'refpic.cc' || echo '$(srcdir)/'`refpic.cc
@@ -769,27 +860,6 @@ libde265_la-sps.lo: sps.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-sps.lo `test -f 'sps.cc' || echo '$(srcdir)/'`sps.cc
 
-libde265_la-util.lo: util.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-util.lo -MD -MP -MF $(DEPDIR)/libde265_la-util.Tpo -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-util.Tpo $(DEPDIR)/libde265_la-util.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='util.cc' object='libde265_la-util.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
-
-libde265_la-vps.lo: vps.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-vps.lo -MD -MP -MF $(DEPDIR)/libde265_la-vps.Tpo -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-vps.Tpo $(DEPDIR)/libde265_la-vps.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='vps.cc' object='libde265_la-vps.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
-
-libde265_la-motion.lo: motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-motion.Tpo -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-motion.Tpo $(DEPDIR)/libde265_la-motion.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='motion.cc' object='libde265_la-motion.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-motion.lo `test -f 'motion.cc' || echo '$(srcdir)/'`motion.cc
-
 libde265_la-threads.lo: threads.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-threads.lo -MD -MP -MF $(DEPDIR)/libde265_la-threads.Tpo -c -o libde265_la-threads.lo `test -f 'threads.cc' || echo '$(srcdir)/'`threads.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-threads.Tpo $(DEPDIR)/libde265_la-threads.Plo
@@ -797,6 +867,20 @@ libde265_la-threads.lo: threads.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-threads.lo `test -f 'threads.cc' || echo '$(srcdir)/'`threads.cc
 
+libde265_la-transform.lo: transform.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-transform.lo -MD -MP -MF $(DEPDIR)/libde265_la-transform.Tpo -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-transform.Tpo $(DEPDIR)/libde265_la-transform.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='transform.cc' object='libde265_la-transform.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-transform.lo `test -f 'transform.cc' || echo '$(srcdir)/'`transform.cc
+
+libde265_la-util.lo: util.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-util.lo -MD -MP -MF $(DEPDIR)/libde265_la-util.Tpo -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-util.Tpo $(DEPDIR)/libde265_la-util.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='util.cc' object='libde265_la-util.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-util.lo `test -f 'util.cc' || echo '$(srcdir)/'`util.cc
+
 libde265_la-visualize.lo: visualize.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-visualize.lo -MD -MP -MF $(DEPDIR)/libde265_la-visualize.Tpo -c -o libde265_la-visualize.lo `test -f 'visualize.cc' || echo '$(srcdir)/'`visualize.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-visualize.Tpo $(DEPDIR)/libde265_la-visualize.Plo
@@ -804,26 +888,19 @@ libde265_la-visualize.lo: visualize.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-visualize.lo `test -f 'visualize.cc' || echo '$(srcdir)/'`visualize.cc
 
-libde265_la-fallback.lo: fallback.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback.Tpo -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback.Tpo $(DEPDIR)/libde265_la-fallback.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback.cc' object='libde265_la-fallback.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback.lo `test -f 'fallback.cc' || echo '$(srcdir)/'`fallback.cc
-
-libde265_la-fallback-motion.lo: fallback-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-motion.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-motion.Tpo -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-motion.Tpo $(DEPDIR)/libde265_la-fallback-motion.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback-motion.cc' object='libde265_la-fallback-motion.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_la-vps.lo: vps.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-vps.lo -MD -MP -MF $(DEPDIR)/libde265_la-vps.Tpo -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-vps.Tpo $(DEPDIR)/libde265_la-vps.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='vps.cc' object='libde265_la-vps.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-motion.lo `test -f 'fallback-motion.cc' || echo '$(srcdir)/'`fallback-motion.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-vps.lo `test -f 'vps.cc' || echo '$(srcdir)/'`vps.cc
 
-libde265_la-fallback-dct.lo: fallback-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-fallback-dct.lo -MD -MP -MF $(DEPDIR)/libde265_la-fallback-dct.Tpo -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-fallback-dct.Tpo $(DEPDIR)/libde265_la-fallback-dct.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='fallback-dct.cc' object='libde265_la-fallback-dct.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_la-vui.lo: vui.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_la-vui.lo -MD -MP -MF $(DEPDIR)/libde265_la-vui.Tpo -c -o libde265_la-vui.lo `test -f 'vui.cc' || echo '$(srcdir)/'`vui.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_la-vui.Tpo $(DEPDIR)/libde265_la-vui.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='vui.cc' object='libde265_la-vui.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-fallback-dct.lo `test -f 'fallback-dct.cc' || echo '$(srcdir)/'`fallback-dct.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libde265_la_CPPFLAGS) $(CPPFLAGS) $(libde265_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_la-vui.lo `test -f 'vui.cc' || echo '$(srcdir)/'`vui.cc
 
 mostlyclean-libtool:
 	-rm -f *.lo
diff --git a/libde265/Makefile.vc7 b/libde265/Makefile.vc7
index 801f590..9087c28 100644
--- a/libde265/Makefile.vc7
+++ b/libde265/Makefile.vc7
@@ -2,7 +2,7 @@
 # Makefile for Microsoft Visual Studio 2003
 #
 CFLAGS=/I..\extra /I.. /I.
-CC=cl /nologo 
+CC=cl /nologo
 LINK=link /nologo /subsystem:console
 DEFINES=/DWIN32 /D_WIN32_WINNT=0x0400 /DNDEBUG /DLIBDE265_EXPORTS /D_CRT_SECURE_NO_WARNINGS /DHAVE_SSE4_1 /DNOMINMAX
 
@@ -30,22 +30,28 @@ CFLAGS=$(CFLAGS) /wd4800
 CFLAGS=$(CFLAGS) $(DEFINES)
 
 OBJS=\
+	alloc_pool.obj \
 	bitstream.obj \
 	cabac.obj \
+	configparam.obj \
+	contextmodel.obj \
 	de265.obj \
 	deblock.obj \
 	decctx.obj \
 	dpb.obj \
+	en265.obj \
 	fallback-dct.obj \
 	fallback-motion.obj \
 	fallback.obj \
 	image.obj \
+	image-io.obj \
 	intrapred.obj \
 	md5.obj \
 	motion.obj \
 	nal.obj \
 	nal-parser.obj \
 	pps.obj \
+	quality.obj \
 	refpic.obj \
 	sao.obj \
 	scan.obj \
@@ -57,6 +63,25 @@ OBJS=\
 	util.obj \
 	visualize.obj \
 	vps.obj \
+        vui.obj \
+	encoder\analyze.obj \
+	encoder\encode.obj \
+	encoder\encoder-context.obj \
+	encoder\encoder-params.obj \
+	encoder\encpicbuf.obj \
+	encoder\sop.obj \
+	encoder\algo\algo.obj \
+	encoder\algo\cb-interpartmode.obj \
+	encoder\algo\cb-intra-inter.obj \
+	encoder\algo\cb-intrapartmode.obj \
+	encoder\algo\cb-mergeindex.obj \
+	encoder\algo\cb-skip.obj \
+	encoder\algo\cb-split.obj \
+	encoder\algo\coding-options.obj \
+	encoder\algo\ctb-qscale.obj \
+	encoder\algo\pb-mv.obj \
+	encoder\algo\tb-intrapredmode.obj \
+	encoder\algo\tb-split.obj \
 	x86\sse.obj \
 	x86\sse-dct.obj \
 	x86\sse-motion.obj \
diff --git a/libde265/acceleration.h b/libde265/acceleration.h
index 8829d31..2f1148b 100644
--- a/libde265/acceleration.h
+++ b/libde265/acceleration.h
@@ -23,51 +23,337 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <assert.h>
+
 
 struct acceleration_functions
 {
   void (*put_weighted_pred_avg_8)(uint8_t *_dst, ptrdiff_t dststride,
-                                  int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                  const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                   int width, int height);
 
   void (*put_unweighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride,
-                                int16_t *src, ptrdiff_t srcstride,
+                                const int16_t *src, ptrdiff_t srcstride,
                                 int width, int height);
 
   void (*put_weighted_pred_8)(uint8_t *_dst, ptrdiff_t dststride,
-                              int16_t *src, ptrdiff_t srcstride,
+                              const int16_t *src, ptrdiff_t srcstride,
                               int width, int height,
                               int w,int o,int log2WD);
   void (*put_weighted_bipred_8)(uint8_t *_dst, ptrdiff_t dststride,
-                                int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                 int width, int height,
                                 int w1,int o1, int w2,int o2, int log2WD);
 
+
+  void (*put_weighted_pred_avg_16)(uint16_t *_dst, ptrdiff_t dststride,
+                                  const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                   int width, int height, int bit_depth);
+
+  void (*put_unweighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride,
+                                const int16_t *src, ptrdiff_t srcstride,
+                                int width, int height, int bit_depth);
+
+  void (*put_weighted_pred_16)(uint16_t *_dst, ptrdiff_t dststride,
+                              const int16_t *src, ptrdiff_t srcstride,
+                              int width, int height,
+                              int w,int o,int log2WD, int bit_depth);
+  void (*put_weighted_bipred_16)(uint16_t *_dst, ptrdiff_t dststride,
+                                const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                int width, int height,
+                                int w1,int o1, int w2,int o2, int log2WD, int bit_depth);
+
+
+  void put_weighted_pred_avg(void *_dst, ptrdiff_t dststride,
+                             const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                             int width, int height, int bit_depth) const;
+
+  void put_unweighted_pred(void *_dst, ptrdiff_t dststride,
+                           const int16_t *src, ptrdiff_t srcstride,
+                           int width, int height, int bit_depth) const;
+
+  void put_weighted_pred(void *_dst, ptrdiff_t dststride,
+                         const int16_t *src, ptrdiff_t srcstride,
+                         int width, int height,
+                         int w,int o,int log2WD, int bit_depth) const;
+  void put_weighted_bipred(void *_dst, ptrdiff_t dststride,
+                           const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                           int width, int height,
+                           int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const;
+
+
+
+
   void (*put_hevc_epel_8)(int16_t *dst, ptrdiff_t dststride,
-                          uint8_t *src, ptrdiff_t srcstride, int width, int height,
+                          const uint8_t *src, ptrdiff_t srcstride, int width, int height,
                           int mx, int my, int16_t* mcbuffer);
   void (*put_hevc_epel_h_8)(int16_t *dst, ptrdiff_t dststride,
-                            uint8_t *src, ptrdiff_t srcstride, int width, int height,
-                            int mx, int my, int16_t* mcbuffer);
+                            const uint8_t *src, ptrdiff_t srcstride, int width, int height,
+                            int mx, int my, int16_t* mcbuffer, int bit_depth);
   void (*put_hevc_epel_v_8)(int16_t *dst, ptrdiff_t dststride,
-                            uint8_t *src, ptrdiff_t srcstride, int width, int height,
-                            int mx, int my, int16_t* mcbuffer);
+                            const uint8_t *src, ptrdiff_t srcstride, int width, int height,
+                            int mx, int my, int16_t* mcbuffer, int bit_depth);
   void (*put_hevc_epel_hv_8)(int16_t *dst, ptrdiff_t dststride,
-                             uint8_t *src, ptrdiff_t srcstride, int width, int height,
-                             int mx, int my, int16_t* mcbuffer);
+                             const uint8_t *src, ptrdiff_t srcstride, int width, int height,
+                             int mx, int my, int16_t* mcbuffer, int bit_depth);
 
   void (*put_hevc_qpel_8[4][4])(int16_t *dst, ptrdiff_t dststride,
-                                uint8_t *src, ptrdiff_t srcstride, int width, int height,
+                                const uint8_t *src, ptrdiff_t srcstride, int width, int height,
                                 int16_t* mcbuffer);
 
-  void (*transform_skip_8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); // no transform
-  void (*transform_bypass_8)(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_t stride);
-  void (*transform_4x4_luma_add_8)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); // iDST
 
-  void (*transform_4x4_add_8)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); // iDCT
-  void (*transform_8x8_add_8)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); // iDCT
-  void (*transform_16x16_add_8)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); // iDCT
-  void (*transform_32x32_add_8)(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride); // iDCT
+  void (*put_hevc_epel_16)(int16_t *dst, ptrdiff_t dststride,
+                           const uint16_t *src, ptrdiff_t srcstride, int width, int height,
+                           int mx, int my, int16_t* mcbuffer, int bit_depth);
+  void (*put_hevc_epel_h_16)(int16_t *dst, ptrdiff_t dststride,
+                             const uint16_t *src, ptrdiff_t srcstride, int width, int height,
+                            int mx, int my, int16_t* mcbuffer, int bit_depth);
+  void (*put_hevc_epel_v_16)(int16_t *dst, ptrdiff_t dststride,
+                             const uint16_t *src, ptrdiff_t srcstride, int width, int height,
+                             int mx, int my, int16_t* mcbuffer, int bit_depth);
+  void (*put_hevc_epel_hv_16)(int16_t *dst, ptrdiff_t dststride,
+                              const uint16_t *src, ptrdiff_t srcstride, int width, int height,
+                              int mx, int my, int16_t* mcbuffer, int bit_depth);
+
+  void (*put_hevc_qpel_16[4][4])(int16_t *dst, ptrdiff_t dststride,
+                                 const uint16_t *src, ptrdiff_t srcstride, int width, int height,
+                                 int16_t* mcbuffer, int bit_depth);
+
+
+  void put_hevc_epel(int16_t *dst, ptrdiff_t dststride,
+                     const void *src, ptrdiff_t srcstride, int width, int height,
+                     int mx, int my, int16_t* mcbuffer, int bit_depth) const;
+  void put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride,
+                       const void *src, ptrdiff_t srcstride, int width, int height,
+                       int mx, int my, int16_t* mcbuffer, int bit_depth) const;
+  void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+                       const void *src, ptrdiff_t srcstride, int width, int height,
+                       int mx, int my, int16_t* mcbuffer, int bit_depth) const;
+  void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+                        const void *src, ptrdiff_t srcstride, int width, int height,
+                        int mx, int my, int16_t* mcbuffer, int bit_depth) const;
+
+  void put_hevc_qpel(int16_t *dst, ptrdiff_t dststride,
+                     const void *src, ptrdiff_t srcstride, int width, int height,
+                     int16_t* mcbuffer, int dX,int dY, int bit_depth) const;
+
+
+  // --- inverse transforms ---
+
+  void (*transform_bypass)(int32_t *residual, const int16_t *coeffs, int nT);
+  void (*transform_bypass_rdpcm_v)(int32_t *r, const int16_t *coeffs, int nT);
+  void (*transform_bypass_rdpcm_h)(int32_t *r, const int16_t *coeffs, int nT);
+
+  // 8 bit
+
+  void (*transform_skip_8)(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride); // no transform
+  void (*transform_skip_rdpcm_v_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride);
+  void (*transform_skip_rdpcm_h_8)(uint8_t *_dst, const int16_t *coeffs, int nT, ptrdiff_t _stride);
+  void (*transform_4x4_dst_add_8)(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDST
+  void (*transform_add_8[4])(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride); // iDCT
+
+  // 9-16 bit
+
+  void (*transform_skip_16)(uint16_t *_dst, const int16_t *coeffs, ptrdiff_t _stride, int bit_depth); // no transform
+  void (*transform_4x4_dst_add_16)(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDST
+  void (*transform_add_16[4])(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth); // iDCT
+
+
+  void (*rotate_coefficients)(int16_t *coeff, int nT);
+
+  void (*transform_idst_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+  void (*transform_idct_4x4)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+  void (*transform_idct_8x8)(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+  void (*transform_idct_16x16)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits);
+  void (*transform_idct_32x32)(int32_t *dst,const int16_t *coeffs,int bdShift, int max_coeff_bits);
+  void (*add_residual_8)(uint8_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth);
+  void (*add_residual_16)(uint16_t *dst,ptrdiff_t stride,const int32_t* r, int nT, int bit_depth);
+
+  template <class pixel_t>
+  void add_residual(pixel_t *dst, ptrdiff_t stride, const int32_t* r, int nT, int bit_depth) const;
+
+  void (*rdpcm_v)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift);
+  void (*rdpcm_h)(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift);
+
+  void (*transform_skip_residual)(int32_t *residual, const int16_t *coeffs, int nT,
+                                  int tsShift,int bdShift);
+
+
+  template <class pixel_t> void transform_skip(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const;
+  template <class pixel_t> void transform_skip_rdpcm_v(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const;
+  template <class pixel_t> void transform_skip_rdpcm_h(pixel_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth) const;
+  template <class pixel_t> void transform_4x4_dst_add(pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const;
+  template <class pixel_t> void transform_add(int sizeIdx, pixel_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth) const;
+
+
+
+  // --- forward transforms ---
+
+  void (*fwd_transform_4x4_dst_8)(int16_t *coeffs, const int16_t* src, ptrdiff_t stride); // fDST
+
+  // indexed with (log2TbSize-2)
+  void (*fwd_transform_8[4])     (int16_t *coeffs, const int16_t *src, ptrdiff_t stride); // fDCT
+
+
+  // forward Hadamard transform (without scaling factor)
+  // (4x4,8x8,16x16,32x32) indexed with (log2TbSize-2)
+  void (*hadamard_transform_8[4])     (int16_t *coeffs, const int16_t *src, ptrdiff_t stride);
 };
 
+
+/*
+template <> inline void acceleration_functions::put_weighted_pred_avg<uint8_t>(uint8_t *_dst, ptrdiff_t dststride,
+                                                                               const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                                               int width, int height, int bit_depth) { put_weighted_pred_avg_8(_dst,dststride,src1,src2,srcstride,width,height); }
+template <> inline void acceleration_functions::put_weighted_pred_avg<uint16_t>(uint16_t *_dst, ptrdiff_t dststride,
+                                                                                const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                                                int width, int height, int bit_depth) { put_weighted_pred_avg_16(_dst,dststride,src1,src2,
+                                                                                                                                                 srcstride,width,height,bit_depth); }
+
+template <> inline void acceleration_functions::put_unweighted_pred<uint8_t>(uint8_t *_dst, ptrdiff_t dststride,
+                                                                             const int16_t *src, ptrdiff_t srcstride,
+                                                                             int width, int height, int bit_depth) { put_unweighted_pred_8(_dst,dststride,src,srcstride,width,height); }
+template <> inline void acceleration_functions::put_unweighted_pred<uint16_t>(uint16_t *_dst, ptrdiff_t dststride,
+                                                                              const int16_t *src, ptrdiff_t srcstride,
+                                                                              int width, int height, int bit_depth) { put_unweighted_pred_16(_dst,dststride,src,srcstride,width,height,bit_depth); }
+
+template <> inline void acceleration_functions::put_weighted_pred<uint8_t>(uint8_t *_dst, ptrdiff_t dststride,
+                                                                           const int16_t *src, ptrdiff_t srcstride,
+                                                                           int width, int height,
+                                                                           int w,int o,int log2WD, int bit_depth) { put_weighted_pred_8(_dst,dststride,src,srcstride,width,height,w,o,log2WD); }
+template <> inline void acceleration_functions::put_weighted_pred<uint16_t>(uint16_t *_dst, ptrdiff_t dststride,
+                                                                            const int16_t *src, ptrdiff_t srcstride,
+                                                                            int width, int height,
+                                                                            int w,int o,int log2WD, int bit_depth) { put_weighted_pred_16(_dst,dststride,src,srcstride,width,height,w,o,log2WD,bit_depth); }
+
+template <> inline void acceleration_functions::put_weighted_bipred<uint8_t>(uint8_t *_dst, ptrdiff_t dststride,
+                                                                             const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                                             int width, int height,
+                                                                             int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_8(_dst,dststride,src1,src2,srcstride,
+                                                                                                                                                              width,height,
+                                                                                                                                                              w1,o1,w2,o2,log2WD); }
+template <> inline void acceleration_functions::put_weighted_bipred<uint16_t>(uint16_t *_dst, ptrdiff_t dststride,
+                                                                              const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                                              int width, int height,
+                                                                              int w1,int o1, int w2,int o2, int log2WD, int bit_depth) { put_weighted_bipred_16(_dst,dststride,src1,src2,srcstride,
+                                                                                                                                                                width,height,
+                                                                                                                                                                w1,o1,w2,o2,log2WD,bit_depth); }
+*/
+
+
+// Bit-depth dispatcher for the averaging weighted prediction.
+// For bit_depth<=8 the destination is treated as uint8_t* and the 8-bit
+// function pointer is called (without bit_depth); otherwise the destination
+// is treated as uint16_t* and the 16-bit function pointer is called.
+inline void acceleration_functions::put_weighted_pred_avg(void* _dst, ptrdiff_t dststride,
+                                                          const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                          int width, int height, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_weighted_pred_avg_16(static_cast<uint16_t*>(_dst), dststride,
+                             src1, src2, srcstride, width, height, bit_depth);
+    return;
+  }
+
+  put_weighted_pred_avg_8(static_cast<uint8_t*>(_dst), dststride,
+                          src1, src2, srcstride, width, height);
+}
+
+
+// Bit-depth dispatcher for the unweighted prediction.
+// bit_depth<=8 selects the 8-bit function pointer (destination cast to
+// uint8_t*, bit_depth not forwarded); larger depths select the 16-bit one
+// (destination cast to uint16_t*).
+inline void acceleration_functions::put_unweighted_pred(void* _dst, ptrdiff_t dststride,
+                                                        const int16_t *src, ptrdiff_t srcstride,
+                                                        int width, int height, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_unweighted_pred_16(static_cast<uint16_t*>(_dst), dststride,
+                           src, srcstride, width, height, bit_depth);
+    return;
+  }
+
+  put_unweighted_pred_8(static_cast<uint8_t*>(_dst), dststride,
+                        src, srcstride, width, height);
+}
+
+
+// Bit-depth dispatcher for the explicit weighted prediction (single source).
+// The weighting parameters w, o and log2WD are forwarded unchanged;
+// bit_depth is only passed on to the 16-bit variant.
+inline void acceleration_functions::put_weighted_pred(void* _dst, ptrdiff_t dststride,
+                                                      const int16_t *src, ptrdiff_t srcstride,
+                                                      int width, int height,
+                                                      int w,int o,int log2WD, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_weighted_pred_16(static_cast<uint16_t*>(_dst), dststride,
+                         src, srcstride, width, height, w, o, log2WD, bit_depth);
+    return;
+  }
+
+  put_weighted_pred_8(static_cast<uint8_t*>(_dst), dststride,
+                      src, srcstride, width, height, w, o, log2WD);
+}
+
+
+// Bit-depth dispatcher for the explicit weighted bi-prediction (two sources).
+// The weighting parameters (w1,o1,w2,o2,log2WD) are forwarded unchanged;
+// bit_depth is only passed on to the 16-bit variant.
+inline void acceleration_functions::put_weighted_bipred(void* _dst, ptrdiff_t dststride,
+                                                        const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                                        int width, int height,
+                                                        int w1,int o1, int w2,int o2, int log2WD, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_weighted_bipred_16(static_cast<uint16_t*>(_dst), dststride,
+                           src1, src2, srcstride, width, height,
+                           w1, o1, w2, o2, log2WD, bit_depth);
+    return;
+  }
+
+  put_weighted_bipred_8(static_cast<uint8_t*>(_dst), dststride,
+                        src1, src2, srcstride, width, height,
+                        w1, o1, w2, o2, log2WD);
+}
+
+
+
+// Bit-depth dispatcher for put_hevc_epel_8 / put_hevc_epel_16.
+// The source pointer is interpreted as const uint8_t* for bit_depth<=8 and
+// as const uint16_t* otherwise. Note that put_hevc_epel_8 takes no
+// bit_depth argument, unlike the _h/_v/_hv 8-bit variants.
+inline void acceleration_functions::put_hevc_epel(int16_t *dst, ptrdiff_t dststride,
+                                                  const void *src, ptrdiff_t srcstride, int width, int height,
+                                                  int mx, int my, int16_t* mcbuffer, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_hevc_epel_16(dst, dststride, static_cast<const uint16_t*>(src), srcstride,
+                     width, height, mx, my, mcbuffer, bit_depth);
+    return;
+  }
+
+  put_hevc_epel_8(dst, dststride, static_cast<const uint8_t*>(src), srcstride,
+                  width, height, mx, my, mcbuffer);
+}
+
+// Bit-depth dispatcher for put_hevc_epel_h_8 / put_hevc_epel_h_16.
+// Both variants receive bit_depth; only the interpretation of the source
+// pointer (uint8_t vs. uint16_t samples) differs.
+inline void acceleration_functions::put_hevc_epel_h(int16_t *dst, ptrdiff_t dststride,
+                                                    const void *src, ptrdiff_t srcstride, int width, int height,
+                                                    int mx, int my, int16_t* mcbuffer, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_hevc_epel_h_16(dst, dststride, static_cast<const uint16_t*>(src), srcstride,
+                       width, height, mx, my, mcbuffer, bit_depth);
+    return;
+  }
+
+  put_hevc_epel_h_8(dst, dststride, static_cast<const uint8_t*>(src), srcstride,
+                    width, height, mx, my, mcbuffer, bit_depth);
+}
+
+// Bit-depth dispatcher for put_hevc_epel_v_8 / put_hevc_epel_v_16.
+// Both variants receive bit_depth; only the interpretation of the source
+// pointer (uint8_t vs. uint16_t samples) differs.
+inline void acceleration_functions::put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
+                                                    const void *src, ptrdiff_t srcstride, int width, int height,
+                                                    int mx, int my, int16_t* mcbuffer, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_hevc_epel_v_16(dst, dststride, static_cast<const uint16_t*>(src), srcstride,
+                       width, height, mx, my, mcbuffer, bit_depth);
+    return;
+  }
+
+  put_hevc_epel_v_8(dst, dststride, static_cast<const uint8_t*>(src), srcstride,
+                    width, height, mx, my, mcbuffer, bit_depth);
+}
+
+// Bit-depth dispatcher for put_hevc_epel_hv_8 / put_hevc_epel_hv_16.
+// Both variants receive bit_depth; only the interpretation of the source
+// pointer (uint8_t vs. uint16_t samples) differs.
+inline void acceleration_functions::put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
+                                                     const void *src, ptrdiff_t srcstride, int width, int height,
+                                                     int mx, int my, int16_t* mcbuffer, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_hevc_epel_hv_16(dst, dststride, static_cast<const uint16_t*>(src), srcstride,
+                        width, height, mx, my, mcbuffer, bit_depth);
+    return;
+  }
+
+  put_hevc_epel_hv_8(dst, dststride, static_cast<const uint8_t*>(src), srcstride,
+                     width, height, mx, my, mcbuffer, bit_depth);
+}
+
+// Bit-depth dispatcher for the put_hevc_qpel_8 / put_hevc_qpel_16 tables.
+// dX and dY index the [4][4] function-pointer table (dX first, then dY).
+// The 8-bit entries take no bit_depth argument; the 16-bit entries do.
+inline void acceleration_functions::put_hevc_qpel(int16_t *dst, ptrdiff_t dststride,
+                                                  const void *src, ptrdiff_t srcstride, int width, int height,
+                                                  int16_t* mcbuffer, int dX,int dY, int bit_depth) const
+{
+  if (bit_depth > 8) {
+    put_hevc_qpel_16[dX][dY](dst, dststride, static_cast<const uint16_t*>(src), srcstride,
+                             width, height, mcbuffer, bit_depth);
+    return;
+  }
+
+  put_hevc_qpel_8[dX][dY](dst, dststride, static_cast<const uint8_t*>(src), srcstride,
+                          width, height, mcbuffer);
+}
+
+// --- transform_skip: select the 8-bit or 16-bit function pointer by pixel type ---
+
+template <> inline void acceleration_functions::transform_skip<uint8_t>(uint8_t *dst, const int16_t *coeffs,
+                                                                        ptrdiff_t stride, int bit_depth) const
+{
+  // the 8-bit function takes no bit_depth argument
+  transform_skip_8(dst,coeffs,stride);
+}
+
+template <> inline void acceleration_functions::transform_skip<uint16_t>(uint16_t *dst, const int16_t *coeffs,
+                                                                         ptrdiff_t stride, int bit_depth) const
+{
+  transform_skip_16(dst,coeffs,stride, bit_depth);
+}
+
+
+// --- transform_skip_rdpcm_{v,h}: only 8-bit function pointers are called ---
+
+template <> inline void acceleration_functions::transform_skip_rdpcm_v<uint8_t>(uint8_t *dst, const int16_t *coeffs,
+                                                                                int nT, ptrdiff_t stride,
+                                                                                int bit_depth) const
+{
+  assert(bit_depth==8);
+  transform_skip_rdpcm_v_8(dst,coeffs,nT,stride);
+}
+
+template <> inline void acceleration_functions::transform_skip_rdpcm_h<uint8_t>(uint8_t *dst, const int16_t *coeffs,
+                                                                                int nT, ptrdiff_t stride,
+                                                                                int bit_depth) const
+{
+  assert(bit_depth==8);
+  transform_skip_rdpcm_h_8(dst,coeffs,nT,stride);
+}
+
+// The uint16_t variants only assert; nothing else is executed
+// (and nothing at all when assertions are compiled out).
+template <> inline void acceleration_functions::transform_skip_rdpcm_v<uint16_t>(uint16_t *dst, const int16_t *coeffs,
+                                                                                 int nT, ptrdiff_t stride,
+                                                                                 int bit_depth) const
+{
+  assert(false);
+  /*transform_skip_rdpcm_v_8(dst,coeffs,nT,stride);*/
+}
+
+template <> inline void acceleration_functions::transform_skip_rdpcm_h<uint16_t>(uint16_t *dst, const int16_t *coeffs,
+                                                                                 int nT, ptrdiff_t stride,
+                                                                                 int bit_depth) const
+{
+  assert(false);
+  /*transform_skip_rdpcm_h_8(dst,coeffs,nT,stride);*/
+}
+
+
+// --- transform_4x4_dst_add ---
+
+template <> inline void acceleration_functions::transform_4x4_dst_add<uint8_t>(uint8_t *dst, const int16_t *coeffs,
+                                                                               ptrdiff_t stride,int bit_depth) const
+{
+  transform_4x4_dst_add_8(dst,coeffs,stride);
+}
+
+template <> inline void acceleration_functions::transform_4x4_dst_add<uint16_t>(uint16_t *dst, const int16_t *coeffs,
+                                                                                ptrdiff_t stride,int bit_depth) const
+{
+  transform_4x4_dst_add_16(dst,coeffs,stride,bit_depth);
+}
+
+
+// --- transform_add: sizeIdx selects the entry of the 4-element function table ---
+
+template <> inline void acceleration_functions::transform_add<uint8_t>(int sizeIdx, uint8_t *dst, const int16_t *coeffs,
+                                                                       ptrdiff_t stride, int bit_depth) const
+{
+  transform_add_8[sizeIdx](dst,coeffs,stride);
+}
+
+template <> inline void acceleration_functions::transform_add<uint16_t>(int sizeIdx, uint16_t *dst, const int16_t *coeffs,
+                                                                        ptrdiff_t stride, int bit_depth) const
+{
+  transform_add_16[sizeIdx](dst,coeffs,stride,bit_depth);
+}
+
+
+// --- add_residual: both variants forward bit_depth ---
+
+template <> inline void acceleration_functions::add_residual(uint8_t *dst,  ptrdiff_t stride,
+                                                             const int32_t* r, int nT, int bit_depth) const
+{
+  add_residual_8(dst,stride,r,nT,bit_depth);
+}
+
+template <> inline void acceleration_functions::add_residual(uint16_t *dst, ptrdiff_t stride,
+                                                             const int32_t* r, int nT, int bit_depth) const
+{
+  add_residual_16(dst,stride,r,nT,bit_depth);
+}
+
+
 #endif
diff --git a/libde265/alloc_pool.cc b/libde265/alloc_pool.cc
new file mode 100644
index 0000000..da95e43
--- /dev/null
+++ b/libde265/alloc_pool.cc
@@ -0,0 +1,99 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/alloc_pool.h"
+#include "libde265/util.h"
+#include <assert.h>
+#include <stdio.h>
+
+#define DEBUG_MEMORY 1
+
+
+// Sets up a pool that hands out chunks of 'objSize' bytes.
+// 'poolSize' is the number of chunks held by each memory block and 'grow'
+// is stored for later use by new_obj(). The first memory block is
+// allocated right away.
+alloc_pool::alloc_pool(size_t objSize, int poolSize, bool grow)
+  : mObjSize(objSize), mPoolSize(poolSize), mGrow(grow)
+{
+  // pre-size the bookkeeping vectors
+  m_memBlocks.reserve(8);
+  m_freeList.reserve(poolSize);
+
+  add_memory_block();
+}
+
+
+// Allocates one more block holding mPoolSize chunks of mObjSize bytes and
+// appends every chunk to the free list. Chunks are appended from the highest
+// address down to the lowest, so the lowest-address chunk of the new block
+// ends up at the back of the free list.
+void alloc_pool::add_memory_block()
+{
+  uint8_t* block = new uint8_t[mObjSize * mPoolSize];
+  m_memBlocks.push_back(block);
+
+  for (int chunk = mPoolSize-1; chunk >= 0; chunk--) {
+    m_freeList.push_back(block + chunk * mObjSize);
+  }
+}
+
+// Releases every memory block owned by the pool.
+alloc_pool::~alloc_pool()
+{
+  for (size_t blk = 0; blk < m_memBlocks.size(); blk++) {
+    delete[] m_memBlocks[blk];
+  }
+}
+
+
+// Returns memory for one object of 'size' bytes.
+//  - If 'size' differs from the pool's object size, the request bypasses
+//    the pool and is served by the global operator new.
+//  - Otherwise a chunk is taken from the back of the free list. When the
+//    list is empty, a new block is added if the pool may grow; if it may
+//    not grow, NULL is returned.
+void* alloc_pool::new_obj(const size_t size)
+{
+  if (size != mObjSize) {
+    return ::operator new(size);
+  }
+
+  if (m_freeList.empty()) {
+    if (!mGrow) {
+      return NULL;
+    }
+
+    add_memory_block();
+    if (DEBUG_MEMORY) { fprintf(stderr,"additional block allocated in memory pool\n"); }
+  }
+
+  assert(!m_freeList.empty());
+
+  void* chunk = m_freeList.back();
+  m_freeList.pop_back();
+  return chunk;
+}
+
+
+// Gives back memory previously obtained from new_obj().
+// If 'obj' lies inside one of the pool's memory blocks, it is appended to the
+// free list for reuse. Otherwise it is assumed to come from the global
+// operator new (the size-mismatch path of new_obj()) and is released with the
+// global operator delete.
+void  alloc_pool::delete_obj(void* obj)
+{
+  // Keep the block size in size_t: mObjSize is a size_t, and storing the
+  // product in an int could truncate it for large pools, which would make
+  // the range test below misclassify pooled chunks.
+  const size_t memBlockSize = mObjSize * mPoolSize;
+
+  FOR_LOOP(uint8_t*, memBlk, m_memBlocks) {
+    if (memBlk <= obj && obj < memBlk + memBlockSize) {
+      m_freeList.push_back(obj);
+      return;
+    }
+  }
+
+  // not part of any pool block
+  ::operator delete(obj);
+}
diff --git a/libde265/scan.h b/libde265/alloc_pool.h
similarity index 55%
copy from libde265/scan.h
copy to libde265/alloc_pool.h
index 7a8b977..fd3ada3 100644
--- a/libde265/scan.h
+++ b/libde265/alloc_pool.h
@@ -1,6 +1,8 @@
 /*
  * H.265 video codec.
- * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ * Copyright (c) 2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
  *
  * This file is part of libde265.
  *
@@ -18,26 +20,37 @@
  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef DE265_SCAN_H
-#define DE265_SCAN_H
+#ifndef ALLOC_POOL_H
+#define ALLOC_POOL_H
 
+#include <vector>
+#include <cstddef>
+#ifdef HAVE_STDINT_H
 #include <stdint.h>
+#else
+#include <cstdint>
+#endif
 
-typedef struct {
-  uint8_t x,y;
-} position;
 
-typedef struct {
-  uint8_t subBlock;
-  uint8_t scanPos;
-} scan_position;
+class alloc_pool
+{
+ public:
+  alloc_pool(size_t objSize, int poolSize=1000, bool grow=true);
+  ~alloc_pool();
 
-void init_scan_orders();
+  void* new_obj(const size_t size);
+  void  delete_obj(void*);
+  void  purge();
 
-/* scanIdx: 0 - diag, 1 - horiz, 2 - verti
- */
-const position* get_scan_order(int log2BlockSize, int scanIdx);
+ private:
+  size_t mObjSize;
+  int    mPoolSize;
+  bool   mGrow;
+
+  std::vector<uint8_t*> m_memBlocks;
+  std::vector<void*>    m_freeList;
 
-scan_position get_scan_position(int x,int y, int scanIdx, int log2BlkSize);
+  void add_memory_block();
+};
 
 #endif
diff --git a/libde265/arm/Makefile.am b/libde265/arm/Makefile.am
new file mode 100644
index 0000000..9ef62d9
--- /dev/null
+++ b/libde265/arm/Makefile.am
@@ -0,0 +1,38 @@
+noinst_LTLIBRARIES = libde265_arm.la
+
+libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY)
+libde265_arm_la_SOURCES = arm.cc arm.h
+libde265_arm_la_LIBADD =
+
+if HAVE_VISIBILITY
+	libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY
+endif
+
+
+if ENABLE_NEON_OPT
+# NEON specific functions
+
+noinst_LTLIBRARIES += libde265_arm_neon.la
+libde265_arm_la_LIBADD += libde265_arm_neon.la
+libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY)
+libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \
+	-DHAVE_NEON \
+	-DEXTERN_ASM= \
+	-DHAVE_AS_FUNC \
+	-DHAVE_SECTION_DATA_REL_RO
+
+if ENABLE_ARM_THUMB
+	libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB
+endif
+
+libde265_arm_neon_la_SOURCES = \
+	asm.S \
+	cpudetect.S \
+	hevcdsp_qpel_neon.S \
+	neon.S
+
+if HAVE_VISIBILITY
+	libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY
+endif
+
+endif
diff --git a/libde265/x86/Makefile.in b/libde265/arm/Makefile.in
similarity index 64%
copy from libde265/x86/Makefile.in
copy to libde265/arm/Makefile.in
index c982db6..6a8a8dd 100644
--- a/libde265/x86/Makefile.in
+++ b/libde265/arm/Makefile.in
@@ -79,18 +79,21 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
- at HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY
- at HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY
-subdir = libde265/x86
+
+# NEON specific functions
+ at ENABLE_NEON_OPT_TRUE@am__append_1 = libde265_arm_neon.la
+ at ENABLE_NEON_OPT_TRUE@am__append_2 = libde265_arm_neon.la
+subdir = libde265/arm
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -98,25 +101,27 @@ CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
-libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la
-am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo
-libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS)
+libde265_arm_la_DEPENDENCIES = $(am__append_2)
+am_libde265_arm_la_OBJECTS = libde265_arm_la-arm.lo
+libde265_arm_la_OBJECTS = $(am_libde265_arm_la_OBJECTS)
 AM_V_lt = $(am__v_lt_ at AM_V@)
 am__v_lt_ = $(am__v_lt_ at AM_DEFAULT_V@)
 am__v_lt_0 = --silent
 am__v_lt_1 = 
-libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
-libde265_x86_sse_la_LIBADD =
-am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \
-	libde265_x86_sse_la-sse-dct.lo
-libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS)
-libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+libde265_arm_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+libde265_arm_neon_la_LIBADD =
+am__libde265_arm_neon_la_SOURCES_DIST = asm.S cpudetect.S \
+	hevcdsp_qpel_neon.S neon.S
+ at ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_OBJECTS =  \
+ at ENABLE_NEON_OPT_TRUE@	libde265_arm_neon_la-asm.lo \
+ at ENABLE_NEON_OPT_TRUE@	libde265_arm_neon_la-cpudetect.lo \
+ at ENABLE_NEON_OPT_TRUE@	libde265_arm_neon_la-hevcdsp_qpel_neon.lo \
+ at ENABLE_NEON_OPT_TRUE@	libde265_arm_neon_la-neon.lo
+libde265_arm_neon_la_OBJECTS = $(am_libde265_arm_neon_la_OBJECTS)
+ at ENABLE_NEON_OPT_TRUE@am_libde265_arm_neon_la_rpath =
 AM_V_P = $(am__v_P_ at AM_V@)
 am__v_P_ = $(am__v_P_ at AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -133,6 +138,16 @@ DEFAULT_INCLUDES = -I. at am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
+CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
+	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CCASFLAGS) $(CCASFLAGS)
+AM_V_CPPAS = $(am__v_CPPAS_ at AM_V@)
+am__v_CPPAS_ = $(am__v_CPPAS_ at AM_DEFAULT_V@)
+am__v_CPPAS_0 = @echo "  CPPAS   " $@;
+am__v_CPPAS_1 = 
 CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
 	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
 LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
@@ -169,9 +184,9 @@ AM_V_CCLD = $(am__v_CCLD_ at AM_V@)
 am__v_CCLD_ = $(am__v_CCLD_ at AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo "  CCLD    " $@;
 am__v_CCLD_1 = 
-SOURCES = $(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES)
-DIST_SOURCES = $(libde265_x86_la_SOURCES) \
-	$(libde265_x86_sse_la_SOURCES)
+SOURCES = $(libde265_arm_la_SOURCES) $(libde265_arm_neon_la_SOURCES)
+DIST_SOURCES = $(libde265_arm_la_SOURCES) \
+	$(am__libde265_arm_neon_la_SOURCES_DIST)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -198,6 +213,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +222,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +246,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -339,19 +357,27 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-noinst_LTLIBRARIES = libde265_x86.la  libde265_x86_sse.la
-libde265_x86_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) $(am__append_1)
-libde265_x86_la_SOURCES = sse.cc sse.h
-libde265_x86_la_LIBADD = libde265_x86_sse.la
-
-# SSE4 specific functions
-libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I.. $(CFLAG_VISIBILITY) \
-	$(am__append_2)
-libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
+noinst_LTLIBRARIES = libde265_arm.la $(am__append_1)
+libde265_arm_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY)
+libde265_arm_la_SOURCES = arm.cc arm.h
+libde265_arm_la_LIBADD = $(am__append_2)
+ at ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CXXFLAGS = -mfpu=neon -I.. $(CFLAG_VISIBILITY)
+ at ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_CCASFLAGS = -mfpu=neon -I.. \
+ at ENABLE_NEON_OPT_TRUE@	-DHAVE_NEON \
+ at ENABLE_NEON_OPT_TRUE@	-DEXTERN_ASM= \
+ at ENABLE_NEON_OPT_TRUE@	-DHAVE_AS_FUNC \
+ at ENABLE_NEON_OPT_TRUE@	-DHAVE_SECTION_DATA_REL_RO
+
+ at ENABLE_NEON_OPT_TRUE@libde265_arm_neon_la_SOURCES = \
+ at ENABLE_NEON_OPT_TRUE@	asm.S \
+ at ENABLE_NEON_OPT_TRUE@	cpudetect.S \
+ at ENABLE_NEON_OPT_TRUE@	hevcdsp_qpel_neon.S \
+ at ENABLE_NEON_OPT_TRUE@	neon.S
+
 all: all-am
 
 .SUFFIXES:
-.SUFFIXES: .cc .lo .o .obj
+.SUFFIXES: .S .cc .lo .o .obj
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
@@ -361,9 +387,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/arm/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu libde265/x86/Makefile
+	  $(AUTOMAKE) --gnu libde265/arm/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -394,11 +420,11 @@ clean-noinstLTLIBRARIES:
 	  rm -f $${locs}; \
 	}
 
-libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_la_LINK)  $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS)
+libde265_arm.la: $(libde265_arm_la_OBJECTS) $(libde265_arm_la_DEPENDENCIES) $(EXTRA_libde265_arm_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libde265_arm_la_LINK)  $(libde265_arm_la_OBJECTS) $(libde265_arm_la_LIBADD) $(LIBS)
 
-libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_sse_la_LINK)  $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS)
+libde265_arm_neon.la: $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_DEPENDENCIES) $(EXTRA_libde265_arm_neon_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(LINK) $(am_libde265_arm_neon_la_rpath) $(libde265_arm_neon_la_OBJECTS) $(libde265_arm_neon_la_LIBADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -406,9 +432,60 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_x86_la-sse.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo at am__quote@
- at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_arm_la-arm.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_arm_neon_la-asm.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo at am__quote@
+ at AMDEP_TRUE@@am__include@ @am__quote at ./$(DEPDIR)/libde265_arm_neon_la-neon.Plo at am__quote@
+
+.S.o:
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(CPPASCOMPILE) -c -o $@ $<
+
+.S.obj:
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(CPPASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.S.lo:
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(LTCPPASCOMPILE) -c -o $@ $<
+
+libde265_arm_neon_la-asm.lo: asm.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-asm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-asm.Tpo -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-asm.Tpo $(DEPDIR)/libde265_arm_neon_la-asm.Plo
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='asm.S' object='libde265_arm_neon_la-asm.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-asm.lo `test -f 'asm.S' || echo '$(srcdir)/'`asm.S
+
+libde265_arm_neon_la-cpudetect.lo: cpudetect.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-cpudetect.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-cpudetect.Tpo $(DEPDIR)/libde265_arm_neon_la-cpudetect.Plo
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='cpudetect.S' object='libde265_arm_neon_la-cpudetect.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-cpudetect.lo `test -f 'cpudetect.S' || echo '$(srcdir)/'`cpudetect.S
+
+libde265_arm_neon_la-hevcdsp_qpel_neon.lo: hevcdsp_qpel_neon.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-hevcdsp_qpel_neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Tpo $(DEPDIR)/libde265_arm_neon_la-hevcdsp_qpel_neon.Plo
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='hevcdsp_qpel_neon.S' object='libde265_arm_neon_la-hevcdsp_qpel_neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-hevcdsp_qpel_neon.lo `test -f 'hevcdsp_qpel_neon.S' || echo '$(srcdir)/'`hevcdsp_qpel_neon.S
+
+libde265_arm_neon_la-neon.lo: neon.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_CPPAS)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -MT libde265_arm_neon_la-neon.lo -MD -MP -MF $(DEPDIR)/libde265_arm_neon_la-neon.Tpo -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S
+ at am__fastdepCCAS_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_neon_la-neon.Tpo $(DEPDIR)/libde265_arm_neon_la-neon.Plo
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	$(AM_V_CPPAS)source='neon.S' object='libde265_arm_neon_la-neon.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCCAS_FALSE@	DEPDIR=$(DEPDIR) $(CCASDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCCAS_FALSE@	$(AM_V_CPPAS at am__nodep@)$(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_neon_la_CCASFLAGS) $(CCASFLAGS) -c -o libde265_arm_neon_la-neon.lo `test -f 'neon.S' || echo '$(srcdir)/'`neon.S
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -431,26 +508,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-libde265_x86_la-sse.lo: sse.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
-
-libde265_x86_sse_la-sse-motion.lo: sse-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
-
-libde265_x86_sse_la-sse-dct.lo: sse-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_arm_la-arm.lo: arm.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_arm_la-arm.lo -MD -MP -MF $(DEPDIR)/libde265_arm_la-arm.Tpo -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_arm_la-arm.Tpo $(DEPDIR)/libde265_arm_la-arm.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='arm.cc' object='libde265_arm_la-arm.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_arm_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_arm_la-arm.lo `test -f 'arm.cc' || echo '$(srcdir)/'`arm.cc
 
 mostlyclean-libtool:
 	-rm -f *.lo
@@ -662,6 +725,12 @@ uninstall-am:
 	tags tags-am uninstall uninstall-am
 
 
+ at HAVE_VISIBILITY_TRUE@	libde265_arm_la_CXXFLAGS += -DHAVE_VISIBILITY
+
+ at ENABLE_ARM_THUMB_TRUE@@ENABLE_NEON_OPT_TRUE@	libde265_arm_neon_la_CCASFLAGS += -DCONFIG_THUMB
+
+ at ENABLE_NEON_OPT_TRUE@@HAVE_VISIBILITY_TRUE@	libde265_arm_neon_la_CXXFLAGS += -DHAVE_VISIBILITY
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
diff --git a/libde265/arm/arm.cc b/libde265/arm/arm.cc
new file mode 100644
index 0000000..9791f15
--- /dev/null
+++ b/libde265/arm/arm.cc
@@ -0,0 +1,123 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2015 struktur AG, Joachim Bauch <bauch at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "arm.h"
+
+#ifdef HAVE_NEON
+/*
+ * QPEL_FUNC(name) declares the external C-linkage routine ff_<name> and
+ * defines libde265_<name>, a thin wrapper that forwards to it.
+ *
+ * The wrapper receives (width, height, mcbuffer) while ff_<name> is declared
+ * with (height, width): the two size arguments are passed on in swapped
+ * order and mcbuffer is not forwarded.
+ */
+#define QPEL_FUNC(name) \
+    extern "C" void ff_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \
+                                   int height, int width); \
+    void libde265_##name(int16_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, \
+                                   int width, int height, int16_t* mcbuffer) { \
+      ff_##name(dst, dststride, src, srcstride, height, width); \
+    }
+/* One wrapper per filter variant: vertical (vN), horizontal (hN) and combined (hNvM). */
+QPEL_FUNC(hevc_put_qpel_v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h1v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h2v3_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v1_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v2_neon_8);
+QPEL_FUNC(hevc_put_qpel_h3v3_neon_8);
+#undef QPEL_FUNC
+
+#if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H)
+
+#include <signal.h>
+#include <setjmp.h>
+
+extern "C" void libde265_detect_neon(void);
+
+static jmp_buf jump_env;
+
+static void sighandler(int sig) {
+  (void)sig;
+  longjmp(jump_env, 1);
+}
+
+static bool has_NEON() {
+  static bool checked_NEON = false;
+  static bool have_NEON = false;
+
+  if (!checked_NEON) {
+    void (*oldsignal)(int);
+
+    checked_NEON = true;
+    oldsignal = signal(SIGILL, sighandler);
+    if (setjmp(jump_env)) {
+      signal(SIGILL, oldsignal);
+      have_NEON = false;
+      return false;
+    }
+    libde265_detect_neon();
+    signal(SIGILL, oldsignal);
+    have_NEON = true;
+  }
+
+  return have_NEON;
+}
+
+#else  // #if defined(HAVE_SIGNAL_H) && defined(HAVE_SETJMP_H)
+
+#warning "Don't know how to detect NEON support at runtime- will be disabled"
+/* Fallback used when <signal.h> or <setjmp.h> is unavailable: NEON is always reported as absent. */
+static bool has_NEON() {
+  return false;
+}
+
+#endif
+
+#endif  // #ifdef HAVE_NEON
+/*
+ * Installs the ARM NEON quarter-pel wrappers into accel.
+ *
+ * Only when HAVE_NEON is defined and has_NEON() returns true are the
+ * put_hevc_qpel_8 entries overwritten; otherwise accel is left untouched.
+ * The first index matches the hN part of the routine name and the second
+ * index the vN part. Entry [0][0] is never written here.
+ */
+void init_acceleration_functions_arm(struct acceleration_functions* accel)
+{
+#ifdef HAVE_NEON
+  if (has_NEON()) {
+    accel->put_hevc_qpel_8[0][1] = libde265_hevc_put_qpel_v1_neon_8;
+    accel->put_hevc_qpel_8[0][2] = libde265_hevc_put_qpel_v2_neon_8;
+    accel->put_hevc_qpel_8[0][3] = libde265_hevc_put_qpel_v3_neon_8;
+    accel->put_hevc_qpel_8[1][0] = libde265_hevc_put_qpel_h1_neon_8;
+    accel->put_hevc_qpel_8[1][1] = libde265_hevc_put_qpel_h1v1_neon_8;
+    accel->put_hevc_qpel_8[1][2] = libde265_hevc_put_qpel_h1v2_neon_8;
+    accel->put_hevc_qpel_8[1][3] = libde265_hevc_put_qpel_h1v3_neon_8;
+    accel->put_hevc_qpel_8[2][0] = libde265_hevc_put_qpel_h2_neon_8;
+    accel->put_hevc_qpel_8[2][1] = libde265_hevc_put_qpel_h2v1_neon_8;
+    accel->put_hevc_qpel_8[2][2] = libde265_hevc_put_qpel_h2v2_neon_8;
+    accel->put_hevc_qpel_8[2][3] = libde265_hevc_put_qpel_h2v3_neon_8;
+    accel->put_hevc_qpel_8[3][0] = libde265_hevc_put_qpel_h3_neon_8;
+    accel->put_hevc_qpel_8[3][1] = libde265_hevc_put_qpel_h3v1_neon_8;
+    accel->put_hevc_qpel_8[3][2] = libde265_hevc_put_qpel_h3v2_neon_8;
+    accel->put_hevc_qpel_8[3][3] = libde265_hevc_put_qpel_h3v3_neon_8;
+  }
+#endif  // #ifdef HAVE_NEON
+}
diff --git a/libde265/x86/sse.h b/libde265/arm/arm.h
similarity index 78%
copy from libde265/x86/sse.h
copy to libde265/arm/arm.h
index d4663d0..d64172a 100644
--- a/libde265/x86/sse.h
+++ b/libde265/arm/arm.h
@@ -1,6 +1,6 @@
 /*
  * H.265 video codec.
- * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ * Copyright (c) 2013-2015 struktur AG, Joachim Bauch <bauch at struktur.de>
  *
  * This file is part of libde265.
  *
@@ -18,11 +18,11 @@
  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef DE265_SSE_H
-#define DE265_SSE_H
+#ifndef LIBDE265_ARM_H
+#define LIBDE265_ARM_H
 
 #include "acceleration.h"
 
-void init_acceleration_functions_sse(struct acceleration_functions* accel);
+void init_acceleration_functions_arm(struct acceleration_functions* accel);
 
-#endif
+#endif  // LIBDE265_ARM_H
diff --git a/libde265/arm/asm.S b/libde265/arm/asm.S
new file mode 100644
index 0000000..1d0e5a9
--- /dev/null
+++ b/libde265/arm/asm.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+/*
+ * Line-prefix macros. Each one expands either to nothing or to "@", which
+ * starts a comment for the assembler, so a line that begins with the prefix
+ * is only assembled in the matching configuration:
+ *   ELF  - __ELF__ is defined
+ *   T    - CONFIG_THUMB is non-zero
+ *   A    - CONFIG_THUMB is zero
+ *   FUNC - HAVE_AS_FUNC is non-zero
+ */
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+#if CONFIG_THUMB
+#   define A @
+#   define T
+#else
+#   define A
+#   define T @
+#endif
+
+#if HAVE_AS_FUNC
+#   define FUNC
+#else
+#   define FUNC @
+#endif
+/* Target architecture: the first enabled feature in the chain below wins. */
+#if   HAVE_NEON
+        .arch           armv7-a
+#elif HAVE_ARMV6T2
+        .arch           armv6t2
+#elif HAVE_ARMV6
+        .arch           armv6
+#elif HAVE_ARMV5TE
+        .arch           armv5te
+#endif
+/* Floating-point unit: NEON is checked before plain VFP. */
+#if   HAVE_NEON
+        .fpu            neon
+#elif HAVE_VFP
+        .fpu            vfp
+#endif
+/* Unified syntax always; .thumb and the ELF-only directives depend on their prefix macro. */
+        .syntax unified
+T       .thumb
+ELF     .eabi_attribute 25, 1           @ Tag_ABI_align_preserved
+ELF     .section .note.GNU-stack,"",%progbits @ Mark stack as non-executable
+/*
+ * function name, export=0, align=2
+ *
+ * Opens a function in .text with the requested alignment and resets the
+ * per-function PIC bookkeeping symbols .Lpic_idx and .Lpic_gp to 0.
+ * With a non-zero export the label is the name prefixed with EXTERN_ASM and
+ * declared .global; otherwise the plain name is used as the label.
+ *
+ * It also defines a one-shot endfunc macro that emits the queued PIC words
+ * (only when .Lpic_idx is non-zero), the ELF .size of the plain name and
+ * .endfunc, and then purges itself so the next function can define it again.
+ */
+.macro  function name, export=0, align=2
+        .set            .Lpic_idx, 0
+        .set            .Lpic_gp, 0
+    .macro endfunc
+      .if .Lpic_idx
+        .align          2
+        .altmacro
+        put_pic         %(.Lpic_idx - 1)
+        .noaltmacro
+      .endif
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .text
+        .align          \align
+    .if \export
+        .global EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+    .else
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+    .endif
+.endm
+/*
+ * const name, align=2, relocate=0
+ *
+ * Opens a data object labelled name. It is placed in .data.rel.ro when both
+ * HAVE_SECTION_DATA_REL_RO and relocate are non-zero, in .rodata otherwise.
+ * Defines a one-shot endconst macro that emits the ELF .size of the object
+ * and then purges itself.
+ */
+.macro  const   name, align=2, relocate=0
+    .macro endconst
+ELF     .size   \name, . - \name
+        .purgem endconst
+    .endm
+.if HAVE_SECTION_DATA_REL_RO && \relocate
+        .section        .data.rel.ro
+.else
+        .section        .rodata
+.endif
+        .align          \align
+\name:
+.endm
+/* Replacement for movw when HAVE_ARMV6T2_EXTERNAL is zero: mov of the low 8 bits of val, then orr of the remaining bits. */
+#if !HAVE_ARMV6T2_EXTERNAL
+.macro  movw    rd, val
+        mov     \rd, \val &  255
+        orr     \rd, \val & ~255
+.endm
+#endif
+/*
+ * mov32 rd, val - load the constant val into rd.
+ * With HAVE_ARMV6T2_EXTERNAL: movw of the low 16 bits, followed by movt of
+ * the upper 16 bits only when they are non-zero. Otherwise the assembler
+ * "ldr rd, =val" form is used.
+ */
+.macro  mov32   rd, val
+#if HAVE_ARMV6T2_EXTERNAL
+        movw            \rd, #(\val) & 0xffff
+    .if (\val) >> 16
+        movt            \rd, #(\val) >> 16
+    .endif
+#else
+        ldr             \rd, =\val
+#endif
+.endm
+/* put_pic num - expand the queued entry macro put_pic_<num>. */
+.macro  put_pic         num
+        put_pic_\num
+.endm
+/*
+ * do_def_pic num, val, label - define the one-shot macro put_pic_<num>.
+ * When expanded it first expands entry num-1 (unless num is 0), then emits
+ * "label: .word val" and finally purges itself.
+ */
+.macro  do_def_pic      num, val, label
+    .macro put_pic_\num
+      .if \num
+        .altmacro
+        put_pic         %(\num - 1)
+        .noaltmacro
+      .endif
+\label: .word           \val
+        .purgem         put_pic_\num
+    .endm
+.endm
+/* def_pic val, label - queue a word under the current value of .Lpic_idx, then increment .Lpic_idx. */
+.macro  def_pic         val, label
+        .altmacro
+        do_def_pic      %.Lpic_idx, \val, \label
+        .noaltmacro
+        .set            .Lpic_idx, .Lpic_idx + 1
+.endm
+/*
+ * ldpic rd, val, indir=0
+ *
+ * Loads a pc-relative offset from a word queued with def_pic and adds pc to
+ * it. The queued word is val minus (the local .Lpic label plus
+ * 8 >> CONFIG_THUMB). With a non-zero indir the word stored at the
+ * resulting address is loaded into rd instead of the address itself.
+ */
+.macro  ldpic           rd,  val, indir=0
+        ldr             \rd, .Lpicoff\@
+.Lpic\@:
+    .if \indir
+A       ldr             \rd, [pc, \rd]
+T       add             \rd, pc
+T       ldr             \rd, [\rd]
+    .else
+        add             \rd, pc
+    .endif
+        def_pic         \val - (.Lpic\@ + (8 >> CONFIG_THUMB)), .Lpicoff\@
+.endm
+/*
+ * movrel rd, val - load the address of val into rd.
+ * CONFIG_PIC: pc-relative through ldpic. Otherwise, with
+ * HAVE_ARMV6T2_EXTERNAL and not on __APPLE__: a movw/movt pair using
+ * :lower16: and :upper16:. In all other cases: "ldr rd, =val".
+ */
+.macro  movrel rd, val
+#if CONFIG_PIC
+        ldpic           \rd, \val
+#elif HAVE_ARMV6T2_EXTERNAL && !defined(__APPLE__)
+        movw            \rd, #:lower16:\val
+        movt            \rd, #:upper16:\val
+#else
+        ldr             \rd, =\val
+#endif
+.endm
+/*
+ * movrelx rd, val, gp - load the address of val into rd.
+ *
+ * CONFIG_PIC on ELF: the register alias gp is pointed at
+ * _GLOBAL_OFFSET_TABLE_ (the register given as third argument, or r12 when
+ * none is given and .Lpic_gp is still 0), .Lpic_gp is set to 1, and rd is
+ * loaded from [gp + offset] where the offset word is val(GOT).
+ * CONFIG_PIC on __APPLE__: an indirect ldpic through a
+ * .non_lazy_symbol_pointer entry for val.
+ * Otherwise: same as movrel.
+ */
+.macro  movrelx         rd,  val, gp
+#if CONFIG_PIC && defined(__ELF__)
+    .ifnb \gp
+      .if .Lpic_gp
+        .unreq          gp
+      .endif
+        gp      .req    \gp
+        ldpic           gp,  _GLOBAL_OFFSET_TABLE_
+    .elseif !.Lpic_gp
+        gp      .req    r12
+        ldpic           gp,  _GLOBAL_OFFSET_TABLE_
+    .endif
+        .set            .Lpic_gp, 1
+        ldr             \rd, .Lpicoff\@
+        ldr             \rd, [gp, \rd]
+        def_pic         \val(GOT), .Lpicoff\@
+#elif CONFIG_PIC && defined(__APPLE__)
+        ldpic           \rd, .Lpic\@, indir=1
+        .non_lazy_symbol_pointer
+.Lpic\@:
+        .indirect_symbol \val
+        .word           0
+        .text
+#else
+        movrel          \rd, \val
+#endif
+.endm
+/*
+ * Addressing helpers. In each macro the line prefixed with A is the ARM-mode
+ * form and the lines prefixed with T are the Thumb-mode replacement sequence.
+ *
+ * add_sh: rd = rn + (rm shifted by sh). The Thumb sequence shifts rm in place first.
+ */
+.macro  add_sh          rd,  rn,  rm,  sh:vararg
+A       add             \rd, \rn, \rm, \sh
+T       mov             \rm, \rm, \sh
+T       add             \rd, \rn, \rm
+.endm
+/* ldr_pre: rn += rm, then rt = word at [rn]. */
+.macro  ldr_pre         rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       ldr             \rt, [\rn]
+.endm
+/* ldr_dpre: rn -= rm, then rt = word at [rn]. */
+.macro  ldr_dpre        rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn, -\rm]!
+T       sub             \rn, \rn, \rm
+T       ldr             \rt, [\rn]
+.endm
+/* ldr_nreg: rt = word at [rn - rm]; rn is not written. The Thumb sequence builds the address in rt. */
+.macro  ldr_nreg        rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn, -\rm]
+T       sub             \rt, \rn, \rm
+T       ldr             \rt, [\rt]
+.endm
+/* ldr_post: rt = word at [rn], then rn += rm. */
+.macro  ldr_post        rt,  rn,  rm:vararg
+A       ldr             \rt, [\rn], \rm
+T       ldr             \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* ldrc_pre: ldr_pre executed under condition cc; the Thumb sequence wraps both instructions in an itt block. */
+.macro  ldrc_pre        cc,  rt,  rn,  rm:vararg
+A       ldr\cc          \rt, [\rn, \rm]!
+T       itt             \cc
+T       add\cc          \rn, \rn, \rm
+T       ldr\cc          \rt, [\rn]
+.endm
+/* ldrd_reg: rt, rt2 = doubleword at [rn + rm]; rn is not written. The Thumb sequence builds the address in rt. */
+.macro  ldrd_reg        rt,  rt2, rn,  rm
+A       ldrd            \rt, \rt2, [\rn, \rm]
+T       add             \rt, \rn, \rm
+T       ldrd            \rt, \rt2, [\rt]
+.endm
+/* ldrd_post: rt, rt2 = doubleword at [rn], then rn += rm. */
+.macro  ldrd_post       rt,  rt2, rn,  rm
+A       ldrd            \rt, \rt2, [\rn], \rm
+T       ldrd            \rt, \rt2, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* ldrh_pre: rn += rm, then rt = halfword at [rn]. */
+.macro  ldrh_pre        rt,  rn,  rm
+A       ldrh            \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       ldrh            \rt, [\rn]
+.endm
+/* ldrh_dpre: rn -= rm, then rt = halfword at [rn]. */
+.macro  ldrh_dpre       rt,  rn,  rm
+A       ldrh            \rt, [\rn, -\rm]!
+T       sub             \rn, \rn, \rm
+T       ldrh            \rt, [\rn]
+.endm
+/* ldrh_post: rt = halfword at [rn], then rn += rm. */
+.macro  ldrh_post       rt,  rn,  rm
+A       ldrh            \rt, [\rn], \rm
+T       ldrh            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* ldrb_post: rt = byte at [rn], then rn += rm. */
+.macro  ldrb_post       rt,  rn,  rm
+A       ldrb            \rt, [\rn], \rm
+T       ldrb            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/*
+ * Store helpers. In each macro the line prefixed with A is the ARM-mode form
+ * and the lines prefixed with T are the Thumb-mode replacement sequence.
+ *
+ * str_post: store word rt at [rn], then rn += rm.
+ */
+.macro  str_post       rt,  rn,  rm:vararg
+A       str             \rt, [\rn], \rm
+T       str             \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* strb_post: store byte rt at [rn], then rn += rm. */
+.macro  strb_post       rt,  rn,  rm:vararg
+A       strb            \rt, [\rn], \rm
+T       strb            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* strd_post: store doubleword rt, rt2 at [rn], then rn += rm. */
+.macro  strd_post       rt,  rt2, rn,  rm
+A       strd            \rt, \rt2, [\rn], \rm
+T       strd            \rt, \rt2, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* strh_pre: rn += rm, then store halfword rt at [rn]. */
+.macro  strh_pre        rt,  rn,  rm
+A       strh            \rt, [\rn, \rm]!
+T       add             \rn, \rn, \rm
+T       strh            \rt, [\rn]
+.endm
+/* strh_dpre: rn -= rm, then store halfword rt at [rn]. */
+.macro  strh_dpre       rt,  rn,  rm
+A       strh            \rt, [\rn, -\rm]!
+T       sub             \rn, \rn, \rm
+T       strh            \rt, [\rn]
+.endm
+/* strh_post: store halfword rt at [rn], then rn += rm. */
+.macro  strh_post       rt,  rn,  rm
+A       strh            \rt, [\rn], \rm
+T       strh            \rt, [\rn]
+T       add             \rn, \rn, \rm
+.endm
+/* strh_dpost: store halfword rt at [rn], then rn -= rm. */
+.macro  strh_dpost       rt,  rn,  rm
+A       strh            \rt, [\rn], -\rm
+T       strh            \rt, [\rn]
+T       sub             \rn, \rn, \rm
+.endm
+/*
+ * VFP / NOVFP line prefixes, selected by HAVE_VFP_ARGS: exactly one of the
+ * two expands to nothing, the other to the "@" comment character. With
+ * HAVE_VFP_ARGS set, ELF builds additionally emit .eabi_attribute 28, 1.
+ */
+#if HAVE_VFP_ARGS
+ELF     .eabi_attribute 28, 1
+#   define VFP
+#   define NOVFP @
+#else
+#   define VFP   @
+#   define NOVFP
+#endif
+/* X(s) pastes the EXTERN_ASM prefix in front of s; JOIN expands its arguments before GLUE pastes them. */
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
diff --git a/libde265/fallback.h b/libde265/arm/cpudetect.S
similarity index 70%
copy from libde265/fallback.h
copy to libde265/arm/cpudetect.S
index 4b0b83c..45600a8 100644
--- a/libde265/fallback.h
+++ b/libde265/arm/cpudetect.S
@@ -1,6 +1,6 @@
 /*
  * H.265 video codec.
- * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ * Copyright (c) 2013-2015 struktur AG, Joachim Bauch <bauch at struktur.de>
  *
  * This file is part of libde265.
  *
@@ -18,11 +18,12 @@
  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef DE265_FALLBACK_H
-#define DE265_FALLBACK_H
+#include "asm.S"
+#include "neon.S"
 
-#include "acceleration.h"
-
-void init_acceleration_functions_fallback(struct acceleration_functions* lowlevel);
-
-#endif
+// we execute a simple NEON instruction and check if SIGILL is triggered to
+// detect if the CPU supports NEON code
+function libde265_detect_neon, export=1
+    vand q0, q0, q0              // q0 & q0 -> q0 leaves q0 unchanged; the instruction is only a NEON probe (no SIGILL handling is done in this function)
+    bx lr                        // reached only if the probe instruction executed
+endfunc
diff --git a/libde265/arm/hevcdsp_qpel_neon.S b/libde265/arm/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000..4e438a9
--- /dev/null
+++ b/libde265/arm/hevcdsp_qpel_neon.S
@@ -0,0 +1,1004 @@
+/*
+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi at vtt.fi>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * This is commit 63ca0fe8288dbd300c9bb814cb671e5d889f691c from
+ * https://github.com/FFmpeg/FFmpeg/blob/master/libavcodec/arm/hevcdsp_qpel_neon.S
+ */
+
+#include "asm.S"
+#include "neon.S"
+
+#define MAX_PB_SIZE #64
+
+.macro regshuffle_d8     // slide the 8-row window of 8-bit rows by one: d16..d22 <- d17..d23
+    vmov d16, d17        // ascending order matters: every source is read before it is overwritten
+    vmov d17, d18
+    vmov d18, d19
+    vmov d19, d20
+    vmov d20, d21
+    vmov d21, d22
+    vmov d22, d23        // d23 itself is left unchanged; callers in this file reload it with the next row
+.endm
+
+.macro regshuffle_q8     // slide the 8-row window of 16-bit rows by one: q0..q6 <- q1..q7
+    vmov q0, q1          // ascending order matters: every source is read before it is overwritten
+    vmov q1, q2
+    vmov q2, q3
+    vmov q3, q4
+    vmov q4, q5
+    vmov q5, q6
+    vmov q6, q7          // q7 itself is left unchanged; callers in this file recompute it from the next row
+.endm
+
+.macro vextin8                               // load one row and fan it out into 8 byte-shifted windows d16..d23 for a horizontal 8-tap filter
+        pld       [r2]
+        vld1.8    {q11}, [r2], r3            // q11 = d22:d23 = 16 bytes at r2; r2 advances by one row (r3)
+        vext.8    d16, d22, d23, #1          // d16 = bytes 1..8 of the row
+        vext.8    d17, d22, d23, #2          // d17 = bytes 2..9
+        vext.8    d18, d22, d23, #3          // d18 = bytes 3..10
+        vext.8    d19, d22, d23, #4          // d19 = bytes 4..11
+        vext.8    d20, d22, d23, #5          // d20 = bytes 5..12
+        vext.8    d21, d22, d23, #6          // d21 = bytes 6..13
+        vext.8    d22, d22, d23, #7          // d22 = bytes 7..14; done last because it overwrites the low half of the load. d23 keeps bytes 8..15
+.endm
+
+.macro loadin8                               // load 8 consecutive rows of 8 bytes into d16..d23, stepping r2 by r3 after each row
+        pld       [r2]                       // each load is preceded by a preload hint for the same address
+        vld1.8    {d16}, [r2], r3            // row 0
+        pld       [r2]
+        vld1.8    {d17}, [r2], r3            // row 1
+        pld       [r2]
+        vld1.8    {d18}, [r2], r3            // row 2
+        pld       [r2]
+        vld1.8    {d19}, [r2], r3            // row 3
+        pld       [r2]
+        vld1.8    {d20}, [r2], r3            // row 4
+        pld       [r2]
+        vld1.8    {d21}, [r2], r3            // row 5
+        pld       [r2]
+        vld1.8    {d22}, [r2], r3            // row 6
+        pld       [r2]
+        vld1.8    {d23}, [r2], r3            // row 7; r2 now points at the row after the window
+.endm
+
+.macro qpel_filter_1_32b          // (-a + 4b - 10c + 58d + 17e - 5f + g) >> 6 on s16 rows a..g = q0..q6 (q7 unused); result in q8, scratch q9-q15
+        vmov.i16   d16, #58       // d16/d17 hold the multiplier constants until q8 is reused as a product below
+        vmov.i16   d17, #10
+        vmull.s16   q9, d6, d16   // 58 * d0
+        vmull.s16  q10, d7, d16   // 58 * d1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d4, d17   // 10 * c0
+        vmull.s16  q12, d5, d17   // 10 * c1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d8, d16   // 17 * e0
+        vmull.s16  q14, d9, d16   // 17 * e1
+        vmull.s16  q15, d10, d17  //  5 * f0
+        vmull.s16   q8, d11, d17  //  5 * f1
+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
+        vshll.s16  q11, d2, #2    // 4 * b0
+        vshll.s16  q12, d3, #2    // 4 * b1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
+        vsubl.s16  q13, d12, d0   // g0 - a0
+        vsubl.s16  q14, d13, d1   // g1 - a1
+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
+        vqshrn.s32  d16, q9, #6   // >> 6 and saturating narrow to s16: low half of q8 (suffix 0 = low d register of a row)
+        vqshrn.s32  d17, q10, #6  // same for the high half (suffix 1 = high d register of a row)
+.endm
+
+// input  q0 - q7  (s16 rows a..h; suffix 0/1 in the comments = low/high d register of a row)
+// output q8       = (-a + 4b - 11c + 40d + 40e - 11f + 4g - h) >> 6, saturated to s16; q9-q15 are scratch
+.macro qpel_filter_2_32b
+        vmov.i32   q8, #11
+        vaddl.s16   q9, d6, d8   // d0 + e0
+        vaddl.s16  q10, d7, d9   // d1 + e1
+        vaddl.s16  q11, d4, d10  // c0 + f0
+        vaddl.s16  q12, d5, d11  // c1 + f1
+        vmul.s32   q11, q8       // 11 * (c0 + f0)
+        vmul.s32   q12, q8       // 11 * (c1 + f1)
+        vmov.i32   q8, #40
+        vaddl.s16  q15, d2, d12  // b0 + g0
+        vmul.s32    q9, q8       // 40 * (d0 + e0)
+        vmul.s32   q10, q8       // 40 * (d1 + e1)
+        vaddl.s16   q8, d3, d13  // b1 + g1
+        vaddl.s16  q13, d0, d14  // a0 + h0
+        vaddl.s16  q14, d1, d15  // a1 + h1
+        vshl.s32   q15, #2       // 4*(b0+g0)
+        vshl.s32    q8, #2       // 4*(b1+g1)
+        vadd.s32   q11, q13      // 11 * (c0 + f0) + a0 + h0
+        vadd.s32   q12, q14      // 11 * (c1 + f1) + a1 + h1
+        vadd.s32   q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
+        vadd.s32   q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
+        vsub.s32   q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
+        vsub.s32   q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
+        vqshrn.s32  d16, q9, #6  // >> 6 and saturating narrow to s16: low half of q8
+        vqshrn.s32  d17, q10, #6 // same for the high half
+.endm
+
+.macro qpel_filter_3_32b          // (b - 5c + 17d + 58e - 10f + 4g - h) >> 6 on s16 rows b..h = q1..q7 (q0 unused); result in q8, scratch q9-q15
+        vmov.i16   d16, #58       // d16/d17 hold the multiplier constants until q8 is reused as a product below
+        vmov.i16   d17, #10
+        vmull.s16   q9, d8, d16   // 58 * e0
+        vmull.s16  q10, d9, d16   // 58 * e1
+        vmov.i16   d16, #17
+        vmull.s16  q11, d10, d17  // 10 * f0
+        vmull.s16  q12, d11, d17  // 10 * f1
+        vmov.i16   d17, #5
+        vmull.s16  q13, d6, d16   // 17 * d0
+        vmull.s16  q14, d7, d16   // 17 * d1
+        vmull.s16  q15, d4, d17   //  5 * c0
+        vmull.s16   q8, d5, d17   //  5 * c1
+        vsub.s32    q9, q11       // 58 * e0 - 10 * f0
+        vsub.s32   q10, q12       // 58 * e1 - 10 * f1
+        vshll.s16  q11, d12, #2   // 4 * g0
+        vshll.s16  q12, d13, #2   // 4 * g1
+        vadd.s32    q9, q13       // 58 * e0 - 10 * f0 + 17 * d0
+        vadd.s32   q10, q14       // 58 * e1 - 10 * f1 + 17 * d1
+        vsubl.s16  q13, d2, d14   // b0 - h0
+        vsubl.s16  q14, d3, d15   // b1 - h1
+        vadd.s32    q9, q11       // 58 * e0 - 10 * f0 + 17 * d0 + 4 * g0
+        vadd.s32   q10, q12       // 58 * e1 - 10 * f1 + 17 * d1 + 4 * g1
+        vsub.s32   q13, q15       // b0 - h0 - 5 * c0
+        vsub.s32   q14, q8        // b1 - h1 - 5 * c1
+        vadd.s32    q9, q13       // 58 * e0 - 10 * f0 + 17 * d0 + 4 * g0 + b0 - h0 - 5 * c0
+        vadd.s32   q10, q14       // 58 * e1 - 10 * f1 + 17 * d1 + 4 * g1 + b1 - h1 - 5 * c1
+        vqshrn.s32  d16, q9, #6   // >> 6 and saturating narrow to s16: low half of q8 (suffix 0 = low d register of a row)
+        vqshrn.s32  d17, q10, #6  // same for the high half (suffix 1 = high d register of a row)
+.endm
+
+.macro qpel_filter_1 out=q7       // -a + 4b - 10c + 58d + 17e - 5f + g on u8 inputs a..g = d16..d22 (d23 unused); 16-bit result in out, scratch q12-q15
+        vmov.u8    d24, #58       // byte constants for the two real multiplies; 17x, 5x and 4x are built from shifts and adds
+        vmov.u8    d25, #10
+        vshll.u8   q13, d20, #4   // 16*e
+        vshll.u8   q14, d21, #2   // 4*f
+        vmull.u8  \out, d19, d24  // 58*d
+        vaddw.u8   q13, q13, d20  // 17*e
+        vmull.u8   q15, d18, d25  // 10*c
+        vaddw.u8   q14, q14, d21  // 5*f
+        vsubl.u8   q12, d22, d16  // g - a
+        vadd.u16  \out, q13       // 58d + 17e
+        vshll.u8   q13, d17, #2   // 4*b
+        vadd.u16   q15, q14       // 10*c + 5*f
+        vadd.s16   q13, q12       // - a + 4*b + g
+        vsub.s16  \out, q15       // -10*c + 58*d + 17*e -5*f
+        vadd.s16  \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
+.endm
+
+.macro qpel_filter_2 out=q7        // -a + 4b - 11c + 40d + 40e - 11f + 4g - h on u8 inputs a..h = d16..d23; 16-bit result in out, scratch q12-q15
+        vmov.i16   q12, #10        // 10, not 40: the factor 4 is applied later by the shared shift
+        vmov.i16   q14, #11
+        vaddl.u8   q13, d19, d20   // d + e
+        vaddl.u8   q15, d18, d21   // c + f
+        vmul.u16   q13, q12        // 10 * (d+e)
+        vmul.u16   q15, q14        // 11 * ( c + f)
+        vaddl.u8  \out, d17, d22   // b + g
+        vaddl.u8   q12, d16, d23   // a + h
+        vadd.u16  \out, q13        // b + 10 * (d + e) + g
+        vadd.s16   q12, q15        // a + h + 11 * (c + f)
+        vshl.u16  \out, #2         // 4 * (b + 10 * (d + e) + g)
+        vsub.s16  \out, q12        // 4 * (b + g) + 40 * (d + e) - 11 * (c + f) - (a + h)
+.endm
+
+.macro qpel_filter_3 out=q7         // b - 5c + 17d + 58e - 10f + 4g - h on u8 inputs b..h = d17..d23 (d16 unused); 16-bit result in out, scratch q12-q15
+        vmov.u8    d24, #58         // byte constants for the two real multiplies; 17x, 5x and 4x are built from shifts and adds
+        vmov.u8    d25, #10
+        vshll.u8   q13, d19, #4     // 16*d
+        vshll.u8   q14, d18, #2     // 4*c
+        vmull.u8  \out, d20, d24    // 58*e
+        vaddw.u8   q13, q13, d19    // 17*d
+        vmull.u8   q15, d21, d25    // 10*f
+        vaddw.u8   q14, q14, d18    // 5*c
+        vsubl.u8   q12, d17, d23    // b - h
+        vadd.u16  \out, q13         // 58e + 17d
+        vshll.u8   q13, d22, #2     // 4*g
+        vadd.u16   q15, q14         // 10*f + 5*c
+        vadd.s16   q13, q12         // b + 4*g - h
+        vsub.s16  \out, q15         // -5*c + 17*d + 58*e - 10*f
+        vadd.s16  \out, q13         // b - 5*c + 17*d + 58*e - 10*f + 4*g - h
+.endm
+
+.macro  hevc_put_qpel_vX_neon_8 filter      // vertical 8-tap qpel with 16-bit output; r0=dst r1=dst stride r2=src r3=src stride, stack: height, width
+        push   {r4, r5, r6, r7}             // 16 bytes pushed, so the stack arguments now start at sp+16
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+        vpush {d8-d15}                      // the filter macros write q7 = d14/d15, part of the range saved here
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3                    // r2 = src - 3 * srcstride: first row of the 8-row tap window
+        mov       r12, r4                   // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+        lsl       r1, #1                    // dst stride is doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+0:      loadin8                             // d16..d23 = first 8 source rows of this strip
+        cmp       r5, #4
+        beq       4f                        // exactly 4 columns left: use the narrow loop
+8:      subs r4, #1                         // sets the flags for bne below; the NEON code in between does not alter them
+        \filter
+        vst1.16    {q7}, [r0], r1           // 8 results for this row
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3           // next source row; NOTE(review): also executed after the last output row, so one row is read that is never filtered
+        bne 8b
+        subs  r5, #8                        // 8 columns finished
+        beq       99f
+        mov r4, r12
+        add r6, #16                         // next strip: 8 outputs * 2 bytes
+        mov r0, r6
+        add r7, #8                          // next strip: 8 source bytes
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vst1.16    d14, [r0], r1            // low half of q7: 4 results
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3       // only 4 new source bytes are loaded per row in the narrow loop
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro  hevc_put_qpel_uw_vX_neon_8 filter   // vertical 8-tap qpel with 8-bit output; r0=dst r1=dst stride r2=src r3=src stride, stack: width, height, src2, src2stride
+        push   {r4-r10}                     // 7 registers = 28 bytes, so the stack arguments now start at sp+28
+        ldr    r5, [sp, #28] // width
+        ldr    r4, [sp, #32] // height
+        ldr    r8, [sp, #36] // src2
+        ldr    r9, [sp, #40] // src2stride
+        vpush {d8-d15}                      // the filter macros write q7 = d14/d15, part of the range saved here
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3                    // r2 = src - 3 * srcstride: first row of the 8-row tap window
+        mov       r12, r4                   // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+        cmp       r8, #0
+        bne       .Lbi\@                    // non-null src2: take the path that adds the 16-bit src2 samples
+0:      loadin8                             // path without src2: d16..d23 = first 8 source rows of this strip
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1                         // sets the flags for bne below; the NEON code in between does not alter them
+        \filter
+        vqrshrun.s16   d0, q7, #6           // rounding >> 6, saturated to u8
+        vst1.8    d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3           // next source row; NOTE(review): also executed after the last output row
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8                          // next strip: 8 output bytes
+        mov r0, r6
+        add r7, #8                          // next strip: 8 source bytes
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32    d0[0], [r0], r1          // narrow loop: only 4 output bytes are stored
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+        b   99f
+.Lbi\@: lsl       r9, #1                    // src2 stride doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+        mov       r10, r8                   // r10 = src2 start of the current column strip
+0:      loadin8
+        cmp       r5, #4
+        beq       4f
+8:      subs r4, #1
+        \filter
+        vld1.16        {q0}, [r8], r9       // 8 src2 samples (16-bit)
+        vqadd.s16      q0, q7               // saturating add of src2 and the filter result
+        vqrshrun.s16   d0, q0, #7           // rounding >> 7, saturated to u8
+        vst1.8         d0, [r0], r1
+        regshuffle_d8
+        vld1.8    {d23}, [r2], r3
+        bne 8b
+        subs  r5, #8
+        beq       99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16                        // next strip in src2: 8 samples * 2 bytes
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b     0b
+4:      subs r4, #1
+        \filter
+        vld1.16      d0, [r8], r9           // 4 src2 samples
+        vqadd.s16    d0, d14                // added to the low half of q7
+        vqrshrun.s16 d0, q0, #7             // narrows all of q0; the lanes coming from d1 are not stored
+        vst1.32      d0[0], [r0], r1
+        regshuffle_d8
+        vld1.32    {d23[0]}, [r2], r3
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_v1_neon_8, export=1      // vertical-only entry points with 16-bit output: the digit selects qpel_filter_1/2/3
+        hevc_put_qpel_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_v2_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_v3_neon_8, export=1
+        hevc_put_qpel_vX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_v1_neon_8, export=1   // vertical-only entry points with 8-bit output and optional src2: same filter selection
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_v2_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_v3_neon_8, export=1
+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hX_neon_8 filter       // horizontal 8-tap qpel with 16-bit output; r0=dst r1=dst stride r2=src r3=src stride, stack: height, width
+        push     {r4, r5, r6, r7}           // 16 bytes pushed, so the stack arguments now start at sp+16
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush    {d8-d15}                   // the filter macros write q7 = d14/d15, part of the range saved here
+        sub       r2, #4                    // vextin8 takes its windows from byte offsets 1..8 of the load, i.e. src-3 .. src+4
+        lsl       r1, #1                    // dst stride is doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+        mov      r12, r4                    // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+        cmp       r5, #4
+        beq       4f                        // exactly 4 columns: use the narrow loop
+8:      subs      r4, #1                    // sets the flags for bne below; the NEON code in between does not alter them
+        vextin8                             // loads 16 bytes of this row and advances r2 by r3
+        \filter
+        vst1.16   {q7}, [r0], r1            // 8 results for this row
+        bne       8b
+        subs      r5, #8                    // 8 columns finished
+        beq      99f
+        mov       r4, r12
+        add       r6, #16                   // next strip: 8 outputs * 2 bytes
+        mov       r0, r6
+        add       r7, #8                    // next strip: 8 source bytes
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b                        // otherwise fall through to the narrow loop for the last 4 columns
+4:      subs      r4, #1
+        vextin8                             // still a 16-byte load, although only 4 results are kept
+        \filter
+        vst1.16  d14, [r0], r1              // low half of q7: 4 results
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hX_neon_8 filter    // horizontal 8-tap qpel with 8-bit output; r0=dst r1=dst stride r2=src r3=src stride, stack: width, height, src2, src2stride
+        push     {r4-r10}                   // 7 registers = 28 bytes, so the stack arguments now start at sp+28
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush    {d8-d15}                   // the filter macros write q7 = d14/d15, part of the range saved here
+        sub       r2, #4                    // vextin8 takes its windows from byte offsets 1..8 of the load, i.e. src-3 .. src+4
+        mov      r12, r4                    // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+        cmp       r8, #0
+        bne       .Lbi\@                    // non-null src2: take the path that adds the 16-bit src2 samples
+        cmp       r5, #4
+        beq       4f
+8:      subs      r4, #1                    // sets the flags for bne below; the NEON code in between does not alter them
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6           // rounding >> 6, saturated to u8
+        vst1.8    d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8                    // next strip: 8 output bytes
+        mov       r0, r6
+        add       r7, #8                    // next strip: 8 source bytes
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b                        // otherwise fall through to the narrow loop for the last 4 columns
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vqrshrun.s16   d0, q7, #6
+        vst1.32  d0[0], [r0], r1            // narrow loop: only 4 output bytes are stored
+        bne       4b
+        b         99f
+.Lbi\@:
+        lsl       r9, #1                    // src2 stride doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+        cmp       r5, #4
+        beq       4f                        // r10 stays unset on this branch; the narrow loop below does not read it
+        mov       r10, r8                   // r10 = src2 start of the current column strip
+8:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16        {q0}, [r8], r9       // 8 src2 samples (16-bit)
+        vqadd.s16      q0, q7               // saturating add of src2 and the filter result
+        vqrshrun.s16   d0, q0, #7           // rounding >> 7, saturated to u8
+        vst1.8         d0, [r0], r1
+        bne       8b
+        subs      r5, #8
+        beq      99f
+        mov       r4, r12
+        add       r6, #8
+        add       r10, #16                  // next strip in src2: 8 samples * 2 bytes
+        mov       r8, r10
+        mov       r0, r6
+        add       r7, #8
+        mov       r2, r7
+        cmp       r5, #4
+        bne       8b
+4:      subs      r4, #1
+        vextin8
+        \filter
+        vld1.16      d0, [r8], r9           // 4 src2 samples
+        vqadd.s16    d0, d14                // added to the low half of q7
+        vqrshrun.s16 d0, q0, #7             // narrows all of q0; the lanes coming from d1 are not stored
+        vst1.32      d0[0], [r0], r1
+        bne       4b
+99:     vpop     {d8-d15}
+        pop      {r4-r10}
+        bx lr
+.endm
+
+function ff_hevc_put_qpel_h1_neon_8, export=1      // horizontal-only entry points with 16-bit output: the digit selects qpel_filter_1/2/3
+        hevc_put_qpel_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_h2_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_h3_neon_8, export=1
+        hevc_put_qpel_hX_neon_8 qpel_filter_3
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1_neon_8, export=1   // horizontal-only entry points with 8-bit output and optional src2: same filter selection
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
+endfunc
+
+function ff_hevc_put_qpel_uw_h2_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
+endfunc
+
+function ff_hevc_put_qpel_uw_h3_neon_8, export=1
+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
+endfunc
+
+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv    // 2-D qpel with 16-bit output: filterh per row into q0..q7, then filterv down the 8 rows; r0=dst r1=dst stride r2=src r3=src stride, stack: height, width
+        push   {r4, r5, r6, r7}             // 16 bytes pushed, so the stack arguments now start at sp+16
+        ldr    r4, [sp, #16] // height
+        ldr    r5, [sp, #20] // width
+
+        vpush {d8-d15}                      // q4..q7 (d8-d15) hold intermediate rows below
+        sub       r2, #4                    // vextin8 takes its windows from byte offsets 1..8 of the load, i.e. src-3 .. src+4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        lsl       r1, #1                    // dst stride is doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+        mov       r12, r4                   // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+0:      vextin8                             // prime the window: 8 source rows, each filtered horizontally into q0..q7
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f                              // exactly 4 columns left: use the narrow loop
+8:      subs  r4, #1                        // sets the flags for bne below; the NEON code in between does not alter them
+        \filterv                            // vertical pass over q0..q7, result in q8
+        vst1.16    {q8}, [r0], r1           // 8 results for this row
+        regshuffle_q8
+        vextin8                             // NOTE(review): also executed after the last output row, so one row is read and filtered that is never used
+        \filterh q7
+        bne 8b
+        subs  r5, #8                        // 8 columns finished
+        beq 99f
+        mov r4, r12
+        add r6, #16                         // next strip: 8 outputs * 2 bytes
+        mov r0, r6
+        add r7, #8                          // next strip: 8 source bytes
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vst1.16    d16, [r0], r1            // low half of q8: 4 results
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4, r5, r6, r7}
+        bx lr
+.endm
+
+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv // 2-D qpel with 8-bit output; r0=dst r1=dst stride r2=src r3=src stride, stack: width, height, src2, src2stride
+        push     {r4-r10}                   // 7 registers = 28 bytes, so the stack arguments now start at sp+28
+        ldr       r5, [sp, #28] // width
+        ldr       r4, [sp, #32] // height
+        ldr       r8, [sp, #36] // src2
+        ldr       r9, [sp, #40] // src2stride
+        vpush {d8-d15}                      // q4..q7 (d8-d15) hold intermediate rows below
+        sub       r2, #4                    // vextin8 takes its windows from byte offsets 1..8 of the load, i.e. src-3 .. src+4
+        sub       r2, r2, r3, lsl #1
+        sub       r2, r3  // extra_before 3
+        mov       r12, r4                   // r12 = height, restored into r4 for every column strip
+        mov       r6, r0                    // r6 = dst start of the current column strip
+        mov       r7, r2                    // r7 = src start of the current column strip
+        cmp       r8, #0
+        bne       .Lbi\@                    // non-null src2: take the path that adds the 16-bit src2 samples
+0:      vextin8                             // prime the window: 8 source rows, each filtered horizontally into q0..q7
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1                        // sets the flags for bne below; the NEON code in between does not alter them
+        \filterv                            // vertical pass over q0..q7, result in q8
+        vqrshrun.s16   d0, q8, #6           // rounding >> 6, saturated to u8 (d0 is free here: regshuffle_q8 refills q0 next)
+        vst1.8    d0, [r0], r1
+        regshuffle_q8
+        vextin8                             // NOTE(review): also executed after the last output row
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8                          // next strip: 8 output bytes
+        mov r0, r6
+        add r7, #8                          // next strip: 8 source bytes
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vqrshrun.s16   d0, q8, #6
+        vst1.32        d0[0], [r0], r1      // narrow loop: only 4 output bytes are stored
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+        b   99f
+.Lbi\@: lsl      r9, #1                     // src2 stride doubled before use as a byte step (presumably passed in 16-bit elements - confirm)
+        mov      r10, r8                    // r10 = src2 start of the current column strip
+0:      vextin8                             // same priming as in the path without src2
+        \filterh q0
+        vextin8
+        \filterh q1
+        vextin8
+        \filterh q2
+        vextin8
+        \filterh q3
+        vextin8
+        \filterh q4
+        vextin8
+        \filterh q5
+        vextin8
+        \filterh q6
+        vextin8
+        \filterh q7
+        cmp r5, #4
+        beq 4f
+8:      subs  r4, #1
+        \filterv
+        vld1.16        {q0}, [r8], r9       // 8 src2 samples; q0 was already consumed by the vertical pass above
+        vqadd.s16      q0, q8               // saturating add of src2 and the filter result
+        vqrshrun.s16   d0, q0, #7           // rounding >> 7, saturated to u8
+        vst1.8         d0, [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 8b
+        subs  r5, #8
+        beq 99f
+        mov r4, r12
+        add r6, #8
+        mov r0, r6
+        add r10, #16                        // next strip in src2: 8 samples * 2 bytes
+        mov r8, r10
+        add r7, #8
+        mov r2, r7
+        b 0b
+4:      subs  r4, #1
+        \filterv
+        vld1.16      d0, [r8], r9           // 4 src2 samples
+        vqadd.s16    d0, d16                // added to the low half of q8
+        vqrshrun.s16 d0, q0, #7             // narrows all of q0; the lanes coming from d1 are not stored
+        vst1.32      d0[0], [r0], r1
+        regshuffle_q8
+        vextin8
+        \filterh q7
+        bne 4b
+99:     vpop {d8-d15}
+        pop {r4-r10}
+        bx lr
+.endm
+
+
+function ff_hevc_put_qpel_h1v1_neon_8, export=1    // 2-D entry points with 16-bit output: hX picks the 8-bit horizontal filter macro, vY the 32-bit vertical one
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v1_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v2_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_h1v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h2v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_h3v3_neon_8, export=1
+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+
+function ff_hevc_put_qpel_uw_h1v1_neon_8, export=1 // 2-D entry points with 8-bit output and optional src2: same hX / vY filter selection
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v1_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v2_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h1v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h2v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
+endfunc
+
+function ff_hevc_put_qpel_uw_h3v3_neon_8, export=1
+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
+endfunc
+
+.macro init_put_pixels            // shared prologue of the put_pixels functions: preload two source rows, set r12 = dst row step in bytes
+        pld    [r1]               // r1 = src
+        pld    [r1, r2]           // r2 = src stride, so this is the second source row
+        mov    r12, MAX_PB_SIZE   // MAX_PB_SIZE expands to the immediate #64
+        lsl    r12, #1            // r12 = 128: 64 entries of 2 bytes per destination row
+.endm
+
+function ff_hevc_put_pixels_w2_neon_8, export=1    // 2 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+        vmov.u8      d5, #255
+        vshr.u64     d5, #32             // d5 = mask selecting the low 4 bytes, i.e. two 16-bit outputs
+0:      subs r3, #1                      // sets the flags for bne below; the NEON code in between does not alter them
+        vld1.32     {d0[0]}, [r1], r2    // loads 4 source bytes although only 2 pixels are kept
+        pld [r1]
+        vld1.32     d6, [r0]             // current destination contents, kept outside the mask
+        vshll.u8    q0, d0, #6           // u8 -> u16, << 6
+        vbit        d6, d0, d5           // insert the two new values where the mask is set
+        vst1.32     d6, [r0], r12        // write back 8 bytes, then step to the next dst row
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w4_neon_8, export=1    // 4 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #2                      // two rows per iteration; the loop ends only when r3 reaches exactly 0 (height presumably even - confirm)
+        vld1.32   {d0[0]}, [r1], r2      // row 0 in the low half of d0
+        vld1.32   {d0[1]}, [r1], r2      // row 1 in the high half of d0
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8   q0, d0, #6            // u8 -> u16, << 6: d0 = row 0, d1 = row 1
+        vst1.64   {d0}, [r0], r12
+        vst1.64   {d1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w6_neon_8, export=1    // 6 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+        vmov.u8      q10, #255
+        vshr.u64     d21, #32            // q10 = mask selecting the low 12 bytes, i.e. six 16-bit outputs
+0:      subs r3, #1                      // sets the flags for bne below; the NEON code in between does not alter them
+        vld1.16     {d0}, [r1], r2       // loads 8 source bytes although only 6 pixels are kept
+        pld [r1]
+        vshll.u8    q0, d0, #6           // u8 -> u16, << 6
+        vld1.8      {q12}, [r0]          // current destination contents, kept outside the mask
+        vbit        q12, q0, q10         // insert the six new values where the mask is set
+        vst1.8      {q12}, [r0], r12     // write back 16 bytes, then step to the next dst row
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w8_neon_8, export=1    // 8 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #2                      // two rows per iteration; the loop ends only when r3 reaches exactly 0 (height presumably even - confirm)
+        vld1.8   {d0}, [r1], r2          // row 0
+        vld1.8   {d2}, [r1], r2          // row 1
+        pld        [r1]
+        pld        [r1, r2]
+        vshll.u8   q0, d0, #6            // u8 -> u16, << 6 (q0 overwrites d0/d1; d2 is untouched)
+        vshll.u8   q1, d2, #6
+        vst1.16   {q0}, [r0], r12
+        vst1.16   {q1}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w12_neon_8, export=1   // 12 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #2                      // two rows per iteration; the loop ends only when r3 reaches exactly 0 (height presumably even - confirm)
+        vld1.64    {d0}, [r1]            // row 0, pixels 0..7
+        add       r1, #8
+        vld1.32   {d1[0]}, [r1], r2      // row 0, pixels 8..11; the post-increment applies the row stride
+        sub       r1, #8                 // undo the +8 so that r1 is the start of row 1
+        vld1.64    {d2}, [r1]            // row 1, pixels 0..7
+        add       r1, #8
+        vld1.32   {d1[1]}, [r1], r2      // row 1, pixels 8..11 share d1 with the row 0 tail
+        sub       r1, #8
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6             // row 0, pixels 0..7
+        vshll.u8  q9, d1, #6             // d18 = row 0 tail, d19 = row 1 tail
+        vshll.u8  q10, d2, #6            // row 1, pixels 0..7
+        vmov      d22, d19               // move the row 1 tail next to q10 so each row is stored from 3 consecutive d registers
+        vst1.64   {d16, d17, d18}, [r0], r12
+        vst1.64   {d20, d21, d22}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w16_neon_8, export=1   // 16 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #2                      // two rows per iteration; the loop ends only when r3 reaches exactly 0 (height presumably even - confirm)
+        vld1.8   {q0}, [r1], r2          // row 0
+        vld1.8   {q1}, [r1], r2          // row 1
+        pld       [r1]
+        pld       [r1, r2]
+        vshll.u8  q8, d0, #6             // row 0, pixels 0..7
+        vshll.u8  q9, d1, #6             // row 0, pixels 8..15
+        vshll.u8  q10, d2, #6            // row 1, pixels 0..7
+        vshll.u8  q11, d3, #6            // row 1, pixels 8..15
+        vst1.8    {q8, q9}, [r0], r12
+        vst1.8    {q10, q11}, [r0], r12
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w24_neon_8, export=1   // 24 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #1                      // one row per iteration; sets the flags for bne below
+        vld1.8   {d0, d1, d2}, [r1], r2  // 24 source bytes
+        pld       [r1]
+        vshll.u8  q10, d0, #6
+        vshll.u8  q11, d1, #6
+        vshll.u8  q12, d2, #6
+        vstm     r0, {q10, q11, q12}     // 48 output bytes, no writeback
+        add      r0, r12                 // step to the next dst row (add without s suffix, flags are kept)
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w32_neon_8, export=1   // 32 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #1                      // one row per iteration; sets the flags for bne below
+        vld1.8 {q0, q1}, [r1], r2        // 32 source bytes
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vstm    r0, {q8, q9, q10, q11}   // 64 output bytes, no writeback
+        add     r0, r12                  // step to the next dst row (add without s suffix, flags are kept)
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w48_neon_8, export=1   // 48 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #1                      // one row per iteration; sets the flags for bne below
+        vld1.8    {q0, q1}, [r1]         // pixels 0..31
+        add r1, #32
+        vld1.8    {q2}, [r1], r2         // pixels 32..47; the post-increment applies the row stride
+        sub r1, #32                      // undo the +32 so that r1 is the start of the next row
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vstm r0, {q8, q9, q10, q11, q12, q13}  // 96 output bytes, no writeback
+        add  r0, r12                     // step to the next dst row (add without s suffix, flags are kept)
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_pixels_w64_neon_8, export=1   // 64 pixels per row widened to 16 bit and shifted left by 6; r0=dst r1=src r2=src stride r3=height
+        init_put_pixels
+0:      subs r3, #1                      // one row per iteration; sets the flags for bne below
+        vld1.8    {q0, q1}, [r1]         // pixels 0..31
+        add      r1, #32
+        vld1.8    {q2, q3}, [r1], r2     // pixels 32..63; the post-increment applies the row stride
+        sub      r1, #32                 // undo the +32 so that r1 is the start of the next row
+        pld       [r1]
+        vshll.u8  q8, d0, #6
+        vshll.u8  q9, d1, #6
+        vshll.u8  q10, d2, #6
+        vshll.u8  q11, d3, #6
+        vshll.u8  q12, d4, #6
+        vshll.u8  q13, d5, #6
+        vshll.u8  q14, d6, #6
+        vshll.u8  q15, d7, #6
+        vstm    r0, {q8, q9, q10, q11, q12, q13, q14, q15}  // 128 output bytes = the full r12 row step, no writeback
+        add r0, r12                      // step to the next dst row (add without s suffix, flags are kept)
+        bne 0b
+        bx lr
+endfunc
+
+function ff_hevc_put_qpel_uw_pixels_neon_8, export=1   // unfiltered 8-pixel-wide rows: plain copy, or combine with 16-bit src2 when src2 is non-null; r0=dst r1=dst stride r2=src r3=src stride
+        push   {r4-r9}                   // 6 registers = 24 bytes, so the stack arguments now start at sp+24
+        ldr    r5, [sp, #24] // width    (NOTE(review): r5 is never read below; every row handles exactly 8 pixels)
+        ldr    r4, [sp, #28] // height
+        ldr    r8, [sp, #32] // src2
+        ldr    r9, [sp, #36] // src2stride
+        vpush {d8-d15}                   // NOTE(review): only q0/q1 (d0-d3) are written in this function, so this save looks unnecessary - confirm
+        cmp    r8, #0
+        bne    2f
+1:      subs r4, #1                      // copy path; sets the flags for bne below
+        vld1.8     {d0}, [r2], r3
+        vst1.8      d0, [r0], r1         // 8 bytes copied unchanged
+        bne 1b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+2:      subs  r4, #1                     // src2 path
+        vld1.8         {d0}, [r2], r3
+        vld1.16        {q1}, [r8], r9    // NOTE(review): r9 is used as a byte step as-is; the other uw paths in this file double src2stride first - confirm the caller
+        vshll.u8       q0, d0, #6        // u8 -> u16, << 6
+        vqadd.s16      q0, q1            // saturating add of the src2 samples
+        vqrshrun.s16   d0, q0, #7        // rounding >> 7, saturated to u8
+        vst1.8      d0, [r0], r1
+        bne 2b
+        vpop {d8-d15}
+        pop   {r4-r9}
+        bx lr
+endfunc
+
+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4     // emits a row-copy function for one width; each regs argument is the register list holding one full row
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        ldr    r12, [sp] // height       (first stack argument; nothing is pushed, so it sits at sp+0)
+1:      subs   r12, #4                   // four rows per iteration; the loop ends only when r12 reaches exactly 0 (height presumably a multiple of 4 - confirm)
+        vld1.32     {\regs}  , [r2], r3  // r2 = src, r3 = src stride
+        vld1.32     {\regs2} , [r2], r3
+        vld1.32     {\regs3} , [r2], r3
+        vld1.32     {\regs4} , [r2], r3
+        vst1.32     {\regs}  , [r0], r1  // r0 = dst, r1 = dst stride
+        vst1.32     {\regs2} , [r0], r1
+        vst1.32     {\regs3} , [r0], r1
+        vst1.32     {\regs4} , [r0], r1
+        bne 1b
+        bx lr
+endfunc
+.endm
+
+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4   // row-copy function for widths moved in two pieces per row: regs+regs2 = first row, regs3+regs4 = second row
+function ff_hevc_put_qpel_uw_pixels_w\width\()_neon_8, export=1
+        push   {r4-r5}                   // 8 bytes pushed, so the first stack argument is now at sp+8
+        ldr    r12, [sp, #8] // height
+1:      subs r12, #2                     // two rows per iteration; the loop ends only when r12 reaches exactly 0 (height presumably even - confirm)
+        mov      r4, r2                  // r4 = start of the current src row
+        vld1.32   {\regs} , [r2]!        // first piece; writeback moves r2 past the bytes just read
+        vld1.32   {\regs2} , [r2]        // second piece, directly behind the first
+        add      r2, r4, r3              // next src row = row start + src stride
+        mov      r4, r2
+        vld1.32   {\regs3} , [r2]!
+        vld1.32   {\regs4} , [r2]
+        add      r2, r4, r3
+        mov      r5, r0                  // r5 = start of the current dst row
+        vst1.32   {\regs} , [r0]!
+        vst1.32   {\regs2} , [r0]
+        add      r0, r5, r1              // next dst row = row start + dst stride
+        mov      r5, r0
+        vst1.32   {\regs3} , [r0]!
+        vst1.32   {\regs4} , [r0]
+        add      r0, r5, r1
+        bne 1b                           // flags from subs above; add/mov without s suffix and NEON loads/stores keep them
+        pop   {r4-r5}
+        bx lr
+endfunc
+.endm
+
+put_qpel_uw_pixels    4, d0[0], d0[1], d1[0], d1[1]       // 4 bytes per row: one 32-bit lane each
+put_qpel_uw_pixels    8, d0,    d1,    d2,    d3          // 8 bytes per row
+put_qpel_uw_pixels_m 12, d0,    d1[0], d2,    d3[0]       // 8 + 4 bytes per row
+put_qpel_uw_pixels   16, q0,    q1,    q2,    q3          // 16 bytes per row
+put_qpel_uw_pixels   24, d0-d2, d3-d5, d16-d18, d19-d21   // 24 bytes per row; d8-d15 are not used
+put_qpel_uw_pixels   32, q0-q1, q2-q3, q8-q9, q10-q11     // 32 bytes per row; q4-q7 are not used
+put_qpel_uw_pixels_m 48, q0-q1, q2,    q8-q9, q10         // 32 + 16 bytes per row
+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11     // 32 + 32 bytes per row
diff --git a/libde265/arm/neon.S b/libde265/arm/neon.S
new file mode 100644
index 0000000..787bc4b
--- /dev/null
+++ b/libde265/arm/neon.S
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.macro  transpose_8x8   r0, r1, r2, r3, r4, r5, r6, r7    // in-place transpose in three vtrn passes (32-, 16-, 8-bit); presumably r0..r7 are the rows of an 8x8 byte block - confirm at the call sites
+        vtrn.32         \r0, \r4                          // pass 1: 32-bit elements between rows 4 apart
+        vtrn.32         \r1, \r5
+        vtrn.32         \r2, \r6
+        vtrn.32         \r3, \r7
+        vtrn.16         \r0, \r2                          // pass 2: 16-bit elements between rows 2 apart
+        vtrn.16         \r1, \r3
+        vtrn.16         \r4, \r6
+        vtrn.16         \r5, \r7
+        vtrn.8          \r0, \r1                          // pass 3: bytes between neighbouring rows
+        vtrn.8          \r2, \r3
+        vtrn.8          \r4, \r5
+        vtrn.8          \r6, \r7
+.endm
+
+.macro  transpose_4x4   r0, r1, r2, r3    // in-place transpose of 4x4 byte groups across four registers in two vtrn passes (16-, 8-bit)
+        vtrn.16         \r0, \r2          // pass 1: 16-bit elements between rows 2 apart
+        vtrn.16         \r1, \r3
+        vtrn.8          \r0, \r1          // pass 2: bytes between neighbouring rows
+        vtrn.8          \r2, \r3
+.endm
+
+.macro  swap4           r0, r1, r2, r3, r4, r5, r6, r7    // exchange the contents of r0..r3 with r4..r7, pairwise
+        vswp            \r0, \r4
+        vswp            \r1, \r5
+        vswp            \r2, \r6
+        vswp            \r3, \r7
+.endm
+
+.macro  transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7    // in-place transpose of 4x4 groups of 16-bit elements in two vtrn passes; r0..r3 and r4..r7 are handled as two independent sets
+        vtrn.32         \r0, \r2                          // pass 1: 32-bit elements between rows 2 apart
+        vtrn.32         \r1, \r3
+        vtrn.32         \r4, \r6
+        vtrn.32         \r5, \r7
+        vtrn.16         \r0, \r1                          // pass 2: 16-bit elements between neighbouring rows
+        vtrn.16         \r2, \r3
+        vtrn.16         \r4, \r5
+        vtrn.16         \r6, \r7
+.endm
diff --git a/libde265/bitstream.cc b/libde265/bitstream.cc
index 11dda9b..0298be9 100644
--- a/libde265/bitstream.cc
+++ b/libde265/bitstream.cc
@@ -142,7 +142,9 @@ int  get_uvlc(bitreader* br)
   int offset = 0;
   if (num_zeros != 0) {
     offset = get_bits(br, num_zeros);
-    return offset + (1<<num_zeros)-1;
+    int value = offset + (1<<num_zeros)-1;
+    assert(value>0);
+    return value;
   } else {
     return 0;
   }
diff --git a/libde265/cabac.cc b/libde265/cabac.cc
index 8e174ce..102bc57 100644
--- a/libde265/cabac.cc
+++ b/libde265/cabac.cc
@@ -23,8 +23,11 @@
 
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <assert.h>
 
+#define INITIAL_CABAC_BUFFER_CAPACITY 4096
+
 
 static const uint8_t LPS_table[64][4] =
   {
@@ -132,6 +135,8 @@ int logcnt=1;
 
 void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length)
 {
+  assert(length >= 0);
+
   decoder->bitstream_start = bitstream;
   decoder->bitstream_curr  = bitstream;
   decoder->bitstream_end   = bitstream+length;
@@ -153,19 +158,10 @@ void init_CABAC_decoder_2(CABAC_decoder* decoder)
 }
 
 
-//#include <sys/types.h>
-//#include <signal.h>
-
 int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
 {
-  //if (logcnt >= 1100000) { enablelog(); }
-
-  // if (logcnt==400068770) { raise(SIGINT); }
-
   logtrace(LogCABAC,"[%3d] decodeBin r:%x v:%x state:%d\n",logcnt,decoder->range, decoder->value, model->state);
 
-  //assert(decoder->range>=0x100);
-
   int decoded_bit;
   int LPS = LPS_table[model->state][ ( decoder->range >> 6 ) - 4 ];
   decoder->range -= LPS;
@@ -178,7 +174,7 @@ int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
     {
       logtrace(LogCABAC,"[%3d] MPS\n",logcnt);
 
-      // MPS path                                                                                    
+      // MPS path
 
       decoded_bit = model->MPSbit;
       model->state = next_state_MPS[model->state];
@@ -194,7 +190,7 @@ int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
           if (decoder->bits_needed == 0)
             {
               decoder->bits_needed = -8;
-              if (decoder->bitstream_curr != decoder->bitstream_end)
+              if (decoder->bitstream_curr < decoder->bitstream_end)
                 { decoder->value |= *decoder->bitstream_curr++; }
             }
         }
@@ -202,15 +198,21 @@ int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
   else
     {
       logtrace(LogCABAC,"[%3d] LPS\n",logcnt);
+      //printf("%d %d\n", model->state, 0);
 
-      // LPS path                                                                                    
+      // LPS path
 
-      int num_bits = renorm_table[ LPS >> 3 ];
       decoder->value = (decoder->value - scaled_range);
 
+      int num_bits = renorm_table[ LPS >> 3 ];
       decoder->value <<= num_bits;
       decoder->range   = LPS << num_bits;  /* this is always >= 0x100 except for state 63,
                                               but state 63 is never used */
+
+      int num_bitsTab = renorm_table[ LPS >> 3 ];
+
+      assert(num_bits == num_bitsTab);
+
       decoded_bit      = 1 - model->MPSbit;
 
       if (model->state==0) { model->MPSbit = 1-model->MPSbit; }
@@ -221,7 +223,7 @@ int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
       if (decoder->bits_needed >= 0)
         {
           logtrace(LogCABAC,"bits_needed: %d\n", decoder->bits_needed);
-          if (decoder->bitstream_curr != decoder->bitstream_end)
+          if (decoder->bitstream_curr < decoder->bitstream_end)
             { decoder->value |= (*decoder->bitstream_curr++) << decoder->bits_needed; }
 
           decoder->bits_needed -= 8;
@@ -233,13 +235,13 @@ int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model)
   logcnt++;
 #endif
 
-  //assert(decoder->range>=0x100);
-
   return decoded_bit;
 }
 
 int  decode_CABAC_term_bit(CABAC_decoder* decoder)
 {
+  logtrace(LogCABAC,"CABAC term: range=%x\n", decoder->range);
+
   decoder->range -= 2;
   uint32_t scaledRange = decoder->range << 7;
 
@@ -261,7 +263,7 @@ int  decode_CABAC_term_bit(CABAC_decoder* decoder)
             {
               decoder->bits_needed = -8;
 
-              if (decoder->bitstream_curr != decoder->bitstream_end) {
+              if (decoder->bitstream_curr < decoder->bitstream_end) {
                 decoder->value += (*decoder->bitstream_curr++);
               }
             }
@@ -277,17 +279,15 @@ int  decode_CABAC_bypass(CABAC_decoder* decoder)
 {
   logtrace(LogCABAC,"[%3d] bypass r:%x v:%x\n",logcnt,decoder->range, decoder->value);
 
-  //assert(decoder->range>=0x100);
-
   decoder->value <<= 1;
   decoder->bits_needed++;
 
   if (decoder->bits_needed >= 0)
     {
-      //assert(decoder->bits_needed==0);
-
-      decoder->bits_needed = -8;
-      decoder->value |= *decoder->bitstream_curr++;
+      if (decoder->bitstream_end > decoder->bitstream_curr) {
+        decoder->bits_needed = -8;
+        decoder->value |= *decoder->bitstream_curr++;
+      }
     }
 
   int bit;
@@ -307,8 +307,6 @@ int  decode_CABAC_bypass(CABAC_decoder* decoder)
   logcnt++;
 #endif
 
-  //assert(decoder->range>=0x100);
-
   return bit;
 }
 
@@ -340,18 +338,21 @@ int  decode_CABAC_TU(CABAC_decoder* decoder, int cMax, context_model* model)
 
 int  decode_CABAC_FL_bypass_parallel(CABAC_decoder* decoder, int nBits)
 {
-  logtrace(LogCABAC,"[%3d] bypass group r:%x v:%x\n",logcnt,decoder->range, decoder->value);
+  logtrace(LogCABAC,"[%3d] bypass group r:%x v:%x (nBits=%d)\n",logcnt,
+           decoder->range, decoder->value, nBits);
 
   decoder->value <<= nBits;
   decoder->bits_needed+=nBits;
 
   if (decoder->bits_needed >= 0)
     {
-      int input = *decoder->bitstream_curr++;
-      input <<= decoder->bits_needed;
+      if (decoder->bitstream_end > decoder->bitstream_curr) {
+        int input = *decoder->bitstream_curr++;
+        input <<= decoder->bits_needed;
 
-      decoder->bits_needed -= 8;
-      decoder->value |= input;
+        decoder->bits_needed -= 8;
+        decoder->value |= input;
+      }
     }
 
   uint32_t scaled_range = decoder->range << 7;
@@ -361,12 +362,11 @@ int  decode_CABAC_FL_bypass_parallel(CABAC_decoder* decoder, int nBits)
 
   logtrace(LogCABAC,"[%3d] -> value %d  r:%x v:%x\n", logcnt+nBits-1,
            value, decoder->range, decoder->value);
+
 #ifdef DE265_LOG_TRACE
   logcnt+=nBits;
 #endif
 
-  //assert(decoder->range>=0x100);
-
   return value;
 }
 
@@ -375,7 +375,6 @@ int  decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits)
 {
   int value=0;
 
-
   if (likely(nBits<=8)) {
     if (nBits==0) {
       return 0;
@@ -399,7 +398,6 @@ int  decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits)
       value |= decode_CABAC_bypass(decoder);
     }
   }
-
   logtrace(LogCABAC,"      -> FL: %d\n", value);
 
   return value;
@@ -417,6 +415,9 @@ int  decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax)
   return (prefix << cRiceParam) | suffix;
 }
 
+
+#define MAX_PREFIX 32
+
 int  decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k)
 {
   int base=0;
@@ -431,9 +432,602 @@ int  decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k)
         base += 1<<n;
         n++;
       }
+
+      if (n == k+MAX_PREFIX) {
+        return 0; // TODO: error
+      }
     }
 
   int suffix = decode_CABAC_FL_bypass(decoder, n);
   return base + suffix;
 }
 
+
+// ---------------------------------------------------------------------------
+
+void CABAC_encoder::add_trailing_bits()
+{
+  write_bit(1);
+  int nZeros = number_free_bits_in_byte();
+  write_bits(0, nZeros);
+}
+
+
+
+CABAC_encoder_bitstream::CABAC_encoder_bitstream()
+{
+  data_mem = NULL;      // no buffer yet; check_size_and_resize() allocates on demand
+  data_capacity = 0;
+  data_size = 0;
+  state = 0;            // emulation-prevention state: no zero bytes seen so far
+
+  vlc_buffer = 0;       // write_bits() shifts and ORs into this, so it must start defined
+  vlc_buffer_len = 0;
+  init_CABAC();
+}
+
+CABAC_encoder_bitstream::~CABAC_encoder_bitstream()
+{
+  free(data_mem);
+}
+
+void CABAC_encoder_bitstream::reset()
+{
+  data_size = 0;        // discard the written bytes; data_mem and its capacity are kept
+  state = 0;            // restart the emulation-prevention state machine
+
+  vlc_buffer = 0;       // drop bits left over from before the reset, not just their count
+  vlc_buffer_len = 0;
+  init_CABAC();
+}
+
+void CABAC_encoder_bitstream::write_bits(uint32_t bits,int n)
+{
+  vlc_buffer <<= n;
+  vlc_buffer |= bits;
+  vlc_buffer_len += n;
+
+  while (vlc_buffer_len>=8) {
+    append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF);
+    vlc_buffer_len -= 8;
+  }
+}
+
+void CABAC_encoder::write_uvlc(int value)
+{
+  assert(value>=0);
+
+  int nLeadingZeros=0;
+  int base=0;
+  int range=1;
+
+  while (value>=base+range) {
+    base += range;
+    range <<= 1;
+    nLeadingZeros++;
+  }
+
+  write_bits((1<<nLeadingZeros) | (value-base),2*nLeadingZeros+1);
+}
+
+void CABAC_encoder::write_svlc(int value)
+{
+  if      (value==0) write_bits(1,1);
+  else if (value>0)  write_uvlc(2*value-1);
+  else               write_uvlc(-2*value);
+}
+
+void CABAC_encoder_bitstream::flush_VLC()  // emit all pending VLC bits, zero-padded to a byte
+{
+  while (vlc_buffer_len>=8) {
+    append_byte((vlc_buffer >> (vlc_buffer_len-8)) & 0xFF);
+    vlc_buffer_len -= 8;
+  }
+  // Remaining (<8) bits: left-align and mask, as bits above vlc_buffer_len are stale output.
+  if (vlc_buffer_len>0) {
+    append_byte((vlc_buffer << (8-vlc_buffer_len)) & 0xFF);
+    vlc_buffer_len = 0;
+  }
+
+  vlc_buffer = 0;
+}
+
+void CABAC_encoder_bitstream::skip_bits(int nBits)
+{
+  while (nBits>=8) {
+    write_bits(0,8);
+    nBits-=8;
+  }
+
+  if (nBits>0) {
+    write_bits(0,nBits);
+  }
+}
+
+
+int  CABAC_encoder_bitstream::number_free_bits_in_byte() const
+{
+  if ((vlc_buffer_len % 8)==0) return 0;
+  return 8- (vlc_buffer_len % 8);
+}
+
+
+void CABAC_encoder_bitstream::check_size_and_resize(int nBytes)
+{
+  if (data_size+nBytes > data_capacity) { // 1 extra byte for stuffing
+    if (data_capacity==0) {
+      data_capacity = INITIAL_CABAC_BUFFER_CAPACITY;
+    } else {
+      data_capacity *= 2;
+    }
+
+    data_mem = (uint8_t*)realloc(data_mem,data_capacity);
+  }
+}
+
+
+void CABAC_encoder_bitstream::append_byte(int byte)
+{
+  check_size_and_resize(2);
+
+  // --- emulation prevention ---
+
+  /* These byte sequences may never occur in the bitstream:
+     0x000000 / 0x000001 / 0x000002
+
+     Hence, we have to add a 0x03 before the third byte.
+     We also have to add a 0x03 for this sequence: 0x000003, because
+     the escape byte itself also has to be escaped.
+  */
+
+  // S0 --(0)--> S1 --(0)--> S2 --(0,1,2,3)--> add stuffing
+
+  if (byte<=3) {
+    /**/ if (state< 2 && byte==0) { state++; }
+    else if (state==2 && byte<=3) {
+      data_mem[ data_size++ ] = 3;
+
+      if (byte==0) state=1;
+      else         state=0;
+    }
+    else { state=0; }
+  }
+  else { state=0; }
+
+
+  // write actual data byte
+
+  data_mem[ data_size++ ] = byte;
+}
+
+
+void CABAC_encoder_bitstream::write_startcode()
+{
+  check_size_and_resize(3);
+
+  data_mem[ data_size+0 ] = 0;
+  data_mem[ data_size+1 ] = 0;
+  data_mem[ data_size+2 ] = 1;
+  data_size+=3;
+}
+
+void CABAC_encoder_bitstream::init_CABAC()
+{
+  range = 510;
+  low = 0;
+
+  bits_left = 23;
+  buffered_byte = 0xFF;
+  num_buffered_bytes = 0;
+}
+
+void CABAC_encoder_bitstream::flush_CABAC()
+{
+  if (low >> (32 - bits_left))
+    {
+      append_byte(buffered_byte + 1);
+      while (num_buffered_bytes > 1)
+        {
+          append_byte(0x00);
+          num_buffered_bytes--;
+        }
+
+      low -= 1 << (32 - bits_left);
+    }
+  else
+    {
+      if (num_buffered_bytes > 0)
+        {
+          append_byte(buffered_byte);
+        }
+
+      while (num_buffered_bytes > 1)
+        {
+          append_byte(0xff);
+          num_buffered_bytes--;
+        }
+    }
+
+  // printf("low: %08x  nbits left:%d  filled:%d\n",low,bits_left,32-bits_left);
+
+  write_bits(low >> 8, 24-bits_left);
+}
+
+
+void CABAC_encoder_bitstream::write_out()
+{
+  //logtrace(LogCABAC,"low = %08x (bits_left=%d)\n",low,bits_left);
+  int leadByte = low >> (24 - bits_left);
+  bits_left += 8;
+  low &= 0xffffffffu >> bits_left;
+
+  //logtrace(LogCABAC,"write byte %02x\n",leadByte);
+  //logtrace(LogCABAC,"-> low = %08x\n",low);
+
+  if (leadByte == 0xff)
+    {
+      num_buffered_bytes++;
+    }
+  else
+    {
+      if (num_buffered_bytes > 0)
+        {
+          int carry = leadByte >> 8;
+          int byte = buffered_byte + carry;
+          buffered_byte = leadByte & 0xff;
+          append_byte(byte);
+
+          byte = ( 0xff + carry ) & 0xff;
+          while ( num_buffered_bytes > 1 )
+            {
+              append_byte(byte);
+              num_buffered_bytes--;
+            }
+        }
+      else
+        {
+          num_buffered_bytes = 1;
+          buffered_byte = leadByte;
+        }
+    }
+}
+
+void CABAC_encoder_bitstream::testAndWriteOut()
+{
+  // logtrace(LogCABAC,"bits_left = %d\n",bits_left);
+
+  if (bits_left < 12)
+    {
+      write_out();
+    }
+}
+
+
+#ifdef DE265_LOG_TRACE
+int encBinCnt=1;
+#endif
+
+void CABAC_encoder_bitstream::write_CABAC_bit(int modelIdx, int bin)
+{
+  context_model* model = &(*mCtxModels)[modelIdx];
+  //m_uiBinsCoded += m_binCountIncrement;
+  //rcCtxModel.setBinsCoded( 1 );
+
+  logtrace(LogCABAC,"[%d] range=%x low=%x state=%d, bin=%d\n",
+           encBinCnt, range,low, model->state,bin);
+
+  /*
+  printf("[%d] range=%x low=%x state=%d, bin=%d\n",
+         encBinCnt, range,low, model->state,bin);
+
+  printf("%d %d X\n",model->state,bin != model->MPSbit);
+  */
+
+#ifdef DE265_LOG_TRACE
+  encBinCnt++;
+#endif
+
+  uint32_t LPS = LPS_table[model->state][ ( range >> 6 ) - 4 ];
+  range -= LPS;
+
+  if (bin != model->MPSbit)
+    {
+      //logtrace(LogCABAC,"LPS\n");
+
+      int num_bits = renorm_table[ LPS >> 3 ];
+      low = (low + range) << num_bits;
+      range   = LPS << num_bits;
+
+      if (model->state==0) { model->MPSbit = 1-model->MPSbit; }
+
+      model->state = next_state_LPS[model->state];
+
+      bits_left -= num_bits;
+    }
+  else
+    {
+      //logtrace(LogCABAC,"MPS\n");
+
+      model->state = next_state_MPS[model->state];
+
+
+      // renorm
+
+      if (range >= 256) { return; }
+
+      low <<= 1;
+      range <<= 1;
+      bits_left--;
+    }
+
+  testAndWriteOut();
+}
+
+void CABAC_encoder_bitstream::write_CABAC_bypass(int bin)
+{
+  logtrace(LogCABAC,"[%d] bypass = %d, range=%x\n",encBinCnt,bin,range);
+  /*
+  printf("[%d] bypass = %d, range=%x\n",encBinCnt,bin,range);
+  printf("%d %d X\n",64, -1);
+  */
+
+#ifdef DE265_LOG_TRACE
+  encBinCnt++;
+#endif
+
+  // BinsCoded += m_binCountIncrement;
+  low <<= 1;
+
+  if (bin)
+    {
+      low += range;
+    }
+  bits_left--;
+
+  testAndWriteOut();
+}
+
+void CABAC_encoder::write_CABAC_TU_bypass(int value, int cMax)
+{
+  for (int i=0;i<value;i++) {
+    write_CABAC_bypass(1);
+  }
+
+  if (value<cMax) {
+    write_CABAC_bypass(0);
+  }
+}
+
+void CABAC_encoder::write_CABAC_FL_bypass(int value, int n)
+{
+  while (n>0) {
+    n--;
+    write_CABAC_bypass(value & (1<<n));
+  }
+}
+
+void CABAC_encoder_bitstream::write_CABAC_term_bit(int bit)
+{
+  logtrace(LogCABAC,"CABAC term: range=%x\n", range);
+
+  range -= 2;
+
+  if (bit) {
+    low += range;
+
+    low <<= 7;
+    range = 2 << 7;
+    bits_left -= 7;
+  }
+  else if (range >= 256)
+    {
+      return;
+    }
+  else
+    {
+      low   <<= 1;
+      range <<= 1;
+      bits_left--;
+    }
+
+  testAndWriteOut();
+}
+
+
+
+
+static const uint32_t entropy_table[128] = {
+  // -------------------- 200 --------------------
+  /* state= 0 */  0x07d13 /* 0.977164 */,  0x08255 /* 1.018237 */,
+  /* state= 1 */  0x07738 /* 0.931417 */,  0x086ef /* 1.054179 */,
+  /* state= 2 */  0x0702b /* 0.876323 */,  0x0935a /* 1.151195 */,
+  /* state= 3 */  0x069e6 /* 0.827333 */,  0x09c7f /* 1.222650 */,
+  /* state= 4 */  0x062e8 /* 0.772716 */,  0x0a2c7 /* 1.271708 */,
+  /* state= 5 */  0x05c18 /* 0.719488 */,  0x0ae25 /* 1.360532 */,
+  /* state= 6 */  0x05632 /* 0.673414 */,  0x0b724 /* 1.430793 */,
+  /* state= 7 */  0x05144 /* 0.634904 */,  0x0c05d /* 1.502850 */,
+  /* state= 8 */  0x04bdf /* 0.592754 */,  0x0ccf2 /* 1.601145 */,
+  /* state= 9 */  0x0478d /* 0.559012 */,  0x0d57b /* 1.667843 */,
+  /* state=10 */  0x042ad /* 0.520924 */,  0x0de81 /* 1.738336 */,
+  /* state=11 */  0x03f4d /* 0.494564 */,  0x0e4b8 /* 1.786871 */,
+  /* state=12 */  0x03a9d /* 0.457945 */,  0x0f471 /* 1.909721 */,
+  /* state=13 */  0x037d5 /* 0.436201 */,  0x0fc56 /* 1.971385 */,
+  /* state=14 */  0x034c2 /* 0.412177 */,  0x10236 /* 2.017284 */,
+  /* state=15 */  0x031a6 /* 0.387895 */,  0x10d5c /* 2.104394 */,
+  /* state=16 */  0x02e62 /* 0.362383 */,  0x11b34 /* 2.212552 */,
+  /* state=17 */  0x02c20 /* 0.344752 */,  0x120b4 /* 2.255512 */,
+  /* state=18 */  0x029b8 /* 0.325943 */,  0x1294d /* 2.322672 */,
+  /* state=19 */  0x02791 /* 0.309143 */,  0x135e1 /* 2.420959 */,
+  /* state=20 */  0x02562 /* 0.292057 */,  0x13e37 /* 2.486077 */,
+  /* state=21 */  0x0230d /* 0.273846 */,  0x144fd /* 2.539000 */,
+  /* state=22 */  0x02193 /* 0.262308 */,  0x150c9 /* 2.631150 */,
+  /* state=23 */  0x01f5d /* 0.245026 */,  0x15ca0 /* 2.723641 */,
+  /* state=24 */  0x01de7 /* 0.233617 */,  0x162f9 /* 2.773246 */,
+  /* state=25 */  0x01c2f /* 0.220208 */,  0x16d99 /* 2.856259 */,
+  /* state=26 */  0x01a8e /* 0.207459 */,  0x17a93 /* 2.957634 */,
+  /* state=27 */  0x0195a /* 0.198065 */,  0x18051 /* 3.002477 */,
+  /* state=28 */  0x01809 /* 0.187778 */,  0x18764 /* 3.057759 */,
+  /* state=29 */  0x0164a /* 0.174144 */,  0x19460 /* 3.159206 */,
+  /* state=30 */  0x01539 /* 0.165824 */,  0x19f20 /* 3.243181 */,
+  /* state=31 */  0x01452 /* 0.158756 */,  0x1a465 /* 3.284334 */,
+  /* state=32 */  0x0133b /* 0.150261 */,  0x1b422 /* 3.407303 */,
+  /* state=33 */  0x0120c /* 0.140995 */,  0x1bce5 /* 3.475767 */,
+  /* state=34 */  0x01110 /* 0.133315 */,  0x1c394 /* 3.527962 */,
+  /* state=35 */  0x0104d /* 0.127371 */,  0x1d059 /* 3.627736 */,
+  /* state=36 */  0x00f8b /* 0.121451 */,  0x1d74b /* 3.681983 */,
+  /* state=37 */  0x00ef4 /* 0.116829 */,  0x1dfd0 /* 3.748540 */,
+  /* state=38 */  0x00e10 /* 0.109864 */,  0x1e6d3 /* 3.803335 */,
+  /* state=39 */  0x00d3f /* 0.103507 */,  0x1f925 /* 3.946462 */,
+  /* state=40 */  0x00cc4 /* 0.099758 */,  0x1fda7 /* 3.981667 */,
+  /* state=41 */  0x00c42 /* 0.095792 */,  0x203f8 /* 4.031012 */,
+  /* state=42 */  0x00b78 /* 0.089610 */,  0x20f7d /* 4.121014 */,
+  /* state=43 */  0x00afc /* 0.085830 */,  0x21dd6 /* 4.233102 */,
+  /* state=44 */  0x00a5e /* 0.081009 */,  0x22419 /* 4.282016 */,
+  /* state=45 */  0x00a1b /* 0.078950 */,  0x22a5e /* 4.331015 */,
+  /* state=46 */  0x00989 /* 0.074514 */,  0x23756 /* 4.432323 */,
+  /* state=47 */  0x0091b /* 0.071166 */,  0x24225 /* 4.516775 */,
+  /* state=48 */  0x008cf /* 0.068837 */,  0x2471a /* 4.555487 */,
+  /* state=49 */  0x00859 /* 0.065234 */,  0x25313 /* 4.649048 */,
+  /* state=50 */  0x00814 /* 0.063140 */,  0x25d67 /* 4.729721 */,
+  /* state=51 */  0x007b6 /* 0.060272 */,  0x2651f /* 4.790028 */,
+  /* state=52 */  0x0076e /* 0.058057 */,  0x2687c /* 4.816294 */,
+  /* state=53 */  0x00707 /* 0.054924 */,  0x27da7 /* 4.981661 */,
+  /* state=54 */  0x006d5 /* 0.053378 */,  0x28172 /* 5.011294 */,
+  /* state=55 */  0x00659 /* 0.049617 */,  0x28948 /* 5.072512 */,
+  /* state=56 */  0x00617 /* 0.047598 */,  0x297c5 /* 5.185722 */,
+  /* state=57 */  0x005dd /* 0.045814 */,  0x2a2df /* 5.272434 */,
+  /* state=58 */  0x005c1 /* 0.044965 */,  0x2a581 /* 5.293019 */,
+  /* state=59 */  0x00574 /* 0.042619 */,  0x2ad59 /* 5.354304 */,
+  /* state=60 */  0x0053b /* 0.040882 */,  0x2bba5 /* 5.465973 */,
+  /* state=61 */  0x0050c /* 0.039448 */,  0x2c596 /* 5.543651 */,
+  /* state=62 */  0x004e9 /* 0.038377 */,  0x2cd88 /* 5.605741 */,
+  0x00400 ,  0x2d000 /* dummy, should never be used */
+};
+
+
+static const uint32_t entropy_table_orig[128] = {
+  0x07b23, 0x085f9, 0x074a0, 0x08cbc, 0x06ee4, 0x09354, 0x067f4, 0x09c1b,
+  0x060b0, 0x0a62a, 0x05a9c, 0x0af5b, 0x0548d, 0x0b955, 0x04f56, 0x0c2a9,
+  0x04a87, 0x0cbf7, 0x045d6, 0x0d5c3, 0x04144, 0x0e01b, 0x03d88, 0x0e937,
+  0x039e0, 0x0f2cd, 0x03663, 0x0fc9e, 0x03347, 0x10600, 0x03050, 0x10f95,
+  0x02d4d, 0x11a02, 0x02ad3, 0x12333, 0x0286e, 0x12cad, 0x02604, 0x136df,
+  0x02425, 0x13f48, 0x021f4, 0x149c4, 0x0203e, 0x1527b, 0x01e4d, 0x15d00,
+  0x01c99, 0x166de, 0x01b18, 0x17017, 0x019a5, 0x17988, 0x01841, 0x18327,
+  0x016df, 0x18d50, 0x015d9, 0x19547, 0x0147c, 0x1a083, 0x0138e, 0x1a8a3,
+  0x01251, 0x1b418, 0x01166, 0x1bd27, 0x01068, 0x1c77b, 0x00f7f, 0x1d18e,
+  0x00eda, 0x1d91a, 0x00e19, 0x1e254, 0x00d4f, 0x1ec9a, 0x00c90, 0x1f6e0,
+  0x00c01, 0x1fef8, 0x00b5f, 0x208b1, 0x00ab6, 0x21362, 0x00a15, 0x21e46,
+  0x00988, 0x2285d, 0x00934, 0x22ea8, 0x008a8, 0x239b2, 0x0081d, 0x24577,
+  0x007c9, 0x24ce6, 0x00763, 0x25663, 0x00710, 0x25e8f, 0x006a0, 0x26a26,
+  0x00672, 0x26f23, 0x005e8, 0x27ef8, 0x005ba, 0x284b5, 0x0055e, 0x29057,
+  0x0050c, 0x29bab, 0x004c1, 0x2a674, 0x004a7, 0x2aa5e, 0x0046f, 0x2b32f,
+  0x0041f, 0x2c0ad, 0x003e7, 0x2ca8d, 0x003ba, 0x2d323, 0x0010c, 0x3bfbb
+};
+
+
+const uint32_t entropy_table_theory[128] =
+  {
+    0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f,
+    0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d,
+    0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c,
+    0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a,
+    0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759,
+    0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458,
+    0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156,
+    0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55
+  };
+
+
+void CABAC_encoder_estim::write_CABAC_bit(int modelIdx, int bit)
+{
+  context_model* model = &(*mCtxModels)[modelIdx];
+  //printf("[%d] state=%d, bin=%d\n", encBinCnt, model->state,bit);
+  //encBinCnt++;
+
+  int idx = model->state<<1;
+
+  if (bit==model->MPSbit) {
+    model->state = next_state_MPS[model->state];
+  }
+  else {
+    idx++;
+    if (model->state==0) { model->MPSbit = 1-model->MPSbit; }
+    model->state = next_state_LPS[model->state];
+  }
+
+  mFracBits += entropy_table[idx];
+
+  //printf("-> %08lx %f\n",entropy_table[idx], entropy_table[idx] / float(1<<15));
+}
+
+
+float CABAC_encoder::RDBits_for_CABAC_bin(int modelIdx, int bit)
+{
+  context_model* model = &(*mCtxModels)[modelIdx];
+  int idx = model->state<<1;
+
+  if (bit!=model->MPSbit) {
+    idx++;
+  }
+
+  return entropy_table[idx] / float(1<<15);
+}
+
+
+void CABAC_encoder::write_CABAC_EGk(int val, int k)
+{
+  while (val  >=  ( 1 << k ) ) {
+    write_CABAC_bypass(1);
+    val = val - ( 1 << k );
+    k++;
+  }
+
+  write_CABAC_bypass(0);
+
+  while (k)  {
+    k--;
+    write_CABAC_bypass((val >> k) & 1);
+  }
+}
+
+
+
+void CABAC_encoder_estim_constant::write_CABAC_bit(int modelIdx, int bit)
+{
+  context_model* model = &(*mCtxModels)[modelIdx];
+  int idx = model->state<<1;
+
+  if (bit!=model->MPSbit) {
+    idx++;
+  }
+
+  mFracBits += entropy_table[idx];
+}
+
+
+
+#if 0
+void printtab(int idx,int s)
+{
+  printf("%d %f %f %f\n", s,
+         double(entropy_table[idx])/0x8000,
+         double(entropy_table_orig[idx])/0x8000,
+         double(entropy_table_f265[idx])/0x8000);
+}
+
+
+void plot_tables()
+{
+  for (int i=-62;i<=0;i++) {
+    int idx = -i *2;
+    int s = i;
+    printtab(idx,s);
+  }
+
+  for (int i=0;i<=62;i++) {
+    int idx = 2*i +1;
+    int s = i;
+    printtab(idx,s);
+  }
+}
+#endif
diff --git a/libde265/cabac.h b/libde265/cabac.h
index a4c51ea..e28aeeb 100644
--- a/libde265/cabac.h
+++ b/libde265/cabac.h
@@ -22,6 +22,7 @@
 #define DE265_CABAC_H
 
 #include <stdint.h>
+#include "contextmodel.h"
 
 
 typedef struct {
@@ -35,12 +36,6 @@ typedef struct {
 } CABAC_decoder;
 
 
-typedef struct {
-  uint8_t MPSbit : 1;
-  uint8_t state  : 7;
-} context_model;
-
-
 void init_CABAC_decoder(CABAC_decoder* decoder, uint8_t* bitstream, int length);
 void init_CABAC_decoder_2(CABAC_decoder* decoder);
 int  decode_CABAC_bit(CABAC_decoder* decoder, context_model* model);
@@ -53,4 +48,164 @@ int  decode_CABAC_FL_bypass(CABAC_decoder* decoder, int nBits);
 int  decode_CABAC_TR_bypass(CABAC_decoder* decoder, int cRiceParam, int cTRMax);
 int  decode_CABAC_EGk_bypass(CABAC_decoder* decoder, int k);
 
+
+// ---------------------------------------------------------------------------
+
+class CABAC_encoder
+{
+public:
+ CABAC_encoder() : mCtxModels(NULL) { }
+  virtual ~CABAC_encoder() { }
+
+  virtual int size() const = 0;
+  virtual void reset() = 0;
+
+  // --- VLC ---
+
+  virtual void write_bits(uint32_t bits,int n) = 0;
+  virtual void write_bit(int bit) { write_bits(bit,1); }
+  virtual void write_uvlc(int value);
+  virtual void write_svlc(int value);
+  virtual void write_startcode() = 0;
+  virtual void skip_bits(int nBits) = 0;
+
+  virtual void add_trailing_bits();
+  virtual int  number_free_bits_in_byte() const = 0;
+
+  // output all remaining bits and fill with zeros to next byte boundary
+  virtual void flush_VLC() { }
+
+
+  // --- CABAC ---
+
+  void set_context_models(context_model_table* models) { mCtxModels=models; }
+
+  virtual void init_CABAC() { }
+  virtual void write_CABAC_bit(int modelIdx, int bit) = 0;
+  virtual void write_CABAC_bypass(int bit) = 0;
+  virtual void write_CABAC_TU_bypass(int value, int cMax);
+  virtual void write_CABAC_FL_bypass(int value, int nBits);
+  virtual void write_CABAC_term_bit(int bit) = 0;
+  virtual void flush_CABAC()  { }
+
+  void write_CABAC_EGk(int absolute_symbol, int k); // absolute_symbol >= 0
+
+  virtual bool modifies_context() const = 0;
+
+  float RDBits_for_CABAC_bin(int modelIdx, int bit);
+
+ protected:
+  context_model_table* mCtxModels;
+};
+
+
+class CABAC_encoder_bitstream : public CABAC_encoder
+{
+public:
+  CABAC_encoder_bitstream();
+  ~CABAC_encoder_bitstream();
+
+  virtual void reset();
+
+  virtual int size() const { return data_size; }
+  uint8_t* data() const { return data_mem; }
+
+  // --- VLC ---
+
+  virtual void write_bits(uint32_t bits,int n);
+  virtual void write_startcode();
+  virtual void skip_bits(int nBits);
+
+  virtual int  number_free_bits_in_byte() const;
+
+  // output all remaining bits and fill with zeros to next byte boundary
+  virtual void flush_VLC();
+
+
+  // --- CABAC ---
+
+  virtual void init_CABAC();
+  virtual void write_CABAC_bit(int modelIdx, int bit);
+  virtual void write_CABAC_bypass(int bit);
+  virtual void write_CABAC_term_bit(int bit);
+  virtual void flush_CABAC();
+
+  virtual bool modifies_context() const { return true; }
+
+private:
+  // data buffer
+
+  uint8_t* data_mem;
+  uint32_t data_capacity;
+  uint32_t data_size;
+  char     state; // for inserting emulation-prevention bytes
+
+  // VLC
+
+  uint32_t vlc_buffer;
+  uint32_t vlc_buffer_len;
+
+
+  // CABAC
+
+  uint32_t range;
+  uint32_t low;
+  int8_t   bits_left;
+  uint8_t  buffered_byte;
+  uint16_t num_buffered_bytes;
+
+
+  void check_size_and_resize(int nBytes);
+  void testAndWriteOut();
+  void write_out();
+  void append_byte(int byte);
+};
+
+
+class CABAC_encoder_estim : public CABAC_encoder
+{
+public:
+  CABAC_encoder_estim() : mFracBits(0) { }
+
+  virtual void reset() { mFracBits=0; }
+
+  virtual int size() const { return mFracBits>>(15+3); }
+
+  uint64_t getFracBits() const { return mFracBits; }
+  float    getRDBits() const { return mFracBits / float(1<<15); }
+
+  // --- VLC ---
+
+  virtual void write_bits(uint32_t bits,int n) { mFracBits += n<<15; }
+  virtual void write_bit(int bit) { mFracBits+=1<<15; }
+  virtual void write_startcode() { mFracBits += (1<<15)*8*3; }
+  virtual void skip_bits(int nBits) { mFracBits += nBits<<15; }
+  virtual int  number_free_bits_in_byte() const { return 0; } // TODO, good enough for now
+
+  // --- CABAC ---
+
+  virtual void write_CABAC_bit(int modelIdx, int bit);
+  virtual void write_CABAC_bypass(int bit) {
+    mFracBits += 0x8000;
+  }
+  virtual void write_CABAC_FL_bypass(int value, int nBits) {
+    mFracBits += nBits<<15;
+  }
+  virtual void write_CABAC_term_bit(int bit) { /* not implemented (not needed) */ }
+
+  virtual bool modifies_context() const { return true; }
+
+ protected:
+  uint64_t mFracBits;
+};
+
+
+class CABAC_encoder_estim_constant : public CABAC_encoder_estim
+{
+ public:
+  void write_CABAC_bit(int modelIdx, int bit);
+
+  virtual bool modifies_context() const { return false; }
+};
+
 #endif
diff --git a/libde265/configparam.cc b/libde265/configparam.cc
new file mode 100644
index 0000000..e4bcce2
--- /dev/null
+++ b/libde265/configparam.cc
@@ -0,0 +1,485 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "configparam.h"
+
+#include <string.h>
+#include <ctype.h>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <algorithm>
+#include <typeinfo>
+
+#ifndef RTTI_ENABLED
+#error "Need to compile with RTTI enabled."
+#endif
+
+static void remove_option(int* argc,char** argv,int idx, int n=1)
+{
+  for (int i=idx+n;i<*argc;i++) {
+    argv[i-n] = argv[i];
+  }
+
+  *argc-=n;
+}
+
+
+bool option_string::processCmdLineArguments(char** argv, int* argc, int idx)
+{
+  if (argv==NULL)   { return false; }
+  if (idx >= *argc) { return false; }
+
+  value = argv[idx];
+  value_set = true;
+
+  remove_option(argc,argv,idx,1);
+
+  return true;
+}
+
+
+void option_int::set_range(int mini,int maxi)
+{
+  have_low_limit =true;
+  have_high_limit=true;
+  low_limit =mini;
+  high_limit=maxi;
+}
+
+std::string option_int::getTypeDescr() const
+{
+  std::stringstream sstr;
+  sstr << "(int)";
+
+  if (have_low_limit || have_high_limit) { sstr << " "; }
+  if (have_low_limit) { sstr << low_limit << " <= "; }
+  if (have_low_limit || have_high_limit) { sstr << "x"; }
+  if (have_high_limit) { sstr << " <= " << high_limit; }
+
+  if (!valid_values_set.empty()) {
+    sstr << " {";
+    bool first=true;
+    FOR_LOOP(int, v, valid_values_set) {
+      if (!first) sstr << ","; else first=false;
+      sstr << v;
+    }
+    sstr << "}";
+  }
+
+  return sstr.str();
+}
+
+bool option_int::processCmdLineArguments(char** argv, int* argc, int idx)
+{
+  if (argv==NULL)   { return false; }
+  if (idx >= *argc) { return false; }
+
+  int v = atoi(argv[idx]);
+  if (!is_valid(v)) { return false; }
+
+  value = v;
+  value_set = true;
+
+  remove_option(argc,argv,idx,1);
+
+  return true;
+}
+
+bool option_int::is_valid(int v) const
+{
+  if (have_low_limit  && v<low_limit)  { return false; }
+  if (have_high_limit && v>high_limit) { return false; }
+
+  if (!valid_values_set.empty()) {
+    auto iter = std::find(valid_values_set.begin(), valid_values_set.end(), v);
+    if (iter==valid_values_set.end()) { return false; }
+  }
+
+  return true;
+}
+
+std::string option_int::get_default_string() const
+{
+  std::stringstream sstr;
+  sstr << default_value;
+  return sstr.str();
+}
+
+
+std::string choice_option_base::getTypeDescr() const
+{
+  std::vector<std::string> choices = get_choice_names();
+
+  std::stringstream sstr;
+  sstr << "{";
+
+  bool first=true;
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, c, choices) {
+#else
+  FOR_LOOP(std::string, c, choices) {
+#endif
+    if (first) { first=false; }
+    else { sstr << ","; }
+
+    sstr << c;
+  }
+
+  sstr << "}";
+  return sstr.str();
+}
+
+
+bool choice_option_base::processCmdLineArguments(char** argv, int* argc, int idx)
+{
+  if (argv==NULL)   { return false; }
+  if (idx >= *argc) { return false; }
+
+  std::string value = argv[idx];
+
+  std::cout << "set " << value << "\n";
+  bool success = set_value(value);
+  std::cout << "success " << success << "\n";
+
+  remove_option(argc,argv,idx,1);
+
+  return success;
+}
+
+
+static char* fill_strings_into_memory(const std::vector<std::string>& strings_list)
+{
+  // calculate memory requirement
+  // Layout of the returned block: [numStrings+1 pointers, last one NULL][string 0 \0][string 1 \0]...
+  int totalStringLengths = 0;
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, str, strings_list) {
+#else
+  FOR_LOOP(std::string, str, strings_list) {
+#endif
+    totalStringLengths += str.length() +1; // +1 for null termination
+  }
+
+  int numStrings = strings_list.size();
+
+  int pointersSize = (numStrings+1) * sizeof(const char*); // +1 for the terminating NULL entry
+
+  char* memory = new char[pointersSize + totalStringLengths];
+  // One single new char[] allocation holds table and strings, so it has to be freed with delete[].
+
+  // copy strings to memory area
+
+  char* stringPtr = memory + (numStrings+1) * sizeof(const char*); // string data starts behind the table
+  const char** tablePtr = (const char**)memory;
+
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, str, strings_list) {
+#else
+  FOR_LOOP(std::string, str, strings_list) {
+#endif
+    *tablePtr++ = stringPtr;
+
+    strcpy(stringPtr, str.c_str());
+    stringPtr += str.length()+1;
+  }
+
+  *tablePtr = NULL; // the table is NULL-terminated; its length is not stored anywhere else
+
+  return memory;
+}
+
+
+const char** choice_option_base::get_choices_string_table() const
+{
+  if (choice_string_table==NULL) {
+    choice_string_table = fill_strings_into_memory(get_choice_names());
+  }
+
+  return (const char**)choice_string_table;
+}
+
+
+
+bool config_parameters::parse_command_line_params(int* argc, char** argv, int* first_idx_ptr,
+                                                  bool ignore_unknown_options)
+{
+  int first_idx=1;
+  if (first_idx_ptr) { first_idx = *first_idx_ptr; }
+
+  for (int i=first_idx;i < *argc;i++) {
+
+    if (argv[i][0]=='-') {
+      // option
+
+      if (argv[i][1]=='-') {
+        // long option
+
+        bool option_found=false;
+
+        for (int o=0;o<mOptions.size();o++) {
+          if (mOptions[o]->hasLongOption() && strcmp(mOptions[o]->getLongOption().c_str(),
+                                                     argv[i]+2)==0) {
+            option_found=true;
+
+            bool success = mOptions[o]->processCmdLineArguments(argv,argc, i+1);
+            if (!success) {
+              if (first_idx_ptr) { *first_idx_ptr = i; }
+              return false;
+            }
+
+            remove_option(argc,argv,i);
+            i--;
+
+            break;
+          }
+        }
+
+        if (option_found == false && !ignore_unknown_options) {
+          return false;
+        }
+      }
+      else {
+        // short option
+
+        bool is_single_option = (argv[i][1] != 0 && argv[i][2]==0);
+        bool do_remove_option = true;
+
+        for (int n=1; argv[i][n]; n++) {
+          char option = argv[i][n];
+
+          bool option_found=false;
+
+          for (int o=0;o<mOptions.size();o++) {
+            if (mOptions[o]->getShortOption() == option) {
+              option_found=true;
+
+              bool success;
+              if (is_single_option) {
+                success = mOptions[o]->processCmdLineArguments(argv,argc, i+1);
+              }
+              else {
+                success = mOptions[o]->processCmdLineArguments(NULL,NULL, 0);
+              }
+
+              if (!success) {
+                if (first_idx_ptr) { *first_idx_ptr = i; }
+                return false;
+              }
+
+              break;
+            }
+          }
+
+          if (!option_found) {
+            if (!ignore_unknown_options) {
+              fprintf(stderr, "unknown option -%c\n",option);
+              return false;
+            }
+            else {
+              do_remove_option=false;
+            }
+          }
+
+        } // all short options
+
+        if (do_remove_option) {
+          remove_option(argc,argv,i);
+          i--;
+        }
+      } // is short option
+    } // is option
+  } // all command line arguments
+
+  return true;
+}
+
+
+void config_parameters::print_params() const
+{
+  for (int i=0;i<mOptions.size();i++) {
+    const option_base* o = mOptions[i];
+
+    std::stringstream sstr;
+    sstr << "  ";
+    if (o->hasShortOption()) {
+      sstr << '-' << o->getShortOption();
+    } else {
+      sstr << "  ";
+    }
+
+    if (o->hasShortOption() && o->hasLongOption()) {
+      sstr << ", ";
+    } else {
+      sstr << "  ";
+    }
+
+    if (o->hasLongOption()) {
+      sstr << "--" << std::setw(12) << std::left << o->getLongOption();
+    } else {
+      sstr << "              ";
+    }
+
+    sstr << " ";
+    sstr << o->getTypeDescr();
+
+    if (o->has_default()) {
+      sstr << ", default=" << o->get_default_string();
+    }
+
+    sstr << "\n";
+
+    std::cerr << sstr.str();
+  }
+}
+
+
+void config_parameters::add_option(option_base* o)
+{
+  mOptions.push_back(o);
+  delete[] param_string_table; // delete old table, since we got a new parameter
+  param_string_table = NULL;
+}
+
+
+std::vector<std::string> config_parameters::get_parameter_IDs() const
+{
+  std::vector<std::string> ids;
+
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, option, mOptions) {
+#else
+  FOR_LOOP(option_base*, option, mOptions) {
+#endif
+    ids.push_back(option->get_name());
+  }
+
+  return ids;
+}
+
+
+enum en265_parameter_type config_parameters::get_parameter_type(const char* param) const
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  if (dynamic_cast<option_int*>   (option)) { return en265_parameter_int; }
+  if (dynamic_cast<option_bool*>  (option)) { return en265_parameter_bool; }
+  if (dynamic_cast<option_string*>(option)) { return en265_parameter_string; }
+  if (dynamic_cast<choice_option_base*>(option)) { return en265_parameter_choice; }
+
+  assert(false);
+  return en265_parameter_bool;
+}
+
+
+std::vector<std::string> config_parameters::get_parameter_choices(const char* param) const
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  choice_option_base* o = dynamic_cast<choice_option_base*>(option);
+  assert(o);
+  
+  return o->get_choice_names();
+}
+
+
+option_base* config_parameters::find_option(const char* param) const
+{
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, o, mOptions) {
+#else
+  FOR_LOOP(option_base*, o, mOptions) {
+#endif
+    if (strcmp(o->get_name().c_str(), param)==0) { return o; }
+  }
+
+  return NULL;
+}
+
+
+bool config_parameters::set_bool(const char* param, bool value)
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  option_bool* o = dynamic_cast<option_bool*>(option);
+  assert(o);
+
+  return o->set(value);
+}
+
+bool config_parameters::set_int(const char* param, int value)
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  option_int* o = dynamic_cast<option_int*>(option);
+  assert(o);
+
+  return o->set(value);
+}
+
+bool config_parameters::set_string(const char* param, const char* value)
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  option_string* o = dynamic_cast<option_string*>(option);
+  assert(o);
+
+  return o->set(value);
+}
+
+bool config_parameters::set_choice(const char* param, const char* value)
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  choice_option_base* o = dynamic_cast<choice_option_base*>(option);
+  assert(o);
+
+  return o->set(value);
+}
+
+
+
+const char** config_parameters::get_parameter_choices_table(const char* param) const
+{
+  option_base* option = find_option(param);
+  assert(option);
+
+  choice_option_base* o = dynamic_cast<choice_option_base*>(option);
+  assert(o);
+
+  return o->get_choices_string_table();
+}
+
+const char** config_parameters::get_parameter_string_table() const
+{
+  if (param_string_table==NULL) {
+    param_string_table = fill_strings_into_memory(get_parameter_IDs());
+  }
+
+  return (const char**)param_string_table;
+}
diff --git a/libde265/configparam.h b/libde265/configparam.h
new file mode 100644
index 0000000..2cb2f02
--- /dev/null
+++ b/libde265/configparam.h
@@ -0,0 +1,400 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CONFIG_PARAM_H
+#define CONFIG_PARAM_H
+
+#include "en265.h"
+#include "util.h"
+
+#include <climits>
+#include <vector>
+#include <string>
+#include <stddef.h>
+#include <assert.h>
+
+
+/* Notes: probably best to keep cmd-line-options here. So it will be:
+   - automatically consistent even when having different combinations of algorithms
+   - no other place to edit
+   - if needed, one can still override it at another place
+ */
+
+// TODO: set a stack of default prefixes in config_parameters, such that all options added
+// will receive this namespace prefix.
+
+// TODO: add the possibility to remove long options again, i.e., not use the default id name
+class option_base
+{
+ public:
+ option_base() : mShortOption(0), mLongOption(NULL) { }
+ option_base(const char* name) : mIDName(name), mShortOption(0), mLongOption(NULL) { }
+  virtual ~option_base() { }
+
+
+  // --- option identifier ---
+
+  void set_ID(const char* name) { mIDName=name; }
+  void add_namespace_prefix(std::string prefix) { mPrefix = prefix + ":" + mPrefix; }
+
+  std::string get_name() const { return mPrefix + mIDName; }
+
+
+  // --- description ---
+
+  void set_description(std::string descr) { mDescription = descr; }
+  std::string get_description() const { return mDescription; }
+
+
+  // --- value ---
+
+  virtual bool is_defined() const = 0;
+  bool is_undefined() const { return !is_defined(); }
+
+  virtual bool has_default() const  = 0;
+
+
+  // --- command line options ----
+
+  void set_cmd_line_options(const char* long_option, char short_option = 0)
+  {
+    mShortOption = short_option;
+    mLongOption  = long_option;
+  }
+
+  void set_short_option(char short_option) { mShortOption=short_option; }
+
+  void unsetCmdLineOption()
+  {
+    mShortOption = 0;
+    mLongOption  = NULL;
+  }
+
+  bool hasShortOption() const { return mShortOption!=0; }
+  char getShortOption() const { return mShortOption; }
+  bool hasLongOption() const { return true; } //mLongOption!=NULL; }
+  std::string getLongOption() const { return mLongOption ? std::string(mLongOption) : get_name(); }
+
+  virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { return false; }
+
+
+
+  virtual std::string getTypeDescr() const = 0;
+
+  virtual std::string get_default_string() const { return "N/A"; }
+
+ private:
+  std::string mPrefix;
+  std::string mIDName;
+
+  std::string mDescription;
+
+  char mShortOption;
+  const char* mLongOption;
+};
+
+
+
+class option_bool : public option_base  // boolean switch: reads as the explicit value if set, else the default
+{
+public:
+  option_bool() : value_set(false), value(false), default_set(false), default_value(false) { }
+
+  operator bool() const {
+    assert(value_set || default_set);
+    return value_set ? value : default_value;
+  }
+
+  virtual bool is_defined() const { return value_set || default_set; }
+  virtual bool has_default() const { return default_set; }
+
+  void set_default(bool v) { default_value=v; default_set=true; }
+  virtual std::string get_default_string() const { return default_value ? "true":"false"; }
+
+  virtual std::string getTypeDescr() const { return "boolean"; }
+  // A flag consumes no argv entry. Go through set() so value_set is recorded and the flag overrides the default.
+  virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx) { return set(true); }
+  bool set(bool v) { value_set=true; value=v; return true; }
+
+ private:
+  bool value_set;     // true once set() has been called
+  bool value;
+
+  bool default_set;   // true once set_default() has been called
+  bool default_value;
+};
+
+
+class option_string : public option_base
+{
+public:
+  option_string() : value_set(false), default_set(false) { }
+
+  const option_string& operator=(std::string v) { value=v; value_set=true; return *this; }
+
+  operator std::string() const { return get(); }
+  std::string get() const {
+    assert(value_set || default_set);
+    return value_set ? value : default_value;
+  }
+
+  virtual bool is_defined() const { return value_set || default_set; }
+  virtual bool has_default() const { return default_set; }
+
+  void set_default(std::string v) { default_value=v; default_set=true; }
+  virtual LIBDE265_API std::string get_default_string() const { return default_value; }
+
+  virtual LIBDE265_API std::string getTypeDescr() const { return "(string)"; }
+  virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx);
+
+  bool set(std::string v) { value_set=true; value=v; return true; }
+
+ private:
+  bool value_set;
+  std::string value;
+
+  bool default_set;
+  std::string default_value;
+};
+
+
+class option_int : public option_base
+{
+public:
+  option_int() : value_set(false), default_set(false),
+    have_low_limit(false), have_high_limit(false) { }
+
+  void set_minimum(int mini) { have_low_limit =true; low_limit =mini; }
+  void set_maximum(int maxi) { have_high_limit=true; high_limit=maxi; }
+  void set_range(int mini,int maxi);
+  void set_valid_values(const std::vector<int>& v) { valid_values_set = v; }
+
+  const option_int& operator=(int v) { value=v; value_set=true; return *this; }
+
+  int operator() () const {
+    assert(value_set || default_set);
+    return value_set ? value : default_value;
+  }
+  operator int() const { return operator()(); }
+
+  virtual bool is_defined() const { return value_set || default_set; }
+  virtual bool has_default() const { return default_set; }
+
+  void set_default(int v) { default_value=v; default_set=true; }
+  virtual LIBDE265_API std::string get_default_string() const;
+
+  virtual LIBDE265_API std::string getTypeDescr() const;
+  virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx);
+
+  bool set(int v) {
+    if (is_valid(v)) { value_set=true; value=v; return true; }
+    else { return false; }
+  }
+
+ private:
+  bool value_set;
+  int value;
+
+  bool default_set;
+  int default_value;
+
+  bool have_low_limit, have_high_limit;
+  int  low_limit, high_limit;
+
+  std::vector<int> valid_values_set;
+
+  bool is_valid(int v) const;
+};
+
+
+
+class choice_option_base : public option_base
+{
+public:
+  choice_option_base() : choice_string_table(NULL) { }
+  ~choice_option_base() { delete[] choice_string_table; }
+
+  bool set(std::string v) { return set_value(v); }
+  virtual bool set_value(const std::string& val) = 0;
+  virtual std::vector<std::string> get_choice_names() const = 0;
+
+  virtual std::string getTypeDescr() const;
+  virtual LIBDE265_API bool processCmdLineArguments(char** argv, int* argc, int idx);
+
+  const char** get_choices_string_table() const;
+
+ protected:
+  void invalidate_choices_string_table() {
+    delete[] choice_string_table;
+    choice_string_table = NULL;
+  }
+
+ private:
+  mutable char* choice_string_table;
+};
+
+
+template <class T> class choice_option : public choice_option_base
+{
+ public:
+ choice_option() : default_set(false), value_set(false) { }
+
+  // --- initialization ---
+
+  void add_choice(const std::string& s, T id, bool default_value=false) {
+    choices.push_back( std::make_pair(s,id) );
+    if (default_value) {
+      defaultID = id;
+      defaultValue = s;
+      default_set = true;
+    }
+
+    invalidate_choices_string_table();
+  }
+
+  void set_default(T val) {
+#ifdef FOR_LOOP_AUTO_SUPPORT
+    FOR_LOOP(auto, c, choices) {
+#else
+    for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) {
+      const std::pair<std::string,T> & c = *it;
+#endif
+      if (c.second == val) {
+        defaultID = val;
+        defaultValue = c.first;
+        default_set = true;
+        return;
+      }
+    }
+
+    assert(false); // value does not exist
+  }
+
+
+  // --- usage ---
+
+  bool set_value(const std::string& val) // returns false if it is not a valid option
+  {
+    value_set = true;
+    selectedValue=val;
+
+    validValue = false;
+
+#ifdef FOR_LOOP_AUTO_SUPPORT
+    FOR_LOOP(auto, c, choices) {
+#else
+    for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) {
+      const std::pair<std::string,T> & c = *it;
+#endif
+      if (val == c.first) {
+        selectedID = c.second;
+        validValue = true;
+      }
+    }
+
+    return validValue;
+  }
+
+  bool isValidValue() const { return validValue; }
+
+  const std::string& getValue() const {
+    assert(value_set || default_set);
+    return value_set ? selectedValue : defaultValue;
+  }
+  void setID(T id) { selectedID=id; validValue=true; }
+  const T getID() const { return value_set ? selectedID : defaultID; }
+
+  virtual bool is_defined() const { return value_set || default_set; }
+  virtual bool has_default() const { return default_set; }
+
+  std::vector<std::string> get_choice_names() const
+  {
+    std::vector<std::string> names;
+#ifdef FOR_LOOP_AUTO_SUPPORT
+    FOR_LOOP(auto, p, choices) {
+#else
+    for (typename std::vector< std::pair<std::string,T> >::const_iterator it=choices.begin(); it!=choices.end(); ++it) {
+      const std::pair<std::string,T> & p = *it;
+#endif
+      names.push_back(p.first);
+    }
+    return names;
+  }
+
+  std::string get_default_string() const { return defaultValue; }
+
+  T operator() () const { return (T)getID(); }
+
+ private:
+  std::vector< std::pair<std::string,T> > choices;
+
+  bool default_set;
+  std::string defaultValue;
+  T defaultID;
+
+  bool value_set;
+  std::string selectedValue;
+  T selectedID;
+
+  bool validValue;
+};
+
+
+
+
+class config_parameters
+{
+ public:
+ config_parameters() : param_string_table(NULL) { }
+  ~config_parameters() { delete[] param_string_table; }
+
+  void LIBDE265_API add_option(option_base* o);
+
+  void LIBDE265_API print_params() const;
+  bool LIBDE265_API parse_command_line_params(int* argc, char** argv, int* first_idx=NULL,
+                                 bool ignore_unknown_options=false);
+
+
+  // --- connection to C API ---
+
+  std::vector<std::string> get_parameter_IDs() const;
+  enum en265_parameter_type get_parameter_type(const char* param) const;
+
+  std::vector<std::string> get_parameter_choices(const char* param) const;
+
+  bool set_bool(const char* param, bool value);
+  bool set_int(const char* param, int value);
+  bool set_string(const char* param, const char* value);
+  bool set_choice(const char* param, const char* value);
+
+  const char** get_parameter_string_table() const;
+  const char** get_parameter_choices_table(const char* param) const;
+
+ private:
+  std::vector<option_base*> mOptions;
+
+  option_base* find_option(const char* param) const;
+
+  mutable char* param_string_table;
+};
+
+#endif
diff --git a/libde265/contextmodel.cc b/libde265/contextmodel.cc
new file mode 100644
index 0000000..3c0385a
--- /dev/null
+++ b/libde265/contextmodel.cc
@@ -0,0 +1,347 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "slice.h"
+#include <assert.h>
+#include <iomanip>
+#include <sstream>
+
+bool D = false;
+
+context_model_table::context_model_table()
+  : model(NULL), refcnt(NULL)
+{
+}
+
+
+context_model_table::context_model_table(const context_model_table& src)
+{
+  if (D) printf("%p c'tor = %p\n",this,&src);
+
+  if (src.refcnt) {
+    (*(src.refcnt))++;
+  }
+
+  refcnt = src.refcnt;
+  model  = src.model;
+}
+
+
+context_model_table::~context_model_table()
+{
+  if (D) printf("%p destructor\n",this);
+
+  if (refcnt) {
+    (*refcnt)--;
+    if (*refcnt==0) {
+      if (D) printf("mfree %p\n",model);
+      delete[] model;
+      delete refcnt;
+    }
+  }
+}
+
+
+void context_model_table::init(int initType, int QPY)
+{
+  if (D) printf("%p init\n",this);
+
+  decouple_or_alloc_with_empty_data();
+
+  initialize_CABAC_models(model, initType, QPY);
+}
+
+
+void context_model_table::release()
+{
+  if (D) printf("%p release %p\n",this,refcnt);
+  // Give up this object's share of the table; the last owner frees both table and counter.
+  if (!refcnt) { return; }
+
+  // if (*refcnt == 1) { return; } <- keep memory for later, but does not work when we believe that we freed the memory and nulled all references
+  // Test the counter value, not the pointer: refcnt itself is never NULL at this point.
+  (*refcnt)--;
+  if (*refcnt==0) {
+    delete[] model;
+    delete refcnt;
+  }
+
+  model = nullptr;
+  refcnt= nullptr;
+}
+
+
+void context_model_table::decouple()
+{
+  if (D) printf("%p decouple (%p)\n",this,refcnt);
+  // Make this object the only owner of its table: if the table is shared, switch to a private copy.
+  assert(refcnt); // not necessarily so, but we never use it on an uninitialized object
+
+  if (*refcnt > 1) {
+    (*refcnt)--; // the other owners keep the old table and counter
+
+    context_model* oldModel = model;
+
+    model = new context_model[CONTEXT_MODEL_TABLE_LENGTH];
+    refcnt= new int;
+    *refcnt=1;
+
+    memcpy(model,oldModel,sizeof(context_model)*CONTEXT_MODEL_TABLE_LENGTH);
+  }
+}
+
+
+context_model_table context_model_table::transfer()
+{
+  context_model_table newtable;
+  newtable.model = model;
+  newtable.refcnt= refcnt;
+
+  model =nullptr;
+  refcnt=nullptr;
+
+  return newtable;
+}
+
+
+context_model_table& context_model_table::operator=(const context_model_table& src)
+{
+  if (D) printf("%p assign = %p\n",this,&src);
+  if (this == &src) { return *this; } // self-assignment: release() below would null src.model/src.refcnt, too
+  // assert(src.refcnt); // not necessarily so, but we never use it on an uninitialized object
+
+  if (!src.refcnt) {
+    release(); // assigning an empty table just drops whatever we hold
+    return *this;
+  }
+
+  (*(src.refcnt))++;
+  // Take the new reference before dropping the old one, in case both objects share one table.
+  release();
+
+  model = src.model;
+  refcnt= src.refcnt;
+
+  return *this;
+}
+
+
+bool context_model_table::operator==(const context_model_table& b) const
+{
+  if (b.model == model) return true;
+  if (b.model == nullptr || model == nullptr) return false;
+
+  for (int i=0;i<CONTEXT_MODEL_TABLE_LENGTH;i++) {
+    if (!(b.model[i] == model[i])) return false;
+  }
+
+  return true;
+}
+
+
+std::string context_model_table::debug_dump() const
+{
+  int hash = 0;
+  for (int i=0;i<CONTEXT_MODEL_TABLE_LENGTH;i++) {
+    hash ^= ((i+7)*model[i].state) & 0xFFFF;
+  }
+
+  std::stringstream sstr;
+  sstr << std::hex << hash;
+  return sstr.str();
+}
+
+
+void context_model_table::decouple_or_alloc_with_empty_data()
+{
+  if (refcnt && *refcnt==1) { return; }
+
+  if (refcnt) {
+    assert(*refcnt>1);
+    (*refcnt)--;
+  }
+
+  if (D) printf("%p (alloc)\n",this);
+
+  model = new context_model[CONTEXT_MODEL_TABLE_LENGTH];
+  refcnt= new int;
+  *refcnt=1;
+}
+
+
+
+
+
+
+static void set_initValue(int SliceQPY,
+                          context_model* model, int initValue, int nContexts)
+{
+  int slopeIdx = initValue >> 4;       // bits above the low nibble select the slope
+  int intersecIdx = initValue & 0xF;   // low 4 bits select the intersection (offset)
+  int m = slopeIdx*5 - 45;
+  int n = (intersecIdx<<3) - 16;
+  int preCtxState = Clip3(1,126, ((m*Clip3(0,51, SliceQPY))>>4)+n); // QP goes through Clip3(0,51,..) first
+  // NOTE(review): looks like the H.265 context-variable initialization formula -- confirm against the spec
+  // logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",SliceQPY,slopeIdx,intersecIdx,m,n);
+  // preCtxState depends only on SliceQPY and initValue, so all nContexts entries get the same MPSbit/state.
+  for (int i=0;i<nContexts;i++) {
+    model[i].MPSbit=(preCtxState<=63) ? 0 : 1;
+    model[i].state = model[i].MPSbit ? (preCtxState-64) : (63-preCtxState);
+
+    // model state will always be between [0;62]
+
+    assert(model[i].state <= 62);
+  }
+}
+
+
+static const int initValue_split_cu_flag[3][3] = {
+  { 139,141,157 },
+  { 107,139,126 },
+  { 107,139,126 },
+};
+static const int initValue_cu_skip_flag[2][3] = {
+  { 197,185,201 },
+  { 197,185,201 },
+};
+static const int initValue_part_mode[9] = { 184,154,139, 154,154,154, 139,154,154 };
+static const int initValue_prev_intra_luma_pred_flag[3] = { 184,154,183 };
+static const int initValue_intra_chroma_pred_mode[3] = { 63,152,152 };
+static const int initValue_cbf_luma[4] = { 111,141,153,111 };
+static const int initValue_cbf_chroma[12] = { 94,138,182,154,149,107,167,154,149,92,167,154 };
+static const int initValue_split_transform_flag[9] = { 153,138,138, 124,138,94, 224,167,122 }; // FIX712
+static const int initValue_last_significant_coefficient_prefix[54] = {
+    110,110,124,125,140,153,125,127,140,109,111,143,127,111, 79,108,123, 63,
+    125,110, 94,110, 95, 79,125,111,110, 78,110,111,111, 95, 94,108,123,108,
+    125,110,124,110, 95, 94,125,111,111, 79,125,126,111,111, 79,108,123, 93
+  };
+static const int initValue_coded_sub_block_flag[12] = { 91,171,134,141,121,140,61,154,121,140,61,154 };
+static const int initValue_significant_coeff_flag[3][42] = {
+    {
+      111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,
+      125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,
+      136,  152,  136,  153,  136,  139,  111,  136,  139,  111
+    },
+    {
+      155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,
+      183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,
+      121,  107,  121,  167,  151,  183,  140,  151,  183,  140,
+    },
+    {
+      170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,
+      183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,
+      121,  122,  121,  167,  151,  183,  140,  151,  183,  140
+    },
+  };
+static const int initValue_significant_coeff_flag_skipmode[3][2] = {
+  { 141,111 }, { 140,140 }, { 140,140 }
+};
+
+static const int initValue_coeff_abs_level_greater1_flag[72] = {
+    140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,
+    140,179,166,182,140,227,122,197,154,196,196,167,154,152,167,182,
+    182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182,
+    154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,
+    169,208,166,167,154,152,167,182
+  };
+static const int initValue_coeff_abs_level_greater2_flag[18] = {
+    138,153,136,167,152,152,107,167, 91,122,107,167,
+    107,167, 91,107,107,167
+  };
+static const int initValue_sao_merge_leftUp_flag[3] = { 153,153,153 };
+static const int initValue_sao_type_idx_lumaChroma_flag[3] = { 200,185,160 };
+static const int initValue_cu_qp_delta_abs[2] = { 154,154 };
+static const int initValue_transform_skip_flag[2] = { 139,139 };
+static const int initValue_merge_flag[2] = { 110,154 };
+static const int initValue_merge_idx[2] = { 122,137 };
+static const int initValue_pred_mode_flag[2] = { 149,134 };
+static const int initValue_abs_mvd_greater01_flag[4] = { 140,198,169,198 };
+static const int initValue_mvp_lx_flag[1] = { 168 };
+static const int initValue_rqt_root_cbf[1] = { 79 };
+static const int initValue_ref_idx_lX[2] = { 153,153 };
+static const int initValue_inter_pred_idc[5] = { 95,79,63,31,31 };
+static const int initValue_cu_transquant_bypass_flag[3] = { 154,154,154 };
+
+
+static void init_context(int SliceQPY,
+                         context_model* model,
+                         const int* initValues, int len)
+{
+  for (int i=0;i<len;i++)
+    {
+      set_initValue(SliceQPY, &model[i], initValues[i], 1);
+    }
+}
+
+
+static void init_context_const(int SliceQPY, // forwards all four arguments unchanged to set_initValue()
+                               context_model* model, // first model to set up
+                               int initValue, int len) // one shared init value; len is passed on as the model count
+{
+  set_initValue(SliceQPY, model, initValue, len); // presumably applies initValue to `len` consecutive models -- confirm in set_initValue
+}
+
+void initialize_CABAC_models(context_model context_model_table[CONTEXT_MODEL_TABLE_LENGTH],
+                      int initType, // selects the init-value group; tables hold 3 groups -- NOTE(review): not range-checked here
+                      int QPY)      // passed as SliceQPY to every init_context / init_context_const call
+{
+  context_model* cm = context_model_table; // just an abbreviation
+
+  if (initType > 0) { // these models are only set up for initType 1 and 2; several tables are indexed by initType-1
+    init_context(QPY, cm+CONTEXT_MODEL_CU_SKIP_FLAG,    initValue_cu_skip_flag[initType-1],  3);
+    init_context(QPY, cm+CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1);
+    init_context(QPY, cm+CONTEXT_MODEL_MERGE_FLAG,             &initValue_merge_flag[initType-1],1);
+    init_context(QPY, cm+CONTEXT_MODEL_MERGE_IDX,              &initValue_merge_idx[initType-1], 1);
+    init_context(QPY, cm+CONTEXT_MODEL_INTER_PRED_IDC,         initValue_inter_pred_idc,         5);
+    init_context(QPY, cm+CONTEXT_MODEL_REF_IDX_LX,             initValue_ref_idx_lX,             2);
+    init_context(QPY, cm+CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 0 : 2], 2);
+    init_context(QPY, cm+CONTEXT_MODEL_MVP_LX_FLAG,            initValue_mvp_lx_flag,            1);
+    init_context(QPY, cm+CONTEXT_MODEL_RQT_ROOT_CBF,           initValue_rqt_root_cbf,           1);
+
+    init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_FLAG, 139, 2); // constant init value instead of a table
+    init_context_const(QPY, cm+CONTEXT_MODEL_RDPCM_DIR,  139, 2);
+  }
+
+  init_context(QPY, cm+CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3); // from here on: set up for every initType
+  init_context(QPY, cm+CONTEXT_MODEL_PART_MODE,     &initValue_part_mode[(initType!=2 ? initType : 5)], 4); // initType 2 starts at table offset 5
+  init_context(QPY, cm+CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1);
+  init_context(QPY, cm+CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE,    &initValue_intra_chroma_pred_mode[initType],    1);
+  init_context(QPY, cm+CONTEXT_MODEL_CBF_LUMA,                  &initValue_cbf_luma[initType == 0 ? 0 : 2],     2);
+  init_context(QPY, cm+CONTEXT_MODEL_CBF_CHROMA,                &initValue_cbf_chroma[initType * 4],            4);
+  init_context(QPY, cm+CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG,      &initValue_split_transform_flag[initType * 3],  3);
+  init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
+  init_context(QPY, cm+CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18); // Y prefix reads the same table as X prefix
+  init_context(QPY, cm+CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG,                  &initValue_coded_sub_block_flag[initType * 4],        4);
+  init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG,              initValue_significant_coeff_flag[initType],    42);
+  init_context(QPY, cm+CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG+42, initValue_significant_coeff_flag_skipmode[initType], 2); // 2 extra models stored right after the first 42
+
+  init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG,       &initValue_coeff_abs_level_greater1_flag[initType * 24], 24);
+  init_context(QPY, cm+CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG,       &initValue_coeff_abs_level_greater2_flag[initType *  6],  6);
+  init_context(QPY, cm+CONTEXT_MODEL_SAO_MERGE_FLAG,                      &initValue_sao_merge_leftUp_flag[initType],    1);
+  init_context(QPY, cm+CONTEXT_MODEL_SAO_TYPE_IDX,                        &initValue_sao_type_idx_lumaChroma_flag[initType], 1);
+  init_context(QPY, cm+CONTEXT_MODEL_CU_QP_DELTA_ABS,        initValue_cu_qp_delta_abs,        2); // not indexed by initType
+  init_context(QPY, cm+CONTEXT_MODEL_TRANSFORM_SKIP_FLAG,    initValue_transform_skip_flag,    2); // not indexed by initType
+  init_context(QPY, cm+CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1);
+
+  init_context_const(QPY, cm+CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1, 154, 8); // the remaining models all take the constant 154
+  init_context_const(QPY, cm+CONTEXT_MODEL_RES_SCALE_SIGN_FLAG,      154, 2);
+  init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG, 154, 1);
+  init_context_const(QPY, cm+CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX,  154, 1);
+}
diff --git a/libde265/contextmodel.h b/libde265/contextmodel.h
new file mode 100644
index 0000000..cde83e1
--- /dev/null
+++ b/libde265/contextmodel.h
@@ -0,0 +1,130 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *          Min Chen <chenm003 at 163.com>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_CONTEXTMODEL_H
+#define DE265_CONTEXTMODEL_H
+
+#include "libde265/cabac.h"
+#include "libde265/de265.h"
+
+#include <string.h>
+#include <string>
+
+
+struct context_model { // one context model, packed into a single byte of bit-fields
+  uint8_t MPSbit : 1; // 1-bit field
+  uint8_t state  : 7; // 7-bit field
+
+  bool operator==(context_model b) const { return MPSbit==b.MPSbit && state==b.state; }
+  bool operator!=(context_model b) const { return !(*this == b); } // exact negation of operator==
+};
+
+
+enum context_model_index { // each entry is the index of a syntax element's first model; the "+ N" of the following entry reserves N models for it
+  // SAO
+  CONTEXT_MODEL_SAO_MERGE_FLAG = 0,
+  CONTEXT_MODEL_SAO_TYPE_IDX   = CONTEXT_MODEL_SAO_MERGE_FLAG +1,
+
+  // CB-tree
+  CONTEXT_MODEL_SPLIT_CU_FLAG  = CONTEXT_MODEL_SAO_TYPE_IDX + 1,
+  CONTEXT_MODEL_CU_SKIP_FLAG   = CONTEXT_MODEL_SPLIT_CU_FLAG + 3,
+
+  // intra-prediction
+  CONTEXT_MODEL_PART_MODE      = CONTEXT_MODEL_CU_SKIP_FLAG + 3,
+  CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG = CONTEXT_MODEL_PART_MODE + 4,
+  CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE    = CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG + 1,
+
+  // transform-tree
+  CONTEXT_MODEL_CBF_LUMA                  = CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE + 1,
+  CONTEXT_MODEL_CBF_CHROMA                = CONTEXT_MODEL_CBF_LUMA + 2,
+  CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG      = CONTEXT_MODEL_CBF_CHROMA + 4,
+  CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG  = CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + 3,
+  CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX   = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG + 1,
+
+  // residual
+  CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX = CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX + 1,
+  CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX + 18,
+  CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG          = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX + 18,
+  CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG        = CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + 4,
+  CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG = CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + 42+2, // 42 models plus the 2 initialized separately at offset +42
+  CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + 24,
+
+  CONTEXT_MODEL_CU_QP_DELTA_ABS        = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + 6,
+  CONTEXT_MODEL_TRANSFORM_SKIP_FLAG    = CONTEXT_MODEL_CU_QP_DELTA_ABS + 2,
+  CONTEXT_MODEL_RDPCM_FLAG             = CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + 2,
+  CONTEXT_MODEL_RDPCM_DIR              = CONTEXT_MODEL_RDPCM_FLAG + 2,
+
+  // motion
+  CONTEXT_MODEL_MERGE_FLAG             = CONTEXT_MODEL_RDPCM_DIR + 2,
+  CONTEXT_MODEL_MERGE_IDX              = CONTEXT_MODEL_MERGE_FLAG + 1,
+  CONTEXT_MODEL_PRED_MODE_FLAG         = CONTEXT_MODEL_MERGE_IDX + 1,
+  CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG = CONTEXT_MODEL_PRED_MODE_FLAG + 1,
+  CONTEXT_MODEL_MVP_LX_FLAG            = CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 2,
+  CONTEXT_MODEL_RQT_ROOT_CBF           = CONTEXT_MODEL_MVP_LX_FLAG + 1,
+  CONTEXT_MODEL_REF_IDX_LX             = CONTEXT_MODEL_RQT_ROOT_CBF + 1,
+  CONTEXT_MODEL_INTER_PRED_IDC         = CONTEXT_MODEL_REF_IDX_LX + 2,
+  CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG = CONTEXT_MODEL_INTER_PRED_IDC + 5,
+  CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 = CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG + 1,
+  CONTEXT_MODEL_RES_SCALE_SIGN_FLAG      = CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1 + 8,
+  CONTEXT_MODEL_TABLE_LENGTH           = CONTEXT_MODEL_RES_SCALE_SIGN_FLAG + 2 // total model count; array length in initialize_CABAC_models()
+};
+
+
+
+void initialize_CABAC_models(context_model context_model_table[CONTEXT_MODEL_TABLE_LENGTH],
+                             int initType,
+                             int QPY);
+
+
+class context_model_table // owns or shares a context_model array through the model/refcnt pointer pair below
+{
+ public:
+  context_model_table();
+  context_model_table(const context_model_table&);
+  ~context_model_table();
+
+  void init(int initType, int QPY);
+  void release();
+  void decouple();
+  context_model_table transfer();
+  context_model_table copy() const { context_model_table t=*this; t.decouple(); return t; } // copy-construct, then decouple() the copy before returning it
+
+  bool empty() const { return refcnt != NULL; } // NOTE(review): true when refcnt is set -- looks inverted relative to the name; confirm against callers
+
+  context_model& operator[](int i) { return model[i]; } // direct array access, no bounds or NULL check
+
+  context_model_table& operator=(const context_model_table&);
+
+  bool operator==(const context_model_table&) const;
+
+  std::string debug_dump() const;
+
+ private:
+  void decouple_or_alloc_with_empty_data();
+
+  context_model* model; // [CONTEXT_MODEL_TABLE_LENGTH]
+  int* refcnt; // presumably a reference count shared between copies -- confirm in contextmodel.cc
+};
+
+
+#endif
diff --git a/libde265/de265-version.h b/libde265/de265-version.h
index 564bfb5..57660b0 100644
--- a/libde265/de265-version.h
+++ b/libde265/de265-version.h
@@ -28,9 +28,9 @@
 #define LIBDE265_VERSION_H
 
 /* Numeric representation of the version */
-#define LIBDE265_NUMERIC_VERSION 0x00090000
+#define LIBDE265_NUMERIC_VERSION 0x01000200
 
 /* Version string */
-#define LIBDE265_VERSION "0.9"
+#define LIBDE265_VERSION "1.0.2"
 
 #endif
diff --git a/libde265/de265.cc b/libde265/de265.cc
index 7f67f59..ca31876 100644
--- a/libde265/de265.cc
+++ b/libde265/de265.cc
@@ -47,13 +47,29 @@ LIBDE265_API uint32_t de265_get_version_number(void)
     return (LIBDE265_NUMERIC_VERSION);
 }
 
+LIBDE265_API int de265_get_version_number_major(void)
+{ // major version: the byte held in bits 24..31 of LIBDE265_NUMERIC_VERSION
+  return static_cast<int>(((LIBDE265_NUMERIC_VERSION) >> 24) & 0xFF);
+}
+
+LIBDE265_API int de265_get_version_number_minor(void)
+{ // minor version: the byte held in bits 16..23 of LIBDE265_NUMERIC_VERSION
+  return static_cast<int>(((LIBDE265_NUMERIC_VERSION) >> 16) & 0xFF);
+}
+
+LIBDE265_API int de265_get_version_number_maintenance(void)
+{ // maintenance version: the byte held in bits 8..15 of LIBDE265_NUMERIC_VERSION
+  return static_cast<int>(((LIBDE265_NUMERIC_VERSION) >> 8) & 0xFF);
+}
+
+
 LIBDE265_API const char* de265_get_error_text(de265_error err)
 {
   switch (err) {
   case DE265_OK: return "no error";
   case DE265_ERROR_NO_SUCH_FILE: return "no such file";
     //case DE265_ERROR_NO_STARTCODE: return "no startcode found";
-  case DE265_ERROR_EOF: return "end of file";
+    //case DE265_ERROR_EOF: return "end of file";
   case DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS: return "coefficient out of image bounds";
   case DE265_ERROR_CHECKSUM_MISMATCH: return "image checksum mismatch";
   case DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA: return "CTB outside of image area";
@@ -64,16 +80,27 @@ LIBDE265_API const char* de265_get_error_text(de265_error err)
   case DE265_ERROR_LIBRARY_INITIALIZATION_FAILED: return "global library initialization failed";
   case DE265_ERROR_LIBRARY_NOT_INITIALIZED: return "cannot free library data (not initialized";
 
-  case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED:
-    return "internal error: maximum number of thread contexts exceeded";
-  case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED:
-    return "internal error: maximum number of slices exceeded";
+  //case DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED:
+  //  return "internal error: maximum number of thread contexts exceeded";
+  //case DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED:
+  //  return "internal error: maximum number of slices exceeded";
+  case DE265_ERROR_NOT_IMPLEMENTED_YET:
+    return "unimplemented decoder feature";
     //case DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED:
     //return "scaling list not implemented";
+
   case DE265_ERROR_WAITING_FOR_INPUT_DATA:
     return "no more input data, decoder stalled";
   case DE265_ERROR_CANNOT_PROCESS_SEI:
     return "SEI data cannot be processed";
+  case DE265_ERROR_PARAMETER_PARSING:
+    return "command-line parameter error";
+  case DE265_ERROR_NO_INITIAL_SLICE_HEADER:
+    return "first slice missing, cannot decode dependent slice";
+  case DE265_ERROR_PREMATURE_END_OF_SLICE:
+    return "premature end of slice data";
+  case DE265_ERROR_UNSPECIFIED_DECODING_ERROR:
+    return "unspecified decoding error";
 
   case DE265_WARNING_NO_WPP_CANNOT_USE_MULTITHREADING:
     return "Cannot run decoder multi-threaded because stream does not support WPP";
@@ -127,6 +154,8 @@ LIBDE265_API const char* de265_get_error_text(de265_error err)
     return "cannot apply SAO because we ran out of memory";
   case DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI:
     return "SPS header missing, cannot decode SEI";
+  case DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA:
+    return "collocated motion-vector is outside image area";
 
   default: return "unknown error";
   }
@@ -587,6 +616,19 @@ LIBDE265_API int de265_get_image_height(const struct de265_image* img,int channe
   }
 }
 
+LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image* img,int channel)
+{
+  // channel 0 reports sps.BitDepth_Y, channels 1 and 2 report sps.BitDepth_C
+  if (channel == 0) {
+    return img->sps.BitDepth_Y;
+  }
+  if (channel == 1 || channel == 2) {
+    return img->sps.BitDepth_C;
+  }
+
+  return 0; // any other channel index
+}
+
 LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image* img)
 {
   return img->get_chroma_format();
@@ -598,7 +640,7 @@ LIBDE265_API const uint8_t* de265_get_image_plane(const de265_image* img, int ch
 
   uint8_t* data = img->pixels_confwin[channel];
 
-  if (stride) *stride = img->get_image_stride(channel);
+  if (stride) *stride = img->get_image_stride(channel) * ((de265_get_bits_per_pixel(img, channel)+7) / 8);
 
   return data;
 }
@@ -612,6 +654,8 @@ LIBDE265_API void *de265_get_image_plane_user_data(const struct de265_image* img
 
 LIBDE265_API void de265_set_image_plane(de265_image* img, int cIdx, void* mem, int stride, void *userdata)
 {
+  const int bytesPerPixel = (de265_get_bits_per_pixel(img, cIdx)+7) / 8; // 0 when no bit depth is reported (e.g. unknown cIdx)
+  if (bytesPerPixel > 0) { stride /= bytesPerPixel; } // internal "stride" is pixels per line; never divide by zero
   img->set_image_plane(cIdx, (uint8_t*)mem, stride, userdata);
 }
 
@@ -656,4 +700,3 @@ LIBDE265_API void de265_get_image_NAL_header(const struct de265_image* img,
   if (nuh_temporal_id) *nuh_temporal_id = img->nal_hdr.nuh_temporal_id;
 }
 }
-
diff --git a/libde265/de265.h b/libde265/de265.h
index e97dadd..6481d8f 100644
--- a/libde265/de265.h
+++ b/libde265/de265.h
@@ -72,6 +72,10 @@ extern "C" {
 LIBDE265_API const char *de265_get_version(void);
 LIBDE265_API uint32_t de265_get_version_number(void);
 
+LIBDE265_API int de265_get_version_number_major(void);
+LIBDE265_API int de265_get_version_number_minor(void);
+LIBDE265_API int de265_get_version_number_maintenance(void);
+
 
 /* === error codes === */
 
@@ -79,7 +83,7 @@ typedef enum {
   DE265_OK = 0,
   DE265_ERROR_NO_SUCH_FILE=1,
   //DE265_ERROR_NO_STARTCODE=2,  obsolet
-  DE265_ERROR_EOF=3,
+  //DE265_ERROR_EOF=3,
   DE265_ERROR_COEFFICIENT_OUT_OF_IMAGE_BOUNDS=4,
   DE265_ERROR_CHECKSUM_MISMATCH=5,
   DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA=6,
@@ -91,11 +95,16 @@ typedef enum {
   DE265_ERROR_LIBRARY_NOT_INITIALIZED=12,
   DE265_ERROR_WAITING_FOR_INPUT_DATA=13,
   DE265_ERROR_CANNOT_PROCESS_SEI=14,
+  DE265_ERROR_PARAMETER_PARSING=15,
+  DE265_ERROR_NO_INITIAL_SLICE_HEADER=16,
+  DE265_ERROR_PREMATURE_END_OF_SLICE=17,
+  DE265_ERROR_UNSPECIFIED_DECODING_ERROR=18,
 
   // --- errors that should become obsolete in later libde265 versions ---
 
-  DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED = 500,
-  DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED = 501,
+  //DE265_ERROR_MAX_THREAD_CONTEXTS_EXCEEDED = 500, obsolet
+  //DE265_ERROR_MAX_NUMBER_OF_SLICES_EXCEEDED = 501, obsolet
+  DE265_ERROR_NOT_IMPLEMENTED_YET = 502,
   //DE265_ERROR_SCALING_LIST_NOT_IMPLEMENTED = 502, obsolet
 
   // --- warnings ---
@@ -125,7 +134,8 @@ typedef enum {
   DE265_WARNING_NUMBER_OF_THREADS_LIMITED_TO_MAXIMUM=1022,
   DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER=1023,
   DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY=1024,
-  DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025
+  DE265_WARNING_SPS_MISSING_CANNOT_DECODE_SEI=1025,
+  DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA=1026
 } de265_error;
 
 LIBDE265_API const char* de265_get_error_text(de265_error err);
@@ -148,7 +158,7 @@ struct de265_image;
 
 enum de265_chroma {
   de265_chroma_mono=0,
-  de265_chroma_420=1,  // currently the only used format
+  de265_chroma_420=1,
   de265_chroma_422=2,
   de265_chroma_444=3
 };
@@ -159,6 +169,8 @@ typedef int64_t de265_PTS;
 LIBDE265_API int de265_get_image_width(const struct de265_image*,int channel);
 LIBDE265_API int de265_get_image_height(const struct de265_image*,int channel);
 LIBDE265_API enum de265_chroma de265_get_chroma_format(const struct de265_image*);
+LIBDE265_API int de265_get_bits_per_pixel(const struct de265_image*,int channel);
+/* The |out_stride| is returned as "bytes per line" if a non-NULL parameter is given. */
 LIBDE265_API const uint8_t* de265_get_image_plane(const struct de265_image*, int channel, int* out_stride);
 LIBDE265_API void* de265_get_image_plane_user_data(const struct de265_image*, int channel);
 LIBDE265_API de265_PTS de265_get_image_PTS(const struct de265_image*);
@@ -312,11 +324,11 @@ struct de265_image_spec
 
 struct de265_image_allocation
 {
-  int  (*get_buffer)(de265_decoder_context* ctx,
+  int  (*get_buffer)(de265_decoder_context* ctx, // first parameter deprecated
                      struct de265_image_spec* spec,
                      struct de265_image* img,
                      void* userdata);
-  void (*release_buffer)(de265_decoder_context* ctx,
+  void (*release_buffer)(de265_decoder_context* ctx, // first parameter deprecated
                          struct de265_image* img,
                          void* userdata);
 };
@@ -384,6 +396,8 @@ enum de265_acceleration {
   de265_acceleration_SSE4 = 40,
   de265_acceleration_AVX  = 50,    // not implemented yet
   de265_acceleration_AVX2 = 60,    // not implemented yet
+  de265_acceleration_ARM  = 70,
+  de265_acceleration_NEON = 80,
   de265_acceleration_AUTO = 10000
 };
 
diff --git a/libde265/deblock.cc b/libde265/deblock.cc
index 852a8e2..23b1741 100644
--- a/libde265/deblock.cc
+++ b/libde265/deblock.cc
@@ -160,6 +160,11 @@ bool derive_edgeFlags_CTBRow(de265_image* img, int ctby)
         int x0ctb = x0 >> ctbshift;
         int y0ctb = y0 >> ctbshift;
 
+        // check for corrupted streams
+        if (img->is_SliceHeader_available(x0,y0)==false) {
+          return false;
+        }
+
         // check whether we should filter this slice
 
         slice_segment_header* shdr = img->get_SliceHeader(x0,y0);
@@ -175,6 +180,7 @@ bool derive_edgeFlags_CTBRow(de265_image* img, int ctby)
 
         if (x0 && ((x0 & ctb_mask) == 0)) { // left edge at CTB boundary
           if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
+              img->is_SliceHeader_available(x0-1,y0) && // for corrupted streams
               shdr->SliceAddrRS != img->get_SliceHeader(x0-1,y0)->SliceAddrRS)
             {
               filterLeftCbEdge = 0;
@@ -188,6 +194,7 @@ bool derive_edgeFlags_CTBRow(de265_image* img, int ctby)
 
         if (y0 && ((y0 & ctb_mask) == 0)) { // top edge at CTB boundary
           if (shdr->slice_loop_filter_across_slices_enabled_flag == 0 &&
+              img->is_SliceHeader_available(x0,y0-1) && // for corrupted streams
               shdr->SliceAddrRS != img->get_SliceHeader(x0,y0-1)->SliceAddrRS)
             {
               filterTopCbEdge = 0;
@@ -281,8 +288,8 @@ void derive_boundaryStrength(de265_image* img, bool vertical, int yStart,int yEn
 
             bS = 0;
 
-            const PredVectorInfo* mviP = img->get_mv_info(xDiOpp,yDiOpp);
-            const PredVectorInfo* mviQ = img->get_mv_info(xDi   ,yDi);
+            const MotionVectorSpec* mviP = img->get_mv_info(xDiOpp,yDiOpp);
+            const MotionVectorSpec* mviQ = img->get_mv_info(xDi   ,yDi);
 
             slice_segment_header* shdrP = img->get_SliceHeader(xDiOpp,yDiOpp);
             slice_segment_header* shdrQ = img->get_SliceHeader(xDi   ,yDi);
@@ -393,9 +400,12 @@ static uint8_t table_8_23_tc[54] = {
 
 
 // 8.7.2.4
-void edge_filtering_luma(de265_image* img, bool vertical,
-                         int yStart,int yEnd, int xStart,int xEnd)
+template <class pixel_t>
+void edge_filtering_luma_internal(de265_image* img, bool vertical,
+                                  int yStart,int yEnd, int xStart,int xEnd)
 {
+  //printf("luma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd);
+
   int xIncr = vertical ? 2 : 1;
   int yIncr = vertical ? 1 : 2;
 
@@ -408,10 +418,14 @@ void edge_filtering_luma(de265_image* img, bool vertical,
 
   for (int y=yStart;y<yEnd;y+=yIncr)
     for (int x=xStart;x<xEnd;x+=xIncr) {
-      int xDi = x<<2;
-      int yDi = y<<2;
+      // x;y in deblocking units (4x4 pixels)
+
+      int xDi = x<<2; // *4 -> pixel resolution
+      int yDi = y<<2; // *4 -> pixel resolution
       int bS = img->get_deblk_bS(xDi,yDi);
 
+      //printf("x,y:%d,%d  xDi,yDi:%d,%d\n",x,y,xDi,yDi);
+
       logtrace(LogDeblock,"deblock POC=%d %c --- x:%d y:%d bS:%d---\n",
                img->PicOrderCntVal,vertical ? 'V':'H',xDi,yDi,bS);
 
@@ -450,9 +464,9 @@ void edge_filtering_luma(de265_image* img, bool vertical,
 
         // 8.7.2.4.3
 
-        uint8_t* ptr = img->get_image_plane_at_pos(0, xDi,yDi);
+        pixel_t* ptr = img->get_image_plane_at_pos_NEW<pixel_t>(0, xDi,yDi);
 
-        uint8_t q[4][4], p[4][4];
+        pixel_t q[4][4], p[4][4];
         for (int k=0;k<4;k++)
           for (int i=0;i<4;i++)
             {
@@ -577,21 +591,21 @@ void edge_filtering_luma(de265_image* img, bool vertical,
 
             logtrace(LogDeblock,"line:%d\n",k);
 
-            const uint8_t p0 = p[k][0];
-            const uint8_t p1 = p[k][1];
-            const uint8_t p2 = p[k][2];
-            const uint8_t p3 = p[k][3];
-            const uint8_t q0 = q[k][0];
-            const uint8_t q1 = q[k][1];
-            const uint8_t q2 = q[k][2];
-            const uint8_t q3 = q[k][3];
+            const pixel_t p0 = p[k][0];
+            const pixel_t p1 = p[k][1];
+            const pixel_t p2 = p[k][2];
+            const pixel_t p3 = p[k][3];
+            const pixel_t q0 = q[k][0];
+            const pixel_t q1 = q[k][1];
+            const pixel_t q2 = q[k][2];
+            const pixel_t q3 = q[k][3];
 
             if (dE==2) {
               // strong filtering
 
               //nDp=nDq=3;
 
-              uint8_t pnew[3],qnew[3];
+              pixel_t pnew[3],qnew[3];
               pnew[0] = Clip3(p0-2*tc,p0+2*tc, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 +4)>>3);
               pnew[1] = Clip3(p1-2*tc,p1+2*tc, (p2 + p1 + p0 + q0+2)>>2);
               pnew[2] = Clip3(p2-2*tc,p2+2*tc, (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
@@ -629,18 +643,18 @@ void edge_filtering_luma(de265_image* img, bool vertical,
                 delta = Clip3(-tc,tc,delta);
                 logtrace(LogDeblock," deblk + %d;%d [%02x->%02x]  - %d;%d [%02x->%02x] delta:%d\n",
                          vertical ? xDi-1 : xDi+k,
-                         vertical ? yDi+k : yDi-1, p0,Clip1_8bit(p0+delta),
+                         vertical ? yDi+k : yDi-1, p0,Clip_BitDepth(p0+delta, bitDepth_Y),
                          vertical ? xDi   : xDi+k,
-                         vertical ? yDi+k : yDi, q0,Clip1_8bit(q0-delta),
+                         vertical ? yDi+k : yDi,   q0,Clip_BitDepth(q0-delta, bitDepth_Y),
                          delta);
 
                 if (vertical) {
-                  if (filterP) { ptr[-0-1+k*stride] = Clip1_8bit(p0+delta); }
-                  if (filterQ) { ptr[ 0  +k*stride] = Clip1_8bit(q0-delta); }
+                  if (filterP) { ptr[-0-1+k*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); }
+                  if (filterQ) { ptr[ 0  +k*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); }
                 }
                 else {
-                  if (filterP) { ptr[ k -1*stride] = Clip1_8bit(p0+delta); }
-                  if (filterQ) { ptr[ k +0*stride] = Clip1_8bit(q0-delta); }
+                  if (filterP) { ptr[ k -1*stride] = Clip_BitDepth(p0+delta, bitDepth_Y); }
+                  if (filterQ) { ptr[ k +0*stride] = Clip_BitDepth(q0-delta, bitDepth_Y); }
                 }
 
                 //ptr[ 0+k*stride] = 200;
@@ -653,8 +667,8 @@ void edge_filtering_luma(de265_image* img, bool vertical,
                            vertical ? yDi+k : yDi-2,
                            delta_p);
 
-                  if (vertical) { ptr[-1-1+k*stride] = Clip1_8bit(p1+delta_p); }
-                  else          { ptr[ k  -2*stride] = Clip1_8bit(p1+delta_p); }
+                  if (vertical) { ptr[-1-1+k*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); }
+                  else          { ptr[ k  -2*stride] = Clip_BitDepth(p1+delta_p, bitDepth_Y); }
                 }
 
                 if (dEq==1 && filterQ) {
@@ -665,8 +679,8 @@ void edge_filtering_luma(de265_image* img, bool vertical,
                            vertical ? yDi+k : yDi+1,
                            delta_q);
 
-                  if (vertical) { ptr[ 1  +k*stride] = Clip1_8bit(q1+delta_q); }
-                  else          { ptr[ k  +1*stride] = Clip1_8bit(q1+delta_q); }
+                  if (vertical) { ptr[ 1  +k*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); }
+                  else          { ptr[ k  +1*stride] = Clip_BitDepth(q1+delta_q, bitDepth_Y); }
                 }
 
                 //nDp = dEp+1;
@@ -682,6 +696,17 @@ void edge_filtering_luma(de265_image* img, bool vertical,
 }
 
 
+// Runs the luma edge filter with the pixel type chosen by img->high_bit_depth(0).
+void edge_filtering_luma(de265_image* img, bool vertical,
+                         int yStart,int yEnd, int xStart,int xEnd)
+{
+  if (!img->high_bit_depth(0)) { // uint8_t pixels
+    edge_filtering_luma_internal<uint8_t>(img,vertical,yStart,yEnd,xStart,xEnd);
+    return;
+  }
+  edge_filtering_luma_internal<uint16_t>(img,vertical,yStart,yEnd,xStart,xEnd); // uint16_t pixels
+}
+
 void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
 {
   int ctbSize = img->sps.CtbSizeY;
@@ -696,22 +721,39 @@ void edge_filtering_luma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
 
 
 // 8.7.2.4
-void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
-                           int xStart,int xEnd)
+/** ?Start and ?End values in 4-luma pixels resolution.
+ */
+template <class pixel_t>
+void edge_filtering_chroma_internal(de265_image* img, bool vertical,
+                                    int yStart,int yEnd,
+                                    int xStart,int xEnd)
 {
-  int xIncr = vertical ? 4 : 2;
-  int yIncr = vertical ? 2 : 4;
+  //printf("chroma %d-%d %d-%d\n",xStart,xEnd,yStart,yEnd);
+
+  const int SubWidthC  = img->sps.SubWidthC;
+  const int SubHeightC = img->sps.SubHeightC;
+
+  int xIncr = vertical ? 2 : 1;
+  int yIncr = vertical ? 1 : 2;
+
+  xIncr *= SubWidthC;
+  yIncr *= SubHeightC;
 
   const int stride = img->get_image_stride(1);
 
   xEnd = libde265_min(xEnd,img->get_deblk_width());
   yEnd = libde265_min(yEnd,img->get_deblk_height());
 
+  int bitDepth_C = img->sps.BitDepth_C;
+
   for (int y=yStart;y<yEnd;y+=yIncr)
     for (int x=xStart;x<xEnd;x+=xIncr) {
-      int xDi = x*2;
-      int yDi = y*2;
-      int bS = img->get_deblk_bS(2*xDi,2*yDi);
+      int xDi = x << (3-SubWidthC);
+      int yDi = y << (3-SubHeightC);
+
+      //printf("x,y:%d,%d  xDi,yDi:%d,%d\n",x,y,xDi,yDi);
+
+      int bS = img->get_deblk_bS(xDi*SubWidthC,yDi*SubHeightC);
 
       if (bS>1) {
         // 8.7.2.4.5
@@ -721,10 +763,10 @@ void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
                               img->pps.pic_cb_qp_offset :
                               img->pps.pic_cr_qp_offset);
 
-          uint8_t* ptr = img->get_image_plane_at_pos(cplane+1, xDi,yDi);
+          pixel_t* ptr = img->get_image_plane_at_pos_NEW<pixel_t>(cplane+1, xDi,yDi);
 
-          uint8_t p[2][4];
-          uint8_t q[2][4];
+          pixel_t p[2][4];
+          pixel_t q[2][4];
 
           logtrace(LogDeblock,"-%s- %d %d\n",cplane==0 ? "Cb" : "Cr",xDi,yDi);
 
@@ -759,18 +801,24 @@ void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
             }
 #endif
 
-          int QP_Q = img->get_QPY(2*xDi,2*yDi);
+          int QP_Q = img->get_QPY(SubWidthC*xDi,SubHeightC*yDi);
           int QP_P = (vertical ?
-                      img->get_QPY(2*xDi-1,2*yDi) :
-                      img->get_QPY(2*xDi,2*yDi-1));
+                      img->get_QPY(SubWidthC*xDi-1,SubHeightC*yDi) :
+                      img->get_QPY(SubWidthC*xDi,SubHeightC*yDi-1));
           int qP_i = ((QP_Q+QP_P+1)>>1) + cQpPicOffset;
-          int QP_C = table8_22(qP_i);
+          int QP_C;
+          if (img->sps.ChromaArrayType == CHROMA_420) {
+            QP_C = table8_22(qP_i);
+          } else {
+            QP_C = libde265_min(qP_i, 51);
+          }
+
 
           //printf("POC=%d\n",ctx->img->PicOrderCntVal);
           logtrace(LogDeblock,"%d %d: ((%d+%d+1)>>1) + %d = qP_i=%d  (QP_C=%d)\n",
-                   2*xDi,2*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C);
-          
-          int sliceIndexQ00 = img->get_SliceHeaderIndex(2*xDi,2*yDi);
+                   SubWidthC*xDi,SubHeightC*yDi, QP_Q,QP_P,cQpPicOffset,qP_i,QP_C);
+
+          int sliceIndexQ00 = img->get_SliceHeaderIndex(SubWidthC*xDi,SubHeightC*yDi);
           int tc_offset   = img->slices[sliceIndexQ00]->slice_tc_offset;
 
           int Q = Clip3(0,53, QP_C + 2*(bS-1) + tc_offset);
@@ -782,34 +830,34 @@ void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
 
           if (vertical) {
             bool filterP = true;
-            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi-1,2*yDi)) filterP=false;
-            if (img->get_cu_transquant_bypass(2*xDi-1,2*yDi)) filterP=false;
+            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false;
+            if (img->get_cu_transquant_bypass(SubWidthC*xDi-1,SubHeightC*yDi)) filterP=false;
 
             bool filterQ = true;
-            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi)) filterQ=false;
-            if (img->get_cu_transquant_bypass(2*xDi,2*yDi)) filterQ=false;
+            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false;
+            if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false;
 
 
             for (int k=0;k<4;k++) {
               int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
               logtrace(LogDeblock,"delta=%d\n",delta);
-              if (filterP) { ptr[-1+k*stride] = Clip1_8bit(p[0][k]+delta); }
-              if (filterQ) { ptr[ 0+k*stride] = Clip1_8bit(q[0][k]-delta); }
+              if (filterP) { ptr[-1+k*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); }
+              if (filterQ) { ptr[ 0+k*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); }
             }
           }
           else {
             bool filterP = true;
-            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi-1)) filterP=false;
-            if (img->get_cu_transquant_bypass(2*xDi,2*yDi-1)) filterP=false;
+            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false;
+            if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi-1)) filterP=false;
 
             bool filterQ = true;
-            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(2*xDi,2*yDi)) filterQ=false;
-            if (img->get_cu_transquant_bypass(2*xDi,2*yDi)) filterQ=false;
+            if (img->sps.pcm_loop_filter_disable_flag && img->get_pcm_flag(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false;
+            if (img->get_cu_transquant_bypass(SubWidthC*xDi,SubHeightC*yDi)) filterQ=false;
 
             for (int k=0;k<4;k++) {
               int delta = Clip3(-tc,tc, ((((q[0][k]-p[0][k])<<2)+p[1][k]-q[1][k]+4)>>3));
-              if (filterP) { ptr[ k-1*stride] = Clip1_8bit(p[0][k]+delta); }
-              if (filterQ) { ptr[ k+0*stride] = Clip1_8bit(q[0][k]-delta); }
+              if (filterP) { ptr[ k-1*stride] = Clip_BitDepth(p[0][k]+delta, bitDepth_C); }
+              if (filterQ) { ptr[ k+0*stride] = Clip_BitDepth(q[0][k]-delta, bitDepth_C); }
             }
           }
         }
@@ -817,6 +865,19 @@ void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
     }
 }
 
+
+void edge_filtering_chroma(de265_image* img, bool vertical, int yStart,int yEnd,
+                           int xStart,int xEnd)
+{
+  if (img->high_bit_depth(1)) {
+    edge_filtering_chroma_internal<uint16_t>(img,vertical,yStart,yEnd,xStart,xEnd);
+  }
+  else {
+    edge_filtering_chroma_internal<uint8_t>(img,vertical,yStart,yEnd,xStart,xEnd);
+  }
+}
+
+
 void edge_filtering_chroma_CTB(de265_image* img, bool vertical, int xCtb,int yCtb)
 {
   int ctbSize = img->sps.CtbSizeY;
@@ -837,13 +898,18 @@ public:
   bool vertical;
 
   virtual void work();
+  virtual std::string name() const {
+    char buf[100];
+    sprintf(buf,"deblock-%d",ctb_y);
+    return buf;
+  }
 };
 
 
 void thread_task_deblock_CTBRow::work()
 {
   state = Running;
-  img->thread_run();
+  img->thread_run(this);
 
   int xStart=0;
   int xEnd = img->get_deblk_width();
@@ -900,8 +966,12 @@ void thread_task_deblock_CTBRow::work()
 
   if (deblocking_enabled) {
     derive_boundaryStrength(img, vertical, first,last, xStart,xEnd);
-    edge_filtering_luma    (img, vertical, first,last, xStart,xEnd);
-    edge_filtering_chroma  (img, vertical, first,last, xStart,xEnd);
+
+    edge_filtering_luma(img, vertical, first,last, xStart,xEnd);
+
+    if (img->sps.ChromaArrayType != CHROMA_MONO) {
+      edge_filtering_chroma(img, vertical, first,last, xStart,xEnd);
+    }
   }
 
   for (int x=0;x<=rightCtb;x++) {
@@ -910,7 +980,7 @@ void thread_task_deblock_CTBRow::work()
   }
 
   state = Finished;
-  img->thread_finishes();
+  img->thread_finishes(this);
 }
 
 
@@ -935,7 +1005,7 @@ void add_deblocking_tasks(image_unit* imgunit)
           task->vertical = (pass==0);
 
           imgunit->tasks.push_back(task);
-          add_task(&ctx->thread_pool, task);
+          add_task(&ctx->thread_pool_, task);
           n++;
         }
     }
@@ -955,8 +1025,10 @@ void apply_deblocking_filter(de265_image* img) // decoder_context* ctx)
       logtrace(LogDeblock,"VERTICAL\n");
       derive_boundaryStrength(img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
       edge_filtering_luma    (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
-      edge_filtering_chroma  (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
 
+      if (img->sps.ChromaArrayType != CHROMA_MONO) {
+        edge_filtering_chroma  (img, true ,0,img->get_deblk_height(),0,img->get_deblk_width());
+      }
 #if 0
       char buf[1000];
       sprintf(buf,"lf-after-V-%05d.yuv", ctx->img->PicOrderCntVal);
@@ -968,7 +1040,10 @@ void apply_deblocking_filter(de265_image* img) // decoder_context* ctx)
       logtrace(LogDeblock,"HORIZONTAL\n");
       derive_boundaryStrength(img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
       edge_filtering_luma    (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
-      edge_filtering_chroma  (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
+
+      if (img->sps.ChromaArrayType != CHROMA_MONO) {
+        edge_filtering_chroma  (img, false ,0,img->get_deblk_height(),0,img->get_deblk_width());
+      }
 
 #if 0
       sprintf(buf,"lf-after-H-%05d.yuv", ctx->img->PicOrderCntVal);
diff --git a/libde265/decctx.cc b/libde265/decctx.cc
index 7f83417..4c7d9df 100644
--- a/libde265/decctx.cc
+++ b/libde265/decctx.cc
@@ -40,6 +40,10 @@
 #include "x86/sse.h"
 #endif
 
+#ifdef HAVE_ARM
+#include "arm/arm.h"
+#endif
+
 #define SAVE_INTERMEDIATE_IMAGES 0
 
 #if SAVE_INTERMEDIATE_IMAGES
@@ -88,6 +92,10 @@ thread_context::thread_context()
   IsCuQpDeltaCoded = false;
   CuQpDelta = 0;
 
+  IsCuChromaQpOffsetCoded = false;
+  CuQpOffsetCb = 0;
+  CuQpOffsetCr = 0;
+
   /*
   currentQPY = 0;
   currentQG_x = 0;
@@ -110,6 +118,9 @@ thread_context::thread_context()
   img = NULL;
   shdr = NULL;
 
+  imgunit = NULL;
+  sliceunit = NULL;
+
 
   //memset(this,0,sizeof(thread_context));
 
@@ -118,7 +129,7 @@ thread_context::thread_context()
   int offset = (uintptr_t)_coeffBuf & 0x0f;
 
   if (offset == 0) {
-    coeffBuf = (int16_t *) &_coeffBuf;  // correctly aligned already
+    coeffBuf = _coeffBuf;  // correctly aligned already
   }
   else {
     coeffBuf = (int16_t *) (((uint8_t *)_coeffBuf) + (16-offset));
@@ -129,14 +140,18 @@ thread_context::thread_context()
 
 
 slice_unit::slice_unit(decoder_context* decctx)
-  : ctx(decctx),
-    nal(NULL),
+  : nal(NULL),
     shdr(NULL),
+    imgunit(NULL),
     flush_reorder_buffer(false),
+    nThreads(0),
+    first_decoded_CTB_RS(-1),
+    last_decoded_CTB_RS(-1),
     thread_contexts(NULL),
-    imgunit(NULL)
+    ctx(decctx)
 {
   state = Unprocessed;
+  nThreadContexts = 0;
 }
 
 slice_unit::~slice_unit()
@@ -154,6 +169,7 @@ void slice_unit::allocate_thread_contexts(int n)
   assert(thread_contexts==NULL);
 
   thread_contexts = new thread_context[n];
+  nThreadContexts = n;
 }
 
 
@@ -177,6 +193,12 @@ image_unit::~image_unit()
 }
 
 
+base_context::base_context()
+{
+  set_acceleration_functions(de265_acceleration_AUTO);
+}
+
+
 decoder_context::decoder_context()
 {
   //memset(ctx, 0, sizeof(decoder_context));
@@ -199,8 +221,6 @@ decoder_context::decoder_context()
   param_pps_headers_fd = -1;
   param_slice_headers_fd = -1;
 
-  set_acceleration_functions(de265_acceleration_AUTO);
-
   param_image_allocation_functions = de265_image::default_image_allocation;
   param_image_allocation_userdata  = NULL;
 
@@ -320,7 +340,7 @@ void decoder_context::set_image_allocation_functions(de265_image_allocation* all
 
 de265_error decoder_context::start_thread_pool(int nThreads)
 {
-  ::start_thread_pool(&thread_pool, nThreads);
+  ::start_thread_pool(&thread_pool_, nThreads);
 
   num_worker_threads = nThreads;
 
@@ -332,7 +352,7 @@ void decoder_context::stop_thread_pool()
 {
   if (get_num_worker_threads()>0) {
     //flush_thread_pool(&ctx->thread_pool);
-    ::stop_thread_pool(&thread_pool);
+    ::stop_thread_pool(&thread_pool_);
   }
 }
 
@@ -341,7 +361,7 @@ void decoder_context::reset()
 {
   if (num_worker_threads>0) {
     //flush_thread_pool(&ctx->thread_pool);
-    ::stop_thread_pool(&thread_pool);
+    ::stop_thread_pool(&thread_pool_);
   }
 
   // --------------------------------------------------
@@ -407,7 +427,7 @@ void decoder_context::reset()
   }
 }
 
-void decoder_context::set_acceleration_functions(enum de265_acceleration l)
+void base_context::set_acceleration_functions(enum de265_acceleration l)
 {
   // fill scalar functions first (so that function table is completely filled)
 
@@ -421,6 +441,11 @@ void decoder_context::set_acceleration_functions(enum de265_acceleration l)
     init_acceleration_functions_sse(&acceleration);
   }
 #endif
+#ifdef HAVE_ARM
+  if (l>=de265_acceleration_ARM) {
+    init_acceleration_functions_arm(&acceleration);
+  }
+#endif
 }
 
 
@@ -466,27 +491,33 @@ void decoder_context::init_thread_context(thread_context* tctx)
 }
 
 
-void decoder_context::add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream)
+void decoder_context::add_task_decode_CTB_row(thread_context* tctx,
+                                              bool firstSliceSubstream,
+                                              int ctbRow)
 {
   thread_task_ctb_row* task = new thread_task_ctb_row;
   task->firstSliceSubstream = firstSliceSubstream;
   task->tctx = tctx;
+  task->debug_startCtbRow = ctbRow;
   tctx->task = task;
 
-  add_task(&thread_pool, task);
+  add_task(&thread_pool_, task);
 
   tctx->imgunit->tasks.push_back(task);
 }
 
 
-void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream)
+void decoder_context::add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream,
+                                                    int ctbx,int ctby)
 {
   thread_task_slice_segment* task = new thread_task_slice_segment;
   task->firstSliceSubstream = firstSliceSubstream;
   task->tctx = tctx;
+  task->debug_startCtbX = ctbx;
+  task->debug_startCtbY = ctby;
   tctx->task = task;
 
-  add_task(&thread_pool, task);
+  add_task(&thread_pool_, task);
 
   tctx->imgunit->tasks.push_back(task);
 }
@@ -496,14 +527,14 @@ de265_error decoder_context::read_vps_NAL(bitreader& reader)
 {
   logdebug(LogHeaders,"---> read VPS\n");
 
-  video_parameter_set vps = { 0 };
-  de265_error err = ::read_vps(this,&reader,&vps);
+  video_parameter_set vps;
+  de265_error err = vps.read(this,&reader);
   if (err != DE265_OK) {
     return err;
   }
 
   if (param_vps_headers_fd>=0) {
-    dump_vps(&vps, param_vps_headers_fd);
+    vps.dump(param_vps_headers_fd);
   }
 
   process_vps(&vps);
@@ -523,7 +554,7 @@ de265_error decoder_context::read_sps_NAL(bitreader& reader)
   }
 
   if (param_sps_headers_fd>=0) {
-    sps.dump_sps(param_sps_headers_fd);
+    sps.dump(param_sps_headers_fd);
   }
 
   process_sps(&sps);
@@ -540,7 +571,7 @@ de265_error decoder_context::read_pps_NAL(bitreader& reader)
   bool success = pps.read(&reader,this);
 
   if (param_pps_headers_fd>=0) {
-    pps.dump_pps(param_pps_headers_fd);
+    pps.dump(param_pps_headers_fd);
   }
 
   if (success) {
@@ -604,7 +635,7 @@ de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, na
 
   if (process_slice_segment_header(this, shdr, &err, nal->pts, &nal_hdr, nal->user_data) == false)
     {
-      img->integrity = INTEGRITY_NOT_DECODED;
+      if (img!=NULL) img->integrity = INTEGRITY_NOT_DECODED;
       nal_parser.free_NAL_unit(nal);
       delete shdr;
       return err;
@@ -650,7 +681,8 @@ de265_error decoder_context::read_slice_NAL(bitreader& reader, NAL_unit* nal, na
     image_units.back()->slice_units.push_back(sliceunit);
   }
 
-  decode_some();
+  bool did_work;
+  err = decode_some(&did_work);
 
   return DE265_OK;
 }
@@ -665,39 +697,40 @@ template <class T> void pop_front(std::vector<T>& vec)
 }
 
 
-de265_error decoder_context::decode_some()
+de265_error decoder_context::decode_some(bool* did_work)
 {
   de265_error err = DE265_OK;
 
-  if (0) {
-    static int cnt=0;
-    cnt++;
-    if (cnt<5) return DE265_OK;
-  }
+  *did_work = false;
 
   if (image_units.empty()) { return DE265_OK; }  // nothing to do
 
 
   // decode something if there is work to do
 
-  if ( ! image_units.empty() && ! image_units[0]->slice_units.empty() ) {
+  if ( ! image_units.empty() ) { // && ! image_units[0]->slice_units.empty() ) {
 
     image_unit* imgunit = image_units[0];
-    slice_unit* sliceunit = imgunit->slice_units[0];
+    slice_unit* sliceunit = imgunit->get_next_unprocessed_slice_segment();
 
-    pop_front(imgunit->slice_units);
+    if (sliceunit != NULL) {
 
-    if (sliceunit->flush_reorder_buffer) {
-      dpb.flush_reorder_buffer();
-    }
+      //pop_front(imgunit->slice_units);
 
-    //err = decode_slice_unit_sequential(imgunit, sliceunit);
-    err = decode_slice_unit_parallel(imgunit, sliceunit);
-    if (err) {
-      return err;
-    }
+      if (sliceunit->flush_reorder_buffer) {
+        dpb.flush_reorder_buffer();
+      }
+
+      *did_work = true;
 
-    delete sliceunit;
+      //err = decode_slice_unit_sequential(imgunit, sliceunit);
+      err = decode_slice_unit_parallel(imgunit, sliceunit);
+      if (err) {
+        return err;
+      }
+
+      //delete sliceunit;
+    }
   }
 
 
@@ -705,13 +738,15 @@ de265_error decoder_context::decode_some()
   // if we decoded all slices of the current image and there will not
   // be added any more slices to the image, output the image
 
-  if ( ( image_units.size()>=2 && image_units[0]->slice_units.empty() ) ||
-       ( image_units.size()>=1 && image_units[0]->slice_units.empty() &&
+  if ( ( image_units.size()>=2 && image_units[0]->all_slice_segments_processed()) ||
+       ( image_units.size()>=1 && image_units[0]->all_slice_segments_processed() &&
          nal_parser.number_of_NAL_units_pending()==0 &&
          (nal_parser.is_end_of_stream() || nal_parser.is_end_of_frame()) )) {
 
     image_unit* imgunit = image_units[0];
 
+    *did_work=true;
+
 
     // mark all CTBs as decoded even if they are not, because faulty input
     // streams could miss part of the picture
@@ -768,6 +803,10 @@ de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit,
 
   remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList);
 
+  if (sliceunit->shdr->slice_segment_address >= imgunit->img->pps.CtbAddrRStoTS.size()) {
+    return DE265_ERROR_CTB_OUTSIDE_IMAGE_AREA;
+  }
+
 
   struct thread_context tctx;
 
@@ -775,11 +814,16 @@ de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit,
   tctx.img  = imgunit->img;
   tctx.decctx = this;
   tctx.imgunit = imgunit;
+  tctx.sliceunit= sliceunit;
   tctx.CtbAddrInTS = imgunit->img->pps.CtbAddrRStoTS[tctx.shdr->slice_segment_address];
   tctx.task = NULL;
 
   init_thread_context(&tctx);
 
+  if (sliceunit->reader.bytes_remaining <= 0) {
+    return DE265_ERROR_PREMATURE_END_OF_SLICE;
+  }
+
   init_CABAC_decoder(&tctx.cabac_decoder,
                      sliceunit->reader.data,
                      sliceunit->reader.bytes_remaining);
@@ -788,16 +832,49 @@ de265_error decoder_context::decode_slice_unit_sequential(image_unit* imgunit,
 
   if (pps->entropy_coding_sync_enabled_flag &&
       sliceunit->shdr->first_slice_segment_in_pic_flag) {
-    imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) * CONTEXT_MODEL_TABLE_LENGTH );
+    imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH );
   }
 
-  if ((err=read_slice_segment_data(&tctx)) != DE265_OK)
-    { return err; }
+  sliceunit->nThreads=1;
+
+  err=read_slice_segment_data(&tctx);
+
+  sliceunit->finished_threads.set_progress(1);
 
   return err;
 }
 
 
+void decoder_context::mark_whole_slice_as_processed(image_unit* imgunit,
+                                                    slice_unit* sliceunit,
+                                                    int progress)
+{
+  //printf("mark whole slice\n");
+
+
+  // mark all CTBs upto the next slice segment as processed
+
+  slice_unit* nextSegment = imgunit->get_next_slice_segment(sliceunit);
+  if (nextSegment) {
+    /*
+    printf("mark whole slice between %d and %d\n",
+           sliceunit->shdr->slice_segment_address,
+           nextSegment->shdr->slice_segment_address);
+    */
+
+    for (int ctb=sliceunit->shdr->slice_segment_address;
+         ctb < nextSegment->shdr->slice_segment_address;
+         ctb++)
+      {
+        if (ctb >= imgunit->img->number_of_ctbs())
+          break;
+
+        imgunit->img->ctb_progress[ctb].set_progress(progress);
+      }
+  }
+}
+
+
 de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit,
                                                         slice_unit* sliceunit)
 {
@@ -805,11 +882,18 @@ de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit,
 
   remove_images_from_dpb(sliceunit->shdr->RemoveReferencesList);
 
-
+  /*
+  printf("-------- decode --------\n");
+  printf("IMAGE UNIT %p\n",imgunit);
+  sliceunit->shdr->dump_slice_segment_header(sliceunit->ctx, 1);
+  imgunit->dump_slices();
+  */
 
   de265_image* img = imgunit->img;
   const pic_parameter_set* pps = &img->pps;
 
+  sliceunit->state = slice_unit::InProgress;
+
   bool use_WPP = (img->decctx->num_worker_threads > 0 &&
                   pps->entropy_coding_sync_enabled_flag);
 
@@ -826,27 +910,66 @@ de265_error decoder_context::decode_slice_unit_parallel(image_unit* imgunit,
   }
 
 
+  // If this is the first slice segment, mark all CTBs before this as processed
+  // (the real first slice segment could be missing).
+
+  if (imgunit->is_first_slice_segment(sliceunit)) {
+    slice_segment_header* shdr = sliceunit->shdr;
+    int firstCTB = shdr->slice_segment_address;
+
+    for (int ctb=0;ctb<firstCTB;ctb++) {
+      //printf("mark pre progress %d\n",ctb);
+      img->ctb_progress[ctb].set_progress(CTB_PROGRESS_PREFILTER);
+    }
+  }
+
+
+  // if there is a previous slice that has been completely decoded,
+  // mark all CTBs until the start of this slice as completed
+
+  //printf("this slice: %p\n",sliceunit);
+  slice_unit* prevSlice = imgunit->get_prev_slice_segment(sliceunit);
+  //if (prevSlice) printf("prev slice state: %d\n",prevSlice->state);
+  if (prevSlice && prevSlice->state == slice_unit::Decoded) {
+    mark_whole_slice_as_processed(imgunit,prevSlice,CTB_PROGRESS_PREFILTER);
+  }
+
+
   // TODO: even though we cannot split this into several tasks, we should run it
   // as a background thread
   if (!use_WPP && !use_tiles) {
-    return decode_slice_unit_sequential(imgunit, sliceunit);
+    //printf("SEQ\n");
+    err = decode_slice_unit_sequential(imgunit, sliceunit);
+    sliceunit->state = slice_unit::Decoded;
+    mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER);
+    return err;
   }
 
 
   if (use_WPP && use_tiles) {
     // TODO: this is not allowed ... output some warning or error
+
+    return DE265_WARNING_PPS_HEADER_INVALID;
   }
 
 
   if (use_WPP) {
-    return decode_slice_unit_WPP(imgunit, sliceunit);
+    //printf("WPP\n");
+    err = decode_slice_unit_WPP(imgunit, sliceunit);
+    sliceunit->state = slice_unit::Decoded;
+    mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER);
+    return err;
   }
   else if (use_tiles) {
-    return decode_slice_unit_tiles(imgunit, sliceunit);
+    //printf("TILE\n");
+    err = decode_slice_unit_tiles(imgunit, sliceunit);
+    sliceunit->state = slice_unit::Decoded;
+    mark_whole_slice_as_processed(imgunit,sliceunit,CTB_PROGRESS_PREFILTER);
+    return err;
   }
 
   assert(false);
-  return DE265_OK;
+  return err;
 }
 
 
@@ -864,16 +987,13 @@ de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit,
 
 
   assert(img->num_threads_active() == 0);
-  img->thread_start(nRows);
-
-  //printf("-------- decode --------\n");
 
 
   // reserve space to store entropy coding context models for each CTB row
 
   if (shdr->first_slice_segment_in_pic_flag) {
     // reserve space for nRows-1 because we don't need to save the CABAC model in the last CTB row
-    imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) * CONTEXT_MODEL_TABLE_LENGTH );
+    imgunit->ctx_models.resize( (img->sps.PicHeightInCtbsY-1) ); //* CONTEXT_MODEL_TABLE_LENGTH );
   }
 
 
@@ -890,6 +1010,15 @@ de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit,
       ctbRow++;
       ctbAddrRS = ctbRow * ctbsWidth;
     }
+    else if (nRows>1 && (ctbAddrRS % ctbsWidth) != 0) {
+      // If slice segment consists of several WPP rows, each of them
+      // has to start at a row.
+
+      //printf("does not start at start\n");
+
+      err = DE265_WARNING_SLICEHEADER_INVALID;
+      break;
+    }
 
 
     // prepare thread context
@@ -900,6 +1029,7 @@ de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit,
     tctx->decctx  = img->decctx;
     tctx->img     = img;
     tctx->imgunit = imgunit;
+    tctx->sliceunit= sliceunit;
     tctx->CtbAddrInTS = pps->CtbAddrRStoTS[ctbAddrRS];
 
     init_thread_context(tctx);
@@ -915,13 +1045,23 @@ de265_error decoder_context::decode_slice_unit_WPP(image_unit* imgunit,
     if (entryPt==nRows-1) dataEnd = sliceunit->reader.bytes_remaining;
     else                  dataEnd = shdr->entry_point_offset[entryPt];
 
+    if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining ||
+        dataEnd <= dataStartIndex) {
+      //printf("WPP premature end\n");
+      err = DE265_ERROR_PREMATURE_END_OF_SLICE;
+      break;
+    }
+
     init_CABAC_decoder(&tctx->cabac_decoder,
                        &sliceunit->reader.data[dataStartIndex],
                        dataEnd-dataStartIndex);
 
     // add task
 
-    add_task_decode_CTB_row(tctx, entryPt==0);
+    //printf("start task for ctb-row: %d\n",ctbRow);
+    img->thread_start(1);
+    sliceunit->nThreads++;
+    add_task_decode_CTB_row(tctx, entryPt==0, ctbRow);
   }
 
 #if 0
@@ -961,7 +1101,6 @@ de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
 
 
   assert(img->num_threads_active() == 0);
-  img->thread_start(nTiles);
 
   sliceunit->allocate_thread_contexts(nTiles);
 
@@ -974,6 +1113,12 @@ de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
     // entry points other than the first start at tile beginnings
     if (entryPt>0) {
       tileID++;
+
+      if (tileID >= pps->num_tile_columns * pps->num_tile_rows) {
+        err = DE265_WARNING_SLICEHEADER_INVALID;
+        break;
+      }
+
       int ctbX = pps->colBd[tileID % pps->num_tile_columns];
       int ctbY = pps->rowBd[tileID / pps->num_tile_columns];
       ctbAddrRS = ctbY * ctbsWidth + ctbX;
@@ -987,6 +1132,7 @@ de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
     tctx->decctx = img->decctx;
     tctx->img    = img;
     tctx->imgunit = imgunit;
+    tctx->sliceunit= sliceunit;
     tctx->CtbAddrInTS = pps->CtbAddrRStoTS[ctbAddrRS];
 
     init_thread_context(tctx);
@@ -1002,13 +1148,24 @@ de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
     if (entryPt==nTiles-1) dataEnd = sliceunit->reader.bytes_remaining;
     else                   dataEnd = shdr->entry_point_offset[entryPt];
 
+    if (dataStartIndex<0 || dataEnd>sliceunit->reader.bytes_remaining ||
+        dataEnd <= dataStartIndex) {
+      err = DE265_ERROR_PREMATURE_END_OF_SLICE;
+      break;
+    }
+
     init_CABAC_decoder(&tctx->cabac_decoder,
                        &sliceunit->reader.data[dataStartIndex],
                        dataEnd-dataStartIndex);
 
     // add task
 
-    add_task_decode_slice_segment(tctx, entryPt==0);
+    //printf("add tiles thread\n");
+    img->thread_start(1);
+    sliceunit->nThreads++;
+    add_task_decode_slice_segment(tctx, entryPt==0,
+                                  ctbAddrRS % ctbsWidth,
+                                  ctbAddrRS / ctbsWidth);
   }
 
   img->wait_for_completion();
@@ -1017,7 +1174,7 @@ de265_error decoder_context::decode_slice_unit_tiles(image_unit* imgunit,
     delete imgunit->tasks[i];
   imgunit->tasks.clear();
 
-  return DE265_OK;
+  return err;
 }
 
 
@@ -1033,9 +1190,16 @@ de265_error decoder_context::decode_NAL(NAL_unit* nal)
   bitreader_init(&reader, nal->data(), nal->size());
 
   nal_header nal_hdr;
-  nal_read_header(&reader, &nal_hdr);
+  nal_hdr.read(&reader);
   ctx->process_nal_hdr(&nal_hdr);
 
+  if (nal_hdr.nuh_layer_id > 0) {
+    // Discard all NAL units with nuh_layer_id > 0
+    // These will have to be handeled by an SHVC decoder.
+    nal_parser.free_NAL_unit(nal);
+    return DE265_OK;
+  }
+
   loginfo(LogHighlevel,"NAL: 0x%x 0x%x -  unit type:%s temporal id:%d\n",
           nal->data()[0], nal->data()[1],
           get_NAL_name(nal_hdr.nal_unit_type),
@@ -1105,7 +1269,7 @@ de265_error decoder_context::decode(int* more)
   // if the stream has ended, and no more NALs are to be decoded, flush all pictures
 
   if (ctx->nal_parser.get_NAL_queue_length() == 0 &&
-      ctx->nal_parser.is_end_of_stream() &&
+      (ctx->nal_parser.is_end_of_stream() || ctx->nal_parser.is_end_of_frame()) &&
       ctx->image_units.empty()) {
 
     // flush all pending pictures into output queue
@@ -1143,12 +1307,14 @@ de265_error decoder_context::decode(int* more)
   // decode one NAL from the queue
 
   de265_error err = DE265_OK;
+  bool did_work = false;
 
-  if (ctx->nal_parser.number_of_NAL_units_pending()) {
+  if (ctx->nal_parser.get_NAL_queue_length()) { // number_of_NAL_units_pending()) {
     NAL_unit* nal = ctx->nal_parser.pop_from_NAL_queue();
     assert(nal);
     err = ctx->decode_NAL(nal);
     // ctx->nal_parser.free_NAL_unit(nal); TODO: do not free NAL with new loop
+    did_work=true;
   }
   else if (ctx->nal_parser.is_end_of_frame() == true &&
       ctx->image_units.empty()) {
@@ -1157,12 +1323,12 @@ de265_error decoder_context::decode(int* more)
     return DE265_ERROR_WAITING_FOR_INPUT_DATA;
   }
   else {
-    err = decode_some();
+    err = decode_some(&did_work);
   }
 
   if (more) {
     // decoding error is assumed to be unrecoverable
-    *more = (err==DE265_OK);
+    *more = (err==DE265_OK && did_work);
   }
 
   return err;
@@ -1173,11 +1339,8 @@ void decoder_context::process_nal_hdr(nal_header* nal)
 {
   nal_unit_type = nal->nal_unit_type;
 
-  IdrPicFlag = (nal->nal_unit_type == NAL_UNIT_IDR_W_RADL ||
-                nal->nal_unit_type == NAL_UNIT_IDR_N_LP);
-
-  RapPicFlag = (nal->nal_unit_type >= 16 &&
-                nal->nal_unit_type <= 23);
+  IdrPicFlag = isIdrPic(nal->nal_unit_type);
+  RapPicFlag = isRapPic(nal->nal_unit_type);
 }
 
 
@@ -1248,9 +1411,9 @@ void decoder_context::process_picture_order_count(decoder_context* ctx, slice_se
            ctx->img->PicOrderCntVal);
 
   if (ctx->img->nal_hdr.nuh_temporal_id==0 &&
-      (isReferenceNALU(ctx->nal_unit_type) &&
-       (!isRASL(ctx->nal_unit_type) && !isRADL(ctx->nal_unit_type))) &&
-      1 /* sub-layer non-reference picture */) // TODO
+      !isSublayerNonReference(ctx->nal_unit_type) &&
+      !isRASL(ctx->nal_unit_type) &&
+      !isRADL(ctx->nal_unit_type))
     {
       loginfo(LogHeaders,"set prevPicOrderCntLsb/Msb\n");
 
@@ -1608,11 +1771,14 @@ bool decoder_context::construct_reference_picture_lists(decoder_context* ctx, sl
     }
   }
 
-  if (hdr->num_ref_idx_l0_active > 15) {
+  /*
+  if (hdr->num_ref_idx_l0_active > 16) {
     ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
     return false;
   }
+  */
 
+  assert(hdr->num_ref_idx_l0_active <= 16);
   for (rIdx=0; rIdx<hdr->num_ref_idx_l0_active; rIdx++) {
     int idx = hdr->ref_pic_list_modification_flag_l0 ? hdr->list_entry_l0[rIdx] : rIdx;
 
@@ -1620,8 +1786,12 @@ bool decoder_context::construct_reference_picture_lists(decoder_context* ctx, sl
     hdr->LongTermRefPic[0][rIdx] = isLongTerm[0][idx];
 
     // remember POC of referenced image (needed in motion.c, derive_collocated_motion_vector)
-    hdr->RefPicList_POC[0][rIdx] = ctx->dpb.get_image(hdr->RefPicList[0][rIdx])->PicOrderCntVal;
-    hdr->RefPicList_PicState[0][rIdx] = ctx->dpb.get_image(hdr->RefPicList[0][rIdx])->PicState;
+    de265_image* img_0_rIdx = ctx->dpb.get_image(hdr->RefPicList[0][rIdx]);
+    if (img_0_rIdx==NULL) {
+      return false;
+    }
+    hdr->RefPicList_POC[0][rIdx] = img_0_rIdx->PicOrderCntVal;
+    hdr->RefPicList_PicState[0][rIdx] = img_0_rIdx->PicState;
   }
 
 
@@ -1636,19 +1806,32 @@ bool decoder_context::construct_reference_picture_lists(decoder_context* ctx, sl
 
     int rIdx=0;
     while (rIdx < NumRpsCurrTempList1) {
-      for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList1; rIdx++,i++)
+      for (int i=0;i<ctx->NumPocStCurrAfter && rIdx<NumRpsCurrTempList1; rIdx++,i++) {
         RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrAfter[i];
+      }
 
-      for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList1; rIdx++,i++)
+      for (int i=0;i<ctx->NumPocStCurrBefore && rIdx<NumRpsCurrTempList1; rIdx++,i++) {
         RefPicListTemp1[rIdx] = ctx->RefPicSetStCurrBefore[i];
+      }
 
       for (int i=0;i<ctx->NumPocLtCurr && rIdx<NumRpsCurrTempList1; rIdx++,i++) {
         RefPicListTemp1[rIdx] = ctx->RefPicSetLtCurr[i];
         isLongTerm[1][rIdx] = true;
       }
+
+      // This check is to prevent an endless loop when no images are added above.
+      if (rIdx==0) {
+        ctx->add_warning(DE265_WARNING_FAULTY_REFERENCE_PICTURE_LIST, false);
+        return false;
+      }
     }
 
-    assert(hdr->num_ref_idx_l1_active <= 15);
+    if (hdr->num_ref_idx_l0_active > 16) {
+    ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
+    return false;
+  }
+
+    assert(hdr->num_ref_idx_l1_active <= 16);
     for (rIdx=0; rIdx<hdr->num_ref_idx_l1_active; rIdx++) {
       int idx = hdr->ref_pic_list_modification_flag_l1 ? hdr->list_entry_l1[rIdx] : rIdx;
 
@@ -1656,8 +1839,10 @@ bool decoder_context::construct_reference_picture_lists(decoder_context* ctx, sl
       hdr->LongTermRefPic[1][rIdx] = isLongTerm[1][idx];
 
       // remember POC of referenced imaged (needed in motion.c, derive_collocated_motion_vector)
-      hdr->RefPicList_POC[1][rIdx] = ctx->dpb.get_image(hdr->RefPicList[1][rIdx])->PicOrderCntVal;
-      hdr->RefPicList_PicState[1][rIdx] = ctx->dpb.get_image(hdr->RefPicList[1][rIdx])->PicState;
+      de265_image* img_1_rIdx = ctx->dpb.get_image(hdr->RefPicList[1][rIdx]);
+      if (img_1_rIdx == NULL) { return false; }
+      hdr->RefPicList_POC[1][rIdx] = img_1_rIdx->PicOrderCntVal;
+      hdr->RefPicList_PicState[1][rIdx] = img_1_rIdx->PicState;
     }
   }
 
@@ -1838,7 +2023,7 @@ bool decoder_context::process_slice_segment_header(decoder_context* ctx, slice_s
     ctx->img = img;
 
     img->vps = *ctx->current_vps;
-    img->sps = *ctx->current_sps;
+    //img->sps = *ctx->current_sps;  // already set in new_image()
     img->pps = *ctx->current_pps;
     img->decctx = ctx;
 
@@ -1893,6 +2078,13 @@ bool decoder_context::process_slice_segment_header(decoder_context* ctx, slice_s
 
     first_decoded_picture = false;
   }
+  else {
+    // claims to be not the first slice, but there is no active image available
+
+    if (ctx->img == NULL) {
+      return false;
+    }
+  }
 
   if (hdr->slice_type == SLICE_TYPE_B ||
       hdr->slice_type == SLICE_TYPE_P)
diff --git a/libde265/decctx.h b/libde265/decctx.h
index 3d3a492..35ef602 100644
--- a/libde265/decctx.h
+++ b/libde265/decctx.h
@@ -38,17 +38,19 @@
 #define DE265_MAX_VPS_SETS 16   // this is the maximum as defined in the standard
 #define DE265_MAX_SPS_SETS 16   // this is the maximum as defined in the standard
 #define DE265_MAX_PPS_SETS 64   // this is the maximum as defined in the standard
-#define MAX_THREAD_CONTEXTS 68  // enough for 4K @ 32 pixel CTBs, but TODO: make this dynamic
 
 #define MAX_WARNINGS 20
 
 
-struct slice_segment_header;
-struct image_unit;
+class slice_segment_header;
+class image_unit;
+class slice_unit;
+class decoder_context;
 
 
-struct thread_context
+class thread_context
 {
+public:
   thread_context();
 
   int CtbAddrInRS;
@@ -59,23 +61,21 @@ struct thread_context
 
   // motion vectors
 
-  int8_t  refIdx[2];
-  int16_t mvd[2][2]; // only in top left position
-  uint8_t merge_flag;
-  uint8_t merge_idx;
-  uint8_t mvp_lX_flag[2];
-  uint8_t inter_pred_idc; // enum InterPredIdc
+  motion_spec motion;
 
 
   // prediction
 
-  enum IntraPredMode IntraPredModeC; // chroma intra-prediction mode for current CB
+  // enum IntraPredMode IntraPredModeC[4]; // chroma intra-prediction mode for current CB
+  int ResScaleVal;
 
 
   // residual data
 
   uint8_t cu_transquant_bypass_flag;
   uint8_t transform_skip_flag[3];
+  uint8_t explicit_rdpcm_flag;
+  uint8_t explicit_rdpcm_dir;
 
   ALIGNED_16(int16_t) _coeffBuf[(32*32)+8]; // alignment required for SSE code !
   int16_t *coeffBuf;
@@ -84,11 +84,15 @@ struct thread_context
   int16_t coeffPos[3][32*32];
   int16_t nCoeff[3];
 
+  int32_t residual_luma[32*32]; // only used when cross-comp-prediction is enabled
+
 
   // quantization
 
   int IsCuQpDeltaCoded;
   int CuQpDelta;
+  int IsCuChromaQpOffsetCoded;
+  int CuQpOffsetCb, CuQpOffsetCr;
 
   int currentQPY;
   int currentQG_x, currentQG_y;
@@ -98,14 +102,16 @@ struct thread_context
 
   CABAC_decoder cabac_decoder;
 
-  context_model ctx_model[CONTEXT_MODEL_TABLE_LENGTH];
+  context_model_table ctx_model;
+  uint8_t StatCoeff[4];
 
-  struct decoder_context* decctx;
+  decoder_context* decctx;
   struct de265_image *img;
-  struct slice_segment_header* shdr;
+  slice_segment_header* shdr;
 
-  struct image_unit* imgunit;
-  struct thread_task* task; // executing thread_task or NULL if not multi-threaded
+  image_unit* imgunit;
+  slice_unit* sliceunit;
+  thread_task* task; // executing thread_task or NULL if not multi-threaded
 
 private:
   thread_context(const thread_context&); // not allowed
@@ -131,8 +137,9 @@ class error_queue
 
 
 
-struct slice_unit
+class slice_unit
 {
+public:
   slice_unit(decoder_context* decctx);
   ~slice_unit();
 
@@ -140,32 +147,48 @@ struct slice_unit
   slice_segment_header* shdr;  // not the owner (de265_image is owner)
   bitreader reader;
 
-  struct image_unit* imgunit;
+  image_unit* imgunit;
 
   bool flush_reorder_buffer;
 
-  enum { Unprocessed,
-         Inprogress,
-         Decoded
+
+  // decoding status
+
+  enum SliceDecodingProgress { Unprocessed,
+                               InProgress,
+                               Decoded
   } state;
 
+  de265_progress_lock finished_threads;
+  int nThreads;
+
+  int first_decoded_CTB_RS; // TODO
+  int last_decoded_CTB_RS;  // TODO
+
   void allocate_thread_contexts(int n);
-  thread_context* get_thread_context(int n) { return &thread_contexts[n]; }
+  thread_context* get_thread_context(int n) { // returns a pointer to the n-th thread context
+    assert(n >= 0 && n < nThreadContexts); // reject negative indices as well as too-large ones
+    return &thread_contexts[n];
+  }
+  int num_thread_contexts() const { return nThreadContexts; }
 
 private:
   thread_context* thread_contexts; /* NOTE: cannot use std::vector, because thread_context has
                                       no copy constructor. */
+  int nThreadContexts;
 
+public:
   decoder_context* ctx;
 
-
+private:
   slice_unit(const slice_unit&); // not allowed
   const slice_unit& operator=(const slice_unit&); // not allowed
 };
 
 
-struct image_unit
+class image_unit
 {
+public:
   image_unit();
   ~image_unit();
 
@@ -175,6 +198,53 @@ struct image_unit
   std::vector<slice_unit*> slice_units;
   std::vector<sei_message> suffix_SEIs;
 
+  slice_unit* get_next_unprocessed_slice_segment() const {
+    // Scan in vector order; the first unit whose state is still Unprocessed is returned.
+    for (size_t idx=0; idx<slice_units.size(); idx++) {
+      slice_unit* candidate = slice_units[idx];
+      if (candidate->state == slice_unit::Unprocessed) return candidate;
+    }
+
+    return NULL; // no unit is in the Unprocessed state (or there are no units at all)
+  }
+
+  slice_unit* get_prev_slice_segment(slice_unit* s) const {
+    // Start at index 1: the first entry has no predecessor, so it falls through to NULL.
+    for (size_t idx=1; idx<slice_units.size(); idx++) {
+      if (slice_units[idx] != s) continue;
+      return slice_units[idx-1];
+    }
+
+    return NULL; // s is the first entry or is not contained in slice_units
+  }
+
+  slice_unit* get_next_slice_segment(slice_unit* s) const {
+    // Test "idx+1 < size()" rather than "idx < size()-1": size() is unsigned, so size()-1
+    // wraps around for an empty vector and the loop would read slice_units[0] out of bounds.
+    for (size_t idx=0; idx+1<slice_units.size(); idx++) {
+      if (slice_units[idx]==s) return slice_units[idx+1];
+    }
+
+    return NULL; // s is the last entry or is not contained in slice_units
+  }
+
+  void dump_slices() const { // debugging aid: prints index and address of every slice unit
+    for (int i=0; i<(int)slice_units.size(); i++) {
+      printf("[%d] = %p\n",i,(void*)slice_units[i]); // %p requires a void* argument
+    }
+  }
+
+  bool all_slice_segments_processed() const {
+    // Only the last slice unit is inspected; an empty list counts as fully processed.
+    if (slice_units.empty()) return true;
+    return slice_units.back()->state != slice_unit::Unprocessed;
+  }
+
+  bool is_first_slice_segment(const slice_unit* s) const {
+    // An empty list has no first entry, so nothing can match.
+    return !slice_units.empty() && slice_units[0] == s;
+  }
+
   enum { Invalid, // headers not read yet
          Unknown, // SPS/PPS available
          Reference, // will be used as reference
@@ -192,12 +262,29 @@ struct image_unit
   /* Saved context models for WPP.
      There is one saved model for the initialization of each CTB row.
      The array is unused for non-WPP streams. */
-  std::vector<context_model> ctx_models;  // TODO: move this into image ?
+  std::vector<context_model_table> ctx_models;  // TODO: move this into image ?
 };
 
 
+class base_context : public error_queue
+{
+ public:
+  base_context();
+  virtual ~base_context() { }
+
+  // --- accelerated DSP functions ---
 
-class decoder_context : public error_queue {
+  void set_acceleration_functions(enum de265_acceleration);
+
+  struct acceleration_functions acceleration; // CPU optimized functions
+
+  //virtual /* */ de265_image* get_image(int dpb_index)       { return dpb.get_image(dpb_index); }
+  virtual const de265_image* get_image(int frame_id) const = 0;
+  virtual bool has_image(int frame_id) const = 0;
+};
+
+
+class decoder_context : public base_context {
  public:
   decoder_context();
   ~decoder_context();
@@ -224,13 +311,14 @@ class decoder_context : public error_queue {
   de265_error decode_NAL(NAL_unit* nal);
 
   de265_error decode(int* more);
-  de265_error decode_some();
+  de265_error decode_some(bool* did_work);
 
   de265_error decode_slice_unit_sequential(image_unit* imgunit, slice_unit* sliceunit);
   de265_error decode_slice_unit_parallel(image_unit* imgunit, slice_unit* sliceunit);
   de265_error decode_slice_unit_WPP(image_unit* imgunit, slice_unit* sliceunit);
   de265_error decode_slice_unit_tiles(image_unit* imgunit, slice_unit* sliceunit);
 
+
   void process_nal_hdr(nal_header*);
   void process_vps(video_parameter_set*);
   void process_sps(seq_parameter_set*);
@@ -266,13 +354,6 @@ class decoder_context : public error_queue {
   void*                  param_image_allocation_userdata;
 
 
-  // --- accelerated DSP functions ---
-
-  void set_acceleration_functions(enum de265_acceleration);
-
-  struct acceleration_functions acceleration; // CPU optimized functions
-
-
   // --- input stream data ---
 
   NAL_Parser nal_parser;
@@ -309,7 +390,7 @@ class decoder_context : public error_queue {
   pic_parameter_set*   current_pps;
 
  public:
-  struct thread_pool thread_pool;
+  thread_pool thread_pool_;
 
  private:
   int num_worker_threads;
@@ -415,10 +496,14 @@ class decoder_context : public error_queue {
   bool flush_reorder_buffer_at_this_frame;
 
  private:
-  void init_thread_context(class thread_context* tctx);
-  void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream);
-  void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream);
+  void init_thread_context(thread_context* tctx);
+  void add_task_decode_CTB_row(thread_context* tctx, bool firstSliceSubstream, int ctbRow);
+  void add_task_decode_slice_segment(thread_context* tctx, bool firstSliceSubstream,
+                                     int ctbX,int ctbY);
 
+  void mark_whole_slice_as_processed(image_unit* imgunit,
+                                     slice_unit* sliceunit,
+                                     int progress);
 
   void process_picture_order_count(decoder_context* ctx, slice_segment_header* hdr);
   int generate_unavailable_reference_picture(decoder_context* ctx, const seq_parameter_set* sps,
@@ -428,7 +513,7 @@ class decoder_context : public error_queue {
 
 
   void remove_images_from_dpb(const std::vector<int>& removeImageList);
-  void run_postprocessing_filters_sequential(de265_image* img);
+  void run_postprocessing_filters_sequential(struct de265_image* img);
   void run_postprocessing_filters_parallel(image_unit* img);
 };
 
diff --git a/libde265/dpb.cc b/libde265/dpb.cc
index eefb3de..7c4ed66 100644
--- a/libde265/dpb.cc
+++ b/libde265/dpb.cc
@@ -259,7 +259,7 @@ int decoded_picture_buffer::new_image(const seq_parameter_set* sps,
   default: chroma = de265_chroma_420; assert(0); break; // should never happen
   }
 
-  img->alloc_image(w,h, chroma, sps, true, decctx, pts, user_data, isOutputImage);
+  img->alloc_image(w,h, chroma, sps, true, decctx, NULL, pts, user_data, isOutputImage);
 
   img->integrity = INTEGRITY_CORRECT;
 
diff --git a/libde265/dpb.h b/libde265/dpb.h
index 2ae275b..c39aa52 100644
--- a/libde265/dpb.h
+++ b/libde265/dpb.h
@@ -27,9 +27,10 @@
 #include <deque>
 #include <vector>
 
+class decoder_context;
 
-
-struct decoded_picture_buffer {
+class decoded_picture_buffer {
+public:
   decoded_picture_buffer();
   ~decoded_picture_buffer();
 
@@ -40,7 +41,7 @@ struct decoded_picture_buffer {
      If there is no space for a new image, return -1. */
   int new_image(const seq_parameter_set* sps, decoder_context* decctx,
                 de265_PTS pts, void* user_data, bool isOutputImage);
-  
+
   /* Check for a free slot in the DPB. There are some slots reserved for
      unavailable reference frames. If high_priority==true, these reserved slots
      are included in the check. */
@@ -52,18 +53,26 @@ struct decoded_picture_buffer {
   int size() const { return dpb.size(); }
 
   /* Raw access to the images. */
-  /* */ de265_image* get_image(int index)       { return dpb[index]; }
-  const de265_image* get_image(int index) const { return dpb[index]; }
+
+  /* */ de265_image* get_image(int index)       {
+    const bool in_range = (index>=0 && (size_t)index<dpb.size()); // negative or too large -> NULL
+    return in_range ? dpb[index] : NULL;
+  }
+
+  const de265_image* get_image(int index) const {
+    const bool in_range = (index>=0 && (size_t)index<dpb.size()); // negative or too large -> NULL
+    return in_range ? dpb[index] : NULL;
+  }
 
   /* Search DPB for the slot index of a specific picture. */
   int DPB_index_of_picture_with_POC(int poc, int currentID, bool preferLongTerm=false) const;
   int DPB_index_of_picture_with_LSB(int lsb, int currentID, bool preferLongTerm=false) const;
   int DPB_index_of_picture_with_ID (int id) const;
-  
+
 
   // --- reorder buffer ---
 
-  void insert_image_into_reorder_buffer(de265_image* img) {
+  void insert_image_into_reorder_buffer(struct de265_image* img) {
     reorder_output_queue.push_back(img);
   }
 
@@ -71,17 +80,17 @@ struct decoded_picture_buffer {
 
   // move next picture in reorder buffer to output queue
   void output_next_picture_in_reorder_buffer();
-  
+
   // Move all pictures in reorder buffer to output buffer. Return true if there were any pictures.
   bool flush_reorder_buffer();
-  
+
 
   // --- output buffer ---
 
   int num_pictures_in_output_queue() const { return image_output_queue.size(); }
 
   /* Get the next picture in the output queue, but do not remove it from the queue. */
-  de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); }
+  struct de265_image* get_next_picture_in_output_queue() const { return image_output_queue.front(); }
 
   /* Remove the next picture in the output queue. */
   void pop_next_picture_in_output_queue();
@@ -91,15 +100,15 @@ struct decoded_picture_buffer {
 
   void log_dpb_content() const;
   void log_dpb_queues() const;
-  
+
 private:
   int max_images_in_DPB;
   int norm_images_in_DPB;
 
-  std::vector<de265_image*> dpb; // decoded picture buffer
+  std::vector<struct de265_image*> dpb; // decoded picture buffer
 
-  std::vector<de265_image*> reorder_output_queue;
-  std::deque<de265_image*>  image_output_queue;
+  std::vector<struct de265_image*> reorder_output_queue;
+  std::deque<struct de265_image*>  image_output_queue;
 
 private:
   decoded_picture_buffer(const decoded_picture_buffer&); // no copy
diff --git a/libde265/en265.cc b/libde265/en265.cc
new file mode 100644
index 0000000..2b6146d
--- /dev/null
+++ b/libde265/en265.cc
@@ -0,0 +1,320 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
+ *
+ * Authors: Dirk Farin <farin@struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/en265.h"
+#include "libde265/encoder/encoder-context.h"
+
+
+LIBDE265_API en265_encoder_context* en265_new_encoder(void)
+{
+  // The library is initialized first; if de265_init() does not report DE265_OK,
+  // no encoder context is created and NULL is returned.
+  if (de265_init() != DE265_OK) {
+    return NULL;
+  }
+
+  return (en265_encoder_context*)(new encoder_context());
+}
+
+
+LIBDE265_API de265_error en265_free_encoder(en265_encoder_context* e)
+{
+  assert(e);
+
+  // Destroy the context first, then pass on whatever de265_free() reports.
+  delete (encoder_context*)e;
+  return de265_free();
+}
+
+
+LIBDE265_API void en265_set_image_release_function(en265_encoder_context* e,
+                                                   void (*release_func)(en265_encoder_context*,
+                                                                        de265_image*,
+                                                                        void* userdata),
+                                                   void* alloc_userdata)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+  // Store the callback together with the userdata pointer registered alongside it.
+  enc->release_func = release_func;
+  enc->param_image_allocation_userdata = alloc_userdata;
+}
+
+
+// ========== encoder parameters ==========
+
+LIBDE265_API de265_error en265_parse_command_line_parameters(en265_encoder_context* e,
+                                                             int* argc, char** argv)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+
+  // first_idx=1 presumably makes the parser skip argv[0] -- confirm in params_config.
+  // A failed parse is mapped to DE265_ERROR_PARAMETER_PARSING.
+  int first_idx=1;
+  const bool parsed_ok = enc->params_config.parse_command_line_params(argc,argv, &first_idx, true);
+  if (parsed_ok) {
+    return DE265_OK;
+  }
+  return DE265_ERROR_PARAMETER_PARSING;
+}
+
+LIBDE265_API void en265_show_parameters(en265_encoder_context* e)
+{
+  assert(e);
+
+  // Thin C wrapper around the C++ configuration object; all output is
+  // produced by params_config.print_params().
+  encoder_context* const enc = (encoder_context*)e;
+  enc->params_config.print_params();
+}
+
+
+LIBDE265_API const char** en265_list_parameters(en265_encoder_context* e)
+{
+  assert(e);
+
+  // Forwards to params_config; ownership and termination of the table are defined there.
+  return ((encoder_context*)e)->params_config.get_parameter_string_table();
+}
+
+
+LIBDE265_API enum en265_parameter_type en265_get_parameter_type(en265_encoder_context* e,
+                                                                const char* parametername)
+{
+  assert(e);
+
+  // The lookup is delegated to params_config, keyed by the parameter's name.
+  return ((encoder_context*)e)->params_config.get_parameter_type(parametername);
+}
+
+
+LIBDE265_API de265_error en265_set_parameter_bool(en265_encoder_context* e,
+                                                  const char* param,int value)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+  if (enc->params_config.set_bool(param,value)) return DE265_OK;
+  return DE265_ERROR_PARAMETER_PARSING; // set_bool() reported failure
+}
+
+
+LIBDE265_API de265_error en265_set_parameter_int(en265_encoder_context* e,
+                                                 const char* param,int value)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+  if (enc->params_config.set_int(param,value)) return DE265_OK;
+  return DE265_ERROR_PARAMETER_PARSING; // set_int() reported failure
+}
+
+LIBDE265_API de265_error en265_set_parameter_string(en265_encoder_context* e,
+                                                    const char* param,const char* value)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+  if (enc->params_config.set_string(param,value)) return DE265_OK;
+  return DE265_ERROR_PARAMETER_PARSING; // set_string() reported failure
+}
+
+LIBDE265_API de265_error en265_set_parameter_choice(en265_encoder_context* e,
+                                                    const char* param,const char* value)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+  if (enc->params_config.set_choice(param,value)) return DE265_OK;
+  return DE265_ERROR_PARAMETER_PARSING; // set_choice() reported failure
+}
+
+
+LIBDE265_API const char** en265_list_parameter_choices(en265_encoder_context* e,
+                                                       const char* parametername)
+{
+  assert(e);
+
+  // Forwards to params_config; ownership and termination of the table are defined there.
+  return ((encoder_context*)e)->params_config.get_parameter_choices_table(parametername);
+}
+
+
+
+// ========== encoding loop ==========
+
+
+LIBDE265_API de265_error en265_start_encoder(en265_encoder_context* e, int number_of_threads)
+{
+  assert(e);
+
+  // NOTE: number_of_threads is not used in this function; success is always reported.
+  ((encoder_context*)e)->start_encoder();
+
+  return DE265_OK;
+}
+
+
+LIBDE265_API struct de265_image* en265_allocate_image(en265_encoder_context* e,
+                                                      int width, int height, de265_chroma chroma,
+                                                      de265_PTS pts, void* image_userdata)
+{
+  assert(e);
+  encoder_context* ectx = (encoder_context*)e;
+  // NOTE(review): 'chroma' is not used below; de265_chroma_420 is always passed -- confirm intent.
+  de265_image* img = new de265_image;
+  if (img->alloc_image(width,height,de265_chroma_420, NULL, false,
+                       NULL,ectx, pts, image_userdata, true) != DE265_OK) {
+    delete img; // alloc_image() failed: free the image object and report NULL
+    return NULL;
+  }
+  // On success the newly created image is returned to the caller.
+  return img;
+}
+
+// Request a specification of the image memory layout for an image of the specified dimensions.
+LIBDE265_API void en265_get_image_spec(en265_encoder_context* e,
+                                       int width, int height, de265_chroma chroma,
+                                       struct de265_image_spec* out_spec)
+{
+  // Always YUV420P8, alignment 1, no cropping; neither 'e' nor 'chroma' is consulted.
+  out_spec->format    = de265_image_format_YUV420P8;
+  out_spec->alignment = 1;
+  out_spec->width     = width;
+  out_spec->height    = height;
+
+  out_spec->crop_top    = 0;
+  out_spec->crop_bottom = 0;
+  out_spec->crop_left   = 0;
+  out_spec->crop_right  = 0;
+  out_spec->visible_width  = out_spec->width;   // all crop values are 0, so the visible
+  out_spec->visible_height = out_spec->height;  // size equals the full size
+}
+
+// Image memory layout specification for an image returned by en265_allocate_image().
+//LIBDE265_API void de265_get_image_spec_from_image(de265_image* img, struct de265_image_spec* spec);
+
+
+
+LIBDE265_API de265_error en265_push_image(en265_encoder_context* e,
+                                          struct de265_image* img)
+{
+  assert(e);
+
+  // Hands the image to the encoder's 'sop' object; success is always reported.
+  ((encoder_context*)e)->sop->insert_new_input_image(img);
+  return DE265_OK;
+}
+
+
+LIBDE265_API de265_error en265_push_eof(en265_encoder_context* e)
+{
+  assert(e);
+
+  // Signals end-of-stream to the encoder's 'sop' object; success is always reported.
+  ((encoder_context*)e)->sop->insert_end_of_stream();
+  return DE265_OK;
+}
+
+
+LIBDE265_API de265_error en265_block_on_input_queue_length(en265_encoder_context*,
+                                                           int max_pending_images,
+                                                           int timeout_ms)
+{
+  // TODO: not implemented yet -- never blocks; all arguments are ignored
+  return DE265_OK;
+}
+
+LIBDE265_API de265_error en265_trim_input_queue(en265_encoder_context*, int max_pending_images)
+{
+  // TODO: not implemented yet -- nothing is trimmed; success is reported unconditionally
+  return DE265_OK;
+}
+
+LIBDE265_API int  en265_current_input_queue_length(en265_encoder_context*)
+{
+  // TODO: not implemented yet -- reports 0 queued images (the result is a count, not a de265_error)
+  return 0;
+}
+
+LIBDE265_API de265_error en265_encode(en265_encoder_context* e)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+
+  // Encode while picbuf reports more frames; the first non-OK result is returned at once.
+  for (;;) {
+    if (!enc->picbuf.have_more_frames_to_encode()) break;
+    const de265_error status = enc->encode_picture_from_input_buffer();
+    if (status != DE265_OK) return status;
+  }
+  return DE265_OK;
+}
+
+LIBDE265_API enum en265_encoder_state en265_get_encoder_state(en265_encoder_context* e)
+{
+  // TODO: not implemented yet -- always reports EN265_STATE_IDLE; 'e' is not inspected
+  return EN265_STATE_IDLE;
+}
+
+LIBDE265_API struct en265_packet* en265_get_packet(en265_encoder_context* e, int timeout_ms)
+{
+  assert(e);
+  encoder_context* const enc = (encoder_context*)e;
+
+  assert(timeout_ms==0); // TODO: blocking not implemented yet
+
+  // Nothing queued: return immediately instead of waiting.
+  if (enc->output_packets.empty()) {
+    return NULL;
+  }
+
+  // Detach the front packet from the queue and hand it to the caller.
+  en265_packet* const head = enc->output_packets.front();
+  enc->output_packets.pop_front();
+  return head;
+}
+
+LIBDE265_API void en265_free_packet(en265_encoder_context* e, struct en265_packet* pck)
+{
+  assert(e);
+  encoder_context* ectx = (encoder_context*)e;
+
+  // Freeing a NULL packet is a no-op (en265_get_packet() returns NULL on an empty queue).
+  if (pck == NULL) return;
+
+  // Do not delete the images here. They are owned by the EncPicBuf.
+  // With frame_number < 0 nothing is marked or released.
+  if (pck->frame_number >= 0) {
+    ectx->mark_image_is_outputted(pck->frame_number);
+    ectx->release_input_image(pck->frame_number);
+  }
+
+  delete[] pck->data;
+  delete   pck;
+}
+
+LIBDE265_API int en265_number_of_queued_packets(en265_encoder_context* e)
+{
+  assert(e);
+
+  // Number of packets currently held in output_packets (converted to int on return).
+  return ((encoder_context*)e)->output_packets.size();
+}
+
diff --git a/libde265/en265.h b/libde265/en265.h
new file mode 100644
index 0000000..b7ae0eb
--- /dev/null
+++ b/libde265/en265.h
@@ -0,0 +1,216 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
+ *
+ * Authors: Dirk Farin <farin@struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EN265_H
+#define EN265_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <libde265/de265.h>
+
+
+// ========== encoder context ==========
+
+typedef void en265_encoder_context; // private structure
+
+/* Get a new encoder context. Must be freed with en265_free_encoder(). */
+LIBDE265_API en265_encoder_context* en265_new_encoder(void);
+
+/* Free encoder context. May only be called once on a context. */
+LIBDE265_API de265_error en265_free_encoder(en265_encoder_context*);
+
+/* The alloc_userdata pointer will be given to the release_func(). */
+LIBDE265_API void en265_set_image_release_function(en265_encoder_context*,
+                                                   void (*release_func)(en265_encoder_context*,
+                                                                        struct de265_image*,
+                                                                        void* userdata),
+                                                   void* alloc_userdata);
+
+// ========== encoder parameters ==========
+
+LIBDE265_API de265_error en265_set_parameter_bool(en265_encoder_context*,
+                                                  const char* parametername,int value);
+LIBDE265_API de265_error en265_set_parameter_int(en265_encoder_context*,
+                                                 const char* parametername,int value);
+LIBDE265_API de265_error en265_set_parameter_string(en265_encoder_context*,
+                                                    const char* parametername,const char* value);
+LIBDE265_API de265_error en265_set_parameter_choice(en265_encoder_context*,
+                                                    const char* parametername,const char* value);
+
+
+LIBDE265_API const char** en265_list_parameters(en265_encoder_context*);
+
+enum en265_parameter_type {
+  en265_parameter_bool,
+  en265_parameter_int,
+  en265_parameter_string,
+  en265_parameter_choice
+};
+
+LIBDE265_API enum en265_parameter_type en265_get_parameter_type(en265_encoder_context*,
+                                                                const char* parametername);
+
+LIBDE265_API const char** en265_list_parameter_choices(en265_encoder_context*,
+                                                       const char* parametername);
+
+
+// --- convenience functions for command-line parameters ---
+
+LIBDE265_API de265_error en265_parse_command_line_parameters(en265_encoder_context*,
+                                                             int* argc, char** argv);
+LIBDE265_API void en265_show_parameters(en265_encoder_context*);
+
+
+
+// ========== encoding loop ==========
+
+LIBDE265_API de265_error en265_start_encoder(en265_encoder_context*, int number_of_threads);
+
+// If we have provided our own memory release function, no image memory will be allocated.
+LIBDE265_API struct de265_image* en265_allocate_image(en265_encoder_context*,
+                                                      int width, int height,
+                                                      enum de265_chroma chroma,
+                                                      de265_PTS pts, void* image_userdata);
+
+LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx,
+                                           void* inputdata, int inputstride, void *userdata);
+LIBDE265_API void de265_free_image_plane(struct de265_image* img, int cIdx);
+
+
+// Request a specification of the image memory layout for an image of the specified dimensions.
+LIBDE265_API void en265_get_image_spec(en265_encoder_context*,
+                                       int width, int height, enum de265_chroma chroma,
+                                       struct de265_image_spec* out_spec);
+
+// Image memory layout specification for an image returned by en265_allocate_image().
+/* TODO: do we need this?
+LIBDE265_API void de265_get_image_spec_from_image(de265_image* img, struct de265_image_spec* spec);
+*/
+
+
+LIBDE265_API de265_error en265_push_image(en265_encoder_context*,
+                                          struct de265_image*); // non-blocking
+
+LIBDE265_API de265_error en265_push_eof(en265_encoder_context*);
+
+// block when there are more than max_pending_images in the input queue
+LIBDE265_API de265_error en265_block_on_input_queue_length(en265_encoder_context*,
+                                                           int max_pending_images,
+                                                           int timeout_ms);
+
+LIBDE265_API de265_error en265_trim_input_queue(en265_encoder_context*, int max_pending_images);
+
+LIBDE265_API int  en265_current_input_queue_length(en265_encoder_context*);
+
+// Run encoder in main thread. Only use this when not using background threads.
+LIBDE265_API de265_error en265_encode(en265_encoder_context*);
+
+enum en265_encoder_state
+{
+  EN265_STATE_IDLE,
+  EN265_STATE_WAITING_FOR_INPUT,
+  EN265_STATE_WORKING,
+  EN265_STATE_OUTPUT_QUEUE_FULL,
+  EN265_STATE_EOS
+};
+
+
+LIBDE265_API enum en265_encoder_state en265_get_encoder_state(en265_encoder_context*);
+
+
+enum en265_packet_content_type {
+  EN265_PACKET_VPS,
+  EN265_PACKET_SPS,
+  EN265_PACKET_PPS,
+  EN265_PACKET_SEI,
+  EN265_PACKET_SLICE,
+  EN265_PACKET_SKIPPED_IMAGE
+};
+
+
+enum en265_nal_unit_type {
+  EN265_NUT_TRAIL_N = 0,
+  EN265_NUT_TRAIL_R = 1,
+  EN265_NUT_TSA_N   = 2,
+  EN265_NUT_TSA_R   = 3,
+  EN265_NUT_STSA_N  = 4,
+  EN265_NUT_STSA_R  = 5,
+  EN265_NUT_RADL_N  = 6,
+  EN265_NUT_RADL_R  = 7,
+  EN265_NUT_RASL_N  = 8,
+  EN265_NUT_RASL_R  = 9,
+  EN265_NUT_BLA_W_LP  = 16,
+  EN265_NUT_BLA_W_RADL= 17,
+  EN265_NUT_BLA_N_LP  = 18,
+  EN265_NUT_IDR_W_RADL= 19,
+  EN265_NUT_IDR_N_LP  = 20,
+  EN265_NUT_CRA       = 21,
+  EN265_NUT_VPS   =    32,
+  EN265_NUT_SPS   =    33,
+  EN265_NUT_PPS   =    34,
+  EN265_NUT_AUD   =    35,
+  EN265_NUT_EOS   =    36,
+  EN265_NUT_EOB   =    37,
+  EN265_NUT_FD    =    38,
+  EN265_NUT_PREFIX_SEI = 39,
+  EN265_NUT_SUFFIX_SEI = 40
+};
+
+
+struct en265_packet
+{
+  int version; // currently: 1
+
+  const uint8_t* data; // payload buffer; released with delete[] by en265_free_packet()
+  int   length;
+
+  int  frame_number; // en265_free_packet() marks/releases the picture only when this is >= 0
+
+  enum en265_packet_content_type content_type;
+  unsigned char complete_picture : 1; // unsigned: a plain 'char' bit-field may be signed,
+  unsigned char final_slice      : 1; // in which case a set flag would read back as -1
+  unsigned char dependent_slice  : 1;
+
+  enum en265_nal_unit_type nal_unit_type;
+  unsigned char nuh_layer_id;
+  unsigned char nuh_temporal_id;
+
+  en265_encoder_context* encoder_context;
+
+  const struct de265_image* input_image;
+  const struct de265_image* reconstruction;
+};
+
+// timeout_ms - timeout in milliseconds. 0 - do not block (return immediately), -1 - block forever
+LIBDE265_API struct en265_packet* en265_get_packet(en265_encoder_context*, int timeout_ms);
+LIBDE265_API void en265_free_packet(en265_encoder_context*, struct en265_packet*);
+
+LIBDE265_API int en265_number_of_queued_packets(en265_encoder_context*);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/libde265/encoder/CMakeLists.txt b/libde265/encoder/CMakeLists.txt
new file mode 100644
index 0000000..6bf37d3
--- /dev/null
+++ b/libde265/encoder/CMakeLists.txt
@@ -0,0 +1,18 @@
+set (encoder_sources 
+  analyze.cc analyze.h
+  encode.h encode.cc
+  encoder-params.h encoder-params.cc
+  encoder-context.h encoder-context.cc
+  encpicbuf.h encpicbuf.cc
+  sop.h sop.cc
+)
+
+add_library(encoder STATIC ${encoder_sources})
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  SET_TARGET_PROPERTIES(encoder PROPERTIES COMPILE_FLAGS "-fPIC")
+endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+
+add_subdirectory (algo)
+
+target_link_libraries(encoder algo)
diff --git a/libde265/encoder/Makefile.am b/libde265/encoder/Makefile.am
new file mode 100644
index 0000000..cb06284
--- /dev/null
+++ b/libde265/encoder/Makefile.am
@@ -0,0 +1,16 @@
+noinst_LTLIBRARIES = libde265_encoder.la
+
+libde265_encoder_la_CXXFLAGS = -I..
+libde265_encoder_la_SOURCES = \
+  analyze.cc analyze.h \
+  encode.h encode.cc \
+  encoder-params.h encoder-params.cc \
+  encoder-context.h encoder-context.cc \
+  encpicbuf.h encpicbuf.cc \
+  sop.h sop.cc
+
+SUBDIRS=algo
+libde265_encoder_la_LIBADD = algo/libde265_encoder_algo.la
+
+EXTRA_DIST = \
+  CMakeLists.txt
diff --git a/libde265/x86/Makefile.in b/libde265/encoder/Makefile.in
similarity index 57%
copy from libde265/x86/Makefile.in
copy to libde265/encoder/Makefile.in
index c982db6..693f212 100644
--- a/libde265/x86/Makefile.in
+++ b/libde265/encoder/Makefile.in
@@ -79,18 +79,17 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-@HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY
-@HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY
-subdir = libde265/x86
+subdir = libde265/encoder
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -98,24 +97,20 @@ CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
-libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la
-am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo
-libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS)
+libde265_encoder_la_DEPENDENCIES = algo/libde265_encoder_algo.la
+am_libde265_encoder_la_OBJECTS = libde265_encoder_la-analyze.lo \
+	libde265_encoder_la-encode.lo \
+	libde265_encoder_la-encoder-params.lo \
+	libde265_encoder_la-encoder-context.lo \
+	libde265_encoder_la-encpicbuf.lo libde265_encoder_la-sop.lo
+libde265_encoder_la_OBJECTS = $(am_libde265_encoder_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
 am__v_lt_0 = --silent
 am__v_lt_1 = 
-libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+libde265_encoder_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
-libde265_x86_sse_la_LIBADD =
-am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \
-	libde265_x86_sse_la-sse-dct.lo
-libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS)
-libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -169,14 +164,29 @@ AM_V_CCLD = $(am__v_CCLD_@AM_V@)
 am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo "  CCLD    " $@;
 am__v_CCLD_1 = 
-SOURCES = $(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES)
-DIST_SOURCES = $(libde265_x86_la_SOURCES) \
-	$(libde265_x86_sse_la_SOURCES)
+SOURCES = $(libde265_encoder_la_SOURCES)
+DIST_SOURCES = $(libde265_encoder_la_SOURCES)
+RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
+	ctags-recursive dvi-recursive html-recursive info-recursive \
+	install-data-recursive install-dvi-recursive \
+	install-exec-recursive install-html-recursive \
+	install-info-recursive install-pdf-recursive \
+	install-ps-recursive install-recursive installcheck-recursive \
+	installdirs-recursive pdf-recursive ps-recursive \
+	tags-recursive uninstall-recursive
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
     *) (install-info --version) >/dev/null 2>&1;; \
   esac
+RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
+  distclean-recursive maintainer-clean-recursive
+am__recursive_targets = \
+  $(RECURSIVE_TARGETS) \
+  $(RECURSIVE_CLEAN_TARGETS) \
+  $(am__extra_recursive_targets)
+AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
+	distdir
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
 # Read a list of newline-separated strings from the standard input,
 # and print each of them once, without duplicates.  Input order is
@@ -196,8 +206,35 @@ am__define_uniq_tagged_files = \
   done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
+DIST_SUBDIRS = $(SUBDIRS)
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+am__relativize = \
+  dir0=`pwd`; \
+  sed_first='s,^\([^/]*\)/.*$$,\1,'; \
+  sed_rest='s,^[^/]*/*,,'; \
+  sed_last='s,^.*/\([^/]*\)$$,\1,'; \
+  sed_butlast='s,/*[^/]*$$,,'; \
+  while test -n "$$dir1"; do \
+    first=`echo "$$dir1" | sed -e "$$sed_first"`; \
+    if test "$$first" != "."; then \
+      if test "$$first" = ".."; then \
+        dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \
+        dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \
+      else \
+        first2=`echo "$$dir2" | sed -e "$$sed_first"`; \
+        if test "$$first2" = "$$first"; then \
+          dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \
+        else \
+          dir2="../$$dir2"; \
+        fi; \
+        dir0="$$dir0"/"$$first"; \
+      fi; \
+    fi; \
+    dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \
+  done; \
+  reldir="$$dir2"
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +243,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +267,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -339,16 +378,22 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-noinst_LTLIBRARIES = libde265_x86.la  libde265_x86_sse.la
-libde265_x86_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) $(am__append_1)
-libde265_x86_la_SOURCES = sse.cc sse.h
-libde265_x86_la_LIBADD = libde265_x86_sse.la
-
-# SSE4 specific functions
-libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I.. $(CFLAG_VISIBILITY) \
-	$(am__append_2)
-libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
-all: all-am
+noinst_LTLIBRARIES = libde265_encoder.la
+libde265_encoder_la_CXXFLAGS = -I..
+libde265_encoder_la_SOURCES = \
+  analyze.cc analyze.h \
+  encode.h encode.cc \
+  encoder-params.h encoder-params.cc \
+  encoder-context.h encoder-context.cc \
+  encpicbuf.h encpicbuf.cc \
+  sop.h sop.cc
+
+SUBDIRS = algo
+libde265_encoder_la_LIBADD = algo/libde265_encoder_algo.la
+EXTRA_DIST = \
+  CMakeLists.txt
+
+all: all-recursive
 
 .SUFFIXES:
 .SUFFIXES: .cc .lo .o .obj
@@ -361,9 +406,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/encoder/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu libde265/x86/Makefile
+	  $(AUTOMAKE) --gnu libde265/encoder/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -394,11 +439,8 @@ clean-noinstLTLIBRARIES:
 	  rm -f $${locs}; \
 	}
 
-libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_la_LINK)  $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS)
-
-libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_sse_la_LINK)  $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS)
+libde265_encoder.la: $(libde265_encoder_la_OBJECTS) $(libde265_encoder_la_DEPENDENCIES) $(EXTRA_libde265_encoder_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libde265_encoder_la_LINK)  $(libde265_encoder_la_OBJECTS) $(libde265_encoder_la_LIBADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -406,9 +448,12 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_la-sse.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-analyze.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-encode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-encoder-context.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-encoder-params.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-encpicbuf.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_la-sop.Plo@am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -431,26 +476,47 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-libde265_x86_la-sse.lo: sse.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_la-analyze.lo: analyze.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-analyze.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-analyze.Tpo -c -o libde265_encoder_la-analyze.lo `test -f 'analyze.cc' || echo '$(srcdir)/'`analyze.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-analyze.Tpo $(DEPDIR)/libde265_encoder_la-analyze.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='analyze.cc' object='libde265_encoder_la-analyze.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-analyze.lo `test -f 'analyze.cc' || echo '$(srcdir)/'`analyze.cc
+
+libde265_encoder_la-encode.lo: encode.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-encode.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-encode.Tpo -c -o libde265_encoder_la-encode.lo `test -f 'encode.cc' || echo '$(srcdir)/'`encode.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-encode.Tpo $(DEPDIR)/libde265_encoder_la-encode.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='encode.cc' object='libde265_encoder_la-encode.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-encode.lo `test -f 'encode.cc' || echo '$(srcdir)/'`encode.cc
+
+libde265_encoder_la-encoder-params.lo: encoder-params.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-encoder-params.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-encoder-params.Tpo -c -o libde265_encoder_la-encoder-params.lo `test -f 'encoder-params.cc' || echo '$(srcdir)/'`encoder-params.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-encoder-params.Tpo $(DEPDIR)/libde265_encoder_la-encoder-params.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='encoder-params.cc' object='libde265_encoder_la-encoder-params.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-encoder-params.lo `test -f 'encoder-params.cc' || echo '$(srcdir)/'`encoder-params.cc
+
+libde265_encoder_la-encoder-context.lo: encoder-context.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-encoder-context.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-encoder-context.Tpo -c -o libde265_encoder_la-encoder-context.lo `test -f 'encoder-context.cc' || echo '$(srcdir)/'`encoder-context.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-encoder-context.Tpo $(DEPDIR)/libde265_encoder_la-encoder-context.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='encoder-context.cc' object='libde265_encoder_la-encoder-context.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-encoder-context.lo `test -f 'encoder-context.cc' || echo '$(srcdir)/'`encoder-context.cc
 
-libde265_x86_sse_la-sse-motion.lo: sse-motion.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_la-encpicbuf.lo: encpicbuf.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-encpicbuf.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-encpicbuf.Tpo -c -o libde265_encoder_la-encpicbuf.lo `test -f 'encpicbuf.cc' || echo '$(srcdir)/'`encpicbuf.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-encpicbuf.Tpo $(DEPDIR)/libde265_encoder_la-encpicbuf.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='encpicbuf.cc' object='libde265_encoder_la-encpicbuf.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-encpicbuf.lo `test -f 'encpicbuf.cc' || echo '$(srcdir)/'`encpicbuf.cc
 
-libde265_x86_sse_la-sse-dct.lo: sse-dct.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_la-sop.lo: sop.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_la-sop.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_la-sop.Tpo -c -o libde265_encoder_la-sop.lo `test -f 'sop.cc' || echo '$(srcdir)/'`sop.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_la-sop.Tpo $(DEPDIR)/libde265_encoder_la-sop.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sop.cc' object='libde265_encoder_la-sop.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_la-sop.lo `test -f 'sop.cc' || echo '$(srcdir)/'`sop.cc
 
 mostlyclean-libtool:
 	-rm -f *.lo
@@ -458,14 +524,61 @@ mostlyclean-libtool:
 clean-libtool:
 	-rm -rf .libs _libs
 
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run 'make' without going through this Makefile.
+# To change the values of 'make' variables: instead of editing Makefiles,
+# (1) if the variable is set in 'config.status', edit 'config.status'
+#     (which will cause the Makefiles to be regenerated when you run 'make');
+# (2) otherwise, pass the desired values on the 'make' command line.
+$(am__recursive_targets):
+	@fail=; \
+	if $(am__make_keepgoing); then \
+	  failcom='fail=yes'; \
+	else \
+	  failcom='exit 1'; \
+	fi; \
+	dot_seen=no; \
+	target=`echo $@ | sed s/-recursive//`; \
+	case "$@" in \
+	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
+	  *) list='$(SUBDIRS)' ;; \
+	esac; \
+	for subdir in $$list; do \
+	  echo "Making $$target in $$subdir"; \
+	  if test "$$subdir" = "."; then \
+	    dot_seen=yes; \
+	    local_target="$$target-am"; \
+	  else \
+	    local_target="$$target"; \
+	  fi; \
+	  ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+	  || eval $$failcom; \
+	done; \
+	if test "$$dot_seen" = "no"; then \
+	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+	fi; test -z "$$fail"
+
 ID: $(am__tagged_files)
 	$(am__define_uniq_tagged_files); mkid -fID $$unique
-tags: tags-am
+tags: tags-recursive
 TAGS: tags
 
 tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
+	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
+	  include_option=--etags-include; \
+	  empty_fix=.; \
+	else \
+	  include_option=--include; \
+	  empty_fix=; \
+	fi; \
+	list='$(SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    test ! -f $$subdir/TAGS || \
+	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
+	  fi; \
+	done; \
 	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
@@ -478,7 +591,7 @@ tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	      $$unique; \
 	  fi; \
 	fi
-ctags: ctags-am
+ctags: ctags-recursive
 
 CTAGS: ctags
 ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
@@ -491,7 +604,7 @@ GTAGS:
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
-cscopelist: cscopelist-am
+cscopelist: cscopelist-recursive
 
 cscopelist-am: $(am__tagged_files)
 	list='$(am__tagged_files)'; \
@@ -540,19 +653,45 @@ distdir: $(DISTFILES)
 	    || exit 1; \
 	  fi; \
 	done
+	@list='$(DIST_SUBDIRS)'; for subdir in $$list; do \
+	  if test "$$subdir" = .; then :; else \
+	    $(am__make_dryrun) \
+	      || test -d "$(distdir)/$$subdir" \
+	      || $(MKDIR_P) "$(distdir)/$$subdir" \
+	      || exit 1; \
+	    dir1=$$subdir; dir2="$(distdir)/$$subdir"; \
+	    $(am__relativize); \
+	    new_distdir=$$reldir; \
+	    dir1=$$subdir; dir2="$(top_distdir)"; \
+	    $(am__relativize); \
+	    new_top_distdir=$$reldir; \
+	    echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \
+	    echo "     am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \
+	    ($(am__cd) $$subdir && \
+	      $(MAKE) $(AM_MAKEFLAGS) \
+	        top_distdir="$$new_top_distdir" \
+	        distdir="$$new_distdir" \
+		am__remove_distdir=: \
+		am__skip_length_check=: \
+		am__skip_mode_fix=: \
+	        distdir) \
+	      || exit 1; \
+	  fi; \
+	done
 check-am: all-am
-check: check-am
+check: check-recursive
 all-am: Makefile $(LTLIBRARIES)
-installdirs:
-install: install-am
-install-exec: install-exec-am
-install-data: install-data-am
-uninstall: uninstall-am
+installdirs: installdirs-recursive
+installdirs-am:
+install: install-recursive
+install-exec: install-exec-recursive
+install-data: install-data-recursive
+uninstall: uninstall-recursive
 
 install-am: all-am
 	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
 
-installcheck: installcheck-am
+installcheck: installcheck-recursive
 install-strip:
 	if test -z '$(STRIP)'; then \
 	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
@@ -574,92 +713,93 @@ distclean-generic:
 maintainer-clean-generic:
 	@echo "This command is intended for maintainers to use"
 	@echo "it deletes files that may require special tools to rebuild."
-clean: clean-am
+clean: clean-recursive
 
 clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
 	mostlyclean-am
 
-distclean: distclean-am
+distclean: distclean-recursive
 	-rm -rf ./$(DEPDIR)
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-tags
 
-dvi: dvi-am
+dvi: dvi-recursive
 
 dvi-am:
 
-html: html-am
+html: html-recursive
 
 html-am:
 
-info: info-am
+info: info-recursive
 
 info-am:
 
 install-data-am:
 
-install-dvi: install-dvi-am
+install-dvi: install-dvi-recursive
 
 install-dvi-am:
 
 install-exec-am:
 
-install-html: install-html-am
+install-html: install-html-recursive
 
 install-html-am:
 
-install-info: install-info-am
+install-info: install-info-recursive
 
 install-info-am:
 
 install-man:
 
-install-pdf: install-pdf-am
+install-pdf: install-pdf-recursive
 
 install-pdf-am:
 
-install-ps: install-ps-am
+install-ps: install-ps-recursive
 
 install-ps-am:
 
 installcheck-am:
 
-maintainer-clean: maintainer-clean-am
+maintainer-clean: maintainer-clean-recursive
 	-rm -rf ./$(DEPDIR)
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
-mostlyclean: mostlyclean-am
+mostlyclean: mostlyclean-recursive
 
 mostlyclean-am: mostlyclean-compile mostlyclean-generic \
 	mostlyclean-libtool
 
-pdf: pdf-am
+pdf: pdf-recursive
 
 pdf-am:
 
-ps: ps-am
+ps: ps-recursive
 
 ps-am:
 
 uninstall-am:
 
-.MAKE: install-am install-strip
-
-.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \
-	ctags-am distclean distclean-compile distclean-generic \
-	distclean-libtool distclean-tags distdir dvi dvi-am html \
-	html-am info info-am install install-am install-data \
-	install-data-am install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs maintainer-clean \
-	maintainer-clean-generic mostlyclean mostlyclean-compile \
-	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags tags-am uninstall uninstall-am
+.MAKE: $(am__recursive_targets) install-am install-strip
+
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am check \
+	check-am clean clean-generic clean-libtool \
+	clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \
+	distclean-compile distclean-generic distclean-libtool \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am install-dvi \
+	install-dvi-am install-exec install-exec-am install-html \
+	install-html-am install-info install-info-am install-man \
+	install-pdf install-pdf-am install-ps install-ps-am \
+	install-strip installcheck installcheck-am installdirs \
+	installdirs-am maintainer-clean maintainer-clean-generic \
+	mostlyclean mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
+	uninstall-am
 
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
diff --git a/libde265/encoder/algo/CMakeLists.txt b/libde265/encoder/algo/CMakeLists.txt
new file mode 100644
index 0000000..48c7719
--- /dev/null
+++ b/libde265/encoder/algo/CMakeLists.txt
@@ -0,0 +1,20 @@
+set (algo_sources
+  algo.h algo.cc
+  coding-options.h coding-options.cc
+  ctb-qscale.h ctb-qscale.cc
+  cb-split.h cb-split.cc
+  cb-intrapartmode.h cb-intrapartmode.cc
+  cb-interpartmode.h cb-interpartmode.cc
+  cb-skip.h cb-skip.cc
+  cb-intra-inter.h cb-intra-inter.cc
+  cb-mergeindex.h cb-mergeindex.cc
+  tb-split.h tb-split.cc
+  tb-intrapredmode.h tb-intrapredmode.cc
+  pb-mv.h pb-mv.cc
+)
+
+add_library(algo STATIC ${algo_sources})
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  SET_TARGET_PROPERTIES(algo PROPERTIES COMPILE_FLAGS "-fPIC")
+endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
diff --git a/libde265/encoder/algo/Makefile.am b/libde265/encoder/algo/Makefile.am
new file mode 100644
index 0000000..e618a18
--- /dev/null
+++ b/libde265/encoder/algo/Makefile.am
@@ -0,0 +1,19 @@
+noinst_LTLIBRARIES = libde265_encoder_algo.la
+
+libde265_encoder_algo_la_CXXFLAGS = -I../..
+libde265_encoder_algo_la_SOURCES = \
+  algo.h algo.cc \
+  coding-options.h coding-options.cc \
+  ctb-qscale.h ctb-qscale.cc \
+  cb-split.h cb-split.cc \
+  cb-intrapartmode.h cb-intrapartmode.cc \
+  cb-interpartmode.h cb-interpartmode.cc \
+  cb-skip.h cb-skip.cc \
+  cb-intra-inter.h cb-intra-inter.cc \
+  cb-mergeindex.h cb-mergeindex.cc \
+  tb-split.h tb-split.cc \
+  tb-intrapredmode.h tb-intrapredmode.cc \
+  pb-mv.h pb-mv.cc
+
+EXTRA_DIST = \
+  CMakeLists.txt
diff --git a/libde265/x86/Makefile.in b/libde265/encoder/algo/Makefile.in
similarity index 53%
copy from libde265/x86/Makefile.in
copy to libde265/encoder/algo/Makefile.in
index c982db6..85e2fc6 100644
--- a/libde265/x86/Makefile.in
+++ b/libde265/encoder/algo/Makefile.in
@@ -79,18 +79,17 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-@HAVE_VISIBILITY_TRUE@am__append_1 = -DHAVE_VISIBILITY
-@HAVE_VISIBILITY_TRUE@am__append_2 = -DHAVE_VISIBILITY
-subdir = libde265/x86
+subdir = libde265/encoder/algo
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -98,24 +97,29 @@ CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 LTLIBRARIES = $(noinst_LTLIBRARIES)
-libde265_x86_la_DEPENDENCIES = libde265_x86_sse.la
-am_libde265_x86_la_OBJECTS = libde265_x86_la-sse.lo
-libde265_x86_la_OBJECTS = $(am_libde265_x86_la_OBJECTS)
+libde265_encoder_algo_la_LIBADD =
+am_libde265_encoder_algo_la_OBJECTS =  \
+	libde265_encoder_algo_la-algo.lo \
+	libde265_encoder_algo_la-coding-options.lo \
+	libde265_encoder_algo_la-ctb-qscale.lo \
+	libde265_encoder_algo_la-cb-split.lo \
+	libde265_encoder_algo_la-cb-intrapartmode.lo \
+	libde265_encoder_algo_la-cb-interpartmode.lo \
+	libde265_encoder_algo_la-cb-skip.lo \
+	libde265_encoder_algo_la-cb-intra-inter.lo \
+	libde265_encoder_algo_la-cb-mergeindex.lo \
+	libde265_encoder_algo_la-tb-split.lo \
+	libde265_encoder_algo_la-tb-intrapredmode.lo \
+	libde265_encoder_algo_la-pb-mv.lo
+libde265_encoder_algo_la_OBJECTS =  \
+	$(am_libde265_encoder_algo_la_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
 am__v_lt_0 = --silent
 am__v_lt_1 = 
-libde265_x86_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+libde265_encoder_algo_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
-libde265_x86_sse_la_LIBADD =
-am_libde265_x86_sse_la_OBJECTS = libde265_x86_sse_la-sse-motion.lo \
-	libde265_x86_sse_la-sse-dct.lo
-libde265_x86_sse_la_OBJECTS = $(am_libde265_x86_sse_la_OBJECTS)
-libde265_x86_sse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -169,9 +173,8 @@ AM_V_CCLD = $(am__v_CCLD_@AM_V@)
 am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
 am__v_CCLD_0 = @echo "  CCLD    " $@;
 am__v_CCLD_1 = 
-SOURCES = $(libde265_x86_la_SOURCES) $(libde265_x86_sse_la_SOURCES)
-DIST_SOURCES = $(libde265_x86_la_SOURCES) \
-	$(libde265_x86_sse_la_SOURCES)
+SOURCES = $(libde265_encoder_algo_la_SOURCES)
+DIST_SOURCES = $(libde265_encoder_algo_la_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -198,6 +201,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +210,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +234,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -339,15 +345,25 @@ target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
-noinst_LTLIBRARIES = libde265_x86.la  libde265_x86_sse.la
-libde265_x86_la_CXXFLAGS = -I.. $(CFLAG_VISIBILITY) $(am__append_1)
-libde265_x86_la_SOURCES = sse.cc sse.h
-libde265_x86_la_LIBADD = libde265_x86_sse.la
-
-# SSE4 specific functions
-libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I.. $(CFLAG_VISIBILITY) \
-	$(am__append_2)
-libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
+noinst_LTLIBRARIES = libde265_encoder_algo.la
+libde265_encoder_algo_la_CXXFLAGS = -I../..
+libde265_encoder_algo_la_SOURCES = \
+  algo.h algo.cc \
+  coding-options.h coding-options.cc \
+  ctb-qscale.h ctb-qscale.cc \
+  cb-split.h cb-split.cc \
+  cb-intrapartmode.h cb-intrapartmode.cc \
+  cb-interpartmode.h cb-interpartmode.cc \
+  cb-skip.h cb-skip.cc \
+  cb-intra-inter.h cb-intra-inter.cc \
+  cb-mergeindex.h cb-mergeindex.cc \
+  tb-split.h tb-split.cc \
+  tb-intrapredmode.h tb-intrapredmode.cc \
+  pb-mv.h pb-mv.cc
+
+EXTRA_DIST = \
+  CMakeLists.txt
+
 all: all-am
 
 .SUFFIXES:
@@ -361,9 +377,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/x86/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu libde265/encoder/algo/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu libde265/x86/Makefile
+	  $(AUTOMAKE) --gnu libde265/encoder/algo/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -394,11 +410,8 @@ clean-noinstLTLIBRARIES:
 	  rm -f $${locs}; \
 	}
 
-libde265_x86.la: $(libde265_x86_la_OBJECTS) $(libde265_x86_la_DEPENDENCIES) $(EXTRA_libde265_x86_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_la_LINK)  $(libde265_x86_la_OBJECTS) $(libde265_x86_la_LIBADD) $(LIBS)
-
-libde265_x86_sse.la: $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_DEPENDENCIES) $(EXTRA_libde265_x86_sse_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libde265_x86_sse_la_LINK)  $(libde265_x86_sse_la_OBJECTS) $(libde265_x86_sse_la_LIBADD) $(LIBS)
+libde265_encoder_algo.la: $(libde265_encoder_algo_la_OBJECTS) $(libde265_encoder_algo_la_DEPENDENCIES) $(EXTRA_libde265_encoder_algo_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libde265_encoder_algo_la_LINK)  $(libde265_encoder_algo_la_OBJECTS) $(libde265_encoder_algo_la_LIBADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -406,9 +419,18 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_la-sse.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-algo.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-interpartmode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-intra-inter.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-intrapartmode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-mergeindex.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-skip.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-cb-split.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-coding-options.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-ctb-qscale.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-pb-mv.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-tb-intrapredmode.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libde265_encoder_algo_la-tb-split.Plo@am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -431,26 +453,89 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-libde265_x86_la-sse.lo: sse.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_la-sse.lo -MD -MP -MF $(DEPDIR)/libde265_x86_la-sse.Tpo -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_la-sse.Tpo $(DEPDIR)/libde265_x86_la-sse.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse.cc' object='libde265_x86_la-sse.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_algo_la-algo.lo: algo.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-algo.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-algo.Tpo -c -o libde265_encoder_algo_la-algo.lo `test -f 'algo.cc' || echo '$(srcdir)/'`algo.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-algo.Tpo $(DEPDIR)/libde265_encoder_algo_la-algo.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='algo.cc' object='libde265_encoder_algo_la-algo.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-algo.lo `test -f 'algo.cc' || echo '$(srcdir)/'`algo.cc
+
+libde265_encoder_algo_la-coding-options.lo: coding-options.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-coding-options.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-coding-options.Tpo -c -o libde265_encoder_algo_la-coding-options.lo `test -f 'coding-options.cc' || echo '$(srcdir)/'`coding-options.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-coding-options.Tpo $(DEPDIR)/libde265_encoder_algo_la-coding-options.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='coding-options.cc' object='libde265_encoder_algo_la-coding-options.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-coding-options.lo `test -f 'coding-options.cc' || echo '$(srcdir)/'`coding-options.cc
+
+libde265_encoder_algo_la-ctb-qscale.lo: ctb-qscale.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-ctb-qscale.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-ctb-qscale.Tpo -c -o libde265_encoder_algo_la-ctb-qscale.lo `test -f 'ctb-qscale.cc' || echo '$(srcdir)/'`ctb-qscale.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-ctb-qscale.Tpo $(DEPDIR)/libde265_encoder_algo_la-ctb-qscale.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='ctb-qscale.cc' object='libde265_encoder_algo_la-ctb-qscale.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-ctb-qscale.lo `test -f 'ctb-qscale.cc' || echo '$(srcdir)/'`ctb-qscale.cc
+
+libde265_encoder_algo_la-cb-split.lo: cb-split.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-split.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-split.Tpo -c -o libde265_encoder_algo_la-cb-split.lo `test -f 'cb-split.cc' || echo '$(srcdir)/'`cb-split.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-split.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-split.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-split.cc' object='libde265_encoder_algo_la-cb-split.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-split.lo `test -f 'cb-split.cc' || echo '$(srcdir)/'`cb-split.cc
+
+libde265_encoder_algo_la-cb-intrapartmode.lo: cb-intrapartmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-intrapartmode.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-intrapartmode.Tpo -c -o libde265_encoder_algo_la-cb-intrapartmode.lo `test -f 'cb-intrapartmode.cc' || echo '$(srcdir)/'`cb-intrapartmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-intrapartmode.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-intrapartmode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-intrapartmode.cc' object='libde265_encoder_algo_la-cb-intrapartmode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-intrapartmode.lo `test -f 'cb-intrapartmode.cc' || echo '$(srcdir)/'`cb-intrapartmode.cc
+
+libde265_encoder_algo_la-cb-interpartmode.lo: cb-interpartmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-interpartmode.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-interpartmode.Tpo -c -o libde265_encoder_algo_la-cb-interpartmode.lo `test -f 'cb-interpartmode.cc' || echo '$(srcdir)/'`cb-interpartmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-interpartmode.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-interpartmode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-interpartmode.cc' object='libde265_encoder_algo_la-cb-interpartmode.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-interpartmode.lo `test -f 'cb-interpartmode.cc' || echo '$(srcdir)/'`cb-interpartmode.cc
+
+libde265_encoder_algo_la-cb-skip.lo: cb-skip.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-skip.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-skip.Tpo -c -o libde265_encoder_algo_la-cb-skip.lo `test -f 'cb-skip.cc' || echo '$(srcdir)/'`cb-skip.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-skip.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-skip.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-skip.cc' object='libde265_encoder_algo_la-cb-skip.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-skip.lo `test -f 'cb-skip.cc' || echo '$(srcdir)/'`cb-skip.cc
+
+libde265_encoder_algo_la-cb-intra-inter.lo: cb-intra-inter.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-intra-inter.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-intra-inter.Tpo -c -o libde265_encoder_algo_la-cb-intra-inter.lo `test -f 'cb-intra-inter.cc' || echo '$(srcdir)/'`cb-intra-inter.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-intra-inter.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-intra-inter.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-intra-inter.cc' object='libde265_encoder_algo_la-cb-intra-inter.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-intra-inter.lo `test -f 'cb-intra-inter.cc' || echo '$(srcdir)/'`cb-intra-inter.cc
+
+libde265_encoder_algo_la-cb-mergeindex.lo: cb-mergeindex.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-cb-mergeindex.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-cb-mergeindex.Tpo -c -o libde265_encoder_algo_la-cb-mergeindex.lo `test -f 'cb-mergeindex.cc' || echo '$(srcdir)/'`cb-mergeindex.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-cb-mergeindex.Tpo $(DEPDIR)/libde265_encoder_algo_la-cb-mergeindex.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='cb-mergeindex.cc' object='libde265_encoder_algo_la-cb-mergeindex.lo' libtool=yes @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-cb-mergeindex.lo `test -f 'cb-mergeindex.cc' || echo '$(srcdir)/'`cb-mergeindex.cc
+
+libde265_encoder_algo_la-tb-split.lo: tb-split.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-tb-split.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-tb-split.Tpo -c -o libde265_encoder_algo_la-tb-split.lo `test -f 'tb-split.cc' || echo '$(srcdir)/'`tb-split.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-tb-split.Tpo $(DEPDIR)/libde265_encoder_algo_la-tb-split.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tb-split.cc' object='libde265_encoder_algo_la-tb-split.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_la-sse.lo `test -f 'sse.cc' || echo '$(srcdir)/'`sse.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-tb-split.lo `test -f 'tb-split.cc' || echo '$(srcdir)/'`tb-split.cc
 
-libde265_x86_sse_la-sse-motion.lo: sse-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-motion.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-motion.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-motion.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-motion.cc' object='libde265_x86_sse_la-sse-motion.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_algo_la-tb-intrapredmode.lo: tb-intrapredmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-tb-intrapredmode.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-tb-intrapredmode.Tpo -c -o libde265_encoder_algo_la-tb-intrapredmode.lo `test -f 'tb-intrapredmode.cc' || echo '$(srcdir)/'`tb-intrapredmode.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-tb-intrapredmode.Tpo $(DEPDIR)/libde265_encoder_algo_la-tb-intrapredmode.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tb-intrapredmode.cc' object='libde265_encoder_algo_la-tb-intrapredmode.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-motion.lo `test -f 'sse-motion.cc' || echo '$(srcdir)/'`sse-motion.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-tb-intrapredmode.lo `test -f 'tb-intrapredmode.cc' || echo '$(srcdir)/'`tb-intrapredmode.cc
 
-libde265_x86_sse_la-sse-dct.lo: sse-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_x86_sse_la-sse-dct.lo -MD -MP -MF $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_x86_sse_la-sse-dct.Tpo $(DEPDIR)/libde265_x86_sse_la-sse-dct.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sse-dct.cc' object='libde265_x86_sse_la-sse-dct.lo' libtool=yes @AMDEPBACKSLASH@
+libde265_encoder_algo_la-pb-mv.lo: pb-mv.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -MT libde265_encoder_algo_la-pb-mv.lo -MD -MP -MF $(DEPDIR)/libde265_encoder_algo_la-pb-mv.Tpo -c -o libde265_encoder_algo_la-pb-mv.lo `test -f 'pb-mv.cc' || echo '$(srcdir)/'`pb-mv.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libde265_encoder_algo_la-pb-mv.Tpo $(DEPDIR)/libde265_encoder_algo_la-pb-mv.Plo
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='pb-mv.cc' object='libde265_encoder_algo_la-pb-mv.lo' libtool=yes @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_x86_sse_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_x86_sse_la-sse-dct.lo `test -f 'sse-dct.cc' || echo '$(srcdir)/'`sse-dct.cc
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libde265_encoder_algo_la_CXXFLAGS) $(CXXFLAGS) -c -o libde265_encoder_algo_la-pb-mv.lo `test -f 'pb-mv.cc' || echo '$(srcdir)/'`pb-mv.cc
 
 mostlyclean-libtool:
 	-rm -f *.lo
diff --git a/libde265/x86/sse.h b/libde265/encoder/algo/algo.cc
similarity index 83%
copy from libde265/x86/sse.h
copy to libde265/encoder/algo/algo.cc
index d4663d0..463a61e 100644
--- a/libde265/x86/sse.h
+++ b/libde265/encoder/algo/algo.cc
@@ -2,6 +2,8 @@
  * H.265 video codec.
  * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
  *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
  * This file is part of libde265.
  *
  * libde265 is free software: you can redistribute it and/or modify
@@ -18,11 +20,5 @@
  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#ifndef DE265_SSE_H
-#define DE265_SSE_H
-
-#include "acceleration.h"
-
-void init_acceleration_functions_sse(struct acceleration_functions* accel);
-
-#endif
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/encoder-context.h"
diff --git a/libde265/encoder/algo/algo.h b/libde265/encoder/algo/algo.h
new file mode 100644
index 0000000..e0ad110
--- /dev/null
+++ b/libde265/encoder/algo/algo.h
@@ -0,0 +1,74 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ALGO_H
+#define ALGO_H
+
+#include "libde265/encoder/encode.h"
+
+
+/* When entering the next recursion level, it is assumed that
+   a valid CB structure is passed down. Ownership is transferred to
+   the new algorithm. That algorithm passes back a (possibly different)
+   CB structure that the first algorithm should use. The original CB
+   structure might have been deleted in the called algorithm.
+
+   The context_model_table passed down is at the current state.
+   When the algorithm returns, the state should represent the state
+   after running this algorithm.
+
+   When returning from the algorithm, it is also assumed that the
+   ectx->img content (reconstruction and metadata) represents the
+   current state. When the algorithm tries several variants, it
+   has to restore the state to the selected variant.
+ */
+
+class Algo_CB // abstract interface: one encoder algorithm step operating on an enc_cb
+{
+ public:
+  virtual ~Algo_CB() { }
+
+  /* Analyzes 'cb' and returns the enc_cb the caller should continue with.
+     The provided context_model_table can be modified and even released
+     in the function. On exit, it should be filled with an (optionally
+     new) context_model_table that represents the state after encoding
+     the syntax element. However, to speed up computation, it is also
+     allowed to not modify the context_model_table at all. */
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb) = 0;
+};
+
+
+class Algo_PB // abstract interface: algorithm step working on one part (PBidx) of an enc_cb
+{
+ public:
+  virtual ~Algo_PB() { }
+  // Returns the enc_cb to continue with; x,y,w,h presumably give the part's position and size — confirm.
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb,
+                          int PBidx, int x,int y,int w,int h) = 0;
+};
+
+
+#endif
diff --git a/libde265/encoder/algo/cb-interpartmode.cc b/libde265/encoder/algo/cb-interpartmode.cc
new file mode 100644
index 0000000..1124a42
--- /dev/null
+++ b/libde265/encoder/algo/cb-interpartmode.cc
@@ -0,0 +1,113 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-interpartmode.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+
+enc_cb* Algo_CB_InterPartMode::codeAllPBs(encoder_context* ectx,
+                                          context_model_table& ctxModel,
+                                          enc_cb* cb)
+{
+  // Runs the child PB algorithm on each partition selected by cb->PartMode.
+  const int x = cb->x; // position and size are read once up front, because
+  const int y = cb->y; // 'cb' is replaced by whatever each analyze() returns
+  const int log2Size = cb->log2Size;
+  const int w = 1<<log2Size; // full CB edge length
+  int s; // split size: half (w/2) or quarter (w/4) of the CB edge
+
+  switch (cb->PartMode) {
+  case PART_2Nx2N: // a single PB covering the whole CB
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x,y,w,w);
+    break;
+
+  case PART_NxN: // four square quadrants
+    s = 1<<(log2Size-1);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x  ,y  ,s,s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x+s,y  ,s,s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 2, x  ,y+s,s,s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 3, x+s,y+s,s,s);
+    break;
+
+  case PART_2NxN: // two full-width PBs, each half the height
+    s = 1<<(log2Size-1);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x,y  ,w,s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x,y+s,w,s);
+    break;
+
+  case PART_Nx2N: // two full-height PBs, each half the width
+    s = 1<<(log2Size-1);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x  ,y,s,w);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x+s,y,s,w);
+    break;
+
+  case PART_2NxnU: // full width; heights w/4 then 3w/4
+    s = 1<<(log2Size-2);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x,y  ,w,s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x,y+s,w,w-s);
+    break;
+
+  case PART_2NxnD: // full width; heights 3w/4 then w/4
+    s = 1<<(log2Size-2);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x,y    ,w,w-s);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x,y+w-s,w,s);
+    break;
+
+  case PART_nLx2N: // full height; widths w/4 then 3w/4
+    s = 1<<(log2Size-2);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x  ,y,s  ,w);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x+s,y,w-s,w);
+    break;
+
+  case PART_nRx2N: // full height; widths 3w/4 then w/4
+    s = 1<<(log2Size-2);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 0, x    ,y,w-s,w);
+    cb = mChildAlgo->analyze(ectx, ctxModel, cb, 1, x+w-s,y,s  ,w);
+    break;
+  }
+
+  return cb; // a PartMode not listed above leaves cb untouched
+}
+
+
+enc_cb* Algo_CB_InterPartMode_Fixed::analyze(encoder_context* ectx,
+                                             context_model_table& ctxModel,
+                                             enc_cb* cb)
+{
+  // No decision is made here: the partition mode comes straight from the
+  // configured parameter and is applied to the CB as-is.
+  const enum PartMode fixedMode = mParams.partMode();
+
+  // Record the mode in the CB itself and in ectx->img at the CB position.
+  cb->PartMode = fixedMode;
+  ectx->img->set_PartMode(cb->x, cb->y, fixedMode);
+
+  // Let the child algorithm process every PB of that partitioning.
+  enc_cb* const result = codeAllPBs(ectx, ctxModel, cb);
+  return result;
+}
diff --git a/libde265/encoder/algo/cb-interpartmode.h b/libde265/encoder/algo/cb-interpartmode.h
new file mode 100644
index 0000000..f47b36f
--- /dev/null
+++ b/libde265/encoder/algo/cb-interpartmode.h
@@ -0,0 +1,105 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_INTERPARTMODE_H
+#define CB_INTERPARTMODE_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+#include "libde265/encoder/algo/tb-split.h"
+#include "libde265/encoder/algo/cb-intrapartmode.h"
+
+
+// ========== CB inter PartMode decision ==========
+
+class Algo_CB_InterPartMode : public Algo_CB // common base of the inter PartMode algorithms
+{
+ public:
+  // Start with no child, so a missing setChildAlgo() is a NULL pointer, not an indeterminate one.
+  Algo_CB_InterPartMode() : mChildAlgo(NULL) { }
+  virtual ~Algo_CB_InterPartMode() { }
+  void setChildAlgo(Algo_PB* algo) { mChildAlgo = algo; }
+
+ protected:
+  Algo_PB* mChildAlgo; // run on every PB by codeAllPBs(); never deleted by this class
+
+  // Calls mChildAlgo->analyze() once per PB of cb->PartMode and returns the resulting CB.
+  enc_cb* codeAllPBs(encoder_context*, context_model_table&, enc_cb* cb);
+};
+
+
+
+
+class option_InterPartMode : public choice_option<enum PartMode> // maps option strings to PartMode values
+{
+ public:
+  option_InterPartMode() {
+    add_choice("2Nx2N", PART_2Nx2N, true); // NOTE(review): 'true' presumably marks the default choice — confirm in choice_option
+    add_choice("NxN",   PART_NxN);
+    add_choice("Nx2N",  PART_Nx2N);
+    add_choice("2NxN",  PART_2NxN);
+    add_choice("2NxnU", PART_2NxnU);
+    add_choice("2NxnD", PART_2NxnD);
+    add_choice("nLx2N", PART_nLx2N);
+    add_choice("nRx2N", PART_nRx2N);
+  }
+};
+
+class Algo_CB_InterPartMode_Fixed : public Algo_CB_InterPartMode // applies one configured PartMode to every CB
+{
+ public:
+  struct params
+  {
+    params() {
+      partMode.set_ID("CB-InterPartMode-Fixed-partMode"); // presumably the name used to address this option — confirm
+    }
+
+    option_InterPartMode partMode; // the PartMode that analyze() writes into each CB
+  };
+
+  void registerParams(config_parameters& config) { // hands the address of mParams.partMode to 'config'
+    config.add_option(&mParams.partMode);
+  }
+
+  void setParams(const params& p) { mParams=p; } // copies the whole parameter set into this object
+
+  virtual enc_cb* analyze(encoder_context*, // sets the configured PartMode on cb, then codes all its PBs
+                          context_model_table&,
+                          enc_cb* cb);
+
+ private:
+  params mParams; // read by analyze(); changed only through setParams() or the registered option
+};
+
+#endif
diff --git a/libde265/encoder/algo/cb-intra-inter.cc b/libde265/encoder/algo/cb-intra-inter.cc
new file mode 100644
index 0000000..93e4778
--- /dev/null
+++ b/libde265/encoder/algo/cb-intra-inter.cc
@@ -0,0 +1,128 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-intra-inter.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+
+// Brute-force intra/inter decision for one CB.
+// One coding option is registered for intra and one for inter (inter is only
+// enabled outside of I-slices). Every enabled option is analyzed by its child
+// algorithm, the estimated rate of pred_mode_flag is added where it applies,
+// and the result of CodingOptions::return_best_rdo() is returned.
+enc_cb* Algo_CB_IntraInter_BruteForce::analyze(encoder_context* ectx,
+                                               context_model_table& ctxModel,
+                                               enc_cb* cb)
+{
+  assert(cb->pcm_flag==0);
+
+  bool try_intra = true;
+  bool try_inter = (ectx->shdr->slice_type != SLICE_TYPE_I);
+
+  //try_inter = false;
+  //try_intra = !try_inter; // TODO HACK: no intra in inter frames
+
+  // option order:
+  // 0: intra
+  // 1: inter
+
+  CodingOptions options(ectx,cb,ctxModel);
+
+  CodingOption option_intra = options.new_option(try_intra);
+  CodingOption option_inter = options.new_option(try_inter);
+
+  options.start();
+
+  const int log2CbSize = cb->log2Size;
+  const int x = cb->x;
+  const int y = cb->y;
+
+
+  // try encoding with inter
+
+  if (option_inter) {
+    assert(mInterAlgo);
+    option_inter.begin();
+    enc_cb* cb_inter = option_inter.get_cb();
+
+    cb_inter->PredMode = MODE_INTER;
+    ectx->img->set_pred_mode(x,y, log2CbSize, MODE_INTER);
+
+    enc_cb* cb_result;
+    cb_result=mInterAlgo->analyze(ectx, option_inter.get_context(), cb_inter);
+
+    // pred_mode_flag is only charged if the child did not select skip mode
+    if (cb_result->PredMode != MODE_SKIP) {
+      CABAC_encoder_estim* cabac = option_inter.get_cabac();
+      cabac->reset();
+
+      cabac->write_CABAC_bit(CONTEXT_MODEL_PRED_MODE_FLAG, 0); // 0 - inter
+      float rate_pred_mode_flag = cabac->getRDBits();
+
+      cb_result->rate += rate_pred_mode_flag;
+    }
+
+    option_inter.set_cb(cb_result);
+
+    option_inter.end();
+  }
+
+
+  // try intra
+
+  if (option_intra) {
+    assert(mIntraAlgo);
+    option_intra.begin();
+    enc_cb* cb_intra = option_intra.get_cb();
+
+    cb_intra->PredMode = MODE_INTRA;
+    ectx->img->set_pred_mode(x,y, log2CbSize, MODE_INTRA);
+
+    enc_cb* cb_result;
+    cb_result=mIntraAlgo->analyze(ectx, option_intra.get_context(), cb_intra);
+
+    // pred_mode_flag is only charged outside of I-slices
+    if (ectx->shdr->slice_type != SLICE_TYPE_I) {
+      CABAC_encoder_estim* cabac = option_intra.get_cabac();
+      cabac->reset();
+
+      cabac->write_CABAC_bit(CONTEXT_MODEL_PRED_MODE_FLAG, 1); // 1 - intra
+      float rate_pred_mode_flag = cabac->getRDBits();
+
+      cb_result->rate += rate_pred_mode_flag;
+    }
+
+    option_intra.set_cb(cb_result);
+
+    option_intra.end();
+  }
+
+
+  options.compute_rdo_costs();
+  return options.return_best_rdo();
+}
diff --git a/libde265/encoder/algo/cb-intra-inter.h b/libde265/encoder/algo/cb-intra-inter.h
new file mode 100644
index 0000000..13cad74
--- /dev/null
+++ b/libde265/encoder/algo/cb-intra-inter.h
@@ -0,0 +1,67 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_INTRA_INTER_H
+#define CB_INTRA_INTER_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+#include "libde265/encoder/algo/tb-split.h"
+#include "libde265/encoder/algo/cb-intrapartmode.h"
+
+
+// ========== CB Intra/Inter decision ==========
+// Base class holding the two child algorithms of the intra/inter decision.
+class Algo_CB_IntraInter : public Algo_CB
+{
+ public:
+  Algo_CB_IntraInter() : mIntraAlgo(NULL), mInterAlgo(NULL) { }
+  virtual ~Algo_CB_IntraInter() { }
+  void setIntraChildAlgo(Algo_CB* algo) { mIntraAlgo = algo; }
+  void setInterChildAlgo(Algo_CB* algo) { mInterAlgo = algo; }
+
+ protected:
+  Algo_CB* mIntraAlgo; // NULL until setIntraChildAlgo() is called
+  Algo_CB* mInterAlgo; // NULL until setInterChildAlgo() is called
+};
+// Tries the enabled intra and inter options and returns the one with the best RDO cost.
+class Algo_CB_IntraInter_BruteForce : public Algo_CB_IntraInter
+{
+ public:
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb);
+};
+
+#endif
diff --git a/libde265/encoder/algo/cb-intrapartmode.cc b/libde265/encoder/algo/cb-intrapartmode.cc
new file mode 100644
index 0000000..cb6c044
--- /dev/null
+++ b/libde265/encoder/algo/cb-intrapartmode.cc
@@ -0,0 +1,180 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-intrapartmode.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+#include <iostream>
+
+
+#define ENCODER_DEVELOPMENT 1
+
+// Codes the CB as 2Nx2N and, where allowed, as NxN and returns the result of
+// CodingOptions::return_best_rdo().
+enc_cb* Algo_CB_IntraPartMode_BruteForce::analyze(encoder_context* ectx,
+                                                  context_model_table& ctxModel,
+                                                  enc_cb* cb_in)
+{
+  const int log2CbSize = cb_in->log2Size;
+  const int x = cb_in->x;
+  const int y = cb_in->y;
+
+  const bool can_use_NxN = ((log2CbSize == ectx->sps.Log2MinCbSizeY) &&
+                            (log2CbSize >  ectx->sps.Log2MinTrafoSize));
+
+
+  // test all modes
+
+  assert(cb_in->pcm_flag==0);
+
+
+  // 0: 2Nx2N  (always checked)
+  // 1:  NxN   (only checked at MinCbSize and above MinTrafoSize)
+
+  CodingOptions options(ectx,cb_in,ctxModel);
+  CodingOption option[2];
+  option[0] = options.new_option(true);
+  option[1] = options.new_option(can_use_NxN);
+
+  options.start();
+
+  for (int p=0;p<2;p++)
+    if (option[p]) {
+      option[p].begin();
+
+      enc_cb* cb = option[p].get_cb();
+
+      // --- set partition mode ---
+
+      cb->PartMode = (p==0 ? PART_2Nx2N : PART_NxN);
+
+      ectx->img->set_pred_mode(x,y, log2CbSize, cb->PredMode);  // TODO: probably unnecessary
+      ectx->img->set_PartMode (x,y, cb->PartMode);
+
+      // encode transform tree
+
+      int IntraSplitFlag= (cb->PredMode == MODE_INTRA && cb->PartMode == PART_NxN);
+      int MaxTrafoDepth = ectx->sps.max_transform_hierarchy_depth_intra + IntraSplitFlag;
+
+      cb->transform_tree = mTBIntraPredModeAlgo->analyze(ectx, option[p].get_context(),
+                                                         ectx->imgdata->input, NULL, cb,
+                                                         x,y, x,y, log2CbSize,
+                                                         0,
+                                                         0, MaxTrafoDepth, IntraSplitFlag);
+
+      // the CB starts with the distortion and rate of its transform tree
+      cb->distortion = cb->transform_tree->distortion;
+      cb->rate       = cb->transform_tree->rate;
+
+
+      // rate for cu syntax (the part_mode bin is only estimated at minimum CB size)
+
+      logtrace(LogSymbols,"$1 part_mode=%d\n",cb->PartMode);
+      if (log2CbSize == ectx->sps.Log2MinCbSizeY) {
+        int bin = (cb->PartMode==PART_2Nx2N);
+        option[p].get_cabac()->reset();
+        option[p].get_cabac()->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+0, bin);
+        cb->rate += option[p].get_cabac()->getRDBits();
+      }
+
+      option[p].end();
+    }
+
+  options.compute_rdo_costs();
+  return options.return_best_rdo();
+}
+
+
+// Codes the CB with the partition mode fixed by the 'partMode' parameter.
+// NxN falls back to 2Nx2N when the CB is not of minimum size. Sets
+// cb->PartMode, cb->transform_tree, cb->distortion and cb->rate and returns
+// the same cb. The rate of the part_mode bin is estimated with a local
+// CABAC estimator that works on ctxModel.
+enc_cb* Algo_CB_IntraPartMode_Fixed::analyze(encoder_context* ectx,
+                                             context_model_table& ctxModel,
+                                             enc_cb* cb)
+{
+  enum PartMode PartMode = mParams.partMode();
+
+
+  const int log2CbSize = cb->log2Size;
+  const int x = cb->x;
+  const int y = cb->y;
+
+
+  // NxN can only be applied at minimum CB size.
+  // If we are not at the minimum size, we have to use 2Nx2N.
+
+  if (PartMode==PART_NxN && log2CbSize != ectx->sps.Log2MinCbSizeY) {
+    PartMode = PART_2Nx2N;
+  }
+
+
+  // --- set partition mode ---
+
+  cb->PartMode = PartMode;
+  ectx->img->set_PartMode(x,y, PartMode);
+
+
+  // encode transform tree
+
+  int IntraSplitFlag= (cb->PredMode == MODE_INTRA && cb->PartMode == PART_NxN);
+  int MaxTrafoDepth = ectx->sps.max_transform_hierarchy_depth_intra + IntraSplitFlag;
+
+  // the child algorithm must have been set with setChildAlgo()
+  assert(mTBIntraPredModeAlgo);
+  cb->transform_tree = mTBIntraPredModeAlgo->analyze(ectx, ctxModel,
+                                                     ectx->imgdata->input, NULL, cb,
+                                                     x,y, x,y, log2CbSize,
+                                                     0,
+                                                     0, MaxTrafoDepth, IntraSplitFlag);
+
+
+  // rate and distortion for this CB
+
+  cb->distortion = cb->transform_tree->distortion;
+  cb->rate       = cb->transform_tree->rate;
+
+
+  // rate for cu syntax
+
+  CABAC_encoder_estim estim;
+  estim.set_context_models(&ctxModel);
+
+  //encode_coding_unit(ectx,&estim,cb,x,y,log2CbSize, false);
+
+  //encode_part_mode(ectx,&estim, MODE_INTRA, PartMode, 0);
+
+  // the part_mode bin is only estimated for CBs of minimum size
+  logtrace(LogSymbols,"$1 part_mode=%d\n",PartMode);
+  if (log2CbSize == ectx->sps.Log2MinCbSizeY) {
+    int bin = (PartMode==PART_2Nx2N);
+    estim.write_CABAC_bit(CONTEXT_MODEL_PART_MODE+0, bin);
+  }
+
+  cb->rate += estim.getRDBits();
+
+  return cb;
+}
diff --git a/libde265/encoder/algo/cb-intrapartmode.h b/libde265/encoder/algo/cb-intrapartmode.h
new file mode 100644
index 0000000..5f65274
--- /dev/null
+++ b/libde265/encoder/algo/cb-intrapartmode.h
@@ -0,0 +1,143 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_INTRAPARTMODE_H
+#define CB_INTRAPARTMODE_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+#include "libde265/encoder/algo/tb-split.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+
+// ========== CB intra NxN vs. 2Nx2N decision ==========
+
+enum ALGO_CB_IntraPartMode {
+  ALGO_CB_IntraPartMode_BruteForce, // config choice "brute-force"
+  ALGO_CB_IntraPartMode_Fixed       // config choice "fixed"
+};
+// Config option naming the intra part-mode algorithm; "brute-force" gets the extra 'true' flag (presumably the default; confirm in choice_option).
+class option_ALGO_CB_IntraPartMode : public choice_option<enum ALGO_CB_IntraPartMode>
+{
+ public:
+  option_ALGO_CB_IntraPartMode() {
+    add_choice("fixed",      ALGO_CB_IntraPartMode_Fixed);
+    add_choice("brute-force",ALGO_CB_IntraPartMode_BruteForce, true);
+  }
+};
+
+// Base class of the intra part-mode algorithms; holds the TB intra-pred-mode child algorithm.
+class Algo_CB_IntraPartMode : public Algo_CB
+{
+ public:
+  Algo_CB_IntraPartMode() : mTBIntraPredModeAlgo(NULL) { }
+  virtual ~Algo_CB_IntraPartMode() { }
+
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb) = 0;
+
+  void setChildAlgo(Algo_TB_IntraPredMode* algo) { mTBIntraPredModeAlgo = algo; }
+
+ protected:
+  Algo_TB_IntraPredMode* mTBIntraPredModeAlgo; // NULL until setChildAlgo() is called
+};
+
+/* Tries 2Nx2N and, where it is allowed, NxN, and returns the variant with
+   the better RDO cost. */
+class Algo_CB_IntraPartMode_BruteForce : public Algo_CB_IntraPartMode
+{
+ public:
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb);
+};
+
+// Named choices for the intra part mode; "2Nx2N" gets the extra 'true' flag (presumably the default; confirm in choice_option).
+class option_PartMode : public choice_option<enum PartMode> // choice_option
+{
+ public:
+  option_PartMode() {
+    add_choice("NxN",   PART_NxN);
+    add_choice("2Nx2N", PART_2Nx2N, true);
+  }
+};
+
+
+/* Always use the selected part mode.
+   If NxN is chosen but cannot be applied (CB is not at the minimum CB size), 2Nx2N is used instead.
+ */
+class Algo_CB_IntraPartMode_Fixed : public Algo_CB_IntraPartMode
+{
+ public:
+ Algo_CB_IntraPartMode_Fixed() { }
+
+  struct params
+  {
+    params() {
+      partMode.set_ID("CB-IntraPartMode-Fixed-partMode"); // ID string of the option
+    }
+
+    option_PartMode partMode; // selected intra part mode
+  };
+
+  void registerParams(config_parameters& config) {
+    config.add_option(&mParams.partMode); // passes the address of our own option object
+  }
+
+  void setParams(const params& p) { mParams=p; } // replaces the stored parameters by a copy of p
+
+  virtual enc_cb* analyze(encoder_context* ectx,
+                          context_model_table& ctxModel,
+                          enc_cb* cb);
+
+ private:
+  params mParams;
+};
+
+
+#endif
diff --git a/libde265/encoder/algo/cb-mergeindex.cc b/libde265/encoder/algo/cb-mergeindex.cc
new file mode 100644
index 0000000..524f01b
--- /dev/null
+++ b/libde265/encoder/algo/cb-mergeindex.cc
@@ -0,0 +1,138 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-mergeindex.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+// Codes a skipped CB in merge mode with the fixed merge index 0: stores merge candidate 0 as motion data,
+// generates the inter prediction into ectx->prediction and ectx->img, and sets cb->distortion and cb->rate.
+enc_cb* Algo_CB_MergeIndex_Fixed::analyze(encoder_context* ectx,
+                                          context_model_table& ctxModel,
+                                          enc_cb* cb)
+{
+  assert(cb->split_cu_flag==false);
+  assert(cb->PredMode==MODE_SKIP); // TODO: || (cb->PredMode==MODE_INTER && cb->inter.skip_flag));
+
+
+  MotionVectorSpec mergeCandList[5]; // NOTE(review): fixed size 5 - confirm it covers the maximum candidate count
+
+  int partIdx = 0;
+
+  int cbSize = 1 << cb->log2Size;
+
+  get_merge_candidate_list(ectx, ectx->shdr, ectx->img,
+                           cb->x, cb->y, // xC/yC
+                           cb->x, cb->y, // xP/yP
+                           cbSize, // nCS
+                           cbSize,cbSize, // nPbW/nPbH
+                           partIdx, // partIdx
+                           mergeCandList);
+
+  motion_spec&   spec = cb->inter.pb[partIdx].spec;
+
+  spec.merge_flag = 1; // we use merge mode
+  spec.merge_idx  = 0; // always the first merge candidate
+
+
+  // build prediction
+
+  // previous frame (TODO)
+  const de265_image* refimg = ectx->get_image(ectx->imgdata->frame_number -1); // NOTE(review): refimg is not used below
+
+  //printf("prev frame: %p %d\n",refimg,ectx->imgdata->frame_number);
+
+  /*
+    printf("#l0: %d\n",ectx->imgdata->shdr.num_ref_idx_l0_active);
+    printf("#l1: %d\n",ectx->imgdata->shdr.num_ref_idx_l1_active);
+
+    for (int i=0;i<ectx->imgdata->shdr.num_ref_idx_l0_active;i++)
+    printf("RefPixList[0][%d] = %d\n", i, ectx->imgdata->shdr.RefPicList[0][i]);
+  */
+
+  // TODO: fake motion data
+
+  const MotionVectorSpec& vec = mergeCandList[spec.merge_idx];
+  cb->inter.pb[partIdx].motion = vec;
+
+  ectx->img->set_mv_info(cb->x,cb->y, 1<<cb->log2Size,1<<cb->log2Size, vec);
+
+  // first pass: prediction into ectx->prediction (used for the SSD below)
+  generate_inter_prediction_samples(ectx, ectx->shdr, ectx->prediction,
+                                    cb->x,cb->y, // int xC,int yC,
+                                    0,0,         // int xB,int yB,
+                                    1<<cb->log2Size, // int nCS,
+                                    1<<cb->log2Size,
+                                    1<<cb->log2Size, // int nPbW,int nPbH,
+                                    &vec);
+
+  // second pass: the same prediction into ectx->img
+  generate_inter_prediction_samples(ectx, ectx->shdr, ectx->img,
+                                    cb->x,cb->y, // int xC,int yC,
+                                    0,0,         // int xB,int yB,
+                                    1<<cb->log2Size, // int nCS,
+                                    1<<cb->log2Size,
+                                    1<<cb->log2Size, // int nPbW,int nPbH,
+                                    &vec);
+
+  // estimate rate for sending merge index
+
+  //CABAC_encoder_estim cabac;
+  //cabac.write_bits();
+
+  int IntraSplitFlag = 0;
+  int MaxTrafoDepth = ectx->sps.max_transform_hierarchy_depth_inter;
+
+  if (mCodeResidual) {
+    assert(false); // NOTE(review): residual path is disabled here; with NDEBUG the code below would still run
+    cb->transform_tree = mTBSplit->analyze(ectx,ctxModel, ectx->imgdata->input, NULL, cb,
+                                           cb->x,cb->y,cb->x,cb->y, cb->log2Size,0,
+                                           0, MaxTrafoDepth, IntraSplitFlag);
+
+    cb->inter.rqt_root_cbf = ! cb->transform_tree->isZeroBlock();
+
+    cb->distortion = cb->transform_tree->distortion;
+    cb->rate       = cb->transform_tree->rate;
+  }
+  else {
+    // no residual: distortion is the SSD between input and prediction, rate is the merge index only
+    const de265_image* input = ectx->imgdata->input;
+    de265_image* img   = ectx->prediction;
+    int x0 = cb->x;
+    int y0 = cb->y;
+    int tbSize = 1<<cb->log2Size;
+
+    CABAC_encoder_estim cabac;
+    cabac.set_context_models(&ctxModel);
+    encode_merge_idx(ectx, &cabac, spec.merge_idx);
+
+    cb->distortion = compute_distortion_ssd(input, img, x0,y0, cb->log2Size, 0);
+    cb->rate = cabac.getRDBits();
+
+    cb->inter.rqt_root_cbf = 0;
+  }
+
+  //printf("%d;%d rqt_root_cbf=%d\n",cb->x,cb->y,cb->inter.rqt_root_cbf);
+
+  return cb;
+}
diff --git a/libde265/encoder/algo/cb-mergeindex.h b/libde265/encoder/algo/cb-mergeindex.h
new file mode 100644
index 0000000..eb380cd
--- /dev/null
+++ b/libde265/encoder/algo/cb-mergeindex.h
@@ -0,0 +1,69 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_MERGEINDEX_H
+#define CB_MERGEINDEX_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/tb-split.h"
+
+
+// ========== CB merge index decision ==========
+// Base class for merge-index algorithms; holds the TB-split child and the residual flag.
+class Algo_CB_MergeIndex : public Algo_CB
+{
+ public:
+  Algo_CB_MergeIndex() : mTBSplit(NULL), mCodeResidual(false) { }
+  virtual ~Algo_CB_MergeIndex() { }
+
+  void set_code_residual(bool flag=true) { mCodeResidual=flag; }
+
+  void setChildAlgo(Algo_TB_Split* algo) { mTBSplit = algo; }
+  // TODO void setInterChildAlgo(Algo_CB_IntraPartMode* algo) { mInterPartModeAlgo = algo; }
+
+ protected:
+  Algo_TB_Split* mTBSplit; // NULL until setChildAlgo() is called
+
+  bool mCodeResidual; // false by default; changed with set_code_residual()
+};
+// Merge-index algorithm whose analyze() always uses merge index 0 (see cb-mergeindex.cc).
+class Algo_CB_MergeIndex_Fixed : public Algo_CB_MergeIndex
+{
+ public:
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb);
+};
+
+#endif
diff --git a/libde265/encoder/algo/cb-skip.cc b/libde265/encoder/algo/cb-skip.cc
new file mode 100644
index 0000000..bd09f98
--- /dev/null
+++ b/libde265/encoder/algo/cb-skip.cc
@@ -0,0 +1,106 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-skip.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+
+
+// Brute-force skip decision: the CB is coded in skip mode (outside of I-slices
+// only) and in non-skip mode; the estimated rate of cu_skip_flag is added to
+// each result and the result of CodingOptions::return_best_rdo() is returned.
+enc_cb* Algo_CB_Skip_BruteForce::analyze(encoder_context* ectx,
+                                         context_model_table& ctxModel,
+                                         enc_cb* cb)
+{
+  bool try_skip  = (ectx->shdr->slice_type != SLICE_TYPE_I);
+  bool try_nonskip = true;
+
+  CodingOptions options(ectx,cb,ctxModel);
+  CodingOption option_skip    = options.new_option(try_skip);
+  CodingOption option_nonskip = options.new_option(try_nonskip);
+  options.start();
+
+  if (option_skip) {
+    CodingOption& opt = option_skip;
+    opt.begin();
+
+    enc_cb* cb = opt.get_cb();
+
+    // calc rate for skip flag (=true)
+
+    CABAC_encoder_estim* cabac = opt.get_cabac();
+    encode_cu_skip_flag(ectx, cabac, cb, true);
+    float rate_pred_mode = cabac->getRDBits();
+    cabac->reset();
+
+    // set skip flag
+
+    cb->PredMode = MODE_SKIP;
+    ectx->img->set_pred_mode(cb->x,cb->y, cb->log2Size, cb->PredMode);
+
+    // encode CB
+
+    assert(mSkipAlgo);
+    cb = mSkipAlgo->analyze(ectx, opt.get_context(), cb);
+
+    // add rate for skip flag
+
+    cb->rate += rate_pred_mode;
+    opt.set_cb(cb);
+    opt.end();
+  }
+
+  if (option_nonskip) {
+    CodingOption& opt = option_nonskip;
+    enc_cb* cb = opt.get_cb();
+
+    opt.begin();
+
+    // calc rate for skip flag (=false); only estimated when skip was an option
+    float rate_pred_mode = 0;
+
+    if (try_skip) {
+      CABAC_encoder_estim* cabac = opt.get_cabac();
+      encode_cu_skip_flag(ectx, cabac, cb, false);
+      rate_pred_mode = cabac->getRDBits();
+      cabac->reset();
+    }
+
+    assert(mNonSkipAlgo);
+    cb = mNonSkipAlgo->analyze(ectx, opt.get_context(), cb);
+    // add rate for skip flag
+
+    cb->rate += rate_pred_mode;
+    opt.set_cb(cb);
+    opt.end();
+  }
+
+  options.compute_rdo_costs();
+  return options.return_best_rdo();
+}
diff --git a/libde265/encoder/algo/cb-skip.h b/libde265/encoder/algo/cb-skip.h
new file mode 100644
index 0000000..3454913
--- /dev/null
+++ b/libde265/encoder/algo/cb-skip.h
@@ -0,0 +1,69 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_SKIP_H
+#define CB_SKIP_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/cb-mergeindex.h"
+
+
+// ========== CB Skip/Inter decision ==========
+// Base class for the skip / non-skip decision of a CB; holds both child algorithms.
+class Algo_CB_Skip : public Algo_CB
+{
+ public:
+  Algo_CB_Skip() : mSkipAlgo(NULL), mNonSkipAlgo(NULL) { }
+  virtual ~Algo_CB_Skip() { }
+  // The skip child is switched to "no residual"; a NULL algo is stored but not touched.
+  void setSkipAlgo(Algo_CB_MergeIndex* algo) {
+    mSkipAlgo = algo;
+    if (mSkipAlgo) { mSkipAlgo->set_code_residual(false); }
+  }
+  void setNonSkipAlgo(Algo_CB* algo) { mNonSkipAlgo = algo; }
+
+ protected:
+  Algo_CB_MergeIndex* mSkipAlgo;
+  Algo_CB*            mNonSkipAlgo;
+};
+// Tries the skip and non-skip options of a CB and returns the one with the best RDO cost.
+class Algo_CB_Skip_BruteForce : public Algo_CB_Skip
+{
+ public:
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb);
+};
+
+#endif
diff --git a/libde265/encoder/algo/cb-split.cc b/libde265/encoder/algo/cb-split.cc
new file mode 100644
index 0000000..a58a963
--- /dev/null
+++ b/libde265/encoder/algo/cb-split.cc
@@ -0,0 +1,155 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/cb-split.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+#include <iostream>
+
+
+// Helper that codes the (up to) four children of a CB that is being split.
+// Each child inside the picture is handed to analyze() of this algorithm.
+enc_cb* Algo_CB_Split::encode_cb_split(encoder_context* ectx,
+                                       context_model_table& ctxModel,
+                                       enc_cb* cb)
+{
+  const int picWidth  = ectx->imgdata->input->get_width();
+  const int picHeight = ectx->imgdata->input->get_height();
+
+  const int childLog2Size = cb->log2Size-1;
+
+  cb->split_cu_flag = true;
+
+  // Visit the children in index order (bit 0 of the index selects the column,
+  // bit 1 the row) and accumulate their distortion and rate into the parent.
+  for (int childIdx=0; childIdx<4; childIdx++) {
+    const int xChild = cb->x + ((childIdx&1)  << childLog2Size);
+    const int yChild = cb->y + ((childIdx>>1) << childLog2Size);
+
+    // children starting outside of the picture are not coded
+    if (xChild>=picWidth || yChild>=picHeight) {
+      cb->children[childIdx] = NULL;
+      continue;
+    }
+
+    enc_cb* child = new enc_cb;
+    child->log2Size = childLog2Size;
+    child->ctDepth  = cb->ctDepth+1;
+    child->x = xChild;
+    child->y = yChild;
+
+    enc_cb* codedChild = analyze(ectx, ctxModel, child);
+    cb->children[childIdx] = codedChild;
+    cb->distortion += codedChild->distortion;
+    cb->rate       += codedChild->rate;
+  }
+
+  return cb;
+}
+
+
+
+
+// Brute-force CB split decision: codes the CB unsplit and/or split (as far
+// as get_split_type() permits) and returns the option with the best RDO cost.
+enc_cb* Algo_CB_Split_BruteForce::analyze(encoder_context* ectx,
+                                          context_model_table& ctxModel,
+                                          enc_cb* cb_input)
+{
+  assert(cb_input->pcm_flag==0);
+
+  // --- find out which alternatives are permitted ---
+
+  const SplitType split_type = get_split_type(&ectx->sps,
+                                              cb_input->x, cb_input->y,
+                                              cb_input->log2Size);
+
+  bool split_allowed    = (split_type != ForcedNonSplit);
+  bool no_split_allowed = (split_type != ForcedSplit);
+
+  //if (split_allowed) { no_split_allowed=false; } // TODO TMP
+  //if (no_split_allowed) { split_allowed=false; } // TODO TMP
+
+  // the rate of split_cu_flag is only added when the split is optional
+  const bool code_split_flag = (split_type == OptionalSplit);
+
+  CodingOptions options(ectx, cb_input, ctxModel);
+
+  CodingOption option_split    = options.new_option(split_allowed);
+  CodingOption option_no_split = options.new_option(no_split_allowed);
+
+  options.start();
+
+  // --- encode without splitting ---
+
+  if (option_no_split) {
+    option_no_split.begin();
+
+    enc_cb* unsplitCB = option_no_split.get_cb();
+
+    // record depth and size of this CB in the image data-structure
+    ectx->img->set_ctDepth(unsplitCB->x,unsplitCB->y,unsplitCB->log2Size, unsplitCB->ctDepth);
+    ectx->img->set_log2CbSize(unsplitCB->x,unsplitCB->y,unsplitCB->log2Size, true);
+
+    // The QP is set here because it is required in non-split CBs only.
+    unsplitCB->qp = ectx->active_qp;
+
+    // analyze subtree
+    assert(mChildAlgo);
+    unsplitCB = mChildAlgo->analyze(ectx, option_no_split.get_context(), unsplitCB);
+
+    if (code_split_flag) {
+      encode_split_cu_flag(ectx,option_no_split.get_cabac(),
+                           unsplitCB->x,unsplitCB->y, unsplitCB->ctDepth, 0);
+      unsplitCB->rate += option_no_split.get_cabac_rate();
+    }
+
+    option_no_split.set_cb(unsplitCB);
+    option_no_split.end();
+  }
+
+  // --- encode with splitting ---
+
+  if (option_split) {
+    option_split.begin();
+
+    enc_cb* splitCB = option_split.get_cb();
+    splitCB = encode_cb_split(ectx, option_split.get_context(), splitCB);
+
+    if (code_split_flag) {
+      encode_split_cu_flag(ectx,option_split.get_cabac(),
+                           splitCB->x,splitCB->y, splitCB->ctDepth, 1);
+      splitCB->rate += option_split.get_cabac_rate();
+    }
+
+    option_split.set_cb(splitCB);
+    option_split.end();
+  }
+
+  // let CodingOptions compare the alternatives that were coded
+  options.compute_rdo_costs();
+  return options.return_best_rdo();
+}
diff --git a/libde265/encoder/algo/cb-split.h b/libde265/encoder/algo/cb-split.h
new file mode 100644
index 0000000..31fa98a
--- /dev/null
+++ b/libde265/encoder/algo/cb-split.h
@@ -0,0 +1,85 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CB_SPLIT_H
+#define CB_SPLIT_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+#include "libde265/encoder/algo/tb-split.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+
+// ========== CB split decision ==========
+
+class Algo_CB_Split : public Algo_CB // base class of the algorithms deciding whether a CB is split
+{
+ public:
+  Algo_CB_Split() : mChildAlgo(NULL) { } // no child until setChildAlgo(); lets assert(mChildAlgo) catch that
+  virtual ~Algo_CB_Split() { } // the child algorithm is not deleted here
+  // TODO: probably, this will later be an intra/inter decision which again
+  // has two child algorithms, depending on the coding mode.
+  void setChildAlgo(Algo_CB* algo) { mChildAlgo = algo; }
+
+ protected:
+  Algo_CB* mChildAlgo; // set via setChildAlgo(); NULL when none was set
+
+  enc_cb* encode_cb_split(encoder_context* ectx,   // helper for the split case; only declared in this header
+                          context_model_table& ctxModel,
+                          enc_cb* cb);
+};
+
+
+class Algo_CB_Split_BruteForce : public Algo_CB_Split // concrete CB-split algorithm; only declares analyze()
+{
+ public:
+  virtual enc_cb* analyze(encoder_context*,        // encoder state
+                          context_model_table&,    // context models, passed by reference
+                          enc_cb* cb);             // input CB; the method returns an enc_cb*
+};
+
+#endif
diff --git a/libde265/encoder/algo/coding-options.cc b/libde265/encoder/algo/coding-options.cc
new file mode 100644
index 0000000..4e7564a
--- /dev/null
+++ b/libde265/encoder/algo/coding-options.cc
@@ -0,0 +1,170 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+
+
+CodingOptions::CodingOptions(encoder_context* ectx, enc_cb* cb, context_model_table& tab)
+  : mECtx(ectx),
+    mCBInput(cb),
+    mContextModelInput(&tab),
+    mCurrentlyReconstructedOption(-1), // -1: no option has been reconstructed yet
+    mBestRDO(-1)
+{
+  // 'cabac' is left unset here; start() selects the estimator.
+  // The option list starts out empty; options are added with new_option().
+}
+
+CodingOptions::~CodingOptions()
+{ // Empty body: nothing is freed here (return_best_rdo() deletes the non-selected CBs).
+}
+
+CodingOption CodingOptions::new_option(bool active)
+{
+  // An inactive option is represented by a handle without parent; it tests as 'false'.
+  if (!active) {
+    return CodingOption();
+  }
+
+  CodingOptionData opt;
+  opt.mOptionActive = true;
+  opt.rdoCost = 0; // defined placeholder until compute_rdo_costs() / set_rdo_cost()
+
+  // The first option works directly on the input CB, every further option on its own copy.
+  if (mOptions.empty()) {
+    opt.cb = mCBInput;
+  }
+  else {
+    opt.cb = new enc_cb(*mCBInput);
+  }
+
+  opt.context = *mContextModelInput;
+
+  CodingOption option(this, mOptions.size());
+  mOptions.push_back( std::move(opt) );
+  return option;
+}
+
+
+void CodingOptions::start(enum RateEstimationMethod rateMethod)
+{
+  /* We don't need the input context model anymore.
+     Releasing it now may save a copy during a later decouple().
+  */
+  mContextModelInput->release();
+
+  // Rate_Default (and any unexpected value) follows the encoder-wide setting.
+  bool adaptiveContext = mECtx->use_adaptive_context;
+  switch (rateMethod) {
+  case Rate_FixedContext:
+    adaptiveContext = false;
+    break;
+  case Rate_AdaptiveContext:
+    adaptiveContext = true;
+    break;
+  case Rate_Default:
+  default:
+    break;
+  }
+
+  if (adaptiveContext) {
+    /* If we modify the context models in this algorithm,
+       we need separate models for each option.
+    */
+    for (auto& option : mOptions) {
+      option.context.decouple();
+    }
+    cabac = &cabac_adaptive;
+  }
+  else {
+    cabac = &cabac_constant;
+  }
+}
+
+
+void CodingOption::begin()
+{
+  CodingOptions& owner = *mParent;
+
+  owner.cabac->reset(); // restart the rate estimator, then attach this option's context models
+  owner.cabac->set_context_models( &get_context() );
+
+  const int previous = owner.mCurrentlyReconstructedOption;
+  if (previous >= 0) { owner.mOptions[previous].cb->save(owner.mECtx->img); } // save() the previously reconstructed option
+  owner.mCurrentlyReconstructedOption = mOptionIdx;
+}
+
+void CodingOption::end()
+{
+  assert(mParent->mCurrentlyReconstructedOption == mOptionIdx); // the most recent begin() must have been this option's
+}
+
+
+void CodingOptions::compute_rdo_costs()
+{
+  for (auto& option : mOptions) { // cost = D + lambda*R, with D and R taken from the option's CB
+    option.rdoCost = option.cb->distortion + mECtx->lambda * option.cb->rate;
+  }
+}
+
+
+enc_cb* CodingOptions::return_best_rdo()
+{
+  assert(mOptions.size()>0);
+
+  const int nOptions = mOptions.size();
+
+  // --- find the option with the lowest RDO cost (the earliest one wins on ties) ---
+
+  int winner = -1;
+  for (int idx=0; idx<nOptions; idx++) {
+    if (winner<0 || mOptions[idx].rdoCost < mOptions[winner].rdoCost) {
+      winner = idx;
+    }
+  }
+
+  assert(winner>=0);
+
+  CodingOptionData& chosen = mOptions[winner];
+
+  // --- make the winner the active reconstruction ---
+
+  // restore() is only needed when the winner is not the option that was
+  // reconstructed last
+  if (winner != mCurrentlyReconstructedOption) {
+    chosen.cb->restore(mECtx->img);
+  }
+
+  // hand the winner's context models back through the table given to the constructor
+  *mContextModelInput = chosen.context;
+
+  // --- delete all CBs except the winner ---
+
+  for (int idx=0; idx<nOptions; idx++) {
+    if (idx == winner) { continue; }
+    delete mOptions[idx].cb;
+    mOptions[idx].cb = NULL;
+  }
+
+  return chosen.cb;
+}
diff --git a/libde265/encoder/algo/coding-options.h b/libde265/encoder/algo/coding-options.h
new file mode 100644
index 0000000..ff973dc
--- /dev/null
+++ b/libde265/encoder/algo/coding-options.h
@@ -0,0 +1,134 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CODING_OPTIONS_H
+#define CODING_OPTIONS_H
+
+#include "libde265/encoder/encode.h"
+
+
+class CodingOption;
+
+
+class CodingOptions // set of alternative codings of one CB, from which the lowest-RDO-cost one is returned
+{
+ public:
+  CodingOptions(encoder_context*, enc_cb*, context_model_table& tab); // 'tab' is kept by pointer and written back by return_best_rdo()
+  ~CodingOptions();
+
+  // --- init --- call before object use
+
+  CodingOption new_option(bool active=true); // active==false yields a handle that tests as 'false'
+
+  enum RateEstimationMethod
+  {
+    Rate_Default,          // follow encoder_context::use_adaptive_context
+    Rate_AdaptiveContext,  // use the 'cabac_adaptive' estimator with per-option context models
+    Rate_FixedContext      // use the 'cabac_constant' estimator
+  };
+
+  void start(enum RateEstimationMethod = Rate_Default); // releases the input context table and selects 'cabac'
+
+
+  // --- processing ---
+
+  // compute RDO cost (D + lambda*R) for all options
+  void compute_rdo_costs();
+
+
+  // --- end processing --- do not call any function after this one
+
+  /* Return the CB with the lowest RDO cost. All other CBs are destroyed.
+     If the current reconstruction and metadata are not from the returned CB,
+     the data from the returned CB is reconstructed.
+   */
+  enc_cb* return_best_rdo();
+
+ private:
+  struct CodingOptionData // per-option state, stored in mOptions
+  {
+    enc_cb* cb;                  // the input CB for the first option, a copy of it for later ones
+    context_model_table context; // assigned from the input context table in new_option()
+    bool  mOptionActive;         // NOTE(review): no code reading this flag is visible in this header
+    float rdoCost;               // written by compute_rdo_costs() or CodingOption::set_rdo_cost()
+  };
+
+
+  encoder_context* mECtx;
+
+  enc_cb* mCBInput;                        // CB passed to the constructor
+  context_model_table* mContextModelInput; // address of the table passed to the constructor
+
+  int mCurrentlyReconstructedOption; // index set by the latest CodingOption::begin(); -1 before the first one
+  int mBestRDO;                      // initialised to -1 by the constructor
+
+  std::vector<CodingOptionData> mOptions;
+
+  CABAC_encoder_estim           cabac_adaptive;
+  CABAC_encoder_estim_constant  cabac_constant;
+  CABAC_encoder_estim*          cabac; // points to one of the two estimators above once start() has run
+
+  friend class CodingOption;
+};
+
+
+class CodingOption // lightweight handle: (parent CodingOptions, index into its option list)
+{
+ public:
+  CodingOption() { // inactive handle: no parent, converts to 'false'
+    mParent = nullptr;
+    mOptionIdx = 0;
+  }
+
+  enc_cb* get_cb() { return mParent->mOptions[mOptionIdx].cb; } // dereferences mParent: active handles only
+  void set_cb(enc_cb* cb) { mParent->mOptions[mOptionIdx].cb = cb; }
+
+  context_model_table& get_context() { return mParent->mOptions[mOptionIdx].context; }
+
+  operator bool() const { return mParent; } // true iff the handle has a parent
+
+  /* When modifying the reconstruction image or metadata, you have to
+     encapsulate the modification between these two functions to ensure
+     that the correct reconstruction will be active after return_best_rdo().
+   */
+  void begin();
+  void end();
+
+  // Manually set RDO costs instead of computing them with compute_rdo_costs.
+  // Only required when using custom costs.
+  void set_rdo_cost(float rdo) { mParent->mOptions[mOptionIdx].rdoCost=rdo; }
+
+  CABAC_encoder_estim* get_cabac() { return mParent->cabac; } // the parent's estimator, shared by all its options
+  float get_cabac_rate() const { return mParent->cabac->getRDBits(); }
+
+private:
+  CodingOption(class CodingOptions* parent, int idx) // used by CodingOptions::new_option()
+    : mParent(parent), mOptionIdx(idx) { }
+
+  class CodingOptions* mParent; // nullptr for an inactive handle
+  int   mOptionIdx;             // index into mParent->mOptions
+
+  friend class CodingOptions;
+};
+
+
+#endif
diff --git a/libde265/encoder/algo/ctb-qscale.cc b/libde265/encoder/algo/ctb-qscale.cc
new file mode 100644
index 0000000..83d1942
--- /dev/null
+++ b/libde265/encoder/algo/ctb-qscale.cc
@@ -0,0 +1,53 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/ctb-qscale.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+#define ENCODER_DEVELOPMENT 1
+
+
+enc_cb* Algo_CTB_QScale_Constant::analyze(encoder_context* ectx,
+                                          context_model_table& ctxModel,
+                                          int ctb_x,int ctb_y)
+{
+  enc_cb* rootCB = new enc_cb(); // root CB: CTB-sized, at (ctb_x,ctb_y), depth 0
+  rootCB->x = ctb_x;
+  rootCB->y = ctb_y;
+  rootCB->ctDepth = 0;
+  rootCB->log2Size = ectx->sps.Log2CtbSizeY;
+
+  // constant qscale: the whole CTB gets the encoder's currently active QP
+  ectx->img->set_QPY(ctb_x,ctb_y, rootCB->log2Size, ectx->active_qp);
+
+  // write currently unused coding options to image
+  ectx->img->set_cu_transquant_bypass(ctb_x,ctb_y,rootCB->log2Size, rootCB->cu_transquant_bypass_flag);
+  ectx->img->set_pcm_flag(ctb_x,ctb_y,rootCB->log2Size, rootCB->pcm_flag);
+
+  assert(mChildAlgo);
+  return mChildAlgo->analyze(ectx,ctxModel,rootCB);
+}
diff --git a/libde265/encoder/algo/ctb-qscale.h b/libde265/encoder/algo/ctb-qscale.h
new file mode 100644
index 0000000..42a332c
--- /dev/null
+++ b/libde265/encoder/algo/ctb-qscale.h
@@ -0,0 +1,107 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef CTB_QSCALE_H
+#define CTB_QSCALE_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/cb-split.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+
+// ========== choose a qscale at CTB level ==========
+
+class Algo_CTB_QScale // abstract base: chooses a qscale at CTB level
+{
+ public:
+ Algo_CTB_QScale() : mChildAlgo(NULL) { } // no child algorithm until setChildAlgo() is called
+  virtual ~Algo_CTB_QScale() { }          // the child algorithm is not deleted here
+
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          int ctb_x,int ctb_y) = 0; // CTB position; units are not visible here
+
+  void setChildAlgo(Algo_CB_Split* algo) { mChildAlgo = algo; }
+
+ protected:
+  Algo_CB_Split* mChildAlgo; // next stage of the search tree (CB split decision)
+};
+
+
+
+class Algo_CTB_QScale_Constant : public Algo_CTB_QScale // qscale algorithm with a single configurable QP
+{
+ public:
+  struct params
+  {
+    params() { // QP option: range 1..51, default 27, command line "qp" / 'q'
+      mQP.set_range(1,51);
+      mQP.set_default(27);
+      mQP.set_ID("CTB-QScale-Constant");
+      mQP.set_cmd_line_options("qp",'q');
+    }
+
+    option_int mQP;
+  };
+
+  void setParams(const params& p) { mParams=p; }
+
+  void registerParams(config_parameters& config) { // registers the address of mParams.mQP with 'config'
+    config.add_option(&mParams.mQP);
+  }
+
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          int ctb_x,int ctb_y);
+
+  int getQP() const { return mParams.mQP; } // NOTE(review): analyze() reads ectx->active_qp; presumably set from this - confirm
+
+ private:
+  params mParams;
+};
+
+
+#endif
diff --git a/libde265/encoder/algo/pb-mv.cc b/libde265/encoder/algo/pb-mv.cc
new file mode 100644
index 0000000..81cd825
--- /dev/null
+++ b/libde265/encoder/algo/pb-mv.cc
@@ -0,0 +1,306 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/algo/pb-mv.h"
+#include "libde265/encoder/algo/coding-options.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+
+
+
+enc_cb* Algo_PB_MV_Test::analyze(encoder_context* ectx,
+                                 context_model_table& ctxModel,
+                                 enc_cb* cb,
+                                 int PBidx, int x,int y,int w,int h)
+{
+  enum MVTestMode testMode = mParams.testMode();
+  // Test algorithm without motion search: assigns a motion vector chosen by 'testMode' to
+  // PB 'PBidx' of 'cb', generates the inter prediction and codes the residual. Returns 'cb'.
+  MotionVector mvp[2];
+
+  fill_luma_motion_vector_predictors(ectx, ectx->shdr, ectx->img,
+                                     cb->x,cb->y,1<<cb->log2Size, x,y,w,h,
+                                     0, // l
+                                     0, 0, // int refIdx, int partIdx,
+                                     mvp);
+
+  //printf("%d/%d: [%d;%d] [%d;%d]\n",cb->x,cb->y, mvp[0].x,mvp[0].y, mvp[1].x,mvp[1].y);
+
+  // 'spec' and 'vec' alias the entries of PB 'PBidx' inside 'cb'; the writes below modify 'cb'
+  motion_spec&     spec = cb->inter.pb[PBidx].spec;
+  MotionVectorSpec& vec = cb->inter.pb[PBidx].motion;
+
+  spec.merge_flag = 0;
+  spec.merge_idx  = 0;
+
+  spec.inter_pred_idc = PRED_L0;
+  spec.refIdx[0] = vec.refIdx[0] = 0;
+  spec.mvp_l0_flag = 0;
+
+  int value = mParams.range(); // amplitude of the test vector ("PB-MV-Range" option)
+
+  switch (testMode) { // mvd temporarily holds the wanted MV; the predictor is subtracted after the switch
+  case MVTestMode_Zero:
+    spec.mvd[0][0]=0;
+    spec.mvd[0][1]=0;
+    break;
+
+  case MVTestMode_Random: // each component uniformly from [-value, +value] via rand()
+    spec.mvd[0][0] = (rand() % (2*value+1)) - value;
+    spec.mvd[0][1] = (rand() % (2*value+1)) - value;
+    break;
+
+  case MVTestMode_Horizontal:
+    spec.mvd[0][0]=value;
+    spec.mvd[0][1]=0;
+    break;
+
+  case MVTestMode_Vertical:
+    spec.mvd[0][0]=0;
+    spec.mvd[0][1]=value;
+    break;
+  }
+
+  spec.mvd[0][0] -= mvp[0].x; // wanted MV -> difference to predictor candidate 0
+  spec.mvd[0][1] -= mvp[0].y;
+
+  vec.mv[0].x = mvp[0].x + spec.mvd[0][0]; // predictor + difference, i.e. the wanted MV again
+  vec.mv[0].y = mvp[0].y + spec.mvd[0][1];
+  vec.predFlag[0] = 1;
+  vec.predFlag[1] = 0;
+
+  ectx->img->set_mv_info(x,y,w,h, vec);
+
+  generate_inter_prediction_samples(ectx, ectx->shdr, ectx->prediction,
+                                    cb->x,cb->y, // int xC,int yC,
+                                    0,0,         // int xB,int yB,
+                                    1<<cb->log2Size, // int nCS,
+                                    1<<cb->log2Size,
+                                    1<<cb->log2Size, // int nPbW,int nPbH,
+                                    &vec);
+
+  // TODO estimate rate for sending MV
+
+  int IntraSplitFlag = 0;
+  int MaxTrafoDepth = ectx->sps.max_transform_hierarchy_depth_inter;
+
+  mCodeResidual=true; // residual coding is forced on here, so the else-branch below is never taken
+  if (mCodeResidual) {
+    assert(mTBSplitAlgo);
+    cb->transform_tree = mTBSplitAlgo->analyze(ectx,ctxModel, ectx->imgdata->input, NULL, cb,
+                                               cb->x,cb->y,cb->x,cb->y, cb->log2Size,0,
+                                               0, MaxTrafoDepth, IntraSplitFlag);
+
+    cb->inter.rqt_root_cbf = ! cb->transform_tree->isZeroBlock();
+
+    cb->distortion = cb->transform_tree->distortion; // CB cost is taken over from the transform tree
+    cb->rate       = cb->transform_tree->rate;
+  }
+  else {
+    const de265_image* input = ectx->imgdata->input;
+    de265_image* img   = ectx->prediction;
+    int x0 = cb->x;
+    int y0 = cb->y;
+    int tbSize = 1<<cb->log2Size; // not used below
+
+    cb->distortion = compute_distortion_ssd(input, img, x0,y0, cb->log2Size, 0);
+    cb->rate = 5; // fake (MV)
+
+    cb->inter.rqt_root_cbf = 0;
+  }
+
+  return cb;
+}
+
+
+
+
+int sad(const uint8_t* p1,int stride1,
+        const uint8_t* p2,int stride2,
+        int w,int h)
+{
+  // sum of absolute differences over a w x h block; each stride is the
+  // distance (in samples) between the starts of two consecutive rows
+  int total=0;
+
+  for (int row=0; row<h; row++) {
+    const uint8_t* line1 = p1 + row*stride1;
+    const uint8_t* line2 = p2 + row*stride2;
+
+    for (int col=0; col<w; col++) {
+      total += abs_value(line1[col] - line2[col]);
+    }
+  }
+
+  return total;
+}
+
+
+enc_cb* Algo_PB_MV_Search::analyze(encoder_context* ectx,
+                                   context_model_table& ctxModel,
+                                   enc_cb* cb,
+                                   int PBidx, int x,int y,int pbW,int pbH)
+{
+  // Exhaustive search over all displacements within +-hrange/+-vrange around (x,y) in the
+  // previous frame, minimizing SAD + lambda*bits. Writes the motion data of PB 'PBidx' into 'cb'.
+  enum MVSearchAlgo searchAlgo = mParams.mvSearchAlgo(); // NOTE: not evaluated yet, the search below always runs
+
+  MotionVector mvp[2];
+  fill_luma_motion_vector_predictors(ectx, ectx->shdr, ectx->img,
+                                     cb->x,cb->y,1<<cb->log2Size, x,y,pbW,pbH,
+                                     0, // l
+                                     0, 0, // int refIdx, int partIdx,
+                                     mvp);
+
+  motion_spec&     spec = cb->inter.pb[PBidx].spec;
+  MotionVectorSpec& vec = cb->inter.pb[PBidx].motion;
+
+  spec.merge_flag = 0;
+  spec.merge_idx  = 0;
+
+  spec.inter_pred_idc = PRED_L0;
+  spec.refIdx[0] = vec.refIdx[0] = 0;
+  spec.mvp_l0_flag = 0;
+
+  int hrange = mParams.hrange();
+  int vrange = mParams.vrange();
+
+  // previous frame (TODO)
+  const de265_image* refimg   = ectx->get_image(ectx->imgdata->frame_number -1);
+  const de265_image* inputimg = ectx->imgdata->input;
+
+  int w = refimg->get_width();  // reference picture size, used to reject out-of-picture candidates
+  int h = refimg->get_height();
+
+  int mincost = 0x7fffffff;
+
+  double lambda = 10.0; // fixed weight of the estimated MV bits in the search cost
+
+  double *bits_h = new double[2*hrange+1];
+  double *bits_v = new double[2*vrange+1];
+
+  // heuristic bit estimate per displacement component: 0 for diff==0, 2 for |diff|==1, else |diff|+2
+  for (int i=-hrange;i<=hrange;i++) {
+    int diff = (i - mvp[0].x);
+    int b;
+
+    if (diff==0) { b=0; }
+    else if (diff==1 || diff==-1) { b=2; }
+    else { b=abs_value(diff)+2; } // was abs_value(b+2), which read 'b' uninitialized
+
+    bits_h[i+hrange]=b;
+  }
+
+  for (int i=-vrange;i<=vrange;i++) {
+    int diff = (i - mvp[0].y);
+    int b;
+
+    if (diff==0) { b=0; }
+    else if (diff==1 || diff==-1) { b=2; }
+    else { b=abs_value(diff)+2; } // same fix as in the horizontal table
+
+    bits_v[i+vrange]=b;
+  }
+
+  for (int my = y-vrange; my<=y+vrange; my++)
+    for (int mx = x-hrange; mx<=x+hrange; mx++)
+      {
+        if (mx<0 || mx+pbW>w || my<0 || my+pbH>h) continue; // candidate block must lie inside the reference
+
+        int cost = sad(refimg->get_image_plane_at_pos(0,mx,my),
+                       refimg->get_image_stride(0),
+                       inputimg->get_image_plane_at_pos(0,x,y),
+                       inputimg->get_image_stride(0),
+                       pbW,pbH);
+
+        int bits = bits_h[mx-x+hrange] + bits_v[my-y+vrange];
+
+        cost += lambda * bits;
+
+        //printf("%d %d : %d\n",mx,my,cost);
+
+        if (cost<mincost) { // strict '<': the first candidate in scan order wins on ties
+          mincost=cost;
+
+          spec.mvd[0][0]=(mx-x)<<2; // displacement x4 (presumably full-pel -> quarter-pel units)
+          spec.mvd[0][1]=(my-y)<<2;
+        }
+      }
+
+  spec.mvd[0][0] -= mvp[0].x; // chosen MV -> difference to predictor candidate 0
+  spec.mvd[0][1] -= mvp[0].y;
+
+  vec.mv[0].x = mvp[0].x + spec.mvd[0][0];
+  vec.mv[0].y = mvp[0].y + spec.mvd[0][1];
+  vec.predFlag[0] = 1;
+  vec.predFlag[1] = 0;
+
+  ectx->img->set_mv_info(x,y,pbW,pbH, vec);
+
+  generate_inter_prediction_samples(ectx, ectx->shdr, ectx->prediction,
+                                    cb->x,cb->y, // int xC,int yC,
+                                    0,0,         // int xB,int yB,
+                                    1<<cb->log2Size, // int nCS,
+                                    1<<cb->log2Size,
+                                    1<<cb->log2Size, // int nPbW,int nPbH,
+                                    &vec);
+
+  // --- create residual ---
+
+  // TODO estimate rate for sending MV
+
+  int IntraSplitFlag = 0;
+  int MaxTrafoDepth = ectx->sps.max_transform_hierarchy_depth_inter;
+
+  mCodeResidual=true; // residual coding is forced on here, so the else-branch below is never taken
+  if (mCodeResidual) {
+    assert(mTBSplitAlgo); // same guard as in Algo_PB_MV_Test::analyze
+    cb->transform_tree = mTBSplitAlgo->analyze(ectx,ctxModel, ectx->imgdata->input, NULL, cb,
+                                               cb->x,cb->y,cb->x,cb->y, cb->log2Size,0,
+                                               0, MaxTrafoDepth, IntraSplitFlag);
+
+    cb->inter.rqt_root_cbf = ! cb->transform_tree->isZeroBlock();
+
+    cb->distortion = cb->transform_tree->distortion;
+    cb->rate       = cb->transform_tree->rate;
+  }
+  else {
+    const de265_image* input = ectx->imgdata->input;
+    de265_image* img   = ectx->img;
+    int x0 = cb->x;
+    int y0 = cb->y;
+    int tbSize = 1<<cb->log2Size;
+
+    cb->distortion = compute_distortion_ssd(input, img, x0,y0, cb->log2Size, 0);
+    cb->rate = 5; // fake (MV)
+
+    cb->inter.rqt_root_cbf = 0;
+  }
+
+  delete[] bits_h;
+  delete[] bits_v;
+
+  return cb;
+}
diff --git a/libde265/encoder/algo/pb-mv.h b/libde265/encoder/algo/pb-mv.h
new file mode 100644
index 0000000..27454c4
--- /dev/null
+++ b/libde265/encoder/algo/pb-mv.h
@@ -0,0 +1,178 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef PB_MV_H
+#define PB_MV_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/algo.h"
+
+
+// ========== PB motion vector selection ==========
+
+class Algo_TB_Split;
+
+
+class Algo_PB_MV : public Algo_PB // common base of the PB motion-vector algorithms in this header
+{
+ public:
+ Algo_PB_MV() : mTBSplitAlgo(NULL) { } // no child algorithm until setChildAlgo() is called
+  virtual ~Algo_PB_MV() { }            // the child algorithm is not deleted here
+
+  void setChildAlgo(Algo_TB_Split* algo) { mTBSplitAlgo = algo; }
+
+ protected:
+  Algo_TB_Split* mTBSplitAlgo; // used by the derived analyze() implementations to code the residual
+};
+
+
+
+
+enum MVTestMode // how Algo_PB_MV_Test::analyze() chooses the motion vector
+  {
+    MVTestMode_Zero,        // (0, 0)
+    MVTestMode_Random,      // both components random within [-range, +range]
+    MVTestMode_Horizontal,  // (range, 0)
+    MVTestMode_Vertical     // (0, range)
+  };
+
+class option_MVTestMode : public choice_option<enum MVTestMode> // maps option strings to MVTestMode values
+{
+ public:
+  option_MVTestMode() {
+    add_choice("zero",   MVTestMode_Zero);
+    add_choice("random", MVTestMode_Random);
+    add_choice("horiz",  MVTestMode_Horizontal, true); // NOTE(review): 'true' presumably marks the default - confirm in choice_option
+    add_choice("verti",  MVTestMode_Vertical);
+  }
+};
+
+
+class Algo_PB_MV_Test : public Algo_PB_MV // test algorithm: motion vector selected by an MVTestMode, no search
+{
+ public:
+ Algo_PB_MV_Test() : mCodeResidual(false) { }
+
+  struct params
+  {
+    params() { // option IDs "PB-MV-TestMode" and "PB-MV-Range"; range defaults to 4
+      testMode.set_ID("PB-MV-TestMode");
+      range.set_ID   ("PB-MV-Range");
+      range.set_default(4);
+    }
+
+    option_MVTestMode testMode;
+    option_int        range; // amplitude of the generated vector, see analyze()
+  };
+
+  void registerParams(config_parameters& config) { // registers the addresses of the mParams members
+    config.add_option(&mParams.testMode);
+    config.add_option(&mParams.range);
+  }
+
+  void setParams(const params& p) { mParams=p; }
+
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb,
+                          int PBidx, int x,int y,int w,int h); // PB index plus position and size of the PB
+
+ private:
+  params mParams;
+
+  bool mCodeResidual; // false after construction; analyze() in pb-mv.cc sets it to true before reading it
+};
+
+
+
+
+enum MVSearchAlgo // selectable via option_MVSearchAlgo; Algo_PB_MV_Search::analyze() reads it but does not branch on it
+  {
+    MVSearchAlgo_Zero,     // option string "zero"
+    MVSearchAlgo_Full,     // option string "full"
+    MVSearchAlgo_Diamond,  // option string "diamond"
+    MVSearchAlgo_PMVFast   // option string "pmvfast"
+  };
+
+class option_MVSearchAlgo : public choice_option<enum MVSearchAlgo> // maps option strings to MVSearchAlgo values
+{
+ public:
+  option_MVSearchAlgo() {
+    add_choice("zero",   MVSearchAlgo_Zero);
+    add_choice("full",   MVSearchAlgo_Full, true); // NOTE(review): 'true' presumably marks the default - confirm in choice_option
+    add_choice("diamond",MVSearchAlgo_Diamond);
+    add_choice("pmvfast",MVSearchAlgo_PMVFast);
+  }
+};
+
+
+class Algo_PB_MV_Search : public Algo_PB_MV // motion search within a configurable horizontal/vertical range
+{
+ public:
+ Algo_PB_MV_Search() : mCodeResidual(false) { }
+
+  struct params
+  {
+    params() { // option IDs "PB-MV-Search-Algo", "-HRange", "-VRange"; both ranges default to 8
+      mvSearchAlgo.set_ID("PB-MV-Search-Algo");
+      hrange.set_ID      ("PB-MV-Search-HRange");
+      vrange.set_ID      ("PB-MV-Search-VRange");
+      hrange.set_default(8);
+      vrange.set_default(8);
+    }
+
+    option_MVSearchAlgo mvSearchAlgo;
+    option_int        hrange; // analyze() searches x-hrange .. x+hrange
+    option_int        vrange; // analyze() searches y-vrange .. y+vrange
+  };
+
+  void registerParams(config_parameters& config) { // registers the addresses of the mParams members
+    config.add_option(&mParams.mvSearchAlgo);
+    config.add_option(&mParams.hrange);
+    config.add_option(&mParams.vrange);
+  }
+
+  void setParams(const params& p) { mParams=p; }
+
+  virtual enc_cb* analyze(encoder_context*,
+                          context_model_table&,
+                          enc_cb* cb,
+                          int PBidx, int x,int y,int w,int h); // PB index plus position and size of the PB
+
+ private:
+  params mParams;
+
+  bool mCodeResidual; // false after construction; analyze() in pb-mv.cc sets it to true before reading it
+};
+
+#endif
diff --git a/libde265/encoder/algo/tb-intrapredmode.cc b/libde265/encoder/algo/tb-intrapredmode.cc
new file mode 100644
index 0000000..4662ce1
--- /dev/null
+++ b/libde265/encoder/algo/tb-intrapredmode.cc
@@ -0,0 +1,524 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/analyze.h"
+#include "libde265/encoder/encoder-context.h"
+#include "libde265/encoder/algo/tb-split.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+#include <algorithm>
+#include <iostream>
+
+
+// Returns the intra prediction mode whose prediction of the (1<<log2BlkSize)^2 block at
+// (x0,y0) in plane cIdx has the smallest SSD against the reference pixels 'ref'.
+// The three candidate modes from fillIntraPredModeCandidates() are tried first with
+// their cost halved, which biases the result towards them; then all 35 modes compete
+// at full cost. Every trial runs decode_intra_prediction() on 'img' and measures what
+// is read back from 'img', so the block content of 'img' changes as a side effect.
+enum IntraPredMode find_best_intra_mode(de265_image& img,int x0,int y0, int log2BlkSize, int cIdx,
+                                        const uint8_t* ref, int stride)
+{
+  const int blkSize = 1<<log2BlkSize;
+  const seq_parameter_set* sps = &img.sps;
+
+  enum IntraPredMode best_mode;
+  int min_sad=-1; // negative means "no mode evaluated yet"
+
+  int candidates[3];
+  fillIntraPredModeCandidates(candidates, x0,y0,
+                              sps->getPUIndexRS(x0,y0),
+                              x0>0, y0>0, &img);
+
+  // --- test candidates first ---
+
+  for (int idx=0;idx<3;idx++) {
+    enum IntraPredMode mode = (enum IntraPredMode)candidates[idx];
+    decode_intra_prediction(&img, x0,y0, mode, blkSize, cIdx);
+
+    uint32_t distortion = SSD(ref,stride,
+                              img.get_image_plane_at_pos(cIdx, x0,y0), img.get_image_stride(cIdx),
+                              blkSize, blkSize);
+
+    int sad=distortion;
+    sad *= 0.5; // candidates compete with half their cost
+
+    // Accept the first candidate unconditionally, later ones only if cheaper.
+    // The guard must detect "nothing evaluated yet"; it must not depend on the
+    // candidate's mode number (a guard on mode number 0 would skip most candidates).
+    if (min_sad<0 || sad<min_sad) {
+      min_sad = sad;
+      best_mode = mode;
+    }
+  }
+
+  // --- test all modes ---
+  for (int idx=0;idx<35;idx++) {
+    enum IntraPredMode mode = (enum IntraPredMode)idx;
+    decode_intra_prediction(&img, x0,y0, mode, blkSize, cIdx);
+
+    uint32_t distortion = SSD(ref,stride,
+                              img.get_image_plane_at_pos(cIdx, x0,y0), img.get_image_stride(cIdx),
+                              blkSize, blkSize);
+
+    int sad=distortion;
+    if (min_sad<0 || sad<min_sad) {
+      min_sad = sad;
+      best_mode = mode;
+    }
+  }
+
+  return best_mode;
+}
+
+
+
+
+// Estimates the cost of the luma residual of the block at (x0,y0): compares 'input'
+// with the current content of ectx->img (plane 0) using the metric selected by
+// 'method'. SSD and SAD are returned directly; for the SATD variants the difference
+// block is transformed (forward DCT or Hadamard) and the absolute coefficient values
+// are summed. A method value handled by no branch trips the assert at the end.
+float estim_TB_bitrate(const encoder_context* ectx,
+                       const de265_image* input,
+                       int x0,int y0, int log2BlkSize,
+                       enum TBBitrateEstimMethod method)
+{
+  const int blkSize = 1<<log2BlkSize;
+
+  if (method == TBBitrateEstim_SSD) {
+    return SSD(input->get_image_plane_at_pos(0, x0,y0),
+               input->get_image_stride(0),
+               ectx->img->get_image_plane_at_pos(0, x0,y0),
+               ectx->img->get_image_stride(0),
+               blkSize, blkSize);
+  }
+
+  if (method == TBBitrateEstim_SAD) {
+    return SAD(input->get_image_plane_at_pos(0, x0,y0),
+               input->get_image_stride(0),
+               ectx->img->get_image_plane_at_pos(0, x0,y0),
+               ectx->img->get_image_stride(0),
+               blkSize, blkSize);
+  }
+
+  const bool useHadamard = (method == TBBitrateEstim_SATD_Hadamard);
+
+  if (useHadamard || method == TBBitrateEstim_SATD_DCT) {
+    int16_t coeffs[32*32];
+    int16_t diff[32*32];
+
+    // difference input minus ectx->img, stored densely (row stride == blkSize)
+    diff_blk(diff,blkSize,
+             input->get_image_plane_at_pos(0, x0,y0), input->get_image_stride(0),
+             ectx->img->get_image_plane_at_pos(0, x0,y0), ectx->img->get_image_stride(0),
+             blkSize);
+
+    // the transform tables are indexed by log2 block size minus 2
+    const int tableIdx = log2BlkSize-2;
+
+    if (useHadamard) {
+      ectx->acceleration.hadamard_transform_8[tableIdx](coeffs, diff, &diff[blkSize] - &diff[0]);
+    }
+    else {
+      ectx->acceleration.fwd_transform_8[tableIdx](coeffs, diff, &diff[blkSize] - &diff[0]);
+    }
+
+    float sumAbs=0;
+    for (int k=0;k<blkSize*blkSize;k++) {
+      sumAbs += abs_value((int)coeffs[k]);
+    }
+
+    return sumAbs;
+  }
+
+  // Not reached for the four enumerators of TBBitrateEstimMethod; this only
+  // catches a value outside of the enum.
+
+  assert(false);
+  return 0;
+}
+// Brute-force intra mode decision: at the tree level where the mode is chosen, runs the
+// child TB-split analysis once per enabled mode and keeps the mode with the lowest
+// distortion + lambda*rate; at any other level it only forwards to the child algorithm.
+enc_tb*
+Algo_TB_IntraPredMode_BruteForce::analyze(encoder_context* ectx,
+                                          context_model_table& ctxModel,
+                                          const de265_image* input,
+                                          const enc_tb* parent,
+                                          enc_cb* cb,
+                                          int x0,int y0, int xBase,int yBase,
+                                          int log2TbSize, int blkIdx,
+                                          int TrafoDepth, int MaxTrafoDepth,
+                                          int IntraSplitFlag)
+{
+  //printf("encode_transform_tree_may_split %d %d (%d %d) size %d\n",x0,y0,xBase,yBase,1<<log2TbSize);
+
+  /*
+    enum IntraPredMode pre_intraMode = find_best_intra_mode(ectx->img,x0,y0, log2TbSize, 0,
+    input->get_image_plane_at_pos(0,x0,y0),
+    input->get_image_stride(0));
+  */
+  // The mode is selected at TrafoDepth 0 for PART_2Nx2N and at TrafoDepth 1 for PART_NxN intra CBs.
+  bool selectIntraPredMode = false;
+  selectIntraPredMode |= (cb->PredMode==MODE_INTRA && cb->PartMode==PART_2Nx2N && TrafoDepth==0);
+  selectIntraPredMode |= (cb->PredMode==MODE_INTRA && cb->PartMode==PART_NxN   && TrafoDepth==1);
+
+  if (selectIntraPredMode) {
+    enc_tb* tb[35]; // one analysis result per mode; NULL for disabled modes
+
+    float minCost = std::numeric_limits<float>::max();
+    int   minCostIdx=0;
+    float minCandCost; // only referenced by commented-out code below
+
+    const de265_image* img = ectx->img;
+    const seq_parameter_set* sps = &img->sps;
+    int candidates[3];
+    fillIntraPredModeCandidates(candidates, x0,y0,
+                                sps->getPUIndexRS(x0,y0),
+                                x0>0, y0>0, img);
+
+    // --- run the child analysis for every enabled mode ---
+    for (int i = 0; i<35; i++) {
+      if (!mPredMode_enabled[i]) {
+        tb[i]=NULL;
+        continue;
+      }
+
+      // each trial works on its own copy of the context models
+      context_model_table ctxIntra = ctxModel.copy();
+      //copy_context_model_table(ctxIntra, ctxModel);
+
+      enum IntraPredMode intraMode = (IntraPredMode)i;
+
+      cb->intra.pred_mode[blkIdx] = intraMode;
+      if (blkIdx==0) { cb->intra.chroma_mode = intraMode; }
+
+      ectx->img->set_IntraPredMode(x0,y0,log2TbSize, intraMode);
+
+      tb[intraMode] = mTBSplitAlgo->analyze(ectx,ctxIntra,input,parent,
+                                            cb, x0,y0, xBase,yBase, log2TbSize, blkIdx,
+                                            TrafoDepth, MaxTrafoDepth, IntraSplitFlag);
+      // NOTE(review): 'sad' is only computed for 8x8 blocks and never read afterwards; the
+      // decode_intra_prediction() call presumably still writes into ectx->img -- confirm before removing.
+      float sad;
+      if ((1<<log2TbSize)==8) {
+        decode_intra_prediction(ectx->img, x0,y0, intraMode, 1<<log2TbSize, 0);
+        sad = estim_TB_bitrate(ectx,input, x0,y0, log2TbSize, TBBitrateEstim_SAD);
+      }
+
+
+      float rate = tb[intraMode]->rate;
+      int enc_bin;
+
+      if (log2TbSize==3) {
+        // printf("RATE2 %d %f %f\n",log2TbSize,tb[intraMode]->rate,sad);
+      }
+      // adds 1/2/2 to the rate for the three candidate modes and 5 for any other mode
+      /**/ if (candidates[0]==intraMode) { rate += 1; enc_bin=1; }
+      else if (candidates[1]==intraMode) { rate += 2; enc_bin=1; }
+      else if (candidates[2]==intraMode) { rate += 2; enc_bin=1; }
+      else { rate += 5; enc_bin=0; }
+
+      CABAC_encoder_estim estim;
+      estim.set_context_models(&ctxIntra);
+      rate += estim.RDBits_for_CABAC_bin(CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, enc_bin);
+
+      float cost = tb[intraMode]->distortion + ectx->lambda * rate;
+      if (cost<minCost) {
+        minCost=cost;
+        minCostIdx=intraMode;
+        //minCandCost=c;
+      }
+    }
+
+    // --- commit the winning mode and reconstruct from its analysis result ---
+    enum IntraPredMode intraMode = (IntraPredMode)minCostIdx;
+
+    cb->intra.pred_mode[blkIdx] = intraMode;
+    if (blkIdx==0) { cb->intra.chroma_mode  = intraMode; } //INTRA_CHROMA_LIKE_LUMA;
+    ectx->img->set_IntraPredMode(x0,y0,log2TbSize, intraMode);
+
+    tb[minCostIdx]->reconstruct(ectx, ectx->img,
+                                cb, blkIdx);
+    // NOTE(review): unlike Algo_TB_IntraPredMode_FastBrute::analyze, the winner's context
+    // copy is not written back to ctxModel here -- confirm that this is intended.
+    //printf("INTRA %d %d  %d\n",pre_intraMode,intraMode,minCandCost);
+    // free the results of all losing modes (delete on the NULL entries is a no-op)
+    for (int i = 0; i<35; i++) {
+      if (i != minCostIdx) {
+        delete tb[i];
+      }
+    }
+
+    return tb[minCostIdx];
+  }
+  else {
+    return mTBSplitAlgo->analyze(ectx, ctxModel, input, parent, cb,
+                                 x0,y0,xBase,yBase, log2TbSize,
+                                 blkIdx, TrafoDepth, MaxTrafoDepth,
+                                 IntraSplitFlag);
+  }
+  // not reached: both branches above return
+  assert(false);
+  return nullptr;
+}
+
+// Minimum-residual intra mode decision. At the tree level where the mode is chosen,
+// the block is predicted with each of the 35 modes, every prediction is rated with
+// estim_TB_bitrate() and the mode with the lowest estimate is stored in 'cb' and in
+// ectx->img before the child TB-split analysis runs. Elsewhere it only forwards.
+enc_tb*
+Algo_TB_IntraPredMode_MinResidual::analyze(encoder_context* ectx,
+                                           context_model_table& ctxModel,
+                                           const de265_image* input,
+                                           const enc_tb* parent,
+                                           enc_cb* cb,
+                                           int x0,int y0, int xBase,int yBase,
+                                           int log2TbSize, int blkIdx,
+                                           int TrafoDepth, int MaxTrafoDepth,
+                                           int IntraSplitFlag)
+{
+  // The mode is selected at TrafoDepth 0 for PART_2Nx2N and at TrafoDepth 1 for PART_NxN.
+  const bool atModeLevel = (cb->PredMode==MODE_INTRA &&
+                            ((cb->PartMode==PART_2Nx2N && TrafoDepth==0) ||
+                             (cb->PartMode==PART_NxN   && TrafoDepth==1)));
+
+  if (!atModeLevel) {
+    // nothing to decide here: hand over to the TB-split algorithm unchanged
+    return mTBSplitAlgo->analyze(ectx, ctxModel, input, parent, cb,
+                                 x0,y0,xBase,yBase, log2TbSize,
+                                 blkIdx, TrafoDepth, MaxTrafoDepth,
+                                 IntraSplitFlag);
+  }
+
+  const int blkSize = 1<<log2TbSize;
+
+  enum IntraPredMode bestMode;
+  float bestEstimate;
+
+  // NOTE(review): all 35 modes are tried here; the mPredMode_enabled flags of the
+  // ModeSubset base class are not consulted by this algorithm.
+  for (int m=0;m<35;m++) {
+    const enum IntraPredMode candidate = (enum IntraPredMode)m;
+    decode_intra_prediction(ectx->img, x0,y0, candidate, blkSize, 0);
+
+    const float estimate = estim_TB_bitrate(ectx, input, x0,y0, log2TbSize,
+                                            mParams.bitrateEstimMethod());
+
+    // the first mode seeds the minimum; on ties the lower-numbered mode is kept
+    if (m==0 || estimate<bestEstimate) {
+      bestEstimate = estimate;
+      bestMode = candidate;
+    }
+  }
+
+  // NOTE(review): decode_intra_prediction() presumably writes into ectx->img, which would
+  // leave the prediction of mode 34 (the last one tried) in the image at this point.
+  // store the decision in the coding block and in the image metadata
+  cb->intra.pred_mode[blkIdx] = bestMode;
+  if (blkIdx==0) {
+    cb->intra.chroma_mode = bestMode;
+  }
+
+  ectx->img->set_IntraPredMode(x0,y0,log2TbSize, bestMode);
+
+  // Note: cannot prepare intra prediction pixels here, because this has to
+  // be done at the lowest TB split level.
+
+  enc_tb* result = mTBSplitAlgo->analyze(ectx,ctxModel,input,parent,
+                                         cb, x0,y0, xBase,yBase, log2TbSize, blkIdx,
+                                         TrafoDepth, MaxTrafoDepth, IntraSplitFlag);
+
+  debug_show_image(ectx->img, 0);
+
+  return result;
+}
+// Comparison for std::sort: orders (mode, value) pairs ascending by their value (.second).
+static bool sortDistortions(std::pair<enum IntraPredMode,float> i,
+                            std::pair<enum IntraPredMode,float> j)
+{
+  return j.second > i.second;
+}
+// Two-stage intra mode decision: a cheap estimate shortlists modes, then the child TB-split analysis runs on the shortlist plus the three candidate modes; lowest distortion + lambda*rate wins.
+enc_tb*
+Algo_TB_IntraPredMode_FastBrute::analyze(encoder_context* ectx,
+                                         context_model_table& ctxModel,
+                                         const de265_image* input,
+                                         const enc_tb* parent,
+                                         enc_cb* cb,
+                                         int x0,int y0, int xBase,int yBase,
+                                         int log2TbSize, int blkIdx,
+                                         int TrafoDepth, int MaxTrafoDepth,
+                                         int IntraSplitFlag)
+{
+  //printf("encode_transform_tree_may_split %d %d (%d %d) size %d\n",x0,y0,xBase,yBase,1<<log2TbSize);
+
+  /*
+    enum IntraPredMode pre_intraMode = find_best_intra_mode(ectx->img,x0,y0, log2TbSize, 0,
+    input->get_image_plane_at_pos(0,x0,y0),
+    input->get_image_stride(0));
+  */
+  // The mode is selected at TrafoDepth 0 for PART_2Nx2N and at TrafoDepth 1 for PART_NxN intra CBs.
+  bool selectIntraPredMode = false;
+  selectIntraPredMode |= (cb->PredMode==MODE_INTRA && cb->PartMode==PART_2Nx2N && TrafoDepth==0);
+  selectIntraPredMode |= (cb->PredMode==MODE_INTRA && cb->PartMode==PART_NxN   && TrafoDepth==1);
+
+  if (selectIntraPredMode) {
+    float minCost = std::numeric_limits<float>::max();
+    int   minCostIdx=0;
+    float minCandCost; // only referenced by commented-out code below
+
+    const de265_image* img = ectx->img;
+    const seq_parameter_set* sps = &img->sps;
+    int candidates[3];
+    fillIntraPredModeCandidates(candidates, x0,y0,
+                                sps->getPUIndexRS(x0,y0),
+                                x0>0, y0>0, img);
+
+    // --- pre-selection: rate every enabled mode that is not one of the three candidates ---
+
+    std::vector< std::pair<enum IntraPredMode,float> > distortions;
+
+    for (int idx=0;idx<35;idx++)
+      if (idx!=candidates[0] && idx!=candidates[1] && idx!=candidates[2] && mPredMode_enabled[idx])
+        {
+          enum IntraPredMode mode = (enum IntraPredMode)idx;
+          decode_intra_prediction(ectx->img, x0,y0, (enum IntraPredMode)mode, 1<<log2TbSize, 0);
+
+          float distortion;
+          distortion = estim_TB_bitrate(ectx, input, x0,y0, log2TbSize,
+                                        mParams.bitrateEstimMethod());
+
+          distortions.push_back( std::make_pair((enum IntraPredMode)idx, distortion) );
+        }
+    // ascending by estimate, so the cheapest modes come first
+    std::sort( distortions.begin(), distortions.end(), sortDistortions );
+
+
+    for (int i=0;i<distortions.size();i++)
+      {
+        //printf("%d -> %f\n",i,distortions[i].second);
+      }
+    // keep the keepNBest cheapest modes and always append the three candidate modes
+    int keepNBest=std::min((int)mParams.keepNBest, (int)distortions.size());
+    distortions.resize(keepNBest);
+    distortions.push_back(std::make_pair((enum IntraPredMode)candidates[0],0));
+    distortions.push_back(std::make_pair((enum IntraPredMode)candidates[1],0));
+    distortions.push_back(std::make_pair((enum IntraPredMode)candidates[2],0));
+
+    // analysis results and context-model copies, both indexed by mode number
+    enc_tb* tb[35];
+    context_model_table contexts[35];
+
+    for (int i=0;i<35;i++) tb[i]=NULL;
+    // --- run the child analysis for every mode that is left in the list ---
+    for (int i=0;i<distortions.size();i++) {
+
+      //copy_context_model_table(ctxIntra, ctxModel);
+
+      enum IntraPredMode intraMode = (IntraPredMode)distortions[i].first;
+      // the candidate modes were appended without checking their enable flag
+      if (!mPredMode_enabled[intraMode]) { continue; }
+
+      cb->intra.pred_mode[blkIdx] = intraMode;
+      if (blkIdx==0) { cb->intra.chroma_mode = intraMode; }
+
+      ectx->img->set_IntraPredMode(x0,y0,log2TbSize, intraMode);
+
+      contexts[intraMode] = ctxModel.copy();
+      tb[intraMode] = mTBSplitAlgo->analyze(ectx,contexts[intraMode],input,parent,
+                                            cb, x0,y0, xBase,yBase, log2TbSize, blkIdx,
+                                            TrafoDepth, MaxTrafoDepth, IntraSplitFlag);
+
+      float rate = tb[intraMode]->rate_withoutCbfChroma;
+      int enc_bin;
+      // adds 1/2/2 to the rate for the three candidate modes and 5 for any other mode
+      /**/ if (candidates[0]==intraMode) { rate += 1; enc_bin=1; }
+      else if (candidates[1]==intraMode) { rate += 2; enc_bin=1; }
+      else if (candidates[2]==intraMode) { rate += 2; enc_bin=1; }
+      else { rate += 5; enc_bin=0; }
+
+      CABAC_encoder_estim estim;
+      estim.set_context_models(&contexts[intraMode]);
+      //rate += estim.RDBits_for_CABAC_bin(CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, enc_bin);
+      logtrace(LogSymbols,"$1 prev_intra_luma_pred_flag=%d\n",enc_bin);
+      estim.write_CABAC_bit(CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, enc_bin);
+
+      // TODO: currently we make the chroma-pred-mode decision for each part even
+      // in NxN part mode. Since we always set this to the same value, it does not
+      // matter. However, we should only add the rate for it once (for blkIdx=0).
+
+      if (blkIdx==0) {
+        logtrace(LogSymbols,"$1 intra_chroma_pred_mode=%d\n",0);
+        estim.write_CABAC_bit(CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE,0);
+      }
+      rate += estim.getRDBits();
+      // store the increased rate in both rate fields while keeping their difference unchanged
+      float cbfRate = tb[intraMode]->rate - tb[intraMode]->rate_withoutCbfChroma;
+      tb[intraMode]->rate_withoutCbfChroma = rate;
+      tb[intraMode]->rate = tb[intraMode]->rate_withoutCbfChroma + cbfRate;
+
+      //printf("QQQ %f %f\n", b, estim.getRDBits());
+
+      float cost = tb[intraMode]->distortion + ectx->lambda * rate;
+
+      //printf("idx:%d mode:%d cost:%f\n",i,intraMode,cost);
+
+      if (cost<minCost) {
+        minCost=cost;
+        minCostIdx=intraMode;
+        //minCandCost=c;
+      }
+    }
+
+    // --- commit the winner: mode, reconstruction and its context models ---
+    enum IntraPredMode intraMode = (IntraPredMode)minCostIdx;
+
+    cb->intra.pred_mode[blkIdx] = intraMode;
+    if (blkIdx==0) { cb->intra.chroma_mode  = intraMode; } //INTRA_CHROMA_LIKE_LUMA;
+    ectx->img->set_IntraPredMode(x0,y0,log2TbSize, intraMode);
+    // NOTE(review): if every listed mode was skipped as disabled, tb[minCostIdx] is still NULL here -- confirm this cannot happen
+    tb[minCostIdx]->reconstruct(ectx, ectx->img, cb, blkIdx);
+    ctxModel = contexts[minCostIdx];
+    // free the results of all losing modes (delete on the NULL entries is a no-op)
+    for (int i = 0; i<35; i++) {
+      if (i != minCostIdx) {
+        delete tb[i];
+      }
+    }
+
+    return tb[minCostIdx];
+  }
+  else {
+    return mTBSplitAlgo->analyze(ectx, ctxModel, input, parent, cb,
+                                 x0,y0,xBase,yBase, log2TbSize,
+                                 blkIdx, TrafoDepth, MaxTrafoDepth,
+                                 IntraSplitFlag);
+
+  }
+  // not reached: both branches above return
+  assert(false);
+  return nullptr;
+}
diff --git a/libde265/encoder/algo/tb-intrapredmode.h b/libde265/encoder/algo/tb-intrapredmode.h
new file mode 100644
index 0000000..eea76f2
--- /dev/null
+++ b/libde265/encoder/algo/tb-intrapredmode.h
@@ -0,0 +1,281 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TB_INTRAPREDMODE_H
+#define TB_INTRAPREDMODE_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+
+// ========== TB intra prediction mode ==========
+
+enum ALGO_TB_IntraPredMode {
+  ALGO_TB_IntraPredMode_BruteForce,  // option string "brute-force"
+  ALGO_TB_IntraPredMode_FastBrute,   // option string "fast-brute"
+  ALGO_TB_IntraPredMode_MinResidual  // option string "min-residual"
+};
+// Config option that selects one of the ALGO_TB_IntraPredMode algorithms by name.
+class option_ALGO_TB_IntraPredMode : public choice_option<enum ALGO_TB_IntraPredMode>
+{
+ public:
+  option_ALGO_TB_IntraPredMode() {
+    add_choice("min-residual",ALGO_TB_IntraPredMode_MinResidual);
+    add_choice("brute-force" ,ALGO_TB_IntraPredMode_BruteForce);
+    add_choice("fast-brute"  ,ALGO_TB_IntraPredMode_FastBrute, true); // NOTE(review): 'true' presumably marks the default choice -- confirm in choice_option
+  }
+};
+
+// Metrics that estim_TB_bitrate() can apply to the difference between input and ectx->img.
+enum TBBitrateEstimMethod {
+  //TBBitrateEstim_AccurateBits,
+  TBBitrateEstim_SSD,            // result of SSD() on the two blocks
+  TBBitrateEstim_SAD,            // result of SAD() on the two blocks
+  TBBitrateEstim_SATD_DCT,       // sum of absolute coefficients after fwd_transform_8
+  TBBitrateEstim_SATD_Hadamard   // sum of absolute coefficients after hadamard_transform_8
+};
+// Config option that selects a TBBitrateEstimMethod by name.
+class option_TBBitrateEstimMethod : public choice_option<enum TBBitrateEstimMethod>
+{
+ public:
+  option_TBBitrateEstimMethod() {
+    add_choice("ssd",TBBitrateEstim_SSD);
+    add_choice("sad",TBBitrateEstim_SAD);
+    add_choice("satd-dct",TBBitrateEstim_SATD_DCT);
+    add_choice("satd",TBBitrateEstim_SATD_Hadamard, true); // NOTE(review): 'true' presumably marks the default choice -- confirm in choice_option
+  }
+};
+
+class Algo_TB_Split;
+
+
+/** Base class for intra prediction-mode algorithms.
+    Selects one of the 35 prediction modes.
+ */
+class Algo_TB_IntraPredMode
+{
+ public:
+  Algo_TB_IntraPredMode() : mTBSplitAlgo(NULL) { }
+  virtual ~Algo_TB_IntraPredMode() { }
+  // Analyzes the TB at (x0,y0) of size 1<<log2TbSize; implemented by the derived algorithms.
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag) = 0;
+  // Sets the TB-split algorithm the derived analyze() methods call; the pointer is not deleted by this class.
+  void setChildAlgo(Algo_TB_Split* algo) { mTBSplitAlgo = algo; }
+
+ protected:
+  Algo_TB_Split* mTBSplitAlgo; // NULL until setChildAlgo() is called
+};
+
+
+enum ALGO_TB_IntraPredMode_Subset {
+  ALGO_TB_IntraPredMode_Subset_All,     // all 35 modes
+  ALGO_TB_IntraPredMode_Subset_HVPlus,  // INTRA_DC, INTRA_PLANAR, INTRA_ANGULAR_10 and INTRA_ANGULAR_26
+  ALGO_TB_IntraPredMode_Subset_DC,      // INTRA_DC only
+  ALGO_TB_IntraPredMode_Subset_Planar   // INTRA_PLANAR only
+};
+// Config option that selects an ALGO_TB_IntraPredMode_Subset by name.
+class option_ALGO_TB_IntraPredMode_Subset : public choice_option<enum ALGO_TB_IntraPredMode_Subset>
+{
+ public:
+  option_ALGO_TB_IntraPredMode_Subset() {
+    add_choice("all"   ,ALGO_TB_IntraPredMode_Subset_All, true); // NOTE(review): 'true' presumably marks the default choice -- confirm in choice_option
+    add_choice("HV+"   ,ALGO_TB_IntraPredMode_Subset_HVPlus);
+    add_choice("DC"    ,ALGO_TB_IntraPredMode_Subset_DC);
+    add_choice("planar",ALGO_TB_IntraPredMode_Subset_Planar);
+  }
+};
+
+
+/** Base for intra prediction-mode algorithms that only consider a subset of the 35 modes.
+    Keeps one enable flag per mode; all modes are enabled after construction.
+ */
+class Algo_TB_IntraPredMode_ModeSubset : public Algo_TB_IntraPredMode
+{
+ public:
+  Algo_TB_IntraPredMode_ModeSubset() { setAllIntraPredModes(true); }
+
+  // Clears the enable flag of every mode.
+  void disableAllIntraPredModes() { setAllIntraPredModes(false); }
+
+  // Sets the enable flag of one mode; 'mode' is used as array index without a range check.
+  void enableIntraPredMode(int mode, bool flag=true) {
+    mPredMode_enabled[mode] = flag;
+  }
+
+  // Replaces the current selection by one of the predefined subsets.
+  void enableIntraPredModeSubset(enum ALGO_TB_IntraPredMode_Subset subset) {
+    const bool onlyDC     = (subset == ALGO_TB_IntraPredMode_Subset_DC);
+    const bool onlyPlanar = (subset == ALGO_TB_IntraPredMode_Subset_Planar);
+    const bool hvPlus     = (subset == ALGO_TB_IntraPredMode_Subset_HVPlus);
+
+    if (subset == ALGO_TB_IntraPredMode_Subset_All) { // activate all is the default
+      setAllIntraPredModes(true);
+    }
+    else if (onlyDC || onlyPlanar || hvPlus) {
+      setAllIntraPredModes(false);
+
+      if (onlyDC     || hvPlus) { mPredMode_enabled[INTRA_DC]     = true; }
+      if (onlyPlanar || hvPlus) { mPredMode_enabled[INTRA_PLANAR] = true; }
+      if (hvPlus) {
+        mPredMode_enabled[INTRA_ANGULAR_10] = true;
+        mPredMode_enabled[INTRA_ANGULAR_26] = true;
+      }
+    }
+  }
+
+ protected:
+  bool mPredMode_enabled[35];
+
+ private:
+  // Writes 'flag' into the enable flag of all 35 modes.
+  void setAllIntraPredModes(bool flag) {
+    for (int i=0;i<35;i++) {
+      mPredMode_enabled[i] = flag;
+    }
+  }
+};
+
+
+/** Algorithm that brute-forces through all enabled intra prediction modes.
+ */
+class Algo_TB_IntraPredMode_BruteForce : public Algo_TB_IntraPredMode_ModeSubset
+{
+ public:
+  // Runs the child TB-split analysis once per enabled mode and returns the cheapest result.
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag);
+};
+
+
+/** Algorithm that makes a quick pre-selection of modes and then brute-forces through them.
+ */
+class Algo_TB_IntraPredMode_FastBrute : public Algo_TB_IntraPredMode_ModeSubset
+{
+ public:
+  // Options: how many pre-selected modes are kept and which metric rates them.
+  struct params
+  {
+    params() {
+      keepNBest.set_ID("IntraPredMode-FastBrute-keepNBest");
+      keepNBest.set_range(0,32);
+      keepNBest.set_default(5);
+      // the metric option gets its choices from option_TBBitrateEstimMethod
+      bitrateEstimMethod.set_ID("IntraPredMode-FastBrute-estimator");
+    }
+
+    option_TBBitrateEstimMethod bitrateEstimMethod;
+    option_int keepNBest;
+  };
+  // Hands pointers to the two options stored in mParams to config.add_option().
+  void registerParams(config_parameters& config) {
+    config.add_option(&mParams.keepNBest);
+    config.add_option(&mParams.bitrateEstimMethod);
+  }
+
+  void setParams(const params& p) { mParams=p; }
+
+  // Shortlists modes with the cheap metric, then runs the child analysis on the shortlist.
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag);
+
+ private:
+  params mParams;
+};
+
+
+/** Algorithm that selects the intra prediction mode on minimum residual only.
+ */
+class Algo_TB_IntraPredMode_MinResidual : public Algo_TB_IntraPredMode_ModeSubset
+{
+ public:
+  // Single option: the metric that rates each mode's prediction.
+  struct params
+  {
+    params() {
+      bitrateEstimMethod.set_ID("IntraPredMode-MinResidual-estimator");
+    }
+
+    option_TBBitrateEstimMethod bitrateEstimMethod;
+  };
+
+  void setParams(const params& p) { mParams=p; }
+  // Hands a pointer to the option stored in mParams to config.add_option().
+  void registerParams(config_parameters& config) {
+    config.add_option(&mParams.bitrateEstimMethod);
+  }
+  // Picks the mode with the lowest estimate, then runs the child TB-split analysis once.
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag);
+
+ private:
+  params mParams;
+};
+
+#endif
diff --git a/libde265/encoder/algo/tb-split.cc b/libde265/encoder/algo/tb-split.cc
new file mode 100644
index 0000000..fe5282e
--- /dev/null
+++ b/libde265/encoder/algo/tb-split.cc
@@ -0,0 +1,525 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/analyze.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+#include <iostream>
+
+
+#define ENCODER_DEVELOPMENT 1
+
+
+// Writes the per-pixel difference (a - b) of two blkSize x blkSize 8-bit blocks into 'out'.
+void diff_blk(int16_t* out,int out_stride, const uint8_t* a_ptr, int a_stride,
+              const uint8_t* b_ptr, int b_stride, int blkSize)
+{
+  for (int row=0; row<blkSize; row++) {
+    int16_t*       dst  = out   + row*out_stride;  // each buffer is walked with its own stride
+    const uint8_t* srcA = a_ptr + row*a_stride;
+    const uint8_t* srcB = b_ptr + row*b_stride;
+    for (int col=0; col<blkSize; col++) { dst[col] = srcA[col] - srcB[col]; }
+  }
+}
+
+
+// Returns true iff at least one of the first n entries of 'data' is non-zero.
+static bool has_nonzero_value(const int16_t* data, int n)
+{
+  for (int remaining=n; remaining>0; remaining--, data++)
+    if (*data != 0) { return true; }
+  return false;
+}
+
+
+void show_debug_image(const de265_image* input, int slot);
+
+/*
+  void encode_transform_unit(encoder_context* ectx,
+  enc_tb* tb,
+  const de265_image* input,
+  //int16_t* residual, int stride,
+  int x0,int y0, // luma position
+  int log2TbSize, // chroma adapted
+  const enc_cb* cb,
+  int cIdx)
+  {
+  }
+*/
+// Computes the residual of plane cIdx for one TB, transforms and quantizes it into tb->coeff[cIdx] and sets tb->cbf[cIdx].
+void analyze_transform_unit(encoder_context* ectx,
+                            enc_tb* tb,
+                            const de265_image* input, // TODO: probably pass pixels/stride directly
+                            //int16_t* residual, int stride,
+                            int x0,int y0, // luma position
+                            int log2TbSize, // chroma adapted
+                            const enc_cb* cb,
+                            int cIdx)
+{
+  int xC = x0;
+  int yC = y0;
+  int tbSize = 1<<log2TbSize;
+  if (cIdx>0) { xC>>=1; yC>>=1; }
+  // (xC,yC): block position inside plane cIdx; for the chroma planes the luma position is halved
+  enum PredMode predMode = cb->PredMode;
+
+  int16_t blk[32*32]; // residual, written with a stride of tbSize
+
+  // --- do intra prediction ---
+
+  if (predMode==MODE_INTRA) {
+    enum IntraPredMode intraPredMode;
+
+    if (cIdx==0) {
+      intraPredMode = ectx->img->get_IntraPredMode(x0,y0);
+    }
+    else {
+      intraPredMode = cb->intra.chroma_mode;
+    }
+
+    decode_intra_prediction(ectx->img, xC,  yC,   intraPredMode,  tbSize  , cIdx);
+    // NOTE(review): the prediction is read back from ectx->img below, so it is presumably written into that image
+
+    // --- subtract prediction from input (residual = input - prediction) ---
+
+    uint8_t* pred = ectx->img->get_image_plane(cIdx);
+    int stride = ectx->img->get_image_stride(cIdx);
+
+    diff_blk(blk,tbSize,
+             input->get_image_plane_at_pos(cIdx,xC,yC), input->get_image_stride(cIdx),
+             &pred[yC*stride+xC],stride, tbSize);
+  }
+  else {
+    // --- subtract prediction (taken from ectx->prediction) from input ---
+
+    uint8_t* pred = ectx->prediction->get_image_plane(cIdx);
+    int stride = ectx->prediction->get_image_stride(cIdx);
+
+    //printBlk("input",input->get_image_plane_at_pos(cIdx,xC,yC), tbSize, input->get_image_stride(cIdx));
+    //printBlk("prediction", pred,tbSize, stride);
+
+    diff_blk(blk,tbSize,
+             input->get_image_plane_at_pos(cIdx,xC,yC), input->get_image_stride(cIdx),
+             &pred[yC*stride+xC],stride, tbSize);
+
+    //printBlk("residual", blk,tbSize,tbSize);
+  }
+
+
+  //show_debug_image(ectx->img, 0);
+
+
+
+
+  // --- forward transform ---
+
+  tb->alloc_coeff_memory(cIdx, tbSize);
+
+  int trType;
+  if (cIdx==0 && log2TbSize==2 && predMode==MODE_INTRA) trType=1; // TODO: inter mode
+  else trType=0;
+
+  fwd_transform(&ectx->acceleration, tb->coeff[cIdx], tbSize, log2TbSize, trType,  blk, tbSize);
+
+
+  // --- quantization (in place: source and destination are the same buffer) ---
+
+  quant_coefficients(tb->coeff[cIdx], tb->coeff[cIdx], log2TbSize,  cb->qp, true);
+  // the CBF is set iff any of the tbSize*tbSize quantized coefficients is non-zero
+  tb->cbf[cIdx] = has_nonzero_value(tb->coeff[cIdx], 1<<(log2TbSize<<1));
+}
+
+// Estimates the bits of the chroma CBF flags of 'tb' and all its children; sets tb->rate on every visited node.
+static void recursive_cbfChroma(CABAC_encoder_estim* cabac,
+                                enc_tb* tb, int log2TrafoSize, int trafoDepth)
+{
+  float bits_pre = cabac->getRDBits();
+
+  // --- CBF CB/CR ---
+  // Below the root (trafoDepth>0) a flag is only coded if the parent's flag of that component is set.
+  // For 4x4 luma, there is no signaling of chroma CBF, because only the
+  // chroma CBF for 8x8 is relevant.
+  if (log2TrafoSize>2) {
+    if (trafoDepth==0 || tb->parent->cbf[1]) {
+      encode_cbf_chroma(cabac, trafoDepth, tb->cbf[1]);
+    }
+    if (trafoDepth==0 || tb->parent->cbf[2]) {
+      encode_cbf_chroma(cabac, trafoDepth, tb->cbf[2]);
+    }
+  }
+
+  if (tb->split_transform_flag) {
+    for (int i=0;i<4;i++) {
+      recursive_cbfChroma(cabac, tb->children[i], log2TrafoSize-1, trafoDepth+1);
+    }
+  }
+  // bits_post - bits_pre covers the flags of this node plus those of its whole subtree
+  float bits_post = cabac->getRDBits();
+
+  tb->rate = tb->rate_withoutCbfChroma + (bits_post - bits_pre);
+}
+
+// Analyzes the block at (x0,y0) as a single, unsplit TB and returns a newly allocated enc_tb with rate and luma distortion set.
+enc_tb* encode_transform_tree_no_split(encoder_context* ectx,
+                                       context_model_table& ctxModel,
+                                       const de265_image* input,
+                                       const enc_tb* parent,
+                                       enc_cb* cb,
+                                       int x0,int y0, int xBase,int yBase, int log2TbSize,
+                                       int blkIdx,
+                                       int trafoDepth, int MaxTrafoDepth, int IntraSplitFlag)
+{
+  // Steps: (1) transform/quantize luma and, where it belongs to this TB, chroma;
+  //        (2) reconstruct via tb->reconstruct() on ectx->img;
+  //        (3) estimate the rate with a CABAC estimator that uses 'ctxModel';
+  //        (4) measure the luma-only SSD between 'input' and ectx->img.
+  // tb->rate itself is filled in by recursive_cbfChroma() at the end of step (3).
+
+  de265_image* img = ectx->img;
+
+  // --- compute transform coefficients ---
+
+  enc_tb* tb = new enc_tb();
+
+  tb->parent = parent;
+  tb->split_transform_flag = false;
+  tb->log2Size = log2TbSize;
+  tb->x = x0;
+  tb->y = y0;
+  tb->cbf[0] = tb->cbf[1] = tb->cbf[2] = 0;
+
+
+  // luma block
+
+  analyze_transform_unit(ectx, tb, input, x0,y0, log2TbSize, cb, 0 /* Y */);
+
+
+  // chroma blocks
+
+  if (log2TbSize > 2) {
+    // if TB is > 4x4, do chroma transform of half size
+    analyze_transform_unit(ectx, tb, input, x0,y0, log2TbSize-1, cb, 1 /* Cb */);
+    analyze_transform_unit(ectx, tb, input, x0,y0, log2TbSize-1, cb, 2 /* Cr */);
+  }
+  else if (blkIdx==3) {
+    // if TB size is 4x4, do chroma transform for last sub-block (at the parent position xBase,yBase)
+    analyze_transform_unit(ectx, tb, input, xBase,yBase, log2TbSize, cb, 1 /* Cb */);
+    analyze_transform_unit(ectx, tb, input, xBase,yBase, log2TbSize, cb, 2 /* Cr */);
+  }
+
+
+  // reconstruction
+
+  tb->reconstruct(ectx, ectx->img, cb, blkIdx);
+
+
+
+  // measure rate
+
+  CABAC_encoder_estim estim;
+  estim.set_context_models(&ctxModel);
+
+
+  tb->rate_withoutCbfChroma = 0;
+
+  const seq_parameter_set* sps = &ectx->img->sps;
+
+  // the split_transform_flag (=0) is only counted when all of the following conditions hold
+  if (log2TbSize <= sps->Log2MaxTrafoSize &&
+      log2TbSize >  sps->Log2MinTrafoSize &&
+      trafoDepth < MaxTrafoDepth &&
+      !(IntraSplitFlag && trafoDepth==0))
+    {
+      encode_split_transform_flag(ectx, &estim, log2TbSize, 0);
+      tb->rate_withoutCbfChroma += estim.getRDBits();
+      estim.reset();
+    }
+
+  // --- CBF luma (the chroma CBFs are estimated separately in recursive_cbfChroma) ---
+
+  if (cb->PredMode == MODE_INTRA || trafoDepth != 0 ||
+      tb->cbf[1] || tb->cbf[2]) {
+    encode_cbf_luma(&estim, trafoDepth==0, tb->cbf[0]);
+  }
+
+  encode_transform_unit(ectx,&estim, tb,cb, x0,y0, xBase,yBase, log2TbSize, trafoDepth, blkIdx);
+
+  tb->rate_withoutCbfChroma += estim.getRDBits();
+
+  estim.reset(); // TODO: not needed ?
+
+  recursive_cbfChroma(&estim,tb,log2TbSize,trafoDepth);
+
+  //float rate_cbfChroma = estim.getRDBits();
+  //tb->rate = tb->rate_withoutCbfChroma + rate_cbfChroma;
+
+
+  // measure distortion (luma plane only)
+
+  int tbSize = 1<<log2TbSize;
+  tb->distortion = SSD(input->get_image_plane_at_pos(0, x0,y0), input->get_image_stride(0),
+                       img  ->get_image_plane_at_pos(0, x0,y0), img  ->get_image_stride(0),
+                       tbSize, tbSize);
+
+  return tb;
+}
+
+// Analyzes the block at (x0,y0) split into four child TBs and returns the new parent enc_tb with accumulated rate and distortion.
+enc_tb* Algo_TB_Split::encode_transform_tree_split(encoder_context* ectx,
+                                                   context_model_table& ctxModel,
+                                                   const de265_image* input,
+                                                   const enc_tb* parent,
+                                                   enc_cb* cb,
+                                                   int x0,int y0, int log2TbSize,
+                                                   int TrafoDepth, int MaxTrafoDepth,
+                                                   int IntraSplitFlag)
+{
+  const de265_image* img = ectx->img;
+
+  enc_tb* tb = new enc_tb();
+
+  tb->parent = parent;
+  tb->split_transform_flag = true;
+  tb->log2Size = log2TbSize;
+  tb->x = x0;
+  tb->y = y0;
+
+  tb->distortion = 0;
+  tb->rate = 0;
+  tb->rate_withoutCbfChroma = 0;
+
+
+  // Since we try to code all sub-blocks, we enable all CBF flags.
+  // Should we see later that the child TBs are zero, we clear those flags later.
+
+  tb->cbf[0]=1;
+  tb->cbf[1]=1;
+  tb->cbf[2]=1;
+
+  // remember the chroma-CBF context models as they are before the children are analyzed
+  context_model ctxModelCbfChroma[4];
+  for (int i=0;i<4;i++) {
+    ctxModelCbfChroma[i] = ctxModel[CONTEXT_MODEL_CBF_CHROMA+i];
+  }
+
+
+  // --- encode all child nodes ---
+
+  for (int i=0;i<4;i++) {
+    int dx = (i&1)  << (log2TbSize-1);
+    int dy = (i>>1) << (log2TbSize-1);
+    // (dx,dy): offset of child i inside this block (bit 0 of i = right half, bit 1 = lower half)
+    if (cb->PredMode == MODE_INTRA) {
+      tb->children[i] = mAlgo_TB_IntraPredMode->analyze(ectx, ctxModel, input, tb, cb,
+                                                        x0+dx, y0+dy, x0,y0,
+                                                        log2TbSize-1, i,
+                                                        TrafoDepth+1, MaxTrafoDepth, IntraSplitFlag);
+    }
+    else {
+      tb->children[i] = this->analyze(ectx, ctxModel, input, tb, cb,
+                                      x0+dx, y0+dy, x0,y0,
+                                      log2TbSize-1, i,
+                                      TrafoDepth+1, MaxTrafoDepth, IntraSplitFlag);
+    }
+    // accumulate the children's costs; the chroma CBF bits are added separately at the end
+    tb->distortion            += tb->children[i]->distortion;
+    tb->rate_withoutCbfChroma += tb->children[i]->rate_withoutCbfChroma;
+  }
+
+  tb->set_cbf_flags_from_children();
+
+
+  // --- add rate for this TB level ---
+
+  CABAC_encoder_estim estim;
+  estim.set_context_models(&ctxModel);
+
+
+
+
+  const seq_parameter_set* sps = &ectx->img->sps;
+
+  if (log2TbSize <= sps->Log2MaxTrafoSize &&
+      log2TbSize >  sps->Log2MinTrafoSize &&
+      TrafoDepth < MaxTrafoDepth &&
+      !(IntraSplitFlag && TrafoDepth==0))
+    {
+      encode_split_transform_flag(ectx, &estim, log2TbSize, 1);
+      tb->rate_withoutCbfChroma += estim.getRDBits();
+      estim.reset();
+    }
+
+  // restore chroma CBF context models
+  // (to the state saved before the children were analyzed)
+  for (int i=0;i<4;i++) {
+    ctxModel[CONTEXT_MODEL_CBF_CHROMA+i] = ctxModelCbfChroma[i];
+  }
+  // sets tb->rate (and the rate of every node below) including the chroma CBF bits
+  recursive_cbfChroma(&estim,tb, log2TbSize, TrafoDepth);
+  //tb->rate = tb->rate_withoutCbfChroma + estim.getRDBits();
+
+  return tb;
+}
+
+
+// Statistics collected by Algo_TB_Split_BruteForce::analyze(); 'logging_tb_split' is the single instance.
+struct Logging_TB_Split : public Logging
+{
+  int skipTBSplit, noskipTBSplit;    // how often the zero-block pruning did / did not suppress the split test
+  int zeroBlockCorrelation[6][2][5]; // [log2TbSize][unsplit block zero ? 0 : 1][number of zero children]
+  // The counters are never set explicitly; the instance below has static storage and thus starts zeroed.
+  const char* name() const { return "tb-split"; }
+
+  void print(const encoder_context* ectx, const char* filename)
+  {
+    printf("%d %d\n\n",skipTBSplit, noskipTBSplit);
+    // long form: one line per (size, zero-flag, child count) with absolute count and percentage
+    for (int tb=3;tb<=5;tb++) {
+      for (int z=0;z<=1;z++) {
+        float total = 0;
+
+        for (int c=0;c<5;c++)
+          total += zeroBlockCorrelation[tb][z][c];
+
+        for (int c=0;c<5;c++) {
+          printf("%d %d %d : %d %5.2f\n", tb,z,c,
+                 zeroBlockCorrelation[tb][z][c],
+                 total==0 ? 0 : zeroBlockCorrelation[tb][z][c]/total*100);
+        }
+      }
+    }
+
+    // compact form: one table per zero-flag, one row per TB size, percentages only
+    for (int z=0;z<2;z++) {
+      printf("\n");
+      for (int tb=3;tb<=5;tb++) {
+        float total = 0;
+
+        for (int c=0;c<5;c++)
+          total += zeroBlockCorrelation[tb][z][c];
+
+        printf("%dx%d ",1<<tb,1<<tb);
+
+        for (int c=0;c<5;c++) {
+          printf("%5.2f ", total==0 ? 0 : zeroBlockCorrelation[tb][z][c]/total*100);
+        }
+        printf("\n");
+      }
+    }
+  }
+} logging_tb_split;
+
+
+// Tries the TB unsplit and split into four (each where allowed) and returns the variant with the lower RD cost.
+enc_tb*
+Algo_TB_Split_BruteForce::analyze(encoder_context* ectx,
+                                  context_model_table& ctxModel,
+                                  const de265_image* input,
+                                  const enc_tb* parent,
+                                  enc_cb* cb,
+                                  int x0,int y0, int xBase,int yBase, int log2TbSize,
+                                  int blkIdx,
+                                  int TrafoDepth, int MaxTrafoDepth,
+                                  int IntraSplitFlag)
+{
+  bool test_split = (log2TbSize > 2 &&
+                     TrafoDepth < MaxTrafoDepth &&
+                     log2TbSize > ectx->sps.Log2MinTrafoSize);
+  // the unsplit variant is skipped when a split is mandatory or the block exceeds the maximum transform size
+  bool test_no_split = true;
+  if (IntraSplitFlag && TrafoDepth==0) test_no_split=false; // we have to split
+  if (log2TbSize > ectx->sps.Log2MaxTrafoSize) test_no_split=false;
+  // the split test works on its own copy of the context models; ctxModel itself is used by the no-split test
+  context_model_table ctxSplit;
+  if (test_split) {
+    ctxSplit = ctxModel.copy();
+  }
+
+
+  enc_tb* tb_no_split = NULL;
+  enc_tb* tb_split    = NULL;
+  float rd_cost_no_split = std::numeric_limits<float>::max();
+  float rd_cost_split    = std::numeric_limits<float>::max();
+
+  if (test_no_split) {
+    tb_no_split = encode_transform_tree_no_split(ectx, ctxModel, input, parent,
+                                                 cb, x0,y0, xBase,yBase, log2TbSize,
+                                                 blkIdx,
+                                                 TrafoDepth,MaxTrafoDepth,IntraSplitFlag);
+    // RD cost = distortion + lambda * rate
+    rd_cost_no_split = tb_no_split->distortion + ectx->lambda * tb_no_split->rate;
+    // optional pruning: up to the configured block size, an all-zero unsplit block suppresses the split test
+    if (log2TbSize <= mParams.zeroBlockPrune()) {
+      bool zeroBlock = tb_no_split->isZeroBlock();
+
+      if (zeroBlock) {
+        test_split = false;
+        logging_tb_split.skipTBSplit++;
+      }
+      else
+        logging_tb_split.noskipTBSplit++;
+    }
+  }
+
+
+  if (test_split) {
+    tb_split = encode_transform_tree_split(ectx, ctxSplit, input, parent, cb,
+                                           x0,y0, log2TbSize,
+                                           TrafoDepth, MaxTrafoDepth, IntraSplitFlag);
+
+    rd_cost_split    = tb_split->distortion    + ectx->lambda * tb_split->rate;
+  }
+
+  // statistics only: relate "unsplit block is zero" to the number of zero child blocks
+  if (test_split && test_no_split) {
+    bool zero_block = tb_no_split->isZeroBlock();
+
+    int nChildZero = 0;
+    for (int i=0;i<4;i++) {
+      if (tb_split->children[i]->isZeroBlock()) nChildZero++;
+    }
+
+    logging_tb_split.zeroBlockCorrelation[log2TbSize][zero_block ? 0 : 1][nChildZero]++;
+  }
+
+  // a variant that was not tested keeps the cost float-max, so the tested one is chosen
+  bool split = (rd_cost_split < rd_cost_no_split);
+  // NOTE(review): if neither variant was tested, only the assert below guards against returning NULL
+  if (split) {
+    ctxModel = ctxSplit;
+
+    delete tb_no_split;
+    assert(tb_split);
+    return tb_split;
+  }
+  else {
+    delete tb_split;
+    assert(tb_no_split);
+    tb_no_split->reconstruct(ectx, ectx->img, // NOTE(review): presumably redone because the split test overwrote ectx->img
+                             cb, blkIdx);
+
+    return tb_no_split;
+  }
+}
diff --git a/libde265/encoder/algo/tb-split.h b/libde265/encoder/algo/tb-split.h
new file mode 100644
index 0000000..fe24be9
--- /dev/null
+++ b/libde265/encoder/algo/tb-split.h
@@ -0,0 +1,152 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TB_SPLIT_H
+#define TB_SPLIT_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+/*
+struct ResidualBlock
+{
+  const int16_t* data[4];
+  int            stride[4];
+};
+*/
+
+void diff_blk(int16_t* out,int out_stride,
+              const uint8_t* a_ptr, int a_stride,
+              const uint8_t* b_ptr, int b_stride,
+              int blkSize);
+
+
+// ========== TB split decision ==========
+// Abstract base of the TB split decision; subclasses implement analyze(), the split helper is shared.
+class Algo_TB_Split
+{
+ public:
+  Algo_TB_Split() : mAlgo_TB_IntraPredMode(NULL) { }
+  virtual ~Algo_TB_Split() { }
+  // Analyzes one TB and returns the resulting enc_tb (pure virtual).
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag) = 0;
+  // Must be set before intra CBs are split: encode_transform_tree_split() calls it without a NULL check.
+  void setAlgo_TB_IntraPredMode(Algo_TB_IntraPredMode* algo) { mAlgo_TB_IntraPredMode=algo; }
+
+ protected:
+  enc_tb* encode_transform_tree_split(encoder_context* ectx,
+                                      context_model_table& ctxModel,
+                                      const de265_image* input,
+                                      const enc_tb* parent,
+                                      enc_cb* cb,
+                                      int x0,int y0, int log2TbSize,
+                                      int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag);
+
+  Algo_TB_IntraPredMode* mAlgo_TB_IntraPredMode;
+};
+
+
+// Pruning modes for Algo_TB_Split_BruteForce; analyze() compares log2TbSize against the numeric value.
+enum ALGO_TB_Split_BruteForce_ZeroBlockPrune {
+  // numeric value specifies the maximum size for log2Tb for which the pruning is applied
+  ALGO_TB_BruteForce_ZeroBlockPrune_off = 0,
+  ALGO_TB_BruteForce_ZeroBlockPrune_8x8 = 3,
+  ALGO_TB_BruteForce_ZeroBlockPrune_8x8_16x16 = 4,
+  ALGO_TB_BruteForce_ZeroBlockPrune_all = 5
+};
+// Choice option mapping the strings "off", "8x8", "8-16" and "all" to the pruning modes.
+class option_ALGO_TB_Split_BruteForce_ZeroBlockPrune
+: public choice_option<enum ALGO_TB_Split_BruteForce_ZeroBlockPrune>
+{
+ public:
+  option_ALGO_TB_Split_BruteForce_ZeroBlockPrune() {
+    add_choice("off"     ,ALGO_TB_BruteForce_ZeroBlockPrune_off);
+    add_choice("8x8"     ,ALGO_TB_BruteForce_ZeroBlockPrune_8x8);
+    add_choice("8-16"    ,ALGO_TB_BruteForce_ZeroBlockPrune_8x8_16x16);
+    add_choice("all"     ,ALGO_TB_BruteForce_ZeroBlockPrune_all, true); // NOTE(review): 'true' presumably marks the default choice - confirm in choice_option
+  }
+};
+// TB split decision that compares the RD cost of the unsplit and the split variant (see analyze()).
+class Algo_TB_Split_BruteForce : public Algo_TB_Split
+{
+ public:
+  struct params
+  {
+    params() {
+      zeroBlockPrune.set_ID("TB-Split-BruteForce-ZeroBlockPrune");
+    }
+    // up to which TB size an all-zero unsplit block suppresses the split test
+    option_ALGO_TB_Split_BruteForce_ZeroBlockPrune zeroBlockPrune;
+  };
+  // Replaces the stored parameter set by a copy of 'p'.
+  void setParams(const params& p) { mParams=p; }
+  // Registers the option with 'config'; a pointer into mParams is handed out.
+  void registerParams(config_parameters& config) {
+    config.add_option(&mParams.zeroBlockPrune);
+  }
+
+  virtual enc_tb* analyze(encoder_context*,
+                          context_model_table&,
+                          const de265_image* input,
+                          const enc_tb* parent,
+                          enc_cb* cb,
+                          int x0,int y0, int xBase,int yBase, int log2TbSize,
+                          int blkIdx,
+                          int TrafoDepth, int MaxTrafoDepth, int IntraSplitFlag);
+
+ private:
+  params mParams;
+};
+
+#endif
diff --git a/libde265/encoder/analyze.cc b/libde265/encoder/analyze.cc
new file mode 100644
index 0000000..9b86e12
--- /dev/null
+++ b/libde265/encoder/analyze.cc
@@ -0,0 +1,371 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/encoder/analyze.h"
+#include "libde265/encoder/encoder-context.h"
+#include <assert.h>
+#include <limits>
+#include <math.h>
+#include <iostream>
+
+
+#define ENCODER_DEVELOPMENT 0
+#define COMPARE_ESTIMATED_RATE_TO_REAL_RATE 0
+
+// Counters filled by statistics_IntraPredMode(): first index = log2 block size, last index = intra prediction mode.
+static int IntraPredModeCnt[7][35];
+static int MPM_used[7][35];
+// The same counters, summed over all block sizes.
+static int IntraPredModeCnt_total[35];
+static int MPM_used_total[35];
+// Debug statistics: walks the CB tree and counts the intra prediction modes and how often a mode is one of the three candidates.
+void statistics_IntraPredMode(const encoder_context* ectx, int x,int y, const enc_cb* cb)
+{
+  if (cb->split_cu_flag) {
+    for (int i=0;i<4;i++)
+      if (cb->children[i]) {
+        statistics_IntraPredMode(ectx, childX(x,i,cb->log2Size), childY(y,i,cb->log2Size), cb->children[i]);
+      }
+  }
+  else {
+    int cnt;
+    int size = cb->log2Size;
+    // with NxN partitioning four prediction modes are counted, one log2 size below the CB size; otherwise one
+    if (cb->PartMode == PART_NxN) { cnt=4; size--; } else cnt=1;
+
+    for (int i=0;i<cnt;i++) {
+      IntraPredModeCnt[size][ cb->intra.pred_mode[i] ]++;
+      IntraPredModeCnt_total[ cb->intra.pred_mode[i] ]++;
+
+      int xi = childX(x,i,cb->log2Size);
+      int yi = childY(y,i,cb->log2Size);
+
+      int candModeList[3];
+      fillIntraPredModeCandidates(candModeList,xi,yi, xi>0, yi>0, ectx->img);
+      // count the mode as "MPM used" if it equals one of the three candidates
+      int predmode = cb->intra.pred_mode[i];
+      if (candModeList[0]==predmode ||
+          candModeList[1]==predmode ||
+          candModeList[2]==predmode) {
+        MPM_used[size][predmode]++;
+        MPM_used_total[predmode]++;
+      }
+    }
+  }
+}
+
+// Prints one line per intra prediction mode (0..34) with the counters gathered by statistics_IntraPredMode().
+void statistics_print()
+{
+  for (int mode=0; mode<35; mode++) {
+    // overall counters first, then one pair per block size (first array index 2..6)
+    printf("%d",mode);
+    printf("  %d %d",IntraPredModeCnt_total[mode], MPM_used_total[mode]);
+    for (int sizeIdx=2; sizeIdx<7; sizeIdx++) {
+      printf("  %d %d",IntraPredModeCnt[sizeIdx][mode], MPM_used[sizeIdx][mode]);
+    }
+    printf("\n");
+  }
+}
+
+
+// Debug helper: prints the rates of 'tb' and, recursively, of its children (two spaces of indent per level).
+void print_tb_tree_rates(const enc_tb* tb, int level)
+{
+  int indent = level;
+  while (indent-- > 0) { std::cout << "  "; }
+  std::cout << "TB rate=" << tb->rate << " (" << tb->rate_withoutCbfChroma << ")\n";
+  if (!tb->split_transform_flag) { return; }
+  for (int child=0; child<4; child++) {
+    print_tb_tree_rates(tb->children[child], level+1);
+  }
+}
+
+
+// Debug helper: prints the rate of 'cb' and of everything below it in the coding tree.
+void print_cb_tree_rates(const enc_cb* cb, int level)
+{
+  for (int i=0;i<level;i++) std::cout << "  ";
+  std::cout << "CB rate=" << cb->rate << "\n";
+  if (cb->split_cu_flag) {
+    for (int i=0;i<4;i++)
+      if (cb->children[i])  // a child may be NULL; statistics_IntraPredMode() guards the same array
+        print_cb_tree_rates(cb->children[i], level+1);
+  }
+  else {
+    print_tb_tree_rates(cb->transform_tree, level+1);
+  }
+}
+
+// Encodes one picture CTB by CTB (analysis, then bitstream writing) and returns the luma PSNR of the reconstruction.
+double encode_image(encoder_context* ectx,
+                    const de265_image* input,
+                    EncodingAlgorithm& algo)
+{
+  // 'input' is only read here; the reconstruction is built up in a newly allocated ectx->img
+
+  int w = ectx->sps.pic_width_in_luma_samples;
+  int h = ectx->sps.pic_height_in_luma_samples;
+
+  // --- create reconstruction image ---
+  ectx->img = new de265_image;
+  ectx->img->vps  = ectx->vps;
+  ectx->img->sps  = ectx->sps;
+  ectx->img->pps  = ectx->pps;
+  ectx->img->PicOrderCntVal = input->PicOrderCntVal;
+
+  ectx->img->alloc_image(w,h, de265_chroma_420, &ectx->sps, true,
+                         NULL /* no decctx */, ectx, 0,NULL,false);
+  //ectx->img->alloc_encoder_data(&ectx->sps);
+  ectx->img->clear_metadata();
+  // NOTE(review): ectx->img is not freed in this function; presumably it is owned/released elsewhere
+#if 1
+  if (1) {
+    ectx->prediction = new de265_image;
+    ectx->prediction->alloc_image(w,h, de265_chroma_420, &ectx->sps, false /* no metadata */,
+                                  NULL /* no decctx */, NULL /* no encctx */, 0,NULL,false);
+    ectx->prediction->vps = ectx->vps;
+    ectx->prediction->sps = ectx->sps;
+    ectx->prediction->pps = ectx->pps;
+  }
+#endif
+
+  ectx->active_qp = ectx->pps.pic_init_qp; // TODO take current qp from slice
+
+
+  ectx->cabac_ctx_models.init(ectx->shdr->initType, ectx->shdr->SliceQPY);
+  ectx->cabac_encoder.set_context_models(&ectx->cabac_ctx_models);
+
+
+  context_model_table modelEstim;
+  CABAC_encoder_estim cabacEstim;
+
+  modelEstim.init(ectx->shdr->initType, ectx->shdr->SliceQPY);
+  cabacEstim.set_context_models(&modelEstim);
+
+
+  int Log2CtbSize = ectx->sps.Log2CtbSizeY;
+
+  uint8_t* luma_plane = ectx->img->get_image_plane(0);
+  // (only the luma plane is needed in this function: the PSNR at the end is computed on luma)
+
+
+
+  // encode CTB by CTB
+
+  for (int y=0;y<ectx->sps.PicHeightInCtbsY;y++)
+    for (int x=0;x<ectx->sps.PicWidthInCtbsY;x++)
+      {
+        ectx->img->set_SliceAddrRS(x, y, ectx->shdr->SliceAddrRS);
+
+        int x0 = x<<Log2CtbSize;
+        int y0 = y<<Log2CtbSize;
+
+        logtrace(LogSlice,"encode CTB at %d %d\n",x0,y0);
+
+        // make a copy of the context model that we can modify for testing alternatives
+
+        context_model_table ctxModel;
+        //copy_context_model_table(ctxModel, ectx->ctx_model_bitstream);
+        ctxModel = ectx->cabac_ctx_models.copy();
+        ctxModel = modelEstim.copy(); // TODO TMP
+
+        disable_logging(LogSymbols);
+        enable_logging(LogSymbols);  // TODO TMP
+
+        //printf("================================================== ANALYZE\n");
+
+#if 1
+        /*
+          enc_cb* cb = encode_cb_may_split(ectx, ctxModel,
+          input, x0,y0, Log2CtbSize, 0, qp);
+        */
+
+        enc_cb* cb = algo.getAlgoCTBQScale()->analyze(ectx,ctxModel, x0,y0);
+#else
+        float minCost = std::numeric_limits<float>::max();
+        int bestQ = 0;
+        int qp = ectx->params.constant_QP;
+
+        enc_cb* cb;
+        for (int q=1;q<51;q++) {
+          copy_context_model_table(ctxModel, ectx->ctx_model_bitstream);
+
+          enc_cb* cbq = encode_cb_may_split(ectx, ctxModel,
+                                            input, x0,y0, Log2CtbSize, 0, q);
+
+          float cost = cbq->distortion + ectx->lambda * cbq->rate;
+          if (cost<minCost) { minCost=cost; bestQ=q; }
+
+          if (q==qp) { cb=cbq; }
+        }
+
+        printf("Q %d\n",bestQ);
+        fflush(stdout);
+#endif
+
+        //print_cb_tree_rates(cb,0);
+
+        //statistics_IntraPredMode(ectx, x0,y0, cb);
+
+
+        // --- write bitstream ---
+
+        //ectx->switch_CABAC_to_bitstream();
+
+        enable_logging(LogSymbols);
+
+        encode_ctb(ectx, &ectx->cabac_encoder, cb, x,y);
+
+        //printf("================================================== WRITE\n");
+
+
+        if (COMPARE_ESTIMATED_RATE_TO_REAL_RATE) {
+          float realPre = cabacEstim.getRDBits();
+          encode_ctb(ectx, &cabacEstim, cb, x,y);
+          float realPost = cabacEstim.getRDBits();
+
+          printf("estim: %f  real: %f  diff: %f\n",
+                 cb->rate,
+                 realPost-realPre,
+                 cb->rate - (realPost-realPre));
+        }
+
+
+        int last = (y==ectx->sps.PicHeightInCtbsY-1 &&
+                    x==ectx->sps.PicWidthInCtbsY-1);
+        ectx->cabac_encoder.write_CABAC_term_bit(last);
+
+
+        delete cb;
+
+        //ectx->free_all_pools();
+      }
+
+
+  //statistics_print();
+
+
+  delete ectx->prediction;
+  ectx->prediction = NULL; // do not leave a dangling pointer in the context
+
+  // frame PSNR (luma only)
+
+  double psnr = PSNR(MSE(input->get_image_plane(0), input->get_image_stride(0),
+                         luma_plane, ectx->img->get_image_stride(0),
+                         input->get_width(), input->get_height()));
+  return psnr;
+}
+
+
+// Wires the algorithm objects into the encoder search tree according to the choices in 'params'.
+void EncodingAlgorithm_Custom::setParams(encoder_params& params)
+{
+  // build algorithm tree
+
+  mAlgo_CTB_QScale_Constant.setChildAlgo(&mAlgo_CB_Split_BruteForce);
+  mAlgo_CB_Split_BruteForce.setChildAlgo(&mAlgo_CB_Skip_BruteForce);
+
+  mAlgo_CB_Skip_BruteForce.setSkipAlgo(&mAlgo_CB_MergeIndex_Fixed);
+  mAlgo_CB_Skip_BruteForce.setNonSkipAlgo(&mAlgo_CB_IntraInter_BruteForce);
+  //&mAlgo_CB_InterPartMode_Fixed);
+
+  Algo_CB_IntraPartMode* algo_CB_IntraPartMode = NULL;
+  switch (params.mAlgo_CB_IntraPartMode()) {
+  case ALGO_CB_IntraPartMode_BruteForce:
+    algo_CB_IntraPartMode = &mAlgo_CB_IntraPartMode_BruteForce;
+    break;
+  case ALGO_CB_IntraPartMode_Fixed:
+    algo_CB_IntraPartMode = &mAlgo_CB_IntraPartMode_Fixed;
+    break;
+  }
+  assert(algo_CB_IntraPartMode); // the switch has no default; the pointer is dereferenced further down
+  mAlgo_CB_IntraInter_BruteForce.setIntraChildAlgo(algo_CB_IntraPartMode);
+  mAlgo_CB_IntraInter_BruteForce.setInterChildAlgo(&mAlgo_CB_InterPartMode_Fixed);
+
+  mAlgo_CB_MergeIndex_Fixed.setChildAlgo(&mAlgo_TB_Split_BruteForce);
+
+  Algo_PB_MV* pbAlgo = NULL;
+  switch (params.mAlgo_MEMode()) {
+  case MEMode_Test:
+    pbAlgo = &mAlgo_PB_MV_Test;
+    break;
+  case MEMode_Search:
+    pbAlgo = &mAlgo_PB_MV_Search;
+    break;
+  }
+  assert(pbAlgo); // an unhandled motion estimation mode would otherwise be a NULL dereference below
+  mAlgo_CB_InterPartMode_Fixed.setChildAlgo(pbAlgo);
+  pbAlgo->setChildAlgo(&mAlgo_TB_Split_BruteForce);
+
+
+  Algo_TB_IntraPredMode_ModeSubset* algo_TB_IntraPredMode = NULL;
+  switch (params.mAlgo_TB_IntraPredMode()) {
+  case ALGO_TB_IntraPredMode_BruteForce:
+    algo_TB_IntraPredMode = &mAlgo_TB_IntraPredMode_BruteForce;
+    break;
+  case ALGO_TB_IntraPredMode_FastBrute:
+    algo_TB_IntraPredMode = &mAlgo_TB_IntraPredMode_FastBrute;
+    break;
+  case ALGO_TB_IntraPredMode_MinResidual:
+    algo_TB_IntraPredMode = &mAlgo_TB_IntraPredMode_MinResidual;
+    break;
+  }
+  assert(algo_TB_IntraPredMode); // same reasoning: it is dereferenced at the end of this function
+  algo_CB_IntraPartMode->setChildAlgo(algo_TB_IntraPredMode);
+
+  mAlgo_TB_Split_BruteForce.setAlgo_TB_IntraPredMode(algo_TB_IntraPredMode);
+  //mAlgo_TB_Split_BruteForce.setParams(params.TB_Split_BruteForce);
+
+  algo_TB_IntraPredMode->setChildAlgo(&mAlgo_TB_Split_BruteForce);
+
+
+  // ===== set algorithm parameters ======
+
+  //mAlgo_CB_IntraPartMode_Fixed.setParams(params.CB_IntraPartMode_Fixed);
+
+  //mAlgo_TB_IntraPredMode_FastBrute.setParams(params.TB_IntraPredMode_FastBrute);
+  //mAlgo_TB_IntraPredMode_MinResidual.setParams(params.TB_IntraPredMode_MinResidual);
+
+
+  //mAlgo_CTB_QScale_Constant.setParams(params.CTB_QScale_Constant);
+
+
+  algo_TB_IntraPredMode->enableIntraPredModeSubset( params.mAlgo_TB_IntraPredMode_Subset() );
+}
+
+// Currently a no-op: the only dispatch (to the "tb-split" logger) is compiled out by '#if 000'.
+void Logging::print_logging(const encoder_context* ectx, const char* id, const char* filename)
+{
+#if 000
+  if (strcmp(id,logging_tb_split.name())==0) {
+    logging_tb_split.print(ectx,filename);
+  }
+#endif
+}
+
+// Free-function wrapper that forwards all arguments to Logging::print_logging().
+void en265_print_logging(const encoder_context* ectx, const char* id, const char* filename)
+{
+  Logging::print_logging(ectx,id,filename);
+}
diff --git a/libde265/encoder/analyze.h b/libde265/encoder/analyze.h
new file mode 100644
index 0000000..86388ae
--- /dev/null
+++ b/libde265/encoder/analyze.h
@@ -0,0 +1,146 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ANALYZE_H
+#define ANALYZE_H
+
+#include "libde265/nal-parser.h"
+#include "libde265/decctx.h"
+#include "libde265/encoder/encode.h"
+#include "libde265/slice.h"
+#include "libde265/scan.h"
+#include "libde265/intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include "libde265/quality.h"
+#include "libde265/fallback.h"
+#include "libde265/configparam.h"
+
+#include "libde265/encoder/algo/tb-intrapredmode.h"
+#include "libde265/encoder/algo/tb-split.h"
+#include "libde265/encoder/algo/cb-intrapartmode.h"
+#include "libde265/encoder/algo/cb-interpartmode.h"
+#include "libde265/encoder/algo/cb-split.h"
+#include "libde265/encoder/algo/ctb-qscale.h"
+#include "libde265/encoder/algo/cb-mergeindex.h"
+//#include "libde265/encoder/algo/cb-skip-or-inter.h"
+#include "libde265/encoder/algo/pb-mv.h"
+#include "libde265/encoder/algo/cb-skip.h"
+#include "libde265/encoder/algo/cb-intra-inter.h"
+
+
+/*  Encoder search tree, bottom up:
+
+    - Algo_TB_Split - whether TB is split or not
+
+    - Algo_TB_IntraPredMode - choose the intra prediction mode (or NOP, if at the wrong tree level)
+
+    - Algo_CB_IntraPartMode - choose between NxN and 2Nx2N intra parts
+
+    - Algo_CB_PredMode - intra / inter
+
+    - Algo_CB_Split - whether CB is split or not
+
+    - Algo_CTB_QScale - select QScale on CTB granularity
+ */
+
+
+// ========== an encoding algorithm combines a set of algorithm modules ==========
+
+class EncodingAlgorithm
+{
+ public:
+  virtual ~EncodingAlgorithm() { }
+  // Returns the CTB-level QScale module (the last entry of the bottom-up search tree listed above).
+  virtual Algo_CTB_QScale* getAlgoCTBQScale() = 0;
+  // QP accessors; the slice QP delta is 0 unless a subclass overrides it.
+  virtual int getPPS_QP() const = 0;
+  virtual int getSlice_QPDelta() const { return 0; }
+};
+
+
+class EncodingAlgorithm_Custom : public EncodingAlgorithm
+{
+ public:
+  // Configures the module members below from 'params' (the implementation is not part of this header).
+  void setParams(struct encoder_params& params);
+  // Hands 'config' to each member module listed here so that it can register its own parameters.
+  void registerParams(config_parameters& config) {
+    mAlgo_CTB_QScale_Constant.registerParams(config);
+    mAlgo_CB_IntraPartMode_Fixed.registerParams(config);
+    mAlgo_CB_InterPartMode_Fixed.registerParams(config);
+    mAlgo_PB_MV_Test.registerParams(config);
+    mAlgo_PB_MV_Search.registerParams(config);
+    mAlgo_TB_IntraPredMode_FastBrute.registerParams(config);
+    mAlgo_TB_IntraPredMode_MinResidual.registerParams(config);
+    mAlgo_TB_Split_BruteForce.registerParams(config);
+  }
+
+  virtual Algo_CTB_QScale* getAlgoCTBQScale() { return &mAlgo_CTB_QScale_Constant; }
+  // The PPS QP is whatever the constant-QScale module reports.
+  virtual int getPPS_QP() const { return mAlgo_CTB_QScale_Constant.getQP(); }
+
+ private:
+  Algo_CTB_QScale_Constant         mAlgo_CTB_QScale_Constant;
+  // coding-block (CB) level modules
+  Algo_CB_Split_BruteForce         mAlgo_CB_Split_BruteForce;
+  Algo_CB_Skip_BruteForce          mAlgo_CB_Skip_BruteForce;
+  Algo_CB_IntraInter_BruteForce    mAlgo_CB_IntraInter_BruteForce;
+
+  Algo_CB_IntraPartMode_BruteForce mAlgo_CB_IntraPartMode_BruteForce;
+  Algo_CB_IntraPartMode_Fixed      mAlgo_CB_IntraPartMode_Fixed;
+
+  Algo_CB_InterPartMode_Fixed      mAlgo_CB_InterPartMode_Fixed;
+  Algo_CB_MergeIndex_Fixed         mAlgo_CB_MergeIndex_Fixed;
+  // prediction-block (PB) motion vector modules
+  Algo_PB_MV_Test                  mAlgo_PB_MV_Test;
+  Algo_PB_MV_Search                mAlgo_PB_MV_Search;
+  // transform-block (TB) level modules
+  Algo_TB_Split_BruteForce          mAlgo_TB_Split_BruteForce;
+
+  Algo_TB_IntraPredMode_BruteForce  mAlgo_TB_IntraPredMode_BruteForce;
+  Algo_TB_IntraPredMode_FastBrute   mAlgo_TB_IntraPredMode_FastBrute;
+  Algo_TB_IntraPredMode_MinResidual mAlgo_TB_IntraPredMode_MinResidual;
+};
+
+
+
+double encode_image(encoder_context*, const de265_image* input, EncodingAlgorithm&);
+
+void encode_sequence(encoder_context*);
+
+
+class Logging
+{
+public:
+  virtual ~Logging() { }
+  // Static entry point; en265_print_logging() forwards to it.
+  static void print_logging(const encoder_context* ectx, const char* id, const char* filename);
+  // Interface of a concrete logger: a name to be matched against 'id', and a print routine.
+  virtual const char* name() const = 0;
+  virtual void print(const encoder_context* ectx, const char* filename) = 0;
+};
+
+
+LIBDE265_API void en265_print_logging(const encoder_context* ectx, const char* id, const char* filename);
+
+#endif
diff --git a/libde265/encoder/encode.cc b/libde265/encoder/encode.cc
new file mode 100644
index 0000000..b108ee9
--- /dev/null
+++ b/libde265/encoder/encode.cc
@@ -0,0 +1,1981 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "encode.h"
+#include "encoder-context.h"
+#include "slice.h"
+#include "scan.h"
+#include "intrapred.h"
+#include "libde265/transform.h"
+#include "libde265/fallback-dct.h"
+#include <iostream>
+
+
+int allocTB = 0;
+int allocCB = 0;
+// With DEBUG_ALLOCS set to 1, the enc_tb/enc_cb constructors and destructors update and print the counters above.
+#define DEBUG_ALLOCS 0
+
+
+
+void enc_node::save(const de265_image* img)
+{
+  // Snapshot this node's pixels from 'img' into a freshly allocated buffer:
+  // the full-size plane 0 block first, then the half-size blocks of planes 1 and 2.
+  const int lumaArea   = Log2SizeToArea(log2Size);
+  const int lumaSize   = 1<<log2Size;
+  const int chromaSize = lumaSize>>1;
+
+  delete[] mReconstruction;
+  mReconstruction = new uint8_t[lumaArea * 3/2];
+
+  uint8_t* const lumaBuf    = mReconstruction;
+  uint8_t* const chroma1Buf = mReconstruction + lumaArea;
+  uint8_t* const chroma2Buf = mReconstruction + lumaArea*5/4;
+
+  copy_subimage(lumaBuf, lumaSize, img->get_image_plane_at_pos(0, x,y),
+                img->get_image_stride(0), lumaSize,lumaSize);
+
+  copy_subimage(chroma1Buf, chromaSize, img->get_image_plane_at_pos(1, x>>1,y>>1),
+                img->get_image_stride(1), chromaSize,chromaSize);
+
+  copy_subimage(chroma2Buf, chromaSize, img->get_image_plane_at_pos(2, x>>1,y>>1),
+                img->get_image_stride(2), chromaSize,chromaSize);
+}
+
+
+void enc_node::restore(de265_image* img)
+{
+  // Inverse of save(): copy the buffered plane 0 block and the two half-size
+  // blocks of planes 1 and 2 back into 'img'. A buffer must exist.
+  assert(mReconstruction);
+
+  const int lumaArea   = Log2SizeToArea(log2Size);
+  const int lumaSize   = 1<<log2Size;
+  const int chromaSize = lumaSize>>1;
+
+  uint8_t* const chroma1Buf = mReconstruction + lumaArea;
+  uint8_t* const chroma2Buf = mReconstruction + lumaArea*5/4;
+
+  copy_subimage(img->get_image_plane_at_pos(0, x,y), img->get_image_stride(0),
+                mReconstruction, lumaSize, lumaSize,lumaSize);
+
+  copy_subimage(img->get_image_plane_at_pos(1, x>>1,y>>1), img->get_image_stride(1),
+                chroma1Buf, chromaSize, chromaSize,chromaSize);
+
+  copy_subimage(img->get_image_plane_at_pos(2, x>>1,y>>1), img->get_image_stride(2),
+                chroma2Buf, chromaSize, chromaSize,chromaSize);
+}
+
+
+void enc_cb::save(const de265_image* img)
+{
+  enc_node::save(img);
+  // only the pixel data is saved for now; CB metadata is not stored
+  // TODO: save metadata in node buffer memory
+}
+
+
+void enc_cb::restore(de265_image* img)
+{
+  enc_node::restore(img);
+  // save() stores no metadata, so it is regenerated from this CB's own fields:
+  // write back all the metadata
+
+  write_to_image(img);
+}
+
+
+void enc_cb::set_rqt_root_bf_from_children_cbf()
+{
+  // rqt_root_cbf is the OR of the three cbf flags of the root transform node
+  assert(transform_tree);
+  const enc_tb* root = transform_tree;
+  inter.rqt_root_cbf = (root->cbf[0] | root->cbf[1] | root->cbf[2]);
+}
+
+
+
+
+alloc_pool enc_tb::mMemPool(sizeof(enc_tb));
+// Creates an unsplit transform block with no coefficient buffers allocated.
+enc_tb::enc_tb()
+  : split_transform_flag(false)
+{
+  coeff[0]=coeff[1]=coeff[2]=NULL;
+  // NOTE(review): children[] and cbf[] are not initialized here — presumably set by the caller; confirm
+  if (DEBUG_ALLOCS) { allocTB++; printf("TB  : %d\n",allocTB); }
+}
+
+
+enc_tb::~enc_tb()
+{
+  if (split_transform_flag) {   // split node: frees its four children
+    for (int i=0;i<4;i++) {
+      delete children[i];
+    }
+  }
+  else {                        // unsplit node: frees the per-component coefficient buffers
+    for (int i=0;i<3;i++) {
+      delete[] coeff[i];
+    }
+  }
+
+  if (DEBUG_ALLOCS) { allocTB--; printf("TB ~: %d\n",allocTB); }
+}
+
+// Allocates an uninitialized tbSize*tbSize int16_t buffer for component cIdx; the slot must still be NULL.
+void enc_tb::alloc_coeff_memory(int cIdx, int tbSize)
+{
+  assert(coeff[cIdx]==NULL);
+  coeff[cIdx] = new int16_t[tbSize*tbSize];
+}
+
+
+void enc_tb::reconstruct_tb(encoder_context* ectx,
+                            de265_image* img,
+                            int x0,int y0,  // luma
+                            int log2TbSize, // chroma adapted
+                            const enc_cb* cb, int cIdx) const
+{
+  // position inside the plane of cIdx: for cIdx>0 the luma coordinates are halved
+  int xC=x0;
+  int yC=y0;
+
+  if (cIdx>0) {
+    xC>>=1;
+    yC>>=1;
+  }
+  // --- step 1: put the prediction for this block into 'img' ---
+  if (cb->PredMode == MODE_INTRA) {
+    // intra: the mode is read back from the image at the luma position; chroma overrides it with the CB's chroma mode
+    enum IntraPredMode intraPredMode  = img->get_IntraPredMode(x0,y0);
+
+    if (cIdx>0) {
+      intraPredMode = cb->intra.chroma_mode;
+      //intraPredMode = lumaPredMode_to_chromaPredMode(intraPredMode, cb->intra.chroma_mode);
+    }
+
+    decode_intra_prediction(img, xC,yC,  intraPredMode, 1<< log2TbSize   , cIdx);
+  }
+  else {
+    int size = 1<<log2TbSize;
+    // otherwise: copy the size x size block at the same position from ectx->prediction into img
+    uint8_t* dst_ptr  = img->get_image_plane_at_pos(cIdx, xC,  yC  );
+    int dst_stride  = img->get_image_stride(cIdx);
+
+    uint8_t* src_ptr  = ectx->prediction->get_image_plane_at_pos(cIdx, xC,  yC  );
+    int src_stride  = ectx->prediction->get_image_stride(cIdx);
+
+    for (int y=0;y<size;y++) {
+      for (int x=0;x<size;x++) {
+        dst_ptr[y*dst_stride+x] = src_ptr[y*src_stride+x];
+      }
+    }
+  }
+  // --- step 2: only if cbf[cIdx] is set, dequantize the coefficients and run the inverse transform on the block ---
+  ALIGNED_16(int16_t) dequant_coeff[32*32];
+
+  if (cbf[cIdx]) dequant_coefficients(dequant_coeff, coeff[cIdx], log2TbSize, cb->qp);
+
+  //printf("--- quantized coeffs ---\n");
+  //printBlk(coeff[0],1<<log2BlkSize,1<<log2BlkSize);
+
+  //printf("--- dequantized coeffs ---\n");
+  //printBlk(dequant_coeff[0],1<<log2BlkSize,1<<log2BlkSize);
+
+  //printf("--- plane at %d %d / %d ---\n",x0,y0,cIdx);
+
+  uint8_t* ptr  = img->get_image_plane_at_pos(cIdx, xC,  yC  );
+  int stride  = img->get_image_stride(cIdx);
+  // NOTE(review): trType is 1 for every 4x4 plane-0 block regardless of cb->PredMode — see the TODO; confirm for inter
+  int trType = (cIdx==0 && log2TbSize==2); // TODO: inter
+
+  //printf("--- prediction %d %d / %d ---\n",x0,y0,cIdx);
+  //printBlk("prediction",ptr,1<<log2TbSize,stride);
+
+  if (cbf[cIdx]) inv_transform(&ectx->acceleration,
+                               ptr,stride,   dequant_coeff, log2TbSize,   trType);
+
+
+  //printf("--- RECO intra prediction %d %d ---\n",x0,y0);
+  //printBlk("prediction",ptr,1<<log2TbSize,stride);
+
+  //dequant_and_add_transform(accel, img, x0,y0, qp);
+
+  //printf("--- RECO add residual %d %d ---\n",x0,y0);
+  //img->printBlk(x0,y0,0,log2CbSize);
+}
+
+
+void enc_tb::reconstruct(encoder_context* ectx,
+                         de265_image* img,
+                         const enc_cb* cb,
+                         int blkIdx) const
+{
+  if (split_transform_flag) {   // split node: recurse, telling each child its index 0..3
+    for (int i=0;i<4;i++) {
+      children[i]->reconstruct(ectx,img,
+                               cb, i);
+    }
+  }
+  else {
+    reconstruct_tb(ectx, img, x,y, log2Size, cb, 0);
+    // planes 1 and 2: one size smaller when log2Size>2; otherwise done once, by the block with index 3
+    if (log2Size>2) {
+      reconstruct_tb(ectx, img, x,y, log2Size-1, cb, 1);
+      reconstruct_tb(ectx, img, x,y, log2Size-1, cb, 2);
+    }
+    else if (blkIdx==3) {
+      int xBase = x - (1<<log2Size);  // one block left of this block
+      int yBase = y - (1<<log2Size);  // one block above this block
+
+      reconstruct_tb(ectx, img, xBase,yBase, log2Size, cb, 1);
+      reconstruct_tb(ectx, img, xBase,yBase, log2Size, cb, 2);
+    }
+  }
+}
+
+
+void enc_tb::set_cbf_flags_from_children()
+{
+  // Only valid on a split node: every cbf flag of this node becomes
+  // the OR of the corresponding flag of its four children.
+  assert(split_transform_flag);
+
+  for (int c=0;c<3;c++) {
+    cbf[c] = 0;
+
+    for (int childIdx=0;childIdx<4;childIdx++) {
+      cbf[c] |= children[childIdx]->cbf[c];
+    }
+  }
+}
+
+
+
+
+alloc_pool enc_cb::mMemPool(sizeof(enc_cb), 200);
+
+// Creates an unsplit CB with the bypass/pcm flags cleared, no transform tree, and zero distortion and rate.
+enc_cb::enc_cb()
+  : split_cu_flag(false),
+    cu_transquant_bypass_flag(false),
+    pcm_flag(false),
+    transform_tree(NULL),
+    distortion(0),
+    rate(0)
+{ // NOTE(review): children[], qp, PredMode and PartMode are not set here — presumably filled by the caller; confirm
+  if (DEBUG_ALLOCS) { allocCB++; printf("CB  : %d\n",allocCB); }
+}
+
+enc_cb::~enc_cb()
+{
+  if (split_cu_flag) {   // split CB: frees its four child CBs
+    for (int i=0;i<4;i++) {
+      delete children[i];
+    }
+  }
+  else {                 // unsplit CB: frees its transform tree
+    delete transform_tree;
+  }
+
+  if (DEBUG_ALLOCS) { allocCB--; printf("CB ~: %d\n",allocCB); }
+}
+
+
+void enc_cb::write_to_image(de265_image* img) const
+{
+  //printf("write_to_image %d %d size:%d\n",x,y,1<<log2Size);
+  // Unsplit CBs store their own metadata into 'img' through its set_*() calls;
+  // split CBs only recurse into those children that exist (non-NULL).
+  if (!split_cu_flag) {
+    img->set_log2CbSize(x,y,log2Size, true);
+    img->set_ctDepth(x,y,log2Size, ctDepth);
+    assert(pcm_flag==0);
+    img->set_pcm_flag(x,y,log2Size, pcm_flag);
+    img->set_cu_transquant_bypass(x,y,log2Size, cu_transquant_bypass_flag);
+    img->set_QPY(x,y,log2Size, qp);
+    img->set_pred_mode(x,y, log2Size, PredMode);
+    img->set_PartMode(x,y, PartMode);
+    // prediction-specific data: intra modes per partition, or motion info per prediction block
+    if (PredMode == MODE_INTRA) {
+      //img->set_ChromaIntraPredMode(x,y,log2Size, intra.chroma_mode);
+
+      if (PartMode == PART_NxN) {
+        int h = 1<<(log2Size-1);  // half the CB size: one mode per quadrant
+        img->set_IntraPredMode(x  ,y  ,log2Size-1, intra.pred_mode[0]);
+        img->set_IntraPredMode(x+h,y  ,log2Size-1, intra.pred_mode[1]);
+        img->set_IntraPredMode(x  ,y+h,log2Size-1, intra.pred_mode[2]);
+        img->set_IntraPredMode(x+h,y+h,log2Size-1, intra.pred_mode[3]);
+      }
+      else {
+        img->set_IntraPredMode(x,y,log2Size, intra.pred_mode[0]);
+      }
+    }
+    else {
+      int nC = 1<<log2Size;  // full CB size
+      int nC2 = nC>>1;       // half
+      int nC4 = nC>>2;       // quarter
+      int nC3 = nC-nC4;      // three quarters
+      switch (PartMode) {
+      case PART_2Nx2N:
+        img->set_mv_info(x,y,nC,nC, inter.pb[0].motion);
+        break;
+      case PART_NxN:
+        img->set_mv_info(x    ,y    ,nC2,nC2, inter.pb[0].motion);
+        img->set_mv_info(x+nC2,y    ,nC2,nC2, inter.pb[1].motion);
+        img->set_mv_info(x    ,y+nC2,nC2,nC2, inter.pb[2].motion);
+        img->set_mv_info(x+nC2,y+nC2,nC2,nC2, inter.pb[3].motion);
+        break;
+      case PART_2NxN:
+        img->set_mv_info(x,y    ,nC,nC2, inter.pb[0].motion);
+        img->set_mv_info(x,y+nC2,nC,nC2, inter.pb[1].motion);
+        break;
+      case PART_Nx2N:
+        img->set_mv_info(x    ,y,nC2,nC, inter.pb[0].motion);
+        img->set_mv_info(x+nC2,y,nC2,nC, inter.pb[1].motion);
+        break;
+      case PART_2NxnU:
+        img->set_mv_info(x,y    ,nC,nC4, inter.pb[0].motion);
+        img->set_mv_info(x,y+nC4,nC,nC3, inter.pb[1].motion);
+        break;
+      case PART_2NxnD:
+        img->set_mv_info(x,y    ,nC,nC3, inter.pb[0].motion);
+        img->set_mv_info(x,y+nC3,nC,nC4, inter.pb[1].motion);
+        break;
+      case PART_nLx2N:
+        img->set_mv_info(x    ,y,nC4,nC, inter.pb[0].motion);
+        img->set_mv_info(x+nC4,y,nC3,nC, inter.pb[1].motion);
+        break;
+      case PART_nRx2N:
+        img->set_mv_info(x    ,y,nC3,nC, inter.pb[0].motion);
+        img->set_mv_info(x+nC3,y,nC4,nC, inter.pb[1].motion);
+        break;
+      }
+    }
+  }
+  else {
+    for (int i=0;i<4;i++) {
+      if (children[i]) {
+        children[i]->write_to_image(img);
+      }
+    }
+  }
+}
+
+// Rebuilds the pixels of this CB into 'img': split CBs recurse, unsplit CBs reconstruct their transform tree.
+void enc_cb::reconstruct(encoder_context* ectx, de265_image* img) const
+{
+  if (split_cu_flag) {  // child slots may be NULL (write_to_image() guards the same way), so skip those
+    for (int i=0;i<4;i++) {
+      if (children[i]) children[i]->reconstruct(ectx, img);
+    }
+  }
+  else {                // metadata first: enc_tb::reconstruct_tb() reads the intra mode back from 'img'
+    write_to_image(img);
+    transform_tree->reconstruct(ectx,img,this,0);
+  }
+}
+
+
+// Writes split_cu_flag; the context (0..2) counts the available left/above neighbors whose ctDepth exceeds 'ctDepth'.
+void encode_split_cu_flag(encoder_context* ectx,
+                          CABAC_encoder* cabac,
+                          int x0, int y0, int ctDepth, int split_flag)
+{
+  logtrace(LogSymbols,"$1 split_cu_flag=%d\n",split_flag);
+
+  // check if neighbors are available
+
+  int availableL = check_CTB_available(ectx->img, x0,y0, x0-1,y0);
+  int availableA = check_CTB_available(ectx->img, x0,y0, x0,y0-1);
+
+  int condL = 0;
+  int condA = 0;
+
+  if (availableL && ectx->img->get_ctDepth(x0-1,y0) > ctDepth) condL=1;
+  if (availableA && ectx->img->get_ctDepth(x0,y0-1) > ctDepth) condA=1;
+
+  int contextOffset = condL + condA;
+  int context = contextOffset;
+
+  // encode bit
+
+  logtrace(LogSlice,"> split_cu_flag = %d (context=%d)\n",split_flag,context);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_SPLIT_CU_FLAG + context, split_flag);
+}
+
+// Writes part_mode. The trailing comments list the bins written for each mode ('b' marks a bypass bin).
+void encode_part_mode(encoder_context* ectx,
+                      CABAC_encoder* cabac,
+                      enum PredMode PredMode, enum PartMode PartMode, int cLog2CbSize)
+{
+  logtrace(LogSymbols,"$1 part_mode=%d\n",PartMode);
+  logtrace(LogSlice,"> part_mode = %d\n",PartMode);
+
+  if (PredMode == MODE_INTRA) {
+    int bin = (PartMode==PART_2Nx2N);  // intra: a single bin, 1 for 2Nx2N, 0 for any other mode
+    cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+0, bin);
+  }
+  else {
+    if (PartMode==PART_2Nx2N) {
+      cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+0, 1);  // inter 2Nx2N: 1
+      return;
+    }
+    else {
+      cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+0, 0);  // every other inter mode starts with 0
+    }
+
+    if (cLog2CbSize > ectx->sps.Log2MinCbSizeY) {
+      if (ectx->sps.amp_enabled_flag) {
+        switch (PartMode) {
+        case PART_2NxN:   // 0 1 1
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 1);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 1);
+          break;
+        case PART_Nx2N:   // 0 0 1
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 0);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 1);
+          break;
+        case PART_2NxnU:  // 0 1 0 b0
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 1);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 0);
+          cabac->write_CABAC_bypass(0);
+          break;
+        case PART_2NxnD:  // 0 1 0 b1
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 1);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 0);
+          cabac->write_CABAC_bypass(1);
+          break;
+        case PART_nLx2N:  // 0 0 0 b0
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 0);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 0);
+          cabac->write_CABAC_bypass(0);
+          break;
+        case PART_nRx2N:  // 0 0 0 b1
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 0);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 0);
+          cabac->write_CABAC_bypass(1);
+          break;
+        case PART_NxN:    // not allowed in this branch; 2Nx2N already returned above
+        case PART_2Nx2N:
+          assert(false);
+          break;
+        }
+      }
+      else {
+        if (PartMode==PART_2NxN) {
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 1);  // 0 1
+        }
+        else {
+          assert(PartMode==PART_Nx2N);
+          cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 0);  // 0 0
+        }
+      }
+    }
+    else {
+      if (PartMode==PART_2NxN) {
+        cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 1);    // 0 1
+      }
+      else {
+        cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+1, 0);
+        // NOTE(review): the third bin below uses context +3, like the AMP branch; verify against the decoder's part_mode parsing
+        if (cLog2CbSize==3) {
+          assert(PartMode==PART_Nx2N);   // 0 0, no third bin for cLog2CbSize 3
+        }
+        else {
+          if (PartMode==PART_Nx2N) {
+            cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 1);  // 0 0 1
+          }
+          else {
+            assert(PartMode==PART_NxN);
+            cabac->write_CABAC_bit(CONTEXT_MODEL_PART_MODE+3, 0);  // 0 0 0
+          }
+        }
+      }
+    }
+  }
+}
+
+
+static void encode_pred_mode_flag(encoder_context* ectx,
+                                  CABAC_encoder* cabac,
+                                  enum PredMode PredMode)
+{
+  // pred_mode_flag is one context-coded bin: 1 for MODE_INTRA, 0 for anything else
+  logtrace(LogSlice,"> pred_mode = %d\n",PredMode);
+  const bool isIntra = (PredMode == MODE_INTRA);
+  const int  bin     = isIntra;
+  logtrace(LogSymbols,"$1 pred_mode=%d\n",bin);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_PRED_MODE_FLAG, bin);
+}
+
+
+static void encode_prev_intra_luma_pred_flag(encoder_context* ectx,
+                                             CABAC_encoder* cabac,
+                                             int intraPred)
+{
+  // the flag is 1 exactly when intraPred is non-negative
+  const int flagBin = (intraPred < 0) ? 0 : 1;
+
+  logtrace(LogSymbols,"$1 prev_intra_luma_pred_flag=%d\n",intraPred>=0);
+  logtrace(LogSlice,"> prev_intra_luma_pred_flag = %d\n",flagBin);
+  cabac->write_CABAC_bit(CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, flagBin);
+}
+
+static void encode_intra_mpm_or_rem(encoder_context* ectx,
+                                    CABAC_encoder* cabac,
+                                    int intraPred)
+{
+  if (intraPred<0) {  // negative input carries rem_intra_luma_pred_mode, stored as -(rem+1)
+    logtrace(LogSymbols,"$1 rem_intra_luma_pred_mode=%d\n",-intraPred-1);
+    logtrace(LogSlice,"> rem_intra_luma_pred_mode = %d\n",-intraPred-1);
+    cabac->write_CABAC_FL_bypass(-intraPred-1, 5);
+    return;
+  }
+  // non-negative input is mpm_idx, which must not exceed 2
+  logtrace(LogSymbols,"$1 mpm_idx=%d\n",intraPred);
+  logtrace(LogSlice,"> mpm_idx = %d\n",intraPred);
+  assert(intraPred<=2);
+  cabac->write_CABAC_TU_bypass(intraPred, 2);
+}
+
+
+static void encode_intra_chroma_pred_mode(encoder_context* ectx,
+                                          CABAC_encoder* cabac,
+                                          int mode)
+{
+  logtrace(LogSymbols,"$1 intra_chroma_pred_mode=%d\n",mode);
+  logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode);
+
+  // mode 4 is a single 0-bin; any other mode is a 1-bin followed by the mode in two bypass-coded bits
+  const bool explicitMode = (mode!=4);
+  if (explicitMode) { assert(mode<4); }
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE, explicitMode ? 1 : 0);
+
+  if (explicitMode) {
+    cabac->write_CABAC_FL_bypass(mode, 2);
+  }
+}
+
+
+/* Maps the wanted chroma prediction mode to the IntraChromaPredMode symbol to code, given the luma mode.
+ * Optimized variant that tests the most likely branch (chroma mode == luma mode) first. */
+enum IntraChromaPredMode find_chroma_pred_mode(enum IntraPredMode chroma_mode,
+                                               enum IntraPredMode luma_mode)
+{
+  // most likely mode: chroma mode = luma mode
+
+  if (luma_mode==chroma_mode) {
+    return INTRA_CHROMA_LIKE_LUMA;
+  }
+
+  // from here on the two modes are known to differ
+  // check remaining candidates
+
+  IntraPredMode mode = chroma_mode;
+
+  // angular-34 is coded by setting the coded mode equal to the luma_mode
+  if (chroma_mode == INTRA_ANGULAR_34) {
+    mode = luma_mode;
+  }
+  // any mode other than the four candidates below cannot be coded and trips the assert
+  switch (mode) {
+  case INTRA_PLANAR:     return INTRA_CHROMA_PLANAR_OR_34;
+  case INTRA_ANGULAR_26: return INTRA_CHROMA_ANGULAR_26_OR_34;
+  case INTRA_ANGULAR_10: return INTRA_CHROMA_ANGULAR_10_OR_34;
+  case INTRA_DC:         return INTRA_CHROMA_DC_OR_34;
+  default:
+    assert(false);
+    return INTRA_CHROMA_DC_OR_34;
+  }
+}
+
+
+
+void encode_split_transform_flag(encoder_context* ectx,
+                                 CABAC_encoder* cabac,
+                                 int log2TrafoSize, int split_flag)
+{
+  logtrace(LogSymbols,"$1 split_transform_flag=%d\n",split_flag);
+  logtrace(LogSlice,"> split_transform_flag = %d\n",split_flag);
+  // the context depends on the transform size only: log2 sizes 5,4,3 map to 0,1,2
+  const int ctxIdxInc = 5-log2TrafoSize;
+  assert(0 <= ctxIdxInc && ctxIdxInc <= 2);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + ctxIdxInc, split_flag);
+}
+
+
+void encode_cbf_luma(CABAC_encoder* cabac,
+                     bool zeroTrafoDepth, int cbf_luma)
+{
+  logtrace(LogSymbols,"$1 cbf_luma=%d\n",cbf_luma);
+  logtrace(LogSlice,"> cbf_luma = %d\n",cbf_luma);
+  // two contexts: index 1 when zeroTrafoDepth is set, index 0 otherwise
+  int ctxIdxInc = 0;
+  if (zeroTrafoDepth) { ctxIdxInc = 1; }
+  cabac->write_CABAC_bit(CONTEXT_MODEL_CBF_LUMA + ctxIdxInc, cbf_luma);
+}
+
+
+void encode_cbf_chroma(CABAC_encoder* cabac,
+                       int trafoDepth, int cbf_chroma)
+{
+  logtrace(LogSymbols,"$1 cbf_chroma=%d\n",cbf_chroma);
+  logtrace(LogSlice,"> cbf_chroma = %d\n",cbf_chroma);
+
+  // the transform depth (0..3) is used directly as the context index
+  assert(trafoDepth >= 0 && trafoDepth <= 3);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_CBF_CHROMA + trafoDepth, cbf_chroma);
+}
+
+static inline void encode_coded_sub_block_flag(encoder_context* ectx,
+                                               CABAC_encoder* cabac,
+                                               int cIdx,
+                                               uint8_t coded_sub_block_neighbors,
+                                               int flag)
+{
+  logtrace(LogSymbols,"$1 coded_sub_block_flag=%d\n",flag);
+  logtrace(LogSlice,"# coded_sub_block_flag = %d\n",flag);
+
+  // bit 0 of the neighbor mask: right neighbor set, bit 1: bottom neighbor set;
+  // OR-ing the two gives csbfCtx=1 as soon as either neighbor is set
+  const int rightSet  = (coded_sub_block_neighbors &  1);
+  const int bottomSet = (coded_sub_block_neighbors >> 1);
+  const int csbfCtx   = (rightSet | bottomSet);
+
+  // components other than cIdx 0 use the contexts two entries further on
+  const int ctxIdxInc = csbfCtx + (cIdx!=0 ? 2 : 0);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc, flag);
+}
+
+static inline void encode_significant_coeff_flag_lookup(encoder_context* ectx,
+                                                        CABAC_encoder* cabac,
+                                                        uint8_t ctxIdxInc,
+                                                        int significantFlag)
+{
+  logtrace(LogSymbols,"$1 significant_coeff_flag=%d\n",significantFlag);
+  logtrace(LogSlice,"# significant_coeff_flag = %d\n",significantFlag);
+  logtrace(LogSlice,"context: %d\n",ctxIdxInc);
+  // writes significant_coeff_flag; ctxIdxInc comes from the caller and is only added to the model base here
+  cabac->write_CABAC_bit(CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc, significantFlag);
+}
+
+static inline void encode_coeff_abs_level_greater1(encoder_context* ectx,
+                                                   CABAC_encoder* cabac,
+                                                   int cIdx, int i,
+                                                   bool firstCoeffInSubblock,
+                                                   bool firstSubblock,
+                                                   int  lastSubblock_greater1Ctx,
+                                                   int* lastInvocation_greater1Ctx,
+                                                   int* lastInvocation_coeff_abs_level_greater1_flag,
+                                                   int* lastInvocation_ctxSet, int c1,
+                                                   int value)
+{
+  logtrace(LogSymbols,"$1 coeff_abs_level_greater1=%d\n",value);
+  logtrace(LogSlice,"# coeff_abs_level_greater1 = %d\n",value);
+
+  logtrace(LogSlice,"  cIdx:%d i:%d firstCoeffInSB:%d firstSB:%d lastSB>1:%d last>1Ctx:%d lastLev>1:%d lastCtxSet:%d\n", cIdx,i,firstCoeffInSubblock,firstSubblock,lastSubblock_greater1Ctx,
+	   *lastInvocation_greater1Ctx,
+	   *lastInvocation_coeff_abs_level_greater1_flag,
+	   *lastInvocation_ctxSet);
+  // State is carried from call to call through the three lastInvocation_* pointers (read below, updated at the end).
+  int lastGreater1Ctx;  // only feeds the ctxSet derived below, which is later replaced by c1
+  int greater1Ctx;
+  int ctxSet;
+
+  logtrace(LogSlice,"c1: %d\n",c1);
+
+  if (firstCoeffInSubblock) {
+    // block with real DC -> ctx 0
+    if (i==0 || cIdx>0) { ctxSet=0; }
+    else { ctxSet=2; }
+
+    if (firstSubblock) { lastGreater1Ctx=1; }
+    else { lastGreater1Ctx = lastSubblock_greater1Ctx; }
+
+    if (lastGreater1Ctx==0) { ctxSet++; }
+
+    logtrace(LogSlice,"ctxSet: %d\n",ctxSet);
+
+    greater1Ctx=1;
+  }
+  else { // !firstCoeffInSubblock
+    ctxSet = *lastInvocation_ctxSet;
+    logtrace(LogSlice,"ctxSet (old): %d\n",ctxSet);
+
+    greater1Ctx = *lastInvocation_greater1Ctx;
+    if (greater1Ctx>0) {  // once at 0 it stays 0; otherwise a previous flag of 1 resets it, anything else increments
+      int lastGreater1Flag=*lastInvocation_coeff_abs_level_greater1_flag;
+      if (lastGreater1Flag==1) greater1Ctx=0;
+      else { /*if (greater1Ctx>0)*/ greater1Ctx++; }
+    }
+  }
+  // NOTE: the ctxSet derived above is discarded here; the caller-supplied c1 selects the context set
+  ctxSet = c1; // use HM algo
+
+  int ctxIdxInc = (ctxSet*4) + (greater1Ctx>=3 ? 3 : greater1Ctx);  // four contexts per set, greater1Ctx capped at 3
+
+  if (cIdx>0) { ctxIdxInc+=16; }
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + ctxIdxInc,  value);
+
+  *lastInvocation_greater1Ctx = greater1Ctx;
+  *lastInvocation_coeff_abs_level_greater1_flag = value;
+  *lastInvocation_ctxSet = ctxSet;
+}
+
+static void encode_coeff_abs_level_greater2(encoder_context* ectx,
+                                            CABAC_encoder* cabac,
+                                            int cIdx, // int i,int n,
+                                            int ctxSet,
+                                            int value)
+{
+  logtrace(LogSymbols,"$1 coeff_abs_level_greater2=%d\n",value);
+  logtrace(LogSlice,"# coeff_abs_level_greater2 = %d\n",value);
+
+  // one context per ctxSet; for cIdx>0 the index is shifted by four
+  const int chromaOffset = (cIdx>0) ? 4 : 0;
+  const int ctxIdxInc    = ctxSet + chromaOffset;
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc,  value);
+}
+
+
+bool TU(int val, int maxi)
+{
+  // debug helper: prints 'val' ones, then a terminating zero unless val has reached 'maxi'; returns whether it did
+  for (int n=val; n>0; n--) { printf("1"); }
+  const bool reachedMax = !(val<maxi);
+  if (!reachedMax) { printf("0"); }
+  return reachedMax;
+}
+
+void bin(int val, int bits)
+{
+  // debug helper: prints the 'bits' lowest bits of 'val', most significant first
+  for (int shift=bits-1; shift>=0; shift--) {
+    printf("%c", (val & (1<<shift)) ? '1' : '0');
+  }
+}
+
+void ExpG(int level, int riceParam)
+{
+  // debug helper: prints 'level' as a run of ones, "0.", the offset bits of the quotient,
+  // ":" and finally the riceParam low-order bits of 'level'
+  const int quotient  = level >> riceParam;
+  const int remainder = level - (quotient<<riceParam);
+  //printf("%d %d ",quotient,remainder);
+
+  // each printed '1' skips one group of quotient values; group sizes double (1,2,4,...)
+  int groupStart = 0;
+  int groupBits  = 0;
+  for (int groupSize=1; quotient >= groupStart+groupSize; groupSize*=2) {
+    printf("1");
+    groupStart += groupSize;
+    groupBits++;
+  }
+
+  printf("0.");
+  bin(quotient-groupStart, groupBits);
+  printf(":");
+  bin(remainder,riceParam);
+}
+
+int blamain()
+{
+  int riceParam=2;
+  int TRMax = 4<<riceParam;
+  // debug driver: for levels 0..127 prints "<level>: <TU prefix>:<low bits>|<ExpG escape>", one per line
+  for (int level=0;level<128;level++)
+    {
+      printf("%d: ",level);
+
+      int prefixPart = std::min(TRMax, level);
+
+      // code TR prefix
+
+      bool isMaxi = TU(prefixPart>>riceParam, TRMax>>riceParam);
+      printf(":");
+      if (TRMax>prefixPart) {
+        int remain = prefixPart & ((1<<riceParam)-1);
+        bin(remain, riceParam);
+      }
+      printf("|");
+      // escape part: only when the unary prefix reached its maximum
+      if (isMaxi) {
+        ExpG(level-TRMax, riceParam+1);
+      }
+
+      printf("\n");
+    }
+
+  return 0;
+}
+
+
+static void encode_coeff_abs_level_remaining(encoder_context* ectx,
+                                             CABAC_encoder* cabac,
+                                             int cRiceParam,
+                                             int level)
+{
+  logtrace(LogSymbols,"$1 coeff_abs_level_remaining=%d\n",level);
+  logtrace(LogSlice,"# encode_coeff_abs_level_remaining = %d\n",level);
+  // Levels below cTRMax are fully coded by the prefix part; larger ones continue in the remainder suffix below.
+  int cTRMax = 4<<cRiceParam;
+  int prefixPart = std::min(level, cTRMax);
+
+  // --- code prefix with TR ---
+
+  // TU part, length 4 (cTRMax>>riceParam)
+
+  int nOnes = (prefixPart>>cRiceParam);
+  cabac->write_CABAC_TU_bypass(nOnes, 4);
+
+  // TR suffix
+
+  if (cTRMax > prefixPart) {
+    int remain = prefixPart & ((1<<cRiceParam)-1);
+    cabac->write_CABAC_FL_bypass(remain, cRiceParam);
+  }
+
+  // nOnes==4 means level >= cTRMax: the excess over cTRMax is coded with one more low-order bit than above
+  // --- remainder suffix ---
+
+  if (nOnes==4) {
+    int remain = level-cTRMax;
+    int ExpGRiceParam = cRiceParam+1;
+
+    int prefix = remain >> ExpGRiceParam;
+    int suffix = remain - (prefix<<ExpGRiceParam);
+    // unary part: each 1-bit skips a group of 'range' prefix values; nBits offset bits then select inside the group
+    int base=0;
+    int range=1;
+    int nBits=0;
+    while (prefix >= base+range) {
+      cabac->write_CABAC_bypass(1);
+      base+=range;
+      range*=2;
+      nBits++;
+    }
+
+    cabac->write_CABAC_bypass(0);
+    cabac->write_CABAC_FL_bypass(prefix-base, nBits);
+    cabac->write_CABAC_FL_bypass(suffix, ExpGRiceParam);
+  }
+}
+
+// ---------------------------------------------------------------------------
+
+void findLastSignificantCoeff(const position* sbScan, const position* cScan,
+                              const int16_t* coeff, int log2TrafoSize,
+                              int* lastSignificantX, int* lastSignificantY,
+                              int* lastSb, int* lastPos)
+{
+  int nSb = 1<<((log2TrafoSize-2)<<1); // number of sub-blocks
+  // Walks the sub-blocks, and the 16 positions inside each, from the end of both scan tables towards the start.
+  // find last significant coefficient
+
+  for (int i=nSb ; i-->0 ;) {
+    int x0 = sbScan[i].x << 2;  // sub-block origin, in coefficient units
+    int y0 = sbScan[i].y << 2;
+    for (int c=16 ; c-->0 ;) {
+      int x = x0 + cScan[c].x;
+      int y = y0 + cScan[c].y;
+
+      if (coeff[x+(y<<log2TrafoSize)]) {  // the first nonzero hit is the last one in forward scan order
+        *lastSignificantX = x;
+        *lastSignificantY = y;
+        *lastSb = i;
+        *lastPos= c;
+
+        logtrace(LogSlice,"last significant coeff at: %d;%d, Sb:%d Pos:%d\n", x,y,i,c);
+
+        return;
+      }
+    }
+  }
+  // NOTE: when asserts are compiled out, the four output values are left unwritten on this path
+  // all coefficients == 0 ? cannot be since cbf should be false in this case
+  assert(false);
+}
+
+
+bool subblock_has_nonzero_coefficient(const int16_t* coeff, int coeffStride,
+                                      const position& sbPos)
+{
+  // sbPos is given in units of 4x4 sub-blocks: scale it by 4 to reach
+  // the first coefficient of the sub-block, then test its four rows
+  const int16_t* row = coeff + ((sbPos.x << 2) + (sbPos.y << 2)*coeffStride);
+
+  for (int rowIdx=0; rowIdx<4; rowIdx++, row+=coeffStride) {
+    for (int col=0; col<4; col++) {
+      if (row[col] != 0) { return true; }
+    }
+  }
+
+  return false;
+}
+
+/*
+  Example 16x16:  prefix in [0;7]
+
+  prefix       | last pos
+  =============|=============
+  0            |   0
+  1            |   1
+  2            |   2
+  3            |   3
+  -------------+-------------
+     lsb nBits |
+  4   0    1   |   4, 5
+  5   1    1   |   6, 7
+  6   0    2   |   8, 9,10,11
+  7   1    2   |  12,13,14,15
+*/
+/* Write a last_significant_coeff prefix as a truncated unary code with
+   context-coded bins.
+
+   log2TrafoSize       - log2 of the transform block size, defines the largest
+                         prefix value cMax = 2*log2TrafoSize-1
+   cIdx                - colour component (0 selects the luma context layout)
+   lastSignificant     - prefix value to write
+   context_model_index - first context model of the X or Y prefix
+
+   'lastSignificant' 1-bins are written, followed by a terminating 0-bin unless
+   the prefix equals cMax.
+ */
+void encode_last_signficiant_coeff_prefix(encoder_context* ectx,
+                                          CABAC_encoder* cabac,
+                                          int log2TrafoSize,
+                                          int cIdx, int lastSignificant,
+                                          int context_model_index)
+{
+  logtrace(LogSlice,"> last_significant_coeff_prefix=%d log2TrafoSize:%d cIdx:%d\n",
+           lastSignificant,log2TrafoSize,cIdx);
+
+  const int cMax = (log2TrafoSize<<1)-1;
+
+  // context offset and shift depend on the component and on the block size
+  const bool isLuma = (cIdx==0);
+  const int ctxOffset = isLuma ? (3*(log2TrafoSize-2) + ((log2TrafoSize-1)>>2)) : 15;
+  const int ctxShift  = isLuma ? ((log2TrafoSize+1)>>2) : (log2TrafoSize-2);
+
+  const int ctxBase = context_model_index + ctxOffset;
+
+  // unary part: one 1-bin for each step up to the prefix value
+  int binIdx = 0;
+  while (binIdx < lastSignificant) {
+    cabac->write_CABAC_bit(ctxBase + (binIdx >> ctxShift), 1);
+    binIdx++;
+  }
+
+  // terminating 0-bin, omitted for the largest possible prefix
+  if (lastSignificant != cMax) {
+    cabac->write_CABAC_bit(ctxBase + (lastSignificant >> ctxShift), 0);
+  }
+}
+
+
+/* Split a last-significant-coefficient coordinate into the prefix and the
+   fixed-length suffix that are written to the bitstream.
+
+   pos         - coordinate to split
+   prefix      - receives the prefix value
+   suffix      - receives the suffix value (-1 if there is no suffix)
+   nSuffixBits - receives the number of suffix bits (0 if there is no suffix)
+
+   Positions 0..3 map directly to the prefix. Larger positions are organized in
+   groups of 4, 8, 16, ... positions; each group is covered by two prefix
+   values (lower and upper half) and the suffix addresses the position within
+   that half.
+ */
+void split_last_significant_position(int pos, int* prefix, int* suffix, int* nSuffixBits)
+{
+  logtrace(LogSlice,"split position %d : ",pos);
+
+  if (pos<=3) {
+    // most frequent case: no suffix needed
+    *prefix = pos;
+    *suffix = -1; // just to have some defined value
+    *nSuffixBits = 0;
+  }
+  else {
+    int offset = pos-4;   // offset within the current group
+    int bits = 1;         // suffix length of the current group
+    int groupSize = 4;    // number of positions in the current group
+
+    for ( ; offset>=groupSize ; groupSize<<=1) {
+      offset -= groupSize;
+      bits++;
+    }
+
+    const int half = groupSize>>1;
+    const int inUpperHalf = (offset >= half) ? 1 : 0;
+
+    *prefix = ((1+bits)<<1) | inUpperHalf;
+    *suffix = inUpperHalf ? (offset-half) : offset;
+    *nSuffixBits = bits;
+  }
+
+  logtrace(LogSlice,"prefix=%d suffix=%d (%d bits)\n",*prefix,*suffix,*nSuffixBits);
+}
+
+
+extern uint8_t* ctxIdxLookup[4 /* 4-log2-32 */][2 /* !!cIdx */][2 /* !!scanIdx */][4 /* prevCsbf */];
+
+/* Write the residual data (transform coefficients) of one colour component of a
+   transform block through 'cabac'.
+
+   ectx          - encoder context; supplies the image with its SPS/PPS
+   cabac         - CABAC encoder receiving all bins
+   tb            - transform block; tb->coeff[cIdx] holds the coefficients,
+                   addressed as coeff[x + (y<<log2TrafoSize)]
+   cb            - coding block; supplies PredMode, the intra chroma mode and
+                   cu_transquant_bypass_flag
+   x0,y0         - position used to read the luma intra prediction mode from the
+                   image metadata (needed to select the scan order)
+   log2TrafoSize - log2 of the block size of this component
+   cIdx          - colour component index (0 is handled as luma, >0 as chroma)
+
+   Coding order: position of the last significant coefficient (X/Y prefix, then
+   X/Y suffix), then for every sub-block from the last one down to sub-block 0:
+   coded_sub_block_flag, significant_coeff flags, greater-1 flags, greater-2
+   flag, sign bits and the remaining absolute levels.
+
+   The block must contain at least one non-zero coefficient
+   (see findLastSignificantCoeff()).
+ */
+void encode_residual(encoder_context* ectx,
+                     CABAC_encoder* cabac,
+                     const enc_tb* tb, const enc_cb* cb,
+                     int x0,int y0,int log2TrafoSize,int cIdx)
+{
+  const de265_image* img = ectx->img;
+  const seq_parameter_set& sps = img->sps;
+  const pic_parameter_set& pps = img->pps;
+
+  int16_t* coeff = tb->coeff[cIdx];
+
+  // placeholder: nothing is written for transform-skip yet
+  if (pps.transform_skip_enabled_flag && true /* TODO */) {
+  }
+
+
+#if 0
+  printf("write coefficients\n");
+  for (int y=0;y<(1<<log2TrafoSize);y++)
+    {
+      for (int x=0;x<(1<<log2TrafoSize);x++)
+        {
+          printf("%4d ",coeff[x+y*(1<<log2TrafoSize)]);
+        }
+      printf("\n");
+    }
+#endif
+
+
+  // --- get scan orders ---
+
+  enum PredMode PredMode = cb->PredMode;
+  int scanIdx;
+
+  // intra blocks select the scan from the prediction mode, all others use scan 0
+  if (PredMode == MODE_INTRA) {
+    if (cIdx==0) {
+      scanIdx = get_intra_scan_idx_luma(log2TrafoSize, img->get_IntraPredMode(x0,y0));
+    }
+    else {
+      enum IntraPredMode chromaMode = cb->intra.chroma_mode;
+      /*
+      enum IntraPredMode chromaMode = lumaPredMode_to_chromaPredMode(img->get_IntraPredMode(x0,y0),
+                                                                     cb->intra.chroma_mode);
+      */
+      scanIdx = get_intra_scan_idx_chroma(log2TrafoSize, chromaMode);
+    }
+  }
+  else {
+    scanIdx=0;
+  }
+
+
+  // scan order of the sub-blocks and of the coefficients inside a 4x4 sub-block
+  const position* ScanOrderSub = get_scan_order(log2TrafoSize-2, scanIdx);
+  const position* ScanOrderPos = get_scan_order(2, scanIdx);
+
+  // --- position of the last significant coefficient ---
+
+  int lastSignificantX, lastSignificantY;
+  int lastScanPos;
+  int lastSubBlock;
+  findLastSignificantCoeff(ScanOrderSub, ScanOrderPos,
+                           coeff, log2TrafoSize,
+                           &lastSignificantX, &lastSignificantY,
+                           &lastSubBlock, &lastScanPos);
+
+  int codedSignificantX = lastSignificantX;
+  int codedSignificantY = lastSignificantY;
+
+  // for scanIdx 2, the X and Y coordinates are written in swapped order
+  if (scanIdx==2) {
+    std::swap(codedSignificantX, codedSignificantY);
+  }
+
+
+
+  int prefixX, suffixX, suffixBitsX;
+  int prefixY, suffixY, suffixBitsY;
+
+  split_last_significant_position(codedSignificantX, &prefixX,&suffixX,&suffixBitsX);
+  split_last_significant_position(codedSignificantY, &prefixY,&suffixY,&suffixBitsY);
+
+  // both prefixes are written before the suffixes
+  encode_last_signficiant_coeff_prefix(ectx, cabac, log2TrafoSize, cIdx, prefixX,
+                                       CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX);
+
+  encode_last_signficiant_coeff_prefix(ectx, cabac, log2TrafoSize, cIdx, prefixY,
+                                       CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX);
+
+
+  // positions 0..3 are fully described by the prefix and have no suffix
+  if (codedSignificantX > 3) {
+    cabac->write_CABAC_FL_bypass(suffixX, suffixBitsX);
+  }
+  if (codedSignificantY > 3) {
+    cabac->write_CABAC_FL_bypass(suffixY, suffixBitsY);
+  }
+
+
+
+  int sbWidth = 1<<(log2TrafoSize-2);   // block width in sub-blocks
+  int CoeffStride = 1<<log2TrafoSize;
+
+  // One entry per sub-block (array sized for 64 entries).
+  // Bit 0: the right neighbor sub-block is coded, bit 1: the sub-block below is coded.
+  uint8_t coded_sub_block_neighbors[32/4*32/4];
+  memset(coded_sub_block_neighbors,0,sbWidth*sbWidth);
+
+  int  c1 = 1;
+  bool firstSubblock = true;           // for coeff_abs_level_greater1_flag context model
+  int  lastSubblock_greater1Ctx=false; /* for coeff_abs_level_greater1_flag context model
+                                          (initialization not strictly needed)
+                                       */
+
+  // state carried between successive encode_coeff_abs_level_greater1() calls
+  int  lastInvocation_greater1Ctx=0;
+  int  lastInvocation_coeff_abs_level_greater1_flag=0;
+  int  lastInvocation_ctxSet=0;
+
+
+
+  // ----- encode coefficients -----
+
+  //tctx->nCoeff[cIdx] = 0;
+
+
+  // i - subblock index
+  // n - coefficient index in subblock
+
+  for (int i=lastSubBlock;i>=0;i--) {
+    position S = ScanOrderSub[i];
+    int inferSbDcSigCoeffFlag=0;
+
+    logtrace(LogSlice,"sub block scan idx: %d\n",i);
+
+
+    // --- check whether this sub-block has to be coded ---
+
+    int sub_block_is_coded = 0;
+
+    if ((i<lastSubBlock) && (i>0)) {
+      // only sub-blocks between the first and the last one get an explicit flag
+      sub_block_is_coded = subblock_has_nonzero_coefficient(coeff, CoeffStride, S);
+      encode_coded_sub_block_flag(ectx, cabac, cIdx,
+                                  coded_sub_block_neighbors[S.x+S.y*sbWidth],
+                                  sub_block_is_coded);
+      inferSbDcSigCoeffFlag=1;
+    }
+    else if (i==0 || i==lastSubBlock) {
+      // first (DC) and last sub-block are always coded
+      // - the first will most probably contain coefficients
+      // - the last obviously contains the last coded coefficient
+
+      sub_block_is_coded = 1;
+    }
+
+    // tell the left and the upper neighbor that this sub-block is coded
+    if (sub_block_is_coded) {
+      if (S.x > 0) coded_sub_block_neighbors[S.x-1 + S.y  *sbWidth] |= 1;
+      if (S.y > 0) coded_sub_block_neighbors[S.x + (S.y-1)*sbWidth] |= 2;
+    }
+
+    logtrace(LogSlice,"subblock is coded: %s\n", sub_block_is_coded ? "yes":"no");
+
+
+    // --- write significant coefficient flags ---
+
+    // Significant coefficients of this sub-block, collected in coding order
+    // (highest scan position first).
+    int16_t  coeff_value[16];              // value, later replaced by its absolute value
+    int16_t  coeff_baseLevel[16];          // level already expressed by the greater-1/2 flags
+    int8_t   coeff_scan_pos[16];           // scan position inside the sub-block
+    int8_t   coeff_sign[16];               // 1 for negative coefficients
+    int8_t   coeff_has_max_base_level[16]; // 1 if a remaining level has to be coded
+    int nCoefficients=0;
+
+
+    if (sub_block_is_coded) {
+      // NOTE: these shadow the function parameters x0,y0; from here on they are the
+      // top-left coefficient position of the sub-block.
+      int x0 = S.x<<2;
+      int y0 = S.y<<2;
+
+      int log2w = log2TrafoSize-2;
+      int prevCsbf = coded_sub_block_neighbors[S.x+S.y*sbWidth];
+      uint8_t* ctxIdxMap = ctxIdxLookup[log2w][!!cIdx][!!scanIdx][prevCsbf];
+
+
+
+
+      // set the last coded coefficient in the last subblock
+      // (it is known to be significant, so no flag is written for it)
+
+      if (i==lastSubBlock) {
+        coeff_value[nCoefficients] = coeff[lastSignificantX+(lastSignificantY<<log2TrafoSize)];
+        coeff_has_max_base_level[nCoefficients] = 1;  // TODO
+        coeff_scan_pos[nCoefficients] = lastScanPos;
+        nCoefficients++;
+      }
+
+
+      // --- encode all coefficients' significant_coeff flags except for the DC coefficient ---
+
+      // in the last sub-block, start right below the last significant coefficient
+      int last_coeff =  (i==lastSubBlock) ? lastScanPos-1 : 15;
+
+      for (int n= last_coeff ; n>0 ; n--) {
+        int subX = ScanOrderPos[n].x;
+        int subY = ScanOrderPos[n].y;
+        int xC = x0 + subX;
+        int yC = y0 + subY;
+
+
+        // for all AC coefficients in sub-block, a significant_coeff flag is coded
+
+        int isSignificant = !!tb->coeff[cIdx][xC + (yC<<log2TrafoSize)];
+
+        logtrace(LogSlice,"coeff %d is significant: %d\n", n, isSignificant);
+
+        logtrace(LogSlice,"context idx: %d;%d\n",xC,yC);
+
+        encode_significant_coeff_flag_lookup(ectx, cabac,
+                                             ctxIdxMap[xC+(yC<<log2TrafoSize)],
+                                             isSignificant);
+        //ctxIdxMap[(i<<4)+n]);
+
+        if (isSignificant) {
+          coeff_value[nCoefficients] = coeff[xC+(yC<<log2TrafoSize)];
+          coeff_has_max_base_level[nCoefficients] = 1;
+          coeff_scan_pos[nCoefficients] = n;
+          nCoefficients++;
+
+          // since we have a coefficient in the sub-block,
+          // we cannot infer the DC coefficient anymore
+          inferSbDcSigCoeffFlag = 0;
+        }
+      }
+
+
+      // --- encode DC coefficient significance ---
+
+      if (last_coeff>=0) // last coded coefficient (always set to 1) is not the DC coefficient
+        {
+          if (inferSbDcSigCoeffFlag==0) {
+            // if we cannot infer the DC coefficient, it is coded
+            int isSignificant = !!tb->coeff[cIdx][x0 + (y0<<log2TrafoSize)];
+
+            logtrace(LogSlice,"DC coeff is significant: %d\n", isSignificant);
+
+            encode_significant_coeff_flag_lookup(ectx, cabac,
+                                                 ctxIdxMap[x0+(y0<<log2TrafoSize)],
+                                                 isSignificant);
+
+            if (isSignificant) {
+              coeff_value[nCoefficients] = coeff[x0+(y0<<log2TrafoSize)];
+              coeff_has_max_base_level[nCoefficients] = 1;
+              coeff_scan_pos[nCoefficients] = 0;
+              nCoefficients++;
+            }
+          }
+          else {
+            // we can infer that the DC coefficient must be present
+            coeff_value[nCoefficients] = coeff[x0+(y0<<log2TrafoSize)];
+            coeff_has_max_base_level[nCoefficients] = 1;
+            coeff_scan_pos[nCoefficients] = 0;
+            nCoefficients++;
+          }
+        }
+    }
+
+
+
+    // --- encode coefficient values ---
+
+    if (nCoefficients) {
+
+      // separate absolute coefficient value and sign
+
+      logtrace(LogSlice,"coefficients to code: ");
+
+      for (int l=0;l<nCoefficients;l++) {
+        logtrace(LogSlice,"%d ",coeff_value[l]);
+
+        if (coeff_value[l]<0) {
+          coeff_value[l] = -coeff_value[l];
+          coeff_sign[l] = 1;
+        }
+        else {
+          coeff_sign[l] = 0;
+        }
+
+        // every significant coefficient has an absolute value of at least 1
+        coeff_baseLevel[l] = 1;
+
+        logtrace(LogSlice,"(%d) ",coeff_scan_pos[l]);
+      }
+
+      logtrace(LogSlice,"\n");
+
+
+      // context set for the greater-1 flags of this sub-block
+      int ctxSet;
+      if (i==0 || cIdx>0) { ctxSet=0; }
+      else { ctxSet=2; }
+
+      // c1==0 means that the previously coded sub-block had a greater-1 coefficient
+      if (c1==0) { ctxSet++; }
+      c1=1;
+
+
+      // --- encode greater-1 flags ---
+
+      // index of the first coefficient with greater1_flag set (-1: none)
+      int newLastGreater1ScanPos=-1;
+
+      // greater-1 flags are written for at most the first 8 coefficients
+      int lastGreater1Coefficient = libde265_min(8,nCoefficients);
+      for (int c=0;c<lastGreater1Coefficient;c++) {
+        int greater1_flag = (coeff_value[c]>1);
+
+        encode_coeff_abs_level_greater1(ectx, cabac, cIdx,i,
+                                        c==0,
+                                        firstSubblock,
+                                        lastSubblock_greater1Ctx,
+                                        &lastInvocation_greater1Ctx,
+                                        &lastInvocation_coeff_abs_level_greater1_flag,
+                                        &lastInvocation_ctxSet, ctxSet,
+                                        greater1_flag);
+
+        if (greater1_flag) {
+          coeff_baseLevel[c]++;
+
+          c1=0;
+
+          if (newLastGreater1ScanPos == -1) {
+            newLastGreater1ScanPos=c;
+          }
+        }
+        else {
+          // absolute value is exactly 1, nothing remains to be coded
+          coeff_has_max_base_level[c] = 0;
+
+          if (c1<3 && c1>0) {
+            c1++;
+          }
+        }
+      }
+
+      firstSubblock = false;
+      lastSubblock_greater1Ctx = lastInvocation_greater1Ctx;
+
+
+      // --- encode greater-2 flag ---
+
+      // only written for the first coefficient that has its greater-1 flag set
+      if (newLastGreater1ScanPos != -1) {
+        int greater2_flag = (coeff_value[newLastGreater1ScanPos]>2);
+        encode_coeff_abs_level_greater2(ectx,cabac, cIdx, lastInvocation_ctxSet, greater2_flag);
+        coeff_baseLevel[newLastGreater1ScanPos] += greater2_flag;
+        coeff_has_max_base_level[newLastGreater1ScanPos] = greater2_flag;
+      }
+
+
+      // --- encode coefficient signs ---
+
+      // the sign of the last coefficient may be hidden if first and last coefficient
+      // are more than 3 scan positions apart
+      int signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3 &&
+                        !cb->cu_transquant_bypass_flag);
+
+      for (int n=0;n<nCoefficients-1;n++) {
+        cabac->write_CABAC_bypass(coeff_sign[n]);
+        //logtrace(LogSlice,"a) sign[%d] = %d\n", n, coeff_sign[n]);
+      }
+
+      // n==nCoefficients-1
+      if (!pps.sign_data_hiding_flag || !signHidden) {
+        cabac->write_CABAC_bypass(coeff_sign[nCoefficients-1]);
+        //logtrace(LogSlice,"b) sign[%d] = %d\n", nCoefficients-1, coeff_sign[nCoefficients-1]);
+      }
+      else {
+        // sign is not written; this code only supports a non-negative last coefficient here
+        assert(coeff_sign[nCoefficients-1] == 0);
+      }
+
+      // --- encode remaining absolute coefficient values ---
+
+      int sumAbsLevel=0;     // only used by the disabled debug check below
+      int uiGoRiceParam=0;   // Rice parameter, adapted after each coded value (max. 4)
+
+      for (int n=0;n<nCoefficients;n++) {
+        int baseLevel = coeff_baseLevel[n];
+
+        int coeff_abs_level_remaining;
+
+        if (coeff_has_max_base_level[n]) {
+          logtrace(LogSlice,"value[%d]=%d, base level: %d\n",n,coeff_value[n],coeff_baseLevel[n]);
+
+          coeff_abs_level_remaining = coeff_value[n] - coeff_baseLevel[n];
+
+          encode_coeff_abs_level_remaining(ectx, cabac, uiGoRiceParam,
+                                           coeff_abs_level_remaining);
+
+          // (9-462)
+          if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam)) {
+            uiGoRiceParam++;
+            if (uiGoRiceParam>4) uiGoRiceParam=4;
+          }
+        }
+        else {
+          coeff_abs_level_remaining = 0;
+        }
+
+
+        // --- DEBUG: check coefficient ---
+
+#if 0
+        int16_t currCoeff = baseLevel + coeff_abs_level_remaining;
+        if (coeff_sign[n]) {
+          currCoeff = -currCoeff;
+        }
+
+        if (pps.sign_data_hiding_flag && signHidden) {
+          sumAbsLevel += baseLevel + coeff_abs_level_remaining;
+
+          if (n==nCoefficients-1 && (sumAbsLevel & 1)) {
+            currCoeff = -currCoeff;
+          }
+        }
+
+        assert(currCoeff == coeff_value[n]);
+#endif
+      }  // iterate through coefficients in sub-block
+    }  // if (nCoefficients)
+
+  }
+}
+
+
+/* Write the residuals of one transform unit (luma and, where present, chroma).
+
+   x0,y0       - position of this transform block
+   xBase,yBase - position of the parent block; used for the chroma blocks of a
+                 4x4 luma split, which are written together with the fourth
+                 luma block (blkIdx==3)
+   blkIdx      - index of this block within its parent
+
+   Nothing is written if none of the three CBFs is set.
+ */
+void encode_transform_unit(encoder_context* ectx,
+                           CABAC_encoder* cabac,
+                           const enc_tb* tb, const enc_cb* cb,
+                           int x0,int y0, int xBase,int yBase,
+                           int log2TrafoSize, int trafoDepth, int blkIdx)
+{
+  if (!(tb->cbf[0] || tb->cbf[1] || tb->cbf[2])) {
+    return;
+  }
+
+  // cu_qp_delta is not supported yet
+  if (ectx->img->pps.cu_qp_delta_enabled_flag &&
+      true /*!ectx->IsCuQpDeltaCoded*/) {
+    assert(0);
+  }
+
+  // --- luma ---
+
+  if (tb->cbf[0]) {
+    encode_residual(ectx,cabac, tb,cb,x0,y0,log2TrafoSize,0);
+  }
+
+  // --- chroma: determine position and size of the chroma blocks ---
+
+  int chromaX, chromaY, chromaLog2Size;
+
+  if (log2TrafoSize>2) {
+    // larger than 4x4: chroma block has half the luma size
+    chromaX = x0;
+    chromaY = y0;
+    chromaLog2Size = log2TrafoSize-1;
+  }
+  else if (blkIdx==3) {
+    // cannot check for tb->parent->cbf[], because this may not yet be set
+    chromaX = xBase;
+    chromaY = yBase;
+    chromaLog2Size = log2TrafoSize;
+  }
+  else {
+    // 4x4 luma blocks 0..2 carry no chroma
+    return;
+  }
+
+  for (int cIdx=1 ; cIdx<=2 ; cIdx++) {
+    if (tb->cbf[cIdx]) {
+      encode_residual(ectx,cabac,tb,cb,chromaX,chromaY,chromaLog2Size,cIdx);
+    }
+  }
+}
+
+
+/* Write a transform tree node: split_transform_flag (if it is not inferred),
+   the chroma CBFs, and then either the four children or, for a leaf, the luma
+   CBF and the transform unit.
+
+   x0,y0          - position of this node
+   xBase,yBase    - position of the parent node
+   trafoDepth     - depth of this node in the transform tree
+   blkIdx         - index of this node within its parent
+   MaxTrafoDepth  - maximum depth at which a split flag may still be coded
+   IntraSplitFlag - non-zero if the split at depth 0 is implied by the intra partitioning
+   recurse        - if false, the children of a split node are not written
+ */
+void encode_transform_tree(encoder_context* ectx,
+                           CABAC_encoder* cabac,
+                           const enc_tb* tb, const enc_cb* cb,
+                           int x0,int y0, int xBase,int yBase,
+                           int log2TrafoSize, int trafoDepth, int blkIdx,
+                           int MaxTrafoDepth, int IntraSplitFlag, bool recurse)
+{
+  const seq_parameter_set* sps = &ectx->img->sps;
+
+  // --- split_transform_flag ---
+
+  const bool splitFlagIsCoded = (log2TrafoSize <= sps->Log2MaxTrafoSize &&
+                                 log2TrafoSize >  sps->Log2MinTrafoSize &&
+                                 trafoDepth < MaxTrafoDepth &&
+                                 !(IntraSplitFlag && trafoDepth==0));
+
+  if (splitFlagIsCoded) {
+    int split_transform_flag = tb->split_transform_flag;
+    encode_split_transform_flag(ectx, cabac, log2TrafoSize, split_transform_flag);
+  }
+  else {
+    // The flag is inferred. Check that the tree agrees with the inferred value.
+
+    int interSplitFlag=0; // TODO
+
+    bool split_transform_flag = (log2TrafoSize > sps->Log2MaxTrafoSize ||
+                                 (IntraSplitFlag==1 && trafoDepth==0) ||
+                                 interSplitFlag==1) ? 1:0;
+
+    assert(tb->split_transform_flag == split_transform_flag);
+  }
+
+  // --- CBF CB/CR ---
+
+  // For 4x4 luma, there is no signaling of chroma CBF, because only the
+  // chroma CBF for 8x8 is relevant.
+  if (log2TrafoSize>2) {
+    for (int cIdx=1 ; cIdx<=2 ; cIdx++) {
+      // below the root, a chroma CBF is only coded if the parent's CBF is set
+      if (trafoDepth==0 || tb->parent->cbf[cIdx]) {
+        encode_cbf_chroma(cabac, trafoDepth, tb->cbf[cIdx]);
+      }
+    }
+  }
+
+  // --- split node: write the four children ---
+
+  if (tb->split_transform_flag) {
+    if (recurse) {
+      const int half = 1<<(log2TrafoSize-1);
+
+      for (int childIdx=0 ; childIdx<4 ; childIdx++) {
+        const int xChild = x0 + ((childIdx & 1) ? half : 0);
+        const int yChild = y0 + ((childIdx & 2) ? half : 0);
+
+        encode_transform_tree(ectx, cabac, tb->children[childIdx], cb,
+                              xChild,yChild, x0,y0, log2TrafoSize-1,
+                              trafoDepth+1, childIdx, MaxTrafoDepth, IntraSplitFlag, true);
+      }
+    }
+
+    return;
+  }
+
+  // --- leaf node: luma CBF and transform unit ---
+
+  /* Note: when no luma CBF is coded, cbf[0] should usually be TRUE. However, while
+     estimating the bitrate, this function can also be called with all CBFs FALSE.
+     Usually, this is handled by the rqt_root_cbf flag, but during analysis, this is
+     set after the bitrate is estimated. Hence, cbf[0] is not asserted here.
+   */
+  if (cb->PredMode == MODE_INTRA || trafoDepth != 0 ||
+      tb->cbf[1] || tb->cbf[2]) {
+    encode_cbf_luma(cabac, trafoDepth==0, tb->cbf[0]);
+  }
+
+  encode_transform_unit(ectx,cabac, tb,cb, x0,y0, xBase,yBase, log2TrafoSize, trafoDepth, blkIdx);
+}
+
+
+/* Write cu_skip_flag for the coding block 'cb'.
+
+   The context model is selected by the number of available neighbors (left and
+   above) that have their skip flag set.
+ */
+void encode_cu_skip_flag(encoder_context* ectx,
+                         CABAC_encoder* cabac,
+                         const enc_cb* cb,
+                         bool skip)
+{
+  logtrace(LogSymbols,"$1 cu_skip_flag=%d\n",skip);
+
+  const de265_image* img = ectx->img;
+
+  const int x0 = cb->x;
+  const int y0 = cb->y;
+
+  // check if neighbors are available
+
+  const bool leftAvailable  = check_CTB_available(img, x0,y0, x0-1,y0);
+  const bool aboveAvailable = check_CTB_available(img, x0,y0, x0,y0-1);
+
+  // each available, skipped neighbor increases the context index by one
+
+  const int leftSkipped  = (leftAvailable  && img->get_cu_skip_flag(x0-1,y0)) ? 1 : 0;
+  const int aboveSkipped = (aboveAvailable && img->get_cu_skip_flag(x0,y0-1)) ? 1 : 0;
+
+  const int context = leftSkipped + aboveSkipped;
+
+  // encode bit
+
+  const int bit = skip;
+
+  logtrace(LogSlice,"> cu_skip_flag ctx=%d, bit=%d\n", context,bit);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_CU_SKIP_FLAG + context, bit);
+}
+
+
+/* Write merge_idx as a truncated unary code with cMax = MaxNumMergeCand-1.
+
+   The first bin is context-coded, all following bins are bypass-coded. Nothing
+   is written if the slice uses at most one merge candidate.
+ */
+void encode_merge_idx(encoder_context* ectx,
+                      CABAC_encoder* cabac,
+                      int mergeIdx)
+{
+  logtrace(LogSymbols,"$1 merge_idx=%d\n",mergeIdx);
+  logtrace(LogSlice,"# merge_idx %d\n", mergeIdx);
+
+  const int maxNumCand = ectx->shdr->MaxNumMergeCand;
+
+  if (maxNumCand <= 1) {
+    return; // code nothing, we use only a single merge candidate
+  }
+
+  // first bin (context-coded): is the index non-zero ?
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_MERGE_IDX, mergeIdx ? 1 : 0);
+
+  if (mergeIdx<=0) {
+    return;
+  }
+
+  // remaining bins (bypass): a 1 for each further step, terminated by a 0 unless
+  // the largest index has been reached
+
+  for (int idx=1 ; idx<maxNumCand-1 ; idx++) {
+    const int increase = (idx < mergeIdx);
+
+    cabac->write_CABAC_bypass(increase);
+
+    if (!increase) {
+      break;
+    }
+  }
+}
+
+
+/* Write rqt_root_cbf as a single context-coded bin. */
+static inline void encode_rqt_root_cbf(encoder_context* ectx,
+                                       CABAC_encoder* cabac,
+                                       int rqt_root_cbf)
+{
+  const int bin = rqt_root_cbf;
+
+  logtrace(LogSymbols,"$1 rqt_root_cbf=%d\n",bin);
+
+  cabac->write_CABAC_bit(CONTEXT_MODEL_RQT_ROOT_CBF, bin);
+}
+
+
+/* Write a motion vector difference (both components).
+
+   The syntax elements are interleaved between the components:
+     1. greater-0 flag for component 0 and 1
+     2. greater-1 flag for each non-zero component
+     3. for each non-zero component: |mvd|-2 as EG1 code (only if |mvd|>1),
+        followed by the sign as a bypass bin
+ */
+void encode_mvd(encoder_context* ectx,
+                CABAC_encoder* cabac,
+                const int16_t mvd[2])
+{
+  int absMvd[2];
+  int greater0[2];
+
+  for (int c=0 ; c<2 ; c++) {
+    absMvd[c]   = abs_value(mvd[c]);
+    greater0[c] = !!(absMvd[c]);
+  }
+
+  // greater-0 flags
+  for (int c=0 ; c<2 ; c++) {
+    cabac->write_CABAC_bit(CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+0, greater0[c]);
+  }
+
+  // greater-1 flags
+  for (int c=0 ; c<2 ; c++) {
+    if (greater0[c]) {
+      cabac->write_CABAC_bit(CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG+1, absMvd[c]>1);
+    }
+  }
+
+  // remaining absolute value and sign
+  for (int c=0 ; c<2 ; c++) {
+    if (absMvd[c]) {
+      if (absMvd[c]>1) {
+        cabac->write_CABAC_EGk(absMvd[c]-2,1);
+      }
+      cabac->write_CABAC_bypass(mvd[c]<0);
+    }
+  }
+}
+
+
+/* Write the prediction unit 'pbIdx' of an inter-coded coding block.
+
+   Only a subset is implemented: non-merged, uni-predicted (list 0) blocks with
+   a single active reference picture. All other cases hit an assertion.
+
+   Written syntax elements: merge_flag, the list-0 motion vector difference and
+   mvp_l0_flag.
+ */
+void encode_prediction_unit(encoder_context* ectx,
+                            CABAC_encoder* cabac,
+                            const enc_cb* cb, int pbIdx,
+                            int x0,int y0, int w, int h)
+{
+  const enc_pb_inter& pb = cb->inter.pb[pbIdx];
+
+  logtrace(LogSymbols,"$1 merge_flag=%d\n",pb.spec.merge_flag);
+  cabac->write_CABAC_bit(CONTEXT_MODEL_MERGE_FLAG, pb.spec.merge_flag);
+
+  if (pb.spec.merge_flag) {
+    assert(false); // TODO
+    return;
+  }
+
+  if (ectx->shdr->slice_type == SLICE_TYPE_B) {
+    assert(false); // TODO
+  }
+
+  // InterPredIdc: PRED_L0=0, PRED_L1=1, PRED_BI=2
+  const bool usesList0 = (pb.spec.inter_pred_idc != PRED_L1);
+  const bool usesList1 = (pb.spec.inter_pred_idc != PRED_L0);
+
+  if (usesList0) {
+    // coding of the reference index is not implemented
+    if (ectx->shdr->num_ref_idx_l0_active > 1) {
+      assert(false); // TODO
+    }
+
+    encode_mvd(ectx,cabac, pb.spec.mvd[0]);
+
+    logtrace(LogSymbols,"$1 mvp_lx_flag=%d\n",pb.spec.mvp_l0_flag);
+    cabac->write_CABAC_bit(CONTEXT_MODEL_MVP_LX_FLAG, pb.spec.mvp_l0_flag);
+  }
+
+  if (usesList1) {
+    assert(false); // TODO
+  }
+}
+
+
+/* Write one coding unit.
+
+   ectx       - encoder context (image, SPS, slice header)
+   cabac      - CABAC encoder receiving all bins
+   cb         - coding block to write
+   x0,y0      - position of the coding block
+   log2CbSize - log2 of the coding block size
+   recurse    - if false, the transform tree of the CU is not written
+
+   Written syntax elements, in this order: cu_skip_flag, then either merge_idx
+   (skipped CU) or pred_mode_flag, part_mode, the intra prediction modes or the
+   inter prediction unit, rqt_root_cbf and the transform tree.
+   PCM is not supported. For inter CUs, only PART_2Nx2N is implemented.
+ */
+void encode_coding_unit(encoder_context* ectx,
+                        CABAC_encoder* cabac,
+                        const enc_cb* cb, int x0,int y0, int log2CbSize, bool recurse)
+{
+  logtrace(LogSlice,"--- encode CU (%d;%d) ---\n",x0,y0);
+
+  de265_image* img = ectx->img;
+  const slice_segment_header* shdr = &ectx->imgdata->shdr;
+  const seq_parameter_set* sps = &ectx->img->sps;
+
+
+  int nCbS = 1<<log2CbSize;   // coding block size
+
+
+  // write skip_flag (not coded in I slices)
+
+  if (shdr->slice_type != SLICE_TYPE_I) {
+    encode_cu_skip_flag(ectx,cabac, cb, cb->PredMode==MODE_SKIP);
+  }
+
+  if (cb->PredMode==MODE_SKIP) {
+    // a skipped CU consists of the merge index only
+    assert(cb->inter.pb[0].spec.merge_flag);
+    encode_merge_idx(ectx,cabac, cb->inter.pb[0].spec.merge_idx);
+  }
+  else {
+
+    enum PredMode PredMode = cb->PredMode;
+    enum PartMode PartMode = PART_2Nx2N;
+    int IntraSplitFlag=0;
+
+    if (shdr->slice_type != SLICE_TYPE_I) {
+      encode_pred_mode_flag(ectx,cabac, PredMode);
+    }
+
+    // part_mode is only coded for non-intra CUs and for CUs of minimum size;
+    // otherwise PartMode stays at PART_2Nx2N
+    if (PredMode != MODE_INTRA ||
+        log2CbSize == sps->Log2MinCbSizeY) {
+      PartMode = cb->PartMode;
+      encode_part_mode(ectx,cabac, PredMode, PartMode, log2CbSize);
+    }
+
+    if (PredMode == MODE_INTRA) {
+
+      // availability of the left and the upper neighbor of the CU
+      int availableA0 = check_CTB_available(img, x0,y0, x0-1,y0);
+      int availableB0 = check_CTB_available(img, x0,y0, x0,y0-1);
+
+      if (PartMode==PART_2Nx2N) {
+        // --- a single luma prediction mode for the whole CU ---
+
+        logtrace(LogSlice,"x0,y0: %d,%d\n",x0,y0);
+        int PUidx = (x0>>sps->Log2MinPUSize) + (y0>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
+
+        int candModeList[3];
+        fillIntraPredModeCandidates(candModeList,x0,y0,PUidx,
+                                    availableA0,availableB0, img);
+
+        for (int i=0;i<3;i++)
+          logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]);
+
+        // express the mode relative to the candidate list and write it
+        enum IntraPredMode mode = cb->intra.pred_mode[0];
+        int intraPred = find_intra_pred_mode(mode, candModeList);
+        encode_prev_intra_luma_pred_flag(ectx,cabac, intraPred);
+        encode_intra_mpm_or_rem(ectx,cabac, intraPred);
+
+        logtrace(LogSlice,"IntraPredMode: %d (candidates: %d %d %d)\n", mode,
+                 candModeList[0], candModeList[1], candModeList[2]);
+        logtrace(LogSlice,"  MPM/REM = %d\n",intraPred);
+      }
+      else {
+        // --- four luma prediction modes, one for each quadrant of the CU ---
+
+        IntraSplitFlag=1;
+
+        int pbOffset = nCbS/2;
+        int PUidx;
+
+        int intraPred[4];
+        int childIdx=0;
+
+        for (int j=0;j<nCbS;j+=pbOffset)
+          for (int i=0;i<nCbS;i+=pbOffset, childIdx++)
+            {
+              int x=x0+i, y=y0+j;
+
+              int availableA = availableA0 || (i>0); // left candidate always available for right blk
+              int availableB = availableB0 || (j>0); // top candidate always available for bottom blk
+
+              PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
+
+              int candModeList[3];
+              fillIntraPredModeCandidates(candModeList,x,y,PUidx,
+                                          availableA,availableB, img);
+
+              enum IntraPredMode mode = cb->intra.pred_mode[childIdx];
+
+              // the mode stored in the image has to agree with the mode in the CB
+              assert(ectx->img->get_IntraPredMode(x,y) == mode);
+
+              intraPred[childIdx] = find_intra_pred_mode(mode, candModeList);
+            }
+
+        // all four flags are written first, followed by the four mpm/rem values
+
+        for (int i=0;i<4;i++)
+          encode_prev_intra_luma_pred_flag(ectx,cabac, intraPred[i]);
+
+        for (int i=0;i<4;i++)
+          encode_intra_mpm_or_rem(ectx,cabac, intraPred[i]);
+      }
+
+      // the chroma mode is determined from the chroma mode and the first luma mode
+      IntraChromaPredMode chromaPredMode = find_chroma_pred_mode(cb->intra.chroma_mode,
+                                                                 cb->intra.pred_mode[0]);
+      encode_intra_chroma_pred_mode(ectx,cabac, chromaPredMode);
+    }
+    else {
+      // --- inter prediction: only PART_2Nx2N is implemented ---
+
+      switch (cb->PartMode) {
+      case PART_2Nx2N:
+        encode_prediction_unit(ectx,cabac,cb, 0, cb->x,cb->y,1<<cb->log2Size,1<<cb->log2Size);
+        break;
+      case PART_2NxN:
+      case PART_Nx2N:
+      case PART_NxN:
+      case PART_2NxnU:
+      case PART_2NxnD:
+      case PART_nLx2N:
+      case PART_nRx2N:
+        assert(false); // TODO
+      }
+    }
+
+
+    if (true) { // !pcm
+
+      // rqt_root_cbf is coded for all non-intra CUs, except for merged 2Nx2N CUs
+      if (cb->PredMode != MODE_INTRA &&
+          !(cb->PartMode == PART_2Nx2N && cb->inter.pb[0].spec.merge_flag)) {
+
+        //printf("%d %d %d\n",cb->PredMode,cb->PartMode,cb->inter.pb[0].merge_flag);
+
+        encode_rqt_root_cbf(ectx,cabac, cb->inter.rqt_root_cbf);
+      }
+
+      //printf("%d;%d encode rqt_root_cbf=%d\n",x0,y0,cb->inter.rqt_root_cbf);
+
+      // --- transform tree ---
+
+      if (cb->PredMode == MODE_INTRA || cb->inter.rqt_root_cbf) {
+        int MaxTrafoDepth;
+        if (PredMode == MODE_INTRA)
+          { MaxTrafoDepth = sps->max_transform_hierarchy_depth_intra + IntraSplitFlag; }
+        else
+          { MaxTrafoDepth = sps->max_transform_hierarchy_depth_inter; }
+
+
+        if (recurse) {
+          //printf("%d;%d store transform tree\n",x0,y0);
+
+          encode_transform_tree(ectx,cabac, cb->transform_tree, cb,
+                                x0,y0, x0,y0, log2CbSize, 0, 0, MaxTrafoDepth, IntraSplitFlag, true);
+        }
+      }
+    }
+  }
+}
+
+
+/* Determine whether a coding block at (x0;y0) may, must, or must not be split.
+
+         | overlaps | minimum ||
+    case | border   | size    ||  result
+    -----+----------+---------++----------------
+      A  |    0     |     0   || OptionalSplit
+      B  |    0     |     1   || ForcedNonSplit
+      C  |    1     |     0   || ForcedSplit
+      D  |    1     |     1   || ForcedNonSplit
+ */
+SplitType get_split_type(const seq_parameter_set* sps,
+                         int x0,int y0, int log2CbSize)
+{
+  const bool insidePicture = (x0+(1<<log2CbSize) <= sps->pic_width_in_luma_samples &&
+                              y0+(1<<log2CbSize) <= sps->pic_height_in_luma_samples);
+
+  const bool largerThanMinimum = (log2CbSize > sps->Log2MinCbSizeY);
+
+  // case B/D: a block of minimum size can never be split
+  if (!largerThanMinimum) {
+    return ForcedNonSplit;
+  }
+
+  // case A : case C
+  return insidePicture ? OptionalSplit : ForcedSplit;
+}
+
+
+/* Write a coding quadtree node: split_cu_flag (only if the split is optional),
+   followed by either the child nodes or the coding unit itself.
+
+   x0,y0      - position of the node
+   log2CbSize - log2 of the node size
+   ctDepth    - depth of the node in the coding quadtree
+   recurse    - if false, the children of a split node are not written
+
+   Children that start outside of the picture are not written.
+ */
+void encode_quadtree(encoder_context* ectx,
+                     CABAC_encoder* cabac,
+                     const enc_cb* cb, int x0,int y0, int log2CbSize, int ctDepth,
+                     bool recurse)
+{
+  const seq_parameter_set* sps = &ectx->img->sps;
+
+  int doSplit = get_split_type(sps,x0,y0,log2CbSize);
+
+  // if it is an optional split, take the decision from the CU flag
+  if (doSplit == OptionalSplit) {
+    doSplit = cb->split_cu_flag;
+
+    encode_split_cu_flag(ectx,cabac, x0,y0, ctDepth, doSplit);
+  }
+
+  // --- leaf: write the coding unit ---
+
+  if (!doSplit) {
+    encode_coding_unit(ectx,cabac, cb,x0,y0, log2CbSize, true);
+    return;
+  }
+
+  if (!recurse) {
+    return;
+  }
+
+  // --- split: write those children that lie within the picture ---
+
+  const int half = 1<<(log2CbSize-1);
+  const int x1 = x0 + half;
+  const int y1 = y0 + half;
+
+  const bool rightInPicture  = (x1<sps->pic_width_in_luma_samples);
+  const bool bottomInPicture = (y1<sps->pic_height_in_luma_samples);
+
+  encode_quadtree(ectx,cabac, cb->children[0], x0,y0, log2CbSize-1, ctDepth+1, true);
+
+  if (rightInPicture) {
+    encode_quadtree(ectx,cabac, cb->children[1], x1,y0, log2CbSize-1, ctDepth+1, true);
+  }
+
+  if (bottomInPicture) {
+    encode_quadtree(ectx,cabac, cb->children[2], x0,y1, log2CbSize-1, ctDepth+1, true);
+  }
+
+  if (rightInPicture && bottomInPicture) {
+    encode_quadtree(ectx,cabac, cb->children[3], x1,y1, log2CbSize-1, ctDepth+1, true);
+  }
+}
+
+
+/* Write the coding tree block at CTB address (ctbX;ctbY).
+
+   The CTB address is converted to a luma sample position and the coding
+   quadtree rooted at 'cb' is written recursively.
+ */
+void encode_ctb(encoder_context* ectx,
+                CABAC_encoder* cabac,
+                enc_cb* cb, int ctbX,int ctbY)
+{
+  logtrace(LogSlice,"----- encode CTB (%d;%d) -----\n",ctbX,ctbY);
+
+#if 0
+  printf("MODEL:\n");
+  for (int i=0;i<CONTEXT_MODEL_TABLE_LENGTH;i++)
+    {
+      printf("%d;%d ",
+             ectx->ctx_model[i].state,
+             ectx->ctx_model[i].MPSbit);
+
+      if ((i%16)==15) printf("\n");
+    }
+  printf("\n");
+#endif
+
+  const int log2ctbSize = ectx->img->sps.Log2CtbSizeY;
+
+  // top-left luma sample of the CTB
+  const int x0 = ctbX<<log2ctbSize;
+  const int y0 = ctbY<<log2ctbSize;
+
+  encode_quadtree(ectx,cabac, cb, x0,y0, log2ctbSize, 0, true);
+}
+
+
+// ---------------------------------------------------------------------------
diff --git a/libde265/encoder/encode.h b/libde265/encoder/encode.h
new file mode 100644
index 0000000..940cb3b
--- /dev/null
+++ b/libde265/encoder/encode.h
@@ -0,0 +1,289 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ENCODE_H
+#define ENCODE_H
+
+#include "libde265/image.h"
+#include "libde265/decctx.h"
+#include "libde265/image-io.h"
+#include "libde265/alloc_pool.h"
+
+class encoder_context;
+class enc_cb;
+
+// Common base class of enc_cb and enc_tb: block position/size plus a save()/restore() hook.
+class enc_node
+{
+ public:
+  enc_node() { mReconstruction=NULL; }
+  virtual ~enc_node() { delete[] mReconstruction; }  // array delete: buffer must come from new[]
+
+  uint16_t x,y;
+  uint8_t  log2Size : 3;  // 3-bit field, i.e. values 0..7
+
+  virtual void save(const de265_image*);
+  virtual void restore(de265_image*);
+
+ private:
+  uint8_t* mReconstruction;  // presumably the pixel copy made by save() -- TODO confirm
+};
+
+// Node of the transform-block tree: either four children (split) or coefficient buffers.
+class enc_tb : public enc_node
+{
+ public:
+  enc_tb();
+  ~enc_tb();
+
+  const enc_tb* parent;
+
+  uint8_t split_transform_flag : 1;
+  uint8_t TrafoDepth : 2;  // 2 bits enough ? (TODO)
+
+  uint8_t cbf[3];  // coded-block flags; all three are tested by isZeroBlock()
+
+  union {  // both variants share the same storage, so only one of them is meaningful
+    // split
+    struct {
+      enc_tb* children[4];
+    };
+
+    // non-split
+    struct {
+      int16_t* coeff[3];  // presumably indexed by cIdx like alloc_coeff_memory() -- TODO confirm
+    };
+  };
+
+  float distortion;  // total distortion for this level of the TB tree (including all children)
+  float rate;        // total rate for coding this TB level and all children
+  float rate_withoutCbfChroma;
+
+  void set_cbf_flags_from_children();
+
+  void reconstruct(encoder_context* ectx,
+                   de265_image* img,
+                   const enc_cb* cb, int blkIdx=0) const;
+
+  bool isZeroBlock() const { return cbf[0]==false && cbf[1]==false && cbf[2]==false; }
+
+  void alloc_coeff_memory(int cIdx, int tbSize);
+
+  // objects of this class are allocated from the class-wide pool 'mMemPool'
+  static void* operator new(const size_t size) { return mMemPool.new_obj(size); }
+  static void operator delete(void* obj) { mMemPool.delete_obj(obj); }
+
+private:
+  static alloc_pool mMemPool;
+
+  void reconstruct_tb(encoder_context* ectx,
+                      de265_image* img, int x0,int y0, int log2TbSize,
+                      const enc_cb* cb, int cIdx) const;
+};
+
+// Inter-prediction data of one prediction block (enc_cb::inter holds four of these).
+struct enc_pb_inter
+{
+  /* absolute motion information (for MV-prediction candidates)
+   */
+  MotionVectorSpec motion;
+
+  /* specification of how to code the motion vector in the bitstream
+   */
+  motion_spec    spec;
+
+
+  // NOT TRUE: refIdx in 'spec' is not used. It is taken from 'motion'
+  // Currently, information is duplicated. Same as with inter_pred_idc/predFlag[].
+  // NOTE(review): the block below looks like a reminder of the motion_spec fields -- confirm.
+  /* SPEC:
+  int8_t  refIdx[2]; // not used
+  int16_t mvd[2][2];
+
+  uint8_t inter_pred_idc : 2; // enum InterPredIdc
+  uint8_t mvp_l0_flag : 1;
+  uint8_t mvp_l1_flag : 1;
+  uint8_t merge_flag : 1;
+  uint8_t merge_idx  : 3;
+  */
+};
+
+// Node of the coding-block quadtree: either four children (split) or the data of one CU.
+class enc_cb : public enc_node
+{
+public:
+  enc_cb();
+  ~enc_cb();
+
+  const enc_cb* parent;
+
+  uint8_t split_cu_flag : 1;
+  uint8_t ctDepth : 2;  // 2-bit field, i.e. values 0..3
+
+  union {  // both variants share the same storage; split_cu_flag tells which one is in use
+    // split
+    struct {
+      enc_cb* children[4];   // undefined when split_cu_flag==false
+    };
+
+    // non-split
+    struct {
+      uint8_t qp : 6;  // 6-bit field, i.e. values 0..63
+      uint8_t cu_transquant_bypass_flag : 1; // currently unused
+      uint8_t pcm_flag : 1;
+
+      enum PredMode PredMode; // : 6;
+      enum PartMode PartMode; // : 3;
+
+      union {  // intra and inter data overlap; presumably selected by PredMode -- TODO confirm
+        struct {
+          enum IntraPredMode pred_mode[4];
+          enum IntraPredMode chroma_mode;
+        } intra;
+
+        struct {
+          enc_pb_inter pb[4];
+
+          uint8_t rqt_root_cbf : 1;
+        } inter;
+      };
+
+      const enc_tb* transform_tree;
+    };
+  };
+
+
+  float distortion;
+  float rate;
+
+
+  void set_rqt_root_bf_from_children_cbf();
+
+  /* Save CB reconstruction in the node and restore it again to the image.
+     Pixel data and metadata.
+   */
+  virtual void save(const de265_image*);
+  virtual void restore(de265_image*);
+
+
+  /* Decode this CB: pixel data and write metadata to image.
+   */
+  void reconstruct(encoder_context* ectx,de265_image* img) const;
+
+
+  // memory management: objects are allocated from the class-wide pool 'mMemPool'
+
+  static void* operator new(const size_t size) { return mMemPool.new_obj(size); }
+  static void operator delete(void* obj) { mMemPool.delete_obj(obj); }
+
+ private:
+  void write_to_image(de265_image*) const;
+
+  static alloc_pool mMemPool;
+};
+
+
+// X position of child 'idx' (0..3): odd children lie in the right half of the block.
+inline int childX(int x0, int idx, int log2CbSize) {
+  const int halfSize = 1 << (log2CbSize-1);
+  return (idx&1) ? x0+halfSize : x0;
+}
+// Y position of child 'idx' (0..3): children 2 and 3 lie in the lower half of the block.
+inline int childY(int y0, int idx, int log2CbSize) {
+  const int row = idx>>1;
+  return y0 + (row << (log2CbSize-1));
+}
+
+
+
+void encode_split_cu_flag(encoder_context* ectx,
+                          CABAC_encoder* cabac,
+                          int x0, int y0, int ctDepth, int split_flag);
+
+void encode_transform_tree(encoder_context* ectx,
+                           CABAC_encoder* cabac,
+                           const enc_tb* tb, const enc_cb* cb,
+                           int x0,int y0, int xBase,int yBase,
+                           int log2TrafoSize, int trafoDepth, int blkIdx,
+                           int MaxTrafoDepth, int IntraSplitFlag, bool recurse);
+
+void encode_coding_unit(encoder_context* ectx,
+                        CABAC_encoder* cabac,
+                        const enc_cb* cb, int x0,int y0, int log2CbSize, bool recurse);
+
+/* Result of get_split_type():
+   ForcedNonSplit (0) - forced non-split
+   ForcedSplit    (1) - forced split
+   OptionalSplit  (2) - optional split (the CB's split_cu_flag is coded and decides)
+*/
+enum SplitType {
+  ForcedNonSplit = 0,
+  ForcedSplit    = 1,
+  OptionalSplit  = 2
+};
+
+SplitType get_split_type(const seq_parameter_set* sps,
+                         int x0,int y0, int log2CbSize);
+
+
+void encode_split_transform_flag(encoder_context* ectx,
+                                 CABAC_encoder* cabac,
+                                 int log2TrafoSize, int split_flag);
+
+void encode_merge_idx(encoder_context* ectx,
+                      CABAC_encoder* cabac,
+                      int mergeIdx);
+
+void encode_cu_skip_flag(encoder_context* ectx,
+                         CABAC_encoder* cabac,
+                         const enc_cb* cb,
+                         bool skip);
+
+void encode_cbf_luma(CABAC_encoder* cabac,
+                     bool zeroTrafoDepth, int cbf_luma);
+
+void encode_cbf_chroma(CABAC_encoder* cabac,
+                       int trafoDepth, int cbf_chroma);
+
+void encode_transform_unit(encoder_context* ectx,
+                           CABAC_encoder* cabac,
+                           const enc_tb* tb, const enc_cb* cb,
+                           int x0,int y0, int xBase,int yBase,
+                           int log2TrafoSize, int trafoDepth, int blkIdx);
+
+
+void encode_quadtree(encoder_context* ectx,
+                     CABAC_encoder* cabac,
+                     const enc_cb* cb, int x0,int y0, int log2CbSize, int ctDepth,
+                     bool recurse);
+
+void encode_ctb(encoder_context* ectx,
+                CABAC_encoder* cabac,
+                enc_cb* cb, int ctbX,int ctbY);
+
+// Empty polymorphic base class; it only provides a virtual destructor.
+class de265_encoder
+{
+ public:
+  virtual ~de265_encoder() { }
+};
+
+#endif
diff --git a/libde265/encoder/encoder-context.cc b/libde265/encoder/encoder-context.cc
new file mode 100644
index 0000000..947041a
--- /dev/null
+++ b/libde265/encoder/encoder-context.cc
@@ -0,0 +1,294 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "encoder/encoder-context.h"
+#include "encoder/analyze.h"
+#include "libde265/util.h"
+
+#include <math.h>
+
+// Constructs an idle encoder context with every plain member in a defined state.
+encoder_context::encoder_context()
+{
+  encoder_started=false;
+
+  //img_source = NULL;
+  //reconstruction_sink = NULL;
+  //packet_sink = NULL;
+
+  image_width = image_height = 0;  // filled in from the first input picture
+  image_spec_is_defined = false;
+  parameters_have_been_set = false;
+  headers_have_been_sent = false;
+
+  param_image_allocation_userdata = NULL;
+  release_func = NULL;
+
+  // quick links and rate-control values: these were left uninitialized before
+  img = NULL;  prediction = NULL;  imgdata = NULL;  shdr = NULL;
+  active_qp = 0;  lambda = 0;
+  use_adaptive_context = true; //false;
+  //enc_coeff_pool.set_blk_size(64*64*20); // TODO: this a guess
+  //switch_CABAC_to_bitstream();
+  params.registerParams(params_config);
+  algo.registerParams(params_config);
+}
+
+// Frees all packets that are still queued for output.
+encoder_context::~encoder_context()
+{
+  for (; !output_packets.empty(); output_packets.pop_front()) {
+    en265_packet* pending = output_packets.front();
+    en265_free_packet(this, pending);
+  }
+}
+
+// Creates the SOP creator selected by the parameters; repeated calls do nothing.
+void encoder_context::start_encoder()
+{
+  if (encoder_started) return;
+
+
+  if (params.sop_structure() != SOP_Intra) {
+    // every structure other than intra-only currently maps to the low-delay creator
+    std::shared_ptr<sop_creator_trivial_low_delay> lowDelay(new sop_creator_trivial_low_delay());
+    lowDelay->setParams(params.mSOP_LowDelay);
+    sop = lowDelay;
+  }
+  else {
+    sop = std::shared_ptr<sop_creator_intra_only>(new sop_creator_intra_only());
+  }
+
+  // the SOP creator gets access to this context and to the picture buffer
+  sop->set_encoder_context(this);
+  sop->set_encoder_picture_buffer(&picbuf);
+
+
+  encoder_started=true;
+}
+
+// Wraps the bytes currently held by the bitstream writer into a new packet of type 't'.
+en265_packet* encoder_context::create_packet(en265_packet_content_type t)
+{
+  en265_packet* packet = new en265_packet;
+
+  // copy the bytes accumulated in the bitstream writer into a freshly allocated buffer
+  const auto numBytes = cabac_encoder.size();
+  uint8_t* payload = new uint8_t[numBytes];
+  memcpy(payload, cabac_encoder.data(), numBytes);
+
+  packet->version = 1;
+  packet->data    = payload;
+  packet->length  = numBytes;
+  // picture-related fields get neutral defaults; callers overwrite what they know
+  packet->frame_number     = -1;
+  packet->content_type     = t;
+  packet->complete_picture = 0;
+  packet->final_slice      = 0;
+  packet->dependent_slice  = 0;
+  //packet->pts = 0;
+  //packet->user_data = NULL;
+  packet->nuh_layer_id     = 0;
+  packet->nuh_temporal_id  = 0;
+
+  packet->encoder_context = this;
+  packet->input_image     = NULL;
+  packet->reconstruction  = NULL;
+
+  // reset the bitstream writer before the next unit is written
+  cabac_encoder.reset();
+  return packet;
+}
+
+// Fills in VPS, SPS and PPS and queues each of them as its own output packet.
+// Always returns DE265_OK.
+de265_error encoder_context::encode_headers()
+{
+  nal_header nalHdr;
+
+  // Finishes the NAL unit currently held by the bitstream writer and queues it as a packet.
+  auto queue_header_packet = [this](en265_packet_content_type content,
+                                    enum en265_nal_unit_type nut) {
+    cabac_encoder.add_trailing_bits();
+    cabac_encoder.flush_VLC();
+
+    en265_packet* headerPck = create_packet(content);
+    headerPck->nal_unit_type = nut;
+    output_packets.push_back(headerPck);
+  };
+
+
+  // --- VPS ---
+
+  vps.set_defaults(Profile_Main, 6,2);
+
+
+  // --- SPS: block-size ranges from the parameters, resolution from image_width/height ---
+
+  sps.set_defaults();
+  sps.set_CB_log2size_range( Log2(params.min_cb_size), Log2(params.max_cb_size));
+  sps.set_TB_log2size_range( Log2(params.min_tb_size), Log2(params.max_tb_size));
+  sps.max_transform_hierarchy_depth_intra = params.max_transform_hierarchy_depth_intra;
+
+  sps.set_resolution(image_width, image_height);
+  sop->set_SPS_header_values();
+  sps.compute_derived_values();
+
+
+  // --- PPS ---
+
+  pps.set_defaults();
+  pps.pic_init_qp = algo.getPPS_QP();
+
+  // turn off deblocking filter
+  pps.deblocking_filter_control_present_flag = true;
+  pps.deblocking_filter_override_enabled_flag = false;
+  pps.pic_disable_deblocking_filter_flag = true;
+  pps.pps_loop_filter_across_slices_enabled_flag = false;
+
+  pps.set_derived_values(&sps);
+
+
+
+  // --- write headers: one output packet per parameter set, in the order VPS, SPS, PPS ---
+
+  nalHdr.set(NAL_UNIT_VPS_NUT);
+  nalHdr.write(cabac_encoder);
+  vps.write(this, cabac_encoder);
+  queue_header_packet(EN265_PACKET_VPS, EN265_NUT_VPS);
+
+  nalHdr.set(NAL_UNIT_SPS_NUT);
+  nalHdr.write(cabac_encoder);
+  sps.write(this, cabac_encoder);
+  queue_header_packet(EN265_PACKET_SPS, EN265_NUT_SPS);
+
+  nalHdr.set(NAL_UNIT_PPS_NUT);
+  nalHdr.write(cabac_encoder);
+  pps.write(this, cabac_encoder, &sps);
+  queue_header_packet(EN265_PACKET_PPS, EN265_NUT_PPS);
+
+
+
+  // remember that the parameter sets are out, so that they are not written again
+  headers_have_been_sent = true;
+
+  return DE265_OK;
+}
+
+// Encodes the next pending picture of the picture buffer (if any) and queues its slice packet.
+de265_error encoder_context::encode_picture_from_input_buffer()
+{
+  if (!picbuf.have_more_frames_to_encode()) {
+    return DE265_OK;
+  }
+
+  // the first picture to be encoded defines the image size
+  if (!image_spec_is_defined) {
+    const image_data* id = picbuf.peek_next_picture_to_encode();
+    image_width  = id->input->get_width();
+    image_height = id->input->get_height();
+    image_spec_is_defined = true;
+  }
+
+
+  if (!parameters_have_been_set) {
+    algo.setParams(params);
+
+
+    // TODO: must be <30, because Y->C mapping (tab8_22) is not implemented yet
+    int qp = algo.getPPS_QP();
+
+    //lambda = ectx->params.lambda;
+    lambda = 0.0242 * pow(1.27245, qp);
+
+    parameters_have_been_set = true;
+  }
+
+
+  if (!headers_have_been_sent) {
+    // propagate a failure instead of encoding a picture without parameter sets
+    de265_error err = encode_headers();
+    if (err != DE265_OK) {
+      return err;
+    }
+  }
+
+  // 'picture' is a local alias; the members imgdata/shdr are only set while encoding
+  image_data* picture = picbuf.get_next_picture_to_encode();
+  assert(picture);
+  picbuf.mark_encoding_started(picture->frame_number);
+
+  this->imgdata = picture;
+  this->shdr    = &picture->shdr;
+  loginfo(LogEncoder,"encoding frame %d\n",picture->frame_number);
+
+
+  // write slice header
+
+  // slice
+
+  picture->shdr.slice_deblocking_filter_disabled_flag = true;
+  picture->shdr.slice_loop_filter_across_slices_enabled_flag = false;
+  picture->shdr.compute_derived_values(&pps);
+
+  //shdr.slice_pic_order_cnt_lsb = poc & 0xFF;
+
+  picture->nal.write(cabac_encoder);
+  picture->shdr.write(this, cabac_encoder, &sps, &pps, picture->nal.nal_unit_type);
+  cabac_encoder.add_trailing_bits();
+  cabac_encoder.flush_VLC();
+
+
+  // encode image
+
+  cabac_encoder.init_CABAC();
+  double psnr = encode_image(this,picture->input, algo);
+  loginfo(LogEncoder,"  PSNR-Y: %f\n", psnr);
+  cabac_encoder.flush_CABAC();
+  cabac_encoder.add_trailing_bits();
+  cabac_encoder.flush_VLC();
+
+
+  // set reconstruction image
+
+  picbuf.set_reconstruction_image(picture->frame_number, img);
+  //picbuf.set_prediction_image(imgdata->frame_number, prediction);
+  img=NULL;
+  this->imgdata = NULL;
+  this->shdr = NULL;
+
+  // build output packet
+
+  en265_packet* pck = create_packet(EN265_PACKET_SLICE);
+  pck->input_image    = picture->input;
+  pck->reconstruction = picture->reconstruction;
+  pck->frame_number   = picture->frame_number;
+  pck->nal_unit_type  = (enum en265_nal_unit_type)picture->nal.nal_unit_type;
+  pck->nuh_layer_id   = picture->nal.nuh_layer_id;
+  pck->nuh_temporal_id= picture->nal.nuh_temporal_id;
+  output_packets.push_back(pck);
+
+
+  picbuf.mark_encoding_finished(picture->frame_number);
+
+  return DE265_OK;
+}
diff --git a/libde265/encoder/encoder-context.h b/libde265/encoder/encoder-context.h
new file mode 100644
index 0000000..6916620
--- /dev/null
+++ b/libde265/encoder/encoder-context.h
@@ -0,0 +1,157 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ENCODER_CONTEXT_H
+#define ENCODER_CONTEXT_H
+
+#include "libde265/image.h"
+#include "libde265/decctx.h"
+#include "libde265/image-io.h"
+#include "libde265/encoder/encoder-params.h"
+#include "libde265/encoder/encpicbuf.h"
+#include "libde265/encoder/sop.h"
+#include "libde265/en265.h"
+#include "libde265/util.h"
+
+#include <memory>
+
+// Complete state of one encoder instance: parameters, headers, picture buffer, output queue.
+class encoder_context : public base_context
+{
+ public:
+  encoder_context();
+  ~encoder_context();
+
+  virtual const de265_image* get_image(int frame_id) const {
+    return picbuf.get_picture(frame_id)->reconstruction;
+  }
+
+  virtual bool has_image(int frame_id) const {
+    return picbuf.has_picture(frame_id);
+  }
+
+  bool encoder_started;
+
+  encoder_params params;
+  config_parameters params_config;
+
+  EncodingAlgorithm_Custom algo;
+
+  int image_width, image_height;
+  bool image_spec_is_defined;  // whether we know the input image size
+
+  void* param_image_allocation_userdata;
+  void (*release_func)(en265_encoder_context*,
+                       de265_image*,
+                       void* userdata);
+
+  //error_queue errqueue;
+  //acceleration_functions accel;
+
+  // quick links
+  de265_image* img; // reconstruction
+  de265_image* prediction;
+  image_data* imgdata; // input image
+  slice_segment_header* shdr;
+
+  // temporary memory for motion compensated pixels (when CB-algo passes this down to TB-algo)
+  //uint8_t prediction[3][64*64]; // stride: 1<<(cb->log2Size)
+  //int prediction_x0,prediction_y0;
+
+
+  int active_qp; // currently active QP
+  /*int target_qp;*/ /* QP we want to code at.
+                     (Not actually the real QP. Check image.get_QPY() for that.) */
+
+  video_parameter_set  vps;
+  seq_parameter_set    sps;
+  pic_parameter_set    pps;
+  //slice_segment_header shdr;
+
+  bool parameters_have_been_set;
+  bool headers_have_been_sent;
+
+  encoder_picture_buffer picbuf;
+  std::shared_ptr<sop_creator> sop;
+
+  std::deque<en265_packet*> output_packets;  // remaining entries are freed by the destructor
+
+
+  // --- rate-control ---
+
+  float lambda;
+
+
+  // --- CABAC output and rate estimation ---
+
+  //CABAC_encoder*  cabac;      // currently active CABAC output (estim or bitstream)
+  //context_model_table2* ctx_model;  // currently active ctx models (estim or bitstream)
+
+  // CABAC bitstream writer
+  CABAC_encoder_bitstream cabac_encoder;
+  context_model_table     cabac_ctx_models;
+
+  //std::shared_ptr<CABAC_encoder> cabac_estim;
+
+  bool use_adaptive_context;
+
+
+  /*** TODO: pass the CABAC_encoder directly to the encode function instead of
+       caching it here on the side (with undefined lifetime).
+       The context model can then go straight into the encoder: cabac_encoder(ctxtable).
+       write_bits() is then called with the context index, not with the model directly.
+  ***/
+
+
+  /*
+  void switch_CABAC(context_model_table2* model) {
+    cabac      = cabac_estim.get();
+    ctx_model  = model;
+  }
+
+  void switch_CABAC_to_bitstream() {
+    cabac     = &cabac_bitstream;
+    ctx_model = &ctx_model_bitstream;
+  }
+  */
+
+  en265_packet* create_packet(en265_packet_content_type t);
+
+
+  // --- encoding control ---
+
+  void start_encoder();
+  de265_error encode_headers();
+  de265_error encode_picture_from_input_buffer();
+
+
+  // Input images can be released after encoding and when the output packet is released.
+  // This is important to do as soon as possible, as the image might actually wrap
+  // scarce resources like camera picture buffers.
+  // This function does release (only) the raw input data.
+  void release_input_image(int frame_number) { picbuf.release_input_image(frame_number); }
+
+  void mark_image_is_outputted(int frame_number) { picbuf.mark_image_is_outputted(frame_number); }
+};
+
+
+#endif
diff --git a/libde265/encoder/encoder-params.cc b/libde265/encoder/encoder-params.cc
new file mode 100644
index 0000000..d52f81b
--- /dev/null
+++ b/libde265/encoder/encoder-params.cc
@@ -0,0 +1,75 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "encoder-params.h"
+// Returns low, 2*low, 4*low, ... while <= high; empty if low <= 0 (doubling would never end).
+static std::vector<int> power2range(int low,int high)
+{
+  std::vector<int> vals;
+  for (int i=low; i>0 && i<=high; i*=2) {
+    vals.push_back(i);
+    if (i > high/2) break;  // the next doubling would exceed 'high' (or overflow)
+  }
+  return vals;
+}
+// Assigns option IDs, valid values and defaults, and gives the plain enum members a value.
+encoder_params::encoder_params()
+{
+  rateControlMethod = RateControlMethod_ConstantQP;  // previously left uninitialized
+  intraPredSearch   = IntraPredSearch_Complete;      // previously left uninitialized
+  min_cb_size.set_ID("min-cb-size"); min_cb_size.set_valid_values(power2range(8,64)); min_cb_size.set_default(8);
+  max_cb_size.set_ID("max-cb-size"); max_cb_size.set_valid_values(power2range(8,64)); max_cb_size.set_default(32);
+  min_tb_size.set_ID("min-tb-size"); min_tb_size.set_valid_values(power2range(4,32)); min_tb_size.set_default(4);
+  max_tb_size.set_ID("max-tb-size"); max_tb_size.set_valid_values(power2range(8,32)); max_tb_size.set_default(32);
+
+  max_transform_hierarchy_depth_intra.set_ID("max-transform-hierarchy-depth-intra");
+  max_transform_hierarchy_depth_intra.set_range(0,4);
+  max_transform_hierarchy_depth_intra.set_default(3);
+
+  sop_structure.set_ID("sop-structure");
+
+  mAlgo_TB_IntraPredMode.set_ID("TB-IntraPredMode");
+  mAlgo_TB_IntraPredMode_Subset.set_ID("TB-IntraPredMode-subset");
+  mAlgo_CB_IntraPartMode.set_ID("CB-IntraPartMode");
+
+  mAlgo_MEMode.set_ID("MEMode");
+}
+
+// Adds every option of this parameter set (and of the low-delay SOP parameters) to 'config'.
+void encoder_params::registerParams(config_parameters& config)
+{
+  config.add_option(&min_cb_size);
+  config.add_option(&max_cb_size);
+  config.add_option(&min_tb_size);
+  config.add_option(&max_tb_size);
+  config.add_option(&max_transform_hierarchy_depth_intra);
+
+  config.add_option(&sop_structure);
+
+  config.add_option(&mAlgo_TB_IntraPredMode);
+  config.add_option(&mAlgo_TB_IntraPredMode_Subset);
+  config.add_option(&mAlgo_CB_IntraPartMode);
+
+  config.add_option(&mAlgo_MEMode);
+
+  mSOP_LowDelay.registerParams(config);
+}
diff --git a/libde265/encoder/encoder-params.h b/libde265/encoder/encoder-params.h
new file mode 100644
index 0000000..c0726be
--- /dev/null
+++ b/libde265/encoder/encoder-params.h
@@ -0,0 +1,141 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ENCODER_PARAMS_H
+#define ENCODER_PARAMS_H
+
+#include "libde265/encoder/encode.h"
+#include "libde265/encoder/analyze.h"
+#include "libde265/encoder/sop.h"
+
+// Value type of encoder_params::rateControlMethod.
+enum RateControlMethod
+  {
+    RateControlMethod_ConstantQP,
+    RateControlMethod_ConstantLambda
+  };
+// Value type of encoder_params::intraPredSearch; only one choice exists so far.
+enum IntraPredSearch
+  {
+    IntraPredSearch_Complete
+  };
+
+// Structure-of-pictures choice; encoder_context::start_encoder() picks the SOP creator from it.
+enum SOP_Structure
+  {
+    SOP_Intra,
+    SOP_LowDelay
+  };
+// Choice option that maps the strings "intra" / "low-delay" to SOP_Structure values.
+class option_SOP_Structure : public choice_option<enum SOP_Structure>
+{
+ public:
+  option_SOP_Structure() {
+    add_choice("intra",     SOP_Intra);
+    add_choice("low-delay", SOP_LowDelay, true);  // 'true' presumably marks the default -- TODO confirm
+  }
+};
+
+// Values selectable through option_MEMode (encoder_params::mAlgo_MEMode).
+enum MEMode
+  {
+    MEMode_Test,
+    MEMode_Search
+  };
+// Choice option that maps the strings "test" / "search" to MEMode values.
+class option_MEMode : public choice_option<enum MEMode>
+{
+ public:
+  option_MEMode() {
+    add_choice("test",   MEMode_Test, true);  // 'true' presumably marks the default -- TODO confirm
+    add_choice("search", MEMode_Search);
+  }
+};
+
+// All user-settable encoder parameters; registerParams() exposes the options via 'config'.
+struct encoder_params
+{
+  encoder_params();
+
+  void registerParams(config_parameters& config);
+
+
+  // CB quad-tree
+
+  option_int min_cb_size;
+  option_int max_cb_size;
+
+  option_int min_tb_size;
+  option_int max_tb_size;
+
+  option_int max_transform_hierarchy_depth_intra;
+
+
+  option_SOP_Structure sop_structure;
+
+  sop_creator_trivial_low_delay::params mSOP_LowDelay;
+
+
+  // --- Algo_TB_IntraPredMode
+
+  option_ALGO_TB_IntraPredMode        mAlgo_TB_IntraPredMode;
+  option_ALGO_TB_IntraPredMode_Subset mAlgo_TB_IntraPredMode_Subset;
+
+  //Algo_TB_IntraPredMode_FastBrute::params TB_IntraPredMode_FastBrute;
+  //Algo_TB_IntraPredMode_MinResidual::params TB_IntraPredMode_MinResidual;
+
+
+  // --- Algo_TB_Split_BruteForce
+
+  //Algo_TB_Split_BruteForce::params  TB_Split_BruteForce;
+
+
+  // --- Algo_CB_IntraPartMode
+
+  option_ALGO_CB_IntraPartMode mAlgo_CB_IntraPartMode;
+
+  //Algo_CB_IntraPartMode_Fixed::params CB_IntraPartMode_Fixed;
+
+  // --- Algo_CB_Split
+
+  // --- Algo_CTB_QScale
+
+  //Algo_CTB_QScale_Constant::params    CTB_QScale_Constant;
+
+  option_MEMode mAlgo_MEMode;
+
+
+  // intra-prediction (plain enum member, not registered as an option)
+
+  enum IntraPredSearch intraPredSearch;
+
+
+  // rate-control (plain enum member, not registered as an option)
+
+  enum RateControlMethod rateControlMethod;
+
+  //int constant_QP;
+  //int lambda;
+};
+
+
+#endif
diff --git a/libde265/encoder/encpicbuf.cc b/libde265/encoder/encpicbuf.cc
new file mode 100644
index 0000000..f84a151
--- /dev/null
+++ b/libde265/encoder/encpicbuf.cc
@@ -0,0 +1,321 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/encoder/encpicbuf.h"
+#include "libde265/util.h"
+// Starts with an empty buffer and a cleared end-of-stream flag (it was left uninitialized).
+encoder_picture_buffer::encoder_picture_buffer()
+{
+  mEndOfStream = false;
+}
+// Deletes every image_data entry that is still held by the buffer.
+encoder_picture_buffer::~encoder_picture_buffer()
+{
+  flush_images();
+}
+
+// A new entry starts unprocessed, without any images, and flagged as being in the output queue.
+image_data::image_data()
+{
+  //printf("new %p\n",this);
+
+  // processing state
+  state = state_unprocessed;
+  is_in_output_queue = true;
+
+  frame_number = 0;
+
+  // no image buffers attached yet
+  input = NULL;
+  prediction = reconstruction = NULL;
+
+  // SOP metadata
+
+  is_intra = true;
+  skip_priority = 0;
+  sps_index = -1;
+}
+// Deletes the three images owned by this entry (input, reconstruction, prediction).
+image_data::~image_data()
+{
+  //printf("delete %p\n",this);
+
+  delete input;
+  // TODO: this could still be referenced in the packet output queue, so the
+  //       images should really be refcounted. release for now to prevent leaks
+  delete reconstruction;
+  delete prediction;
+}
+
+
+// --- input pushed by the input process ---
+// Drops all buffered images and clears the end-of-stream flag.
+void encoder_picture_buffer::reset()
+{
+  flush_images();
+
+  mEndOfStream = false;
+}
+
+// Deletes all buffered image_data entries and leaves the deque empty.
+void encoder_picture_buffer::flush_images()
+{
+  for (; !mImages.empty(); mImages.pop_front()) {
+    image_data* oldest = mImages.front();
+    delete oldest;
+  }
+}
+
+// Appends a new entry for 'img' to the end of the buffer and returns it.
+image_data* encoder_picture_buffer::insert_next_image_in_encoding_order(const de265_image* img,
+                                                                        int frame_number)
+{
+  image_data* entry = new image_data();
+
+  entry->frame_number = frame_number;
+  entry->input = img;  // the entry deletes this image in its destructor
+  entry->shdr.set_defaults();
+
+  mImages.push_back(entry);
+  return entry;
+}
+// Sets the end-of-stream flag.
+void encoder_picture_buffer::insert_end_of_stream()
+{
+  mEndOfStream = true;
+}
+
+
+// --- SOP structure ---
+// Flags this picture as intra (sets is_intra).
+void image_data::set_intra()
+{
+  is_intra = true;
+}
+// Stores the NAL unit type in this picture's NAL header.
+void image_data::set_NAL_type(uint8_t nalType)
+{
+  nal.nal_unit_type = nalType;
+}
+// Stores the reference lists of this picture and mirrors list L0 into the slice header.
+void image_data::set_references(int sps_index, // -1 -> custom
+                                const std::vector<int>& l0,
+                                const std::vector<int>& l1,
+                                const std::vector<int>& lt,
+                                const std::vector<int>& keepMoreReferences)
+{
+  // keep copies of all four frame-number lists
+  this->sps_index = sps_index;
+  ref0     = l0;
+  ref1     = l1;
+  longterm = lt;
+  keep     = keepMoreReferences;
+
+  // TODO: pps.num_ref_idx_l0_default_active
+
+  const size_t numL0 = l0.size();
+  shdr.num_ref_idx_l0_active = numL0;
+  //shdr.num_ref_idx_l1_active = l1.size();
+  assert(numL0 < MAX_NUM_REF_PICS);
+  for (size_t k=0; k<numL0; k++) {
+    shdr.RefPicList[0][k] = l0[k];
+  }
+
+  /*
+  assert(l1.size() < MAX_NUM_REF_PICS);
+  for (int i=0;i<l1.size();i++) {
+    shdr.RefPicList[1][i] = l1[i];
+  }
+  */
+}
+// Stores the temporal ID in this picture's NAL header.
+void image_data::set_NAL_temporal_id(int temporal_id)
+{
+  this->nal.nuh_temporal_id = temporal_id;
+}
+// Stores the skip priority of this picture.
+void image_data::set_skip_priority(int skip_priority)
+{
+  this->skip_priority = skip_priority;
+}
+// Marks the most recently inserted image as having its SOP metadata filled in.
+void encoder_picture_buffer::sop_metadata_commit(int frame_number)
+{
+  image_data* newest = mImages.back();
+  assert(newest->frame_number == frame_number);
+
+  newest->state = image_data::state_sop_metadata_available;
+}
+
+
+
+// --- infos pushed by encoder ---
+// Moves the picture with this frame number into the 'encoding' state.
+// The frame must be in the buffer (get_picture() asserts this).
+void encoder_picture_buffer::mark_encoding_started(int frame_number)
+{
+  image_data* picture = get_picture(frame_number);
+  picture->state = image_data::state_encoding;
+}
+// Attaches the prediction image to the picture with this frame number.
+// The entry deletes the image in its destructor.
+void encoder_picture_buffer::set_prediction_image(int frame_number, de265_image* pred)
+{
+  image_data* picture = get_picture(frame_number);
+  picture->prediction = pred;
+}
+// Attaches the reconstruction image to the picture with this frame number.
+// The entry deletes the image in its destructor.
+void encoder_picture_buffer::set_reconstruction_image(int frame_number, de265_image* reco)
+{
+  image_data* picture = get_picture(frame_number);
+  picture->reconstruction = reco;
+}
+// Marks the frame as encoded and deletes entries that are neither referenced nor queued.
+void encoder_picture_buffer::mark_encoding_finished(int frame_number)
+{
+  image_data* data = get_picture(frame_number);
+
+  data->state = image_data::state_keep_for_reference;
+
+  // --- delete images that are not required anymore ---
+
+  // first, mark all images unused
+
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, imgdata, mImages) {
+#else
+  FOR_LOOP(image_data *, imgdata, mImages) {
+#endif
+    imgdata->mark_used = false;
+  }
+
+  // mark all images that will be used later
+
+  FOR_LOOP(int, f, data->ref0)     { get_picture(f)->mark_used=true; }
+  FOR_LOOP(int, f, data->ref1)     { get_picture(f)->mark_used=true; }
+  FOR_LOOP(int, f, data->longterm) { get_picture(f)->mark_used=true; }
+  FOR_LOOP(int, f, data->keep)     { get_picture(f)->mark_used=true; }
+  data->mark_used=true;
+
+  // copy over all images that we still keep
+
+  std::deque<image_data*> newImageSet;
+#ifdef FOR_LOOP_AUTO_SUPPORT
+  FOR_LOOP(auto, imgdata, mImages) {
+#else
+  FOR_LOOP(image_data *, imgdata, mImages) {
+#endif
+    if (imgdata->mark_used || imgdata->is_in_output_queue) {
+      // images that are queued but not yet encoded have no reconstruction (NULL)
+      if (imgdata->reconstruction) {
+        imgdata->reconstruction->PicState = UsedForShortTermReference; // TODO: this is only a hack
+      }
+      newImageSet.push_back(imgdata);
+    }
+    else {
+      // image is not needed anymore for reference, remove it from EncPicBuf
+      delete imgdata;
+    }
+  }
+
+  mImages = newImageSet;
+}
+
+
+
+// --- data access ---
+// True while at least one buffered picture has not yet reached the 'encoding' state.
+bool encoder_picture_buffer::have_more_frames_to_encode() const
+{
+  for (size_t idx=0; idx<mImages.size(); idx++) {
+    const image_data* candidate = mImages[idx];
+    if (candidate->state < image_data::state_encoding) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns the first buffered picture (in deque order) that is not yet being encoded, or NULL.
+image_data* encoder_picture_buffer::get_next_picture_to_encode()
+{
+  image_data* found = NULL;
+  for (size_t idx=0; idx<mImages.size() && !found; idx++) {
+    if (mImages[idx]->state < image_data::state_encoding) {
+      found = mImages[idx];
+    }
+  }
+  return found;
+}
+
+// Looks up the entry with this frame number; asserts (and returns NULL) if there is none.
+const image_data* encoder_picture_buffer::get_picture(int frame_number) const
+{
+  const size_t count = mImages.size();
+  for (size_t idx=0; idx<count; idx++) {
+    const image_data* entry = mImages[idx];
+    if (entry->frame_number == frame_number) return entry;
+  }
+  assert(false);
+  return NULL;
+}
+
+// Non-const lookup by frame number; asserts (and returns NULL) if there is no such entry.
+image_data* encoder_picture_buffer::get_picture(int frame_number)
+{
+  size_t idx = 0;
+  while (idx < mImages.size()) {
+    if (mImages[idx]->frame_number == frame_number) return mImages[idx];
+    idx++;
+  }
+  assert(false);
+  return NULL;
+}
+
+// True if an entry with this frame number is currently in the buffer.
+bool encoder_picture_buffer::has_picture(int frame_number) const
+{
+  bool present = false;
+  for (size_t idx=0; idx<mImages.size() && !present; idx++) {
+    present = (mImages[idx]->frame_number == frame_number);
+  }
+
+  return present;
+}
+
+// Clears the 'in output queue' flag, so mark_encoding_finished() may drop the entry later.
+void encoder_picture_buffer::mark_image_is_outputted(int frame_number)
+{
+  image_data* entry = get_picture(frame_number);
+  assert(entry);
+
+  entry->is_in_output_queue = false;
+}
+
+// Deletes only the raw input image of the frame; the entry itself stays in the buffer.
+void encoder_picture_buffer::release_input_image(int frame_number)
+{
+  image_data* entry = get_picture(frame_number);
+  assert(entry);
+
+  delete entry->input;
+  entry->input = NULL;  // keeps the entry's destructor from deleting it a second time
+}
diff --git a/libde265/encoder/encpicbuf.h b/libde265/encoder/encpicbuf.h
new file mode 100644
index 0000000..e4d2e18
--- /dev/null
+++ b/libde265/encoder/encpicbuf.h
@@ -0,0 +1,144 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_ENCPICBUF_H
+#define DE265_ENCPICBUF_H
+
+#include "libde265/image.h"
+#include "libde265/sps.h"
+
+#include <deque>
+#include <vector>
+
+
+/* TODO: we need a way to quickly access pictures with a stable ID, like in the DPB.
+ */
+
+struct image_data
+{ // Per-picture record stored (by pointer) in encoder_picture_buffer.
+  image_data();
+  ~image_data();
+
+  int frame_number; // lookup key used by encoder_picture_buffer::get_picture()
+
+  const de265_image* input; // owner
+  de265_image* prediction;  // owner
+  de265_image* reconstruction; // owner
+
+  // SOP metadata
+
+  nal_header nal; // TODO: image split into several NALs (always same NAL header?)
+
+  slice_segment_header shdr; // TODO: multi-slice pictures
+
+  std::vector<int> ref0; // presumably set_references() argument l0 -- confirm in encpicbuf.cc
+  std::vector<int> ref1; // presumably set_references() argument l1
+  std::vector<int> longterm; // presumably set_references() argument lt
+  std::vector<int> keep; // presumably set_references() argument keepMoreReferences
+  int sps_index;
+  int skip_priority;
+  bool is_intra;  // TODO: remove, use shdr.slice_type instead
+
+  /* unprocessed              only input image has been inserted, no metadata
+     sop_metadata_available   sop-creator has filled in references and skipping metadata
+     a) encoding              encoding started for this frame, reconstruction image was created
+     .  keep_for_reference    encoding finished, picture is kept in the buffer for reference
+     b) skipped               image was skipped, no encoding was done, no reconstruction image
+  */
+  enum state { // order matters: the buffer tests 'state < state_encoding' to find pending pictures
+    state_unprocessed,
+    state_sop_metadata_available,
+    state_encoding,
+    state_keep_for_reference,
+    state_skipped
+  } state;
+
+  bool is_in_output_queue; // cleared by encoder_picture_buffer::mark_image_is_outputted()
+
+  bool mark_used; // NOTE(review): purpose not visible in this header -- see encpicbuf.cc
+
+
+  // --- SOP structure ---
+
+  void set_intra();
+  void set_NAL_type(uint8_t nalType);
+  void set_NAL_temporal_id(int temporal_id);
+  void set_references(int sps_index, // -1 -> custom
+                      const std::vector<int>& l0, const std::vector<int>& l1,
+                      const std::vector<int>& lt,
+                      const std::vector<int>& keepMoreReferences);
+  void set_skip_priority(int skip_priority);
+};
+
+
+class encoder_picture_buffer
+{ // Holds the image_data records of the pictures known to the encoder.
+ public:
+  encoder_picture_buffer();
+  ~encoder_picture_buffer();
+
+
+  // --- input pushed by the input process ---
+
+  void reset();
+
+  image_data* insert_next_image_in_encoding_order(const de265_image*, int frame_number);
+  void insert_end_of_stream();
+
+
+  // --- SOP structure ---
+
+  void sop_metadata_commit(int frame_number); // note: frame_number is only for consistency checking
+
+
+  // --- infos pushed by encoder ---
+
+  void mark_encoding_started(int frame_number);
+  void set_prediction_image(int frame_number, de265_image*); // store it just for debugging fun
+  void set_reconstruction_image(int frame_number, de265_image*);
+  void mark_encoding_finished(int frame_number);
+
+
+
+  // --- data access ---
+
+  bool have_more_frames_to_encode() const; // true while any picture's state is below state_encoding
+  image_data* get_next_picture_to_encode(); // or return NULL if no picture is available
+  const image_data* get_picture(int frame_number) const; // asserts when the frame is not in the buffer
+  bool has_picture(int frame_number) const; // same search as get_picture(), but never asserts
+  // NOTE(review): peek returns the front element whatever its state, unlike get_next_picture_to_encode().
+  const image_data* peek_next_picture_to_encode() const {
+    assert(!mImages.empty());
+    return mImages.front();
+  }
+
+  void mark_image_is_outputted(int frame_number); // clears image_data::is_in_output_queue
+  void release_input_image(int frame_number); // deletes image_data::input and sets it to NULL
+
+ private:
+  bool mEndOfStream;
+  std::deque<image_data*> mImages; // raw pointers; encpicbuf.cc deletes entries it no longer needs
+
+  void flush_images();
+  image_data* get_picture(int frame_number); // mutable lookup for internal use
+};
+
+
+#endif
diff --git a/libde265/encoder/sop.cc b/libde265/encoder/sop.cc
new file mode 100644
index 0000000..635ecc7
--- /dev/null
+++ b/libde265/encoder/sop.cc
@@ -0,0 +1,106 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/encoder/sop.h"
+#include "libde265/encoder/encoder-context.h"
+
+
+sop_creator_intra_only::sop_creator_intra_only()
+{ // nothing to set up here; the sop_creator base constructor NULLs both pointers
+}
+
+
+void sop_creator_intra_only::set_SPS_header_values()
+{ // only the POC LSB width is written; no reference picture set is added here
+  mEncCtx->sps.log2_max_pic_order_cnt_lsb = get_num_poc_lsb_bits();
+}
+
+
+void sop_creator_intra_only::insert_new_input_image(de265_image* img)
+{
+  // Queues 'img' as an IDR intra picture and commits its SOP metadata.
+  // NOTE(review): the image keeps the POC from before reset_poc(), while the
+  // slice header below gets the LSB of the reset POC -- confirm this is intended.
+  img->PicOrderCntVal = get_pic_order_count();
+  reset_poc();   // POC counting restarts with every picture inserted here
+
+  const int frame = get_frame_number();
+
+  assert(mEncPicBuf);
+  image_data* imgdata = mEncPicBuf->insert_next_image_in_encoding_order(img, frame);
+  imgdata->set_intra();
+  imgdata->set_NAL_type(NAL_UNIT_IDR_N_LP);
+  imgdata->shdr.slice_type = SLICE_TYPE_I;
+  imgdata->shdr.slice_pic_order_cnt_lsb = get_pic_order_count_lsb();
+  mEncPicBuf->sop_metadata_commit(frame);
+  advance_frame();
+}
+
+
+// ---------------------------------------------------------------------------
+
+
+sop_creator_trivial_low_delay::sop_creator_trivial_low_delay()
+{ // nothing to do here; mParams is set up by the params default constructor
+}
+
+
+void sop_creator_trivial_low_delay::set_SPS_header_values()
+{
+  ref_pic_set prevOnly;  // one RPS: a single negative picture at POC delta -1
+  prevOnly.NumNegativePics = 1;
+  prevOnly.NumPositivePics = 0;
+  prevOnly.DeltaPocS0[0] = -1;
+  prevOnly.UsedByCurrPicS0[0] = true;
+  prevOnly.compute_derived_values();
+  mEncCtx->sps.ref_pic_sets.push_back(prevOnly);
+  mEncCtx->sps.log2_max_pic_order_cnt_lsb = get_num_poc_lsb_bits();
+}
+
+
+void sop_creator_trivial_low_delay::insert_new_input_image(de265_image* img)
+{
+  // Tag the input with the running POC (taken before any reset below).
+  img->PicOrderCntVal = get_pic_order_count();
+
+  const int  frameNr  = get_frame_number();
+  const bool intraPic = isIntra(frameNr);   // true when frameNr % intraPeriod == 0
+
+  assert(mEncPicBuf);
+  image_data* picture = mEncPicBuf->insert_next_image_in_encoding_order(img, frameNr);
+
+  if (!intraPic) {
+    // P picture: list 0 holds just the preceding frame; the other lists stay empty.
+    const std::vector<int> none;
+    std::vector<int> list0(1, frameNr-1);
+    picture->set_references(0, list0,none, none,none);
+    picture->set_NAL_type(NAL_UNIT_TRAIL_R);
+    picture->shdr.slice_type = SLICE_TYPE_P;
+  } else {
+    // IDR picture: restart POC counting and reference nothing.
+    reset_poc();
+    picture->set_intra();
+    picture->set_NAL_type(NAL_UNIT_IDR_N_LP);
+    picture->shdr.slice_type = SLICE_TYPE_I;
+  }
+  picture->shdr.slice_pic_order_cnt_lsb = get_pic_order_count_lsb();
+  mEncPicBuf->sop_metadata_commit(frameNr);
+  advance_frame();
+}
diff --git a/libde265/encoder/sop.h b/libde265/encoder/sop.h
new file mode 100644
index 0000000..8b92b4e
--- /dev/null
+++ b/libde265/encoder/sop.h
@@ -0,0 +1,143 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_SOP_H
+#define DE265_SOP_H
+
+#include "libde265/image.h"
+#include "libde265/sps.h"
+#include "libde265/encoder/encpicbuf.h"
+#include "libde265/configparam.h"
+
+#include <deque>
+#include <vector>
+
+/*
+struct refpic_set
+{
+  std::vector<int> l0;
+  std::vector<int> l1;
+};
+*/
+
+class pic_order_counter
+{
+ public:
+  // Starts at frame 0 and POC 0, with a 6-bit POC LSB field.
+  pic_order_counter() : mFrameNumber(0), mPOC(0), mNumLsbBits(6) { }
+
+  // Restarts POC counting; the frame number keeps running.
+  void reset_poc() { mPOC = 0; }
+
+  int get_frame_number() const { return mFrameNumber; }
+  int get_pic_order_count() const { return mPOC; }
+
+  // POC reduced to its mNumLsbBits least significant bits.
+  int get_pic_order_count_lsb() const { const int mask = (1<<mNumLsbBits)-1; return mPOC & mask; }
+
+  // Moves forward by n frames; frame number and POC advance together.
+  void advance_frame(int n=1) { mFrameNumber += n; mPOC += n; }
+
+  int  get_num_poc_lsb_bits() const { return mNumLsbBits; }
+  void set_num_poc_lsb_bits(int n) { mNumLsbBits = n; }
+
+ private:
+  int mFrameNumber, mPOC, mNumLsbBits;
+};
+
+
+class sop_creator : public pic_order_counter
+{
+ public:
+  sop_creator() : mEncCtx(NULL), mEncPicBuf(NULL) { }
+  virtual ~sop_creator() { }
+
+  // Both pointers are borrowed (the destructor frees nothing).
+  void set_encoder_context(encoder_context* encctx) { mEncCtx = encctx; }
+  void set_encoder_picture_buffer(encoder_picture_buffer* encbuf) { mEncPicBuf = encbuf; }
+
+  // Implementations fill in these fields:
+  //   - SPS.ref_pic_sets
+  //   - SPS.log2_max_pic_order_cnt_lsb
+  virtual void set_SPS_header_values() = 0;
+
+  // Implementations fill in these fields:
+  //   - NAL.nal_type
+  //   - SHDR.slice_type
+  //   - SHDR.slice_pic_order_cnt_lsb
+  //   - IMGDATA.references
+  virtual void insert_new_input_image(de265_image*) = 0;
+
+  // Forwards to the picture buffer, which must have been set beforehand.
+  virtual void insert_end_of_stream() { mEncPicBuf->insert_end_of_stream(); }
+
+  virtual int  get_number_of_temporal_layers() const { return 1; }
+
+  //virtual std::vector<refpic_set> get_sps_refpic_sets() const = 0;
+ protected:
+  encoder_context*        mEncCtx;
+  encoder_picture_buffer* mEncPicBuf;
+};
+
+
+
+class sop_creator_intra_only : public sop_creator
+{ // SOP creator that queues every input image as an IDR intra picture (see sop.cc)
+ public:
+  sop_creator_intra_only();
+
+  virtual void set_SPS_header_values(); // writes only SPS.log2_max_pic_order_cnt_lsb
+  virtual void insert_new_input_image(de265_image* img); // resets the POC for each image
+};
+
+
+
+class sop_creator_trivial_low_delay : public sop_creator
+{ // SOP creator: intra picture every intraPeriod frames, otherwise P referencing frame-1 (see sop.cc)
+ public:
+  struct params {
+    params() {
+      intraPeriod.set_ID("sop-lowDelay-intraPeriod");
+      intraPeriod.set_minimum(1);
+      intraPeriod.set_default(250);
+    }
+
+    void registerParams(config_parameters& config) {
+      config.add_option(&intraPeriod); // hands out the address of this instance's option
+    }
+
+    option_int intraPeriod; // used as the modulus in isIntra(); minimum is set to 1 above
+  };
+
+  sop_creator_trivial_low_delay();
+  // NOTE(review): setParams() copies by value, so mParams.intraPeriod is not the registered object.
+  void setParams(const params& p) { mParams=p; }
+
+  virtual void set_SPS_header_values();
+  virtual void insert_new_input_image(de265_image* img);
+
+ private:
+  params mParams;
+  // True for frame 0 and every further multiple of intraPeriod.
+  bool isIntra(int frame) const { return (frame % mParams.intraPeriod)==0; }
+};
+
+
+#endif
diff --git a/libde265/fallback-dct.cc b/libde265/fallback-dct.cc
index 7eaca82..9f6df8e 100644
--- a/libde265/fallback-dct.cc
+++ b/libde265/fallback-dct.cc
@@ -18,8 +18,7 @@
  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "fallback-motion.h"
-#include "util.h"
+#include "fallback-dct.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 # include <malloc.h>
@@ -28,13 +27,29 @@
 #endif
 
 #include <assert.h>
+#include <algorithm>
 
 
-void transform_skip_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+static void printMatrix(const char* name, const int16_t* v, int n)
+{
+  // Debug helper: prints a titled n x n matrix (row-major) to stdout.
+  printf("--- %s ---\n",name);
+  const int16_t* row = v;
+  for (int r=0; r<n; r++, row+=n) {
+    for (int c=0; c<n; c++) { printf("%4d ",row[c]); }
+    printf("\n");
+  }
+}
+
+
+
+void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
 {
   int nT = 4;
   int bdShift2 = 20-8;
 
+  assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size
+
   for (int y=0;y<nT;y++)
     for (int x=0;x<nT;x++) {
       int32_t c = coeffs[x+y*nT] << 7;
@@ -45,10 +60,172 @@ void transform_skip_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
 }
 
 
-void transform_bypass_8_fallback(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_t stride)
+void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
 {
-  int bdShift2 = 20-8;
+  int nT = 4; // block size is hard-wired to 4x4
+  int bdShift2 = 20-bit_depth;
+  // With assertions enabled the function stops at the assert below.
+  assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size
+  // Scale by 7 bits, round-shift by bdShift2, then add to dst via Clip_BitDepth().
+  for (int y=0;y<nT;y++)
+    for (int x=0;x<nT;x++) {
+      int32_t c = coeffs[x+y*nT] << 7;
+      c = (c+(1<<(bdShift2-1)))>>bdShift2;
+
+      dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth);
+    }
+}
+
+
+void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT,
+                                      int tsShift,int bdShift)
+{
+  // Scale each coefficient up by tsShift, then round-shift it down by bdShift.
+  const int32_t roundOfs = 1<<(bdShift-1);
+  const int numCoeffs = (nT>0) ? nT*nT : 0;
+
+  for (int k=0; k<numCoeffs; k++) {
+    residual[k] = ((coeffs[k] << tsShift) + roundOfs) >> bdShift;
+  }
+}
+
+
+void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride)
+{
+  // Vertical RDPCM on transform-skip residuals: each column accumulates its
+  // scaled residuals top to bottom; the running sum is added to the pixel.
+  const int nT       = 1<<log2nT;
+  const int tsShift  = 5 + log2nT; // TODO: extended_precision
+  const int bdShift2 = 20-8;       // bit depth is fixed at 8 here
+  const int offset   = 1<<(bdShift2-1);
+
+  for (int x=0;x<nT;x++) {
+    int32_t acc = 0;
+
+    for (int y=0;y<nT;y++) {
+      uint8_t& pixel = dst[y*stride+x];
+      acc += ((coeffs[x+y*nT] << tsShift) + offset) >> bdShift2;
+      pixel = Clip1_8bit(pixel + acc);
+    }
+  }
+}
+
+void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride)
+{
+  // Horizontal RDPCM on transform-skip residuals: each row accumulates its
+  // scaled residuals left to right; the running sum is added to the pixel.
+  const int nT       = 1<<log2nT;
+  const int tsShift  = 5 + log2nT; // TODO: extended_precision
+  const int bdShift2 = 20-8;       // bit depth is fixed at 8 here
+  const int offset   = 1<<(bdShift2-1);
+
+  for (int y=0;y<nT;y++) {
+    int32_t acc = 0;
+
+    for (int x=0;x<nT;x++) {
+      uint8_t& pixel = dst[y*stride+x];
+      acc += ((coeffs[x+y*nT] << tsShift) + offset) >> bdShift2;
+      pixel = Clip1_8bit(pixel + acc);
+    }
+  }
+}
+
+
+void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride)
+{
+  // Bypass mode, vertical RDPCM: add each column's running coefficient sum.
+  for (int x=0;x<nT;x++) {
+    int32_t acc = 0;
+    for (int y=0;y<nT;y++) {
+      uint8_t& pixel = dst[y*stride+x];
+      pixel = Clip1_8bit(pixel + (acc += coeffs[x+y*nT]));
+    }
+  }
+}
+
+
+void transform_bypass_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride)
+{
+  // Bypass mode, horizontal RDPCM: add each row's running coefficient sum.
+  for (int y=0;y<nT;y++) {
+    int32_t acc = 0;
+    for (int x=0;x<nT;x++) {
+      uint8_t& pixel = dst[y*stride+x];
+      pixel = Clip1_8bit(pixel + (acc += coeffs[x+y*nT]));
+    }
+  }
+}
+
+
+void transform_bypass_rdpcm_v_fallback(int32_t *dst, const int16_t *coeffs,int nT)
+{
+  // Each output is the sum of the coefficients at and above it in its column.
+  for (int x=0;x<nT;x++) {
+    int32_t acc = 0;
+    for (int y=0;y<nT;y++) {
+      const int pos = x+y*nT;
+      dst[pos] = (acc += coeffs[pos]);
+    }
+  }
+}
+
+
+void transform_bypass_rdpcm_h_fallback(int32_t *dst, const int16_t *coeffs,int nT)
+{
+  // Each output is the sum of the coefficients at and left of it in its row.
+  for (int y=0;y<nT;y++) {
+    int32_t acc = 0;
+    for (int x=0;x<nT;x++) {
+      const int pos = x+y*nT;
+      dst[pos] = (acc += coeffs[pos]);
+    }
+  }
+}
+
+
+void rdpcm_v_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
+{
+  // Scale and round-shift each coefficient, then accumulate down each column.
+  const int rnd = 1<<(bdShift-1);
+  for (int x=0;x<nT;x++) {
+    int acc = 0;
+    for (int y=0;y<nT;y++) {
+      const int pos = y*nT+x;
+      acc += ((coeffs[pos] << tsShift) + rnd) >> bdShift;
+      residual[pos] = acc;
+    }
+  }
+}
+
+
+void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
+{
+  // Scale and round-shift each coefficient, then accumulate along each row.
+  const int rnd = 1<<(bdShift-1);
+  for (int y=0;y<nT;y++) {
+    int acc = 0;
+    for (int x=0;x<nT;x++) {
+      const int pos = y*nT+x;
+      acc += ((coeffs[pos] << tsShift) + rnd) >> bdShift;
+      residual[pos] = acc;
+    }
+  }
+}
+
+
+void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT)
+{
+  // Bypass mode: the residual is the coefficient block itself, widened
+  // from 16 to 32 bits element by element.
+  const int numCoeffs = (nT>0) ? nT*nT : 0;
+  for (int k=0; k<numCoeffs; k++) {
+    dst[k] = coeffs[k];
+  }
+}
 
+
+void transform_bypass_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride)
+{
   for (int y=0;y<nT;y++)
     for (int x=0;x<nT;x++) {
       int32_t c = coeffs[x+y*nT];
@@ -56,7 +233,30 @@ void transform_bypass_8_fallback(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_
       dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
     }
 }
-        
+
+
+void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth)
+{
+  // Bypass mode: coefficients are added to the pixels unscaled, so no shift
+  // value is needed; each sum is passed through Clip_BitDepth() before storing.
+  for (int y=0;y<nT;y++)
+    for (int x=0;x<nT;x++) {
+      uint16_t& pixel = dst[y*stride+x];
+      const int32_t c = coeffs[x+y*nT];
+      pixel = Clip_BitDepth(pixel + c, bit_depth);
+    }
+}
+
+
+void rotate_coefficients_fallback(int16_t *coeff, int nT)
+{
+  // Swap element k with element nT*nT-1-k for all k in the top nT/2 rows.
+  const int last = nT*nT-1, half = (nT>0) ? (nT/2)*nT : 0;
+
+  for (int k=0; k<half; k++) { std::swap(coeff[k], coeff[last-k]); }
+}
+
+
 
 static int8_t mat_8_357[4][4] = {
   { 29, 55, 74, 84 },
@@ -67,7 +267,7 @@ static int8_t mat_8_357[4][4] = {
 
 
 
-void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
 {
   int16_t g[4][4];
 
@@ -79,13 +279,13 @@ void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t
   // --- V ---
 
   for (int c=0;c<4;c++) {
-
+    /*
     logtrace(LogTransform,"DST-V: ");
     for (int r=0;r<4;r++) {
       logtrace(LogTransform,"%d ",coeffs[c+r*4]);
     }
     logtrace(LogTransform,"* -> ");
-
+    */
 
     for (int i=0;i<4;i++) {
       int sum=0;
@@ -97,11 +297,12 @@ void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t
       g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
     }
 
-
+    /*
     for (int y=0;y<4;y++) {
       logtrace(LogTransform,"*%d ",g[y][c]);
     }
     logtrace(LogTransform,"*\n");
+    */
   }
 
 
@@ -109,12 +310,13 @@ void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t
 
   for (int y=0;y<4;y++) {
 
+    /*
     logtrace(LogTransform,"DST-H: ");
     for (int c=0;c<4;c++) {
       logtrace(LogTransform,"%d ",g[y][c]);
     }
     logtrace(LogTransform,"* -> ");
-
+    */
 
     for (int i=0;i<4;i++) {
       int sum=0;
@@ -135,156 +337,876 @@ void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t
 }
 
 
-
-static int8_t mat_dct[32][32] = {
-  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-  { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4,      -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
-  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90,     -90,-87,-80,-70,-57,-43,-25, -9,  9, 25, 43, 57, 70, 80, 87, 90},
-  { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13,      13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4,-22,-46,-67,-82,-90},
-  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89,      89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
-  { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22,     -22,-61,-85,-90,-73,-38,  4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
-  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87,     -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43,  9, 57, 87},
-  { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31,      31, 78, 90, 61,  4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
-  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83,      83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
-  { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67,  4, 73, 88, 38,     -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
-  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80,     -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70,  9, 80},
-  { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46,      46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82,  4,-78},
-  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75,      75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
-  { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54,     -54,-85,  4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
-  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70,     -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90,  9,-87,-43, 70},
-  { 67,-54,-78, 38, 85,-22,-90,  4, 90, 13,-88,-31, 82, 46,-73,-61,      61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
-  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64,      64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
-  { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67,     -67,-54, 78, 38,-85,-22, 90,  4,-90, 13, 88,-31,-82, 46, 73,-61},
-  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57,     -57, 80, 25,-90,  9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
-  { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73,      73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88,  4, 85,-54},
-  { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50,      50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
-  { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82,  4, 78,     -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
-  { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43,     -43, 90,-57,-25, 87,-70, -9, 80,-80,  9, 70,-87, 25, 57,-90, 43},
-  { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82,      82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67,  4,-73, 88,-38},
-  { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36,      36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
-  { 31,-78, 90,-61,  4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85,     -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
-  { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25,     -25, 70,-90, 80,-43, -9, 57,-87, 87,-57,  9, 43,-80, 90,-70, 25},
-  { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88,      88,-67, 31, 13,-54, 82,-90, 78,-46,  4, 38,-73, 90,-85, 61,-22},
-  { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18,      18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
-  { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31,  4, 22,-46, 67,-82, 90,     -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
-  {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9,      -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25,  9},
-  {  4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90,      90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
-};
-
-
-
-
-static void transform_dct_add_8(uint8_t *dst, ptrdiff_t stride,
-                                int nT, int16_t *coeffs)
+void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride,
+                                        int bit_depth)
 {
-  int postShift = 20-8; // 8 bit
-  int rnd1 = 1<<(7-1);
-  int rnd2 = 1<<(postShift-1);
-  int fact = (1<<(5-Log2(nT)));
-
-  int16_t g[32*32];  // actually, only [nT*nT] used
-
-  // TODO: valgrind reports that dst[] contains uninitialized data.
-  // Probably from intra-prediction.
+  int16_t g[4][4];
 
-  /*
-  for (int i=0;i<nT*nT;i++) {
-    printf("%d\n",coeffs[i]);
-  }
+  int postShift = 20-bit_depth;
+  int rndV = 1<<(7-1);
+  int rndH = 1<<(postShift-1);
 
-  for (int y=0;y<nT;y++) {
-    for (int i=0;i<nT;i++) {
-      printf("%d ",dst[y*stride+i]);
-    }
-  }
-  printf("\n");
-  */
 
-  for (int c=0;c<nT;c++) {
+  // --- V ---
 
-    logtrace(LogTransform,"DCT-V: ");
-    for (int i=0;i<nT;i++) {
-      logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
+  for (int c=0;c<4;c++) {
+    /*
+    logtrace(LogTransform,"DST-V: ");
+    for (int r=0;r<4;r++) {
+      logtrace(LogTransform,"%d ",coeffs[c+r*4]);
     }
     logtrace(LogTransform,"* -> ");
+    */
 
+    for (int i=0;i<4;i++) {
+      int sum=0;
 
-    // find last non-zero coefficient to reduce computations carried out in DCT
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[j][i] * coeffs[c+j*4];
+      }
 
-    int lastCol = nT-1;
-    for (;lastCol>=0;lastCol--) {
-      if (coeffs[c+lastCol*nT]) { break; }
+      g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
     }
 
-    for (int i=0;i<nT;i++) {
-      int sum=0;
-      
-      for (int j=0;j<=lastCol /*nT*/;j++) {
-        sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
-      }
-      
-      g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7);
-
-      logtrace(LogTransform,"*%d ",g[c+i*nT]);
+    /*
+    for (int y=0;y<4;y++) {
+      logtrace(LogTransform,"*%d ",g[y][c]);
     }
     logtrace(LogTransform,"*\n");
+    */
   }
 
 
-  for (int y=0;y<nT;y++) {
-
-    logtrace(LogTransform,"DCT-H: ");
-    for (int i=0;i<nT;i++) {
-      logtrace(LogTransform,"*%d ",g[i+y*nT]);
-    }
-    logtrace(LogTransform,"* -> ");
-
+  // --- H ---
 
-    // find last non-zero coefficient to reduce computations carried out in DCT
+  for (int y=0;y<4;y++) {
 
-    int lastCol = nT-1;
-    for (;lastCol>=0;lastCol--) {
-      if (g[y*nT+lastCol]) { break; }
+    /*
+    logtrace(LogTransform,"DST-H: ");
+    for (int c=0;c<4;c++) {
+      logtrace(LogTransform,"%d ",g[y][c]);
     }
+    logtrace(LogTransform,"* -> ");
+    */
 
-
-    for (int i=0;i<nT;i++) {
+    for (int i=0;i<4;i++) {
       int sum=0;
-      
-      for (int j=0;j<=lastCol /*nT*/;j++) {
-        sum += mat_dct[fact*j][i] * g[y*nT+j];
+
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[j][i] * g[y][j];
       }
-      
-      //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift);
-      int out = (sum+rnd2)>>postShift;
 
-      //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i);
-      //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i]));
-      dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
+      int out = Clip3(-32768,32767, (sum+rndH)>>postShift);
+
+      dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth);
 
       logtrace(LogTransform,"*%d ",out);
     }
+
     logtrace(LogTransform,"*\n");
   }
 }
 
 
-void transform_4x4_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
 {
-  transform_dct_add_8(dst,stride,  4, coeffs);
-}
+  int16_t g[4*4];
+  // Two-pass 4x4 transform using mat_8_357; g holds the first-pass result.
+  int BD = 8; // bit depth is fixed at 8 here
+  int shift1 = Log2(4) + BD -9; // right shift after the first (V) pass
+  int shift2 = Log2(4) + 6; // right shift after the second (H) pass
+
+  int rnd1 = 1<<(shift1-1);
+  int rnd2 = 1<<(shift2-1);
+
+
+  // --- V --- (reads input with 'stride', writes g[c+4*i] clipped to 16 bits)
+
+  for (int c=0;c<4;c++) {
+
+    /*
+    logtrace(LogTransform,"DST-V: ");
+    for (int r=0;r<4;r++) {
+      logtrace(LogTransform,"%d ",coeffs[c+r*4]);
+    }
+    logtrace(LogTransform,"* -> ");
+    */
+
+    for (int i=0;i<4;i++) {
+      int sum=0;
+
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[i][j] * input[c+j*stride];
+      }
+
+      g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1);
+    }
+  }
+
+
+  // --- H --- (reads g row by row, writes the packed 4x4 coeffs block)
+
+  for (int y=0;y<4;y++) {
+    for (int i=0;i<4;i++) {
+      int sum=0;
+
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[i][j] * g[y*4+j];
+      }
+
+      // TODO: do we need clipping ?
+      int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift);
+
+      coeffs[y*4+i] = out; // stored unclipped; the assignment narrows int to int16_t
+
+      logtrace(LogTransform,"*%d ",out);
+    }
+
+    logtrace(LogTransform,"*\n");
+  }
+}
+
+
+void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
+{
+  // 32-bit intermediates: the clip range below follows max_coeff_bits and
+  // would not fit into int16_t once max_coeff_bits exceeds 15.
+  int32_t g[4][4];
+
+  int rndV = 1<<(7-1);
+  int rndH = 1<<(bdShift-1);
+
+  int CoeffMax = (1<<max_coeff_bits)-1;
+  int CoeffMin = -(1<<max_coeff_bits);
+
+  // --- V --- (column pass: fixed 7-bit shift, clipped to the coefficient range)
+
+  for (int c=0;c<4;c++) {
+    for (int i=0;i<4;i++) {
+      int sum=0;
+
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[j][i] * coeffs[c+j*4];
+      }
+
+      g[i][c] = Clip3(CoeffMin,CoeffMax, (sum+rndV)>>7);
+    }
+  }
+
+  // --- H --- (row pass: round-shift by bdShift, result stored unclipped)
+
+  for (int y=0;y<4;y++) {
+    for (int i=0;i<4;i++) {
+      int sum=0;
+
+      for (int j=0;j<4;j++) {
+        sum += mat_8_357[j][i] * g[y][j];
+      }
+
+      dst[y*4+i] = (sum + rndH)>>bdShift;
+    }
+  }
+}
+
+
+
+static int8_t mat_dct[32][32] = {
+  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
+  { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4,      -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
+  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90,     -90,-87,-80,-70,-57,-43,-25, -9,  9, 25, 43, 57, 70, 80, 87, 90},
+  { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13,      13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4,-22,-46,-67,-82,-90},
+  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89,      89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
+  { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22,     -22,-61,-85,-90,-73,-38,  4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
+  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87,     -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43,  9, 57, 87},
+  { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31,      31, 78, 90, 61,  4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
+  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83,      83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
+  { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67,  4, 73, 88, 38,     -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
+  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80,     -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70,  9, 80},
+  { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46,      46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82,  4,-78},
+  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75,      75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
+  { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54,     -54,-85,  4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
+  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70,     -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90,  9,-87,-43, 70},
+  { 67,-54,-78, 38, 85,-22,-90,  4, 90, 13,-88,-31, 82, 46,-73,-61,      61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
+  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64,      64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
+  { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67,     -67,-54, 78, 38,-85,-22, 90,  4,-90, 13, 88,-31,-82, 46, 73,-61},
+  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57,     -57, 80, 25,-90,  9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
+  { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73,      73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88,  4, 85,-54},
+  { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50,      50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
+  { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82,  4, 78,     -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
+  { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43,     -43, 90,-57,-25, 87,-70, -9, 80,-80,  9, 70,-87, 25, 57,-90, 43},
+  { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82,      82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67,  4,-73, 88,-38},
+  { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36,      36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
+  { 31,-78, 90,-61,  4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85,     -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
+  { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25,     -25, 70,-90, 80,-43, -9, 57,-87, 87,-57,  9, 43,-80, 90,-70, 25},
+  { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88,      88,-67, 31, 13,-54, 82,-90, 78,-46,  4, 38,-73, 90,-85, 61,-22},
+  { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18,      18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
+  { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31,  4, 22,-46, 67,-82, 90,     -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
+  {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9,      -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25,  9},
+  {  4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90,      90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
+};
+
+
+
+
+template <class pixel_t>
+void transform_idct_add(pixel_t *dst, ptrdiff_t stride,
+                        int nT, const int16_t *coeffs, int bit_depth)
+{
+  /*
+    The effective shift is
+    7 bits right for bit-depth 8,
+    6 bits right for bit-depth 9,
+    5 bits right for bit-depth 10.
+
+    Computation is independent of the block size.
+    Each multiplication with the table includes a left shift of 6 bits.
+    Hence, we have 2* 6 bits = 12 bits left shift.
+    V-pass has fixed 7 bit right shift.
+    H-pass has 20-BitDepth bit right shift;
+
+    Effective shift 's' means: residual value 1 gives DC-coeff (1<<s).
+   */
+
+
+  int postShift = 20-bit_depth;
+  int rnd1 = 1<<(7-1);
+  int rnd2 = 1<<(postShift-1);
+  int fact = (1<<(5-Log2(nT)));
+
+  int16_t g[32*32];  // actually, only [nT*nT] used
+
+  // TODO: valgrind reports that dst[] contains uninitialized data.
+  // Probably from intra-prediction.
+
+  /*
+  for (int i=0;i<nT*nT;i++) {
+    printf("%d\n",coeffs[i]);
+  }
+
+  for (int y=0;y<nT;y++) {
+    for (int i=0;i<nT;i++) {
+      printf("%d ",dst[y*stride+i]);
+    }
+  }
+  printf("\n");
+  */
+
+  /*
+  printf("--- input\n");
+  for (int r=0;r<nT;r++, printf("\n"))
+    for (int c=0;c<nT;c++) {
+      printf("%3d ",coeffs[c+r*nT]);
+    }
+  */
+
+  for (int c=0;c<nT;c++) {
+
+    /*
+    logtrace(LogTransform,"DCT-V: ");
+    for (int i=0;i<nT;i++) {
+      logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
+    }
+    logtrace(LogTransform,"* -> ");
+    */
+
+
+    // find last non-zero coefficient to reduce computations carried out in DCT
+
+    int lastCol = nT-1;
+    for (;lastCol>=0;lastCol--) {
+      if (coeffs[c+lastCol*nT]) { break; }
+    }
+
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      /*
+      printf("input: ");
+      for (int j=0;j<nT;j++) {
+        printf("%3d ",coeffs[c+j*nT]);
+      }
+      printf("\n");
+
+      printf("mat: ");
+      for (int j=0;j<nT;j++) {
+        printf("%3d ",mat_dct[fact*j][i]);
+      }
+      printf("\n");
+      */
+
+      for (int j=0;j<=lastCol /*nT*/;j++) {
+        sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
+      }
+
+      g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7);
+
+      logtrace(LogTransform,"*%d ",g[c+i*nT]);
+    }
+    logtrace(LogTransform,"*\n");
+  }
+
+  /*
+  printf("--- temp\n");
+  for (int r=0;r<nT;r++, printf("\n"))
+    for (int c=0;c<nT;c++) {
+      printf("%3d ",g[c+r*nT]);
+    }
+  */
+
+  for (int y=0;y<nT;y++) {
+    /*
+    logtrace(LogTransform,"DCT-H: ");
+    for (int i=0;i<nT;i++) {
+      logtrace(LogTransform,"*%d ",g[i+y*nT]);
+    }
+    logtrace(LogTransform,"* -> ");
+    */
+
+
+    // find last non-zero coefficient to reduce computations carried out in DCT
+
+    int lastCol = nT-1;
+    for (;lastCol>=0;lastCol--) {
+      if (g[y*nT+lastCol]) { break; }
+    }
+
+
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      for (int j=0;j<=lastCol /*nT*/;j++) {
+        sum += mat_dct[fact*j][i] * g[y*nT+j];
+      }
+
+      //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift);
+      int out = (sum+rnd2)>>postShift;
+
+      //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i);
+      //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i]));
+      dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth);
+
+      logtrace(LogTransform,"*%d ",out);
+    }
+    logtrace(LogTransform,"*\n");
+  }
+}
+
+
+
+void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits)
+{
+  /*
+    The effective shift is
+    7 bits right for bit-depth 8,
+    6 bits right for bit-depth 9,
+    5 bits right for bit-depth 10.
+
+    One transformation with raw transform filter values increases range by 2048 (=32*64).
+    This equals 11 bits.
+
+    Computation is independent of the block size.
+    Each multiplication with the table includes a left shift of 6 bits.
+    Hence, we have 2* 6 bits = 12 bits left shift.
+    V-pass has fixed 7 bit right shift.
+    H-pass has 20-BitDepth bit right shift;
+
+    Effective shift 's' means: residual value 1 gives DC-coeff (1<<s).
+   */
+
+
+  int rnd1 = 1<<(7-1);
+  int fact = (1<<(5-Log2(nT)));
+
+  //int bdShift = 20-bit_depth;
+  int rnd2 = 1<<(bdShift-1);
+
+  int16_t g[32*32];  // actually, only [nT*nT] used
+
+  int CoeffMax = (1<<max_coeff_bits)-1;
+  int CoeffMin = -(1<<max_coeff_bits);
+
+  // TODO: valgrind reports that dst[] contains uninitialized data.
+  // Probably from intra-prediction. (Note: this function only writes dst[], it never reads it.)
+
+  /*
+  for (int i=0;i<nT*nT;i++) {
+    printf("%d\n",coeffs[i]);
+  }
+
+  for (int y=0;y<nT;y++) {
+    for (int i=0;i<nT;i++) {
+      printf("%d ",dst[y*stride+i]);
+    }
+  }
+  printf("\n");
+  */
+
+  /*
+  printf("--- input\n");
+  for (int r=0;r<nT;r++, printf("\n"))
+    for (int c=0;c<nT;c++) {
+      printf("%3d ",coeffs[c+r*nT]);
+    }
+  */
+
+  for (int c=0;c<nT;c++) {
+
+    /*
+    logtrace(LogTransform,"DCT-V: ");
+    for (int i=0;i<nT;i++) {
+      logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
+    }
+    logtrace(LogTransform,"* -> ");
+    */
+
+
+    // find last non-zero coefficient to reduce computations carried out in DCT
+
+    int lastCol = nT-1;
+    for (;lastCol>=0;lastCol--) {
+      if (coeffs[c+lastCol*nT]) { break; }
+    }
+
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      /*
+      printf("input: ");
+      for (int j=0;j<nT;j++) {
+        printf("%3d ",coeffs[c+j*nT]);
+      }
+      printf("\n");
+
+      printf("mat: ");
+      for (int j=0;j<nT;j++) {
+        printf("%3d ",mat_dct[fact*j][i]);
+      }
+      printf("\n");
+      */
+
+      for (int j=0;j<=lastCol /*nT*/;j++) {
+        sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
+      }
+
+      g[c+i*nT] = Clip3(CoeffMin,CoeffMax, (sum+rnd1)>>7);
+
+      logtrace(LogTransform,"*%d ",g[c+i*nT]);
+    }
+    logtrace(LogTransform,"*\n");
+  }
+
+  /*
+  printf("--- temp\n");
+  for (int r=0;r<nT;r++, printf("\n"))
+    for (int c=0;c<nT;c++) {
+      printf("%3d ",g[c+r*nT]);
+    }
+  */
+
+  for (int y=0;y<nT;y++) {
+    /*
+    logtrace(LogTransform,"DCT-H: ");
+    for (int i=0;i<nT;i++) {
+      logtrace(LogTransform,"*%d ",g[i+y*nT]);
+    }
+    logtrace(LogTransform,"* -> ");
+    */
+
+
+    // find last non-zero coefficient to reduce computations carried out in DCT
+
+    int lastCol = nT-1;
+    for (;lastCol>=0;lastCol--) {
+      if (g[y*nT+lastCol]) { break; }
+    }
+
+
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      for (int j=0;j<=lastCol /*nT*/;j++) {
+        sum += mat_dct[fact*j][i] * g[y*nT+j];
+      }
+
+      dst[y*nT+i] = (sum + rnd2)>>bdShift;
+
+      logtrace(LogTransform,"*%d ",sum);
+    }
+    logtrace(LogTransform,"*\n");
+  }
+}
+
+
+void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
+{
+  transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits);
+}
+
+void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
+{
+  transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits);
+}
+
+void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs,
+                                   int bdShift, int max_coeff_bits)
+{
+  transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits);
+}
+
+void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs,
+                                   int bdShift, int max_coeff_bits)
+{
+  transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits);
+}
+
+
+
+
+void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
+{
+  transform_idct_add<uint8_t>(dst,stride,  4, coeffs, 8);
+}
+
+void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
+{
+  transform_idct_add<uint8_t>(dst,stride,  8, coeffs, 8);
+}
+
+void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
+{
+  transform_idct_add<uint8_t>(dst,stride,  16, coeffs, 8);
+}
+
+void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
+{
+  transform_idct_add<uint8_t>(dst,stride,  32, coeffs, 8);
+}
+
+
+void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
+{
+  transform_idct_add<uint16_t>(dst,stride,  4, coeffs, bit_depth);
+}
+
+void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
+{
+  transform_idct_add<uint16_t>(dst,stride,  8, coeffs, bit_depth);
+}
+
+void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
+{
+  transform_idct_add<uint16_t>(dst,stride,  16, coeffs, bit_depth);
+}
+
+void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
+{
+  transform_idct_add<uint16_t>(dst,stride,  32, coeffs, bit_depth);
+}
+
+
+static void transform_fdct_8(int16_t* coeffs, int nT,
+                             const int16_t *input, ptrdiff_t stride)
+{
+  /*
+    Each sum over a basis vector sums nT elements, which is compensated by
+    shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT.
+    Do this in each of the H/V passes.
+
+    Each multiplication with the table includes a left shift of 6 bits.
+    Hence, we have in total 2* 6 bits = 12 bits left shift because of the
+    multiplications.
+
+    We carry out shifts after each pass:
+    First (V) pass has BitDepth-9 bit right shift,
+    Second (H) pass has fixed 6 bit right shift.
+
+    For bit-depth 8, the total shift is 7 bits left.
+    For bit-depth 9, the total shift is 6 bits left.
+    For bit-depth 10, the total shift is 5 bits left.
+
+    I.e.: a constant residual value 1 gives DC-coeff (1<<s).
+
+    For 8-bit images in a 32x32 block, the input are 8 bits + 1 sign bit.
+    After the first pass, we need 9+5+6=20 bits for the intermediate sum
+    (9 bit input, 5 bit because we sum 2^5 elements, 6 bit because of multiplication with 64).
+    The first pass shift is Log2(32) - 1 -> 4 bits and we are down to 16 bits again.
+    After the second pass, we need 16+5+6=27 bits for the intermediate sum
+    (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication).
+    The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits.
+
+    For larger input bit-depths, the intermediate result after the first pass
+    will be wider accordingly, but the widths after the shifts are the same.
+  */
+
+  int BitDepth = 8;
+
+  //          / compensate everything | / effective word length |
+  int shift1 = Log2(nT) + 6 + BitDepth  - 15;
+  int shift2 = Log2(nT) + 6;
+
+  int rnd1 = 1<<(shift1-1);
+  int rnd2 = 1<<(shift2-1);
+  int fact = (1<<(5-Log2(nT)));
+
+  int16_t g[32*32];  // actually, only [nT*nT] used
+
+  for (int c=0;c<nT;c++) {
+
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      for (int j=0;j<nT;j++) {
+        sum += mat_dct[fact*i][j] * input[c+j*stride];
+      }
+
+      //assert((sum+rnd1)>>shift1 <=  32767);
+      //assert((sum+rnd1)>>shift1 >= -32768);
+      g[c+i*nT] = (sum+rnd1)>>shift1; // clipping to -32768;32767 unnecessary
+    }
+  }
+
+
+  for (int y=0;y<nT;y++) {
+    for (int i=0;i<nT;i++) {
+      int sum=0;
+
+      for (int j=0;j<nT;j++) {
+        sum += mat_dct[fact*i][j] * g[y*nT+j];
+      }
+
+      // no clipping to -32768;32767 required
+      int out = (sum+rnd2)>>shift2;
+
+      coeffs[y*nT+i] = out;
+    }
+  }
+}
+
+
+void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  transform_fdct_8(coeffs, 4, input,stride);
+}
+
+void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  transform_fdct_8(coeffs, 8, input,stride);
+}
+
+void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  transform_fdct_8(coeffs, 16, input,stride);
+}
+
+void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  transform_fdct_8(coeffs, 32, input,stride);
+}
+
+
+
+
+void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride)
+{
+  int16_t tmp[32*32];
+
+  // row transforms
+
+  //printMatrix("input",input,n);
+
+  int16_t am[32],bm[32];
+  int16_t *a = am, *b = bm;
+  for (int row=0;row<n;row++) {
+    int rs = row*stride;
+    for (int i=0;i<(n>>1);i++) {
+      a[       i] = input[i+rs] + input[i+(n>>1)+rs];
+      a[(n>>1)+i] = input[i+rs] - input[i+(n>>1)+rs];
+    }
+
+    int iOuter=(n>>1);
+    int nInner=(n>>2);
+
+    while (nInner>=2) {
+      std::swap(a,b);
+
+      for (int k=0;k<n;k+=iOuter) {
+        for (int i=0;i<nInner;i++) {
+          a[k+i       ] = b[k+i] + b[k+i+nInner];
+          a[k+i+nInner] = b[k+i] - b[k+i+nInner];
+        }
+      }
+
+      iOuter>>=1;
+      nInner>>=1;
+    }
+
+    for (int k=0;k<n;k+=2) {
+      tmp[k  +n*row] = a[k] + a[k+1];
+      tmp[k+1+n*row] = a[k] - a[k+1];
+    }
+  }
+
+  //printMatrix("tmp",tmp,n);
+
+  // column transforms
+
+  for (int col=0;col<n;col++) {
+    for (int i=0;i<(n>>1);i++) {
+      a[       i] = tmp[i*n+col] + tmp[(i+(n>>1))*n+col];
+      a[(n>>1)+i] = tmp[i*n+col] - tmp[(i+(n>>1))*n+col];
+    }
+
+    int iOuter=(n>>1);
+    int nInner=(n>>2);
+
+    while (nInner>=2) {
+      std::swap(a,b);
+
+      for (int k=0;k<n;k+=iOuter) {
+        for (int i=0;i<nInner;i++) {
+          a[k+i       ] = b[k+i] + b[k+i+nInner];
+          a[k+i+nInner] = b[k+i] - b[k+i+nInner];
+        }
+      }
+
+      iOuter>>=1;
+      nInner>>=1;
+    }
+
+    for (int k=0;k<n;k+=2) {
+      coeffs[col+(k  )*n] = a[k] + a[k+1];
+      coeffs[col+(k+1)*n] = a[k] - a[k+1];
+    }
+  }
+
+  //printMatrix("coeffs",coeffs,n);
+}
+
+
+void hadamard_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  int16_t tmp[4*4];
+
+  // row transforms
+
+  //printMatrix("input",input,4);
+
+  int16_t a[4];
+  for (int row=0;row<4;row++) {
+    int rs = row*stride;
+    a[0] = input[0+rs] + input[2+rs];
+    a[1] = input[1+rs] + input[3+rs];
+    a[2] = input[0+rs] - input[2+rs];
+    a[3] = input[1+rs] - input[3+rs];
+
+    tmp[0+4*row] = a[0]+a[1];
+    tmp[1+4*row] = a[0]-a[1];
+    tmp[2+4*row] = a[2]+a[3];
+    tmp[3+4*row] = a[2]-a[3];
+  }
+
+  //printMatrix("tmp",tmp,4);
+
+  // column transforms
+
+  for (int col=0;col<4;col++) {
+    a[0] = tmp[col+0*4] + tmp[col+2*4];
+    a[1] = tmp[col+1*4] + tmp[col+3*4];
+    a[2] = tmp[col+0*4] - tmp[col+2*4];
+    a[3] = tmp[col+1*4] - tmp[col+3*4];
+
+    coeffs[col+0*4] = a[0]+a[1];
+    coeffs[col+1*4] = a[0]-a[1];
+    coeffs[col+2*4] = a[2]+a[3];
+    coeffs[col+3*4] = a[2]-a[3];
+  }
+
+  //printMatrix("coeffs",coeffs,4);
+}
+
+
+void hadamard_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
+{
+  int16_t tmp[8*8];
+
+  // row transforms
+
+  //printMatrix("input",input,8);
+
+  int16_t a[8],b[8];
+  for (int row=0;row<8;row++) {
+    int rs = row*stride;
+    a[0] = input[0+rs] + input[4+rs];
+    a[1] = input[1+rs] + input[5+rs];
+    a[2] = input[2+rs] + input[6+rs];
+    a[3] = input[3+rs] + input[7+rs];
+    a[4] = input[0+rs] - input[4+rs];
+    a[5] = input[1+rs] - input[5+rs];
+    a[6] = input[2+rs] - input[6+rs];
+    a[7] = input[3+rs] - input[7+rs];
+
+    b[0] = a[0]+a[2];
+    b[1] = a[1]+a[3];
+    b[2] = a[0]-a[2];
+    b[3] = a[1]-a[3];
+    b[4] = a[4]+a[6];
+    b[5] = a[5]+a[7];
+    b[6] = a[4]-a[6];
+    b[7] = a[5]-a[7];
+
+    tmp[0+8*row] = b[0]+b[1];
+    tmp[1+8*row] = b[0]-b[1];
+    tmp[2+8*row] = b[2]+b[3];
+    tmp[3+8*row] = b[2]-b[3];
+    tmp[4+8*row] = b[4]+b[5];
+    tmp[5+8*row] = b[4]-b[5];
+    tmp[6+8*row] = b[6]+b[7];
+    tmp[7+8*row] = b[6]-b[7];
+  }
+
+  //printMatrix("tmp",tmp,8);
+
+  // column transforms
+
+  for (int col=0;col<8;col++) {
+    a[0] = tmp[col+0*8] + tmp[col+4*8];
+    a[1] = tmp[col+1*8] + tmp[col+5*8];
+    a[2] = tmp[col+2*8] + tmp[col+6*8];
+    a[3] = tmp[col+3*8] + tmp[col+7*8];
+    a[4] = tmp[col+0*8] - tmp[col+4*8];
+    a[5] = tmp[col+1*8] - tmp[col+5*8];
+    a[6] = tmp[col+2*8] - tmp[col+6*8];
+    a[7] = tmp[col+3*8] - tmp[col+7*8];
+
+    b[0] = a[0]+a[2];
+    b[1] = a[1]+a[3];
+    b[2] = a[0]-a[2];
+    b[3] = a[1]-a[3];
+    b[4] = a[4]+a[6];
+    b[5] = a[5]+a[7];
+    b[6] = a[4]-a[6];
+    b[7] = a[5]-a[7];
+
+    coeffs[col+0*8] = b[0]+b[1];
+    coeffs[col+1*8] = b[0]-b[1];
+    coeffs[col+2*8] = b[2]+b[3];
+    coeffs[col+3*8] = b[2]-b[3];
+    coeffs[col+4*8] = b[4]+b[5];
+    coeffs[col+5*8] = b[4]-b[5];
+    coeffs[col+6*8] = b[6]+b[7];
+    coeffs[col+7*8] = b[6]-b[7];
+  }
+
+  //printMatrix("coeffs",coeffs,8);
+}
 
-void transform_8x8_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-  transform_dct_add_8(dst,stride,  8, coeffs);
-}
 
-void transform_16x16_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+void hadamard_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
 {
-  transform_dct_add_8(dst,stride,  16, coeffs);
+  hadamard_transform_8(coeffs,16, input,stride);
 }
 
-void transform_32x32_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+void hadamard_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
 {
-  transform_dct_add_8(dst,stride,  32, coeffs);
+  hadamard_transform_8(coeffs,32, input,stride);
 }
diff --git a/libde265/fallback-dct.h b/libde265/fallback-dct.h
index fc5bfdb..83d25c1 100644
--- a/libde265/fallback-dct.h
+++ b/libde265/fallback-dct.h
@@ -24,14 +24,73 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "util.h"
 
-void transform_skip_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void transform_bypass_8_fallback(uint8_t *dst, int16_t *coeffs, int nT, ptrdiff_t stride);
 
-void transform_4x4_luma_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void transform_4x4_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void transform_8x8_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void transform_16x16_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void transform_32x32_add_8_fallback(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+// --- decoding ---
+
+void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void transform_bypass_fallback(int32_t *r, const int16_t *coeffs, int nT);
+
+void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride);
+void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride);
+void transform_bypass_rdpcm_v_fallback(int32_t *r, const int16_t *coeffs,int nT);
+void transform_bypass_rdpcm_h_fallback(int32_t *r, const int16_t *coeffs,int nT);
+
+void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+
+
+void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth);
+
+void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth);
+
+void rotate_coefficients_fallback(int16_t *coeff, int nT);
+
+
+void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits);
+
+template <class pixel_t>
+void add_residual_fallback(pixel_t *dst, ptrdiff_t stride,
+                           const int32_t* r, int nT, int bit_depth)
+{
+  for (int y=0;y<nT;y++)
+    for (int x=0;x<nT;x++) {
+      dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + r[y*nT+x], bit_depth);
+    }
+}
+
+
+void rdpcm_v_fallback(int32_t* residual, const int16_t* coeffs, int nT, int tsShift,int bdShift);
+void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT, int tsShift,int bdShift);
+
+void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT,
+                                      int tsShift,int bdShift);
+
+
+// --- encoding ---
+
+void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+
+void hadamard_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void hadamard_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void hadamard_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
+void hadamard_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride);
 
 #endif
diff --git a/libde265/fallback-motion.cc b/libde265/fallback-motion.cc
index 1e959e2..958e4b6 100644
--- a/libde265/fallback-motion.cc
+++ b/libde265/fallback-motion.cc
@@ -31,7 +31,7 @@
 
 
 void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
-                                    int16_t *src, ptrdiff_t srcstride,
+                                    const int16_t *src, ptrdiff_t srcstride,
                                     int width, int height)
 {
   int offset8bit = 32;
@@ -40,7 +40,7 @@ void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
   assert((width&1)==0);
 
   for (int y=0;y<height;y++) {
-    int16_t* in  = &src[y*srcstride];
+    const int16_t* in  = &src[y*srcstride];
     uint8_t* out = &dst[y*dststride];
 
     for (int x=0;x<width;x+=2) {
@@ -53,7 +53,7 @@ void put_unweighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
 
 
 void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
-                                  int16_t *src, ptrdiff_t srcstride,
+                                  const int16_t *src, ptrdiff_t srcstride,
                                   int width, int height,
                                   int w,int o,int log2WD)
 {
@@ -62,7 +62,7 @@ void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
   const int rnd = (1<<(log2WD-1));
 
   for (int y=0;y<height;y++) {
-    int16_t* in  = &src[y*srcstride];
+    const int16_t* in  = &src[y*srcstride];
     uint8_t* out = &dst[y*dststride];
 
     for (int x=0;x<width;x++) {
@@ -73,7 +73,7 @@ void put_weighted_pred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
 }
 
 void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
-                                    int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                    const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                     int width, int height,
                                     int w1,int o1, int w2,int o2, int log2WD)
 {
@@ -82,8 +82,8 @@ void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
   const int rnd = ((o1+o2+1) << log2WD);
 
   for (int y=0;y<height;y++) {
-    int16_t* in1 = &src1[y*srcstride];
-    int16_t* in2 = &src2[y*srcstride];
+    const int16_t* in1 = &src1[y*srcstride];
+    const int16_t* in2 = &src2[y*srcstride];
     uint8_t* out = &dst[y*dststride];
 
     for (int x=0;x<width;x++) {
@@ -95,7 +95,7 @@ void put_weighted_bipred_8_fallback(uint8_t *dst, ptrdiff_t dststride,
 
 
 void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
-                                      int16_t *src1, int16_t *src2,
+                                      const int16_t *src1, const int16_t *src2,
                                       ptrdiff_t srcstride, int width,
                                       int height)
 {
@@ -144,8 +144,8 @@ void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
 #endif
     {
       for (int y=0;y<height;y++) {
-        int16_t* in1 = &src1[y*srcstride];
-        int16_t* in2 = &src2[y*srcstride];
+        const int16_t* in1 = &src1[y*srcstride];
+        const int16_t* in2 = &src2[y*srcstride];
         uint8_t* out = &dst[y*dststride];
 
         for (int x=0;x<width;x+=2) {
@@ -159,8 +159,103 @@ void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
 
 
 
+
+
+void put_unweighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
+                                     const int16_t *src, ptrdiff_t srcstride,
+                                     int width, int height, int bit_depth)
+{
+  int shift1 = 14-bit_depth;
+  int offset1 = 0;
+  if (shift1>0) { offset1 = 1<<(shift1-1); }
+
+  assert((width&1)==0);
+
+  for (int y=0;y<height;y++) {
+    const int16_t* in  = &src[y*srcstride];
+    uint16_t* out = &dst[y*dststride];
+
+    for (int x=0;x<width;x+=2) {
+      out[0] = Clip_BitDepth((in[0] + offset1)>>shift1, bit_depth);
+      out[1] = Clip_BitDepth((in[1] + offset1)>>shift1, bit_depth);
+      out+=2; in+=2;
+    }
+  }
+}
+
+#include <stdlib.h>
+
+void put_weighted_pred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
+                                   const int16_t *src, ptrdiff_t srcstride,
+                                   int width, int height,
+                                   int w,int o,int log2WD, int bit_depth)
+{
+  assert(log2WD>=1); // TODO
+
+  const int rnd = (1<<(log2WD-1));
+
+  for (int y=0;y<height;y++) {
+    const int16_t* in  = &src[y*srcstride];
+    uint16_t* out = &dst[y*dststride];
+
+    for (int x=0;x<width;x++) {
+      out[0] = Clip_BitDepth(((in[0]*w + rnd)>>log2WD) + o, bit_depth);
+      out++; in++;
+    }
+  }
+}
+
+void put_weighted_bipred_16_fallback(uint16_t *dst, ptrdiff_t dststride,
+                                     const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                     int width, int height,
+                                     int w1,int o1, int w2,int o2, int log2WD, int bit_depth)
+{
+  assert(log2WD>=1); // TODO
+
+  const int rnd = ((o1+o2+1) << log2WD);
+
+  for (int y=0;y<height;y++) {
+    const int16_t* in1 = &src1[y*srcstride];
+    const int16_t* in2 = &src2[y*srcstride];
+    uint16_t* out = &dst[y*dststride];
+
+    for (int x=0;x<width;x++) {
+      out[0] = Clip_BitDepth((in1[0]*w1 + in2[0]*w2 + rnd)>>(log2WD+1), bit_depth);
+      out++; in1++; in2++;
+    }
+  }
+}
+
+
+void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride,
+                                       const int16_t *src1, const int16_t *src2,
+                                       ptrdiff_t srcstride, int width,
+                                       int height, int bit_depth)
+{
+  int shift2 = 15-bit_depth;
+  int offset2 = 1<<(shift2-1);
+
+  assert((width&1)==0);
+
+  for (int y=0;y<height;y++) {
+    const int16_t* in1 = &src1[y*srcstride];
+    const int16_t* in2 = &src2[y*srcstride];
+    uint16_t* out = &dst[y*dststride];
+
+    for (int x=0;x<width;x+=2) {
+      out[0] = Clip_BitDepth((in1[0] + in2[0] + offset2)>>shift2, bit_depth);
+      out[1] = Clip_BitDepth((in1[1] + in2[1] + offset2)>>shift2, bit_depth);
+      out+=2; in1+=2; in2+=2;
+    }
+  }
+}
+
+
+
+
+
 void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
-                         uint8_t *src, ptrdiff_t src_stride,
+                         const uint8_t *src, ptrdiff_t src_stride,
                          int width, int height,
                          int mx, int my, int16_t* mcbuffer)
 {
@@ -168,7 +263,7 @@ void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
 
   for (int y=0;y<height;y++) {
     int16_t* o = &out[y*out_stride];
-    uint8_t* i = &src[y*src_stride];
+    const uint8_t* i = &src[y*src_stride];
 
     for (int x=0;x<width;x++) {
       *o = *i << shift3;
@@ -179,12 +274,33 @@ void put_epel_8_fallback(int16_t *out, ptrdiff_t out_stride,
 }
 
 
-void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
-                            uint8_t *src, ptrdiff_t src_stride,
-                            int nPbWC, int nPbHC,
-                            int xFracC, int yFracC, int16_t* mcbuffer)
+void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          int width, int height,
+                          int mx, int my, int16_t* mcbuffer, int bit_depth)
 {
-  const int shift1 = 0;
+  int shift3 = 14 - bit_depth;
+
+  for (int y=0;y<height;y++) {
+    int16_t* o = &out[y*out_stride];
+    const uint16_t* i = &src[y*src_stride];
+
+    for (int x=0;x<width;x++) {
+      *o = *i << shift3;
+      o++;
+      i++;
+    }
+  }
+}
+
+
+template <class pixel_t>
+void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dst_stride,
+                          const pixel_t *src, ptrdiff_t src_stride,
+                          int nPbWC, int nPbHC,
+                          int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth)
+{
+  const int shift1 = bit_depth-8;
   const int shift2 = 6;
   //const int shift3 = 6;
 
@@ -224,7 +340,7 @@ void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
   //printf("---H---(%d)\n",xFracC);
 
   for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
-    uint8_t* p = &src[y*src_stride - extra_left];
+    const pixel_t* p = &src[y*src_stride - extra_left];
 
     for (int x=0;x<nPbWC;x++) {
       int16_t v;
@@ -241,7 +357,7 @@ void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
       }
 
       //printf("%d %d %d %d -> %d\n",p[0],p[1],p[2],p[3],v);
-        
+
       tmp2buf[y+extra_top + x*nPbH_extra] = v;
       p++;
 
@@ -272,7 +388,7 @@ void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
       default:
       case 7: v = (-2*p[0]+10*p[1]+58*p[2]-2*p[3])>>vshift; break;
       }
-        
+
       dst[x + y*dst_stride] = v;
       p++;
     }
@@ -291,10 +407,21 @@ void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dst_stride,
 }
 
 
+template
+void put_epel_hv_fallback<uint8_t>(int16_t *dst, ptrdiff_t dst_stride,
+                                   const uint8_t *src, ptrdiff_t src_stride,
+                                   int nPbWC, int nPbHC,
+                                   int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth);
+template
+void put_epel_hv_fallback<uint16_t>(int16_t *dst, ptrdiff_t dst_stride,
+                                    const uint16_t *src, ptrdiff_t src_stride,
+                                    int nPbWC, int nPbHC,
+                                    int xFracC, int yFracC, int16_t* mcbuffer, int bit_depth);
+
 
 
 void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
+                           const uint8_t *src, ptrdiff_t srcstride,
                            int nPbW, int nPbH, int16_t* mcbuffer)
 {
   //const int shift1 = 0; // sps->BitDepth_Y-8;
@@ -303,14 +430,11 @@ void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
   // straight copy
 
   for (int y=0;y<nPbH;y++) {
-      uint8_t* p = src + srcstride*y;
+      const uint8_t* p = src + srcstride*y;
       int16_t* o = out + out_stride*y;
 
       for (int x=0;x<nPbW;x+=4) {
-#if 0
-        *o = *p << shift2;
-        o++; p++;
-#else
+
         // does not seem to be faster...
         int16_t o0,o1,o2,o3;
         o0 = p[0] << shift2;
@@ -324,20 +448,41 @@ void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
 
         o+=4;
         p+=4;
-#endif
       }
   }
 }
 
 
+void put_qpel_0_0_fallback_16(int16_t *out, ptrdiff_t out_stride,
+                              const uint16_t *src, ptrdiff_t srcstride,
+                              int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth)
+{
+  //const int shift1 = bit_depth-8;
+  //const int shift2 = 6;
+  const int shift3 = 14-bit_depth;
+
+  // straight copy
+
+  for (int y=0;y<nPbH;y++) {
+    const uint16_t* p = src + srcstride*y;
+    int16_t* o = out + out_stride*y;
+
+    for (int x=0;x<nPbW;x++) {
+      *o++ = *p++ << shift3;
+    }
+  }
+}
+
+
 
 static int extra_before[4] = { 0,3,3,2 };
 static int extra_after [4] = { 0,3,4,4 };
 
+template <class pixel_t>
 void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
-                       uint8_t *src, ptrdiff_t srcstride,
+                       const pixel_t *src, ptrdiff_t srcstride,
                        int nPbW, int nPbH, int16_t* mcbuffer,
-                       int xFracL, int yFracL)
+                       int xFracL, int yFracL, int bit_depth)
 {
   int extra_left   = extra_before[xFracL];
   //int extra_right  = extra_after [xFracL];
@@ -347,7 +492,7 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
   //int nPbW_extra = extra_left + nPbW + extra_right;
   int nPbH_extra = extra_top  + nPbH + extra_bottom;
 
-  const int shift1 = 0; // sps->BitDepth_Y-8;
+  const int shift1 = bit_depth-8;
   const int shift2 = 6;
 
 
@@ -356,7 +501,7 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
   switch (xFracL) {
   case 0:
     for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
-      uint8_t* p = src + srcstride*y - extra_left;
+      const pixel_t* p = src + srcstride*y - extra_left;
       int16_t* o = &mcbuffer[y+extra_top];
 
       for (int x=0;x<nPbW;x++) {
@@ -368,7 +513,7 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 1:
     for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
-      uint8_t* p = src + srcstride*y - extra_left;
+      const pixel_t* p = src + srcstride*y - extra_left;
       int16_t* o = &mcbuffer[y+extra_top];
 
       for (int x=0;x<nPbW;x++) {
@@ -380,7 +525,7 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 2:
     for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
-      uint8_t* p = src + srcstride*y - extra_left;
+      const pixel_t* p = src + srcstride*y - extra_left;
       int16_t* o = &mcbuffer[y+extra_top];
 
       for (int x=0;x<nPbW;x++) {
@@ -392,7 +537,7 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 3:
     for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
-      uint8_t* p = src + srcstride*y - extra_left;
+      const pixel_t* p = src + srcstride*y - extra_left;
       int16_t* o = &mcbuffer[y+extra_top];
 
       for (int x=0;x<nPbW;x++) {
@@ -421,9 +566,9 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
   switch (yFracL) {
   case 0:
     for (int x=0;x<nPbW;x++) {
-      int16_t* p = &mcbuffer[x*nPbH_extra];
+      const int16_t* p = &mcbuffer[x*nPbH_extra];
       int16_t* o = &out[x];
-              
+
       for (int y=0;y<nPbH;y++) {
         *o = *p;
         o+=out_stride;
@@ -433,9 +578,9 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 1:
     for (int x=0;x<nPbW;x++) {
-      int16_t* p = &mcbuffer[x*nPbH_extra];
+      const int16_t* p = &mcbuffer[x*nPbH_extra];
       int16_t* o = &out[x];
-              
+
       for (int y=0;y<nPbH;y++) {
         *o = (-p[0]+4*p[1]-10*p[2]+58*p[3]+17*p[4] -5*p[5]  +p[6])>>vshift;
         o+=out_stride;
@@ -445,9 +590,9 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 2:
     for (int x=0;x<nPbW;x++) {
-      int16_t* p = &mcbuffer[x*nPbH_extra];
+      const int16_t* p = &mcbuffer[x*nPbH_extra];
       int16_t* o = &out[x];
-              
+
       for (int y=0;y<nPbH;y++) {
         *o = (-p[0]+4*p[1]-11*p[2]+40*p[3]+40*p[4]-11*p[5]+4*p[6]-p[7])>>vshift;
         o+=out_stride;
@@ -457,9 +602,9 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
     break;
   case 3:
     for (int x=0;x<nPbW;x++) {
-      int16_t* p = &mcbuffer[x*nPbH_extra];
+      const int16_t* p = &mcbuffer[x*nPbH_extra];
       int16_t* o = &out[x];
-              
+
       for (int y=0;y<nPbH;y++) {
         *o = ( p[0]-5*p[1]+17*p[2]+58*p[3]-10*p[4] +4*p[5]  -p[6])>>vshift;
         o+=out_stride;
@@ -480,12 +625,24 @@ void put_qpel_fallback(int16_t *out, ptrdiff_t out_stride,
 }
 
 
+
 #define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride,    \
-                                             uint8_t *src, ptrdiff_t srcstride,     \
-                                             int nPbW, int nPbH, int16_t* mcbuffer) \
-{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y ); }
+                                                             const uint8_t *src, ptrdiff_t srcstride, \
+                                                             int nPbW, int nPbH, int16_t* mcbuffer) \
+  { put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, 8 ); }
+
+
+#define QPEL16(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride,    \
+                                                                  const uint16_t *src, ptrdiff_t srcstride, \
+                                                                  int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth) \
+{ put_qpel_fallback(out,out_stride, src,srcstride, nPbW,nPbH,mcbuffer,x,y, bit_depth ); }
 
 /*     */ QPEL(0,1) QPEL(0,2) QPEL(0,3)
 QPEL(1,0) QPEL(1,1) QPEL(1,2) QPEL(1,3)
 QPEL(2,0) QPEL(2,1) QPEL(2,2) QPEL(2,3)
 QPEL(3,0) QPEL(3,1) QPEL(3,2) QPEL(3,3)
+
+/*       */ QPEL16(0,1) QPEL16(0,2) QPEL16(0,3)
+QPEL16(1,0) QPEL16(1,1) QPEL16(1,2) QPEL16(1,3)
+QPEL16(2,0) QPEL16(2,1) QPEL16(2,2) QPEL16(2,3)
+QPEL16(3,0) QPEL16(3,1) QPEL16(3,2) QPEL16(3,3)
diff --git a/libde265/fallback-motion.h b/libde265/fallback-motion.h
index 24ac4c4..353ff77 100644
--- a/libde265/fallback-motion.h
+++ b/libde265/fallback-motion.h
@@ -26,79 +26,79 @@
 
 
 void put_weighted_pred_avg_8_fallback(uint8_t *dst, ptrdiff_t dststride,
-                                      int16_t *src1, int16_t *src2,
+                                      const int16_t *src1, const int16_t *src2,
                                       ptrdiff_t srcstride, int width,
                                       int height);
 
 void put_unweighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride,
-                                    int16_t *src, ptrdiff_t srcstride,
+                                    const int16_t *src, ptrdiff_t srcstride,
                                     int width, int height);
 
 void put_weighted_pred_8_fallback(uint8_t *_dst, ptrdiff_t dststride,
-                                  int16_t *src, ptrdiff_t srcstride,
+                                  const int16_t *src, ptrdiff_t srcstride,
                                   int width, int height,
                                   int w,int o,int log2WD);
 void put_weighted_bipred_8_fallback(uint8_t *_dst, ptrdiff_t dststride,
-                                    int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                    const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
                                     int width, int height,
                                     int w1,int o1, int w2,int o2, int log2WD);
 
+void put_weighted_pred_avg_16_fallback(uint16_t *dst, ptrdiff_t dststride,
+                                       const int16_t *src1, const int16_t *src2,
+                                       ptrdiff_t srcstride, int width,
+                                       int height, int bit_depth);
+
+void put_unweighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride,
+                                     const int16_t *src, ptrdiff_t srcstride,
+                                     int width, int height, int bit_depth);
+
+void put_weighted_pred_16_fallback(uint16_t *_dst, ptrdiff_t dststride,
+                                   const int16_t *src, ptrdiff_t srcstride,
+                                   int width, int height,
+                                   int w,int o,int log2WD, int bit_depth);
+void put_weighted_bipred_16_fallback(uint16_t *_dst, ptrdiff_t dststride,
+                                     const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                     int width, int height,
+                                     int w1,int o1, int w2,int o2, int log2WD, int bit_depth);
+
+
+
 void put_epel_8_fallback(int16_t *dst, ptrdiff_t dststride,
-                    uint8_t *_src, ptrdiff_t srcstride,
-                    int width, int height,
-                    int mx, int my, int16_t* mcbuffer);
-void put_epel_hv_8_fallback(int16_t *dst, ptrdiff_t dststride,
-                            uint8_t *_src, ptrdiff_t srcstride,
-                            int width, int height,
-                            int mx, int my, int16_t* mcbuffer);
-
-void put_qpel_0_0_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_0_1_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_0_2_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_0_3_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_1_0_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_1_1_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_1_2_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_1_3_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_2_0_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_2_1_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_2_2_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_2_3_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_3_0_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_3_1_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_3_2_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
-void put_qpel_3_3_fallback(int16_t *out, ptrdiff_t out_stride,
-                           uint8_t *src, ptrdiff_t srcstride,
-                           int nPbW, int nPbH, int16_t* mcbuffer);
+                         const uint8_t *_src, ptrdiff_t srcstride,
+                         int width, int height,
+                         int mx, int my, int16_t* mcbuffer);
+
+void put_epel_16_fallback(int16_t *out, ptrdiff_t out_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          int width, int height,
+                          int mx, int my, int16_t* mcbuffer, int bit_depth);
+
+template <class pixel_t>
+void put_epel_hv_fallback(int16_t *dst, ptrdiff_t dststride,
+                          const pixel_t *_src, ptrdiff_t srcstride,
+                          int width, int height,
+                          int mx, int my, int16_t* mcbuffer, int bit_depth);
+
+
+#define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback(int16_t *out, ptrdiff_t out_stride, \
+                           const uint8_t *src, ptrdiff_t srcstride, \
+                           int nPbW, int nPbH, int16_t* mcbuffer)
+QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3);
+QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3);
+QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3);
+QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3);
+
+#undef QPEL
+
+
+#define QPEL(x,y) void put_qpel_ ## x ## _ ## y ## _fallback_16(int16_t *out, ptrdiff_t out_stride, \
+                           const uint16_t *src, ptrdiff_t srcstride, \
+                           int nPbW, int nPbH, int16_t* mcbuffer, int bit_depth)
+QPEL(0,0); QPEL(0,1); QPEL(0,2); QPEL(0,3);
+QPEL(1,0); QPEL(1,1); QPEL(1,2); QPEL(1,3);
+QPEL(2,0); QPEL(2,1); QPEL(2,2); QPEL(2,3);
+QPEL(3,0); QPEL(3,1); QPEL(3,2); QPEL(3,3);
+
+#undef QPEL
 
 #endif
diff --git a/libde265/fallback.cc b/libde265/fallback.cc
index 89c39d0..d0b96a7 100644
--- a/libde265/fallback.cc
+++ b/libde265/fallback.cc
@@ -27,14 +27,19 @@ void init_acceleration_functions_fallback(struct acceleration_functions* accel)
 {
   accel->put_weighted_pred_avg_8 = put_weighted_pred_avg_8_fallback;
   accel->put_unweighted_pred_8   = put_unweighted_pred_8_fallback;
-
   accel->put_weighted_pred_8 = put_weighted_pred_8_fallback;
   accel->put_weighted_bipred_8 = put_weighted_bipred_8_fallback;
 
+  accel->put_weighted_pred_avg_16 = put_weighted_pred_avg_16_fallback;
+  accel->put_unweighted_pred_16   = put_unweighted_pred_16_fallback;
+  accel->put_weighted_pred_16 = put_weighted_pred_16_fallback;
+  accel->put_weighted_bipred_16 = put_weighted_bipred_16_fallback;
+
+
   accel->put_hevc_epel_8    = put_epel_8_fallback;
-  accel->put_hevc_epel_h_8  = put_epel_hv_8_fallback;
-  accel->put_hevc_epel_v_8  = put_epel_hv_8_fallback;
-  accel->put_hevc_epel_hv_8 = put_epel_hv_8_fallback;
+  accel->put_hevc_epel_h_8  = put_epel_hv_fallback<uint8_t>;
+  accel->put_hevc_epel_v_8  = put_epel_hv_fallback<uint8_t>;
+  accel->put_hevc_epel_hv_8 = put_epel_hv_fallback<uint8_t>;
 
   accel->put_hevc_qpel_8[0][0] = put_qpel_0_0_fallback;
   accel->put_hevc_qpel_8[0][1] = put_qpel_0_1_fallback;
@@ -53,11 +58,70 @@ void init_acceleration_functions_fallback(struct acceleration_functions* accel)
   accel->put_hevc_qpel_8[3][2] = put_qpel_3_2_fallback;
   accel->put_hevc_qpel_8[3][3] = put_qpel_3_3_fallback;
 
+  accel->put_hevc_epel_16    = put_epel_16_fallback;
+  accel->put_hevc_epel_h_16  = put_epel_hv_fallback<uint16_t>;
+  accel->put_hevc_epel_v_16  = put_epel_hv_fallback<uint16_t>;
+  accel->put_hevc_epel_hv_16 = put_epel_hv_fallback<uint16_t>;
+
+  accel->put_hevc_qpel_16[0][0] = put_qpel_0_0_fallback_16;
+  accel->put_hevc_qpel_16[0][1] = put_qpel_0_1_fallback_16;
+  accel->put_hevc_qpel_16[0][2] = put_qpel_0_2_fallback_16;
+  accel->put_hevc_qpel_16[0][3] = put_qpel_0_3_fallback_16;
+  accel->put_hevc_qpel_16[1][0] = put_qpel_1_0_fallback_16;
+  accel->put_hevc_qpel_16[1][1] = put_qpel_1_1_fallback_16;
+  accel->put_hevc_qpel_16[1][2] = put_qpel_1_2_fallback_16;
+  accel->put_hevc_qpel_16[1][3] = put_qpel_1_3_fallback_16;
+  accel->put_hevc_qpel_16[2][0] = put_qpel_2_0_fallback_16;
+  accel->put_hevc_qpel_16[2][1] = put_qpel_2_1_fallback_16;
+  accel->put_hevc_qpel_16[2][2] = put_qpel_2_2_fallback_16;
+  accel->put_hevc_qpel_16[2][3] = put_qpel_2_3_fallback_16;
+  accel->put_hevc_qpel_16[3][0] = put_qpel_3_0_fallback_16;
+  accel->put_hevc_qpel_16[3][1] = put_qpel_3_1_fallback_16;
+  accel->put_hevc_qpel_16[3][2] = put_qpel_3_2_fallback_16;
+  accel->put_hevc_qpel_16[3][3] = put_qpel_3_3_fallback_16;
+
+
+
   accel->transform_skip_8 = transform_skip_8_fallback;
-  accel->transform_bypass_8 = transform_bypass_8_fallback;
-  accel->transform_4x4_luma_add_8 = transform_4x4_luma_add_8_fallback;
-  accel->transform_4x4_add_8   = transform_4x4_add_8_fallback;
-  accel->transform_8x8_add_8   = transform_8x8_add_8_fallback;
-  accel->transform_16x16_add_8 = transform_16x16_add_8_fallback;
-  accel->transform_32x32_add_8 = transform_32x32_add_8_fallback;
+  accel->transform_skip_rdpcm_h_8 = transform_skip_rdpcm_h_8_fallback;
+  accel->transform_skip_rdpcm_v_8 = transform_skip_rdpcm_v_8_fallback;
+  accel->transform_bypass = transform_bypass_fallback;
+  accel->transform_bypass_rdpcm_h = transform_bypass_rdpcm_h_fallback;
+  accel->transform_bypass_rdpcm_v = transform_bypass_rdpcm_v_fallback;
+  accel->transform_4x4_dst_add_8 = transform_4x4_luma_add_8_fallback;
+  accel->transform_add_8[0] = transform_4x4_add_8_fallback;
+  accel->transform_add_8[1] = transform_8x8_add_8_fallback;
+  accel->transform_add_8[2] = transform_16x16_add_8_fallback;
+  accel->transform_add_8[3] = transform_32x32_add_8_fallback;
+
+  accel->transform_skip_16 = transform_skip_16_fallback;
+  accel->transform_4x4_dst_add_16 = transform_4x4_luma_add_16_fallback;
+  accel->transform_add_16[0] = transform_4x4_add_16_fallback;
+  accel->transform_add_16[1] = transform_8x8_add_16_fallback;
+  accel->transform_add_16[2] = transform_16x16_add_16_fallback;
+  accel->transform_add_16[3] = transform_32x32_add_16_fallback;
+
+  accel->rotate_coefficients = rotate_coefficients_fallback;
+  accel->add_residual_8  = add_residual_fallback<uint8_t>;
+  accel->add_residual_16 = add_residual_fallback<uint16_t>;
+  accel->rdpcm_h = rdpcm_h_fallback;
+  accel->rdpcm_v = rdpcm_v_fallback;
+  accel->transform_skip_residual = transform_skip_residual_fallback;
+
+  accel->transform_idst_4x4   = transform_idst_4x4_fallback;
+  accel->transform_idct_4x4   = transform_idct_4x4_fallback;
+  accel->transform_idct_8x8   = transform_idct_8x8_fallback;
+  accel->transform_idct_16x16 = transform_idct_16x16_fallback;
+  accel->transform_idct_32x32 = transform_idct_32x32_fallback;
+
+  accel->fwd_transform_4x4_dst_8 = fdst_4x4_8_fallback;
+  accel->fwd_transform_8[0] = fdct_4x4_8_fallback;
+  accel->fwd_transform_8[1] = fdct_8x8_8_fallback;
+  accel->fwd_transform_8[2] = fdct_16x16_8_fallback;
+  accel->fwd_transform_8[3] = fdct_32x32_8_fallback;
+
+  accel->hadamard_transform_8[0] = hadamard_4x4_8_fallback;
+  accel->hadamard_transform_8[1] = hadamard_8x8_8_fallback;
+  accel->hadamard_transform_8[2] = hadamard_16x16_8_fallback;
+  accel->hadamard_transform_8[3] = hadamard_32x32_8_fallback;
 }
diff --git a/libde265/image-io.cc b/libde265/image-io.cc
new file mode 100644
index 0000000..3ceefa8
--- /dev/null
+++ b/libde265/image-io.cc
@@ -0,0 +1,194 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "libde265/image-io.h"
+#include <assert.h>
+
+
+ImageSource_YUV::~ImageSource_YUV()
+{
+  if (mFH) {
+    fclose(mFH);
+  }
+}
+
+
+bool ImageSource_YUV::set_input_file(const char* filename, int w,int h)
+{
+  assert(mFH==NULL);
+
+  mFH = fopen(filename,"rb");
+  if (mFH==NULL) {
+    return false;
+  }
+
+  width =w;
+  height=h;
+  mReachedEndOfFile = false;
+
+  return true;
+}
+
+
+de265_image* ImageSource_YUV::read_next_image()
+{
+  if (mReachedEndOfFile) return NULL;
+
+  de265_image* img = new de265_image;
+  img->alloc_image(width,height,de265_chroma_420, NULL, false,
+                   NULL, NULL, 0, NULL, false);
+  assert(img); // TODO: error handling
+
+  // --- load image ---
+
+  uint8_t* p;
+  int stride;
+
+  p = img->get_image_plane(0);  stride = img->get_image_stride(0);
+  for (int y=0;y<height;y++) {
+    if (fread(p+y*stride,1,width,mFH) != width) {
+      goto check_eof;
+    }
+  }
+
+  p = img->get_image_plane(1);  stride = img->get_image_stride(1);
+  for (int y=0;y<height/2;y++) {
+    if (fread(p+y*stride,1,width/2,mFH) != width/2) {
+      goto check_eof;
+    }
+  }
+
+  p = img->get_image_plane(2);  stride = img->get_image_stride(2);
+  for (int y=0;y<height/2;y++) {
+    if (fread(p+y*stride,1,width/2,mFH) != width/2) {
+      goto check_eof;
+    }
+  }
+
+  // --- check for EOF ---
+
+check_eof:
+  if (feof(mFH)) {
+    mReachedEndOfFile = true;
+    delete img;
+    return NULL;
+  }
+  else {
+    return img;
+  }
+}
+
+
+/*
+ImageSource::ImageStatus  ImageSource_YUV::get_status()
+{
+  return Available;
+}
+*/
+
+de265_image* ImageSource_YUV::get_image(bool block)
+{
+  de265_image* img = read_next_image();
+  return img;
+}
+
+
+void ImageSource_YUV::skip_frames(int n)
+{
+  int imageSize = width*height*3/2;
+  fseek(mFH,n * imageSize, SEEK_CUR);
+}
+
+
+
+
+ImageSink_YUV::~ImageSink_YUV()
+{
+  if (mFH) {
+    fclose(mFH);
+  }
+}
+
+bool ImageSink_YUV::set_filename(const char* filename)
+{
+  assert(mFH==NULL);
+
+  mFH = fopen(filename,"wb");
+
+  return true;
+}
+
+void ImageSink_YUV::send_image(const de265_image* img)
+{
+  // --- write image ---
+
+  const uint8_t* p;
+  int stride;
+
+  int width = img->get_width();
+  int height= img->get_height();
+
+  p = img->get_image_plane(0);  stride = img->get_image_stride(0);
+  for (int y=0;y<height;y++) {
+    fwrite(p+y*stride,1,width,mFH);
+  }
+
+  p = img->get_image_plane(1);  stride = img->get_image_stride(1);
+  for (int y=0;y<height/2;y++) {
+    fwrite(p+y*stride,1,width/2,mFH);
+  }
+
+  p = img->get_image_plane(2);  stride = img->get_image_stride(2);
+  for (int y=0;y<height/2;y++) {
+    fwrite(p+y*stride,1,width/2,mFH);
+  }
+}
+
+
+
+LIBDE265_API PacketSink_File::~PacketSink_File()
+{
+  if (mFH) {
+    fclose(mFH);
+  }
+}
+
+
+LIBDE265_API void PacketSink_File::set_filename(const char* filename)
+{
+  assert(mFH==NULL);
+
+  mFH = fopen(filename,"wb");
+}
+
+
+LIBDE265_API void PacketSink_File::send_packet(const uint8_t* data, int n)
+{
+  uint8_t startCode[3];
+  startCode[0] = 0;
+  startCode[1] = 0;
+  startCode[2] = 1;
+
+  fwrite(startCode,1,3,mFH);
+  fwrite(data,1,n,mFH);
+  fflush(mFH);
+}
diff --git a/libde265/image-io.h b/libde265/image-io.h
new file mode 100644
index 0000000..88a7d66
--- /dev/null
+++ b/libde265/image-io.h
@@ -0,0 +1,122 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * Authors: struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef IMAGE_IO_H
+#define IMAGE_IO_H
+
+#include "libde265/image.h"
+#include <deque>
+
+
+class ImageSource
+{
+ public:
+  LIBDE265_API ImageSource() { }
+  virtual LIBDE265_API ~ImageSource() { }
+
+  //enum ImageStatus { Available, Waiting, EndOfVideo };
+
+  //virtual ImageStatus  get_status() = 0;
+  virtual LIBDE265_API de265_image* get_image(bool block=true) = 0;
+  virtual LIBDE265_API void skip_frames(int n) = 0;
+
+  virtual LIBDE265_API int get_width() const = 0;
+  virtual LIBDE265_API int get_height() const = 0;
+};
+
+
+
+class ImageSource_YUV : public ImageSource
+{
+ public:
+ LIBDE265_API ImageSource_YUV() : mFH(NULL) { }
+  virtual LIBDE265_API ~ImageSource_YUV();
+
+  bool LIBDE265_API set_input_file(const char* filename, int w,int h);
+
+  //virtual ImageStatus  get_status();
+  virtual LIBDE265_API de265_image* get_image(bool block=true);
+  virtual LIBDE265_API void skip_frames(int n);
+
+  virtual LIBDE265_API int get_width() const { return width; }
+  virtual LIBDE265_API int get_height() const { return height; }
+
+ private:
+  FILE* mFH;
+  bool mReachedEndOfFile;
+
+  int width,height;
+
+  de265_image* read_next_image();
+};
+
+
+
+
+class ImageSink
+{
+ public:
+  virtual LIBDE265_API ~ImageSink() { }
+
+  virtual LIBDE265_API void send_image(const de265_image* img) = 0;
+};
+
+class ImageSink_YUV : public ImageSink
+{
+ public:
+ LIBDE265_API ImageSink_YUV() : mFH(NULL) { }
+  LIBDE265_API ~ImageSink_YUV();
+
+  bool LIBDE265_API set_filename(const char* filename);
+
+  virtual LIBDE265_API void send_image(const de265_image* img);
+
+ private:
+  FILE* mFH;
+};
+
+
+
+class PacketSink
+{
+ public:
+  virtual LIBDE265_API ~PacketSink() { }
+
+  virtual LIBDE265_API void send_packet(const uint8_t* data, int n) = 0;
+};
+
+
+class PacketSink_File : public PacketSink
+{
+ public:
+ LIBDE265_API PacketSink_File() : mFH(NULL) { }
+  virtual LIBDE265_API ~PacketSink_File();
+
+  LIBDE265_API void set_filename(const char* filename);
+
+  virtual LIBDE265_API void send_packet(const uint8_t* data, int n);
+
+ private:
+  FILE* mFH;
+};
+
+#endif
diff --git a/libde265/image.cc b/libde265/image.cc
index aa36c23..8b26ab2 100644
--- a/libde265/image.cc
+++ b/libde265/image.cc
@@ -20,6 +20,7 @@
 
 #include "image.h"
 #include "decctx.h"
+#include "encoder/encoder-context.h"
 
 #include <stdlib.h>
 #include <string.h>
@@ -38,6 +39,8 @@
 #define MEMORY_PADDING  0
 #endif
 
+#define STANDARD_ALIGNMENT 16
+
 #ifdef HAVE___MINGW_ALIGNED_MALLOC
 #define ALLOC_ALIGNED(alignment, size)         __mingw_aligned_malloc((size), (alignment))
 #define FREE_ALIGNED(mem)                      __mingw_aligned_free((mem))
@@ -62,22 +65,81 @@ static inline void *ALLOC_ALIGNED(size_t alignment, size_t size) {
 
 static const int alignment = 16;
 
+LIBDE265_API void* de265_alloc_image_plane(struct de265_image* img, int cIdx,
+                                           void* inputdata, int inputstride, void *userdata)
+{
+  int alignment = STANDARD_ALIGNMENT;
+  int stride = (img->get_width(cIdx) + alignment-1) / alignment * alignment;
+  int height = img->get_height(cIdx);
+
+  uint8_t* p = (uint8_t *)ALLOC_ALIGNED_16(stride * height + MEMORY_PADDING);
+
+  if (p==NULL) { return NULL; }
+
+  img->set_image_plane(cIdx, p, stride, userdata);
+
+  // copy input data if provided
+
+  if (inputdata != NULL) {
+    if (inputstride == stride) {
+      memcpy(p, inputdata, stride*height);
+    }
+    else {
+      for (int y=0;y<height;y++) {
+        memcpy(p+y*stride, ((char*)inputdata) + inputstride*y, inputstride);
+      }
+    }
+  }
+
+  return p;
+}
+
+
+LIBDE265_API void de265_free_image_plane(struct de265_image* img, int cIdx)
+{
+  uint8_t* p = (uint8_t*)img->get_image_plane(cIdx);
+  assert(p);
+  FREE_ALIGNED(p);
+}
+
 
 static int  de265_image_get_buffer(de265_decoder_context* ctx,
                                    de265_image_spec* spec, de265_image* img, void* userdata)
 {
-  int luma_stride   = (spec->width   + spec->alignment-1) / spec->alignment * spec->alignment;
-  int chroma_stride = (spec->width/2 + spec->alignment-1) / spec->alignment * spec->alignment;
+  const int rawChromaWidth  = spec->width  / img->sps.SubWidthC;
+  const int rawChromaHeight = spec->height / img->sps.SubHeightC;
+
+  int luma_stride   = (spec->width    + spec->alignment-1) / spec->alignment * spec->alignment;
+  int chroma_stride = (rawChromaWidth + spec->alignment-1) / spec->alignment * spec->alignment;
+
+  assert(img->sps.BitDepth_Y >= 8 && img->sps.BitDepth_Y <= 16);
+  assert(img->sps.BitDepth_C >= 8 && img->sps.BitDepth_C <= 16);
+
+  int luma_bpl   = luma_stride   * ((img->sps.BitDepth_Y+7)/8);
+  int chroma_bpl = chroma_stride * ((img->sps.BitDepth_C+7)/8);
 
   int luma_height   = spec->height;
-  int chroma_height = (spec->height+1)/2;
+  int chroma_height = rawChromaHeight;
+
+  bool alloc_failed = false;
 
   uint8_t* p[3] = { 0,0,0 };
-  p[0] = (uint8_t *)ALLOC_ALIGNED_16(luma_stride   * luma_height   + MEMORY_PADDING);
-  p[1] = (uint8_t *)ALLOC_ALIGNED_16(chroma_stride * chroma_height + MEMORY_PADDING);
-  p[2] = (uint8_t *)ALLOC_ALIGNED_16(chroma_stride * chroma_height + MEMORY_PADDING);
+  p[0] = (uint8_t *)ALLOC_ALIGNED_16(luma_height   * luma_bpl   + MEMORY_PADDING);
+  if (p[0]==NULL) { alloc_failed=true; }
+
+  if (img->get_chroma_format() != de265_chroma_mono) {
+    p[1] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING);
+    p[2] = (uint8_t *)ALLOC_ALIGNED_16(chroma_height * chroma_bpl + MEMORY_PADDING);
 
-  if (p[0]==NULL || p[1]==NULL || p[2]==NULL) {
+    if (p[1]==NULL || p[2]==NULL) { alloc_failed=true; }
+  }
+  else {
+    p[1] = NULL;
+    p[2] = NULL;
+    chroma_stride = 0;
+  }
+
+  if (alloc_failed) {
     for (int i=0;i<3;i++)
       if (p[i]) {
         FREE_ALIGNED(p[i]);
@@ -98,8 +160,9 @@ static void de265_image_release_buffer(de265_decoder_context* ctx,
 {
   for (int i=0;i<3;i++) {
     uint8_t* p = (uint8_t*)img->get_image_plane(i);
-    assert(p);
-    FREE_ALIGNED(p);
+    if (p) {
+      FREE_ALIGNED(p);
+    }
   }
 }
 
@@ -128,6 +191,9 @@ de265_image::de265_image()
   removed_at_picture_id = 0; // picture not used, so we can assume it has been removed
 
   decctx = NULL;
+  encctx = NULL;
+
+  encoder_image_release_func = NULL;
 
   //alloc_functions.get_buffer = NULL;
   //alloc_functions.release_buffer = NULL;
@@ -165,15 +231,25 @@ de265_image::de265_image()
 
 de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c,
                                      const seq_parameter_set* sps, bool allocMetadata,
-                                     decoder_context* ctx, de265_PTS pts, void* user_data,
-                                     bool isOutputImage)
+                                     decoder_context* dctx,
+                                     encoder_context* ectx,
+                                     de265_PTS pts, void* user_data,
+                                     bool useCustomAllocFunc)
 {
-  if (allocMetadata) { assert(sps); }
+  //if (allocMetadata) { assert(sps); }
+  assert(sps);
+
+  this->sps = *sps;
+
+  release(); /* TODO: review code for efficient allocation when arrays are already
+                allocated to the requested size. Without the release, the old image-data
+                will not be freed. */
 
   ID = s_next_image_ID++;
   removed_at_picture_id = std::numeric_limits<int32_t>::max();
 
-  decctx = ctx;
+  decctx = dctx;
+  encctx = ectx;
 
   // --- allocate image buffer ---
 
@@ -209,17 +285,27 @@ de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c,
 
   case de265_chroma_422:
     spec.format = de265_image_format_YUV422P8;
-    chroma_height = (chroma_height+1)/2;
+    chroma_width = (chroma_width+1)/2;
+    break;
+
+  case de265_chroma_444:
+    spec.format = de265_image_format_YUV444P8;
+    break;
+
+  case de265_chroma_mono:
+    spec.format = de265_image_format_mono8;
+    chroma_width = 0;
+    chroma_height= 0;
     break;
 
   default:
-    assert(false); // TODO: not implemented yet
+    assert(false);
     break;
   }
 
   spec.width  = w;
   spec.height = h;
-  spec.alignment = 16;
+  spec.alignment = STANDARD_ALIGNMENT;
 
 
   // conformance window cropping
@@ -243,29 +329,55 @@ de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c,
   spec.visible_height= height_confwin;
 
 
+  bpp_shift[0] = (sps->BitDepth_Y > 8) ? 1 : 0;
+  bpp_shift[1] = (sps->BitDepth_C > 8) ? 1 : 0;
+  bpp_shift[2] = bpp_shift[1];
+
+
   // allocate memory and set conformance window pointers
 
-  void* alloc_userdata = decctx->param_image_allocation_userdata;
-  if (isOutputImage) {
+  void* alloc_userdata = NULL;
+  if (decctx) alloc_userdata = decctx->param_image_allocation_userdata;
+  if (encctx) alloc_userdata = encctx->param_image_allocation_userdata; // actually not needed
+
+  if (encctx && useCustomAllocFunc) {
+    encoder_image_release_func = encctx->release_func;
+
+    // if we do not provide a release function, use our own
+
+    if (encoder_image_release_func == NULL) {
+      image_allocation_functions = de265_image::default_image_allocation;
+    }
+    else {
+      image_allocation_functions.get_buffer     = NULL;
+      image_allocation_functions.release_buffer = NULL;
+    }
+  }
+  else if (decctx && useCustomAllocFunc) {
     image_allocation_functions = decctx->param_image_allocation_functions;
   }
   else {
     image_allocation_functions = de265_image::default_image_allocation;
   }
-  bool mem_alloc_success = image_allocation_functions.get_buffer(decctx, &spec, this,
-                                                                 alloc_userdata);
 
-  pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride;
-  pixels_confwin[1] = pixels[1] + left + top*chroma_stride;
-  pixels_confwin[2] = pixels[2] + left + top*chroma_stride;
+  bool mem_alloc_success = true;
 
+  if (image_allocation_functions.get_buffer != NULL) {
+    mem_alloc_success = image_allocation_functions.get_buffer(decctx, &spec, this,
+                                                              alloc_userdata);
 
-  // check for memory shortage
+    pixels_confwin[0] = pixels[0] + left*WinUnitX + top*WinUnitY*stride;
+    pixels_confwin[1] = pixels[1] + left + top*chroma_stride;
+    pixels_confwin[2] = pixels[2] + left + top*chroma_stride;
 
-  if (!mem_alloc_success)
-    {
-      return DE265_ERROR_OUT_OF_MEMORY;
-    }
+
+    // check for memory shortage
+
+    if (!mem_alloc_success)
+      {
+        return DE265_ERROR_OUT_OF_MEMORY;
+      }
+  }
 
   //alloc_functions = *allocfunc;
   //alloc_userdata  = userdata;
@@ -278,6 +390,9 @@ de265_error de265_image::alloc_image(int w,int h, enum de265_chroma c,
     mem_alloc_success &= intraPredMode.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs,
                                              sps->Log2MinPUSize);
 
+    mem_alloc_success &= intraPredModeC.alloc(sps->PicWidthInMinPUs, sps->PicHeightInMinPUs,
+                                              sps->Log2MinPUSize);
+
     // cb info
 
     mem_alloc_success &= cb_info.alloc(sps->PicWidthInMinCbsY, sps->PicHeightInMinCbsY,
@@ -349,9 +464,17 @@ void de265_image::release()
 
   if (pixels[0])
     {
-      image_allocation_functions.release_buffer(decctx, this,
-                                                decctx ? decctx->param_image_allocation_userdata : NULL);
-      
+      if (encoder_image_release_func != NULL) {
+        encoder_image_release_func(encctx, this,
+                                   encctx->param_image_allocation_userdata);
+      }
+      else {
+        image_allocation_functions.release_buffer(decctx, this,
+                                                decctx ?
+                                                  decctx->param_image_allocation_userdata :
+                                                  NULL);
+      }
+
       for (int i=0;i<3;i++)
         {
           pixels[i] = NULL;
@@ -393,7 +516,7 @@ de265_error de265_image::copy_image(const de265_image* src)
   */
 
   de265_error err = alloc_image(src->width, src->height, src->chroma_format, &src->sps, false,
-                                src->decctx, src->pts, src->user_data, false);
+                                src->decctx, src->encctx, src->pts, src->user_data, false);
   if (err != DE265_OK) {
     return err;
   }
@@ -412,33 +535,42 @@ void de265_image::copy_lines_from(const de265_image* src, int first, int end)
   assert(first % 2 == 0);
   assert(end   % 2 == 0);
 
+  int luma_bpp   = (sps.BitDepth_Y+7)/8;
+  int chroma_bpp = (sps.BitDepth_C+7)/8;
+
   if (src->stride == stride) {
-    memcpy(pixels[0]      + first*stride,
-           src->pixels[0] + first*src->stride,
-           (end-first)*stride);
+    memcpy(pixels[0]      + first*stride * luma_bpp,
+           src->pixels[0] + first*src->stride * luma_bpp,
+           (end-first)*stride * luma_bpp);
   }
   else {
     for (int yp=first;yp<end;yp++) {
-      memcpy(pixels[0]+yp*stride, src->pixels[0]+yp*src->stride, src->width);
+      memcpy(pixels[0]+yp*stride * luma_bpp,
+             src->pixels[0]+yp*src->stride * luma_bpp,
+             src->width * luma_bpp);
     }
   }
 
-  int first_chroma = first>>1;
-  int end_chroma   = end>>1;
+  int first_chroma = first / src->sps.SubHeightC;
+  int end_chroma   = end / src->sps.SubHeightC;
 
   if (src->chroma_format != de265_chroma_mono) {
     if (src->chroma_stride == chroma_stride) {
-      memcpy(pixels[1]      + first_chroma*chroma_stride,
-             src->pixels[1] + first_chroma*chroma_stride,
-             (end_chroma-first_chroma) * chroma_stride);
-      memcpy(pixels[2]      + first_chroma*chroma_stride,
-             src->pixels[2] + first_chroma*chroma_stride,
-             (end_chroma-first_chroma) * chroma_stride);
+      memcpy(pixels[1]      + first_chroma*chroma_stride * chroma_bpp,
+             src->pixels[1] + first_chroma*chroma_stride * chroma_bpp,
+             (end_chroma-first_chroma) * chroma_stride * chroma_bpp);
+      memcpy(pixels[2]      + first_chroma*chroma_stride * chroma_bpp,
+             src->pixels[2] + first_chroma*chroma_stride * chroma_bpp,
+             (end_chroma-first_chroma) * chroma_stride * chroma_bpp);
     }
     else {
       for (int y=first_chroma;y<end_chroma;y++) {
-        memcpy(pixels[1]+y*chroma_stride, src->pixels[1]+y*src->chroma_stride, src->chroma_width);
-        memcpy(pixels[2]+y*chroma_stride, src->pixels[2]+y*src->chroma_stride, src->chroma_width);
+        memcpy(pixels[1]+y*chroma_stride * chroma_bpp,
+               src->pixels[1]+y*src->chroma_stride * chroma_bpp,
+               src->chroma_width * chroma_bpp);
+        memcpy(pixels[2]+y*chroma_stride * chroma_bpp,
+               src->pixels[2]+y*src->chroma_stride * chroma_bpp,
+               src->chroma_width * chroma_bpp);
       }
     }
   }
@@ -463,14 +595,20 @@ void de265_image::thread_start(int nThreads)
 {
   de265_mutex_lock(&mutex);
 
+  //printf("nThreads before: %d %d\n",nThreadsQueued, nThreadsTotal);
+
   nThreadsQueued += nThreads;
   nThreadsTotal += nThreads;
 
+  //printf("nThreads after: %d %d\n",nThreadsQueued, nThreadsTotal);
+
   de265_mutex_unlock(&mutex);
 }
 
-void de265_image::thread_run()
+void de265_image::thread_run(const thread_task* task)
 {
+  //printf("run thread %s\n", task->name().c_str());
+
   de265_mutex_lock(&mutex);
   nThreadsQueued--;
   nThreadsRunning++;
@@ -493,8 +631,10 @@ void de265_image::thread_unblocks()
   de265_mutex_unlock(&mutex);
 }
 
-void de265_image::thread_finishes()
+void de265_image::thread_finishes(const thread_task* task)
 {
+  //printf("finish thread %s\n", task->name().c_str());
+
   de265_mutex_lock(&mutex);
 
   nThreadsRunning--;
@@ -517,6 +657,8 @@ void de265_image::wait_for_progress(thread_task* task, int ctbx,int ctby, int pr
 
 void de265_image::wait_for_progress(thread_task* task, int ctbAddrRS, int progress)
 {
+  if (task==NULL) { return; }
+
   de265_progress_lock* progresslock = &ctb_progress[ctbAddrRS];
   if (progresslock->get_progress() < progress) {
     thread_blocks();
@@ -558,7 +700,7 @@ void de265_image::clear_metadata()
   // during decoding (especially log2CbSize), but it is unlikely to be faster than the memset.
 
   cb_info.clear();
-  tu_info.clear();
+  //tu_info.clear();  // done on the fly
   ctb_info.clear();
   deblk_info.clear();
 
@@ -570,7 +712,7 @@ void de265_image::clear_metadata()
 }
 
 
-void de265_image::set_mv_info(int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv)
+void de265_image::set_mv_info(int x,int y, int nPbW,int nPbH, const MotionVectorSpec& mv)
 {
   int log2PuSize = 2;
 
@@ -584,7 +726,7 @@ void de265_image::set_mv_info(int x,int y, int nPbW,int nPbH, const PredVectorIn
   for (int pby=0;pby<hPu;pby++)
     for (int pbx=0;pbx<wPu;pbx++)
       {
-        pb_info[ xPu+pbx + (yPu+pby)*stride ].mvi = *mv;
+        pb_info[ xPu+pbx + (yPu+pby)*stride ].mv = mv;
       }
 }
 
@@ -635,9 +777,9 @@ bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP,
     availableN = available_zscan(xP,yP,xN,yN);
   }
   else {
-    availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS &&
+    availableN = !(nPbW<<1 == nCbS && nPbH<<1 == nCbS &&  // NxN
                    partIdx==1 &&
-                   yN >= yC+nPbH && xN < xC+nPbW);
+                   yN >= yC+nPbH && xN < xC+nPbW);  // xN/yN inside partIdx 2
   }
 
   if (availableN && get_pred_mode(xN,yN) == MODE_INTRA) {
@@ -646,4 +788,3 @@ bool de265_image::available_pred_blk(int xC,int yC, int nCbS, int xP, int yP,
 
   return availableN;
 }
-
diff --git a/libde265/image.h b/libde265/image.h
index fee1923..17b03f3 100644
--- a/libde265/image.h
+++ b/libde265/image.h
@@ -33,6 +33,7 @@
 #include <stdbool.h>
 #endif
 #include "libde265/de265.h"
+#include "libde265/en265.h"
 #include "libde265/sps.h"
 #include "libde265/pps.h"
 #include "libde265/motion.h"
@@ -40,7 +41,6 @@
 #include "libde265/slice.h"
 #include "libde265/nal.h"
 
-
 enum PictureState {
   UnusedForReference,
   UsedForShortTermReference,
@@ -78,23 +78,30 @@ enum PictureState {
 #define CTB_PROGRESS_DEBLK_H   3
 #define CTB_PROGRESS_SAO       4
 
+class decoder_context;
+
 template <class DataUnit> class MetaDataArray
 {
  public:
   MetaDataArray() { data=NULL; data_size=0; log2unitSize=0; width_in_units=0; height_in_units=0; }
   ~MetaDataArray() { free(data); }
 
-  bool alloc(int w,int h, int _log2unitSize) {
+  LIBDE265_CHECK_RESULT bool alloc(int w,int h, int _log2unitSize) {
     int size = w*h;
 
     if (size != data_size) {
       free(data);
       data = (DataUnit*)malloc(size * sizeof(DataUnit));
+      if (data == NULL) {
+        data_size = 0;
+        return false;
+      }
       data_size = size;
-      width_in_units = w;
-      height_in_units = h;
     }
 
+    width_in_units = w;
+    height_in_units = h;
+
     log2unitSize = _log2unitSize;
 
     return data != NULL;
@@ -108,6 +115,9 @@ template <class DataUnit> class MetaDataArray
     int unitX = x>>log2unitSize;
     int unitY = y>>log2unitSize;
 
+    assert(unitX >= 0 && unitX < width_in_units);
+    assert(unitY >= 0 && unitY < height_in_units);
+
     return data[ unitX + unitY*width_in_units ];
   }
 
@@ -115,6 +125,9 @@ template <class DataUnit> class MetaDataArray
     int unitX = x>>log2unitSize;
     int unitY = y>>log2unitSize;
 
+    assert(unitX >= 0 && unitX < width_in_units);
+    assert(unitY >= 0 && unitY < height_in_units);
+
     return data[ unitX + unitY*width_in_units ];
   }
 
@@ -122,12 +135,17 @@ template <class DataUnit> class MetaDataArray
     int unitX = x>>log2unitSize;
     int unitY = y>>log2unitSize;
 
+    assert(unitX >= 0 && unitX < width_in_units);
+    assert(unitY >= 0 && unitY < height_in_units);
+
     data[ unitX + unitY*width_in_units ] = d;
   }
 
   DataUnit& operator[](int idx) { return data[idx]; }
   const DataUnit& operator[](int idx) const { return data[idx]; }
 
+  int size() const { return data_size; }
+
   // private:
   DataUnit* data;
   int data_size;
@@ -146,6 +164,16 @@ template <class DataUnit> class MetaDataArray
         cb_info[ cbx + cby*cb_info.width_in_units ].Field = value;  \
       }
 
+#define CLEAR_TB_BLK(x,y,log2BlkWidth)              \
+  int tuX = x >> tu_info.log2unitSize; \
+  int tuY = y >> tu_info.log2unitSize; \
+  int width = 1 << (log2BlkWidth - tu_info.log2unitSize);           \
+  for (int tuy=tuY;tuy<tuY+width;tuy++)                             \
+    for (int tux=tuX;tux<tuX+width;tux++)                           \
+      {                                                             \
+        tu_info[ tux + tuy*tu_info.width_in_units ] = 0;  \
+      }
+
 
 typedef struct {
   uint16_t SliceAddrRS;
@@ -153,31 +181,45 @@ typedef struct {
 
   sao_info saoInfo;
   bool     deblock;         // this CTB has to be deblocked
-  bool     has_pcm;         // pcm is used in this CTB
-  bool     has_cu_transquant_bypass; // transquant_bypass is used in this CTB
+
+  // The following flag helps to quickly check whether we have to
+  // check all conditions in the SAO filter or whether we can skip them.
+  bool     has_pcm_or_cu_transquant_bypass; // pcm or transquant_bypass is used in this CTB
 } CTB_info;
 
 
 typedef struct {
-  uint8_t log2CbSize : 3;   // [0;6] (1<<log2CbSize) = 64
+  uint8_t log2CbSize : 3;   /* [0;6] (1<<log2CbSize) = 64
+                               This is only set in the top-left corner of the CB.
+                               The other values should be zero.
+                               TODO: in the encoder, we have to clear to zero.
+                               Used in deblocking and QP-scale decoding */
   uint8_t PartMode : 3;     // (enum PartMode)  [0;7] set only in top-left of CB
-                            // TODO: could be removed if prediction-block-boundaries would be
-                            // set during decoding
-  uint8_t ctDepth : 2;      // [0:3]? (0:64, 1:32, 2:16, 3:8)
+                            // Used for spatial merging candidates in current frame
+                            // and for deriving interSplitFlag in decoding.
+
+  uint8_t ctDepth : 2;      // [0:3]? (for CTB size 64: 0:64, 1:32, 2:16, 3:8)
+                            // Used for decoding/encoding split_cu flag.
+
+  // --- byte boundary ---
   uint8_t PredMode : 2;     // (enum PredMode)  [0;2] must be saved for past images
-  uint8_t pcm_flag : 1;     //
-  uint8_t cu_transquant_bypass : 1;
+                            // Used in motion decoding.
+  uint8_t pcm_flag : 1;     // Stored for intra-prediction / SAO
+  uint8_t cu_transquant_bypass : 1; // Stored for SAO
+  // note: 4 bits left
 
-  int8_t  QP_Y;
+  // --- byte boundary ---
+  int8_t  QP_Y;  // Stored for QP prediction
 
-  // uint8_t pcm_flag;  // TODO
 } CB_ref_info;
 
 
 typedef struct {
-  PredVectorInfo mvi; // TODO: this can be done in 16x16 grid
+  MotionVectorSpec mv; // TODO: this can be done in 16x16 grid
 } PB_ref_info;
 
+// intraPredMode:   Used for determining scanIdx when decoding/encoding coefficients.
+
 
 
 struct de265_image {
@@ -185,9 +227,15 @@ struct de265_image {
   ~de265_image();
 
 
-  de265_error alloc_image(int w,int h, enum de265_chroma c, const seq_parameter_set* sps,
-                          bool allocMetadata, decoder_context* ctx, de265_PTS pts, void* user_data,
-                          bool isOutputImage);
+  de265_error alloc_image(int w,int h, enum de265_chroma c,
+                          const seq_parameter_set* sps,
+                          bool allocMetadata,
+                          decoder_context* dctx,
+                          class encoder_context* ectx,
+                          de265_PTS pts, void* user_data,
+                          bool useCustomAllocFunctions);
+
+  //de265_error alloc_encoder_data(const seq_parameter_set* sps);
 
   bool is_allocated() const { return pixels[0] != NULL; }
 
@@ -212,12 +260,35 @@ struct de265_image {
     return pixels[cIdx] + xpos + ypos*stride;
   }
 
+
+  /// xpos;ypos in actual plane resolution
+  template <class pixel_t>
+  pixel_t* get_image_plane_at_pos_NEW(int cIdx, int xpos,int ypos)
+  {
+    int stride = get_image_stride(cIdx);
+    return (pixel_t*)(pixels[cIdx] + (xpos + ypos*stride)*sizeof(pixel_t));
+  }
+
   const uint8_t* get_image_plane_at_pos(int cIdx, int xpos,int ypos) const
   {
     int stride = get_image_stride(cIdx);
     return pixels[cIdx] + xpos + ypos*stride;
   }
 
+  void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos)
+  {
+    int stride = get_image_stride(cIdx);
+    return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]);
+  }
+
+  const void* get_image_plane_at_pos_any_depth(int cIdx, int xpos,int ypos) const
+  {
+    int stride = get_image_stride(cIdx);
+    return pixels[cIdx] + ((xpos + ypos*stride) << bpp_shift[cIdx]);
+  }
+
+  /* Number of pixels in one row (not number of bytes).
+   */
   int get_image_stride(int cIdx) const
   {
     if (cIdx==0) return stride;
@@ -232,6 +303,18 @@ struct de265_image {
 
   enum de265_chroma get_chroma_format() const { return chroma_format; }
 
+  int get_bit_depth(int cIdx) const {
+    if (cIdx==0) return sps.BitDepth_Y;
+    else         return sps.BitDepth_C;
+  }
+
+  int get_bytes_per_pixel(int cIdx) const {
+    return (get_bit_depth(cIdx)+7)/8;
+  }
+
+  bool high_bit_depth(int cIdx) const {
+    return get_bit_depth(cIdx)>8;
+  }
 
   bool can_be_released() const { return PicOutputFlag==false && PicState==UnusedForReference; }
 
@@ -251,11 +334,17 @@ struct de265_image {
 
   static de265_image_allocation default_image_allocation;
 
+  void printBlk(const char* title, int x0,int y0,int blkSize,int cIdx) const {
+    ::printBlk(title, get_image_plane_at_pos(cIdx,x0,y0),
+               blkSize, get_image_stride(cIdx));
+  }
+
 private:
   uint32_t ID;
   static uint32_t s_next_image_ID;
 
   uint8_t* pixels[3];
+  uint8_t  bpp_shift[3];  // 0 for 8 bit, 1 for 16 bit
 
   enum de265_chroma chroma_format;
 
@@ -291,12 +380,16 @@ public:
   seq_parameter_set   sps;  // the SPS used for decoding this image
   pic_parameter_set   pps;  // the PPS used for decoding this image
   decoder_context*    decctx;
+  class encoder_context*    encctx;
+
+  int number_of_ctbs() const { return ctb_info.size(); }
 
 private:
   MetaDataArray<CTB_info>    ctb_info;
   MetaDataArray<CB_ref_info> cb_info;
   MetaDataArray<PB_ref_info> pb_info;
   MetaDataArray<uint8_t>     intraPredMode;
+  MetaDataArray<uint8_t>     intraPredModeC;
   MetaDataArray<uint8_t>     tu_info;
   MetaDataArray<uint8_t>     deblk_info;
 
@@ -307,6 +400,9 @@ public:
   void*     user_data;
   void*     plane_user_data[3];  // this is logically attached to the pixel data pointers
   de265_image_allocation image_allocation_functions; // the functions used for memory allocation
+  void (*encoder_image_release_func)(en265_encoder_context*,
+                                     de265_image*,
+                                     void* userdata);
 
   uint8_t integrity; /* Whether an error occured while the image was decoded.
                         When generated, this is initialized to INTEGRITY_CORRECT,
@@ -328,12 +424,13 @@ public:
 
 
   void thread_start(int nThreads);
-  void thread_run();
+  void thread_run(const thread_task*);
   void thread_blocks();
   void thread_unblocks();
-  void thread_finishes(); /* NOTE: you should not access any data in the thread_task after
-                             calling this, as this function may unlock other threads that
-                             will push this image to the output queue and free all decoder data. */
+  /* NOTE: you should not access any data in the thread_task after
+     calling this, as this function may unlock other threads that
+     will push this image to the output queue and free all decoder data. */
+  void thread_finishes(const thread_task*);
 
   void wait_for_progress(thread_task* task, int ctbx,int ctby, int progress);
   void wait_for_progress(thread_task* task, int ctbAddrRS, int progress);
@@ -384,10 +481,12 @@ public:
     return get_pred_mode(x,y)==MODE_SKIP;
   }
 
-  void set_pcm_flag(int x,int y, int log2BlkWidth)
+  void set_pcm_flag(int x,int y, int log2BlkWidth, uint8_t value=1)
   {
-    SET_CB_BLK(x,y,log2BlkWidth, pcm_flag, 1);
-    ctb_info.get(x,y).has_pcm = true;
+    SET_CB_BLK(x,y,log2BlkWidth, pcm_flag, value);
+
+    // TODO: in the encoder, we somewhere have to clear this
+    ctb_info.get(x,y).has_pcm_or_cu_transquant_bypass = true;
   }
 
   int  get_pcm_flag(int x,int y) const
@@ -395,10 +494,12 @@ public:
     return cb_info.get(x,y).pcm_flag;
   }
 
-  void set_cu_transquant_bypass(int x,int y, int log2BlkWidth)
+  void set_cu_transquant_bypass(int x,int y, int log2BlkWidth, uint8_t value=1)
   {
-    SET_CB_BLK(x,y,log2BlkWidth, cu_transquant_bypass, 1);
-    ctb_info.get(x,y).has_cu_transquant_bypass = true;
+    SET_CB_BLK(x,y,log2BlkWidth, cu_transquant_bypass, value);
+
+    // TODO: in the encoder, we somewhere have to clear this
+    ctb_info.get(x,y).has_pcm_or_cu_transquant_bypass = true;
   }
 
   int  get_cu_transquant_bypass(int x,int y) const
@@ -406,11 +507,16 @@ public:
     return cb_info.get(x,y).cu_transquant_bypass;
   }
 
-  void set_log2CbSize(int x0, int y0, int log2CbSize)
+  void set_log2CbSize(int x0, int y0, int log2CbSize, bool fill)
   {
-    cb_info.get(x0,y0).log2CbSize = log2CbSize;
+    // In theory, we could assume that remaining cb_info blocks are initialized to zero.
+    // But in corrupted streams, slices may overlap and set contradicting log2CbSizes.
+    // We also need this for encoding.
+    if (fill) {
+      SET_CB_BLK(x0,y0,log2CbSize, log2CbSize, 0);
+    }
 
-    // assume that remaining cb_info blocks are initialized to zero
+    cb_info.get(x0,y0).log2CbSize = log2CbSize;
   }
 
   int  get_log2CbSize(int x0, int y0) const
@@ -461,6 +567,11 @@ public:
     tu_info.get(x0,y0) |= (1<<trafoDepth);
   }
 
+  void clear_split_transform_flags(int x0,int y0,int log2CbSize)
+  {
+    CLEAR_TB_BLK (x0,y0, log2CbSize);
+  }
+
   int  get_split_transform_flag(int x0,int y0,int trafoDepth) const
   {
     return (tu_info.get(x0,y0) & (1<<trafoDepth));
@@ -500,12 +611,74 @@ public:
   void set_IntraPredMode(int PUidx,int log2blkSize, enum IntraPredMode mode)
   {
     int pbSize = 1<<(log2blkSize - intraPredMode.log2unitSize);
-    
+
     for (int y=0;y<pbSize;y++)
       for (int x=0;x<pbSize;x++)
         intraPredMode[PUidx + x + y*intraPredMode.width_in_units] = mode;
   }
 
+  void set_IntraPredMode(int x0,int y0,int log2blkSize,
+                         enum IntraPredMode mode)
+  {
+    int pbSize = 1<<(log2blkSize - intraPredMode.log2unitSize);
+    int PUidx  = (x0>>sps.Log2MinPUSize) + (y0>>sps.Log2MinPUSize)*sps.PicWidthInMinPUs;
+
+    for (int y=0;y<pbSize;y++)
+      for (int x=0;x<pbSize;x++) {
+        assert(x<sps.PicWidthInMinPUs);
+        assert(y<sps.PicHeightInMinPUs);
+
+        int idx = PUidx + x + y*intraPredMode.width_in_units;
+        assert(idx<intraPredMode.data_size);
+        intraPredMode[idx] = mode;
+      }
+  }
+
+
+  enum IntraPredMode get_IntraPredModeC(int x,int y) const
+  {
+    return (enum IntraPredMode)(intraPredModeC.get(x,y) & 0x3f);
+  }
+
+  bool is_IntraPredModeC_Mode4(int x,int y) const
+  {
+    return intraPredModeC.get(x,y) & 0x80;
+  }
+
+  void set_IntraPredModeC(int x0,int y0,int log2blkSize, enum IntraPredMode mode,
+                          bool is_mode4)
+  {
+    uint8_t combinedValue = mode;
+    if (is_mode4) combinedValue |= 0x80;
+
+    int pbSize = 1<<(log2blkSize - intraPredMode.log2unitSize);
+    int PUidx  = (x0>>sps.Log2MinPUSize) + (y0>>sps.Log2MinPUSize)*sps.PicWidthInMinPUs;
+
+    for (int y=0;y<pbSize;y++)
+      for (int x=0;x<pbSize;x++) {
+        assert(x<sps.PicWidthInMinPUs);
+        assert(y<sps.PicHeightInMinPUs);
+
+        int idx = PUidx + x + y*intraPredModeC.width_in_units;
+        assert(idx<intraPredModeC.data_size);
+        intraPredModeC[idx] = combinedValue;
+      }
+  }
+
+
+  /*
+  // NOTE: encoder only
+  void set_ChromaIntraPredMode(int x,int y,int log2BlkWidth, enum IntraChromaPredMode mode)
+  {
+    SET_CB_BLK (x, y, log2BlkWidth, intra_chroma_pred_mode, mode);
+  }
+
+  // NOTE: encoder only
+  enum IntraChromaPredMode get_ChromaIntraPredMode(int x,int y) const
+  {
+    return (enum IntraChromaPredMode)(cb_info.get(x,y).intra_chroma_pred_mode);
+  }
+  */
 
   // --- CTB metadata access ---
 
@@ -547,30 +720,42 @@ public:
     return ctb_info[ctb].SliceHeaderIndex;
   }
 
+  bool is_SliceHeader_available(int x,int y) const
+  {
+    int idx = ctb_info.get(x,y).SliceHeaderIndex;
+    return idx >= 0 && idx < slices.size();
+  }
+
   slice_segment_header* get_SliceHeader(int x, int y)
   {
-    return slices[ get_SliceHeaderIndex(x,y) ];
+    int idx = get_SliceHeaderIndex(x,y);
+    if (idx >= slices.size()) { return NULL; }
+    return slices[idx];
   }
 
   slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY)
   {
-    return slices[ get_SliceHeaderIndexCtb(ctbX,ctbY) ];
+    int idx = get_SliceHeaderIndexCtb(ctbX,ctbY);
+    if (idx >= slices.size()) { return NULL; }
+    return slices[idx];
   }
 
   const slice_segment_header* get_SliceHeaderCtb(int ctbX, int ctbY) const
   {
-    return slices[ get_SliceHeaderIndexCtb(ctbX,ctbY) ];
+    int idx = get_SliceHeaderIndexCtb(ctbX,ctbY);
+    if (idx >= slices.size()) { return NULL; }
+    return slices[idx];
   }
-  
+
   void set_sao_info(int ctbX,int ctbY,const sao_info* saoinfo)
   {
     sao_info* sao = &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo;
-    
+
     memcpy(sao,
            saoinfo,
            sizeof(sao_info));
   }
-  
+
   const sao_info* get_sao_info(int ctbX,int ctbY) const
   {
     return &ctb_info[ctbX + ctbY*ctb_info.width_in_units].saoInfo;
@@ -589,16 +774,10 @@ public:
   }
 
 
-  bool get_CTB_has_pcm(int ctbX,int ctbY) const
+  bool get_CTB_has_pcm_or_cu_transquant_bypass(int ctbX,int ctbY) const
   {
     int idx = ctbX + ctbY*ctb_info.width_in_units;
-    return ctb_info[idx].has_pcm;
-  }
-
-  bool get_CTB_has_cu_transquant_bypass(int ctbX,int ctbY) const
-  {
-    int idx = ctbX + ctbY*ctb_info.width_in_units;
-    return ctb_info[idx].has_cu_transquant_bypass;
+    return ctb_info[idx].has_pcm_or_cu_transquant_bypass;
   }
 
 
@@ -612,7 +791,7 @@ public:
   {
     const int xd = x0/4;
     const int yd = y0/4;
-    
+
     if (xd<deblk_info.width_in_units &&
         yd<deblk_info.height_in_units) {
       deblk_info[xd + yd*deblk_info.width_in_units] |= flags;
@@ -642,15 +821,16 @@ public:
 
   // --- PB metadata access ---
 
-  const PredVectorInfo* get_mv_info(int x,int y) const
+  const MotionVectorSpec* get_mv_info(int x,int y) const
   {
-    return &pb_info.get(x,y).mvi;
+    return &pb_info.get(x,y).mv;
   }
 
-  void set_mv_info(int x,int y, int nPbW,int nPbH, const PredVectorInfo* mv);
+  void set_mv_info(int x,int y, int nPbW,int nPbH, const MotionVectorSpec& mv);
 
-// --- value logging ---
+  // --- value logging ---
 
+  void printBlk(int x0,int y0, int cIdx, int log2BlkSize);
 };
 
 
diff --git a/libde265/intrapred.cc b/libde265/intrapred.cc
index 771ec8a..086038b 100644
--- a/libde265/intrapred.cc
+++ b/libde265/intrapred.cc
@@ -29,7 +29,8 @@
 
 
 #ifdef DE265_LOG_TRACE
-void print_border(uint8_t* data, uint8_t* available, int nT)
+template <class pixel_t>
+void print_border(pixel_t* data, uint8_t* available, int nT)
 {
   for (int i=-2*nT ; i<=2*nT ; i++) {
     if (i==0 || i==1 || i==-nT || i==nT+1) {
@@ -51,10 +52,217 @@ void print_border(uint8_t* data, uint8_t* available, int nT)
 #endif
 
 
+void fillIntraPredModeCandidates(int candModeList[3], int x,int y, int PUidx,
+                                 bool availableA, // left
+                                 bool availableB, // top
+                                 const de265_image* img)
+{
+  const seq_parameter_set* sps = &img->sps;
+  // Fills candModeList[0..2] from the intra modes of the left (A) and top (B) neighbours of (x,y).
+  // block on left side
+  // (INTRA_DC is substituted when A is unavailable, is not MODE_INTRA, or has its PCM flag set)
+  enum IntraPredMode candIntraPredModeA, candIntraPredModeB;
+  if (availableA==false) {
+    candIntraPredModeA=INTRA_DC;
+  }
+  else if (img->get_pred_mode(x-1,y) != MODE_INTRA ||
+           img->get_pcm_flag (x-1,y)) {
+    candIntraPredModeA=INTRA_DC;
+ }
+  else {
+    candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1); // entry just before PUidx
+  }
+
+  // block above
+  // (same fallbacks as A, plus INTRA_DC when y-1 is less than y rounded down to a multiple of 1<<Log2CtbSizeY)
+  if (availableB==false) {
+    candIntraPredModeB=INTRA_DC;
+  }
+  else if (img->get_pred_mode(x,y-1) != MODE_INTRA ||
+           img->get_pcm_flag (x,y-1)) {
+    candIntraPredModeB=INTRA_DC;
+  }
+  else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) {
+    candIntraPredModeB=INTRA_DC;
+  }
+  else {
+    candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs); // PicWidthInMinPUs entries before PUidx
+  }
+
+
+  // build candidate list
+
+  logtrace(LogSlice,"%d;%d availableA:%d candA:%d & availableB:%d candB:%d\n", x,y,
+           availableA, candIntraPredModeA,
+           availableB, candIntraPredModeB);
+
+  if (candIntraPredModeA == candIntraPredModeB) {
+    if (candIntraPredModeA < 2) { // presumably PLANAR or DC (enum values not visible here - confirm)
+      candModeList[0] = INTRA_PLANAR;
+      candModeList[1] = INTRA_DC;
+      candModeList[2] = INTRA_ANGULAR_26;
+    }
+    else {
+      candModeList[0] = candIntraPredModeA;
+      candModeList[1] = 2 + ((candIntraPredModeA-2 -1 +32) % 32); // mode-1, wrapping within 2..33
+      candModeList[2] = 2 + ((candIntraPredModeA-2 +1    ) % 32); // mode+1, wrapping within 2..33
+    }
+  }
+  else {
+    candModeList[0] = candIntraPredModeA;
+    candModeList[1] = candIntraPredModeB;
+    // third entry: first of PLANAR, DC, ANGULAR_26 that is not already A or B
+    if (candIntraPredModeA != INTRA_PLANAR &&
+        candIntraPredModeB != INTRA_PLANAR) {
+      candModeList[2] = INTRA_PLANAR;
+    }
+    else if (candIntraPredModeA != INTRA_DC &&
+             candIntraPredModeB != INTRA_DC) {
+      candModeList[2] = INTRA_DC;
+    }
+    else {
+      candModeList[2] = INTRA_ANGULAR_26;
+    }
+  }
+
+  /*
+    printf("candModeList: %d %d %d\n",
+    candModeList[0],
+    candModeList[1],
+    candModeList[2]
+    );
+  */
+}
+
+
+int find_intra_pred_mode(enum IntraPredMode mode,
+                         int candModeList[3])
+{
+  // Case 1: 'mode' is one of the candidates -> return its index (0..2).
+
+  for (int idx=0; idx<3; ++idx) {
+    if (mode == candModeList[idx]) {
+      return idx;
+    }
+  }
+
+  // Case 2: not a candidate. Sort the list ascending in place with three
+  // compare/exchange steps; note that this modifies the caller's array.
+  if (candModeList[1] < candModeList[0]) {
+    std::swap(candModeList[0],candModeList[1]);
+  }
+  if (candModeList[2] < candModeList[0]) {
+    std::swap(candModeList[0],candModeList[2]);
+  }
+  if (candModeList[2] < candModeList[1]) {
+    std::swap(candModeList[1],candModeList[2]);
+  }
+
+  // Walk the candidates from largest to smallest and decrement the value
+  // once for every candidate that is not larger than the running value.
+  int remaining = mode;
+
+  for (int idx=2; idx>=0; --idx) {
+    if (candModeList[idx] <= remaining) { remaining -= 1; }
+  }
+
+  return -(remaining+1);  // negative result: -(value)-1
+}
+
+
+void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5],
+                                 enum IntraPredMode luma_mode)
+{
+  // Fill the caller's chroma_mode[0..4]; a function-local array would be discarded on return.
+  chroma_mode[0] = INTRA_PLANAR;
+  chroma_mode[1] = INTRA_ANGULAR_26;
+  chroma_mode[2] = INTRA_ANGULAR_10;
+  chroma_mode[3] = INTRA_DC;
+  chroma_mode[4] = luma_mode;
+
+  switch (luma_mode) {  // a fixed slot equal to luma_mode is replaced by INTRA_ANGULAR_34
+  case INTRA_PLANAR:     chroma_mode[0] = INTRA_ANGULAR_34; break;
+  case INTRA_ANGULAR_26: chroma_mode[1] = INTRA_ANGULAR_34; break;
+  case INTRA_ANGULAR_10: chroma_mode[2] = INTRA_ANGULAR_34; break;
+  case INTRA_DC:         chroma_mode[3] = INTRA_ANGULAR_34; break;
+  default:
+    // use defaults from above
+    break;
+  }
+}
+
+
+int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx,
+                       const seq_parameter_set* sps)
+{
+  // Mode-dependent index for log2TrafoSize 2, and for 3 when cIdx==0 or ChromaArrayType is CHROMA_444.
+  bool modeDependent = (log2TrafoSize==2);
+  if (log2TrafoSize==3) { modeDependent = (cIdx==0 || sps->ChromaArrayType==CHROMA_444); }
+  if (!modeDependent) { return 0; }
+
+  if (intraPredMode >=  6 && intraPredMode <= 14) { return 2; } // modes 6..14
+  if (intraPredMode >= 22 && intraPredMode <= 30) { return 1; } // modes 22..30
+  return 0;
+}
+
+
+int get_intra_scan_idx_luma(int log2TrafoSize, enum IntraPredMode intraPredMode)
+{
+  // Only log2TrafoSize 2 and 3 use a mode-dependent index; everything else yields 0.
+  if (log2TrafoSize!=2 && log2TrafoSize!=3) { return 0; }
+
+  if (intraPredMode >=  6 && intraPredMode <= 14) { return 2; } // modes 6..14
+  if (intraPredMode >= 22 && intraPredMode <= 30) { return 1; } // modes 22..30
+  return 0;
+}
+
+int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode)
+{
+  // Only log2TrafoSize 1 and 2 use a mode-dependent index; everything else yields 0.
+  if (log2TrafoSize!=1 && log2TrafoSize!=2) { return 0; }
+
+  if (intraPredMode >=  6 && intraPredMode <= 14) { return 2; } // modes 6..14
+  if (intraPredMode >= 22 && intraPredMode <= 30) { return 1; } // modes 22..30
+  return 0;
+}
+
+
+enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma,
+                                                  enum IntraChromaPredMode chroma)
+{
+  // Each explicit chroma choice names one fixed mode; when the luma mode
+  // already equals that mode, INTRA_ANGULAR_34 is returned instead.
+  enum IntraPredMode fixedMode;
+
+  switch (chroma) {
+  case INTRA_CHROMA_LIKE_LUMA:
+    return luma;                      // chroma simply follows luma
+
+  case INTRA_CHROMA_PLANAR_OR_34:     fixedMode = INTRA_PLANAR;     break;
+  case INTRA_CHROMA_ANGULAR_26_OR_34: fixedMode = INTRA_ANGULAR_26; break;
+  case INTRA_CHROMA_ANGULAR_10_OR_34: fixedMode = INTRA_ANGULAR_10; break;
+  case INTRA_CHROMA_DC_OR_34:         fixedMode = INTRA_DC;         break;
+
+  default:
+    // Value not covered by the labels above: assert, then
+    // fall back to INTRA_DC when assertions are compiled out.
+    assert(false);
+    return INTRA_DC;
+  }
+
+  if (luma == fixedMode) {
+    return INTRA_ANGULAR_34;
+  }
+  return fixedMode;
+}
+
+
 // (8.4.4.2.2)
-void fill_border_samples(de265_image* img, int xB,int yB,
+template <class pixel_t>
+void fill_border_samples(de265_image* img,
+                         int xB,int yB,  // in component specific resolution
                          int nT, int cIdx,
-                         uint8_t* out_border)
+                         pixel_t* out_border)
 {
   const seq_parameter_set* sps = &img->sps;
   const pic_parameter_set* pps = &img->pps;
@@ -62,20 +270,20 @@ void fill_border_samples(de265_image* img, int xB,int yB,
   uint8_t available_data[2*64 + 1];
   uint8_t* available = &available_data[64];
 
-  uint8_t* image;
+  pixel_t* image;
   int stride;
-  image  = img->get_image_plane(cIdx);
+  image  = (pixel_t*)img->get_image_plane(cIdx);
   stride = img->get_image_stride(cIdx);
 
-  const int chromaShift = (cIdx==0) ? 0 : 1;
-  const int TUShift = (cIdx==0) ? sps->Log2MinTrafoSize : sps->Log2MinTrafoSize-1;
+  const int SubWidth  = (cIdx==0) ? 1 : sps->SubWidthC;
+  const int SubHeight = (cIdx==0) ? 1 : sps->SubHeightC;
 
+  const int bit_depth = img->get_bit_depth(cIdx);
 
   // --- check for CTB boundaries ---
 
-  int xBLuma = (cIdx==0) ? xB : 2*xB;
-  int yBLuma = (cIdx==0) ? yB : 2*yB;
-  int nTLuma = (cIdx==0) ? nT : 2*nT;
+  int xBLuma = xB * SubWidth;
+  int yBLuma = yB * SubHeight;
 
   int log2CtbSize = sps->Log2CtbSizeY;
   int picWidthInCtbs = sps->PicWidthInCtbsY;
@@ -86,6 +294,8 @@ void fill_border_samples(de265_image* img, int xB,int yB,
   bool availableTopLeft=true;  // if CTB at top-left pixel available?
 
 
+  //printf("xB/yB: %d %d\n",xB,yB);
+
   // are we at left image border
 
   if (xBLuma == 0) {
@@ -104,16 +314,16 @@ void fill_border_samples(de265_image* img, int xB,int yB,
     yBLuma = 0; // fake value, available flags are already set to false
   }
 
-  if (xBLuma+nTLuma >= sps->pic_width_in_luma_samples) {
+  if (xBLuma+nT*SubWidth >= sps->pic_width_in_luma_samples) {
     availableTopRight=false;
   }
- 
+
   // check for tile and slice boundaries
 
   int xCurrCtb = xBLuma >> log2CtbSize;
   int yCurrCtb = yBLuma >> log2CtbSize;
   int xLeftCtb = (xBLuma-1) >> log2CtbSize;
-  int xRightCtb = (xBLuma+nTLuma) >> log2CtbSize;
+  int xRightCtb = (xBLuma+nT*SubWidth) >> log2CtbSize;
   int yTopCtb   = (yBLuma-1) >> log2CtbSize;
 
   int currCTBSlice = img->get_SliceAddrRS(xCurrCtb,yCurrCtb);
@@ -122,6 +332,13 @@ void fill_border_samples(de265_image* img, int xB,int yB,
   int toprightCTBSlice = availableTopRight ? img->get_SliceAddrRS(xRightCtb, yTopCtb) : -1;
   int topleftCTBSlice  = availableTopLeft  ? img->get_SliceAddrRS(xLeftCtb, yTopCtb) : -1;
 
+  /*
+  printf("size: %d\n",pps->TileIdRS.size());
+  printf("curr: %d left: %d top: %d\n",
+         xCurrCtb+yCurrCtb*picWidthInCtbs,
+         availableLeft ? xLeftCtb+yCurrCtb*picWidthInCtbs : 9999,
+         availableTop  ? xCurrCtb+yTopCtb*picWidthInCtbs  : 9999);
+  */
   int currCTBTileID = pps->TileIdRS[xCurrCtb+yCurrCtb*picWidthInCtbs];
   int leftCTBTileID = availableLeft ? pps->TileIdRS[xLeftCtb+yCurrCtb*picWidthInCtbs] : -1;
   int topCTBTileID  = availableTop ? pps->TileIdRS[xCurrCtb+yTopCtb*picWidthInCtbs] : -1;
@@ -139,16 +356,17 @@ void fill_border_samples(de265_image* img, int xB,int yB,
 
   // number of pixels that are in the valid image area to the right and to the bottom
 
-  int nBottom = sps->pic_height_in_luma_samples - (cIdx==0 ? yB : 2*yB);
-  if (cIdx) nBottom=(nBottom+1)/2;
+  int nBottom = sps->pic_height_in_luma_samples - yB*SubHeight;
+  nBottom=(nBottom+SubHeight-1)/SubHeight;
   if (nBottom>2*nT) nBottom=2*nT;
-  int nRight  = sps->pic_width_in_luma_samples  - (cIdx==0 ? xB : 2*xB);
-  if (cIdx) nRight =(nRight +1)/2;
+
+  int nRight  = sps->pic_width_in_luma_samples  - xB*SubWidth;
+  nRight =(nRight +SubWidth-1)/SubWidth;
   if (nRight >2*nT) nRight=2*nT;
 
   int nAvail=0;
 
-  uint8_t firstValue;
+  pixel_t firstValue;
 
   memset(available-2*nT, 0, 4*nT+1);
 
@@ -158,13 +376,14 @@ void fill_border_samples(de265_image* img, int xB,int yB,
     for (int y=nBottom-1 ; y>=0 ; y-=4)
       if (availableLeft)
         {
-          int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
-                                             ((yB+y)>>TUShift) * sps->PicWidthInTbsY ];
-        
-          bool availableN = NBlockAddr < currBlockAddr;
+          int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) +
+                                             (((yB+y)*SubHeight)>>sps->Log2MinTrafoSize)
+                                             * sps->PicWidthInTbsY ];
+
+          bool availableN = NBlockAddr <= currBlockAddr;
 
           if (pps->constrained_intra_pred_flag) {
-            if (img->get_pred_mode((xB-1)<<chromaShift,(yB+y)<<chromaShift)!=MODE_INTRA)
+            if (img->get_pred_mode((xB-1)*SubWidth,(yB+y)*SubHeight)!=MODE_INTRA)
               availableN = false;
           }
 
@@ -184,13 +403,14 @@ void fill_border_samples(de265_image* img, int xB,int yB,
 
     if (availableTopLeft)
       {
-        int NBlockAddr = pps->MinTbAddrZS[ ((xB-1)>>TUShift) +
-                                           ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
+        int NBlockAddr = pps->MinTbAddrZS[ (((xB-1)*SubWidth )>>sps->Log2MinTrafoSize) +
+                                           (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize)
+                                           * sps->PicWidthInTbsY ];
 
-        bool availableN = NBlockAddr < currBlockAddr;
+        bool availableN = NBlockAddr <= currBlockAddr;
 
         if (pps->constrained_intra_pred_flag) {
-          if (img->get_pred_mode((xB-1)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
+          if (img->get_pred_mode((xB-1)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) {
             availableN = false;
           }
         }
@@ -213,13 +433,14 @@ void fill_border_samples(de265_image* img, int xB,int yB,
 
       if (borderAvailable)
         {
-          int NBlockAddr = pps->MinTbAddrZS[ ((xB+x)>>TUShift) +
-                                             ((yB-1)>>TUShift) * sps->PicWidthInTbsY ];
+          int NBlockAddr = pps->MinTbAddrZS[ (((xB+x)*SubWidth )>>sps->Log2MinTrafoSize) +
+                                             (((yB-1)*SubHeight)>>sps->Log2MinTrafoSize)
+                                             * sps->PicWidthInTbsY ];
 
-          bool availableN = NBlockAddr < currBlockAddr;
+          bool availableN = NBlockAddr <= currBlockAddr;
 
           if (pps->constrained_intra_pred_flag) {
-            if (img->get_pred_mode((xB+x)<<chromaShift,(yB-1)<<chromaShift)!=MODE_INTRA) {
+            if (img->get_pred_mode((xB+x)*SubWidth,(yB-1)*SubHeight)!=MODE_INTRA) {
               availableN = false;
             }
           }
@@ -243,7 +464,14 @@ void fill_border_samples(de265_image* img, int xB,int yB,
 
     if (nAvail!=4*nT+1) {
       if (nAvail==0) {
-        memset(out_border-2*nT, 1<<(sps->bit_depth_luma-1), 4*nT+1);
+        if (sizeof(pixel_t)==1) {
+          memset(out_border-2*nT, 1<<(bit_depth-1), 4*nT+1);
+        }
+        else {
+          for (int i = -2*nT; i <= 2*nT ; i++) {
+            out_border[i] = 1<<(bit_depth-1);
+          }
+        }
       }
       else {
         if (!available[-2*nT]) {
@@ -269,9 +497,10 @@ void fill_border_samples(de265_image* img, int xB,int yB,
 
 
 // (8.4.4.2.3)
+template <class pixel_t>
 void intra_prediction_sample_filtering(de265_image* img,
-                                       uint8_t* p,
-                                       int nT,
+                                       pixel_t* p,
+                                       int nT, int cIdx,
                                        enum IntraPredMode intraPredMode)
 {
   int filterFlag;
@@ -293,13 +522,14 @@ void intra_prediction_sample_filtering(de265_image* img,
 
   if (filterFlag) {
     int biIntFlag = (img->sps.strong_intra_smoothing_enable_flag &&
+                     cIdx==0 &&
                      nT==32 &&
                      abs_value(p[0]+p[ 64]-2*p[ 32]) < (1<<(img->sps.bit_depth_luma-5)) &&
                      abs_value(p[0]+p[-64]-2*p[-32]) < (1<<(img->sps.bit_depth_luma-5)))
       ? 1 : 0;
 
-    uint8_t  pF_mem[2*64+1];
-    uint8_t* pF = &pF_mem[64];
+    pixel_t  pF_mem[2*64+1];
+    pixel_t* pF = &pF_mem[64];
 
     if (biIntFlag) {
       pF[-2*nT] = p[-2*nT];
@@ -323,7 +553,7 @@ void intra_prediction_sample_filtering(de265_image* img,
 
     // copy back to original array
 
-    memcpy(p-2*nT, pF-2*nT, 4*nT+1);
+    memcpy(p-2*nT, pF-2*nT, (4*nT+1) * sizeof(pixel_t));
   }
   else {
     // do nothing ?
@@ -345,27 +575,33 @@ static const int invAngle_table[25-10] =
     -315,-390,-482,-630,-910,-1638,-4096 };
 
 
-// TODO: clip to read BitDepthY
-LIBDE265_INLINE static int Clip1Y(int x) { if (x<0) return 0; else if (x>255) return 255; else return x; }
-
-
 // (8.4.4.2.6)
+template <class pixel_t>
 void intra_prediction_angular(de265_image* img,
                               int xB0,int yB0,
                               enum IntraPredMode intraPredMode,
                               int nT,int cIdx,
-                              uint8_t* border)
+                              pixel_t* border)
 {
-  uint8_t  ref_mem[2*64+1];
-  uint8_t* ref=&ref_mem[64];
+  pixel_t  ref_mem[2*64+1];
+  pixel_t* ref=&ref_mem[64];
 
-  uint8_t* pred;
+  pixel_t* pred;
   int      stride;
-  pred   = img->get_image_plane_at_pos(cIdx,xB0,yB0);
+  pred   = img->get_image_plane_at_pos_NEW<pixel_t>(cIdx,xB0,yB0);
   stride = img->get_image_stride(cIdx);
 
+  int bit_depth = img->get_bit_depth(cIdx);
+
+  assert(intraPredMode<35);
+  assert(intraPredMode>=2);
+
   int intraPredAngle = intraPredAngle_table[intraPredMode];
 
+  bool disableIntraBoundaryFilter =
+    (img->sps.range_extension.implicit_rdpcm_enabled_flag &&
+     img->get_cu_transquant_bypass(xB0,yB0));
+
   if (intraPredMode >= 18) {
 
     for (int x=0;x<=nT;x++)
@@ -398,9 +634,9 @@ void intra_prediction_angular(de265_image* img,
           }
         }
 
-    if (intraPredMode==26 && cIdx==0 && nT<32) {
+    if (intraPredMode==26 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) {
       for (int y=0;y<nT;y++) {
-        pred[0+y*stride] = Clip1Y(border[1] + ((border[-1-y] - border[0])>>1));
+        pred[0+y*stride] = Clip_BitDepth(border[1] + ((border[-1-y] - border[0])>>1), bit_depth);
       }
     }
   }
@@ -436,9 +672,9 @@ void intra_prediction_angular(de265_image* img,
           }
         }
 
-    if (intraPredMode==10 && cIdx==0 && nT<32) {  // DIFF 26->10
+    if (intraPredMode==10 && cIdx==0 && nT<32 && !disableIntraBoundaryFilter) {  // DIFF 26->10
       for (int x=0;x<nT;x++) { // DIFF (x<->y)
-        pred[x] = Clip1Y(border[-1] + ((border[1+x] - border[0])>>1)); // DIFF (x<->y && neg)
+        pred[x] = Clip_BitDepth(border[-1] + ((border[1+x] - border[0])>>1), bit_depth); // DIFF (x<->y && neg)
       }
     }
   }
@@ -456,12 +692,13 @@ void intra_prediction_angular(de265_image* img,
 }
 
 
+template <class pixel_t>
 void intra_prediction_planar(de265_image* img,int xB0,int yB0,int nT,int cIdx,
-                             uint8_t* border)
+                             pixel_t* border)
 {
-  uint8_t* pred;
+  pixel_t* pred;
   int      stride;
-  pred = img->get_image_plane_at_pos(cIdx,xB0,yB0);
+  pred   = img->get_image_plane_at_pos_NEW<pixel_t>(cIdx,xB0,yB0);
   stride = img->get_image_stride(cIdx);
 
   int Log2_nT = Log2(nT);
@@ -486,12 +723,13 @@ void intra_prediction_planar(de265_image* img,int xB0,int yB0,int nT,int cIdx,
 }
 
 
+template <class pixel_t>
 void intra_prediction_DC(de265_image* img,int xB0,int yB0,int nT,int cIdx,
-                         uint8_t* border)
+                         pixel_t* border)
 {
-  uint8_t* pred;
+  pixel_t* pred;
   int      stride;
-  pred = img->get_image_plane_at_pos(cIdx,xB0,yB0);
+  pred   = img->get_image_plane_at_pos_NEW<pixel_t>(cIdx,xB0,yB0);
   stride = img->get_image_stride(cIdx);
 
   int Log2_nT = Log2(nT);
@@ -525,41 +763,34 @@ void intra_prediction_DC(de265_image* img,int xB0,int yB0,int nT,int cIdx,
   }
 
 
-  /*
-  printf("INTRAPRED DC\n");
-  for (int y=0;y<nT;y++) {
-    for (int x=0;x<nT;x++)
-      {
-        printf("%d ",pred[x+y*stride]);
-      }
-    printf("\n");
-  }
-  */
+  logtrace(LogIntraPred,"INTRAPRED DC\n");
+  for (int y=0;y<nT;y++)
+    {
+      for (int x=0;x<nT;x++)
+        logtrace(LogIntraPred,"%02x ", pred[x+y*stride]);
+
+      logtrace(LogIntraPred,"\n");
+    }
 }
 
 
 
-// (8.4.4.2.1)
-void decode_intra_prediction(de265_image* img,
-                             int xB0,int yB0,
-                             enum IntraPredMode intraPredMode,
-                             int nT, int cIdx)
+template <class pixel_t>
+void decode_intra_prediction_internal(de265_image* img,
+                                      int xB0,int yB0,
+                                      enum IntraPredMode intraPredMode,
+                                      int nT, int cIdx)
 {
-  logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
-           xB0,yB0, intraPredMode, nT,cIdx);
-  /*
-    printf("decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
-    xB0,yB0, intraPredMode, nT,cIdx);
-  */
-
-  uint8_t  border_pixels_mem[2*64+1];
-  uint8_t* border_pixels = &border_pixels_mem[64];
+  pixel_t  border_pixels_mem[2*64+1];
+  pixel_t* border_pixels = &border_pixels_mem[64];
 
   fill_border_samples(img, xB0,yB0, nT, cIdx, border_pixels);
 
-  if (cIdx==0) {
-    intra_prediction_sample_filtering(img, border_pixels, nT, intraPredMode);
-  }
+  if (img->sps.range_extension.intra_smoothing_disabled_flag == 0 &&
+      (cIdx==0 || img->sps.ChromaArrayType==CHROMA_444))
+    {
+      intra_prediction_sample_filtering(img, border_pixels, nT, cIdx, intraPredMode);
+    }
 
 
   switch (intraPredMode) {
@@ -576,3 +807,23 @@ void decode_intra_prediction(de265_image* img,
 }
 
 
+// (8.4.4.2.1)
+void decode_intra_prediction(de265_image* img,
+                             int xB0,int yB0,
+                             enum IntraPredMode intraPredMode,
+                             int nT, int cIdx)
+{
+  // Trace the request, then dispatch to the templated implementation:
+  // uint16_t when img->high_bit_depth(cIdx) is true, uint8_t otherwise.
+  logtrace(LogIntraPred,"decode_intra_prediction xy0:%d/%d mode=%d nT=%d, cIdx=%d\n",
+           xB0,yB0, intraPredMode, nT,cIdx);
+
+  const bool wideSamples = img->high_bit_depth(cIdx);
+
+  if (!wideSamples) {
+    decode_intra_prediction_internal<uint8_t>(img,xB0,yB0, intraPredMode,nT,cIdx);
+    return;
+  }
+
+  decode_intra_prediction_internal<uint16_t>(img,xB0,yB0, intraPredMode,nT,cIdx);
+}
diff --git a/libde265/intrapred.h b/libde265/intrapred.h
index 1d3865e..6480938 100644
--- a/libde265/intrapred.h
+++ b/libde265/intrapred.h
@@ -25,6 +25,50 @@
 
 extern const int intraPredAngle_table[1+34];
 
+
+/* Fill the three intra-pred-mode candidates into candModeList.
+   Block position is (x,y) and you also have to give the PUidx for this
+   block (which is (x>>Log2MinPUSize) + (y>>Log2MinPUSize)*PicWidthInMinPUs).
+   availableA/B is the output of check_CTB_available().
+ */
+void fillIntraPredModeCandidates(int candModeList[3],
+                                 int x,int y, int PUidx,
+                                 bool availableA, // left
+                                 bool availableB, // top
+                                 const de265_image* img);
+
+
+inline void fillIntraPredModeCandidates(int candModeList[3], int x,int y,
+                                 bool availableA, // left
+                                 bool availableB, // top
+                                 const de265_image* img)
+{
+  // Convenience overload: obtains the PU index for (x,y) from the SPS and forwards.
+  fillIntraPredModeCandidates(candModeList, x,y, img->sps.getPUIndexRS(x,y), availableA,availableB, img);
+}
+
+
+
+/* Return value >= 0 -> use mpm_idx(return value)
+   else              -> use rem_intra(-return value-1)
+
+   This function may modify the candModeList !
+ */
+int find_intra_pred_mode(enum IntraPredMode mode,
+                         int candModeList[3]);
+
+void list_chroma_pred_candidates(enum IntraPredMode chroma_mode[5],
+                                 enum IntraPredMode luma_mode);
+
+int get_intra_scan_idx(int log2TrafoSize, enum IntraPredMode intraPredMode, int cIdx,
+                       const seq_parameter_set* sps);
+
+int get_intra_scan_idx_luma  (int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED
+int get_intra_scan_idx_chroma(int log2TrafoSize, enum IntraPredMode intraPredMode); // DEPRECATED
+
+enum IntraPredMode lumaPredMode_to_chromaPredMode(enum IntraPredMode luma,
+                                                  enum IntraChromaPredMode chroma);
+
 void decode_intra_block(decoder_context* ctx,
                         thread_context* tctx,
                         int cIdx,
@@ -34,8 +78,8 @@ void decode_intra_block(decoder_context* ctx,
                         enum IntraPredMode intraPredMode,
                         bool transform_skip_flag);
 
-void fill_border_samples(decoder_context* ctx, int xB,int yB,
-                         int nT, int cIdx, uint8_t* out_border);
+//void fill_border_samples(decoder_context* ctx, int xB,int yB,
+//                         int nT, int cIdx, uint8_t* out_border);
 
 void decode_intra_prediction(de265_image* img,
                              int xB0,int yB0,
diff --git a/libde265/motion.cc b/libde265/motion.cc
index 9171244..1630daa 100644
--- a/libde265/motion.cc
+++ b/libde265/motion.cc
@@ -39,50 +39,19 @@
 #define MAX_CU_SIZE 64
 
 
-enum {
-  // important! order like shown in 8.5.3.1.1
-  PRED_A1  = 0,
-  PRED_B1  = 1,
-  PRED_B0  = 2,
-  PRED_A0  = 3,
-  PRED_B2  = 4,
-  PRED_COL = 5,
-  PRED_ZERO= 6
-};
-
-
-typedef struct
-{
-  uint8_t available[7];
-  PredVectorInfo pred_vector[7];
-} MergingCandidates;
-
-
-void reset_pred_vector(PredVectorInfo* pvec)
-{
-  for (int X=0;X<2;X++) {
-    pvec->mv[X].x = 0;
-    pvec->mv[X].y = 0;
-    pvec->refIdx[X] = -1;
-    pvec->predFlag[X] = 0;
-  }
-}
-
-
 static int extra_before[4] = { 0,3,3,2 };
 static int extra_after [4] = { 0,3,4,4 };
 
 
 
-void mc_luma(const decoder_context* ctx,
-             const de265_image* img, int mv_x, int mv_y,
+template <class pixel_t>
+void mc_luma(const base_context* ctx,
+             const seq_parameter_set* sps, int mv_x, int mv_y,
              int xP,int yP,
              int16_t* out, int out_stride,
-             uint8_t* ref, int ref_stride,
-             int nPbW, int nPbH)
+             const pixel_t* ref, int ref_stride,
+             int nPbW, int nPbH, int bitDepth_L)
 {
-  const seq_parameter_set* sps = &img->sps;
-
   int xFracL = mv_x & 3;
   int yFracL = mv_y & 3;
 
@@ -101,25 +70,22 @@ void mc_luma(const decoder_context* ctx,
   ALIGNED_16(int16_t) mcbuffer[MAX_CU_SIZE * (MAX_CU_SIZE+7)];
 
   if (xFracL==0 && yFracL==0) {
-    if (xIntOffsL >= 0 && yIntOffsL >= 0 &&
-        nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) {
-    }
 
     if (xIntOffsL >= 0 && yIntOffsL >= 0 &&
         nPbW+xIntOffsL <= w && nPbH+yIntOffsL <= h) {
 
-      ctx->acceleration.put_hevc_qpel_8[0][0](out, out_stride,
-                                          &ref[yIntOffsL*ref_stride + xIntOffsL],
-                                          ref_stride,
-                                          nPbW,nPbH, mcbuffer);
+      ctx->acceleration.put_hevc_qpel(out, out_stride,
+                                      &ref[yIntOffsL*ref_stride + xIntOffsL],
+                                      ref_stride /* sizeof(pixel_t)*/,
+                                      nPbW,nPbH, mcbuffer, 0,0, bitDepth_L);
     }
     else {
       for (int y=0;y<nPbH;y++)
         for (int x=0;x<nPbW;x++) {
-        
+
           int xA = Clip3(0,w-1,x + xIntOffsL);
           int yA = Clip3(0,h-1,y + yIntOffsL);
-        
+
           out[y*out_stride+x] = ref[ xA + yA*ref_stride ] << shift3;
         }
     }
@@ -129,10 +95,10 @@ void mc_luma(const decoder_context* ctx,
 
     for (int y=0;y<nPbH;y++) {
       for (int x=0;x<nPbW;x++) {
-        
+
         int xA = Clip3(0,w-1,x + xIntOffsL);
         int yA = Clip3(0,h-1,y + yIntOffsL);
-        
+
         logtrace(LogMotion,"%02x ", ref[ xA + yA*ref_stride ]);
       }
       logtrace(LogMotion,"\n");
@@ -142,7 +108,7 @@ void mc_luma(const decoder_context* ctx,
 
     for (int y=0;y<nPbH;y++) {
       for (int x=0;x<nPbW;x++) {
-        
+
         logtrace(LogMotion,"%02x ",out[y*out_stride+x] >> 6); // 6 will be used when summing predictions
       }
       logtrace(LogMotion,"\n");
@@ -159,9 +125,9 @@ void mc_luma(const decoder_context* ctx,
     //int nPbH_extra = extra_top  + nPbH + extra_bottom;
 
 
-    uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)];
+    pixel_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+7)];
 
-    uint8_t* src_ptr;
+    const pixel_t* src_ptr;
     int src_stride;
 
     if (-extra_left + xIntOffsL >= 0 &&
@@ -174,10 +140,10 @@ void mc_luma(const decoder_context* ctx,
     else {
       for (int y=-extra_top;y<nPbH+extra_bottom;y++) {
         for (int x=-extra_left;x<nPbW+extra_right;x++) {
-        
+
           int xA = Clip3(0,w-1,x + xIntOffsL);
           int yA = Clip3(0,h-1,y + yIntOffsL);
-        
+
           padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = ref[ xA + yA*ref_stride ];
         }
       }
@@ -186,9 +152,9 @@ void mc_luma(const decoder_context* ctx,
       src_stride = MAX_CU_SIZE+16;
     }
 
-    ctx->acceleration.put_hevc_qpel_8[xFracL][yFracL](out, out_stride,
-                                                  src_ptr, src_stride,
-                                                  nPbW,nPbH, mcbuffer);
+    ctx->acceleration.put_hevc_qpel(out, out_stride,
+                                    src_ptr, src_stride /* sizeof(pixel_t) */,
+                                    nPbW,nPbH, mcbuffer, xFracL,yFracL, bitDepth_L);
 
 
     logtrace(LogMotion,"---V---\n");
@@ -203,16 +169,15 @@ void mc_luma(const decoder_context* ctx,
 
 
 
-void mc_chroma(const decoder_context* ctx,
-               const de265_image* img,
+template <class pixel_t>
+void mc_chroma(const base_context* ctx,
+               const seq_parameter_set* sps,
                int mv_x, int mv_y,
                int xP,int yP,
                int16_t* out, int out_stride,
-               uint8_t* ref, int ref_stride,
-               int nPbWC, int nPbHC)
+               const pixel_t* ref, int ref_stride,
+               int nPbWC, int nPbHC, int bit_depth_C)
 {
-  const seq_parameter_set* sps = &img->sps;
-
   // chroma sample interpolation process (8.5.3.2.2.2)
 
   //const int shift1 = sps->BitDepth_C-8;
@@ -222,20 +187,23 @@ void mc_chroma(const decoder_context* ctx,
   int wC = sps->pic_width_in_luma_samples /sps->SubWidthC;
   int hC = sps->pic_height_in_luma_samples/sps->SubHeightC;
 
+  mv_x *= 2 / sps->SubWidthC;
+  mv_y *= 2 / sps->SubHeightC;
+
   int xFracC = mv_x & 7;
   int yFracC = mv_y & 7;
 
-  int xIntOffsC = xP/2 + (mv_x>>3);
-  int yIntOffsC = yP/2 + (mv_y>>3);
+  int xIntOffsC = xP/sps->SubWidthC  + (mv_x>>3);
+  int yIntOffsC = yP/sps->SubHeightC + (mv_y>>3);
 
   ALIGNED_32(int16_t mcbuffer[MAX_CU_SIZE*(MAX_CU_SIZE+7)]);
 
   if (xFracC == 0 && yFracC == 0) {
     if (xIntOffsC>=0 && nPbWC+xIntOffsC<=wC &&
         yIntOffsC>=0 && nPbHC+yIntOffsC<=hC) {
-      ctx->acceleration.put_hevc_epel_8(out, out_stride,
-                                    &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride,
-                                    nPbWC,nPbHC, 0,0, NULL);
+      ctx->acceleration.put_hevc_epel(out, out_stride,
+                                      &ref[xIntOffsC + yIntOffsC*ref_stride], ref_stride,
+                                      nPbWC,nPbHC, 0,0, NULL, bit_depth_C);
     }
     else
       {
@@ -250,9 +218,9 @@ void mc_chroma(const decoder_context* ctx,
       }
   }
   else {
-    uint8_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+3)];
+    pixel_t padbuf[(MAX_CU_SIZE+16)*(MAX_CU_SIZE+3)];
 
-    uint8_t* src_ptr;
+    const pixel_t* src_ptr;
     int src_stride;
 
     int extra_top  = 1;
@@ -268,10 +236,10 @@ void mc_chroma(const decoder_context* ctx,
     else {
       for (int y=-extra_top;y<nPbHC+extra_bottom;y++) {
         for (int x=-extra_left;x<nPbWC+extra_right;x++) {
-        
+
           int xA = Clip3(0,wC-1,x + xIntOffsC);
           int yA = Clip3(0,hC-1,y + yIntOffsC);
-        
+
           padbuf[x+extra_left + (y+extra_top)*(MAX_CU_SIZE+16)] = ref[ xA + yA*ref_stride ];
         }
       }
@@ -282,19 +250,19 @@ void mc_chroma(const decoder_context* ctx,
 
 
     if (xFracC && yFracC) {
-      ctx->acceleration.put_hevc_epel_hv_8(out, out_stride,
-                                       src_ptr, src_stride,
-                                       nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
+      ctx->acceleration.put_hevc_epel_hv(out, out_stride,
+                                         src_ptr, src_stride,
+                                         nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C);
     }
     else if (xFracC) {
-      ctx->acceleration.put_hevc_epel_h_8(out, out_stride,
-                                      src_ptr, src_stride,
-                                      nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
+      ctx->acceleration.put_hevc_epel_h(out, out_stride,
+                                        src_ptr, src_stride,
+                                        nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C);
     }
     else if (yFracC) {
-      ctx->acceleration.put_hevc_epel_v_8(out, out_stride,
-                                      src_ptr, src_stride,
-                                      nPbWC,nPbHC, xFracC,yFracC, mcbuffer);
+      ctx->acceleration.put_hevc_epel_v(out, out_stride,
+                                        src_ptr, src_stride,
+                                        nPbWC,nPbHC, xFracC,yFracC, mcbuffer, bit_depth_C);
     }
     else {
       assert(false); // full-pel shifts are handled above
@@ -306,24 +274,45 @@ void mc_chroma(const decoder_context* ctx,
 
 // 8.5.3.2
 // NOTE: for full-pel shifts, we can introduce a fast path, simply copying without shifts
-void generate_inter_prediction_samples(decoder_context* ctx,
+void generate_inter_prediction_samples(base_context* ctx,
+                                       const slice_segment_header* shdr,
                                        de265_image* img,
-                                       slice_segment_header* shdr,
                                        int xC,int yC,
                                        int xB,int yB,
                                        int nCS, int nPbW,int nPbH,
-                                       const VectorInfo* vi)
+                                       const MotionVectorSpec* vi)
 {
+  int xP = xC+xB;
+  int yP = yC+yB;
+
+  void*  pixels[3];
+  int    stride[3];
+
+  const int SubWidthC  = img->sps.SubWidthC;
+  const int SubHeightC = img->sps.SubHeightC;
+
+  pixels[0] = img->get_image_plane_at_pos_any_depth(0,xP,yP);
+  stride[0] = img->get_image_stride(0);
+
+  pixels[1] = img->get_image_plane_at_pos_any_depth(1,xP/SubWidthC,yP/SubHeightC);
+  stride[1] = img->get_image_stride(1);
+
+  pixels[2] = img->get_image_plane_at_pos_any_depth(2,xP/SubWidthC,yP/SubHeightC);
+  stride[2] = img->get_image_stride(2);
+
+
   ALIGNED_16(int16_t) predSamplesL                 [2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
   ALIGNED_16(int16_t) predSamplesC[2 /* chroma */ ][2 /* LX */][MAX_CU_SIZE* MAX_CU_SIZE];
 
-  int xP = xC+xB;
-  int yP = yC+yB;
+  //int xP = xC+xB;
+  //int yP = yC+yB;
 
   int predFlag[2];
-  predFlag[0] = vi->lum.predFlag[0];
-  predFlag[1] = vi->lum.predFlag[1];
+  predFlag[0] = vi->predFlag[0];
+  predFlag[1] = vi->predFlag[1];
 
+  const int bit_depth_L = img->sps.BitDepth_Y;
+  const int bit_depth_C = img->sps.BitDepth_C;
 
   // Some encoders use bi-prediction with two similar MVs.
   // Identify this case and use only one MV.
@@ -331,10 +320,10 @@ void generate_inter_prediction_samples(decoder_context* ctx,
   // do this only without weighted prediction, because the weights/offsets may be different
   if (img->pps.weighted_pred_flag==0) {
     if (predFlag[0] && predFlag[1]) {
-      if (vi->lum.mv[0].x == vi->lum.mv[1].x &&
-          vi->lum.mv[0].y == vi->lum.mv[1].y &&
-          shdr->RefPicList[0][vi->lum.refIdx[0]] ==
-          shdr->RefPicList[1][vi->lum.refIdx[1]]) {
+      if (vi->mv[0].x == vi->mv[1].x &&
+          vi->mv[0].y == vi->mv[1].y &&
+          shdr->RefPicList[0][vi->refIdx[0]] ==
+          shdr->RefPicList[1][vi->refIdx[1]]) {
         predFlag[1] = 0;
       }
     }
@@ -345,40 +334,60 @@ void generate_inter_prediction_samples(decoder_context* ctx,
     if (predFlag[l]) {
       // 8.5.3.2.1
 
-      if (vi->lum.refIdx[l] >= MAX_NUM_REF_PICS) {
+      if (vi->refIdx[l] >= MAX_NUM_REF_PICS) {
         img->integrity = INTEGRITY_DECODING_ERRORS;
         ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
         return;
       }
 
-      de265_image* refPic;
-      refPic = ctx->get_image(shdr->RefPicList[l][vi->lum.refIdx[l]]);
+      const de265_image* refPic = ctx->get_image(shdr->RefPicList[l][vi->refIdx[l]]);
 
-      logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->lum.refIdx[l], shdr->RefPicList[l][vi->lum.refIdx[l]]);
+      logtrace(LogMotion, "refIdx: %d -> dpb[%d]\n", vi->refIdx[l], shdr->RefPicList[l][vi->refIdx[l]]);
 
       if (refPic->PicState == UnusedForReference) {
         img->integrity = INTEGRITY_DECODING_ERRORS;
         ctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED, false);
+
+        // TODO: fill predSamplesC with black or grey
       }
       else {
         // 8.5.3.2.2
 
         logtrace(LogMotion,"do MC: L%d,MV=%d;%d RefPOC=%d\n",
-                 l,vi->lum.mv[l].x,vi->lum.mv[l].y,refPic->PicOrderCntVal);
+                 l,vi->mv[l].x,vi->mv[l].y,refPic->PicOrderCntVal);
 
 
         // TODO: must predSamples stride really be nCS or can it be somthing smaller like nPbW?
-        mc_luma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
-                predSamplesL[l],nCS,
-                refPic->get_image_plane(0),refPic->get_luma_stride(), nPbW,nPbH);
 
+        if (img->high_bit_depth(0)) {
+          mc_luma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                  predSamplesL[l],nCS,
+                  (const uint16_t*)refPic->get_image_plane(0),
+                  refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L);
+        }
+        else {
+          mc_luma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                  predSamplesL[l],nCS,
+                  (const uint8_t*)refPic->get_image_plane(0),
+                  refPic->get_luma_stride(), nPbW,nPbH, bit_depth_L);
+        }
 
-        mc_chroma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
-                  predSamplesC[0][l],nCS, refPic->get_image_plane(1),
-                  refPic->get_chroma_stride(), nPbW/2,nPbH/2);
-        mc_chroma(ctx, img, vi->lum.mv[l].x, vi->lum.mv[l].y, xP,yP,
-                  predSamplesC[1][l],nCS, refPic->get_image_plane(2),
-                  refPic->get_chroma_stride(), nPbW/2,nPbH/2);
+        if (img->high_bit_depth(0)) {
+          mc_chroma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                    predSamplesC[0][l],nCS, (const uint16_t*)refPic->get_image_plane(1),
+                    refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+          mc_chroma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                    predSamplesC[1][l],nCS, (const uint16_t*)refPic->get_image_plane(2),
+                    refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+        }
+        else {
+          mc_chroma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                    predSamplesC[0][l],nCS, (const uint8_t*)refPic->get_image_plane(1),
+                    refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+          mc_chroma(ctx, &img->sps, vi->mv[l].x, vi->mv[l].y, xP,yP,
+                    predSamplesC[1][l],nCS, (const uint8_t*)refPic->get_image_plane(2),
+                    refPic->get_chroma_stride(), nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+        }
       }
     }
   }
@@ -386,23 +395,54 @@ void generate_inter_prediction_samples(decoder_context* ctx,
 
   // weighted sample prediction  (8.5.3.2.3)
 
-  //const int shift1 = 6; // TODO
-  //const int offset1= 1<<(shift1-1);
+  const int shift1_L = libde265_max(2,14-img->sps.BitDepth_Y);
+  const int offset_shift1_L = img->sps.WpOffsetBdShiftY;
+  const int shift1_C = libde265_max(2,14-img->sps.BitDepth_C);
+  const int offset_shift1_C = img->sps.WpOffsetBdShiftC;
+
+  /*
+  const int shift1_L = 14-img->sps.BitDepth_Y;
+  const int offset_shift1_L = img->sps.BitDepth_Y-8;
+  const int shift1_C = 14-img->sps.BitDepth_C;
+  const int offset_shift1_C = img->sps.BitDepth_C-8;
+  */
+
+  /*
+  if (0)
+  printf("%d/%d %d/%d %d/%d %d/%d\n",
+         shift1_L,
+         Nshift1_L,
+         offset_shift1_L,
+         Noffset_shift1_L,
+         shift1_C,
+         Nshift1_C,
+         offset_shift1_C,
+         Noffset_shift1_C);
+
+  assert(shift1_L==
+         Nshift1_L);
+  assert(offset_shift1_L==
+         Noffset_shift1_L);
+  assert(shift1_C==
+         Nshift1_C);
+  assert(offset_shift1_C==
+         Noffset_shift1_C);
+  */
+
 
   logtrace(LogMotion,"predFlags (modified): %d %d\n", predFlag[0], predFlag[1]);
 
   if (shdr->slice_type == SLICE_TYPE_P) {
     if (img->pps.weighted_pred_flag==0) {
       if (predFlag[0]==1 && predFlag[1]==0) {
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
-                                                img->get_image_stride(0),
-                                                predSamplesL[0],nCS, nPbW,nPbH);
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
-                                                img->get_image_stride(1),
-                                                predSamplesC[0][0],nCS, nPbW/2,nPbH/2);
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
-                                                img->get_image_stride(2),
-                                                predSamplesC[1][0],nCS, nPbW/2,nPbH/2);
+        ctx->acceleration.put_unweighted_pred(pixels[0], stride[0],
+                                              predSamplesL[0],nCS, nPbW,nPbH, bit_depth_L);
+        ctx->acceleration.put_unweighted_pred(pixels[1], stride[1],
+                                              predSamplesC[0][0],nCS,
+                                              nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+        ctx->acceleration.put_unweighted_pred(pixels[2], stride[2],
+                                              predSamplesC[1][0],nCS,
+                                              nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
       }
       else {
         ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
@@ -414,33 +454,30 @@ void generate_inter_prediction_samples(decoder_context* ctx,
 
       if (predFlag[0]==1 && predFlag[1]==0) {
 
-        int refIdx0 = vi->lum.refIdx[0];
+        int refIdx0 = vi->refIdx[0];
 
-        int luma_log2WD   = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
-        int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
+        int luma_log2WD   = shdr->luma_log2_weight_denom + shift1_L;
+        int chroma_log2WD = shdr->ChromaLog2WeightDenom  + shift1_C;
 
         int luma_w0 = shdr->LumaWeight[0][refIdx0];
-        int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
+        int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L));
 
         int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
-        int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
+        int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C));
         int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
-        int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
+        int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C));
 
         logtrace(LogMotion,"weighted-0 [%d] %d %d %d  %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
 
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
-                                              img->get_image_stride(0),
-                                              predSamplesL[0],nCS, nPbW,nPbH,
-                                              luma_w0, luma_o0, luma_log2WD);
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
-                                              img->get_image_stride(1),
-                                              predSamplesC[0][0],nCS, nPbW/2,nPbH/2,
-                                              chroma0_w0, chroma0_o0, chroma_log2WD);
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
-                                              img->get_image_stride(2),
-                                              predSamplesC[1][0],nCS, nPbW/2,nPbH/2,
-                                              chroma1_w0, chroma1_o0, chroma_log2WD);
+        ctx->acceleration.put_weighted_pred(pixels[0], stride[0],
+                                            predSamplesL[0],nCS, nPbW,nPbH,
+                                            luma_w0, luma_o0, luma_log2WD, bit_depth_L);
+        ctx->acceleration.put_weighted_pred(pixels[1], stride[1],
+                                            predSamplesC[0][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC,
+                                            chroma0_w0, chroma0_o0, chroma_log2WD, bit_depth_C);
+        ctx->acceleration.put_weighted_pred(pixels[2], stride[2],
+                                            predSamplesC[1][0],nCS, nPbW/SubWidthC,nPbH/SubHeightC,
+                                            chroma1_w0, chroma1_o0, chroma_log2WD, bit_depth_C);
       }
       else {
         ctx->add_warning(DE265_WARNING_BOTH_PREDFLAGS_ZERO, false);
@@ -458,120 +495,114 @@ void generate_inter_prediction_samples(decoder_context* ctx,
 
         int16_t* in0 = predSamplesL[0];
         int16_t* in1 = predSamplesL[1];
-        uint8_t* out = img->get_image_plane_at_pos(0, xP,yP);
 
-        ctx->acceleration.put_weighted_pred_avg_8(out, img->get_luma_stride(),
-                                              in0,in1, nCS, nPbW, nPbH);
+        ctx->acceleration.put_weighted_pred_avg(pixels[0], stride[0],
+                                                in0,in1, nCS, nPbW, nPbH, bit_depth_L);
 
         int16_t* in00 = predSamplesC[0][0];
         int16_t* in01 = predSamplesC[0][1];
         int16_t* in10 = predSamplesC[1][0];
         int16_t* in11 = predSamplesC[1][1];
-        uint8_t* out0 = img->get_image_plane_at_pos(1,xP/2,yP/2);
-        uint8_t* out1 = img->get_image_plane_at_pos(2,xP/2,yP/2);
-      
-        ctx->acceleration.put_weighted_pred_avg_8(out0, img->get_chroma_stride(),
-                                              in00,in01, nCS, nPbW/2, nPbH/2);
-        ctx->acceleration.put_weighted_pred_avg_8(out1, img->get_chroma_stride(),
-                                              in10,in11, nCS, nPbW/2, nPbH/2);
+
+        ctx->acceleration.put_weighted_pred_avg(pixels[1], stride[1],
+                                                in00,in01, nCS,
+                                                nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C);
+        ctx->acceleration.put_weighted_pred_avg(pixels[2], stride[2],
+                                                in10,in11, nCS,
+                                                nPbW/SubWidthC, nPbH/SubHeightC, bit_depth_C);
       }
       else {
         // weighted prediction
 
-        int refIdx0 = vi->lum.refIdx[0];
-        int refIdx1 = vi->lum.refIdx[1];
+        int refIdx0 = vi->refIdx[0];
+        int refIdx1 = vi->refIdx[1];
 
-        int luma_log2WD   = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
-        int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
+        int luma_log2WD   = shdr->luma_log2_weight_denom + shift1_L;
+        int chroma_log2WD = shdr->ChromaLog2WeightDenom + shift1_C;
 
         int luma_w0 = shdr->LumaWeight[0][refIdx0];
-        int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(8-8)); // TODO: bitDepth
+        int luma_o0 = shdr->luma_offset[0][refIdx0] * (1<<(offset_shift1_L));
         int luma_w1 = shdr->LumaWeight[1][refIdx1];
-        int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(8-8)); // TODO: bitDepth
+        int luma_o1 = shdr->luma_offset[1][refIdx1] * (1<<(offset_shift1_L));
 
         int chroma0_w0 = shdr->ChromaWeight[0][refIdx0][0];
-        int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(8-8)); // TODO: bitDepth
+        int chroma0_o0 = shdr->ChromaOffset[0][refIdx0][0] * (1<<(offset_shift1_C));
         int chroma1_w0 = shdr->ChromaWeight[0][refIdx0][1];
-        int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(8-8)); // TODO: bitDepth
+        int chroma1_o0 = shdr->ChromaOffset[0][refIdx0][1] * (1<<(offset_shift1_C));
         int chroma0_w1 = shdr->ChromaWeight[1][refIdx1][0];
-        int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(8-8)); // TODO: bitDepth
+        int chroma0_o1 = shdr->ChromaOffset[1][refIdx1][0] * (1<<(offset_shift1_C));
         int chroma1_w1 = shdr->ChromaWeight[1][refIdx1][1];
-        int chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(8-8)); // TODO: bitDepth
+        int chroma1_o1 = shdr->ChromaOffset[1][refIdx1][1] * (1<<(offset_shift1_C));
 
         logtrace(LogMotion,"weighted-BI-0 [%d] %d %d %d  %dx%d\n", refIdx0, luma_log2WD-6,luma_w0,luma_o0,nPbW,nPbH);
         logtrace(LogMotion,"weighted-BI-1 [%d] %d %d %d  %dx%d\n", refIdx1, luma_log2WD-6,luma_w1,luma_o1,nPbW,nPbH);
 
         int16_t* in0 = predSamplesL[0];
         int16_t* in1 = predSamplesL[1];
-        uint8_t* out = img->get_image_plane_at_pos(0, xP,yP);
 
-        ctx->acceleration.put_weighted_bipred_8(out, img->get_luma_stride(),
-                                            in0,in1, nCS, nPbW, nPbH,
-                                            luma_w0,luma_o0,
-                                            luma_w1,luma_o1,
-                                            luma_log2WD);
+        ctx->acceleration.put_weighted_bipred(pixels[0], stride[0],
+                                              in0,in1, nCS, nPbW, nPbH,
+                                              luma_w0,luma_o0,
+                                              luma_w1,luma_o1,
+                                              luma_log2WD, bit_depth_L);
 
         int16_t* in00 = predSamplesC[0][0];
         int16_t* in01 = predSamplesC[0][1];
         int16_t* in10 = predSamplesC[1][0];
         int16_t* in11 = predSamplesC[1][1];
-        uint8_t* out0 = img->get_image_plane_at_pos(1,xP/2,yP/2);
-        uint8_t* out1 = img->get_image_plane_at_pos(2,xP/2,yP/2);
-      
-        ctx->acceleration.put_weighted_bipred_8(out0, img->get_chroma_stride(),
-                                            in00,in01, nCS, nPbW/2, nPbH/2,
-                                            chroma0_w0,chroma0_o0,
-                                            chroma0_w1,chroma0_o1,
-                                            chroma_log2WD);
-        ctx->acceleration.put_weighted_bipred_8(out1, img->get_chroma_stride(),
-                                            in10,in11, nCS, nPbW/2, nPbH/2,
-                                            chroma1_w0,chroma1_o0,
-                                            chroma1_w1,chroma1_o1,
-                                            chroma_log2WD);
+
+        ctx->acceleration.put_weighted_bipred(pixels[1], stride[1],
+                                              in00,in01, nCS, nPbW/SubWidthC, nPbH/SubHeightC,
+                                              chroma0_w0,chroma0_o0,
+                                              chroma0_w1,chroma0_o1,
+                                              chroma_log2WD, bit_depth_C);
+        ctx->acceleration.put_weighted_bipred(pixels[2], stride[2],
+                                              in10,in11, nCS, nPbW/SubWidthC, nPbH/SubHeightC,
+                                              chroma1_w0,chroma1_o0,
+                                              chroma1_w1,chroma1_o1,
+                                              chroma_log2WD, bit_depth_C);
       }
     }
     else if (predFlag[0]==1 || predFlag[1]==1) {
       int l = predFlag[0] ? 0 : 1;
 
       if (img->pps.weighted_bipred_flag==0) {
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
-                                                img->get_image_stride(0),
-                                                predSamplesL[l],nCS, nPbW,nPbH);
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
-                                                img->get_image_stride(1),
-                                                predSamplesC[0][l],nCS, nPbW/2,nPbH/2);
-        ctx->acceleration.put_unweighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
-                                                img->get_image_stride(2),
-                                                predSamplesC[1][l],nCS, nPbW/2,nPbH/2);
+        ctx->acceleration.put_unweighted_pred(pixels[0], stride[0],
+                                              predSamplesL[l],nCS, nPbW,nPbH, bit_depth_L);
+        ctx->acceleration.put_unweighted_pred(pixels[1], stride[1],
+                                              predSamplesC[0][l],nCS,
+                                              nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
+        ctx->acceleration.put_unweighted_pred(pixels[2], stride[2],
+                                              predSamplesC[1][l],nCS,
+                                              nPbW/SubWidthC,nPbH/SubHeightC, bit_depth_C);
       }
       else {
-        int refIdx = vi->lum.refIdx[l];
+        int refIdx = vi->refIdx[l];
 
-        int luma_log2WD   = shdr->luma_log2_weight_denom + (14-8); // TODO: bitDepth
-        int chroma_log2WD = shdr->ChromaLog2WeightDenom + (14-8); // TODO: bitDepth
+        int luma_log2WD   = shdr->luma_log2_weight_denom + shift1_L;
+        int chroma_log2WD = shdr->ChromaLog2WeightDenom  + shift1_C;
 
         int luma_w = shdr->LumaWeight[l][refIdx];
-        int luma_o = shdr->luma_offset[l][refIdx] * (1<<(8-8)); // TODO: bitDepth
+        int luma_o = shdr->luma_offset[l][refIdx] * (1<<(offset_shift1_L));
 
         int chroma0_w = shdr->ChromaWeight[l][refIdx][0];
-        int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(8-8)); // TODO: bitDepth
+        int chroma0_o = shdr->ChromaOffset[l][refIdx][0] * (1<<(offset_shift1_C));
         int chroma1_w = shdr->ChromaWeight[l][refIdx][1];
-        int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(8-8)); // TODO: bitDepth
+        int chroma1_o = shdr->ChromaOffset[l][refIdx][1] * (1<<(offset_shift1_C));
 
         logtrace(LogMotion,"weighted-B-L%d [%d] %d %d %d  %dx%d\n", l, refIdx, luma_log2WD-6,luma_w,luma_o,nPbW,nPbH);
 
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(0,xP,yP),
-                                              img->get_image_stride(0),
-                                              predSamplesL[l],nCS, nPbW,nPbH,
-                                              luma_w, luma_o, luma_log2WD);
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(1,xP/2,yP/2),
-                                              img->get_image_stride(1),
-                                              predSamplesC[0][l],nCS, nPbW/2,nPbH/2,
-                                              chroma0_w, chroma0_o, chroma_log2WD);
-        ctx->acceleration.put_weighted_pred_8(img->get_image_plane_at_pos(2,xP/2,yP/2),
-                                              img->get_image_stride(2),
-                                              predSamplesC[1][l],nCS, nPbW/2,nPbH/2,
-                                              chroma1_w, chroma1_o, chroma_log2WD);
+        ctx->acceleration.put_weighted_pred(pixels[0], stride[0],
+                                            predSamplesL[l],nCS, nPbW,nPbH,
+                                            luma_w, luma_o, luma_log2WD, bit_depth_L);
+        ctx->acceleration.put_weighted_pred(pixels[1], stride[1],
+                                            predSamplesC[0][l],nCS,
+                                            nPbW/SubWidthC,nPbH/SubHeightC,
+                                            chroma0_w, chroma0_o, chroma_log2WD, bit_depth_C);
+        ctx->acceleration.put_weighted_pred(pixels[2], stride[2],
+                                            predSamplesC[1][l],nCS,
+                                            nPbW/SubWidthC,nPbH/SubHeightC,
+                                            chroma1_w, chroma1_o, chroma_log2WD, bit_depth_C);
       }
     }
     else {
@@ -590,11 +621,11 @@ void generate_inter_prediction_samples(decoder_context* ctx,
     logtrace(LogTransform,"MC-y-%d-%d ",xP,yP+y);
 
     for (int x=0;x<nPbW;x++) {
-      logtrace(LogTransform,"*%02x ", img->y[xP+x+(yP+y)*img->stride]);
+      logtrace(LogTransform,"*%02x ", pixels[0][x+y*stride[0]]);
     }
 
     logtrace(LogTransform,"*\n");
-  }  
+  }
 
 
   logtrace(LogTransform,"MC pixels (chroma cb), position %d %d:\n", xP/2,yP/2);
@@ -603,11 +634,11 @@ void generate_inter_prediction_samples(decoder_context* ctx,
     logtrace(LogTransform,"MC-cb-%d-%d ",xP/2,yP/2+y);
 
     for (int x=0;x<nPbW/2;x++) {
-      logtrace(LogTransform,"*%02x ", img->cb[xP/2+x+(yP/2+y)*img->chroma_stride]);
+      logtrace(LogTransform,"*%02x ", pixels[1][x+y*stride[1]]);
     }
 
     logtrace(LogTransform,"*\n");
-  }  
+  }
 
 
   logtrace(LogTransform,"MC pixels (chroma cr), position %d %d:\n", xP/2,yP/2);
@@ -616,17 +647,17 @@ void generate_inter_prediction_samples(decoder_context* ctx,
     logtrace(LogTransform,"MC-cr-%d-%d ",xP/2,yP/2+y);
 
     for (int x=0;x<nPbW/2;x++) {
-      logtrace(LogTransform,"*%02x ", img->cr[xP/2+x+(yP/2+y)*img->chroma_stride]);
+      logtrace(LogTransform,"*%02x ", pixels[2][x+y*stride[2]]);
     }
 
     logtrace(LogTransform,"*\n");
-  }  
+  }
 #endif
 }
 
 
 #ifdef DE265_LOG_TRACE
-void logmvcand(PredVectorInfo p)
+void logmvcand(const MotionVectorSpec& p)
 {
   for (int v=0;v<2;v++) {
     if (p.predFlag[v]) {
@@ -642,7 +673,7 @@ void logmvcand(PredVectorInfo p)
 #endif
 
 
-LIBDE265_INLINE static bool equal_cand_MV(const PredVectorInfo* a, const PredVectorInfo* b)
+LIBDE265_INLINE static bool equal_cand_MV(const MotionVectorSpec* a, const MotionVectorSpec* b)
 {
   // TODO: is this really correct? no check for predFlag? Standard says so... (p.127)
 
@@ -668,7 +699,7 @@ LIBDE265_INLINE static bool equal_cand_MV(const PredVectorInfo* a, const PredVec
      |                   |
      |                   |
      |                   |
-     |                   |
+     |        PB         |
      |                   |
      |                   |
   +--+                   |
@@ -681,32 +712,68 @@ LIBDE265_INLINE static bool equal_cand_MV(const PredVectorInfo* a, const PredVec
 
 // 8.5.3.1.2
 // TODO: check: can we fill the candidate list directly in this function and omit to copy later
-void derive_spatial_merging_candidates(const de265_image* img,
-                                       int xC, int yC, int nCS, int xP, int yP,
-                                       uint8_t singleMCLFlag,
-                                       int nPbW, int nPbH,
-                                       int partIdx,
-                                       MergingCandidates* out_cand)
+/*
+  xC/yC:  CB position
+  nCS:    CB size                 (possibly modified because of singleMCLFlag)
+  xP/yP:  PB position (absolute)  (possibly modified because of singleMCLFlag)
+  singleMCLFlag
+  nPbW/nPbH: PB size
+  partIdx
+  out_cand: merging candidate vectors
+
+  Add these candidates:
+  - A1
+  - B1  (if != A1)
+  - B0  (if != B1)
+  - A0  (if != A1)
+  - B2  (if != A1 and != B1)
+
+  A maximum of 4 candidates are generated.
+
+  Note 1: For a CB split into two PBs, it does not make sense to merge the
+  second part to the parameters of the first part, since then, we could use 2Nx2N
+  right away. -> Exclude this candidate.
+*/
+int derive_spatial_merging_candidates(const de265_image* img,
+                                      int xC, int yC, int nCS, int xP, int yP,
+                                      uint8_t singleMCLFlag,
+                                      int nPbW, int nPbH,
+                                      int partIdx,
+                                      MotionVectorSpec* out_cand,
+                                      int maxCandidates)
 {
   const pic_parameter_set* pps = &img->pps;
-  int log2_parallel_merge_level = pps->log2_parallel_merge_level;
+  const int log2_parallel_merge_level = pps->log2_parallel_merge_level;
 
   enum PartMode PartMode = img->get_PartMode(xC,yC);
 
+  /*
+  const int A0 = SpatialMergingCandidates::PRED_A0;
+  const int A1 = SpatialMergingCandidates::PRED_A1;
+  const int B0 = SpatialMergingCandidates::PRED_B0;
+  const int B1 = SpatialMergingCandidates::PRED_B1;
+  const int B2 = SpatialMergingCandidates::PRED_B2;
+  */
+
   // --- A1 ---
 
-  // a pixel within A1
+  // a pixel within A1 (bottom right of A1)
   int xA1 = xP-1;
   int yA1 = yP+nPbH-1;
 
   bool availableA1;
+  int idxA1;
 
+  int computed_candidates = 0;
+
+  // check if candidate is in same motion-estimation region (MER) -> discard
   if ((xP>>log2_parallel_merge_level) == (xA1>>log2_parallel_merge_level) &&
       (yP>>log2_parallel_merge_level) == (yA1>>log2_parallel_merge_level)) {
     availableA1 = false;
     logtrace(LogMotion,"spatial merging candidate A1: below parallel merge level\n");
   }
-  else if (!singleMCLFlag &&
+  // redundant candidate? (Note 1) -> discard
+  else if (// !singleMCLFlag &&    automatically true when partIdx==1
            partIdx==1 &&
            (PartMode==PART_Nx2N ||
             PartMode==PART_nLx2N ||
@@ -714,23 +781,22 @@ void derive_spatial_merging_candidates(const de265_image* img,
     availableA1 = false;
     logtrace(LogMotion,"spatial merging candidate A1: second part ignore\n");
   }
+  // MV available in A1
   else {
     availableA1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xA1,yA1);
     if (!availableA1) logtrace(LogMotion,"spatial merging candidate A1: unavailable\n");
   }
 
-  if (!availableA1) {
-    out_cand->available[PRED_A1] = 0;
-    reset_pred_vector(&out_cand->pred_vector[PRED_A1]);
-  }
-  else {
-    out_cand->available[PRED_A1] = 1;
-    out_cand->pred_vector[PRED_A1] = *img->get_mv_info(xA1,yA1);
+  if (availableA1) {
+    idxA1 = computed_candidates++;
+    out_cand[idxA1] = *img->get_mv_info(xA1,yA1);
 
     logtrace(LogMotion,"spatial merging candidate A1:\n");
-    logmvcand(out_cand->pred_vector[PRED_A1]);
+    logmvcand(out_cand[idxA1]);
   }
 
+  if (computed_candidates>=maxCandidates) return computed_candidates;
+
 
   // --- B1 ---
 
@@ -738,13 +804,16 @@ void derive_spatial_merging_candidates(const de265_image* img,
   int yB1 = yP-1;
 
   bool availableB1;
+  int idxB1;
 
+  // same MER -> discard
   if ((xP>>log2_parallel_merge_level) == (xB1>>log2_parallel_merge_level) &&
       (yP>>log2_parallel_merge_level) == (yB1>>log2_parallel_merge_level)) {
     availableB1 = false;
     logtrace(LogMotion,"spatial merging candidate B1: below parallel merge level\n");
   }
-  else if (!singleMCLFlag &&
+  // redundant candidate (Note 1) -> discard
+  else if (// !singleMCLFlag &&    automatically true when partIdx==1
            partIdx==1 &&
            (PartMode==PART_2NxN ||
             PartMode==PART_2NxnU ||
@@ -752,31 +821,32 @@ void derive_spatial_merging_candidates(const de265_image* img,
     availableB1 = false;
     logtrace(LogMotion,"spatial merging candidate B1: second part ignore\n");
   }
+  // MV available in B1
   else {
     availableB1 = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB1,yB1);
     if (!availableB1) logtrace(LogMotion,"spatial merging candidate B1: unavailable\n");
   }
 
-  if (!availableB1) {
-    out_cand->available[PRED_B1] = 0;
-    reset_pred_vector(&out_cand->pred_vector[PRED_B1]);
-  }
-  else {
-    out_cand->available[PRED_B1] = 1;
-    out_cand->pred_vector[PRED_B1] = *img->get_mv_info(xB1,yB1);
+  if (availableB1) {
+    const MotionVectorSpec* b1 = img->get_mv_info(xB1,yB1);
 
+    // B1 == A1 -> discard B1
     if (availableA1 &&
-        equal_cand_MV(&out_cand->pred_vector[PRED_A1],
-                      &out_cand->pred_vector[PRED_B1])) {
-      out_cand->available[PRED_B1] = 0;
+        equal_cand_MV(&out_cand[idxA1], b1)) {
+      idxB1 = idxA1;
       logtrace(LogMotion,"spatial merging candidate B1: redundant to A1\n");
     }
     else {
+      idxB1 = computed_candidates++;
+      out_cand[idxB1] = *b1;
+
       logtrace(LogMotion,"spatial merging candidate B1:\n");
-      logmvcand(out_cand->pred_vector[PRED_B1]);
+      logmvcand(out_cand[idxB1]);
     }
   }
 
+  if (computed_candidates>=maxCandidates) return computed_candidates;
+
 
   // --- B0 ---
 
@@ -784,6 +854,7 @@ void derive_spatial_merging_candidates(const de265_image* img,
   int yB0 = yP-1;
 
   bool availableB0;
+  int idxB0;
 
   if ((xP>>log2_parallel_merge_level) == (xB0>>log2_parallel_merge_level) &&
       (yP>>log2_parallel_merge_level) == (yB0>>log2_parallel_merge_level)) {
@@ -795,26 +866,25 @@ void derive_spatial_merging_candidates(const de265_image* img,
     if (!availableB0) logtrace(LogMotion,"spatial merging candidate B0: unavailable\n");
   }
 
-  if (!availableB0) {
-    out_cand->available[PRED_B0] = 0;
-    reset_pred_vector(&out_cand->pred_vector[PRED_B0]);
-  }
-  else {
-    out_cand->available[PRED_B0] = 1;
-    out_cand->pred_vector[PRED_B0] = *img->get_mv_info(xB0,yB0);
+  if (availableB0) {
+    const MotionVectorSpec* b0 = img->get_mv_info(xB0,yB0);
 
+    // B0 == B1 -> discard B0
     if (availableB1 &&
-        equal_cand_MV(&out_cand->pred_vector[PRED_B1],
-                      &out_cand->pred_vector[PRED_B0])) {
-      out_cand->available[PRED_B0] = 0;
+        equal_cand_MV(&out_cand[idxB1], b0)) {
+      idxB0 = idxB1;
       logtrace(LogMotion,"spatial merging candidate B0: redundant to B1\n");
     }
     else {
+      idxB0 = computed_candidates++;
+      out_cand[idxB0] = *b0;
       logtrace(LogMotion,"spatial merging candidate B0:\n");
-      logmvcand(out_cand->pred_vector[PRED_B0]);
+      logmvcand(out_cand[idxB0]);
     }
   }
 
+  if (computed_candidates>=maxCandidates) return computed_candidates;
+
 
   // --- A0 ---
 
@@ -822,6 +892,7 @@ void derive_spatial_merging_candidates(const de265_image* img,
   int yA0 = yP+nPbH;
 
   bool availableA0;
+  int idxA0;
 
   if ((xP>>log2_parallel_merge_level) == (xA0>>log2_parallel_merge_level) &&
       (yP>>log2_parallel_merge_level) == (yA0>>log2_parallel_merge_level)) {
@@ -833,26 +904,25 @@ void derive_spatial_merging_candidates(const de265_image* img,
     if (!availableA0) logtrace(LogMotion,"spatial merging candidate A0: unavailable\n");
   }
 
-  if (!availableA0) {
-    out_cand->available[PRED_A0] = 0;
-    reset_pred_vector(&out_cand->pred_vector[PRED_A0]);
-  }
-  else {
-    out_cand->available[PRED_A0] = 1;
-    out_cand->pred_vector[PRED_A0] = *img->get_mv_info(xA0,yA0);
+  if (availableA0) {
+    const MotionVectorSpec* a0 = img->get_mv_info(xA0,yA0);
 
+    // A0 == A1 -> discard A0
     if (availableA1 &&
-        equal_cand_MV(&out_cand->pred_vector[PRED_A1],
-                      &out_cand->pred_vector[PRED_A0])) {
-      out_cand->available[PRED_A0] = 0;
+        equal_cand_MV(&out_cand[idxA1], a0)) {
+      idxA0 = idxA1;
       logtrace(LogMotion,"spatial merging candidate A0: redundant to A1\n");
     }
     else {
+      idxA0 = computed_candidates++;
+      out_cand[idxA0] = *a0;
       logtrace(LogMotion,"spatial merging candidate A0:\n");
-      logmvcand(out_cand->pred_vector[PRED_A0]);
+      logmvcand(out_cand[idxA0]);
     }
   }
 
+  if (computed_candidates>=maxCandidates) return computed_candidates;
+
 
   // --- B2 ---
 
@@ -860,9 +930,10 @@ void derive_spatial_merging_candidates(const de265_image* img,
   int yB2 = yP-1;
 
   bool availableB2;
+  int idxB2;
 
-  if (out_cand->available[PRED_A0] && out_cand->available[PRED_A1] &&
-      out_cand->available[PRED_B0] && out_cand->available[PRED_B1]) {
+  // if we already have four candidates, do not consider B2 anymore
+  if (computed_candidates==4) {
     availableB2 = false;
     logtrace(LogMotion,"spatial merging candidate B2: ignore\n");
   }
@@ -876,38 +947,38 @@ void derive_spatial_merging_candidates(const de265_image* img,
     if (!availableB2) logtrace(LogMotion,"spatial merging candidate B2: unavailable\n");
   }
 
-  if (!availableB2) {
-    out_cand->available[PRED_B2] = 0;
-    reset_pred_vector(&out_cand->pred_vector[PRED_B2]);
-  }
-  else {
-    out_cand->available[PRED_B2] = 1;
-    out_cand->pred_vector[PRED_B2] = *img->get_mv_info(xB2,yB2);
+  if (availableB2) {
+    const MotionVectorSpec* b2 = img->get_mv_info(xB2,yB2);
 
+    // B2 == B1 -> discard B2
     if (availableB1 &&
-        equal_cand_MV(&out_cand->pred_vector[PRED_B1],
-                      &out_cand->pred_vector[PRED_B2])) {
-      out_cand->available[PRED_B2] = 0;
+        equal_cand_MV(&out_cand[idxB1], b2)) {
+      idxB2 = idxB1;
       logtrace(LogMotion,"spatial merging candidate B2: redundant to B1\n");
     }
+    // B2 == A1 -> discard B2
     else if (availableA1 &&
-             equal_cand_MV(&out_cand->pred_vector[PRED_A1],
-                           &out_cand->pred_vector[PRED_B2])) {
-      out_cand->available[PRED_B2] = 0;
+             equal_cand_MV(&out_cand[idxA1], b2)) {
+      idxB2 = idxA1;
       logtrace(LogMotion,"spatial merging candidate B2: redundant to A1\n");
     }
     else {
-      logtrace(LogMotion,"spatial merging candidate B0:\n");
-      logmvcand(out_cand->pred_vector[PRED_B0]);
+      idxB2 = computed_candidates++;
+      out_cand[idxB2] = *b2;
+      logtrace(LogMotion,"spatial merging candidate B2:\n");
+      logmvcand(out_cand[idxB2]);
     }
   }
+
+  return computed_candidates;
 }
 
 
 // 8.5.3.1.4
-void derive_zero_motion_vector_candidates(slice_segment_header* shdr,
-                                          PredVectorInfo* inout_mergeCandList,
-                                          int* inout_numCurrMergeCand)
+void derive_zero_motion_vector_candidates(const slice_segment_header* shdr,
+                                          MotionVectorSpec* out_mergeCandList,
+                                          int* inout_numCurrMergeCand,
+                                          int maxCandidates)
 {
   logtrace(LogMotion,"derive_zero_motion_vector_candidates\n");
 
@@ -925,22 +996,24 @@ void derive_zero_motion_vector_candidates(slice_segment_header* shdr,
   //int numInputMergeCand = *inout_numMergeCand;
   int zeroIdx = 0;
 
-  while (*inout_numCurrMergeCand < shdr->MaxNumMergeCand) {
+  while (*inout_numCurrMergeCand < maxCandidates) {
     // 1.
 
     logtrace(LogMotion,"zeroIdx:%d numRefIdx:%d\n", zeroIdx, numRefIdx);
 
-    PredVectorInfo* newCand = &inout_mergeCandList[*inout_numCurrMergeCand];
+    MotionVectorSpec* newCand = &out_mergeCandList[*inout_numCurrMergeCand];
+
+    const int refIdx = (zeroIdx < numRefIdx) ? zeroIdx : 0;
 
     if (shdr->slice_type==SLICE_TYPE_P) {
-      newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
+      newCand->refIdx[0] = refIdx;
       newCand->refIdx[1] = -1;
       newCand->predFlag[0] = 1;
       newCand->predFlag[1] = 0;
     }
     else {
-      newCand->refIdx[0] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
-      newCand->refIdx[1] = (zeroIdx < numRefIdx) ? zeroIdx : 0;
+      newCand->refIdx[0] = refIdx;
+      newCand->refIdx[1] = refIdx;
       newCand->predFlag[0] = 1;
       newCand->predFlag[1] = 1;
     }
@@ -951,7 +1024,7 @@ void derive_zero_motion_vector_candidates(slice_segment_header* shdr,
     newCand->mv[1].y = 0;
 
     (*inout_numCurrMergeCand)++;
-      
+
     // 2.
 
     zeroIdx++;
@@ -982,149 +1055,194 @@ bool scale_mv(MotionVector* out_mv, MotionVector mv, int colDist, int currDist)
 
 // (L1003) 8.5.3.2.8
 
-void derive_collocated_motion_vectors(decoder_context* ctx,
+void derive_collocated_motion_vectors(base_context* ctx,
                                       de265_image* img,
                                       const slice_segment_header* shdr,
                                       int xP,int yP,
                                       int colPic,
                                       int xColPb,int yColPb,
-                                      int refIdxLX, int X,
+                                      int refIdxLX,  // (always 0 for merge mode)
+                                      int X,
                                       MotionVector* out_mvLXCol,
                                       uint8_t* out_availableFlagLXCol)
 {
   logtrace(LogMotion,"derive_collocated_motion_vectors %d;%d\n",xP,yP);
 
+
+  // get collocated image and the prediction mode at the collocated position
+
   assert(ctx->has_image(colPic));
   const de265_image* colImg = ctx->get_image(colPic);
+
+  // check for access outside image area
+
+  if (xColPb >= colImg->get_width() ||
+      yColPb >= colImg->get_height()) {
+    ctx->add_warning(DE265_WARNING_COLLOCATED_MOTION_VECTOR_OUTSIDE_IMAGE_AREA, false);
+    *out_availableFlagLXCol = 0;
+    return;
+  }
+
   enum PredMode predMode = colImg->get_pred_mode(xColPb,yColPb);
 
+
+  // collocated block is Intra -> no collocated MV
+
   if (predMode == MODE_INTRA) {
     out_mvLXCol->x = 0;
     out_mvLXCol->y = 0;
     *out_availableFlagLXCol = 0;
     return;
   }
-  else {
-    logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n",
-             colPic,
-             colImg->PicOrderCntVal,
-             X,refIdxLX,shdr->RefPicList[X][refIdxLX]);
 
-    if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) {
-      out_mvLXCol->x = 0;
-      out_mvLXCol->y = 0;
-      *out_availableFlagLXCol = 0;
-      return;
-    }
 
-    const PredVectorInfo* mvi = colImg->get_mv_info(xColPb,yColPb);
-    int listCol;
-    int refIdxCol;
-    MotionVector mvCol;
+  logtrace(LogMotion,"colPic:%d (POC=%d) X:%d refIdxLX:%d refpiclist:%d\n",
+           colPic,
+           colImg->PicOrderCntVal,
+           X,refIdxLX,shdr->RefPicList[X][refIdxLX]);
 
-    logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb);
-    logmvcand(*mvi);
 
-    if (mvi->predFlag[0]==0) {
-      mvCol = mvi->mv[1];
-      refIdxCol = mvi->refIdx[1];
-      listCol = 1;
-    }
-    else {
-      if (mvi->predFlag[1]==0) {
-        mvCol = mvi->mv[0];
-        refIdxCol = mvi->refIdx[0];
-        listCol = 0;
-      }
-      else {
-        int AllDiffPicOrderCntLEZero = true;
+  // collocated reference image is unavailable -> no collocated MV
+
+  if (colImg->integrity == INTEGRITY_UNAVAILABLE_REFERENCE) {
+    out_mvLXCol->x = 0;
+    out_mvLXCol->y = 0;
+    *out_availableFlagLXCol = 0;
+    return;
+  }
 
-        const int PicOrderCntVal = img->PicOrderCntVal;
 
-        for (int rIdx=0; rIdx<shdr->num_ref_idx_l0_active && AllDiffPicOrderCntLEZero; rIdx++)
-          {
-            const de265_image* imgA = ctx->get_image(shdr->RefPicList[0][rIdx]);
-            int aPOC = imgA->PicOrderCntVal;
+  // get the collocated MV
 
-            if (aPOC > PicOrderCntVal) {
-              AllDiffPicOrderCntLEZero = false;
-            }
-          }
+  const MotionVectorSpec* mvi = colImg->get_mv_info(xColPb,yColPb);
+  int listCol;
+  int refIdxCol;
+  MotionVector mvCol;
 
-        for (int rIdx=0; rIdx<shdr->num_ref_idx_l1_active && AllDiffPicOrderCntLEZero; rIdx++)
-          {
-            const de265_image* imgA = ctx->get_image(shdr->RefPicList[1][rIdx]);
-            int aPOC = imgA->PicOrderCntVal;
+  logtrace(LogMotion,"read MVI %d;%d:\n",xColPb,yColPb);
+  logmvcand(*mvi);
 
-            if (aPOC > PicOrderCntVal) {
-              AllDiffPicOrderCntLEZero = false;
-            }
-          }
 
-        if (AllDiffPicOrderCntLEZero) {
-          mvCol = mvi->mv[X];
-          refIdxCol = mvi->refIdx[X];
-          listCol = X;
+  // collocated MV uses only L1 -> use L1
+  if (mvi->predFlag[0]==0) {
+    mvCol = mvi->mv[1];
+    refIdxCol = mvi->refIdx[1];
+    listCol = 1;
+  }
+  // collocated MV uses only L0 -> use L0
+  else if (mvi->predFlag[1]==0) {
+    mvCol = mvi->mv[0];
+    refIdxCol = mvi->refIdx[0];
+    listCol = 0;
+  }
+  // collocated MV uses L0 and L1
+  else {
+    bool allRefFramesBeforeCurrentFrame = true;
+
+    const int currentPOC = img->PicOrderCntVal;
+
+    // all reference POCs earlier than current POC (list 1)
+    // Test L1 first, because there is a higher chance to find a future reference frame.
+
+    for (int rIdx=0; rIdx<shdr->num_ref_idx_l1_active && allRefFramesBeforeCurrentFrame; rIdx++)
+      {
+        const de265_image* refimg = ctx->get_image(shdr->RefPicList[1][rIdx]);
+        int refPOC = refimg->PicOrderCntVal;
+
+        if (refPOC > currentPOC) {
+          allRefFramesBeforeCurrentFrame = false;
         }
-        else {
-          int N = shdr->collocated_from_l0_flag;
-          mvCol = mvi->mv[N];
-          refIdxCol = mvi->refIdx[N];
-          listCol = N;
+      }
+
+    // all reference POCs earlier than current POC (list 0)
+
+    for (int rIdx=0; rIdx<shdr->num_ref_idx_l0_active && allRefFramesBeforeCurrentFrame; rIdx++)
+      {
+        const de265_image* refimg = ctx->get_image(shdr->RefPicList[0][rIdx]);
+        int refPOC = refimg->PicOrderCntVal;
+
+        if (refPOC > currentPOC) {
+          allRefFramesBeforeCurrentFrame = false;
         }
       }
-    }
 
 
+    /* TODO: What is the rationale behind this ???
 
-    const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ];
+       My guess:
+       when there are images before the current frame (most probably in L0) and images after
+       the current frame (most probably in L1), we take the reference in the opposite
+       direction than where the collocated frame is positioned in the hope that the distance
+       to the current frame will be smaller and thus give a better prediction.
 
-    if (shdr->LongTermRefPic[X][refIdxLX] != 
-        colShdr->LongTermRefPic[listCol][refIdxCol]) {
-      *out_availableFlagLXCol = 0;
-      out_mvLXCol->x = 0;
-      out_mvLXCol->y = 0;
+       If all references point into the past, we cannot say much about the temporal order of
+       L0,L1 and thus take over both parts.
+     */
+
+    if (allRefFramesBeforeCurrentFrame) {
+      mvCol = mvi->mv[X];
+      refIdxCol = mvi->refIdx[X];
+      listCol = X;
     }
     else {
-      *out_availableFlagLXCol = 1;
+      int N = shdr->collocated_from_l0_flag;
+      mvCol = mvi->mv[N];
+      refIdxCol = mvi->refIdx[N];
+      listCol = N;
+    }
+  }
 
-      const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX];
 
-      int colDist  = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol];
-      int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX];
 
-      logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist,
-               colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol],
-               img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX]
-               );
+  const slice_segment_header* colShdr = colImg->slices[ colImg->get_SliceHeaderIndex(xColPb,yColPb) ];
 
-      if (isLongTerm || colDist == currDist) {
-        *out_mvLXCol = mvCol;
-      }
-      else {
-        if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) {
-          ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
-          img->integrity = INTEGRITY_DECODING_ERRORS;
-        }
+  if (shdr->LongTermRefPic[X][refIdxLX] !=
+      colShdr->LongTermRefPic[listCol][refIdxCol]) {
+    *out_availableFlagLXCol = 0;
+    out_mvLXCol->x = 0;
+    out_mvLXCol->y = 0;
+  }
+  else {
+    *out_availableFlagLXCol = 1;
+
+    const bool isLongTerm = shdr->LongTermRefPic[X][refIdxLX];
 
-        logtrace(LogMotion,"scale: %d;%d to %d;%d\n",
-                 mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y);
+    int colDist  = colImg->PicOrderCntVal - colShdr->RefPicList_POC[listCol][refIdxCol];
+    int currDist = img->PicOrderCntVal - shdr->RefPicList_POC[X][refIdxLX];
+
+    logtrace(LogMotion,"COLPOCDIFF %d %d [%d %d / %d %d]\n",colDist, currDist,
+             colImg->PicOrderCntVal, colShdr->RefPicList_POC[listCol][refIdxCol],
+             img->PicOrderCntVal, shdr->RefPicList_POC[X][refIdxLX]
+             );
+
+    if (isLongTerm || colDist == currDist) {
+      *out_mvLXCol = mvCol;
+    }
+    else {
+      if (!scale_mv(out_mvLXCol, mvCol, colDist, currDist)) {
+        ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
+        img->integrity = INTEGRITY_DECODING_ERRORS;
       }
+
+      logtrace(LogMotion,"scale: %d;%d to %d;%d\n",
+               mvCol.x,mvCol.y, out_mvLXCol->x,out_mvLXCol->y);
     }
   }
 }
 
 
 // 8.5.3.1.7
-void derive_temporal_luma_vector_prediction(decoder_context* ctx,
+void derive_temporal_luma_vector_prediction(base_context* ctx,
                                             de265_image* img,
                                             const slice_segment_header* shdr,
                                             int xP,int yP,
                                             int nPbW,int nPbH,
-                                            int refIdxL, int X,
+                                            int refIdxL,
+                                            int X, // which MV (L0/L1) to get
                                             MotionVector* out_mvLXCol,
                                             uint8_t*      out_availableFlagLXCol)
 {
+  // --- no temporal MVP -> exit ---
 
   if (shdr->slice_temporal_mvp_enabled_flag == 0) {
     out_mvLXCol->x = 0;
@@ -1133,6 +1251,9 @@ void derive_temporal_luma_vector_prediction(decoder_context* ctx,
     return;
   }
 
+
+  // --- find collocated reference image ---
+
   int Log2CtbSizeY = img->sps.Log2CtbSizeY;
 
   int colPic; // TODO: this is the same for the whole slice. We can precompute it.
@@ -1142,19 +1263,17 @@ void derive_temporal_luma_vector_prediction(decoder_context* ctx,
     {
       logtrace(LogMotion,"collocated L1 ref_idx=%d\n",shdr->collocated_ref_idx);
 
-      // TODO: make sure that shdr->collocated_ref_idx is a valid index
       colPic = shdr->RefPicList[1][ shdr->collocated_ref_idx ];
     }
   else
     {
       logtrace(LogMotion,"collocated L0 ref_idx=%d\n",shdr->collocated_ref_idx);
 
-      // TODO: make sure that shdr->collocated_ref_idx is a valid index
       colPic = shdr->RefPicList[0][ shdr->collocated_ref_idx ];
     }
 
-  //logtrace(LogMotion,"collocated reference POC=%d\n",ctx->dpb[colPic].PicOrderCntVal);
 
+  // check whether collocated reference picture exists
 
   if (!ctx->has_image(colPic)) {
     out_mvLXCol->x = 0;
@@ -1166,10 +1285,18 @@ void derive_temporal_luma_vector_prediction(decoder_context* ctx,
   }
 
 
+  // --- get collocated MV either at bottom-right corner or from center of PB ---
+
   int xColPb,yColPb;
   int yColBr = yP + nPbH; // bottom right collocated motion vector position
   int xColBr = xP + nPbW;
 
+  /* If neighboring pixel at bottom-right corner is in the same CTB-row and inside the image,
+     use this (reduced down to 16 pixels resolution) as collocated MV position.
+
+     Note: see 2014, Sze, Sect. 5.2.1.2 why candidate C0 is excluded when on another CTB-row.
+     This is to reduce the memory bandwidth requirements.
+   */
   if ((yP>>Log2CtbSizeY) == (yColBr>>Log2CtbSizeY) &&
       xColBr < img->sps.pic_width_in_luma_samples &&
       yColBr < img->sps.pic_height_in_luma_samples)
@@ -1208,13 +1335,18 @@ static int table_8_19[2][12] = {
   };
 
 // 8.5.3.1.3
-void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
-                                                     slice_segment_header* shdr,
-                                                     PredVectorInfo* inout_mergeCandList,
+/* Note (TODO): during decoding, we know which of the candidates we will select.
++   Hence, we do not really have to generate the other ones...
++ */
+void derive_combined_bipredictive_merging_candidates(const base_context* ctx,
+                                                     const slice_segment_header* shdr,
+                                                     MotionVectorSpec* inout_mergeCandList,
                                                      int* inout_numMergeCand,
-                                                     int numOrigMergeCand)
+                                                     int maxCandidates)
 {
-  if (*inout_numMergeCand>1 && *inout_numMergeCand < shdr->MaxNumMergeCand) {
+  if (*inout_numMergeCand>1 && *inout_numMergeCand < maxCandidates) {
+    int numOrigMergeCand = *inout_numMergeCand;
+
     int numInputMergeCand = *inout_numMergeCand;
     int combIdx = 0;
     uint8_t combStop = false;
@@ -1228,8 +1360,8 @@ void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
         assert(false); // bitstream error -> TODO: conceal error
       }
 
-      PredVectorInfo* l0Cand = &inout_mergeCandList[l0CandIdx];
-      PredVectorInfo* l1Cand = &inout_mergeCandList[l1CandIdx];
+      MotionVectorSpec* l0Cand = &inout_mergeCandList[l0CandIdx];
+      MotionVectorSpec* l1Cand = &inout_mergeCandList[l1CandIdx];
 
       logtrace(LogMotion,"add bipredictive merging candidate (combIdx:%d)\n",combIdx);
       logtrace(LogMotion,"l0Cand:\n"); logmvcand(*l0Cand);
@@ -1238,11 +1370,19 @@ void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
       const de265_image* img0 = l0Cand->predFlag[0] ? ctx->get_image(shdr->RefPicList[0][l0Cand->refIdx[0]]) : NULL;
       const de265_image* img1 = l1Cand->predFlag[1] ? ctx->get_image(shdr->RefPicList[1][l1Cand->refIdx[1]]) : NULL;
 
+      if (l0Cand->predFlag[0] && !img0) {
+        return; // TODO error
+      }
+
+      if (l1Cand->predFlag[1] && !img1) {
+        return; // TODO error
+      }
+
       if (l0Cand->predFlag[0] && l1Cand->predFlag[1] &&
           (img0->PicOrderCntVal != img1->PicOrderCntVal     ||
            l0Cand->mv[0].x != l1Cand->mv[1].x ||
            l0Cand->mv[0].y != l1Cand->mv[1].y)) {
-        PredVectorInfo* p = &inout_mergeCandList[ *inout_numMergeCand ];
+        MotionVectorSpec* p = &inout_mergeCandList[ *inout_numMergeCand ];
         p->refIdx[0] = l0Cand->refIdx[0];
         p->refIdx[1] = l1Cand->refIdx[1];
         p->predFlag[0] = l0Cand->predFlag[0];
@@ -1257,7 +1397,7 @@ void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
 
       combIdx++;
       if (combIdx == numOrigMergeCand*(numOrigMergeCand-1) ||
-          *inout_numMergeCand == shdr->MaxNumMergeCand) {
+          *inout_numMergeCand == maxCandidates) {
         combStop = true;
       }
     }
@@ -1266,21 +1406,30 @@ void derive_combined_bipredictive_merging_candidates(const decoder_context* ctx,
 
 
 // 8.5.3.1.1
-void derive_luma_motion_merge_mode(decoder_context* ctx,
-                                   thread_context* tctx,
-                                   int xC,int yC, int xP,int yP,
-                                   int nCS, int nPbW,int nPbH, int partIdx,
-                                   VectorInfo* out_vi)
+static void get_merge_candidate_list_without_step_9(base_context* ctx,
+                                                    const slice_segment_header* shdr,
+                                                    de265_image* img,
+                                                    int xC,int yC, int xP,int yP,
+                                                    int nCS, int nPbW,int nPbH, int partIdx,
+                                                    int max_merge_idx,
+                                                    MotionVectorSpec* mergeCandList)
 {
-  slice_segment_header* shdr = tctx->shdr;
 
   //int xOrigP = xP;
   //int yOrigP = yP;
   int nOrigPbW = nPbW;
   int nOrigPbH = nPbH;
 
-  int singleMCLFlag;
-  singleMCLFlag = (tctx->img->pps.log2_parallel_merge_level > 2 && nCS==8);
+  int singleMCLFlag; // single merge-candidate-list (MCL) flag
+
+  /* Use single MCL for CBs of size 8x8, except when parallel-merge-level is at 4x4.
+     Without this flag, PBs smaller than 8x8 would not receive as many merging candidates.
+     Having additional candidates might have these advantages:
+     - coding MVs for these small PBs is expensive, and
+     - since the PBs are not far away from a proper (neighboring) merging candidate,
+       the quality of the candidates will still be good.
+   */
+  singleMCLFlag = (img->pps.log2_parallel_merge_level > 2 && nCS==8);
 
   if (singleMCLFlag) {
     xP=xC;
@@ -1290,97 +1439,130 @@ void derive_luma_motion_merge_mode(decoder_context* ctx,
     partIdx=0;
   }
 
-  MergingCandidates mergeCand;
-  derive_spatial_merging_candidates(tctx->img, xC,yC, nCS, xP,yP, singleMCLFlag,
-                                    nPbW,nPbH,partIdx, &mergeCand);
+  int maxCandidates = max_merge_idx+1;
+  //MotionVectorSpec mergeCandList[5];
+  int numMergeCand=0;
 
-  int refIdxCol[2] = { 0,0 };
+  // --- spatial merge candidates
 
-  MotionVector mvCol[2];
-  uint8_t predFlagLCol[2];
-  derive_temporal_luma_vector_prediction(ctx,tctx->img,shdr, xP,yP,nPbW,nPbH,
-                                         refIdxCol[0],0, &mvCol[0],
-                                         &predFlagLCol[0]);
+  numMergeCand = derive_spatial_merging_candidates(img, xC,yC, nCS, xP,yP, singleMCLFlag,
+                                                   nPbW,nPbH,partIdx, mergeCandList,
+                                                   maxCandidates);
 
-  uint8_t availableFlagCol = predFlagLCol[0];
-  predFlagLCol[1] = 0;
+  // --- collocated merge candidate
+  if (numMergeCand < maxCandidates) {
+    int refIdxCol[2] = { 0,0 };
 
-  if (shdr->slice_type == SLICE_TYPE_B) {
-    derive_temporal_luma_vector_prediction(ctx,tctx->img,shdr,
-                                           xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1],
-                                           &predFlagLCol[1]);
-    availableFlagCol |= predFlagLCol[1];
-  }
-
-
-  // 4.
+    MotionVector mvCol[2];
+    uint8_t predFlagLCol[2];
+    derive_temporal_luma_vector_prediction(ctx,img,shdr, xP,yP,nPbW,nPbH,
+                                           refIdxCol[0],0, &mvCol[0],
+                                           &predFlagLCol[0]);
 
-  PredVectorInfo mergeCandList[5];
-  int numMergeCand=0;
+    uint8_t availableFlagCol = predFlagLCol[0];
+    predFlagLCol[1] = 0;
 
-  for (int i=0;i<5;i++) {
-    if (mergeCand.available[i]) {
-      mergeCandList[numMergeCand++] = mergeCand.pred_vector[i];
+    if (shdr->slice_type == SLICE_TYPE_B) {
+      derive_temporal_luma_vector_prediction(ctx,img,shdr,
+                                             xP,yP,nPbW,nPbH, refIdxCol[1],1, &mvCol[1],
+                                             &predFlagLCol[1]);
+      availableFlagCol |= predFlagLCol[1];
     }
-  }
 
-  if (availableFlagCol) {
-    // TODO: save in mergeCand directly...
-    mergeCand.available[PRED_COL] = availableFlagCol;
-    mergeCand.pred_vector[PRED_COL].mv[0] = mvCol[0];
-    mergeCand.pred_vector[PRED_COL].mv[1] = mvCol[1];
-    mergeCand.pred_vector[PRED_COL].predFlag[0] = predFlagLCol[0];
-    mergeCand.pred_vector[PRED_COL].predFlag[1] = predFlagLCol[1];
-    mergeCand.pred_vector[PRED_COL].refIdx[0] = refIdxCol[0];
-    mergeCand.pred_vector[PRED_COL].refIdx[1] = refIdxCol[1];
-
-    mergeCandList[numMergeCand++] = mergeCand.pred_vector[PRED_COL];
-  }
 
-  // 5.
+    if (availableFlagCol) {
+      MotionVectorSpec* colVec = &mergeCandList[numMergeCand++];
 
-  //int numOrigMergeCand = numMergeCand;
+      colVec->mv[0] = mvCol[0];
+      colVec->mv[1] = mvCol[1];
+      colVec->predFlag[0] = predFlagLCol[0];
+      colVec->predFlag[1] = predFlagLCol[1];
+      colVec->refIdx[0] = refIdxCol[0];
+      colVec->refIdx[1] = refIdxCol[1];
+    }
+  }
 
-  // 6.
 
-  //int numCombMergeCand = 0;
+  // --- bipredictive merge candidates ---
 
   if (shdr->slice_type == SLICE_TYPE_B) {
     derive_combined_bipredictive_merging_candidates(ctx, shdr,
-                                                    mergeCandList, &numMergeCand, numMergeCand);
-
-    //numCombMergeCand = numMergeCand - numOrigMergeCand;
+                                                    mergeCandList, &numMergeCand, maxCandidates);
   }
 
 
-  // 7.
+  // --- zero-vector merge candidates ---
 
-  derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand);
-
-  // 8.
-
-  int merge_idx = tctx->merge_idx; // get_merge_idx(ctx,xP,yP);
-  out_vi->lum = mergeCandList[merge_idx];
+  derive_zero_motion_vector_candidates(shdr, mergeCandList, &numMergeCand, maxCandidates);
 
 
   logtrace(LogMotion,"mergeCandList:\n");
   for (int i=0;i<shdr->MaxNumMergeCand;i++)
     {
-      logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? " SELECTED":"");
+      //logtrace(LogMotion, " %d:%s\n", i, i==merge_idx ? " SELECTED":"");
       logmvcand(mergeCandList[i]);
     }
+}
+
+
+
+void get_merge_candidate_list(base_context* ctx,
+                              const slice_segment_header* shdr,
+                              de265_image* img,
+                              int xC,int yC, int xP,int yP,
+                              int nCS, int nPbW,int nPbH, int partIdx,
+                              MotionVectorSpec* mergeCandList)
+{
+  int max_merge_idx = 5-shdr->five_minus_max_num_merge_cand -1;
 
-  // 9.
+  get_merge_candidate_list_without_step_9(ctx, shdr, img,
+                                          xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx,
+                                          max_merge_idx, mergeCandList);
 
-  if (out_vi->lum.predFlag[0] && out_vi->lum.predFlag[1] && nOrigPbW+nOrigPbH==12) {
-    out_vi->lum.refIdx[1] = -1;
-    out_vi->lum.predFlag[1] = 0;
+  // 9. for encoder: modify all merge candidates
+
+  for (int i=0;i<=max_merge_idx;i++) {
+    if (mergeCandList[i].predFlag[0] &&
+        mergeCandList[i].predFlag[1] &&
+        nPbW+nPbH==12)
+      {
+        mergeCandList[i].refIdx[1]   = -1;
+        mergeCandList[i].predFlag[1] = 0;
+      }
+  }
+}
+
+
+
+void derive_luma_motion_merge_mode(base_context* ctx,
+                                   const slice_segment_header* shdr,
+                                   de265_image* img,
+                                   int xC,int yC, int xP,int yP,
+                                   int nCS, int nPbW,int nPbH, int partIdx,
+                                   int merge_idx,
+                                   MotionVectorSpec* out_vi)
+{
+  MotionVectorSpec mergeCandList[5];
+
+  get_merge_candidate_list_without_step_9(ctx, shdr, img,
+                                          xC,yC,xP,yP,nCS,nPbW,nPbH, partIdx,
+                                          merge_idx, mergeCandList);
+
+
+  *out_vi = mergeCandList[merge_idx];
+
+  // 8.5.3.1.1 / 9.
+
+  if (out_vi->predFlag[0] && out_vi->predFlag[1] && nPbW+nPbH==12) {
+    out_vi->refIdx[1] = -1;
+    out_vi->predFlag[1] = 0;
   }
 }
 
 
 // 8.5.3.1.6
-void derive_spatial_luma_vector_prediction(de265_image* img,
+void derive_spatial_luma_vector_prediction(base_context* ctx,
+                                           de265_image* img,
                                            const slice_segment_header* shdr,
                                            int xC,int yC,int nCS,int xP,int yP,
                                            int nPbW,int nPbH, int X,
@@ -1388,13 +1570,15 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
                                            uint8_t out_availableFlagLXN[2],
                                            MotionVector out_mvLXN[2])
 {
-  const decoder_context* ctx = img->decctx;
-
   int isScaledFlagLX = 0;
 
   const int A=0;
   const int B=1;
 
+  out_availableFlagLXN[A] = 0;
+  out_availableFlagLXN[B] = 0;
+
+
   // --- A ---
 
   // 1.
@@ -1428,9 +1612,9 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
   int refIdxA=-1;
 
   // the POC we want to reference in this PB
-  const int referenced_POC = ctx->get_image(shdr->RefPicList[X][ refIdxLX ])->PicOrderCntVal;
-
-  const int referenced_refIdx = refIdxLX;
+  const de265_image* tmpimg = ctx->get_image(shdr->RefPicList[X][ refIdxLX ]);
+  if (tmpimg==NULL) { return; }
+  const int referenced_POC = tmpimg->PicOrderCntVal;
 
   for (int k=0;k<=1;k++) {
     if (availableA[k] &&
@@ -1438,15 +1622,18 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
         img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) {
 
       int Y=1-X;
-      
-      const PredVectorInfo* vi = img->get_mv_info(xA[k],yA[k]);
+
+      const MotionVectorSpec* vi = img->get_mv_info(xA[k],yA[k]);
       logtrace(LogMotion,"MVP A%d=\n",k);
       logmvcand(*vi);
 
+      const de265_image* imgX = NULL;
+      if (vi->predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ]);
+      const de265_image* imgY = NULL;
+      if (vi->predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ]);
+
       // check whether the predictor X is available and references the same POC
-      if (vi->predFlag[X] &&
-          ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ])->PicOrderCntVal == referenced_POC) {
-        //vi->refIdx[X] == referenced_refIdx) {
+      if (vi->predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) {
 
         logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,X);
 
@@ -1455,9 +1642,7 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
         refIdxA = vi->refIdx[X];
       }
       // check whether the other predictor (Y) is available and references the same POC
-      else if (vi->predFlag[Y] &&
-               ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ])->PicOrderCntVal == referenced_POC) {
-        //vi->refIdx[Y] == referenced_refIdx) {
+      else if (vi->predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) {
 
         logtrace(LogMotion,"take A%d/L%d as A candidate with same POC\n",k,Y);
 
@@ -1479,8 +1664,8 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
         img->get_pred_mode(xA[k],yA[k]) != MODE_INTRA) {
 
       int Y=1-X;
-      
-      const PredVectorInfo* vi = img->get_mv_info(xA[k],yA[k]);
+
+      const MotionVectorSpec* vi = img->get_mv_info(xA[k],yA[k]);
       if (vi->predFlag[X]==1 &&
           shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
 
@@ -1504,14 +1689,19 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
     }
 
     if (out_availableFlagLXN[A]==1) {
+      if (refIdxA<0) {
+        out_availableFlagLXN[0] = out_availableFlagLXN[1] = false;
+        return; // error
+      }
+
       assert(refIdxA>=0);
       assert(refPicList>=0);
 
       const de265_image* refPicA = ctx->get_image(shdr->RefPicList[refPicList][refIdxA ]);
       const de265_image* refPicX = ctx->get_image(shdr->RefPicList[X         ][refIdxLX]);
 
-      int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ];
-      int picStateX = shdr->RefPicList_PicState[X         ][refIdxLX];
+      //int picStateA = shdr->RefPicList_PicState[refPicList][refIdxA ];
+      //int picStateX = shdr->RefPicList_PicState[X         ][refIdxLX];
 
       int isLongTermA = shdr->LongTermRefPic[refPicList][refIdxA ];
       int isLongTermX = shdr->LongTermRefPic[X         ][refIdxLX];
@@ -1527,9 +1717,9 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
         {
           int distA = img->PicOrderCntVal - refPicA->PicOrderCntVal;
           int distX = img->PicOrderCntVal - referenced_POC;
-          
+
           if (!scale_mv(&out_mvLXN[A], out_mvLXN[A], distA, distX)) {
-            img->decctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
+            ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
             img->integrity = INTEGRITY_DECODING_ERRORS;
           }
         }
@@ -1564,24 +1754,27 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
     availableB[k] = img->available_pred_blk(xC,yC, nCS, xP,yP, nPbW,nPbH,partIdx, xB[k],yB[k]);
 
     if (availableB[k] && out_availableFlagLXN[B]==0) {
-      
+
       int Y=1-X;
-      
-      const PredVectorInfo* vi = img->get_mv_info(xB[k],yB[k]);
+
+      const MotionVectorSpec* vi = img->get_mv_info(xB[k],yB[k]);
       logtrace(LogMotion,"MVP B%d=\n",k);
       logmvcand(*vi);
 
 
-      if (vi->predFlag[X] &&
-          ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ])->PicOrderCntVal == referenced_POC) {
+      const de265_image* imgX = NULL;
+      if (vi->predFlag[X]) imgX = ctx->get_image(shdr->RefPicList[X][ vi->refIdx[X] ]);
+      const de265_image* imgY = NULL;
+      if (vi->predFlag[Y]) imgY = ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ]);
+
+      if (vi->predFlag[X] && imgX && imgX->PicOrderCntVal == referenced_POC) {
         logtrace(LogMotion,"a) take B%d/L%d as B candidate with same POC\n",k,X);
 
         out_availableFlagLXN[B]=1;
         out_mvLXN[B] = vi->mv[X];
         refIdxB = vi->refIdx[X];
       }
-      else if (vi->predFlag[Y] &&
-               ctx->get_image(shdr->RefPicList[Y][ vi->refIdx[Y] ])->PicOrderCntVal == referenced_POC) {
+      else if (vi->predFlag[Y] && imgY && imgY->PicOrderCntVal == referenced_POC) {
         logtrace(LogMotion,"b) take B%d/L%d as B candidate with same POC\n",k,Y);
 
         out_availableFlagLXN[B]=1;
@@ -1618,8 +1811,8 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
 
       if (availableB[k]) {
         int Y=1-X;
-      
-        const PredVectorInfo* vi = img->get_mv_info(xB[k],yB[k]);
+
+        const MotionVectorSpec* vi = img->get_mv_info(xB[k],yB[k]);
 
         if (vi->predFlag[X]==1 &&
             shdr->LongTermRefPic[X][refIdxLX] == shdr->LongTermRefPic[X][ vi->refIdx[X] ]) {
@@ -1638,24 +1831,33 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
       }
 
       if (out_availableFlagLXN[B]==1) {
+        if (refIdxB<0) {
+          out_availableFlagLXN[0] = out_availableFlagLXN[1] = false;
+          return; // error
+        }
+
         assert(refPicList>=0);
         assert(refIdxB>=0);
 
-        const de265_image* refPicB=img->decctx->get_image(shdr->RefPicList[refPicList][refIdxB ]);
-        const de265_image* refPicX=img->decctx->get_image(shdr->RefPicList[X         ][refIdxLX]);
+        const de265_image* refPicB=ctx->get_image(shdr->RefPicList[refPicList][refIdxB ]);
+        const de265_image* refPicX=ctx->get_image(shdr->RefPicList[X         ][refIdxLX]);
 
         int isLongTermB = shdr->LongTermRefPic[refPicList][refIdxB ];
         int isLongTermX = shdr->LongTermRefPic[X         ][refIdxLX];
 
-        if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal &&
-            !isLongTermB && !isLongTermX) {
+        if (refPicB==NULL || refPicX==NULL) {
+          img->decctx->add_warning(DE265_WARNING_NONEXISTING_REFERENCE_PICTURE_ACCESSED,false);
+          img->integrity = INTEGRITY_DECODING_ERRORS;
+        }
+        else if (refPicB->PicOrderCntVal != refPicX->PicOrderCntVal &&
+                 !isLongTermB && !isLongTermX) {
           int distB = img->PicOrderCntVal - refPicB->PicOrderCntVal;
           int distX = img->PicOrderCntVal - referenced_POC;
 
           logtrace(LogMotion,"scale MVP B: B-POC:%d X-POC:%d\n",refPicB->PicOrderCntVal,refPicX->PicOrderCntVal);
 
           if (!scale_mv(&out_mvLXN[B], out_mvLXN[B], distB, distX)) {
-            img->decctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
+            ctx->add_warning(DE265_WARNING_INCORRECT_MOTION_VECTOR_SCALING, false);
             img->integrity = INTEGRITY_DECODING_ERRORS;
           }
         }
@@ -1664,22 +1866,23 @@ void derive_spatial_luma_vector_prediction(de265_image* img,
   }
 }
 
+
 // 8.5.3.1.5
-MotionVector luma_motion_vector_prediction(decoder_context* ctx,
-                                           thread_context* tctx,
-                                           int xC,int yC,int nCS,int xP,int yP,
-                                           int nPbW,int nPbH, int l,
-                                           int refIdx, int partIdx)
+void fill_luma_motion_vector_predictors(base_context* ctx,
+                                        const slice_segment_header* shdr,
+                                        de265_image* img,
+                                        int xC,int yC,int nCS,int xP,int yP,
+                                        int nPbW,int nPbH, int l,
+                                        int refIdx, int partIdx,
+                                        MotionVector out_mvpList[2])
 {
-  const slice_segment_header* shdr = tctx->shdr;
-
-
   // 8.5.3.1.6: derive two spatial vector predictors A (0) and B (1)
 
   uint8_t availableFlagLXN[2];
   MotionVector mvLXN[2];
 
-  derive_spatial_luma_vector_prediction(tctx->img, shdr, xC,yC, nCS, xP,yP, nPbW,nPbH, l, refIdx, partIdx,
+  derive_spatial_luma_vector_prediction(ctx, img, shdr, xC,yC, nCS, xP,yP,
+                                        nPbW,nPbH, l, refIdx, partIdx,
                                         availableFlagLXN, mvLXN);
 
   // 8.5.3.1.7: if we only have one spatial vector or both spatial vectors are the same,
@@ -1695,7 +1898,8 @@ MotionVector luma_motion_vector_prediction(decoder_context* ctx,
     availableFlagLXCol = 0;
   }
   else {
-    derive_temporal_luma_vector_prediction(ctx, tctx->img, shdr, xP,yP, nPbW,nPbH, refIdx,l,
+    derive_temporal_luma_vector_prediction(ctx, img, shdr,
+                                           xP,yP, nPbW,nPbH, refIdx,l,
                                            &mvLXCol, &availableFlagLXCol);
   }
 
@@ -1706,10 +1910,9 @@ MotionVector luma_motion_vector_prediction(decoder_context* ctx,
 
   // spatial predictor A
 
-  MotionVector mvpList[3];
   if (availableFlagLXN[0])
     {
-      mvpList[numMVPCandLX++] = mvLXN[0];
+      out_mvpList[numMVPCandLX++] = mvLXN[0];
     }
 
   // spatial predictor B (if not same as A)
@@ -1718,42 +1921,62 @@ MotionVector luma_motion_vector_prediction(decoder_context* ctx,
       (!availableFlagLXN[0] || // in case A in not available, but mvLXA initialized to same as mvLXB
        (mvLXN[0].x != mvLXN[1].x || mvLXN[0].y != mvLXN[1].y)))
     {
-      mvpList[numMVPCandLX++] = mvLXN[1];
+      out_mvpList[numMVPCandLX++] = mvLXN[1];
     }
 
   // temporal predictor
 
   if (availableFlagLXCol)
     {
-      mvpList[numMVPCandLX++] = mvLXCol;
+      out_mvpList[numMVPCandLX++] = mvLXCol;
     }
 
   // fill with zero predictors
 
   while (numMVPCandLX<2) {
-    mvpList[numMVPCandLX].x = 0;
-    mvpList[numMVPCandLX].y = 0;
+    out_mvpList[numMVPCandLX].x = 0;
+    out_mvpList[numMVPCandLX].y = 0;
     numMVPCandLX++;
   }
 
 
+  assert(numMVPCandLX==2);
+}
+
+
+MotionVector luma_motion_vector_prediction(base_context* ctx,
+                                           const slice_segment_header* shdr,
+                                           de265_image* img,
+                                           const motion_spec& motion,
+                                           int xC,int yC,int nCS,int xP,int yP,
+                                           int nPbW,int nPbH, int l,
+                                           int refIdx, int partIdx)
+{
+  MotionVector mvpList[2];
+
+  fill_luma_motion_vector_predictors(ctx, shdr, img,
+                                     xC,yC,nCS,xP,yP,
+                                     nPbW, nPbH, l, refIdx, partIdx,
+                                     mvpList);
+
   // select predictor according to mvp_lX_flag
 
-  return mvpList[ tctx->mvp_lX_flag[l] ];
+  return mvpList[ l ? motion.mvp_l1_flag : motion.mvp_l0_flag ];
 }
 
+
 #if DE265_LOG_TRACE
-void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const VectorInfo* mv)
+void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const MotionVectorSpec* mv)
 {
-  int pred0 = mv->lum.predFlag[0];
-  int pred1 = mv->lum.predFlag[1];
+  int pred0 = mv->predFlag[0];
+  int pred1 = mv->predFlag[1];
 
   logtrace(LogMotion,
            "*MV %d;%d [%d;%d] %s: (%d) %d;%d @%d   (%d) %d;%d @%d\n", x0,y0,nPbW,nPbH,mode,
            pred0,
-           pred0 ? mv->lum.mv[0].x : 0,pred0 ? mv->lum.mv[0].y : 0, pred0 ? mv->lum.refIdx[0] : 0,
+           pred0 ? mv->mv[0].x : 0,pred0 ? mv->mv[0].y : 0, pred0 ? mv->refIdx[0] : 0,
            pred1,
-           pred1 ? mv->lum.mv[1].x : 0,pred1 ? mv->lum.mv[1].y : 0, pred1 ? mv->lum.refIdx[1] : 0);
+           pred1 ? mv->mv[1].x : 0,pred1 ? mv->mv[1].y : 0, pred1 ? mv->refIdx[1] : 0);
 }
 #else
 #define logMV(x0,y0,nPbW,nPbH,mode,mv)
@@ -1762,22 +1985,27 @@ void logMV(int x0,int y0,int nPbW,int nPbH, const char* mode,const VectorInfo* m
 
 
 // 8.5.3.1
-void motion_vectors_and_ref_indices(decoder_context* ctx,
-                                    thread_context* tctx,
-                                    int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx,
-                                    VectorInfo* out_vi)
+void motion_vectors_and_ref_indices(base_context* ctx,
+                                    const slice_segment_header* shdr,
+                                    de265_image* img,
+                                    const motion_spec& motion,
+                                    int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH,
+                                    int partIdx,
+                                    MotionVectorSpec* out_vi)
 {
   //slice_segment_header* shdr = tctx->shdr;
 
   int xP = xC+xB;
   int yP = yC+yB;
 
-  enum PredMode predMode = tctx->img->get_pred_mode(xC,yC);
+  enum PredMode predMode = img->get_pred_mode(xC,yC);
 
   if (predMode == MODE_SKIP ||
-      (predMode == MODE_INTER && tctx->merge_flag))
+      (predMode == MODE_INTER && motion.merge_flag))
     {
-      derive_luma_motion_merge_mode(ctx,tctx, xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx, out_vi);
+      derive_luma_motion_merge_mode(ctx,shdr,img,
+                                    xC,yC, xP,yP, nCS,nPbW,nPbH, partIdx,
+                                    motion.merge_idx, out_vi);
 
       logMV(xP,yP,nPbW,nPbH, "merge_mode", out_vi);
     }
@@ -1788,38 +2016,39 @@ void motion_vectors_and_ref_indices(decoder_context* ctx,
     for (int l=0;l<2;l++) {
       // 1.
 
-      enum InterPredIdc inter_pred_idc = (enum InterPredIdc)tctx->inter_pred_idc;
+      enum InterPredIdc inter_pred_idc = (enum InterPredIdc)motion.inter_pred_idc;
 
       if (inter_pred_idc == PRED_BI ||
           (inter_pred_idc == PRED_L0 && l==0) ||
           (inter_pred_idc == PRED_L1 && l==1)) {
-        out_vi->lum.refIdx[l] = tctx->refIdx[l];
-        out_vi->lum.predFlag[l] = 1;
+        out_vi->refIdx[l] = motion.refIdx[l];
+        out_vi->predFlag[l] = 1;
       }
       else {
-        out_vi->lum.refIdx[l] = -1;
-        out_vi->lum.predFlag[l] = 0;
+        out_vi->refIdx[l] = -1;
+        out_vi->predFlag[l] = 0;
       }
 
       // 2.
 
-      mvdL[l][0] = tctx->mvd[l][0];
-      mvdL[l][1] = tctx->mvd[l][1];
+      mvdL[l][0] = motion.mvd[l][0];
+      mvdL[l][1] = motion.mvd[l][1];
 
 
-      if (out_vi->lum.predFlag[l]) {
+      if (out_vi->predFlag[l]) {
         // 3.
 
-        mvpL[l] = luma_motion_vector_prediction(ctx,tctx,xC,yC,nCS,xP,yP, nPbW,nPbH, l,
-                                                out_vi->lum.refIdx[l], partIdx);
+        mvpL[l] = luma_motion_vector_prediction(ctx,shdr,img,motion,
+                                                xC,yC,nCS,xP,yP, nPbW,nPbH, l,
+                                                out_vi->refIdx[l], partIdx);
 
         // 4.
 
         int32_t x = (mvpL[l].x + mvdL[l][0] + 0x10000) & 0xFFFF;
         int32_t y = (mvpL[l].y + mvdL[l][1] + 0x10000) & 0xFFFF;
 
-        out_vi->lum.mv[l].x = (x>=0x8000) ? x-0x10000 : x;
-        out_vi->lum.mv[l].y = (y>=0x8000) ? y-0x10000 : y;
+        out_vi->mv[l].x = (x>=0x8000) ? x-0x10000 : x;
+        out_vi->mv[l].y = (y>=0x8000) ? y-0x10000 : y;
       }
     }
 
@@ -1829,24 +2058,27 @@ void motion_vectors_and_ref_indices(decoder_context* ctx,
 
 
 // 8.5.3
-void decode_prediction_unit(thread_context* tctx,
+void decode_prediction_unit(base_context* ctx,
+                            const slice_segment_header* shdr,
+                            de265_image* img,
+                            const motion_spec& motion,
                             int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx)
 {
   logtrace(LogMotion,"decode_prediction_unit POC=%d %d;%d %dx%d\n",
-           tctx->img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH);
+           img->PicOrderCntVal, xC+xB,yC+yB, nPbW,nPbH);
 
-  slice_segment_header* shdr = tctx->shdr;
+  //slice_segment_header* shdr = tctx->shdr;
 
   // 1.
 
-  VectorInfo vi;
-  motion_vectors_and_ref_indices(tctx->decctx,tctx, xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi);
+  MotionVectorSpec vi;
+  motion_vectors_and_ref_indices(ctx, shdr, img, motion,
+                                 xC,yC, xB,yB, nCS, nPbW,nPbH, partIdx, &vi);
 
   // 2.
 
-  generate_inter_prediction_samples(tctx->decctx,tctx->img, shdr, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi);
+  generate_inter_prediction_samples(ctx,shdr, img, xC,yC, xB,yB, nCS, nPbW,nPbH, &vi);
 
 
-  tctx->img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, &vi.lum);
+  img->set_mv_info(xC+xB,yC+yB,nPbW,nPbH, vi);
 }
-
diff --git a/libde265/motion.h b/libde265/motion.h
index b296804..eb532f1 100644
--- a/libde265/motion.h
+++ b/libde265/motion.h
@@ -23,6 +23,8 @@
 
 #include <stdint.h>
 
+class base_context;
+class slice_segment_header;
 
 typedef struct
 {
@@ -32,23 +34,63 @@ typedef struct
 
 typedef struct
 {
-   int8_t refIdx[2];
-  uint8_t predFlag[2];
-  MotionVector mv[2];
-} PredVectorInfo;
+  uint8_t predFlag[2];  // which of the two vectors is actually used
+   int8_t   refIdx[2];
+  MotionVector  mv[2];
+} MotionVectorSpec;
 
 
-typedef struct
-{
-  PredVectorInfo lum;
-  MotionVector mvC[2];
-} VectorInfo;
+typedef struct {
+  int8_t  refIdx[2];
+  int16_t mvd[2][2]; // [L0/L1][x/y]  (only in top left position - ???)
 
+  uint8_t inter_pred_idc : 2; // enum InterPredIdc
+  uint8_t mvp_l0_flag : 1;
+  uint8_t mvp_l1_flag : 1;
+  uint8_t merge_flag : 1;
+  uint8_t merge_idx  : 3;
+} motion_spec;
 
-void decode_prediction_unit(struct thread_context* shdr,
-                            int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx);
 
-void inter_prediction(struct decoder_context* ctx,struct slice_segment_header* shdr,
-                      int xC,int yC, int log2CbSize);
+void get_merge_candidate_list(base_context* ctx,
+                              const slice_segment_header* shdr,
+                              struct de265_image* img,
+                              int xC,int yC, int xP,int yP,
+                              int nCS, int nPbW,int nPbH, int partIdx,
+                              MotionVectorSpec* mergeCandList);
+
+/*
+int derive_spatial_merging_candidates(const struct de265_image* img,
+                                      int xC, int yC, int nCS, int xP, int yP,
+                                      uint8_t singleMCLFlag,
+                                      int nPbW, int nPbH,
+                                      int partIdx,
+                                      MotionVectorSpec* out_cand,
+                                      int maxCandidates);
+*/
+
+void generate_inter_prediction_samples(base_context* ctx,
+                                       const slice_segment_header* shdr,
+                                       struct de265_image* img,
+                                       int xC,int yC,
+                                       int xB,int yB,
+                                       int nCS, int nPbW,int nPbH,
+                                       const MotionVectorSpec* vi);
+
+
+/* Fill list (two entries) of motion-vector predictors for MVD coding.
+ */
+void fill_luma_motion_vector_predictors(base_context* ctx,
+                                        const slice_segment_header* shdr,
+                                        de265_image* img,
+                                        int xC,int yC,int nCS,int xP,int yP,
+                                        int nPbW,int nPbH, int l,
+                                        int refIdx, int partIdx,
+                                        MotionVector out_mvpList[2]);
+
+
+void decode_prediction_unit(base_context* ctx,const slice_segment_header* shdr,
+                            de265_image* img, const motion_spec& motion,
+                            int xC,int yC, int xB,int yB, int nCS, int nPbW,int nPbH, int partIdx);
 
 #endif
diff --git a/libde265/nal-parser.cc b/libde265/nal-parser.cc
index e1c17ad..ea95ed1 100644
--- a/libde265/nal-parser.cc
+++ b/libde265/nal-parser.cc
@@ -58,10 +58,13 @@ void NAL_unit::clear()
   skipped_bytes.clear();
 }
 
-void NAL_unit::resize(int new_size)
+LIBDE265_CHECK_RESULT bool NAL_unit::resize(int new_size)
 {
   if (capacity < new_size) {
     unsigned char* newbuffer = (unsigned char*)malloc(new_size);
+    if (newbuffer == NULL) {
+      return false;
+    }
 
     if (nal_data != NULL) {
       memcpy(newbuffer, nal_data, data_size);
@@ -71,20 +74,27 @@ void NAL_unit::resize(int new_size)
     nal_data = newbuffer;
     capacity = new_size;
   }
+  return true;
 }
 
-void NAL_unit::append(const unsigned char* in_data, int n)
+LIBDE265_CHECK_RESULT bool NAL_unit::append(const unsigned char* in_data, int n)
 {
-  resize(data_size + n);
+  if (!resize(data_size + n)) {
+    return false;
+  }
   memcpy(nal_data + data_size, in_data, n);
   data_size += n;
+  return true;
 }
 
-void NAL_unit::set_data(const unsigned char* in_data, int n)
+bool LIBDE265_CHECK_RESULT NAL_unit::set_data(const unsigned char* in_data, int n)
 {
-  resize(n);
+  if (!resize(n)) {
+    return false;
+  }
   memcpy(nal_data, in_data, n);
   data_size = n;
+  return true;
 }
 
 void NAL_unit::insert_skipped_byte(int pos)
@@ -177,7 +187,7 @@ NAL_Parser::~NAL_Parser()
 }
 
 
-NAL_unit* NAL_Parser::alloc_NAL_unit(int size)
+LIBDE265_CHECK_RESULT NAL_unit* NAL_Parser::alloc_NAL_unit(int size)
 {
   NAL_unit* nal;
 
@@ -192,13 +202,20 @@ NAL_unit* NAL_Parser::alloc_NAL_unit(int size)
   }
 
   nal->clear();
-  nal->resize(size);
+  if (!nal->resize(size)) {
+    free_NAL_unit(nal);
+    return NULL;
+  }
 
   return nal;
 }
 
 void NAL_Parser::free_NAL_unit(NAL_unit* nal)
 {
+  if (nal == NULL) {
+    // Allow calling with NULL just like regular "free()"
+    return;
+  }
   if (NAL_free_list.size() < DE265_NAL_FREE_LIST_SIZE) {
     NAL_free_list.push_back(nal);
   }
@@ -235,6 +252,9 @@ de265_error NAL_Parser::push_data(const unsigned char* data, int len,
 
   if (pending_input_NAL == NULL) {
     pending_input_NAL = alloc_NAL_unit(len+3);
+    if (pending_input_NAL == NULL) {
+      return DE265_ERROR_OUT_OF_MEMORY;
+    }
     pending_input_NAL->pts = pts;
     pending_input_NAL->user_data = user_data;
   }
@@ -243,7 +263,9 @@ de265_error NAL_Parser::push_data(const unsigned char* data, int len,
 
   // Resize output buffer so that complete input would fit.
   // We add 3, because in the worst case 3 extra bytes are created for an input byte.
-  nal->resize(nal->size() + len + 3);
+  if (!nal->resize(nal->size() + len + 3)) {
+    return DE265_ERROR_OUT_OF_MEMORY;
+  }
 
   unsigned char* out = nal->data() + nal->size();
 
@@ -316,6 +338,9 @@ de265_error NAL_Parser::push_data(const unsigned char* data, int len,
         // initialize new, empty NAL unit
 
         pending_input_NAL = alloc_NAL_unit(len+3);
+        if (pending_input_NAL == NULL) {
+          return DE265_ERROR_OUT_OF_MEMORY;
+        }
         pending_input_NAL->pts = pts;
         pending_input_NAL->user_data = user_data;
         nal = pending_input_NAL;
@@ -352,7 +377,10 @@ de265_error NAL_Parser::push_NAL(const unsigned char* data, int len,
   end_of_frame = false;
 
   NAL_unit* nal = alloc_NAL_unit(len);
-  nal->set_data(data, len);
+  if (nal == NULL || !nal->set_data(data, len)) {
+    free_NAL_unit(nal);
+    return DE265_ERROR_OUT_OF_MEMORY;
+  }
   nal->pts = pts;
   nal->user_data = user_data;
 
@@ -372,8 +400,16 @@ de265_error NAL_Parser::flush_data()
 
     // append bytes that are implied by the push state
 
-    if (input_push_state==6) { nal->append(null,1); }
-    if (input_push_state==7) { nal->append(null,2); }
+    if (input_push_state==6) {
+      if (!nal->append(null,1)) {
+        return DE265_ERROR_OUT_OF_MEMORY;
+      }
+    }
+    if (input_push_state==7) {
+      if (!nal->append(null,2)) {
+        return DE265_ERROR_OUT_OF_MEMORY;
+      }
+    }
 
 
     // only push the NAL if it contains at least the NAL header
diff --git a/libde265/nal-parser.h b/libde265/nal-parser.h
index 921b27a..a63a7fd 100644
--- a/libde265/nal-parser.h
+++ b/libde265/nal-parser.h
@@ -24,6 +24,7 @@
 #include "libde265/sps.h"
 #include "libde265/pps.h"
 #include "libde265/nal.h"
+#include "libde265/util.h"
 
 #include <vector>
 #include <queue>
@@ -47,9 +48,9 @@ class NAL_unit {
 
   // --- rbsp data ---
 
-  void resize(int new_size);
-  void append(const unsigned char* data, int n);
-  void set_data(const unsigned char* data, int n);
+  LIBDE265_CHECK_RESULT bool resize(int new_size);
+  LIBDE265_CHECK_RESULT bool append(const unsigned char* data, int n);
+  LIBDE265_CHECK_RESULT bool set_data(const unsigned char* data, int n);
 
   int size() const { return data_size; }
   void set_size(int s) { data_size=s; }
@@ -90,13 +91,12 @@ class NAL_Parser
   ~NAL_Parser();
 
   de265_error push_data(const unsigned char* data, int len,
-                        de265_PTS pts, void* user_data);
+                        de265_PTS pts, void* user_data = NULL);
 
   de265_error push_NAL(const unsigned char* data, int len,
-                       de265_PTS pts, void* user_data);
+                       de265_PTS pts, void* user_data = NULL);
 
   NAL_unit*   pop_from_NAL_queue();
-  void        push_to_NAL_queue(NAL_unit*);
   de265_error flush_data();
   void        mark_end_of_stream() { end_of_stream=true; }
   void        mark_end_of_frame() { end_of_frame=true; }
@@ -114,6 +114,10 @@ class NAL_Parser
     return size;
   }
 
+  int number_of_complete_NAL_units_pending() const {
+    return NAL_queue.size();
+  }
+
   void free_NAL_unit(NAL_unit*);
 
 
@@ -136,12 +140,14 @@ class NAL_Parser
   std::queue<NAL_unit*> NAL_queue;  // enqueued NALs have suffing bytes removed
   int nBytes_in_NAL_queue; // data bytes currently in NAL_queue
 
+  void push_to_NAL_queue(NAL_unit*);
+
 
   // pool of unused NAL memory
 
   std::vector<NAL_unit*> NAL_free_list;  // maximum size: DE265_NAL_FREE_LIST_SIZE
 
-  NAL_unit* alloc_NAL_unit(int size);
+  LIBDE265_CHECK_RESULT NAL_unit* alloc_NAL_unit(int size);
 };
 
 
diff --git a/libde265/nal.cc b/libde265/nal.cc
index b3160fa..380f04d 100644
--- a/libde265/nal.cc
+++ b/libde265/nal.cc
@@ -19,15 +19,25 @@
  */
 
 #include "nal.h"
+#include "cabac.h"
 #include <assert.h>
 
 
-void nal_read_header(bitreader* reader, nal_header* hdr)
+void nal_header::read(bitreader* reader)
 {
   skip_bits(reader,1);
-  hdr->nal_unit_type = get_bits(reader,6);
-  hdr->nuh_layer_id  = get_bits(reader,6);
-  hdr->nuh_temporal_id = get_bits(reader,3) -1;
+  nal_unit_type = get_bits(reader,6);
+  nuh_layer_id  = get_bits(reader,6);
+  nuh_temporal_id = get_bits(reader,3) -1;
+}
+
+
+void nal_header::write(CABAC_encoder& out) const
+{
+  out.skip_bits(1);
+  out.write_bits(nal_unit_type,6);
+  out.write_bits(nuh_layer_id ,6);
+  out.write_bits(nuh_temporal_id+1,3);
 }
 
 
@@ -80,6 +90,23 @@ bool isReferenceNALU(uint8_t unit_type)
             (unit_type <= NAL_UNIT_RESERVED_IRAP_VCL23)) );
 }
 
+bool isSublayerNonReference(uint8_t unit_type)
+{
+  switch (unit_type) {
+  case NAL_UNIT_TRAIL_N:
+  case NAL_UNIT_TSA_N:
+  case NAL_UNIT_STSA_N:
+  case NAL_UNIT_RADL_N:
+  case NAL_UNIT_RASL_N:
+  case NAL_UNIT_RESERVED_VCL_N10:
+  case NAL_UNIT_RESERVED_VCL_N12:
+  case NAL_UNIT_RESERVED_VCL_N14:
+    return true;
+
+  default:
+    return false;
+  }
+}
 
 static const char* NAL_unit_name[] = {
   "TRAIL_N", // 0
@@ -134,7 +161,6 @@ static const char* NAL_unit_name[] = {
 
 const char* get_NAL_name(uint8_t unit_type)
 {
-  assert(unit_type <= 47);
+  if (unit_type >= 48) { return "INVALID NAL >= 48"; }
   return NAL_unit_name[unit_type];
 }
-
diff --git a/libde265/nal.h b/libde265/nal.h
index cf31728..2bd85db 100644
--- a/libde265/nal.h
+++ b/libde265/nal.h
@@ -31,6 +31,7 @@
 #endif
 
 #include "libde265/bitstream.h"
+#include "libde265/cabac.h"
 
 struct nal_header {
   nal_header() {
@@ -39,9 +40,18 @@ struct nal_header {
     nuh_temporal_id = 0;
   }
 
-  int nal_unit_type;
-  int nuh_layer_id;
-  int nuh_temporal_id;
+  void read(bitreader* reader);
+  void write(CABAC_encoder& writer) const;
+
+  void set(int unit_type, int layer_id=0, int temporal_id=0) {
+    nal_unit_type  =unit_type;
+    nuh_layer_id   =layer_id;
+    nuh_temporal_id=temporal_id;
+  }
+
+  uint8_t nal_unit_type;
+  uint8_t nuh_layer_id;
+  uint8_t nuh_temporal_id;
 };
 
 #define NAL_UNIT_TRAIL_N  0
@@ -95,8 +105,6 @@ struct nal_header {
 
 #define NAL_UNIT_UNDEFINED    255
 
-void nal_read_header(bitreader* reader, nal_header*);
-
 bool isIDR(uint8_t unit_type);
 bool isBLA(uint8_t unit_type);
 bool isCRA(uint8_t unit_type);
@@ -105,7 +113,17 @@ bool isRASL(uint8_t unit_type);
 bool isIRAP(uint8_t unit_type);
 bool isRADL(uint8_t unit_type);
 bool isReferenceNALU(uint8_t unit_type);
+bool isSublayerNonReference(uint8_t unit_type);
 
 const char* get_NAL_name(uint8_t unit_type);
 
+inline bool isIdrPic(uint8_t nal_unit_type) {
+  return (nal_unit_type == NAL_UNIT_IDR_W_RADL ||
+          nal_unit_type == NAL_UNIT_IDR_N_LP);
+}
+
+inline bool isRapPic(uint8_t nal_unit_type) {
+  return nal_unit_type >= 16 && nal_unit_type <= 23;
+}
+
 #endif
diff --git a/libde265/pps.cc b/libde265/pps.cc
index eb4b01f..f893958 100644
--- a/libde265/pps.cc
+++ b/libde265/pps.cc
@@ -32,9 +32,154 @@
 #endif
 
 
+void pps_range_extension::reset()
+{
+  log2_max_transform_skip_block_size = 2;
+  cross_component_prediction_enabled_flag = false;
+  chroma_qp_offset_list_enabled_flag = false;
+  diff_cu_chroma_qp_offset_depth = 0;
+  chroma_qp_offset_list_len = 0;
+  log2_sao_offset_scale_luma = 0;
+  log2_sao_offset_scale_chroma = 0;
+}
+
+
+bool pps_range_extension::read(bitreader* br, decoder_context* ctx, const pic_parameter_set* pps)
+{
+  const seq_parameter_set* sps = ctx->get_sps(pps->seq_parameter_set_id);
+
+  int uvlc;
+
+  if (pps->transform_skip_enabled_flag) {
+    uvlc = get_uvlc(br);
+    if (uvlc == UVLC_ERROR ||
+        uvlc+2 > sps->Log2MaxTrafoSize) {
+
+      // Note: this is out of spec, but the conformance stream
+      // PERSIST_RPARAM_A_RExt_Sony_2 codes a too large value.
+
+      //ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+      //return false;
+    }
+
+    log2_max_transform_skip_block_size = uvlc+2;
+  }
+
+  cross_component_prediction_enabled_flag = get_bits(br,1);
+  if (sps->ChromaArrayType != CHROMA_444 &&
+      cross_component_prediction_enabled_flag) {
+      ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+  }
+
+  chroma_qp_offset_list_enabled_flag = get_bits(br,1);
+  if (sps->ChromaArrayType == CHROMA_MONO &&
+      chroma_qp_offset_list_enabled_flag) {
+      ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+  }
+
+  if (chroma_qp_offset_list_enabled_flag) {
+    uvlc = get_uvlc(br);
+    if (uvlc == UVLC_ERROR ||
+        uvlc > sps->log2_diff_max_min_luma_coding_block_size) {
+      ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+      return false;
+    }
+
+    diff_cu_chroma_qp_offset_depth = uvlc;
+
+
+    uvlc = get_uvlc(br);
+    if (uvlc == UVLC_ERROR ||
+        uvlc > 5) {
+      ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+      return false;
+    }
+
+    chroma_qp_offset_list_len = uvlc+1;
+
+    for (int i=0;i<chroma_qp_offset_list_len;i++) {
+      int svlc;
+      svlc = get_svlc(br);
+      if (svlc == UVLC_ERROR ||
+          svlc < -12 || svlc > 12) {
+        ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+        return false;
+      }
+
+      cb_qp_offset_list[i] = svlc;
+
+      svlc = get_svlc(br);
+      if (svlc == UVLC_ERROR ||
+          svlc < -12 || svlc > 12) {
+        ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+        return false;
+      }
+
+      cr_qp_offset_list[i] = svlc;
+    }
+  }
+
+
+  uvlc = get_uvlc(br);
+  if (uvlc == UVLC_ERROR ||
+      uvlc > libde265_max(0, sps->BitDepth_Y-10)) {
+    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+    return false;
+  }
+
+  log2_sao_offset_scale_luma = uvlc;
+
+  uvlc = get_uvlc(br);
+  if (uvlc == UVLC_ERROR ||
+      uvlc > libde265_max(0, sps->BitDepth_C-10)) {
+    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+    return false;
+  }
+
+  log2_sao_offset_scale_chroma = uvlc;
+
+  return true;
+}
+
+
+void pps_range_extension::dump(int fd) const
+{
+  FILE* fh;
+  if (fd==1) fh=stdout;
+  else if (fd==2) fh=stderr;
+  else { return; }
+
+#define LOG0(t) log2fh(fh, t)
+#define LOG1(t,d) log2fh(fh, t,d)
+#define LOG2(t,d,e) log2fh(fh, t,d,e)
+
+  LOG0("---------- PPS range-extension ----------\n");
+  LOG1("log2_max_transform_skip_block_size      : %d\n", log2_max_transform_skip_block_size);
+  LOG1("cross_component_prediction_enabled_flag : %d\n", cross_component_prediction_enabled_flag);
+  LOG1("chroma_qp_offset_list_enabled_flag      : %d\n", chroma_qp_offset_list_enabled_flag);
+  if (chroma_qp_offset_list_enabled_flag) {
+    LOG1("diff_cu_chroma_qp_offset_depth          : %d\n", diff_cu_chroma_qp_offset_depth);
+    LOG1("chroma_qp_offset_list_len               : %d\n", chroma_qp_offset_list_len);
+    for (int i=0;i<chroma_qp_offset_list_len;i++) {
+      LOG2("cb_qp_offset_list[%d]                    : %d\n", i,cb_qp_offset_list[i]);
+      LOG2("cr_qp_offset_list[%d]                    : %d\n", i,cr_qp_offset_list[i]);
+    }
+  }
+
+  LOG1("log2_sao_offset_scale_luma              : %d\n", log2_sao_offset_scale_luma);
+  LOG1("log2_sao_offset_scale_chroma            : %d\n", log2_sao_offset_scale_chroma);
+#undef LOG2
+#undef LOG1
+#undef LOG0
+}
+
+
+
+
+
 pic_parameter_set::pic_parameter_set()
 {
-  pps_read = false;
+  reset();
 }
 
 
@@ -43,9 +188,88 @@ pic_parameter_set::~pic_parameter_set()
 }
 
 
+void pic_parameter_set::set_defaults(enum PresetSet)
+{ // Restores the not-yet-read default state of this PPS; the preset argument is unused.
+  pps_read = false;
+
+  pic_parameter_set_id = 0;
+  seq_parameter_set_id = 0;
+  dependent_slice_segments_enabled_flag = 0;
+  sign_data_hiding_flag = 0;
+  cabac_init_present_flag = 0;
+  num_ref_idx_l0_default_active = 1;
+  num_ref_idx_l1_default_active = 1;
+
+  pic_init_qp = 27;
+  constrained_intra_pred_flag = 0;
+  transform_skip_enabled_flag = 0;
+
+  cu_qp_delta_enabled_flag = 0;
+  diff_cu_qp_delta_depth = 0;
+
+  pic_cb_qp_offset = 0;
+  pic_cr_qp_offset = 0;
+  pps_slice_chroma_qp_offsets_present_flag = 0;
+  weighted_pred_flag  = 0;
+  weighted_bipred_flag= 0;
+  output_flag_present_flag = 0;
+  transquant_bypass_enable_flag = 0;
+  entropy_coding_sync_enabled_flag = 0;
+
+  // --- tiles ---
+  tiles_enabled_flag = 0;
+  num_tile_columns = 1;
+  num_tile_rows    = 1;
+  uniform_spacing_flag = 1;
+
+  // --- loop filter across tile / slice boundaries ---
+  loop_filter_across_tiles_enabled_flag = 1;
+  pps_loop_filter_across_slices_enabled_flag = 1;
+
+  for (int i=0;i<DE265_MAX_TILE_COLUMNS;i++) { colWidth[i]=0; }
+  for (int i=0;i<DE265_MAX_TILE_ROWS;i++)    { rowHeight[i]=0; }
+  for (int i=0;i<=DE265_MAX_TILE_COLUMNS;i++) { colBd[i]=0; } // boundary arrays hold one extra entry
+  for (int i=0;i<=DE265_MAX_TILE_ROWS;i++)    { rowBd[i]=0; }
+
+  CtbAddrRStoTS.clear();
+  CtbAddrTStoRS.clear();
+  TileId.clear();
+  TileIdRS.clear();
+  MinTbAddrZS.clear();
+
+  deblocking_filter_control_present_flag = 0;
+  deblocking_filter_override_enabled_flag = 0;
+  pic_disable_deblocking_filter_flag = 0;
+
+  beta_offset = 0;
+  tc_offset   = 0;
+
+  pic_scaling_list_data_present_flag = 0;
+  // TODO struct scaling_list_data scaling_list;
+
+  lists_modification_present_flag = 0;
+  log2_parallel_merge_level = 2;
+
+  num_extra_slice_header_bits = 0;
+  slice_segment_header_extension_present_flag = 0;
+  pps_extension_flag = 0;
+
+  pps_range_extension_flag = 0;
+  pps_multilayer_extension_flag = 0;
+  pps_extension_6bits = 0;
+  range_extension.reset(); // otherwise fields parsed by an earlier read() would survive reset()
+
+  // --- derived values (set_derived_values() recomputes them from the SPS) ---
+  Log2MinCuQpDeltaSize = 0;
+  Log2MinCuChromaQpOffsetSize = 0;
+  Log2MaxTransformSkipSize = range_extension.log2_max_transform_skip_block_size;
+}
+
+
 bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
 {
-  pps_read = false; // incomplete pps
+  reset();
+
 
   int uvlc;
   pic_parameter_set_id = uvlc = get_uvlc(br);
@@ -162,6 +386,10 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
           lastColumnWidth -= colWidth[i];
         }
 
+      if (lastColumnWidth <= 0) {
+        return false;
+      }
+
       colWidth[num_tile_columns-1] = lastColumnWidth;
 
       for (int i=0; i<num_tile_rows-1; i++)
@@ -175,6 +403,11 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
           lastRowHeight -= rowHeight[i];
         }
 
+      if (lastRowHeight <= 0) {
+        return false;
+      }
+
+
       rowHeight[num_tile_rows-1] = lastRowHeight;
     }
 
@@ -189,6 +422,122 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
 
 
 
+  // END tiles
+
+
+
+  beta_offset = 0; // default value
+  tc_offset   = 0; // default value
+
+  pps_loop_filter_across_slices_enabled_flag = get_bits(br,1);
+  deblocking_filter_control_present_flag = get_bits(br,1);
+  if (deblocking_filter_control_present_flag) {
+    deblocking_filter_override_enabled_flag = get_bits(br,1);
+    pic_disable_deblocking_filter_flag = get_bits(br,1);
+    if (!pic_disable_deblocking_filter_flag) {
+      beta_offset = get_svlc(br);
+      if (beta_offset == UVLC_ERROR) {
+	ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+	return false;
+      }
+      beta_offset *= 2;
+
+      tc_offset   = get_svlc(br);
+      if (tc_offset == UVLC_ERROR) {
+	ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+	return false;
+      }
+      tc_offset   *= 2;
+    }
+  }
+  else {
+    deblocking_filter_override_enabled_flag = 0;
+    pic_disable_deblocking_filter_flag = 0;
+  }
+
+
+  // --- scaling list ---
+
+  pic_scaling_list_data_present_flag = get_bits(br,1);
+
+  // check consistency: if scaling-lists are not enabled, pic_scaling_list_data_present_flag
+  // must be FALSE
+  if (sps->scaling_list_enable_flag==0 &&
+      pic_scaling_list_data_present_flag != 0) {
+    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+    return false;
+  }
+
+  if (pic_scaling_list_data_present_flag) {
+    de265_error err = read_scaling_list(br, sps, &scaling_list, true);
+    if (err != DE265_OK) {
+      ctx->add_warning(err, false);
+      return false;
+    }
+  }
+  else {
+    memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data));
+  }
+
+
+
+
+  lists_modification_present_flag = get_bits(br,1);
+  log2_parallel_merge_level = get_uvlc(br);
+  if (log2_parallel_merge_level == UVLC_ERROR) {
+    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+    return false;
+  }
+  log2_parallel_merge_level += 2;
+
+  if (log2_parallel_merge_level-2 > sps->log2_min_luma_coding_block_size-3 +1 +
+      sps->log2_diff_max_min_luma_coding_block_size) {
+    return false;
+  }
+
+  slice_segment_header_extension_present_flag = get_bits(br,1);
+  pps_extension_flag = get_bits(br,1);
+
+  if (pps_extension_flag) {
+    pps_range_extension_flag = get_bits(br,1);
+    pps_multilayer_extension_flag = get_bits(br,1);
+    pps_extension_6bits = get_bits(br,6);
+
+    if (pps_range_extension_flag) {
+      bool success = range_extension.read(br, ctx, this);
+      if (!success) {
+        return false;
+      }
+    }
+
+    //assert(false);
+    /*
+      while( more_rbsp_data() )
+
+      pps_extension_data_flag
+      u(1)
+      rbsp_trailing_bits()
+
+      }
+    */
+  }
+
+
+  set_derived_values(sps);
+
+  pps_read = true;
+
+  return true;
+}
+
+
+void pic_parameter_set::set_derived_values(const seq_parameter_set* sps)
+{
+  Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth;
+
+  Log2MinCuChromaQpOffsetSize = sps->Log2CtbSizeY - range_extension.diff_cu_chroma_qp_offset_depth;
+  Log2MaxTransformSkipSize = range_extension.log2_max_transform_skip_block_size;
+
   if (uniform_spacing_flag) {
 
     // set columns widths
@@ -278,6 +627,7 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
     }
 
 
+#if 0
   logtrace(LogHeaders,"6.5.1 CtbAddrRSToTS\n");
   for (int y=0;y<sps->PicHeightInCtbsY;y++)
     {
@@ -288,7 +638,7 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
 
       logtrace(LogHeaders,"\n");
     }
-
+#endif
 
   // tile id
 
@@ -306,6 +656,7 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
         tIdx++;
       }
 
+#if 0
   logtrace(LogHeaders,"Tile IDs RS:\n");
   for (int y=0;y<sps->PicHeightInCtbsY;y++) {
     for (int x=0;x<sps->PicWidthInCtbsY;x++) {
@@ -313,6 +664,7 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
     }
     logtrace(LogHeaders,"\n");
   }
+#endif
 
   // 6.5.2 Z-scan order array initialization process
 
@@ -363,79 +715,127 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
     }
     }
   */
+}
 
-  // END tiles
 
+bool pic_parameter_set::write(error_queue* errqueue, CABAC_encoder& out,
+                              const seq_parameter_set* sps)
+{
+  if (pic_parameter_set_id >= DE265_MAX_PPS_SETS) {
+    errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
+    return false;
+  }
+  out.write_uvlc(pic_parameter_set_id);
 
-  Log2MinCuQpDeltaSize = sps->Log2CtbSizeY - diff_cu_qp_delta_depth;
+  if (seq_parameter_set_id >= DE265_MAX_PPS_SETS) {
+    errqueue->add_warning(DE265_WARNING_NONEXISTING_SPS_REFERENCED, false);
+    return false;
+  }
+  out.write_uvlc(seq_parameter_set_id);
 
+  out.write_bit(dependent_slice_segments_enabled_flag);
+  out.write_bit(output_flag_present_flag);
+  out.write_bits(num_extra_slice_header_bits,3);
+  out.write_bit(sign_data_hiding_flag);
+  out.write_bit(cabac_init_present_flag);
+  out.write_uvlc(num_ref_idx_l0_default_active-1);
+  out.write_uvlc(num_ref_idx_l1_default_active-1);
 
-  beta_offset = 0; // default value
-  tc_offset   = 0; // default value
+  out.write_svlc(pic_init_qp-26);
 
-  pps_loop_filter_across_slices_enabled_flag = get_bits(br,1);
-  deblocking_filter_control_present_flag = get_bits(br,1);
-  if (deblocking_filter_control_present_flag) {
-    deblocking_filter_override_enabled_flag = get_bits(br,1);
-    pic_disable_deblocking_filter_flag = get_bits(br,1);
-    if (!pic_disable_deblocking_filter_flag) {
-      beta_offset = get_svlc(br);
-      if (beta_offset == UVLC_ERROR) {
-	ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
-	return false;
-      }
-      beta_offset *= 2;
+  out.write_bit(constrained_intra_pred_flag);
+  out.write_bit(transform_skip_enabled_flag);
+  out.write_bit(cu_qp_delta_enabled_flag);
 
-      tc_offset   = get_svlc(br);
-      if (tc_offset == UVLC_ERROR) {
-	ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
-	return false;
-      }
-      tc_offset   *= 2;
+  if (cu_qp_delta_enabled_flag) {
+    out.write_uvlc(diff_cu_qp_delta_depth);
+  }
+
+  out.write_svlc(pic_cb_qp_offset);
+  out.write_svlc(pic_cr_qp_offset);
+
+  out.write_bit(pps_slice_chroma_qp_offsets_present_flag);
+  out.write_bit(weighted_pred_flag);
+  out.write_bit(weighted_bipred_flag);
+  out.write_bit(transquant_bypass_enable_flag);
+  out.write_bit(tiles_enabled_flag);
+  out.write_bit(entropy_coding_sync_enabled_flag);
+
+
+  // --- tiles ---
+
+  if (tiles_enabled_flag) {
+    if (num_tile_columns > DE265_MAX_TILE_COLUMNS) {
+      errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+      return false;
+    }
+    out.write_uvlc(num_tile_columns-1);
+
+    if (num_tile_rows > DE265_MAX_TILE_ROWS) {
+      errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+      return false;
+    }
+    out.write_uvlc(num_tile_rows-1);
+
+    out.write_bit(uniform_spacing_flag);
+
+    if (uniform_spacing_flag==false) {
+      for (int i=0; i<num_tile_columns-1; i++)
+        {
+          out.write_uvlc(colWidth[i]-1);
+        }
+
+      for (int i=0; i<num_tile_rows-1; i++)
+        {
+          out.write_uvlc(rowHeight[i]-1);
+        }
     }
+
+    out.write_bit(loop_filter_across_tiles_enabled_flag);
   }
-  else {
-    deblocking_filter_override_enabled_flag = 0;
-    pic_disable_deblocking_filter_flag = 0;
+
+
+  out.write_bit(pps_loop_filter_across_slices_enabled_flag);
+  out.write_bit(deblocking_filter_control_present_flag);
+
+  if (deblocking_filter_control_present_flag) {
+    out.write_bit(deblocking_filter_override_enabled_flag);
+    out.write_bit(pic_disable_deblocking_filter_flag);
+
+    if (!pic_disable_deblocking_filter_flag) {
+      out.write_svlc(beta_offset/2);
+      out.write_svlc(tc_offset  /2);
+    }
   }
 
 
   // --- scaling list ---
 
-  pic_scaling_list_data_present_flag = get_bits(br,1);
+  out.write_bit(pic_scaling_list_data_present_flag);
 
   // check consistency: if scaling-lists are not enabled, pic_scalign_list_data_present_flag
   // must be FALSE
   if (sps->scaling_list_enable_flag==0 &&
       pic_scaling_list_data_present_flag != 0) {
-    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
+    errqueue->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
     return false;
   }
 
   if (pic_scaling_list_data_present_flag) {
-    de265_error err = read_scaling_list(br, sps, &scaling_list, true);
+    de265_error err = write_scaling_list(out, sps, &scaling_list, true);
     if (err != DE265_OK) {
-      ctx->add_warning(err, false);
+      errqueue->add_warning(err, false);
       return false;
     }
   }
-  else {
-    memcpy(&scaling_list, &sps->scaling_list, sizeof(scaling_list_data));
-  }
 
 
 
+  out.write_bit(lists_modification_present_flag);
+  out.write_uvlc(log2_parallel_merge_level-2);
 
-  lists_modification_present_flag = get_bits(br,1);
-  log2_parallel_merge_level = get_uvlc(br);
-  if (log2_parallel_merge_level == UVLC_ERROR) {
-    ctx->add_warning(DE265_WARNING_PPS_HEADER_INVALID, false);
-    return false;
-  }
-  log2_parallel_merge_level += 2;
-
-  slice_segment_header_extension_present_flag = get_bits(br,1);
-  pps_extension_flag = get_bits(br,1);
+  out.write_bit(slice_segment_header_extension_present_flag);
+  out.write_bit(pps_extension_flag);
 
   if (pps_extension_flag) {
     //assert(false);
@@ -457,7 +857,7 @@ bool pic_parameter_set::read(bitreader* br, decoder_context* ctx)
 }
 
 
-void pic_parameter_set::dump_pps(int fd) const
+void pic_parameter_set::dump(int fd) const
 {
   FILE* fh;
   if (fd==1) fh=stdout;
@@ -536,7 +936,7 @@ void pic_parameter_set::dump_pps(int fd) const
     LOG1("pic_disable_deblocking_filter_flag: %d\n", pic_disable_deblocking_filter_flag);
 
     LOG1("beta_offset:  %d\n", beta_offset);
-    LOG1("tc_offset:     %d\n", tc_offset);
+    LOG1("tc_offset:    %d\n", tc_offset);
   }
 
   LOG1("pic_scaling_list_data_present_flag: %d\n", pic_scaling_list_data_present_flag);
@@ -548,12 +948,22 @@ void pic_parameter_set::dump_pps(int fd) const
   LOG1("log2_parallel_merge_level      : %d\n", log2_parallel_merge_level);
   LOG1("num_extra_slice_header_bits    : %d\n", num_extra_slice_header_bits);
   LOG1("slice_segment_header_extension_present_flag : %d\n", slice_segment_header_extension_present_flag);
-  LOG1("pps_extension_flag : %d\n", pps_extension_flag);
+  LOG1("pps_extension_flag            : %d\n", pps_extension_flag);
+  LOG1("pps_range_extension_flag      : %d\n", pps_range_extension_flag);
+  LOG1("pps_multilayer_extension_flag : %d\n", pps_multilayer_extension_flag);
+  LOG1("pps_extension_6bits           : %d\n", pps_extension_6bits);
 
-  LOG1("Log2MinCuQpDeltaSize : %d\n", Log2MinCuQpDeltaSize);
+  LOG1("Log2MinCuQpDeltaSize          : %d\n", Log2MinCuQpDeltaSize);
+  LOG1("Log2MinCuChromaQpOffsetSize (RExt) : %d\n", Log2MinCuChromaQpOffsetSize);
+  LOG1("Log2MaxTransformSkipSize    (RExt) : %d\n", Log2MaxTransformSkipSize);
 
 #undef LOG0
 #undef LOG1
+
+
+  if (pps_range_extension_flag) {
+    range_extension.dump(fd);
+  }
 }
 
 
diff --git a/libde265/pps.h b/libde265/pps.h
index 6b93214..74b3e08 100644
--- a/libde265/pps.h
+++ b/libde265/pps.h
@@ -29,16 +29,47 @@
 #define DE265_MAX_TILE_COLUMNS 10
 #define DE265_MAX_TILE_ROWS    10
 
+class decoder_context;
+class pic_parameter_set;
 
-struct pic_parameter_set {
+// Syntax elements of the PPS range extension; pic_parameter_set::read() parses them when pps_range_extension_flag is set.
+class pps_range_extension
+{
+ public:
+  pps_range_extension() { reset(); }
+
+  void reset();
+
+  bool read(bitreader*, decoder_context*, const pic_parameter_set*); // returns false when a parsed value is rejected
+  void dump(int fd) const; // fd: 1 = stdout, 2 = stderr, any other value prints nothing
+
+  uint8_t log2_max_transform_skip_block_size; // copied into pic_parameter_set::Log2MaxTransformSkipSize
+  bool    cross_component_prediction_enabled_flag;
+  bool    chroma_qp_offset_list_enabled_flag;
+  uint8_t diff_cu_chroma_qp_offset_depth; // subtracted from Log2CtbSizeY to get Log2MinCuChromaQpOffsetSize
+  uint8_t chroma_qp_offset_list_len; // number of entries dump() prints from the two lists below
+  int8_t  cb_qp_offset_list[6];
+  int8_t  cr_qp_offset_list[6];
+  uint8_t log2_sao_offset_scale_luma;   // read() accepts at most max(0, BitDepth_Y-10)
+  uint8_t log2_sao_offset_scale_chroma; // read() accepts at most max(0, BitDepth_C-10)
+};
+
+
+class pic_parameter_set {
+public:
   pic_parameter_set();
   ~pic_parameter_set();
 
-  bool read(bitreader*, struct decoder_context*);
+  void reset() { set_defaults(); }
+  bool read(bitreader*, decoder_context*);
+  bool write(error_queue*, CABAC_encoder&,
+             const seq_parameter_set* sps);
 
   bool is_tile_start_CTB(int ctbX,int ctbY) const;
-  void dump_pps(int fd) const;
+  void dump(int fd) const;
+
 
+  void set_defaults(enum PresetSet = Preset_Default);
 
   bool pps_read; // whether this pps has been read from bitstream
 
@@ -47,20 +78,23 @@ struct pic_parameter_set {
   char dependent_slice_segments_enabled_flag;
   char sign_data_hiding_flag;
   char cabac_init_present_flag;
-  char num_ref_idx_l0_default_active;
-  char num_ref_idx_l1_default_active;
+  char num_ref_idx_l0_default_active; // [1;16]
+  char num_ref_idx_l1_default_active; // [1;16]
 
   int pic_init_qp;
   char constrained_intra_pred_flag;
   char transform_skip_enabled_flag;
 
-  // if ( cu_qp_delta_enabled_flag )
+  // --- QP ---
+
   char cu_qp_delta_enabled_flag;
-  int  diff_cu_qp_delta_depth;
+  int  diff_cu_qp_delta_depth;   // [ 0 ; log2_diff_max_min_luma_coding_block_size ]
 
   int  pic_cb_qp_offset;
   int  pic_cr_qp_offset;
   char pps_slice_chroma_qp_offsets_present_flag;
+
+
   char weighted_pred_flag;
   char weighted_bipred_flag;
   char output_flag_present_flag;
@@ -70,33 +104,15 @@ struct pic_parameter_set {
 
   // --- tiles ---
 
-  //if( tiles_enabled_flag ) {
   char tiles_enabled_flag;
-  int  num_tile_columns;
-  int  num_tile_rows;
+  int  num_tile_columns;  // [1;PicWidthInCtbsY]
+  int  num_tile_rows;     // [1;PicHeightInCtbsY]
   char uniform_spacing_flag;
 
-  // derived values
-  int colWidth [ DE265_MAX_TILE_COLUMNS ];
-  int rowHeight[ DE265_MAX_TILE_ROWS ];
-  int colBd    [ DE265_MAX_TILE_COLUMNS+1 ];
-  int rowBd    [ DE265_MAX_TILE_ROWS+1 ];
-
-  std::vector<int> CtbAddrRStoTS; // #CTBs
-  std::vector<int> CtbAddrTStoRS; // #CTBs
-  std::vector<int> TileId;        // #CTBs  // index in tile-scan order
-  std::vector<int> TileIdRS;      // #CTBs  // index in raster-scan order
-  std::vector<int> MinTbAddrZS;   // #TBs   [x + y*PicWidthInTbsY]
-
-
-  // --- QP ---
-
-  int Log2MinCuQpDeltaSize;
 
   // --- ---
 
   char loop_filter_across_tiles_enabled_flag;
-
   char pps_loop_filter_across_slices_enabled_flag;
   char deblocking_filter_control_present_flag;
 
@@ -110,11 +126,35 @@ struct pic_parameter_set {
   struct scaling_list_data scaling_list; // contains valid data if sps->scaling_list_enabled_flag set
 
   char lists_modification_present_flag;
-  int log2_parallel_merge_level;
+  int log2_parallel_merge_level; // [2 ; log2(max CB size)]
   char num_extra_slice_header_bits;
   char slice_segment_header_extension_present_flag;
   char pps_extension_flag;
+  char pps_range_extension_flag;
+  char pps_multilayer_extension_flag;
+  char pps_extension_6bits;
+
+  pps_range_extension range_extension;
+
+
+  // --- derived values ---
+
+  int Log2MinCuQpDeltaSize;
+  int Log2MinCuChromaQpOffsetSize;
+  int Log2MaxTransformSkipSize;
+
+  int colWidth [ DE265_MAX_TILE_COLUMNS ];
+  int rowHeight[ DE265_MAX_TILE_ROWS ];
+  int colBd    [ DE265_MAX_TILE_COLUMNS+1 ];
+  int rowBd    [ DE265_MAX_TILE_ROWS+1 ];
+
+  std::vector<int> CtbAddrRStoTS; // #CTBs
+  std::vector<int> CtbAddrTStoRS; // #CTBs
+  std::vector<int> TileId;        // #CTBs  // index in tile-scan order
+  std::vector<int> TileIdRS;      // #CTBs  // index in raster-scan order
+  std::vector<int> MinTbAddrZS;   // #TBs   [x + y*PicWidthInTbsY]
 
+  void set_derived_values(const seq_parameter_set* sps);
 };
 
 #endif
diff --git a/libde265/quality.cc b/libde265/quality.cc
new file mode 100644
index 0000000..9a3f2d7
--- /dev/null
+++ b/libde265/quality.cc
@@ -0,0 +1,112 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "quality.h"
+#include <math.h>
+
+// Sum of squared differences between two width x height blocks of 8-bit samples.
+uint32_t SSD(const uint8_t* img, int imgStride,
+             const uint8_t* ref, int refStride,
+             int width, int height)
+{
+  uint32_t sum=0; // 32-bit accumulator; NOTE(review): may wrap for very large blocks - confirm callers
+
+  const uint8_t* iPtr = img;
+  const uint8_t* rPtr = ref;
+
+  for (int y=0;y<height;y++) {
+    for (int x=0;x<width;x++) {
+      int diff = iPtr[x] - rPtr[x];
+      sum += diff*diff;
+    }
+
+    iPtr += imgStride; // each stride is the distance in samples from one row to the next
+    rPtr += refStride;
+  }
+
+  return sum;
+}
+
+// Sum of absolute differences between two width x height blocks of 8-bit samples.
+uint32_t SAD(const uint8_t* img, int imgStride,
+             const uint8_t* ref, int refStride,
+             int width, int height)
+{
+  uint32_t sum=0;
+
+  const uint8_t* iPtr = img;
+  const uint8_t* rPtr = ref;
+
+  for (int y=0;y<height;y++) {
+    for (int x=0;x<width;x++) {
+      int diff = iPtr[x] - rPtr[x];
+      sum += abs_value(diff); // abs_value() is declared outside this file
+    }
+
+    iPtr += imgStride; // each stride is the distance in samples from one row to the next
+    rPtr += refStride;
+  }
+
+  return sum;
+}
+
+// Mean squared error between two width x height blocks of 8-bit samples.
+double MSE(const uint8_t* img, int imgStride,
+           const uint8_t* ref, int refStride,
+           int width, int height)
+{
+  double sum=0.0;
+
+  const uint8_t* iPtr = img;
+  const uint8_t* rPtr = ref;
+
+  for (int y=0;y<height;y++) {
+    uint32_t lineSum=0; // squared differences of the current row only
+
+    for (int x=0;x<width;x++) {
+      int diff = iPtr[x] - rPtr[x];
+      lineSum += diff*diff;
+    }
+
+    sum += ((double)lineSum)/width; // accumulate the per-row mean
+
+    iPtr += imgStride;
+    rPtr += refStride;
+  }
+
+  return sum/height; // mean of the row means; width and height are not checked for zero
+}
+
+// Converts a mean squared error into a PSNR value in dB, using 255 as the peak sample value.
+double PSNR(double mse)
+{
+  if (mse==0) { return 99.99999; } // zero error: return a fixed cap instead of dividing by zero
+
+  return 10*log10(255.0*255.0/mse);
+}
+// SSD between the same square block (side 1<<log2size, at x0,y0, plane cIdx) of two images.
+uint32_t compute_distortion_ssd(const de265_image* img1, const de265_image* img2,
+                                int x0, int y0, int log2size, int cIdx)
+{
+  return SSD(img1->get_image_plane_at_pos(cIdx,x0,y0), img1->get_image_stride(cIdx),
+             img2->get_image_plane_at_pos(cIdx,x0,y0), img2->get_image_stride(cIdx),
+             1<<log2size, 1<<log2size); // SSD() reads 8-bit samples - NOTE(review): confirm for deeper planes
+}
+
diff --git a/libde265/quality.h b/libde265/quality.h
new file mode 100644
index 0000000..7073d14
--- /dev/null
+++ b/libde265/quality.h
@@ -0,0 +1,47 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_QUALITY_H
+#define DE265_QUALITY_H
+
+#include <stdint.h>
+#include <libde265/de265.h>
+#include <libde265/image.h>
+
+// Sum of squared differences of two width x height blocks of 8-bit samples (32-bit result).
+LIBDE265_API uint32_t SSD(const uint8_t* img, int imgStride,
+                          const uint8_t* ref, int refStride,
+                          int width, int height);
+// Sum of absolute differences of two width x height blocks of 8-bit samples.
+LIBDE265_API uint32_t SAD(const uint8_t* img, int imgStride,
+                          const uint8_t* ref, int refStride,
+                          int width, int height);
+// Mean squared error of two width x height blocks of 8-bit samples.
+LIBDE265_API double MSE(const uint8_t* img, int imgStride,
+                        const uint8_t* ref, int refStride,
+                        int width, int height);
+// PSNR in dB for a given MSE with peak value 255; returns 99.99999 when mse is 0.
+LIBDE265_API double PSNR(double mse);
+
+// SSD of the square block with side 1<<log2size at (x0,y0) in plane cIdx of both images.
+LIBDE265_API uint32_t compute_distortion_ssd(const de265_image* img1, const de265_image* img2,
+                                             int x0, int y0, int log2size, int cIdx);
+
+#endif
diff --git a/libde265/refpic.cc b/libde265/refpic.cc
index 88cb5c3..f6ff3e3 100644
--- a/libde265/refpic.cc
+++ b/libde265/refpic.cc
@@ -31,17 +31,37 @@
 #endif
 
 
-static void compute_NumPoc(ref_pic_set* rpset)
+void ref_pic_set::reset()
 {
-  rpset->NumPocTotalCurr_shortterm_only = 0;
-  
-  for (int i=0; i<rpset->NumNegativePics; i++)
-    if (rpset->UsedByCurrPicS0[i])
-      rpset->NumPocTotalCurr_shortterm_only++;
+  NumNegativePics = 0; // empty set: no past and no future reference pictures
+  NumPositivePics = 0;
+  NumDeltaPocs = 0;
+  NumPocTotalCurr_shortterm_only = 0;
+
+  for (int i=0;i<MAX_NUM_REF_PICS;i++) { // clear all MAX_NUM_REF_PICS slots of the four lists
+    DeltaPocS0[i] = 0;
+    DeltaPocS1[i] = 0;
+
+    UsedByCurrPicS0[i] = 0;
+    UsedByCurrPicS1[i] = 0;
+  }
+}
+
+
+void ref_pic_set::compute_derived_values()
+{
+  NumPocTotalCurr_shortterm_only = 0;
+
+  for (int i=0; i<NumNegativePics; i++)
+    if (UsedByCurrPicS0[i])
+      NumPocTotalCurr_shortterm_only++;
+
+  for (int i=0; i<NumPositivePics; i++)
+    if (UsedByCurrPicS1[i])
+      NumPocTotalCurr_shortterm_only++;
+
+  NumDeltaPocs = NumNegativePics + NumPositivePics;
 
-  for (int i=0; i<rpset->NumPositivePics; i++)
-    if (rpset->UsedByCurrPicS1[i])
-      rpset->NumPocTotalCurr_shortterm_only++;
 
   /*
     NOTE: this is done when reading the slice header.
@@ -62,7 +82,7 @@ static void compute_NumPoc(ref_pic_set* rpset)
    When coding the ref-pic-sets in the SPS, predicition is always from the previous set.
    In the slice header, the ref-pic-set can use any previous set as reference.
  */
-bool read_short_term_ref_pic_set(decoder_context* ctx,
+bool read_short_term_ref_pic_set(error_queue* errqueue,
                                  const seq_parameter_set* sps,
                                  bitreader* br,
                                  ref_pic_set* out_set, // where to store the read set
@@ -92,6 +112,14 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
     int delta_idx;
     if (sliceRefPicSet) { // idxRps == num_short_term_ref_pic_sets) {
       delta_idx = vlc = get_uvlc(br);
+      if (delta_idx==UVLC_ERROR) {
+        return false;
+      }
+
+      if (delta_idx>=idxRps) {
+        return false;
+      }
+
       delta_idx++;
     } else {
       delta_idx = 1;
@@ -102,6 +130,7 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     int delta_rps_sign = get_bits(br,1);
     int abs_delta_rps  = vlc = get_uvlc(br);
+    if (vlc==UVLC_ERROR) { return false; }
     abs_delta_rps++;
     int DeltaRPS = (delta_rps_sign ? -abs_delta_rps : abs_delta_rps);
 
@@ -142,8 +171,13 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     // positive list
     for (int j=nPositiveRIdx-1;j>=0;j--) {
+      assert(RIdx >= 0 && RIdx < sets.size());
+      assert(j>=0 && j < MAX_NUM_REF_PICS);
+
       int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS; // new delta
       if (dPoc<0 && use_delta_flag[nNegativeRIdx+j]) {
+        if (i>= MAX_NUM_REF_PICS) { return false; }
+
         out_set->DeltaPocS0[i] = dPoc;
         out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
         i++;
@@ -152,6 +186,8 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     // frame 0
     if (DeltaRPS<0 && use_delta_flag[nDeltaPocsRIdx]) {
+      if (i>= MAX_NUM_REF_PICS) { return false; }
+
       out_set->DeltaPocS0[i] = DeltaRPS;
       out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
       i++;
@@ -161,6 +197,8 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
     for (int j=0;j<nNegativeRIdx;j++) {
       int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
       if (dPoc<0 && use_delta_flag[j]) {
+        if (i>= MAX_NUM_REF_PICS) { return false; }
+
         out_set->DeltaPocS0[i] = dPoc;
         out_set->UsedByCurrPicS0[i] = used_by_curr_pic_flag[j];
         i++;
@@ -179,6 +217,8 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
     for (int j=nNegativeRIdx-1;j>=0;j--) {
       int dPoc = sets[RIdx].DeltaPocS0[j] + DeltaRPS;
       if (dPoc>0 && use_delta_flag[j]) {
+        if (i>= MAX_NUM_REF_PICS) { return false; }
+
         out_set->DeltaPocS1[i] = dPoc;
         out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[j];
         i++;
@@ -187,6 +227,8 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     // frame 0
     if (DeltaRPS>0 && use_delta_flag[nDeltaPocsRIdx]) {
+      if (i>= MAX_NUM_REF_PICS) { return false; }
+
       out_set->DeltaPocS1[i] = DeltaRPS;
       out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nDeltaPocsRIdx];
       i++;
@@ -196,6 +238,8 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
     for (int j=0;j<nPositiveRIdx;j++) {
       int dPoc = sets[RIdx].DeltaPocS1[j] + DeltaRPS;
       if (dPoc>0 && use_delta_flag[nNegativeRIdx+j]) {
+        if (i>= MAX_NUM_REF_PICS) { return false; }
+
         out_set->DeltaPocS1[i] = dPoc;
         out_set->UsedByCurrPicS1[i] = used_by_curr_pic_flag[nNegativeRIdx+j];
         i++;
@@ -204,8 +248,6 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     out_set->NumPositivePics = i;
 
-    out_set->NumDeltaPocs = out_set->NumNegativePics + out_set->NumPositivePics;
-
   } else {
 
     // --- first, read the number of past and future frames in this set ---
@@ -222,14 +264,18 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
       out_set->NumDeltaPocs = 0;
       out_set->NumPocTotalCurr_shortterm_only = 0;
 
-      ctx->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
+      errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
       return false;
     }
 
+    if (num_negative_pics > MAX_NUM_REF_PICS ||
+        num_positive_pics > MAX_NUM_REF_PICS) {
+      errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
+      return false;
+    }
 
     out_set->NumNegativePics = num_negative_pics;
     out_set->NumPositivePics = num_positive_pics;
-    out_set->NumDeltaPocs = num_positive_pics + num_negative_pics;
 
     // --- now, read the deltas between the reference frames to fill the lists ---
 
@@ -237,7 +283,9 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     int lastPocS=0;
     for (int i=0;i<num_negative_pics;i++) {
-      int  delta_poc_s0 = get_uvlc(br)+1;
+      int  delta_poc_s0 = get_uvlc(br);
+      if (delta_poc_s0==UVLC_ERROR) { return false; }
+      delta_poc_s0++;
       char used_by_curr_pic_s0_flag = get_bits(br,1);
 
       out_set->DeltaPocS0[i]      = lastPocS - delta_poc_s0;
@@ -249,7 +297,9 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
 
     lastPocS=0;
     for (int i=0;i<num_positive_pics;i++) {
-      int  delta_poc_s1 = get_uvlc(br)+1;
+      int  delta_poc_s1 = get_uvlc(br);
+      if (delta_poc_s1==UVLC_ERROR) { return false; }
+      delta_poc_s1++;
       char used_by_curr_pic_s1_flag = get_bits(br,1);
 
       out_set->DeltaPocS1[i]      = lastPocS + delta_poc_s1;
@@ -259,12 +309,76 @@ bool read_short_term_ref_pic_set(decoder_context* ctx,
   }
 
 
-  compute_NumPoc(out_set);
+  out_set->compute_derived_values();
 
   return true;
 }
 
 
+bool write_short_term_ref_pic_set_nopred(error_queue* errqueue,
+                                         const seq_parameter_set* sps,
+                                         CABAC_encoder& out,
+                                         const ref_pic_set* in_set, // which set to write
+                                         int idxRps,  // index of the set to be written
+                                         const std::vector<ref_pic_set>& sets, // previously read sets
+                                         bool sliceRefPicSet) // is this in the slice header?
+{ // Writes in_set with explicit counts and POC deltas, never predicting from another set.
+  if (idxRps != 0) {
+    // inter_ref_pic_set_prediction_flag: always written as 0, and only for sets other than the first
+    out.write_bit(0);
+  }
+
+
+  // --- first, write the number of past and future frames in this set ---
+
+  out.write_uvlc(in_set->NumNegativePics);
+  out.write_uvlc(in_set->NumPositivePics);
+
+  // --- now, write the deltas between the reference frames to fill the lists ---
+
+  // past frames
+
+  int lastPocS=0;
+  for (int i=0;i<in_set->NumNegativePics;i++) {
+    int  delta_poc_s0 = lastPocS - in_set->DeltaPocS0[i];
+    char used_by_curr_pic_s0_flag = in_set->UsedByCurrPicS0[i];
+
+    assert(delta_poc_s0 >= 1); // DeltaPocS0 must be strictly decreasing, starting below 0
+    out.write_uvlc(delta_poc_s0-1); // coded as distance to the previous entry, minus one
+    out.write_bit(used_by_curr_pic_s0_flag);
+    lastPocS = in_set->DeltaPocS0[i];
+  }
+
+  // future frames
+
+  lastPocS=0;
+  for (int i=0;i<in_set->NumPositivePics;i++) {
+    int  delta_poc_s1 = in_set->DeltaPocS1[i] - lastPocS;
+    char used_by_curr_pic_s1_flag = in_set->UsedByCurrPicS1[i];
+
+    assert(delta_poc_s1 >= 1); // DeltaPocS1 must be strictly increasing, starting above 0
+    out.write_uvlc(delta_poc_s1-1); // coded as distance to the previous entry, minus one
+    out.write_bit(used_by_curr_pic_s1_flag);
+    lastPocS = in_set->DeltaPocS1[i];
+  }
+
+  return true; // no failure path here; errqueue, sps, sets and sliceRefPicSet are not used
+}
+
+// Writes a short-term reference picture set; currently always delegates to the non-predictive writer.
+bool write_short_term_ref_pic_set(error_queue* errqueue,
+                                  const seq_parameter_set* sps,
+                                  CABAC_encoder& out,
+                                  const ref_pic_set* in_set, // which set to write
+                                  int idxRps,  // index of the set to be written
+                                  const std::vector<ref_pic_set>& sets, // previously read sets
+                                  bool sliceRefPicSet) // is this in the slice header?
+{
+  return write_short_term_ref_pic_set_nopred(errqueue, sps, out, in_set, idxRps, sets,
+                                             sliceRefPicSet);
+}
+
+
 void dump_short_term_ref_pic_set(const ref_pic_set* set, FILE* fh)
 {
   log2fh(fh,"NumDeltaPocs: %d [-:%d +:%d]\n", set->NumDeltaPocs,
diff --git a/libde265/refpic.h b/libde265/refpic.h
index d4784dc..2904197 100644
--- a/libde265/refpic.h
+++ b/libde265/refpic.h
@@ -26,14 +26,9 @@
 #define MAX_NUM_REF_PICS 16  // maximum defined by standard, may be lower for some Levels
 
 
-typedef struct {
-  uint8_t NumNegativePics;  // number of past reference pictures
-  uint8_t NumPositivePics;  // number of future reference pictures
-  uint8_t NumDeltaPocs;     // total number of reference pictures (past + future)
-
-  uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually
-                                             be used for prediction in the current frame. */
-
+class ref_pic_set
+{
+ public:
   // Lists of pictures that have to be kept in the decoded picture buffer for future
   // reference and that may optionally be used for prediction in the current frame.
   // Lists contain the relative POC positions.
@@ -44,7 +39,20 @@ typedef struct {
   uint8_t UsedByCurrPicS0[MAX_NUM_REF_PICS];
   uint8_t UsedByCurrPicS1[MAX_NUM_REF_PICS];
 
-} ref_pic_set;
+  uint8_t NumNegativePics;  // number of past reference pictures
+  uint8_t NumPositivePics;  // number of future reference pictures
+
+  // --- derived values ---
+
+  void compute_derived_values();
+
+  uint8_t NumDeltaPocs;     // total number of reference pictures (past + future)
+
+  uint8_t NumPocTotalCurr_shortterm_only; /* Total number of reference pictures that may actually
+                                             be used for prediction in the current frame. */
+
+  void reset();
+};
 
 
 void dump_short_term_ref_pic_set(const ref_pic_set*, FILE* fh);
diff --git a/libde265/sao.cc b/libde265/sao.cc
index d3df685..b5f6427 100644
--- a/libde265/sao.cc
+++ b/libde265/sao.cc
@@ -25,16 +25,17 @@
 #include <string.h>
 
 
-void apply_sao(de265_image* img, int xCtb,int yCtb,
-               const slice_segment_header* shdr, int cIdx, int nS,
-               const uint8_t* in_img,  int in_stride,
-               /* */ uint8_t* out_img, int out_stride)
+template <class pixel_t>
+void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
+                        const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
+                        const pixel_t* in_img,  int in_stride,
+                        /* */ pixel_t* out_img, int out_stride)
 {
   const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
 
   int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
 
-  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nS,nS);
+  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
 
   if (SaoTypeIdx==0) {
     return;
@@ -46,8 +47,8 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
   const int maxPixelValue = (1<<bitDepth)-1;
 
   // top left position of CTB in pixels
-  const int xC = xCtb*nS;
-  const int yC = yCtb*nS;
+  const int xC = xCtb*nSW;
+  const int yC = yCtb*nSH;
 
   const int width  = img->get_width(cIdx);
   const int height = img->get_height(cIdx);
@@ -55,8 +56,10 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
   const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
 
   const int picWidthInCtbs = sps->PicWidthInCtbsY;
-  const int ctbshift = sps->Log2CtbSizeY - (cIdx>0 ? 1 : 0);
-  const int chromashift = (cIdx>0 ? 1 : 0);
+  const int chromashiftW = sps->get_chroma_shift_W(cIdx);
+  const int chromashiftH = sps->get_chroma_shift_H(cIdx);
+  const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
+  const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
 
 
   for (int i=0;i<5;i++)
@@ -66,12 +69,11 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
 
 
   // actual size of CTB to be processed (can be smaller when partially outside of image)
-  const int ctbW = (xC+nS>width)  ? width -xC : nS;
-  const int ctbH = (yC+nS>height) ? height-yC : nS;
+  const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
+  const int ctbH = (yC+nSH>height) ? height-yC : nSH;
 
 
-  const bool extendedTests = (img->get_CTB_has_pcm(xCtb,yCtb) ||
-                              img->get_CTB_has_cu_transquant_bypass(xCtb,yCtb));
+  const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
 
   if (SaoTypeIdx==2) {
     int hPos[2], vPos[2];
@@ -99,18 +101,18 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
 
 
     for (int j=0;j<ctbH;j++) {
-      const uint8_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
-      /* */ uint8_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
+      const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
+      /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
 
       for (int i=0;i<ctbW;i++) {
         int edgeIdx = -1;
 
         logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
 
-        if (extendedTests &&
-            (sps->pcm_loop_filter_disable_flag &&
-             img->get_pcm_flag((xC+i)<<chromashift,(yC+j)<<chromashift)) ||
-            img->get_cu_transquant_bypass((xC+i)<<chromashift,(yC+j)<<chromashift)) {
+        if ((extendedTests &&
+             (sps->pcm_loop_filter_disable_flag &&
+              img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
+            img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
           continue;
         }
 
@@ -133,23 +135,29 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
             // slice anyway) reduced computation time only by 1.3%.
             // TODO: however, this may still be a big part of SAO itself.
 
-            int sliceAddrRS = img->get_SliceHeader(xS<<chromashift,yS<<chromashift)->SliceAddrRS;
+            slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
+                                                                     yS<<chromashiftH);
+            if (sliceHeader==NULL) { return; }
+
+            int sliceAddrRS = sliceHeader->SliceAddrRS;
             if (sliceAddrRS <  ctbSliceAddrRS &&
-                img->get_SliceHeader((xC+i)<<chromashift,(yC+j)<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
+                img->get_SliceHeader((xC+i)<<chromashiftW,
+                                     (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
               edgeIdx=0;
               break;
             }
 
             if (sliceAddrRS >  ctbSliceAddrRS &&
-                img->get_SliceHeader(xS<<chromashift,yS<<chromashift)->slice_loop_filter_across_slices_enabled_flag==0) {
+                img->get_SliceHeader(xS<<chromashiftW,
+                                     yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
               edgeIdx=0;
               break;
             }
 
 
-            if (pps->loop_filter_across_tiles_enabled_flag==0 && 
-                pps->TileIdRS[(xS>>ctbshift) + (yS>>ctbshift)*picWidthInCtbs] !=
-                pps->TileIdRS[(xC>>ctbshift) + (yC>>ctbshift)*picWidthInCtbs]) {
+            if (pps->loop_filter_across_tiles_enabled_flag==0 &&
+                pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
+                pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
               edgeIdx=0;
               break;
             }
@@ -198,13 +206,17 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
         for (int i=0;i<ctbW;i++) {
 
           if ((sps->pcm_loop_filter_disable_flag &&
-               img->get_pcm_flag((xC+i)<<chromashift,(yC+j)<<chromashift)) ||
-              img->get_cu_transquant_bypass((xC+i)<<chromashift,(yC+j)<<chromashift)) {
+               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
+              img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
             continue;
           }
 
           int bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
 
+          // Shifts are a strange thing. On x86, a 32-bit >>x actually computes >>(x%32)
+          // (x%64 only for 64-bit operands). So we have to take care of large bandShifts.
+          if (bandShift>=8) { bandIdx=0; }
+
           if (bandIdx>0) {
             int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
 
@@ -212,7 +224,7 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
                      offset,
                      in_img[xC+i+(yC+j)*in_stride],
                      in_img[xC+i+(yC+j)*in_stride]+offset);
-          
+
             out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
                                                     in_img[xC+i+(yC+j)*in_stride] + offset);
           }
@@ -227,6 +239,9 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
 
             int bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
 
+            // see above
+            if (bandShift>=8) { bandIdx=0; }
+
             if (bandIdx>0) {
               int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
 
@@ -239,6 +254,25 @@ void apply_sao(de265_image* img, int xCtb,int yCtb,
 }
 
 
+template <class pixel_t> // Dispatcher: picks the 8-bit or 16-bit SAO implementation for channel cIdx.
+void apply_sao(de265_image* img, int xCtb,int yCtb,
+               const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
+               const pixel_t* in_img,  int in_stride,
+               /* */ pixel_t* out_img, int out_stride)
+{
+  if (img->high_bit_depth(cIdx)) { // the image itself decides the sample width, not pixel_t
+    apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
+                                 (uint16_t*)in_img, in_stride, // reinterpreted as 16-bit samples (the C-style cast also drops const)
+                                 (uint16_t*)out_img,out_stride);
+  }
+  else {
+    apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
+                                in_img, in_stride, // passed uncast: this branch only compiles when pixel_t is uint8_t
+                                out_img,out_stride);
+  }
+}
+
+
 void apply_sample_adaptive_offset(de265_image* img)
 {
   if (img->sps.sample_adaptive_offset_enabled_flag==0) {
@@ -258,17 +292,20 @@ void apply_sample_adaptive_offset(de265_image* img)
         const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
 
         if (shdr->slice_sao_luma_flag) {
-          apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY,
+          apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY, 1<<img->sps.Log2CtbSizeY,
                     inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
                     img->get_image_plane(0), img->get_image_stride(0));
         }
 
         if (shdr->slice_sao_chroma_flag) {
-          apply_sao(img, xCtb,yCtb, shdr, 1, 1<<(img->sps.Log2CtbSizeY-1),
+          int nSW = (1<<img->sps.Log2CtbSizeY) / img->sps.SubWidthC;
+          int nSH = (1<<img->sps.Log2CtbSizeY) / img->sps.SubHeightC;
+
+          apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
                     inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
                     img->get_image_plane(1), img->get_image_stride(1));
 
-          apply_sao(img, xCtb,yCtb, shdr, 2, 1<<(img->sps.Log2CtbSizeY-1),
+          apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
                     inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
                     img->get_image_plane(2), img->get_image_stride(2));
         }
@@ -282,34 +319,43 @@ void apply_sample_adaptive_offset_sequential(de265_image* img)
     return;
   }
 
+  int lumaImageSize   = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0);
+  int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1);
 
-  uint8_t* inputCopy = new uint8_t[ img->get_image_stride(0) * img->get_height(0) ];
+  uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ];
   if (inputCopy == NULL) {
     img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
     return;
   }
 
 
-  for (int cIdx=0;cIdx<3;cIdx++) {
+  int nChannels = 3;
+  if (img->sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }
+
+  for (int cIdx=0;cIdx<nChannels;cIdx++) {
 
     int stride = img->get_image_stride(cIdx);
     int height = img->get_height(cIdx);
 
-    memcpy(inputCopy, img->get_image_plane(cIdx), stride * height);
+    memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx));
 
     for (int yCtb=0; yCtb<img->sps.PicHeightInCtbsY; yCtb++)
       for (int xCtb=0; xCtb<img->sps.PicWidthInCtbsY; xCtb++)
         {
           const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
+          if (shdr==NULL) { return; }
 
           if (cIdx==0 && shdr->slice_sao_luma_flag) {
-            apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY,
+            apply_sao(img, xCtb,yCtb, shdr, 0, 1<<img->sps.Log2CtbSizeY, 1<<img->sps.Log2CtbSizeY,
                       inputCopy, stride,
                       img->get_image_plane(0), img->get_image_stride(0));
           }
 
           if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
-            apply_sao(img, xCtb,yCtb, shdr, cIdx, 1<<(img->sps.Log2CtbSizeY-1),
+            int nSW = (1<<img->sps.Log2CtbSizeY) / img->sps.SubWidthC;
+            int nSH = (1<<img->sps.Log2CtbSizeY) / img->sps.SubHeightC;
+
+            apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
                       inputCopy, stride,
                       img->get_image_plane(cIdx), img->get_image_stride(cIdx));
           }
@@ -335,13 +381,18 @@ public:
   int inputProgress;
 
   virtual void work();
+  virtual std::string name() const { // human-readable task label: "sao-" followed by this task's CTB row
+    char buf[100];
+    snprintf(buf,sizeof(buf),"sao-%d",ctb_y); // bounded write, cannot run past buf
+    return buf;
+  }
 };
 
 
 void thread_task_sao::work()
 {
   state = Running;
-  img->thread_run();
+  img->thread_run(this);
 
   const int rightCtb = img->sps.PicWidthInCtbsY-1;
   const int ctbSize  = (1<<img->sps.Log2CtbSizeY);
@@ -354,7 +405,7 @@ void thread_task_sao::work()
   if (ctb_y>0) {
     img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
   }
-  
+
   if (ctb_y+1<img->sps.PicHeightInCtbsY) {
     img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
   }
@@ -370,19 +421,25 @@ void thread_task_sao::work()
   for (int xCtb=0; xCtb<img->sps.PicWidthInCtbsY; xCtb++)
     {
       const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
+      if (shdr==NULL) {
+        break;
+      }
 
       if (shdr->slice_sao_luma_flag) {
-        apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize,
+        apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
                   inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
                   outputImg->get_image_plane(0), outputImg->get_image_stride(0));
       }
 
       if (shdr->slice_sao_chroma_flag) {
-        apply_sao(img, xCtb,ctb_y, shdr, 1, ctbSize>>1,
+        int nSW = ctbSize / img->sps.SubWidthC;
+        int nSH = ctbSize / img->sps.SubHeightC;
+
+        apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
                   inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
                   outputImg->get_image_plane(1), outputImg->get_image_stride(1));
 
-        apply_sao(img, xCtb,ctb_y, shdr, 2, ctbSize>>1,
+        apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
                   inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
                   outputImg->get_image_plane(2), outputImg->get_image_stride(2));
       }
@@ -398,7 +455,7 @@ void thread_task_sao::work()
 
 
   state = Finished;
-  img->thread_finishes();
+  img->thread_finishes(this);
 }
 
 
@@ -415,7 +472,8 @@ bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
 
   de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
                                                     img->get_chroma_format(), &img->sps, false,
-                                                    img->decctx, img->pts, img->user_data, true);
+                                                    img->decctx, img->encctx,
+                                                    img->pts, img->user_data, true);
   if (err != DE265_OK) {
     img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
     return false;
@@ -426,7 +484,7 @@ bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
   int n=0;
   img->thread_start(nRows);
 
-  for (int y=0;y<img->sps.PicHeightInCtbsY;y++)
+  for (int y=0;y<nRows;y++)
     {
       thread_task_sao* task = new thread_task_sao;
 
@@ -437,7 +495,7 @@ bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
       task->inputProgress = saoInputProgress;
 
       imgunit->tasks.push_back(task);
-      add_task(&ctx->thread_pool, task);
+      add_task(&ctx->thread_pool_, task);
       n++;
     }
 
diff --git a/libde265/sei.cc b/libde265/sei.cc
index 08cd7d0..eb525dd 100644
--- a/libde265/sei.cc
+++ b/libde265/sei.cc
@@ -96,21 +96,86 @@ static void dump_sei_decoded_picture_hash(const sei_message* sei,
 }
 
 
-static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride)
+class raw_hash_data // Presents image rows as byte runs that can be fed to a hash function.
+{
+public:
+  raw_hash_data(int w, int stride); // w: samples per row; stride: samples between the starts of two rows
+  ~raw_hash_data();
+
+  struct data_chunk { // one row's worth of bytes; 'data' is borrowed, not owned by the chunk
+    const uint8_t* data;
+    int            len; // number of bytes at 'data'
+  };
+
+  data_chunk prepare_8bit(const uint8_t* data,int y);  // row y, referenced in place
+  data_chunk prepare_16bit(const uint8_t* data,int y); // row y, copied out low byte first
+
+private:
+  int mWidth, mStride;
+
+  uint8_t* mMem; // scratch row, allocated on the first prepare_16bit(); no copy control is declared, so do not copy
+};
+
+
+raw_hash_data::raw_hash_data(int w, int stride) // stores the row geometry; allocates nothing
+{
+  mWidth=w;
+  mStride=stride;
+  mMem = NULL; // scratch buffer is only created if prepare_16bit() is ever called
+}
+
+raw_hash_data::~raw_hash_data() // releases the scratch row, if one was allocated
+{
+  delete[] mMem; // deleting NULL is a no-op, so this is fine when no 16-bit row was prepared
+}
+
+raw_hash_data::data_chunk raw_hash_data::prepare_8bit(const uint8_t* data,int y)
+{ // Returns row y of an 8-bit plane without copying: the chunk points straight into 'data'.
+  data_chunk chunk;
+  chunk.data = data+y*mStride;
+  chunk.len  = mWidth; // one byte per sample
+  return chunk;
+}
+
+raw_hash_data::data_chunk raw_hash_data::prepare_16bit(const uint8_t* data,int y)
+{ // Returns row y of a 16-bit plane as 2*mWidth bytes, each sample split low byte first.
+  if (mMem == NULL) {
+    mMem = new uint8_t[2*mWidth]; // allocated once, reused for every following row
+  }
+
+  const uint16_t* data16 = (uint16_t*)data; // reinterpret the plane as 16-bit samples; mStride is applied in samples, not bytes
+
+  for (int x=0; x<mWidth; x++) {
+    mMem[2*x+0] = data16[y*mStride+x] & 0xFF; // low byte
+    mMem[2*x+1] = data16[y*mStride+x] >> 8;   // high byte
+  }
+
+  data_chunk chunk;
+  chunk.data = mMem; // points into the scratch row, which the next prepare_16bit() call overwrites
+  chunk.len  = 2*mWidth;
+  return chunk;
+}
+
+
+static uint32_t compute_checksum_8bit(uint8_t* data,int w,int h,int stride, int bit_depth) // sums (sample byte ^ position mask) over a w*h plane
 {
   uint32_t sum = 0;
-  for (int y=0; y<h; y++)
-    for(int x=0; x<w; x++) {
-      uint8_t xorMask = ( x & 0xFF ) ^ ( y & 0xFF ) ^ ( x  >>  8 ) ^ ( y  >>  8 );
-      sum += data[y*stride + x] ^ xorMask;
 
-      /*
-      if (compDepth[cIdx] > 8 )
-        sum = ( sum + ( ( component[cIdx][y * compWidth[cIdx] + x]  >>  8 ) ^ xorMask ) ) &
-          0xFFFFFFFF
-          }
-      */
-    }
+  if (bit_depth<=8) {
+    for (int y=0; y<h; y++)
+      for(int x=0; x<w; x++) {
+        uint8_t xorMask = ( x & 0xFF ) ^ ( y & 0xFF ) ^ ( x  >>  8 ) ^ ( y  >>  8 );
+        sum += data[y*stride + x] ^ xorMask;
+      }
+  }
+  else {
+    for (int y=0; y<h; y++) // high bit depth: read uint16_t samples, stride in samples (same addressing as raw_hash_data::prepare_16bit)
+      for(int x=0; x<w; x++) {
+        uint8_t xorMask = ( x & 0xFF ) ^ ( y & 0xFF ) ^ ( x  >>  8 ) ^ ( y  >>  8 );
+        sum += (((const uint16_t*)data)[y*stride + x] & 0xFF) ^ xorMask; // low byte of the sample
+        sum += (((const uint16_t*)data)[y*stride + x] >> 8)   ^ xorMask; // high byte (was always 0 when read through uint8_t*)
+      }
+  }
 
   return sum & 0xFFFFFFFF;
 }
@@ -157,31 +222,48 @@ static inline uint16_t crc_process_byte_parallel(uint16_t crc, uint8_t byte)
 	   (t << 12)) & 0xFFFF;
 }
 
-static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride)
+static uint32_t compute_CRC_8bit_fast(const uint8_t* data,int w,int h,int stride, int bit_depth) // CRC over all sample bytes of a w*h plane
 {
+  raw_hash_data raw_data(w,stride); // turns each image row into the byte sequence that gets hashed
+
   uint16_t crc = 0xFFFF;
 
   crc = crc_process_byte_parallel(crc, 0);
   crc = crc_process_byte_parallel(crc, 0);
 
   for (int y=0; y<h; y++) {
-    const uint8_t* d = &data[y*stride];
+    raw_hash_data::data_chunk chunk;
 
-    for(int x=0; x<w; x++) {
-      crc = crc_process_byte_parallel(crc, *d++);
+    if (bit_depth>8)
+      chunk = raw_data.prepare_16bit(data, y); // two bytes per sample
+    else
+      chunk = raw_data.prepare_8bit(data, y); // one byte per sample
+
+    for(int x=0; x<chunk.len; x++) { // chunk.len counts bytes, not samples
+      crc = crc_process_byte_parallel(crc, chunk.data[x]);
     }
   }
 
   return crc;
 }
 
-static void compute_MD5_8bit(uint8_t* data,int w,int h,int stride, uint8_t* result)
+
+static void compute_MD5(uint8_t* data,int w,int h,int stride, uint8_t* result, int bit_depth)
 {
   MD5_CTX md5;
   MD5_Init(&md5);
 
+  raw_hash_data raw_data(w,stride);
+
   for (int y=0; y<h; y++) {
-    MD5_Update(&md5, &data[y*stride], w);
+    raw_hash_data::data_chunk chunk;
+
+    if (bit_depth>8)
+      chunk = raw_data.prepare_16bit(data, y);
+    else
+      chunk = raw_data.prepare_8bit(data, y);
+
+    MD5_Update(&md5, (void*)chunk.data, chunk.len);
   }
 
   MD5_Final(result, &md5);
@@ -217,7 +299,7 @@ static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de26
     case sei_decoded_picture_hash_type_MD5:
       {
         uint8_t md5[16];
-        compute_MD5_8bit(data,w,h,stride,md5);
+        compute_MD5(data,w,h,stride,md5, img->get_bit_depth(i));
 
 /*
         fprintf(stderr,"computed MD5: ");
@@ -238,7 +320,7 @@ static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de26
 
     case sei_decoded_picture_hash_type_CRC:
       {
-        uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride);
+        uint16_t crc = compute_CRC_8bit_fast(data,w,h,stride, img->get_bit_depth(i));
 
         logtrace(LogSEI,"SEI decoded picture hash: %04x <-[%d]-> decoded picture: %04x\n",
                  seihash->crc[i], i, crc);
@@ -253,7 +335,7 @@ static de265_error process_sei_decoded_picture_hash(const sei_message* sei, de26
 
     case sei_decoded_picture_hash_type_checksum:
       {
-        uint32_t chksum = compute_checksum_8bit(data,w,h,stride);
+        uint32_t chksum = compute_checksum_8bit(data,w,h,stride, img->get_bit_depth(i));
 
         if (chksum != seihash->checksum[i]) {
           fprintf(stderr,"SEI decoded picture hash: %04x, decoded picture: %04x (POC=%d)\n",
@@ -282,6 +364,8 @@ de265_error read_sei(bitreader* reader, sei_message* sei, bool suffix, const seq
       if (byte != 0xFF) { break; }
     }
 
+  //printf("SEI payload: %d\n",payload_type);
+
   int payload_size = 0;
   for (;;)
     {
@@ -335,6 +419,9 @@ de265_error process_sei(const sei_message* sei, de265_image* img)
   case sei_payload_type_decoded_picture_hash:
     if (img->decctx->param_sei_check_hash) {
       err = process_sei_decoded_picture_hash(sei, img);
+      if (err==DE265_OK) {
+        //printf("SEI check ok\n");
+      }
     }
 
     break;
@@ -406,4 +493,3 @@ const char* sei_type_name(enum sei_payload_type type)
     return "unknown SEI message";
   }
 }
-
diff --git a/libde265/sei.h b/libde265/sei.h
index 6984e49..fd615d5 100644
--- a/libde265/sei.h
+++ b/libde265/sei.h
@@ -69,15 +69,14 @@ typedef struct {
 } sei_decoded_picture_hash;
 
 
-struct sei_message {
+typedef struct {
   enum sei_payload_type payload_type;
   int payload_size;
 
   union {
     sei_decoded_picture_hash decoded_picture_hash;
   } data;
-};
-
+} sei_message;
 
 class seq_parameter_set;
 
@@ -85,6 +84,6 @@ const char* sei_type_name(enum sei_payload_type type);
 
 de265_error read_sei(bitreader* reader, sei_message*, bool suffix, const seq_parameter_set* sps);
 void dump_sei(const sei_message*, const seq_parameter_set* sps);
-de265_error process_sei(const sei_message*, class de265_image* img);
+de265_error process_sei(const sei_message*, struct de265_image* img);
 
 #endif
diff --git a/libde265/slice.cc b/libde265/slice.cc
index 0c5a62e..f8dd3bd 100644
--- a/libde265/slice.cc
+++ b/libde265/slice.cc
@@ -38,7 +38,7 @@
 #define LOCK de265_mutex_lock(&ctx->thread_pool.mutex)
 #define UNLOCK de265_mutex_unlock(&ctx->thread_pool.mutex)
 
-extern bool read_short_term_ref_pic_set(decoder_context* ctx,
+extern bool read_short_term_ref_pic_set(error_queue* errqueue,
                                         const seq_parameter_set* sps,
                                         bitreader* br,
                                         ref_pic_set* out_set,
@@ -49,17 +49,101 @@ extern bool read_short_term_ref_pic_set(decoder_context* ctx,
 
 void read_coding_tree_unit(thread_context* tctx);
 void read_coding_quadtree(thread_context* tctx,
-                          int xCtb, int yCtb, 
+                          int xCtb, int yCtb,
                           int Log2CtbSizeY,
                           int ctDepth);
-int check_CTB_available(de265_image* img,
-                        slice_segment_header* shdr,
-                        int xC,int yC, int xN,int yN);
 /*
 void decode_inter_block(decoder_context* ctx,thread_context* tctx,
                         int xC, int yC, int log2CbSize);
 */
 
+void slice_segment_header::set_defaults() // Assigns default values to the header fields; members that appear only in comments below are left untouched.
+{
+  slice_index = 0;
+
+  first_slice_segment_in_pic_flag = 1;
+  no_output_of_prior_pics_flag = 0;
+  slice_pic_parameter_set_id = 0;
+  dependent_slice_segment_flag = 0;
+  slice_segment_address = 0;
+
+  slice_type = SLICE_TYPE_I;
+  pic_output_flag = 1;
+  colour_plane_id = 0;
+  slice_pic_order_cnt_lsb = 0;
+  short_term_ref_pic_set_sps_flag = 1;
+  // ref_pic_set slice_ref_pic_set;
+
+  short_term_ref_pic_set_idx = 0;
+  num_long_term_sps = 0;
+  num_long_term_pics = 0;
+
+  //uint8_t lt_idx_sps[MAX_NUM_REF_PICS];
+  //int     poc_lsb_lt[MAX_NUM_REF_PICS];
+  //char    used_by_curr_pic_lt_flag[MAX_NUM_REF_PICS];
+
+  //char delta_poc_msb_present_flag[MAX_NUM_REF_PICS];
+  //int delta_poc_msb_cycle_lt[MAX_NUM_REF_PICS];
+
+  slice_temporal_mvp_enabled_flag = 0;
+  slice_sao_luma_flag = 0;
+  slice_sao_chroma_flag = 0;
+
+  num_ref_idx_active_override_flag = 0;
+  num_ref_idx_l0_active=1; // [1;16]
+  num_ref_idx_l1_active=1; // [1;16]
+
+  ref_pic_list_modification_flag_l0 = 0;
+  ref_pic_list_modification_flag_l1 = 0;
+  //uint8_t list_entry_l0[16];
+  //uint8_t list_entry_l1[16];
+
+  mvd_l1_zero_flag = 0;
+  cabac_init_flag = 0;
+  collocated_from_l0_flag = 0;
+  collocated_ref_idx = 0;
+
+  // --- pred_weight_table ---
+
+  luma_log2_weight_denom=0; // [0;7]
+  ChromaLog2WeightDenom=0;  // [0;7]
+
+  // first index is L0/L1
+  /*
+  uint8_t luma_weight_flag[2][16];   // bool
+  uint8_t chroma_weight_flag[2][16]; // bool
+  int16_t LumaWeight[2][16];
+  int8_t  luma_offset[2][16];
+  int16_t ChromaWeight[2][16][2];
+  int8_t  ChromaOffset[2][16][2];
+  */
+
+
+  five_minus_max_num_merge_cand = 0;
+  slice_qp_delta = 0;
+
+  slice_cb_qp_offset = 0;
+  slice_cr_qp_offset = 0;
+
+  cu_chroma_qp_offset_enabled_flag = 0;
+
+  deblocking_filter_override_flag = 0;
+  slice_deblocking_filter_disabled_flag = 0;
+  slice_beta_offset=0; // = pps->beta_offset if undefined
+  slice_tc_offset=0;   // = pps->tc_offset if undefined
+
+  slice_loop_filter_across_slices_enabled_flag = 0;
+
+  num_entry_point_offsets = 0;
+  //int  offset_len;
+  //std::vector<int> entry_point_offset;
+
+  slice_segment_header_extension_length = 0;
+
+  SliceAddrRS = slice_segment_address; // evaluates to 0 here, since slice_segment_address was set to 0 above
+}
+
+
 bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_context* ctx)
 {
   int vlc;
@@ -111,7 +195,7 @@ bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_c
             // luma_offset
 
             vlc = get_svlc(br);
-            if (vlc < -128 || vlc > 127) return false;
+            if (vlc < -sps->WpOffsetHalfRangeY || vlc > sps->WpOffsetHalfRangeY-1) return false;
             shdr->luma_offset[l][i] = vlc;
           }
           else {
@@ -124,17 +208,22 @@ bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_c
               // delta_chroma_weight
 
               vlc = get_svlc(br);
-              if (vlc < -128 || vlc > 127) return false;
+              if (vlc < -128 || vlc >  127) return false;
 
               shdr->ChromaWeight[l][i][j] = (1<<shdr->ChromaLog2WeightDenom) + vlc;
 
               // delta_chroma_offset
 
               vlc = get_svlc(br);
-              if (vlc < -512 || vlc > 511) return false;
+              if (vlc < -4*sps->WpOffsetHalfRangeC ||
+                  vlc >  4*sps->WpOffsetHalfRangeC-1) return false;
 
-              vlc = Clip3(-128,127, (vlc-((128*shdr->ChromaWeight[l][i][j])
-                                          >> shdr->ChromaLog2WeightDenom) + 128));
+              vlc = Clip3(-sps->WpOffsetHalfRangeC,
+                          sps->WpOffsetHalfRangeC-1,
+                          (sps->WpOffsetHalfRangeC
+                           +vlc
+                           -((sps->WpOffsetHalfRangeC*shdr->ChromaWeight[l][i][j])
+                             >> shdr->ChromaLog2WeightDenom)));
 
               shdr->ChromaOffset[l][i][j] = vlc;
             }
@@ -153,10 +242,120 @@ bool read_pred_weight_table(bitreader* br, slice_segment_header* shdr, decoder_c
 }
 
 
+void slice_segment_header::reset() // Clears the header's members to zero/empty; called at the top of read() before parsing.
+{
+  slice_index = 0;
+
+  first_slice_segment_in_pic_flag = 0;
+  no_output_of_prior_pics_flag = 0;
+  slice_pic_parameter_set_id = 0;
+  dependent_slice_segment_flag = 0;
+  slice_segment_address = 0;
+
+  slice_type = 0;
+  pic_output_flag = 0;
+  colour_plane_id = 0;
+  slice_pic_order_cnt_lsb = 0;
+  short_term_ref_pic_set_sps_flag = 0;
+  slice_ref_pic_set.reset();
+
+  short_term_ref_pic_set_idx = 0;
+  num_long_term_sps = 0;
+  num_long_term_pics= 0;
+
+  for (int i=0;i<MAX_NUM_REF_PICS;i++) { // per-entry long-term reference picture fields
+    lt_idx_sps[i] = 0;
+    poc_lsb_lt[i] = 0;
+    used_by_curr_pic_lt_flag[i] = 0;
+    delta_poc_msb_present_flag[i] = 0;
+    delta_poc_msb_cycle_lt[i] = 0;
+  }
+
+  slice_temporal_mvp_enabled_flag = 0;
+  slice_sao_luma_flag = 0;
+  slice_sao_chroma_flag = 0;
+
+  num_ref_idx_active_override_flag = 0;
+  num_ref_idx_l0_active = 0;
+  num_ref_idx_l1_active = 0;
+
+  ref_pic_list_modification_flag_l0 = 0;
+  ref_pic_list_modification_flag_l1 = 0;
+  for (int i=0;i<16;i++) {
+    list_entry_l0[i] = 0;
+    list_entry_l1[i] = 0;
+  }
+
+  mvd_l1_zero_flag = 0;
+  cabac_init_flag  = 0;
+  collocated_from_l0_flag = 0;
+  collocated_ref_idx = 0;
+
+  luma_log2_weight_denom = 0;
+  ChromaLog2WeightDenom  = 0;
+
+  for (int i=0;i<2;i++) // weighted-prediction tables; i selects the list (L0/L1), j the entry
+    for (int j=0;j<16;j++) {
+      luma_weight_flag[i][j] = 0;
+      chroma_weight_flag[i][j] = 0;
+      LumaWeight[i][j] = 0;
+      luma_offset[i][j] = 0;
+      ChromaWeight[i][j][0] = ChromaWeight[i][j][1] = 0;
+      ChromaOffset[i][j][0] = ChromaOffset[i][j][1] = 0;
+    }
+
+  five_minus_max_num_merge_cand = 0;
+  slice_qp_delta = 0;
+
+  slice_cb_qp_offset = 0;
+  slice_cr_qp_offset = 0;
+
+  cu_chroma_qp_offset_enabled_flag = 0;
+
+  deblocking_filter_override_flag = 0;
+  slice_deblocking_filter_disabled_flag = 0;
+  slice_beta_offset = 0;
+  slice_tc_offset = 0;
+
+  slice_loop_filter_across_slices_enabled_flag = 0;
+
+  num_entry_point_offsets = 0;
+  offset_len = 0;
+  entry_point_offset.clear();
+
+  slice_segment_header_extension_length = 0;
+
+  SliceAddrRS = 0;
+  SliceQPY = 0;
+
+  initType = 0;
+
+  MaxNumMergeCand = 0;
+  CurrRpsIdx = 0;
+  CurrRps.reset();
+  NumPocTotalCurr = 0;
+
+  for (int i=0;i<2;i++) // reference picture lists; i selects the list, j the entry
+    for (int j=0;j<MAX_NUM_REF_PICS;j++) {
+      RefPicList[i][j] = 0;
+      RefPicList_POC[i][j] = 0;
+      RefPicList_PicState[i][j] = 0;
+      LongTermRefPic[i][j] = 0;
+    }
+
+  //context_model ctx_model_storage[CONTEXT_MODEL_TABLE_LENGTH];
+
+  RemoveReferencesList.clear();
+
+  ctx_model_storage_defined = false; // the storage itself is not cleared (see commented-out member above), only marked undefined
+}
+
+
 de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
                                        bool* continueDecoding)
 {
   *continueDecoding = false;
+  reset();
 
   // set defaults
 
@@ -207,6 +406,10 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
         return DE265_OK;
       }
 
+      if (ctx->previous_slice_header == NULL) {
+        return DE265_ERROR_NO_INITIAL_SLICE_HEADER;
+      }
+
       *this = *ctx->previous_slice_header;
 
       first_slice_segment_in_pic_flag = 0;
@@ -220,17 +423,18 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
   }
 
   if (slice_segment_address < 0 ||
-      slice_segment_address > sps->PicSizeInCtbsY) {
+      slice_segment_address >= sps->PicSizeInCtbsY) {
     ctx->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false);
     return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
   }
 
+  //printf("SLICE %d (%d)\n",slice_segment_address, sps->PicSizeInCtbsY);
 
 
   if (!dependent_slice_segment_flag) {
     for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
       //slice_reserved_undetermined_flag[i]
-      skip_bits(br,1); 
+      skip_bits(br,1);
     }
 
     slice_type = get_uvlc(br);
@@ -249,7 +453,7 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
     }
 
     if (sps->separate_colour_plane_flag == 1) {
-      colour_plane_id = get_bits(br,1);
+      colour_plane_id = get_bits(br,2);
     }
 
 
@@ -266,19 +470,19 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       if (!short_term_ref_pic_set_sps_flag) {
         read_short_term_ref_pic_set(ctx, sps,
                                     br, &slice_ref_pic_set,
-                                    sps->num_short_term_ref_pic_sets,
+                                    sps->num_short_term_ref_pic_sets(),
                                     sps->ref_pic_sets,
                                     true);
 
-        CurrRpsIdx = sps->num_short_term_ref_pic_sets;
+        CurrRpsIdx = sps->num_short_term_ref_pic_sets();
         CurrRps    = slice_ref_pic_set;
       }
       else {
-        int nBits = ceil_log2(sps->num_short_term_ref_pic_sets);
+        int nBits = ceil_log2(sps->num_short_term_ref_pic_sets());
         if (nBits>0) short_term_ref_pic_set_idx = get_bits(br,nBits);
         else         short_term_ref_pic_set_idx = 0;
 
-        if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets) {
+        if (short_term_ref_pic_set_idx >= sps->num_short_term_ref_pic_sets()) {
           ctx->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false);
           return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
         }
@@ -293,13 +497,18 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       if (sps->long_term_ref_pics_present_flag) {
         if (sps->num_long_term_ref_pics_sps > 0) {
           num_long_term_sps = get_uvlc(br);
+          if (num_long_term_sps == UVLC_ERROR) {
+            return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+          }
         }
         else {
           num_long_term_sps = 0;
         }
 
         num_long_term_pics= get_uvlc(br);
-
+        if (num_long_term_pics == UVLC_ERROR) {
+          return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+        }
 
         // check maximum number of reference frames
 
@@ -348,6 +557,9 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
           delta_poc_msb_present_flag[i] = get_bits(br,1);
           if (delta_poc_msb_present_flag[i]) {
             delta_poc_msb_cycle_lt[i] = get_uvlc(br);
+            if (delta_poc_msb_cycle_lt[i]==UVLC_ERROR) {
+              return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+            }
           }
           else {
             delta_poc_msb_cycle_lt[i] = 0;
@@ -382,16 +594,25 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
 
 
     // --- SAO ---
-      
+
     if (sps->sample_adaptive_offset_enabled_flag) {
       slice_sao_luma_flag   = get_bits(br,1);
-      slice_sao_chroma_flag = get_bits(br,1);
+
+      if (sps->ChromaArrayType != CHROMA_MONO) {
+        slice_sao_chroma_flag = get_bits(br,1);
+      }
+      else {
+        slice_sao_chroma_flag = 0;
+      }
     }
     else {
       slice_sao_luma_flag   = 0;
       slice_sao_chroma_flag = 0;
     }
 
+    num_ref_idx_l0_active = 0;
+    num_ref_idx_l1_active = 0;
+
     if (slice_type == SLICE_TYPE_P  ||
         slice_type == SLICE_TYPE_B) {
       num_ref_idx_active_override_flag = get_bits(br,1);
@@ -417,6 +638,9 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
         num_ref_idx_l1_active = pps->num_ref_idx_l1_default_active;
       }
 
+      if (num_ref_idx_l0_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+      if (num_ref_idx_l1_active > 16) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+
       NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics;
 
       if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) {
@@ -475,8 +699,17 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
         else {
           collocated_ref_idx = 0;
         }
+
+        // check whether collocated_ref_idx points to a valid index
+
+        if (( collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l0_active) ||
+            (!collocated_from_l0_flag && collocated_ref_idx >= num_ref_idx_l1_active)) {
+          ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+          return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+        }
       }
 
+
       if ((pps->weighted_pred_flag   && slice_type == SLICE_TYPE_P) ||
           (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) {
 
@@ -494,7 +727,7 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       }
       MaxNumMergeCand = 5-five_minus_max_num_merge_cand;
     }
-    
+
     slice_qp_delta = get_svlc(br);
     if (slice_qp_delta == UVLC_ERROR) {
       ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
@@ -520,6 +753,10 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       slice_cr_qp_offset = 0;
     }
 
+    if (pps->range_extension.chroma_qp_offset_list_enabled_flag) {
+      cu_chroma_qp_offset_enabled_flag = get_bits(br,1);
+    }
+
     if (pps->deblocking_filter_override_enabled_flag) {
       deblocking_filter_override_flag = get_bits(br,1);
     }
@@ -570,6 +807,24 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
     }
 
+    if (pps->entropy_coding_sync_enabled_flag) {
+      // check num_entry_points for valid range
+
+      int firstCTBRow = slice_segment_address / sps->PicWidthInCtbsY;
+      int lastCTBRow  = firstCTBRow + num_entry_point_offsets;
+      if (lastCTBRow >= sps->PicHeightInCtbsY) {
+        ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
+        return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+      }
+    }
+
+    if (pps->tiles_enabled_flag) {
+      if (num_entry_point_offsets > pps->num_tile_columns * pps->num_tile_rows) {
+        ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
+        return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+      }
+    }
+
     entry_point_offset.resize( num_entry_point_offsets );
 
     if (num_entry_point_offsets > 0) {
@@ -580,6 +835,10 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       }
       offset_len++;
 
+      if (offset_len > 32) {
+	return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+      }
+
       for (int i=0; i<num_entry_point_offsets; i++) {
         {
           entry_point_offset[i] = get_bits(br,offset_len)+1;
@@ -602,7 +861,7 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
       ctx->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
       return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
     }
-    
+
     for (int i=0; i<slice_segment_header_extension_length; i++) {
       //slice_segment_header_extension_data_byte[i]
       get_bits(br,8);
@@ -610,6 +869,380 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
   }
 
 
+  compute_derived_values(pps);
+
+  *continueDecoding = true;
+  return DE265_OK;
+}
+
+
+de265_error slice_segment_header::write(error_queue* errqueue, CABAC_encoder& out,
+                                        const seq_parameter_set* sps,
+                                        const pic_parameter_set* pps,
+                                        uint8_t nal_unit_type)
+{
+  out.write_bit(first_slice_segment_in_pic_flag);
+
+  if (isRapPic(nal_unit_type)) { // TODO: is this still correct ? Should we drop RapPicFlag ?
+    out.write_bit(no_output_of_prior_pics_flag);
+  }
+
+  if (slice_pic_parameter_set_id > DE265_MAX_PPS_SETS) {
+    errqueue->add_warning(DE265_WARNING_NONEXISTING_PPS_REFERENCED, false);
+    return DE265_OK;
+  }
+  out.write_uvlc(slice_pic_parameter_set_id);
+
+  if (!first_slice_segment_in_pic_flag) {
+    if (pps->dependent_slice_segments_enabled_flag) {
+      out.write_bit(dependent_slice_segment_flag);
+    }
+
+    out.write_bits(slice_segment_address, ceil_log2(sps->PicSizeInCtbsY));
+
+    if (dependent_slice_segment_flag) {
+      if (slice_segment_address == 0) {
+        errqueue->add_warning(DE265_WARNING_DEPENDENT_SLICE_WITH_ADDRESS_ZERO, false);
+        return DE265_OK;
+      }
+    }
+  }
+
+  if (slice_segment_address < 0 ||
+      slice_segment_address > sps->PicSizeInCtbsY) {
+    errqueue->add_warning(DE265_WARNING_SLICE_SEGMENT_ADDRESS_INVALID, false);
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+
+
+  if (!dependent_slice_segment_flag) {
+    for (int i=0; i<pps->num_extra_slice_header_bits; i++) {
+      //slice_reserved_undetermined_flag[i]
+      out.skip_bits(1);
+    }
+
+    if (slice_type > 2) {
+      errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
+      return DE265_OK;
+    }
+    out.write_uvlc(slice_type);
+
+    if (pps->output_flag_present_flag) {
+      out.write_bit(pic_output_flag);
+    }
+
+    if (sps->separate_colour_plane_flag == 1) {
+      out.write_bits(colour_plane_id,2);
+    }
+
+
+    int NumLtPics = 0;
+
+    if (nal_unit_type != NAL_UNIT_IDR_W_RADL &&
+        nal_unit_type != NAL_UNIT_IDR_N_LP) {
+      out.write_bits(slice_pic_order_cnt_lsb, sps->log2_max_pic_order_cnt_lsb);
+      out.write_bit(short_term_ref_pic_set_sps_flag);
+
+      if (!short_term_ref_pic_set_sps_flag) {
+        /* TODO
+        read_short_term_ref_pic_set(ctx, sps,
+                                    br, &slice_ref_pic_set,
+                                    sps->num_short_term_ref_pic_sets,
+                                    sps->ref_pic_sets,
+                                    true);
+        */
+        //CurrRpsIdx = sps->num_short_term_ref_pic_sets;
+        //CurrRps    = slice_ref_pic_set;
+      }
+      else {
+        int nBits = ceil_log2(sps->num_short_term_ref_pic_sets());
+        if (nBits>0) out.write_bits(short_term_ref_pic_set_idx,nBits);
+        else         { assert(short_term_ref_pic_set_idx==0); }
+
+        if (short_term_ref_pic_set_idx > sps->num_short_term_ref_pic_sets()) {
+          errqueue->add_warning(DE265_WARNING_SHORT_TERM_REF_PIC_SET_OUT_OF_RANGE, false);
+          return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+        }
+
+        //CurrRpsIdx = short_term_ref_pic_set_idx;
+        //CurrRps    = sps->ref_pic_sets[CurrRpsIdx];
+      }
+
+
+      // --- long-term MC ---
+
+      if (sps->long_term_ref_pics_present_flag) {
+        if (sps->num_long_term_ref_pics_sps > 0) {
+          out.write_uvlc(num_long_term_sps);
+        }
+        else {
+          assert(num_long_term_sps == 0);
+        }
+
+        out.write_uvlc(num_long_term_pics);
+
+
+        // check maximum number of reference frames
+
+        if (num_long_term_sps +
+            num_long_term_pics +
+            CurrRps.NumNegativePics +
+            CurrRps.NumPositivePics
+            > sps->sps_max_dec_pic_buffering[sps->sps_max_sub_layers-1])
+          {
+            errqueue->add_warning(DE265_WARNING_MAX_NUM_REF_PICS_EXCEEDED, false);
+            return DE265_OK;
+          }
+
+        for (int i=0; i<num_long_term_sps + num_long_term_pics; i++) {
+          if (i < num_long_term_sps) {
+            int nBits = ceil_log2(sps->num_long_term_ref_pics_sps);
+            out.write_bits(lt_idx_sps[i], nBits);
+
+            // check that the referenced lt-reference really exists
+
+            if (lt_idx_sps[i] >= sps->num_long_term_ref_pics_sps) {
+              errqueue->add_warning(DE265_NON_EXISTING_LT_REFERENCE_CANDIDATE_IN_SLICE_HEADER, false);
+              return DE265_OK;
+            }
+
+            //ctx->PocLsbLt[i] = sps->lt_ref_pic_poc_lsb_sps[ lt_idx_sps[i] ];
+            //ctx->UsedByCurrPicLt[i] = sps->used_by_curr_pic_lt_sps_flag[ lt_idx_sps[i] ];
+          }
+          else {
+            int nBits = sps->log2_max_pic_order_cnt_lsb;
+            out.write_bits(poc_lsb_lt[i], nBits);
+            out.write_bit(used_by_curr_pic_lt_flag[i]);
+
+            //ctx->PocLsbLt[i] = poc_lsb_lt[i];
+            //ctx->UsedByCurrPicLt[i] = used_by_curr_pic_lt_flag[i];
+          }
+
+          //if (ctx->UsedByCurrPicLt[i]) {
+          //NumLtPics++;
+          //}
+
+          out.write_bit(delta_poc_msb_present_flag[i]);
+          if (delta_poc_msb_present_flag[i]) {
+            out.write_uvlc(delta_poc_msb_cycle_lt[i]);
+          }
+          else {
+            assert(delta_poc_msb_cycle_lt[i] == 0);
+          }
+
+          /*
+          if (i==0 || i==num_long_term_sps) {
+            ctx->DeltaPocMsbCycleLt[i] = delta_poc_msb_cycle_lt[i];
+          }
+          else {
+            ctx->DeltaPocMsbCycleLt[i] = (delta_poc_msb_cycle_lt[i] +
+                                          ctx->DeltaPocMsbCycleLt[i-1]);
+          }
+          */
+        }
+      }
+      else {
+        assert(num_long_term_sps == 0);
+        assert(num_long_term_pics== 0);
+      }
+
+      if (sps->sps_temporal_mvp_enabled_flag) {
+        out.write_bit(slice_temporal_mvp_enabled_flag);
+      }
+      else {
+        assert(slice_temporal_mvp_enabled_flag == 0);
+      }
+    }
+    else {
+      assert(slice_pic_order_cnt_lsb == 0);
+      assert(num_long_term_sps == 0);
+      assert(num_long_term_pics== 0);
+    }
+
+
+    // --- SAO ---
+
+    if (sps->sample_adaptive_offset_enabled_flag) {
+      out.write_bit(slice_sao_luma_flag);
+      out.write_bit(slice_sao_chroma_flag);
+    }
+    else {
+      assert(slice_sao_luma_flag  == 0);
+      assert(slice_sao_chroma_flag== 0);
+    }
+
+    if (slice_type == SLICE_TYPE_P  ||
+        slice_type == SLICE_TYPE_B) {
+      out.write_bit(num_ref_idx_active_override_flag);
+
+      if (num_ref_idx_active_override_flag) {
+        out.write_uvlc(num_ref_idx_l0_active);
+        num_ref_idx_l0_active++;;
+
+        if (slice_type == SLICE_TYPE_B) {
+          out.write_uvlc(num_ref_idx_l1_active);
+          num_ref_idx_l1_active++;
+        }
+      }
+      else {
+        assert(num_ref_idx_l0_active == pps->num_ref_idx_l0_default_active);
+        assert(num_ref_idx_l1_active == pps->num_ref_idx_l1_default_active);
+      }
+
+      NumPocTotalCurr = CurrRps.NumPocTotalCurr_shortterm_only + NumLtPics;
+
+      if (pps->lists_modification_present_flag && NumPocTotalCurr > 1) {
+
+        int nBits = ceil_log2(NumPocTotalCurr);
+
+        out.write_bit(ref_pic_list_modification_flag_l0);
+        if (ref_pic_list_modification_flag_l0) {
+          for (int i=0;i<num_ref_idx_l0_active;i++) {
+            out.write_bits(list_entry_l0[i], nBits);
+          }
+        }
+
+        if (slice_type == SLICE_TYPE_B) {
+          out.write_bit(ref_pic_list_modification_flag_l1);
+          if (ref_pic_list_modification_flag_l1) {
+            for (int i=0;i<num_ref_idx_l1_active;i++) {
+              out.write_bits(list_entry_l1[i], nBits);
+            }
+          }
+        }
+        else {
+          assert(ref_pic_list_modification_flag_l1 == 0);
+        }
+      }
+      else {
+        assert(ref_pic_list_modification_flag_l0 == 0);
+        assert(ref_pic_list_modification_flag_l1 == 0);
+      }
+
+      if (slice_type == SLICE_TYPE_B) {
+        out.write_bit(mvd_l1_zero_flag);
+      }
+
+      if (pps->cabac_init_present_flag) {
+        out.write_bit(cabac_init_flag);
+      }
+      else {
+        assert(cabac_init_flag == 0);
+      }
+
+      if (slice_temporal_mvp_enabled_flag) {
+        if (slice_type == SLICE_TYPE_B)
+          out.write_bit(collocated_from_l0_flag);
+        else
+          { assert(collocated_from_l0_flag == 1); }
+
+        if (( collocated_from_l0_flag && num_ref_idx_l0_active > 1) ||
+            (!collocated_from_l0_flag && num_ref_idx_l1_active > 1)) {
+          out.write_uvlc(collocated_ref_idx);
+        }
+        else {
+          assert(collocated_ref_idx == 0);
+        }
+      }
+
+      if ((pps->weighted_pred_flag   && slice_type == SLICE_TYPE_P) ||
+          (pps->weighted_bipred_flag && slice_type == SLICE_TYPE_B)) {
+
+        assert(0);
+        /* TODO
+        if (!read_pred_weight_table(br,this,ctx))
+          {
+	    ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+	    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+          }
+        */
+      }
+
+      out.write_uvlc(five_minus_max_num_merge_cand);
+      //MaxNumMergeCand = 5-five_minus_max_num_merge_cand;
+    }
+
+    out.write_svlc(slice_qp_delta);
+
+    if (pps->pps_slice_chroma_qp_offsets_present_flag) {
+      out.write_svlc(slice_cb_qp_offset);
+      out.write_svlc(slice_cr_qp_offset);
+    }
+    else {
+      assert(slice_cb_qp_offset == 0);
+      assert(slice_cr_qp_offset == 0);
+    }
+
+    if (pps->deblocking_filter_override_enabled_flag) {
+      out.write_bit(deblocking_filter_override_flag);
+    }
+    else {
+      assert(deblocking_filter_override_flag == 0);
+    }
+
+    //slice_beta_offset = pps->beta_offset;
+    //slice_tc_offset   = pps->tc_offset;
+
+    if (deblocking_filter_override_flag) {
+      out.write_bit(slice_deblocking_filter_disabled_flag);
+      if (!slice_deblocking_filter_disabled_flag) {
+        out.write_svlc(slice_beta_offset/2);
+        out.write_svlc(slice_tc_offset  /2);
+      }
+    }
+    else {
+      assert(slice_deblocking_filter_disabled_flag == pps->pic_disable_deblocking_filter_flag);
+    }
+
+    if (pps->pps_loop_filter_across_slices_enabled_flag  &&
+        (slice_sao_luma_flag || slice_sao_chroma_flag ||
+         !slice_deblocking_filter_disabled_flag )) {
+      out.write_bit(slice_loop_filter_across_slices_enabled_flag);
+    }
+    else {
+      assert(slice_loop_filter_across_slices_enabled_flag ==
+             pps->pps_loop_filter_across_slices_enabled_flag);
+    }
+  }
+
+  if (pps->tiles_enabled_flag || pps->entropy_coding_sync_enabled_flag ) {
+    out.write_uvlc(num_entry_point_offsets);
+
+    if (num_entry_point_offsets > 0) {
+      out.write_uvlc(offset_len-1);
+
+      for (int i=0; i<num_entry_point_offsets; i++) {
+        {
+          int prev=0;
+          if (i>0) prev = entry_point_offset[i-1];
+          out.write_bits(entry_point_offset[i]-prev-1, offset_len);
+        }
+      }
+    }
+  }
+  else {
+    assert(num_entry_point_offsets == 0);
+  }
+
+  if (pps->slice_segment_header_extension_present_flag) {
+    out.write_uvlc(slice_segment_header_extension_length);
+    if (slice_segment_header_extension_length > 1000) {  // TODO: safety check against too large values
+      errqueue->add_warning(DE265_WARNING_SLICEHEADER_INVALID, false);
+      return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+    }
+
+    for (int i=0; i<slice_segment_header_extension_length; i++) {
+      //slice_segment_header_extension_data_byte[i]
+      out.skip_bits(8);
+    }
+  }
+
+  return DE265_OK;
+}
+
+void slice_segment_header::compute_derived_values(const pic_parameter_set* pps)
+{
   // --- init variables ---
 
   SliceQPY = pps->pic_init_qp + slice_qp_delta;
@@ -621,12 +1254,10 @@ de265_error slice_segment_header::read(bitreader* br, decoder_context* ctx,
     case SLICE_TYPE_B: initType = 2 - cabac_init_flag; break;
     }
 
-  *continueDecoding = true;
-  return DE265_OK;
+  MaxNumMergeCand = 5-five_minus_max_num_merge_cand;
 }
 
 
-
 //-----------------------------------------------------------------------
 
 
@@ -651,18 +1282,18 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
 
 
   LOG0("----------------- SLICE -----------------\n");
-  LOG1("first_slice_segment_in_pic_flag        : %d\n", first_slice_segment_in_pic_flag);
+  LOG1("first_slice_segment_in_pic_flag      : %d\n", first_slice_segment_in_pic_flag);
   if (ctx->get_nal_unit_type() >= NAL_UNIT_BLA_W_LP &&
       ctx->get_nal_unit_type() <= NAL_UNIT_RESERVED_IRAP_VCL23) {
-    LOG1("no_output_of_prior_pics_flag           : %d\n", no_output_of_prior_pics_flag);
+    LOG1("no_output_of_prior_pics_flag         : %d\n", no_output_of_prior_pics_flag);
   }
 
-  LOG1("slice_pic_parameter_set_id             : %d\n", slice_pic_parameter_set_id);
+  LOG1("slice_pic_parameter_set_id           : %d\n", slice_pic_parameter_set_id);
 
   if (!first_slice_segment_in_pic_flag) {
-    if (pps->dependent_slice_segments_enabled_flag) {
+    //if (pps->dependent_slice_segments_enabled_flag) {
       LOG1("dependent_slice_segment_flag         : %d\n", dependent_slice_segment_flag);
-    }
+      //}
     LOG1("slice_segment_address                : %d\n", slice_segment_address);
   }
 
@@ -690,10 +1321,10 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
       LOG1("short_term_ref_pic_set_sps_flag      : %d\n", short_term_ref_pic_set_sps_flag);
 
       if (!short_term_ref_pic_set_sps_flag) {
-        LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets);
+        LOG1("ref_pic_set[ %2d ]: ",sps->num_short_term_ref_pic_sets());
         dump_compact_short_term_ref_pic_set(&slice_ref_pic_set, 16, fh);
       }
-      else if (sps->num_short_term_ref_pic_sets > 1) {
+      else if (sps->num_short_term_ref_pic_sets() > 1) {
         LOG1("short_term_ref_pic_set_idx           : %d\n", short_term_ref_pic_set_idx);
         dump_compact_short_term_ref_pic_set(&sps->ref_pic_sets[short_term_ref_pic_set_idx], 16, fh);
       }
@@ -705,7 +1336,7 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
 
         LOG1("num_long_term_pics                       : %d\n", num_long_term_pics);
 
-#if 0          
+#if 0
         for (int i=0; i<num_long_term_sps + num_long_term_pics; i++) {
           LOG2("PocLsbLt[%d]            : %d\n", i, ctx->PocLsbLt[i]);
           LOG2("UsedByCurrPicLt[%d]     : %d\n", i, ctx->UsedByCurrPicLt[i]);
@@ -718,7 +1349,7 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
         LOG1("slice_temporal_mvp_enabled_flag : %d\n", slice_temporal_mvp_enabled_flag);
       }
     }
-      
+
 
     if (sps->sample_adaptive_offset_enabled_flag) {
       LOG1("slice_sao_luma_flag             : %d\n", slice_sao_luma_flag);
@@ -757,7 +1388,7 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
       if (slice_type == SLICE_TYPE_B) {
         LOG1("mvd_l1_zero_flag               : %d\n", mvd_l1_zero_flag);
       }
-      
+
       LOG1("cabac_init_flag                : %d\n", cabac_init_flag);
 
       if (slice_temporal_mvp_enabled_flag) {
@@ -853,7 +1484,7 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
   /*
     if( slice_segment_header_extension_present_flag ) {
     slice_segment_header_extension_length
-    for( i = 0; i < slice_segment_header_extension_length; i++) 
+    for( i = 0; i < slice_segment_header_extension_length; i++)
     slice_segment_header_extension_data_byte[i]
     }
     byte_alignment()
@@ -870,104 +1501,20 @@ void slice_segment_header::dump_slice_segment_header(const decoder_context* ctx,
 
 
 
-
-
-static void set_initValue(slice_segment_header* shdr,
-                          context_model* model, int initValue)
+void initialize_CABAC_models(thread_context* tctx)
 {
-  int slopeIdx = initValue >> 4;
-  int intersecIdx = initValue & 0xF;
-  int m = slopeIdx*5 - 45;
-  int n = (intersecIdx<<3) - 16;
-  int preCtxState = Clip3(1,126, ((m*Clip3(0,51, shdr->SliceQPY))>>4)+n);
-  
-  logtrace(LogSlice,"QP=%d slopeIdx=%d intersecIdx=%d m=%d n=%d\n",shdr->SliceQPY,slopeIdx,intersecIdx,m,n);
-  
-  model->MPSbit=(preCtxState<=63) ? 0 : 1;
-  model->state = model->MPSbit ? (preCtxState-64) : (63-preCtxState);
+  const int QPY = tctx->shdr->SliceQPY;
+  const int initType = tctx->shdr->initType;
+  assert(initType >= 0 && initType <= 2);
 
-  // model state will always be between [0;62]
+  tctx->ctx_model.init(initType, QPY);
 
-  assert(model->state <= 62);
+  for (int i=0;i<4;i++) {
+    tctx->StatCoeff[i] = 0;
+  }
 }
 
 
-static const int initValue_split_cu_flag[3][3] = {
-  { 139,141,157 },
-  { 107,139,126 },
-  { 107,139,126 },
-};
-static const int initValue_cu_skip_flag[2][3] = {
-  { 197,185,201 },
-  { 197,185,201 },
-};
-static const int initValue_part_mode[9] = { 184,154,139, 154,154,154, 139,154,154 };
-static const int initValue_prev_intra_luma_pred_flag[3] = { 184,154,183 };
-static const int initValue_intra_chroma_pred_mode[3] = { 63,152,152 };
-static const int initValue_cbf_luma[4] = { 111,141,153,111 };
-static const int initValue_cbf_chroma[12] = { 94,138,182,154,149,107,167,154,149,92,167,154 };
-static const int initValue_split_transform_flag[9] = { 153,138,138, 124,138,94, 224,167,122 }; // FIX712
-static const int initValue_last_significant_coefficient_prefix[54] = {
-    110,110,124,125,140,153,125,127,140,109,111,143,127,111, 79,108,123, 63,
-    125,110, 94,110, 95, 79,125,111,110, 78,110,111,111, 95, 94,108,123,108,
-    125,110,124,110, 95, 94,125,111,111, 79,125,126,111,111, 79,108,123, 93
-  };
-static const int initValue_coded_sub_block_flag[12] = { 91,171,134,141,121,140,61,154,121,140,61,154 };
-static const int initValue_significant_coeff_flag[3][42] = {
-    {
-      111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,
-      125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,
-      136,  152,  136,  153,  136,  139,  111,  136,  139,  111
-    },
-    {
-      155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,
-      183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,
-      121,  107,  121,  167,  151,  183,  140,  151,  183,  140,
-    },
-    {
-      170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,
-      183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,
-      121,  122,  121,  167,  151,  183,  140,  151,  183,  140
-    },
-  };
-static const int initValue_coeff_abs_level_greater1_flag[72] = {
-    140, 92,137,138,140,152,138,139,153, 74,149, 92,139,107,122,152,
-    140,179,166,182,140,227,122,197,154,196,196,167,154,152,167,182,
-    182,134,149,136,153,121,136,137,169,194,166,167,154,167,137,182,
-    154,196,167,167,154,152,167,182,182,134,149,136,153,121,136,122,
-    169,208,166,167,154,152,167,182
-  };
-static const int initValue_coeff_abs_level_greater2_flag[18] = {
-    138,153,136,167,152,152,107,167, 91,122,107,167,
-    107,167, 91,107,107,167
-  };
-static const int initValue_sao_merge_leftUp_flag[3] = { 153,153,153 };
-static const int initValue_sao_type_idx_lumaChroma_flag[3] = { 200,185,160 };
-static const int initValue_cu_qp_delta_abs[2] = { 154,154 };
-static const int initValue_transform_skip_flag[2] = { 139,139 };
-static const int initValue_merge_flag[2] = { 110,154 };
-static const int initValue_merge_idx[2] = { 122,137 };
-static const int initValue_pred_mode_flag[2] = { 149,134 };
-static const int initValue_abs_mvd_greater01_flag[4] = { 140,198,169,198 };
-static const int initValue_mvp_lx_flag[1] = { 168 };
-static const int initValue_rqt_root_cbf[1] = { 79 };
-static const int initValue_ref_idx_lX[2] = { 153,153 };
-static const int initValue_inter_pred_idc[5] = { 95,79,63,31,31 };
-static const int initValue_cu_transquant_bypass_flag[3] = { 154,154,154 };
-
-
-static void init_context(thread_context* tctx,
-                         enum context_model_indices idx,
-                         const int* initValues, int len)
-{
-  for (int i=0;i<len;i++)
-    {
-      set_initValue(tctx->shdr,
-                    &tctx->ctx_model[idx+i],
-                    initValues[i]);
-    }
-}
-
 
 static int decode_transform_skip_flag(thread_context* tctx, int cIdx)
 {
@@ -977,6 +1524,9 @@ static int decode_transform_skip_flag(thread_context* tctx, int cIdx)
 
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_TRANSFORM_SKIP_FLAG+context]);
+
+  logtrace(LogSymbols,"$1 transform_skip_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -986,6 +1536,9 @@ static int decode_sao_merge_flag(thread_context* tctx)
   logtrace(LogSlice,"# sao_merge_left/up_flag\n");
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_SAO_MERGE_FLAG]);
+
+  logtrace(LogSymbols,"$1 sao_merge_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -999,26 +1552,29 @@ static int decode_sao_type_idx(thread_context* tctx)
                               &tctx->ctx_model[CONTEXT_MODEL_SAO_TYPE_IDX]);
 
   if (bit0==0) {
+    logtrace(LogSymbols,"$1 sao_type_idx=%d\n",0);
     return 0;
   }
   else {
     int bit1 = decode_CABAC_bypass(&tctx->cabac_decoder);
     if (bit1==0) {
+      logtrace(LogSymbols,"$1 sao_type_idx=%d\n",1);
       return 1;
     }
     else {
+      logtrace(LogSymbols,"$1 sao_type_idx=%d\n",2);
       return 2;
     }
   }
 }
 
 
-static int decode_sao_offset_abs(thread_context* tctx)
+static int decode_sao_offset_abs(thread_context* tctx, int bitDepth)
 {
   logtrace(LogSlice,"# sao_offset_abs\n");
-  int bitDepth = 8;
   int cMax = (1<<(libde265_min(bitDepth,10)-5))-1;
   int value = decode_CABAC_TU_bypass(&tctx->cabac_decoder, cMax);
+  logtrace(LogSymbols,"$1 sao_offset_abs=%d\n",value);
   return value;
 }
 
@@ -1027,6 +1583,7 @@ static int decode_sao_class(thread_context* tctx)
 {
   logtrace(LogSlice,"# sao_class\n");
   int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 2);
+  logtrace(LogSymbols,"$1 sao_class=%d\n",value);
   return value;
 }
 
@@ -1035,6 +1592,7 @@ static int decode_sao_offset_sign(thread_context* tctx)
 {
   logtrace(LogSlice,"# sao_offset_sign\n");
   int value = decode_CABAC_bypass(&tctx->cabac_decoder);
+  logtrace(LogSymbols,"$1 sao_offset_sign=%d\n",value);
   return value;
 }
 
@@ -1043,6 +1601,7 @@ static int decode_sao_band_position(thread_context* tctx)
 {
   logtrace(LogSlice,"# sao_band_position\n");
   int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder,5);
+  logtrace(LogSymbols,"$1 sao_band_position=%d\n",value);
   return value;
 }
 
@@ -1052,6 +1611,7 @@ static int decode_transquant_bypass_flag(thread_context* tctx)
   logtrace(LogSlice,"# cu_transquant_bypass_enable_flag\n");
   int value = decode_CABAC_bit(&tctx->cabac_decoder,
                                &tctx->ctx_model[CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG]);
+  logtrace(LogSymbols,"$1 transquant_bypass_flag=%d\n",value);
   return value;
 }
 
@@ -1062,16 +1622,10 @@ static int decode_transquant_bypass_flag(thread_context* tctx)
 static int decode_split_cu_flag(thread_context* tctx,
 				int x0, int y0, int ctDepth)
 {
-  //decoder_context* ctx = tctx->decctx;
-
-  if (x0==64 && y0==448) {
-    //raise(SIGINT);
-  }
-
   // check if neighbors are available
 
-  int availableL = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0-1,y0);
-  int availableA = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0,y0-1);
+  int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0);
+  int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1);
 
   int condL = 0;
   int condA = 0;
@@ -1090,6 +1644,8 @@ static int decode_split_cu_flag(thread_context* tctx,
 
   logtrace(LogSlice,"> split_cu_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
 
+  logtrace(LogSymbols,"$1 split_cu_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -1101,8 +1657,8 @@ static int decode_cu_skip_flag(thread_context* tctx,
 
   // check if neighbors are available
 
-  int availableL = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0-1,y0);
-  int availableA = check_CTB_available(tctx->img,tctx->shdr, x0,y0, x0,y0-1);
+  int availableL = check_CTB_available(tctx->img, x0,y0, x0-1,y0);
+  int availableA = check_CTB_available(tctx->img, x0,y0, x0,y0-1);
 
   int condL = 0;
   int condA = 0;
@@ -1121,6 +1677,8 @@ static int decode_cu_skip_flag(thread_context* tctx,
 
   logtrace(LogSlice,"> cu_skip_flag R=%x, ctx=%d, bit=%d\n", tctx->cabac_decoder.range,context,bit);
 
+  logtrace(LogSymbols,"$1 cu_skip_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -1137,42 +1695,63 @@ static enum PartMode decode_part_mode(thread_context* tctx,
 
     logtrace(LogSlice,"> %s\n",bit ? "2Nx2N" : "NxN");
 
+    logtrace(LogSymbols,"$1 part_mode=%d\n",bit ? PART_2Nx2N : PART_NxN);
+
     return bit ? PART_2Nx2N : PART_NxN;
   }
   else {
     int bit0 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+0]);
-    if (bit0) { return PART_2Nx2N; }
+    if (bit0) { logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2Nx2N); return PART_2Nx2N; }
 
     // CHECK_ME: I optimize code and fix bug here, need more VERIFY!
     int bit1 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+1]);
     if (cLog2CbSize > img->sps.Log2MinCbSizeY) {
       if (!img->sps.amp_enabled_flag) {
+        logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N);
         return bit1 ? PART_2NxN : PART_Nx2N;
       }
       else {
         int bit3 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+3]);
         if (bit3) {
+          logtrace(LogSymbols,"$1 part_mode=%d\n",bit1 ? PART_2NxN : PART_Nx2N);
           return bit1 ? PART_2NxN : PART_Nx2N;
         }
 
         int bit4 = decode_CABAC_bypass(&tctx->cabac_decoder);
-        if ( bit1 &&  bit4) return PART_2NxnD;
-        if ( bit1 && !bit4) return PART_2NxnU;
-        if (!bit1 && !bit4) return PART_nLx2N;
-        if (!bit1 &&  bit4) return PART_nRx2N;
+        if ( bit1 &&  bit4) {
+          logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnD);
+          return PART_2NxnD;
+        }
+        if ( bit1 && !bit4) {
+          logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxnU);
+          return PART_2NxnU;
+        }
+        if (!bit1 && !bit4) {
+          logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nLx2N);
+          return PART_nLx2N;
+        }
+        if (!bit1 &&  bit4) {
+          logtrace(LogSymbols,"$1 part_mode=%d\n",PART_nRx2N);
+          return PART_nRx2N;
+        }
       }
     }
     else {
       // TODO, we could save one if here when first decoding the next bin and then
       // checkcLog2CbSize==3 when it is '0'
 
-      if (bit1) return PART_2NxN;
+      if (bit1) {
+        logtrace(LogSymbols,"$1 part_mode=%d\n",PART_2NxN);
+        return PART_2NxN;
+      }
 
       if (cLog2CbSize==3) {
+        logtrace(LogSymbols,"$1 part_mode=%d\n",PART_Nx2N);
         return PART_Nx2N;
       }
       else {
         int bit2 = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PART_MODE+2]);
+        logtrace(LogSymbols,"$1 part_mode=%d\n",PART_NxN-bit2);
         return (enum PartMode)((int)PART_NxN - bit2)/*bit2 ? PART_Nx2N : PART_NxN*/;
       }
     }
@@ -1183,27 +1762,31 @@ static enum PartMode decode_part_mode(thread_context* tctx,
 }
 
 
-static int decode_prev_intra_luma_pred_flag(thread_context* tctx)
+static inline int decode_prev_intra_luma_pred_flag(thread_context* tctx)
 {
   logtrace(LogSlice,"# prev_intra_luma_pred_flag\n");
   int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG]);
+  logtrace(LogSymbols,"$1 prev_intra_luma_pred_flag=%d\n",bit);
   return bit;
 }
 
 
-static int decode_mpm_idx(thread_context* tctx)
+static inline int decode_mpm_idx(thread_context* tctx)
 {
   logtrace(LogSlice,"# mpm_idx (TU:2)\n");
   int mpm = decode_CABAC_TU_bypass(&tctx->cabac_decoder, 2);
   logtrace(LogSlice,"> mpm_idx = %d\n",mpm);
+  logtrace(LogSymbols,"$1 mpm_idx=%d\n",mpm);
   return mpm;
 }
 
 
-static int decode_rem_intra_luma_pred_mode(thread_context* tctx)
+static inline int decode_rem_intra_luma_pred_mode(thread_context* tctx)
 {
   logtrace(LogSlice,"# rem_intra_luma_pred_mode (5 bits)\n");
-  return decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5);
+  int value = decode_CABAC_FL_bypass(&tctx->cabac_decoder, 5);
+  logtrace(LogSymbols,"$1 rem_intra_luma_pred_mode=%d\n",value);
+  return value;
 }
 
 
@@ -1222,6 +1805,7 @@ static int decode_intra_chroma_pred_mode(thread_context* tctx)
   }
 
   logtrace(LogSlice,"> intra_chroma_pred_mode = %d\n",mode);
+  logtrace(LogSymbols,"$1 intra_chroma_pred_mode=%d\n",mode);
 
   return mode;
 }
@@ -1238,6 +1822,7 @@ static int decode_split_transform_flag(thread_context* tctx,
   logtrace(LogSlice,"# context: %d\n",context);
 
   int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + context]);
+  logtrace(LogSymbols,"$1 split_transform_flag=%d\n",bit);
   return bit;
 }
 
@@ -1249,6 +1834,7 @@ static int decode_cbf_chroma(thread_context* tctx,
 
   int bit = decode_CABAC_bit(&tctx->cabac_decoder, &tctx->ctx_model[CONTEXT_MODEL_CBF_CHROMA + trafoDepth]);
 
+  logtrace(LogSymbols,"$1 cbf_chroma=%d\n",bit);
   return bit;
 }
 
@@ -1262,6 +1848,7 @@ static int decode_cbf_luma(thread_context* tctx,
 
   logtrace(LogSlice,"> cbf_luma = %d\n",bit);
 
+  logtrace(LogSymbols,"$1 cbf_luma=%d\n",bit);
   return bit;
 }
 
@@ -1284,6 +1871,7 @@ static inline int decode_coded_sub_block_flag(thread_context* tctx,
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + ctxIdxInc]);
 
+  logtrace(LogSymbols,"$1 coded_sub_block_flag=%d\n",bit);
   return bit;
 }
 
@@ -1295,6 +1883,7 @@ static int decode_cu_qp_delta_abs(thread_context* tctx)
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_CU_QP_DELTA_ABS + 0]);
   if (bit==0) {
+    logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",0);
     return 0;
   }
 
@@ -1308,14 +1897,16 @@ static int decode_cu_qp_delta_abs(thread_context* tctx)
 
   if (prefix==5) {
     int value = decode_CABAC_EGk_bypass(&tctx->cabac_decoder, 0);
+    logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",value+5);
     return value + 5;
   }
   else {
+    logtrace(LogSymbols,"$1 cu_qp_delta_abs=%d\n",prefix);
     return prefix;
   }
 }
 
-        
+
 static int decode_last_significant_coeff_prefix(thread_context* tctx,
 						int log2TrafoSize,
 						int cIdx,
@@ -1405,7 +1996,7 @@ bool alloc_and_init_significant_coeff_ctxIdx_lookupTable()
       for (int scanIdx=0;scanIdx<2;scanIdx++) {
         ctxIdxLookup[2][cIdx][scanIdx][prevCsbf] = p;
       }
-      
+
       p += 16*16;
     }
 
@@ -1752,6 +2343,9 @@ static inline int decode_significant_coeff_flag_lookup(thread_context* tctx,
 
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + ctxIdxInc]);
+
+  logtrace(LogSymbols,"$1 significant_coeff_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -1820,6 +2414,8 @@ static inline int decode_coeff_abs_level_greater1(thread_context* tctx,
   *lastInvocation_coeff_abs_level_greater1_flag = bit;
   *lastInvocation_ctxSet = ctxSet;
 
+  //logtrace(LogSymbols,"$1 coeff_abs_level_greater1=%d\n",bit);
+
   return bit;
 }
 
@@ -1837,10 +2433,14 @@ static int decode_coeff_abs_level_greater2(thread_context* tctx,
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + ctxIdxInc]);
 
+  logtrace(LogSymbols,"$1 coeff_abs_level_greater2=%d\n",bit);
+
   return bit;
 }
 
 
+#define MAX_PREFIX 64
+
 static int decode_coeff_abs_level_remaining(thread_context* tctx,
                                             int cRiceParam)
 {
@@ -1851,6 +2451,10 @@ static int decode_coeff_abs_level_remaining(thread_context* tctx,
   do {
     prefix++;
     codeword = decode_CABAC_bypass(&tctx->cabac_decoder);
+
+    if (prefix>MAX_PREFIX) {
+      return 0; // TODO: error
+    }
   }
   while (codeword);
 
@@ -1872,6 +2476,8 @@ static int decode_coeff_abs_level_remaining(thread_context* tctx,
     value = (((1<<(prefix-3))+3-1)<<cRiceParam)+codeword;
   }
 
+  logtrace(LogSymbols,"$1 coeff_abs_level_remaining=%d\n",value);
+
   return value;
 }
 
@@ -1883,6 +2489,8 @@ static int decode_merge_flag(thread_context* tctx)
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_MERGE_FLAG]);
 
+  logtrace(LogSymbols,"$1 merge_flag=%d\n",bit);
+
   return bit;
 }
 
@@ -1891,6 +2499,11 @@ static int decode_merge_idx(thread_context* tctx)
 {
   logtrace(LogSlice,"# merge_idx\n");
 
+  if (tctx->shdr->MaxNumMergeCand <= 1) {
+    logtrace(LogSymbols,"$1 merge_idx=%d\n",0);
+    return 0;
+  }
+
   // TU coding, first bin is CABAC, remaining are bypass.
   // cMax = MaxNumMergeCand-1
 
@@ -1914,6 +2527,7 @@ static int decode_merge_idx(thread_context* tctx)
   }
 
   logtrace(LogSlice,"> merge_idx = %d\n",idx);
+  logtrace(LogSymbols,"$1 merge_idx=%d\n",idx);
 
   return idx;
 }
@@ -1926,6 +2540,7 @@ static int decode_pred_mode_flag(thread_context* tctx)
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_PRED_MODE_FLAG]);
 
+  logtrace(LogSymbols,"$1 pred_mode=%d\n",bit);
   return bit;
 }
 
@@ -1936,6 +2551,7 @@ static int decode_mvp_lx_flag(thread_context* tctx)
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_MVP_LX_FLAG]);
 
+  logtrace(LogSymbols,"$1 mvp_lx_flag=%d\n",bit);
   return bit;
 }
 
@@ -1946,6 +2562,7 @@ static int decode_rqt_root_cbf(thread_context* tctx)
   int bit = decode_CABAC_bit(&tctx->cabac_decoder,
                              &tctx->ctx_model[CONTEXT_MODEL_RQT_ROOT_CBF]);
 
+  logtrace(LogSymbols,"$1 rqt_root_cbf=%d\n",bit);
   return bit;
 }
 
@@ -1980,6 +2597,7 @@ static int decode_ref_idx_lX(thread_context* tctx, int numRefIdxLXActive)
 
   logtrace(LogSlice,"> ref_idx = %d\n",idx);
 
+  logtrace(LogSymbols,"$1 ref_idx_lX=%d\n",idx);
   return idx;
 }
 
@@ -2014,49 +2632,29 @@ static enum InterPredIdc  decode_inter_pred_idc(thread_context* tctx,
   logtrace(LogSlice,"> inter_pred_idc = %d (%s)\n",value,
            value==0 ? "L0" : (value==1 ? "L1" : "BI"));
 
-  return (enum InterPredIdc) value;
-}
+  logtrace(LogSymbols,"$1 decode_inter_pred_idx=%d\n",value+1);
 
+  return (enum InterPredIdc) (value+1);
+}
 
 
-void initialize_CABAC(thread_context* tctx)
+static int  decode_explicit_rdpcm_flag(thread_context* tctx,int cIdx)
 {
-  const int initType = tctx->shdr->initType;
-  assert(initType >= 0 && initType <= 2);
+  context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_FLAG];  // first of the two flag contexts
+  int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 1 : 0]);  // context 0 when cIdx==0, context 1 otherwise
+  return value;  // the decoded bin, returned unchanged
+}
 
-  init_context(tctx, CONTEXT_MODEL_SPLIT_CU_FLAG, initValue_split_cu_flag[initType], 3);
-  if (initType > 0) {
-    init_context(tctx, CONTEXT_MODEL_CU_SKIP_FLAG,    initValue_cu_skip_flag[initType-1],  3);
-    init_context(tctx, CONTEXT_MODEL_PRED_MODE_FLAG, &initValue_pred_mode_flag[initType-1], 1);
-    init_context(tctx, CONTEXT_MODEL_MERGE_FLAG,             &initValue_merge_flag[initType-1],1);
-    init_context(tctx, CONTEXT_MODEL_MERGE_IDX,              &initValue_merge_idx[initType-1], 1);
-    init_context(tctx, CONTEXT_MODEL_INTER_PRED_IDC,         initValue_inter_pred_idc,         5);
-    init_context(tctx, CONTEXT_MODEL_REF_IDX_LX,             initValue_ref_idx_lX,             2);
-    init_context(tctx, CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG, &initValue_abs_mvd_greater01_flag[initType == 1 ? 0 : 2], 2);
-    init_context(tctx, CONTEXT_MODEL_MVP_LX_FLAG,            initValue_mvp_lx_flag,            1);
-    init_context(tctx, CONTEXT_MODEL_RQT_ROOT_CBF,           initValue_rqt_root_cbf,           1);
-  }
-
-  init_context(tctx, CONTEXT_MODEL_PART_MODE,     &initValue_part_mode[(initType!=2 ? initType : 5)], 4);
-  init_context(tctx, CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG, &initValue_prev_intra_luma_pred_flag[initType], 1);
-  init_context(tctx, CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE,    &initValue_intra_chroma_pred_mode[initType],    1);
-  init_context(tctx, CONTEXT_MODEL_CBF_LUMA,                  &initValue_cbf_luma[initType == 0 ? 0 : 2],     2);
-  init_context(tctx, CONTEXT_MODEL_CBF_CHROMA,                &initValue_cbf_chroma[initType * 4],            4);
-  init_context(tctx, CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG,      &initValue_split_transform_flag[initType * 3],  3);
-  init_context(tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
-  init_context(tctx, CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX, &initValue_last_significant_coefficient_prefix[initType * 18], 18);
-  init_context(tctx, CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG,                  &initValue_coded_sub_block_flag[initType * 4],        4);
-  init_context(tctx, CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG,              initValue_significant_coeff_flag[initType],    42);
-  init_context(tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG,       &initValue_coeff_abs_level_greater1_flag[initType * 24], 24);
-  init_context(tctx, CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG,       &initValue_coeff_abs_level_greater2_flag[initType *  6],  6);
-  init_context(tctx, CONTEXT_MODEL_SAO_MERGE_FLAG,                      &initValue_sao_merge_leftUp_flag[initType],    1);
-  init_context(tctx, CONTEXT_MODEL_SAO_TYPE_IDX,                        &initValue_sao_type_idx_lumaChroma_flag[initType], 1);
-  init_context(tctx, CONTEXT_MODEL_CU_QP_DELTA_ABS,        initValue_cu_qp_delta_abs,        2);
-  init_context(tctx, CONTEXT_MODEL_TRANSFORM_SKIP_FLAG,    initValue_transform_skip_flag,    2);
-  init_context(tctx, CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG, &initValue_cu_transquant_bypass_flag[initType], 1);
+
+static int  decode_explicit_rdpcm_dir(thread_context* tctx,int cIdx)
+{
+  context_model* model = &tctx->ctx_model[CONTEXT_MODEL_RDPCM_DIR];  // first of the two direction contexts
+  int value = decode_CABAC_bit(&tctx->cabac_decoder, &model[cIdx ? 1 : 0]);  // context 0 when cIdx==0, context 1 otherwise
+  return value;  // the decoded bin; decode_TU() maps non-zero to residualDpcm 2, zero to 1
 }
 
 
+
 /* Take CtbAddrInTS and compute
    -> CtbAddrInRS, CtbX, CtbY
  */
@@ -2135,7 +2733,10 @@ void read_sao(thread_context* tctx, int xCtb,int yCtb,
   }
 
   if (!sao_merge_up_flag && !sao_merge_left_flag) {
-    for (int cIdx=0; cIdx<3; cIdx++) {
+    int nChroma = 3;
+    if (sps->ChromaArrayType == CHROMA_MONO) nChroma=1;
+
+    for (int cIdx=0; cIdx<nChroma; cIdx++) {
       if ((shdr->slice_sao_luma_flag && cIdx==0) ||
           (shdr->slice_sao_chroma_flag && cIdx>0)) {
 
@@ -2161,7 +2762,7 @@ void read_sao(thread_context* tctx, int xCtb,int yCtb,
 
         if (SaoTypeIdx != 0) {
           for (int i=0;i<4;i++) {
-            saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx);
+            saoinfo.saoOffsetVal[cIdx][i] = decode_sao_offset_abs(tctx, img->get_bit_depth(cIdx));
             logtrace(LogSlice,"saoOffsetVal[%d][%d] = %d\n",cIdx,i, saoinfo.saoOffsetVal[cIdx][i]);
           }
 
@@ -2196,13 +2797,17 @@ void read_sao(thread_context* tctx, int xCtb,int yCtb,
             logtrace(LogSlice,"SaoEoClass[%d] = %d\n",cIdx,SaoEoClass);
           }
 
-          int bitDepth = (cIdx==0 ?
-                          sps->BitDepth_Y :
-                          sps->BitDepth_C);
-          int shift = bitDepth-libde265_min(bitDepth,10);
+          int log2OffsetScale;
+
+          if (cIdx==0) {
+            log2OffsetScale = pps->range_extension.log2_sao_offset_scale_luma;
+          }
+          else {
+            log2OffsetScale = pps->range_extension.log2_sao_offset_scale_chroma;
+          }
 
           for (int i=0;i<4;i++) {
-            saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << shift);
+            saoinfo.saoOffsetVal[cIdx][i] = sign[i]*(saoinfo.saoOffsetVal[cIdx][i] << log2OffsetScale);
           }
         }
       }
@@ -2252,7 +2857,7 @@ void read_coding_tree_unit(thread_context* tctx)
 }
 
 
-LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(seq_parameter_set* sps, int x,int y)
+LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(const seq_parameter_set* sps, int x,int y)
 {
   int ctbX = x >> sps->Log2CtbSizeY;
   int ctbY = y >> sps->Log2CtbSizeY;
@@ -2261,8 +2866,7 @@ LIBDE265_INLINE static int luma_pos_to_ctbAddrRS(seq_parameter_set* sps, int x,i
 }
 
 
-int check_CTB_available(de265_image* img,
-                        slice_segment_header* shdr,
+int check_CTB_available(const de265_image* img,
                         int xC,int yC, int xN,int yN)
 {
   // check whether neighbor is outside of frame
@@ -2295,7 +2899,6 @@ int check_CTB_available(de265_image* img,
 
 int residual_coding(thread_context* tctx,
                     int x0, int y0,  // position of TU in frame
-                    int xL, int yL,  // position of TU in local CU
                     int log2TrafoSize,
                     int cIdx)
 {
@@ -2307,6 +2910,7 @@ int residual_coding(thread_context* tctx,
   const seq_parameter_set* sps = &img->sps;
   const pic_parameter_set* pps = &img->pps;
 
+  enum PredMode PredMode = img->get_pred_mode(x0,y0);
 
   if (cIdx==0) {
     img->set_nonzero_coefficient(x0,y0,log2TrafoSize);
@@ -2315,7 +2919,7 @@ int residual_coding(thread_context* tctx,
 
   if (pps->transform_skip_enabled_flag &&
       !tctx->cu_transquant_bypass_flag &&
-      (log2TrafoSize==2))
+      (log2TrafoSize <= pps->Log2MaxTransformSkipSize))
     {
       tctx->transform_skip_flag[cIdx] = decode_transform_skip_flag(tctx,cIdx);
     }
@@ -2325,6 +2929,33 @@ int residual_coding(thread_context* tctx,
     }
 
 
+  tctx->explicit_rdpcm_flag = false;
+
+  if (PredMode == MODE_INTER && sps->range_extension.explicit_rdpcm_enabled_flag &&
+      ( tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag))
+    {
+      tctx->explicit_rdpcm_flag = decode_explicit_rdpcm_flag(tctx,cIdx);
+      if (tctx->explicit_rdpcm_flag) {
+        tctx->explicit_rdpcm_dir = decode_explicit_rdpcm_dir(tctx,cIdx);
+      }
+
+      //printf("EXPLICIT RDPCM %d;%d\n",x0,y0);
+    }
+  else
+    {
+      tctx->explicit_rdpcm_flag = false;
+    }
+
+
+
+  // sbType for persistent_rice_adaptation_enabled_flag
+
+  int sbType = (cIdx==0) ? 2 : 0;
+  if (tctx->transform_skip_flag[cIdx] || tctx->cu_transquant_bypass_flag) {
+    sbType++;
+  }
+
+
   // --- decode position of last coded coefficient ---
 
   int last_significant_coeff_x_prefix =
@@ -2368,44 +2999,20 @@ int residual_coding(thread_context* tctx,
 
   int scanIdx;
 
-  enum PredMode PredMode = img->get_pred_mode(x0,y0);
-
-
   if (PredMode == MODE_INTRA) {
     if (cIdx==0) {
-      if (log2TrafoSize==2 || log2TrafoSize==3) {
-        enum IntraPredMode predMode = img->get_IntraPredMode(x0,y0);
-        logtrace(LogSlice,"IntraPredMode[%d,%d] = %d\n",x0,y0,predMode);
-
-        if (predMode >= 6 && predMode <= 14) scanIdx=2;
-        else if (predMode >= 22 && predMode <= 30) scanIdx=1;
-        else scanIdx=0;
-      }
-      else { scanIdx=0; }
+      scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredMode(x0,y0),  cIdx, sps);
     }
     else {
-      if (log2TrafoSize==1 || log2TrafoSize==2) {
-        enum IntraPredMode predMode = tctx->IntraPredModeC;
-
-        if (predMode >= 6 && predMode <= 14) scanIdx=2;
-        else if (predMode >= 22 && predMode <= 30) scanIdx=1;
-        else scanIdx=0;
-      }
-      else { scanIdx=0; }
+      scanIdx = get_intra_scan_idx(log2TrafoSize, img->get_IntraPredModeC(x0,y0), cIdx, sps);
     }
-
-    logtrace(LogSlice,"pred: %d -> scan: %d\n",PredMode,scanIdx);
   }
   else {
     scanIdx=0;
   }
 
-
-  // HM 9 only ?
   if (scanIdx==2) {
-    int t = LastSignificantCoeffX;
-    LastSignificantCoeffX = LastSignificantCoeffY;
-    LastSignificantCoeffY = t;
+    std::swap(LastSignificantCoeffX, LastSignificantCoeffY);
   }
 
   logtrace(LogSlice,"LastSignificantCoeff: x=%d;y=%d\n",LastSignificantCoeffX,LastSignificantCoeffY);
@@ -2533,9 +3140,16 @@ int residual_coding(thread_context* tctx,
 
         // for all AC coefficients in sub-block, a significant_coeff flag is coded
 
-        int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
-                                                                     ctxIdxMap[xC+(yC<<log2TrafoSize)]);
-                                                                     //ctxIdxMap[(i<<4)+n]);
+        int ctxInc;
+        if (sps->range_extension.transform_skip_context_enabled_flag &&
+            (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) {
+          ctxInc = ( cIdx == 0 ) ? 42 : (16+27);
+        }
+        else {
+          ctxInc = ctxIdxMap[xC+(yC<<log2TrafoSize)];
+        }
+
+        int significant_coeff = decode_significant_coeff_flag_lookup(tctx, ctxInc);
 
         if (significant_coeff) {
           coeff_value[nCoefficients] = 1;
@@ -2556,9 +3170,17 @@ int residual_coding(thread_context* tctx,
         {
           if (inferSbDcSigCoeffFlag==0) {
             // if we cannot infert the DC coefficient, it is coded
-            int significant_coeff = decode_significant_coeff_flag_lookup(tctx,
-                                                                         ctxIdxMap[x0+(y0<<log2TrafoSize)]);
-                                                                         //ctxIdxMap[(i<<4)+0]);
+
+            int ctxInc;
+            if (sps->range_extension.transform_skip_context_enabled_flag &&
+                (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx])) {
+              ctxInc = ( cIdx == 0 ) ? 42 : (16+27);
+            }
+            else {
+              ctxInc = ctxIdxMap[x0+(y0<<log2TrafoSize)];
+            }
+
+            int significant_coeff = decode_significant_coeff_flag_lookup(tctx, ctxInc);
 
 
             if (significant_coeff) {
@@ -2581,14 +3203,14 @@ int residual_coding(thread_context* tctx,
 
 
     /*
-    logtrace(LogSlice,"significant_coeff_flags:\n");
-    for (int y=0;y<4;y++) {
+      logtrace(LogSlice,"significant_coeff_flags:\n");
+      for (int y=0;y<4;y++) {
       logtrace(LogSlice,"  ");
       for (int x=0;x<4;x++) {
-        logtrace(LogSlice,"*%d ",significant_coeff_flag[y][x]);
+      logtrace(LogSlice,"*%d ",significant_coeff_flag[y][x]);
       }
       logtrace(LogSlice,"*\n");
-    }
+      }
     */
 
 
@@ -2649,8 +3271,28 @@ int residual_coding(thread_context* tctx,
 
       // --- decode coefficient signs ---
 
-      int signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3 &&
-                        !tctx->cu_transquant_bypass_flag);
+      int signHidden;
+
+
+      IntraPredMode predModeIntra;
+      if (cIdx==0) predModeIntra = img->get_IntraPredMode(x0,y0);
+      else         predModeIntra = img->get_IntraPredModeC(x0,y0);
+
+
+      if (tctx->cu_transquant_bypass_flag ||
+          (PredMode == MODE_INTRA &&
+           sps->range_extension.implicit_rdpcm_enabled_flag &&
+           tctx->transform_skip_flag[cIdx] &&
+           ( predModeIntra == 10 || predModeIntra == 26 )) ||
+          tctx->explicit_rdpcm_flag)
+        {
+          signHidden = 0;
+        }
+      else
+        {
+          signHidden = (coeff_scan_pos[0]-coeff_scan_pos[nCoefficients-1] > 3);
+        }
+
 
       for (int n=0;n<nCoefficients-1;n++) {
         coeff_sign[n] = decode_CABAC_bypass(&tctx->cabac_decoder);
@@ -2670,27 +3312,61 @@ int residual_coding(thread_context* tctx,
       // --- decode coefficient value ---
 
       int sumAbsLevel=0;
-      int uiGoRiceParam=0;
+      int uiGoRiceParam;
+
+      if (sps->range_extension.persistent_rice_adaptation_enabled_flag==0) {
+        uiGoRiceParam = 0;
+      }
+      else {
+        uiGoRiceParam = tctx->StatCoeff[sbType]/4;
+      }
+
+      // printf("initial uiGoRiceParam=%d\n",uiGoRiceParam);
+      bool firstCoeffWithAbsLevelRemaining = true;
 
       for (int n=0;n<nCoefficients;n++) {
         int baseLevel = coeff_value[n];
 
         int coeff_abs_level_remaining;
 
+        // printf("coeff %d/%d, uiRiceParam: %d\n",n,nCoefficients,uiGoRiceParam);
+
         if (coeff_has_max_base_level[n]) {
           coeff_abs_level_remaining =
             decode_coeff_abs_level_remaining(tctx, uiGoRiceParam);
 
-          // (9-462)
-          if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam)) {
-            uiGoRiceParam++;
-            if (uiGoRiceParam>4) uiGoRiceParam=4;
+          if (sps->range_extension.persistent_rice_adaptation_enabled_flag == 0) {
+            // (2014.10 / 9-20)
+            if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam)) {
+              uiGoRiceParam++;
+              if (uiGoRiceParam>4) uiGoRiceParam=4;
+            }
+          }
+          else {
+            if (baseLevel + coeff_abs_level_remaining > 3*(1<<uiGoRiceParam))
+              uiGoRiceParam++;
+          }
+
+          // persistent_rice_adaptation_enabled_flag
+          if (sps->range_extension.persistent_rice_adaptation_enabled_flag &&
+              firstCoeffWithAbsLevelRemaining) {
+            if (coeff_abs_level_remaining >= (3 << (tctx->StatCoeff[sbType]/4 ))) {
+              tctx->StatCoeff[sbType]++;
+            }
+            else if (2*coeff_abs_level_remaining < (1 << (tctx->StatCoeff[sbType]/4 )) &&
+                     tctx->StatCoeff[sbType] > 0) {
+              tctx->StatCoeff[sbType]--;
+            }
           }
+
+          firstCoeffWithAbsLevelRemaining=false;
         }
         else {
           coeff_abs_level_remaining = 0;
         }
 
+        logtrace(LogSlice, "coeff_abs_level_remaining=%d\n",coeff_abs_level_remaining);
+
 
         int16_t currCoeff = baseLevel + coeff_abs_level_remaining;
         if (coeff_sign[n]) {
@@ -2705,6 +3381,8 @@ int residual_coding(thread_context* tctx,
           }
         }
 
+        logtrace(LogSlice, "quantized coefficient=%d\n",currCoeff);
+
 #ifdef DE265_LOG_TRACE
         //TransCoeffLevel[yC*CoeffStride + xC] = currCoeff;
 #endif
@@ -2717,7 +3395,12 @@ int residual_coding(thread_context* tctx,
         tctx->coeffList[cIdx][ tctx->nCoeff[cIdx] ] = currCoeff;
         tctx->coeffPos [cIdx][ tctx->nCoeff[cIdx] ] = xC + yC*CoeffStride;
         tctx->nCoeff[cIdx]++;
+
+        //printf("%d ",currCoeff);
       }  // iterate through coefficients in sub-block
+
+      //printf(" (%d;%d)\n",x0,y0);
+
     }  // if nonZero
   }  // next sub-block
 
@@ -2725,6 +3408,130 @@ int residual_coding(thread_context* tctx,
 }
 
 
+static void decode_TU(thread_context* tctx,
+                      int x0,int y0,
+                      int xCUBase,int yCUBase,
+                      int nT, int cIdx, enum PredMode cuPredMode, bool cbf)
+{
+  de265_image* img = tctx->img;
+  // Runs intra prediction for intra CUs, then calls scale_coefficients() when residual data applies.
+  int residualDpcm = 0;
+  // residualDpcm: 0 = off, 1 or 2 = RDPCM variant forwarded to scale_coefficients().
+  if (cuPredMode == MODE_INTRA) // if intra mode
+    {
+      enum IntraPredMode intraPredMode;
+      // Luma looks the mode up at (x0,y0); chroma first scales the position by SubWidthC/SubHeightC.
+      if (cIdx==0) {
+        intraPredMode = img->get_IntraPredMode(x0,y0);
+      }
+      else {
+        const int SubWidthC  = tctx->img->sps.SubWidthC;
+        const int SubHeightC = tctx->img->sps.SubHeightC;
+
+        intraPredMode = img->get_IntraPredModeC(x0*SubWidthC,y0*SubHeightC);
+      }
+      // A mode outside [0,35) is replaced by INTRA_DC instead of being reported as an error.
+      if (intraPredMode<0 || intraPredMode>=35) {
+        // TODO: ERROR
+        intraPredMode = INTRA_DC;
+      }
+
+      decode_intra_prediction(img, x0,y0, intraPredMode, nT, cIdx);
+      // Implicit RDPCM needs the SPS flag, a bypass or transform-skip block, and intra mode 10 or 26.
+      // Mode 10 gives 1, mode 26 gives 2 (presumably horizontal / vertical -- confirm in scale_coefficients).
+      residualDpcm = tctx->img->sps.range_extension.implicit_rdpcm_enabled_flag &&
+        (tctx->cu_transquant_bypass_flag || tctx->transform_skip_flag[cIdx]) &&
+        (intraPredMode == 10 || intraPredMode == 26);
+
+      if (residualDpcm && intraPredMode == 26)
+        residualDpcm = 2;
+    }
+  else // INTER
+    {
+      if (tctx->explicit_rdpcm_flag) {
+        residualDpcm = (tctx->explicit_rdpcm_dir ? 2 : 1);
+      }
+    }
+
+  if (cbf) {
+    scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx,
+                       tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm);
+  }
+  /*
+  else if (!cbf && cIdx==0) {
+    memset(tctx->residual_luma,0,32*32*sizeof(int32_t));
+  }
+  */
+  else if (!cbf && cIdx!=0 && tctx->ResScaleVal) {
+    // --- cross-component-prediction when CBF==0 ---
+    // The coefficient list is emptied and RDPCM switched off before scale_coefficients() is called.
+    tctx->nCoeff[cIdx] = 0;
+    residualDpcm=0;
+
+    scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, cIdx,
+                       tctx->transform_skip_flag[cIdx], cuPredMode==MODE_INTRA, residualDpcm);
+  }
+}
+
+
+static int decode_log2_res_scale_abs_plus1(thread_context* tctx, int cIdxMinus1)
+{
+  // Reads log2_res_scale_abs_plus1: a run of context-coded 1-bins ended by a 0-bin or after cMax bins.
+
+  logtrace(LogSlice,"# log2_res_scale_abs_plus1 (c=%d)\n",cIdxMinus1);
+  // The result counts the 1-bins that were read, so it lies in 0..cMax.
+  int value = 0;
+  int cMax  = 4;
+  for (int binIdx=0;binIdx<cMax;binIdx++)
+    {
+      int ctxIdxInc = 4*cIdxMinus1 + binIdx;
+      // Every (cIdxMinus1, binIdx) pair selects its own context model.
+      int bit = decode_CABAC_bit(&tctx->cabac_decoder,
+                                 &tctx->ctx_model[CONTEXT_MODEL_LOG2_RES_SCALE_ABS_PLUS1+ctxIdxInc]);
+      if (!bit) break;
+      value++;
+    }
+
+  logtrace(LogSymbols,"$1 log2_res_scale_abs_plus1=%d\n",value);
+
+  return value;
+}
+
+
+static int decode_res_scale_sign_flag(thread_context* tctx, int cIdxMinus1)
+{
+  // Reads res_scale_sign_flag as a single context-coded bin; cIdxMinus1 selects the context model.
+
+  logtrace(LogSlice,"# res_scale_sign_flag (c=%d)\n",cIdxMinus1);
+
+  int bit = decode_CABAC_bit(&tctx->cabac_decoder,
+                             &tctx->ctx_model[CONTEXT_MODEL_RES_SCALE_SIGN_FLAG+cIdxMinus1]);
+
+  logtrace(LogSymbols,"$1 res_scale_sign_flag=%d\n",bit);
+
+  return bit;
+}
+
+
+static void read_cross_comp_pred(thread_context* tctx, int cIdxMinus1)
+{
+  int log2_res_scale_abs_plus1 = decode_log2_res_scale_abs_plus1(tctx,cIdxMinus1);
+  int ResScaleVal;
+  // Parses the cross-component prediction syntax; the sign flag is only read for a non-zero magnitude field.
+  if (log2_res_scale_abs_plus1 != 0) {
+    int res_scale_sign_flag = decode_res_scale_sign_flag(tctx,cIdxMinus1);
+    // Magnitude is 2^(log2_res_scale_abs_plus1-1); a set sign flag negates it.
+    ResScaleVal = 1 << (log2_res_scale_abs_plus1 - 1);
+    ResScaleVal *= 1 - 2 * res_scale_sign_flag;
+  }
+  else {
+    ResScaleVal = 0;
+  }
+  // Kept on the thread context; decode_TU() checks it when a chroma block has cbf==0.
+  tctx->ResScaleVal = ResScaleVal;
+}
+
+
 int read_transform_unit(thread_context* tctx,
                         int x0, int y0,        // position of TU in frame
                         int xBase, int yBase,  // position of parent TU in frame
@@ -2737,77 +3544,264 @@ int read_transform_unit(thread_context* tctx,
   logtrace(LogSlice,"- read_transform_unit x0:%d y0:%d xBase:%d yBase:%d nT:%d cbf:%d:%d:%d\n",
            x0,y0,xBase,yBase, 1<<log2TrafoSize, cbf_luma, cbf_cb, cbf_cr);
 
-  assert(cbf_cb != -1);
-  assert(cbf_cr != -1);
-  assert(cbf_luma != -1);
+  assert(cbf_cb != -1);
+  assert(cbf_cr != -1);
+  assert(cbf_luma != -1);
+
+  const int ChromaArrayType = tctx->img->sps.ChromaArrayType;
+
+  int log2TrafoSizeC = (ChromaArrayType==CHROMA_444 ? log2TrafoSize : log2TrafoSize-1);
+  log2TrafoSizeC = libde265_max(2, log2TrafoSizeC);
+
+  const int cbfLuma   = cbf_luma;
+  const int cbfChroma = cbf_cb | cbf_cr;
+
+  tctx->transform_skip_flag[0]=0;
+  tctx->transform_skip_flag[1]=0;
+  tctx->transform_skip_flag[2]=0;
+
+  tctx->explicit_rdpcm_flag = false;
+
+
+  enum PredMode cuPredMode = tctx->img->get_pred_mode(x0,y0);
+
+  if (cbfLuma || cbfChroma)
+    {
+      bool doDecodeQuantParameters = false;
+
+      if (tctx->img->pps.cu_qp_delta_enabled_flag &&
+          !tctx->IsCuQpDeltaCoded) {
+
+        int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx);
+        int cu_qp_delta_sign=0;
+        if (cu_qp_delta_abs) {
+          cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder);
+        }
+
+        tctx->IsCuQpDeltaCoded = 1;
+        tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign);
+
+        //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta);
+
+        logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs);
+        logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign);
+        logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta);
+
+        doDecodeQuantParameters = true;
+        //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase);
+      }
+
+      if (tctx->shdr->cu_chroma_qp_offset_enabled_flag && cbfChroma &&
+          !tctx->cu_transquant_bypass_flag && !tctx->IsCuChromaQpOffsetCoded ) {
+        logtrace(LogSlice,"# cu_chroma_qp_offset_flag\n");
+
+        int cu_chroma_qp_offset_flag = decode_CABAC_bit(&tctx->cabac_decoder,
+                                                        &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_FLAG]);
+
+
+        const pic_parameter_set* pps = &tctx->img->pps;
+
+        int cu_chroma_qp_offset_idx = 0;
+        if (cu_chroma_qp_offset_flag && pps->range_extension.chroma_qp_offset_list_len > 1) {
+          cu_chroma_qp_offset_idx = decode_CABAC_bit(&tctx->cabac_decoder,
+                                                     &tctx->ctx_model[CONTEXT_MODEL_CU_CHROMA_QP_OFFSET_IDX]);
+        }
+
+        tctx->IsCuChromaQpOffsetCoded = 1;
+
+        if (cu_chroma_qp_offset_flag) {
+          tctx->CuQpOffsetCb = pps->range_extension.cb_qp_offset_list[ cu_chroma_qp_offset_idx ];
+          tctx->CuQpOffsetCr = pps->range_extension.cr_qp_offset_list[ cu_chroma_qp_offset_idx ];
+        }
+        else {
+          tctx->CuQpOffsetCb = 0;
+          tctx->CuQpOffsetCr = 0;
+        }
+
+        doDecodeQuantParameters = true;
+        //decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase);
+      }
+
+
+      if (doDecodeQuantParameters) {
+        decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase);
+      }
+    }
+
+  // position of TU in local CU
+  int xL = x0 - xCUBase;
+  int yL = y0 - yCUBase;
+  int nT = 1<<log2TrafoSize;
+  int nTC = 1<<log2TrafoSizeC;
+
+  const int SubWidthC  = tctx->img->sps.SubWidthC;
+  const int SubHeightC = tctx->img->sps.SubHeightC;
+
+  // --- luma ---
+
+  tctx->ResScaleVal = 0;
+
+  int err;
+  if (cbf_luma) {
+    if ((err=residual_coding(tctx,x0,y0, log2TrafoSize,0)) != DE265_OK) return err;
+  }
+
+  decode_TU(tctx, x0,y0, xCUBase,yCUBase, nT, 0, cuPredMode, cbf_luma);
+
 
-  tctx->transform_skip_flag[0]=0;
-  tctx->transform_skip_flag[1]=0;
-  tctx->transform_skip_flag[2]=0;
+  // --- chroma ---
 
+  const int yOffset422 = 1<<log2TrafoSizeC;
 
-  if (cbf_luma || cbf_cb || cbf_cr)
-    {
-      if (tctx->img->pps.cu_qp_delta_enabled_flag &&
-          !tctx->IsCuQpDeltaCoded) {
+  if (log2TrafoSize>2 || ChromaArrayType == CHROMA_444) {
+    // TODO: cross-component prediction
 
-        int cu_qp_delta_abs = decode_cu_qp_delta_abs(tctx);
-        int cu_qp_delta_sign=0;
-        if (cu_qp_delta_abs) {
-          cu_qp_delta_sign = decode_CABAC_bypass(&tctx->cabac_decoder);
-        }
+    const bool do_cross_component_prediction =
+      (tctx->img->pps.range_extension.cross_component_prediction_enabled_flag &&
+       cbf_luma &&
+       (cuPredMode == MODE_INTER || tctx->img->is_IntraPredModeC_Mode4(x0,y0)));
 
-        tctx->IsCuQpDeltaCoded = 1;
-        tctx->CuQpDelta = cu_qp_delta_abs*(1-2*cu_qp_delta_sign);
+    if (do_cross_component_prediction) {
+      read_cross_comp_pred(tctx, 0);
+    }
+    else {
+      tctx->ResScaleVal = 0;
+    }
 
-        //printf("read cu_qp_delta (%d;%d) = %d\n",x0,y0,tctx->CuQpDelta);
+    {
+      if (cbf_cb & 1) {
+        if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,1)) != DE265_OK) return err;
+      }
 
-        logtrace(LogSlice,"cu_qp_delta_abs = %d\n",cu_qp_delta_abs);
-        logtrace(LogSlice,"cu_qp_delta_sign = %d\n",cu_qp_delta_sign);
-        logtrace(LogSlice,"CuQpDelta = %d\n",tctx->CuQpDelta);
+      if (tctx->img->sps.ChromaArrayType != CHROMA_MONO) {
+        decode_TU(tctx,
+                  x0/SubWidthC,y0/SubHeightC,
+                  xCUBase/SubWidthC,yCUBase/SubHeightC, nTC, 1, cuPredMode, cbf_cb & 1);
+      }
+    }
 
-        decode_quantization_parameters(tctx, x0,y0, xCUBase, yCUBase);
+    // 4:2:2
+    if (ChromaArrayType == CHROMA_422) {
+      const int yOffset = 1<<log2TrafoSizeC;
+
+      if (cbf_cb & 2) {
+        if ((err=residual_coding(tctx,
+                                 x0,y0+yOffset*SubHeightC,
+                                 log2TrafoSizeC,1)) != DE265_OK) return err;
       }
+
+      decode_TU(tctx,
+                x0/SubWidthC,y0/SubHeightC + yOffset,
+                xCUBase/SubWidthC,yCUBase/SubHeightC +yOffset,
+                nTC, 1, cuPredMode, cbf_cb & 2);
     }
 
 
-  if (cbf_luma || cbf_cb || cbf_cr)
+    if (do_cross_component_prediction) {
+      read_cross_comp_pred(tctx, 1);
+    }
+    else {
+      tctx->ResScaleVal = 0;
+    }
+
     {
-      // position of TU in local CU
-      int xL = x0 - xCUBase;
-      int yL = y0 - yCUBase;
+      if (cbf_cr & 1) {
+        if ((err=residual_coding(tctx,x0,y0,log2TrafoSizeC,2)) != DE265_OK) return err;
+      }
 
-      int err;
-      if (cbf_luma) {
-        if ((err=residual_coding(tctx,x0,y0, xL,yL,log2TrafoSize,0)) != DE265_OK) return err;
+      if (tctx->img->sps.ChromaArrayType != CHROMA_MONO) {
+        decode_TU(tctx,
+                  x0/SubWidthC,y0/SubHeightC,
+                  xCUBase/SubWidthC,yCUBase/SubHeightC,
+                  nTC, 2, cuPredMode, cbf_cr & 1);
       }
+    }
 
-      if (log2TrafoSize>2) {
-        if (cbf_cb) {
-          if ((err=residual_coding(tctx,x0,y0,xL,yL,log2TrafoSize-1,1)) != DE265_OK) return err;
-        }
+    // 4:2:2
+    if (ChromaArrayType == CHROMA_422) {
+      const int yOffset = 1<<log2TrafoSizeC;
 
-        if (cbf_cr) {
-          if ((err=residual_coding(tctx,x0,y0,xL,yL,log2TrafoSize-1,2)) != DE265_OK) return err;
-        }
+      if (cbf_cr & 2) {
+        if ((err=residual_coding(tctx,
+                                 x0,y0+yOffset*SubHeightC,
+                                 log2TrafoSizeC,2)) != DE265_OK) return err;
       }
-      else if (blkIdx==3) {
-        if (cbf_cb) {
-          if ((err=residual_coding(tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
-                                   log2TrafoSize,1)) != DE265_OK) return err;
-        }
 
-        if (cbf_cr) {
-          if ((err=residual_coding(tctx,xBase,yBase,xBase-xCUBase,yBase-yCUBase,
-                                   log2TrafoSize,2)) != DE265_OK) return err;
-        }
-      }
+      decode_TU(tctx,
+                x0/SubWidthC,y0/SubHeightC+yOffset,
+                xCUBase/SubWidthC,yCUBase/SubHeightC+yOffset,
+                nTC, 2, cuPredMode, cbf_cr & 2);
+    }
+  }
+  else if (blkIdx==3) {
+    if (cbf_cb & 1) {
+      if ((err=residual_coding(tctx,xBase,yBase,
+                               log2TrafoSize,1)) != DE265_OK) return err;
+    }
+
+    if (tctx->img->sps.ChromaArrayType != CHROMA_MONO) {
+      decode_TU(tctx,
+                xBase/SubWidthC,  yBase/SubHeightC,
+                xCUBase/SubWidthC,yCUBase/SubHeightC, nT, 1, cuPredMode, cbf_cb & 1);
+    }
+
+    // 4:2:2
+    if (cbf_cb & 2) {
+      if ((err=residual_coding(tctx,
+                               xBase        ,yBase        +(1<<log2TrafoSize),
+                               log2TrafoSize,1)) != DE265_OK) return err;
+    }
+
+    if (ChromaArrayType == CHROMA_422) {
+      decode_TU(tctx,
+                xBase/SubWidthC,  yBase/SubHeightC + (1<<log2TrafoSize),
+                xCUBase/SubWidthC,yCUBase/SubHeightC, nT, 1, cuPredMode, cbf_cb & 2);
+    }
+
+    if (cbf_cr & 1) {
+      if ((err=residual_coding(tctx,xBase,yBase,
+                               log2TrafoSize,2)) != DE265_OK) return err;
+    }
+
+    if (tctx->img->sps.ChromaArrayType != CHROMA_MONO) {
+      decode_TU(tctx,
+                xBase/SubWidthC,  yBase/SubHeightC,
+                xCUBase/SubWidthC,yCUBase/SubHeightC, nT, 2, cuPredMode, cbf_cr & 1);
+    }
+
+    // 4:2:2
+    if (cbf_cr & 2) {
+      if ((err=residual_coding(tctx,
+                               xBase        ,yBase        +(1<<log2TrafoSizeC),
+                               log2TrafoSize,2)) != DE265_OK) return err;
+    }
+
+    if (ChromaArrayType == CHROMA_422) {
+      decode_TU(tctx,
+                xBase/SubWidthC,  yBase/SubHeightC + (1<<log2TrafoSize),
+                xCUBase/SubWidthC,yCUBase/SubHeightC, nT, 2, cuPredMode, cbf_cr & 2);
     }
+  }
+
 
   return DE265_OK;
 }
 
 
+static void dump_cbsize(de265_image* img)
+{
+  int w = img->get_width(0);
+  int h = img->get_height(0);
+  // Debug dump to stdout: prints get_log2CbSize(x,y) for every 8x8 grid position of plane 0, one text line per grid row.
+  for (int y=0;y<h;y+=8) {
+    for (int x=0;x<w;x+=8) {
+      printf("%d",img->get_log2CbSize(x,y));  // values are printed back-to-back without a separator
+    }
+    printf("\n");
+  }
+}
+
+
 void read_transform_tree(thread_context* tctx,
                          int x0, int y0,        // position of TU in frame
                          int xBase, int yBase,  // position of parent TU in frame
@@ -2818,25 +3812,19 @@ void read_transform_tree(thread_context* tctx,
                          int MaxTrafoDepth,
                          int IntraSplitFlag,
                          enum PredMode cuPredMode,
-                         bool parent_cbf_cb,bool parent_cbf_cr)
+                         uint8_t parent_cbf_cb,uint8_t parent_cbf_cr)
 {
   logtrace(LogSlice,"- read_transform_tree (interleaved) x0:%d y0:%d xBase:%d yBase:%d "
-           "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d\n",
-           x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth);
+           "log2TrafoSize:%d trafoDepth:%d MaxTrafoDepth:%d parent-cbf-cb:%d parent-cbf-cr:%d\n",
+           x0,y0,xBase,yBase,log2TrafoSize,trafoDepth,MaxTrafoDepth,parent_cbf_cb,parent_cbf_cr);
 
   de265_image* img = tctx->img;
   const seq_parameter_set* sps = &img->sps;
 
-  enum PredMode PredMode = img->get_pred_mode(x0,y0);
-  enum PartMode PartMode = img->get_PartMode(x0,y0);
-
   int split_transform_flag;
-  
-  int interSplitFlag= (sps->max_transform_hierarchy_depth_inter==0 &&
-                       PredMode == MODE_INTER &&
-                       PartMode != PART_2Nx2N &&
-                       trafoDepth == 0);
 
+  enum PredMode PredMode = img->get_pred_mode(x0,y0);
+  assert(PredMode == cuPredMode);
 
   /*  If TrafoSize is larger than maximum size   -> split automatically
       If TrafoSize is at minimum size            -> do not split
@@ -2853,37 +3841,63 @@ void read_transform_tree(thread_context* tctx,
     }
   else
     {
+      enum PartMode PartMode = img->get_PartMode(x0,y0);
+
+      int interSplitFlag= (sps->max_transform_hierarchy_depth_inter==0 &&
+                           trafoDepth == 0 &&
+                           PredMode == MODE_INTER &&
+                           PartMode != PART_2Nx2N);
+
       split_transform_flag = (log2TrafoSize > sps->Log2MaxTrafoSize ||
                               (IntraSplitFlag==1 && trafoDepth==0) ||
                               interSplitFlag==1) ? 1:0;
     }
 
-
   if (split_transform_flag) {
     logtrace(LogSlice,"set_split_transform_flag(%d,%d, %d)\n",x0,y0,trafoDepth);
     img->set_split_transform_flag(x0,y0,trafoDepth);
   }
 
-
   int cbf_cb=-1;
   int cbf_cr=-1;
 
-  if (log2TrafoSize>2) {
+  // CBF_CB/CR flags are encoded like this:
+  // 4:2:0 and 4:4:4 modes: binary flag in bit 0
+  // 4:2:2 mode: bit 0: top block, bit 1: bottom block
+
+  if ((log2TrafoSize>2 && sps->ChromaArrayType != CHROMA_MONO) ||
+      sps->ChromaArrayType == CHROMA_444) {
     // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0
     if (/*trafoDepth==0 ||*/ parent_cbf_cb) {
       cbf_cb = decode_cbf_chroma(tctx,trafoDepth);
+
+      if (sps->ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) {
+        cbf_cb |= (decode_cbf_chroma(tctx,trafoDepth) << 1);
+      }
     }
 
     // we do not have to test for trafoDepth==0, because parent_cbf_cb is 1 at depth 0
     if (/*trafoDepth==0 ||*/ parent_cbf_cr) {
       cbf_cr = decode_cbf_chroma(tctx,trafoDepth);
+
+      if (sps->ChromaArrayType == CHROMA_422 && (!split_transform_flag || log2TrafoSize==3)) {
+        cbf_cr |= (decode_cbf_chroma(tctx,trafoDepth) << 1);
+      }
     }
   }
 
+  //printf("CBF: cb:%d cr:%d\n",cbf_cb,cbf_cr);
 
   // cbf_cr/cbf_cb not present in bitstream -> induce values
 
   if (cbf_cb<0) {
+    assert(!(trafoDepth==0 && log2TrafoSize==2));
+
+    /* The standard specifies to check trafoDepth>0 AND log2TrafoSize==2.
+       However, I think that trafoDepth>0 is redundant as a CB is always
+       at least 8x8 and hence trafoDepth>0.
+    */
+
     if (trafoDepth>0 && log2TrafoSize==2) {
       cbf_cb = parent_cbf_cb;
     } else {
@@ -2915,67 +3929,23 @@ void read_transform_tree(thread_context* tctx,
                         MaxTrafoDepth,IntraSplitFlag, cuPredMode, cbf_cb,cbf_cr);
   }
   else {
-    int cbf_luma=1;
+    int cbf_luma;
 
     if (PredMode==MODE_INTRA || trafoDepth!=0 || cbf_cb || cbf_cr) {
       cbf_luma = decode_cbf_luma(tctx,trafoDepth);
     }
+    else {
+      /* There cannot be INTER blocks with no residual data.
+         That case is already handled with rqt_root_cbf.
+      */
+
+      cbf_luma = 1;
+    }
 
     logtrace(LogSlice,"call read_transform_unit %d/%d\n",x0,y0);
 
     read_transform_unit(tctx, x0,y0,xBase,yBase, xCUBase,yCUBase, log2TrafoSize,trafoDepth, blkIdx,
                         cbf_luma, cbf_cb, cbf_cr);
-
-
-    int nT = 1<<log2TrafoSize;
-
-
-    if (cuPredMode == MODE_INTRA) // if intra mode
-      {
-        enum IntraPredMode intraPredMode = img->get_IntraPredMode(x0,y0);
-
-        decode_intra_prediction(img, x0,y0, intraPredMode, nT, 0);
-
-        enum IntraPredMode chromaPredMode = tctx->IntraPredModeC;
-
-        if (nT>=8) {
-          decode_intra_prediction(img, x0/2,y0/2, chromaPredMode, nT/2, 1);
-          decode_intra_prediction(img, x0/2,y0/2, chromaPredMode, nT/2, 2);
-        }
-        else if (blkIdx==3) {
-          decode_intra_prediction(img, xBase/2,yBase/2, chromaPredMode, nT, 1);
-          decode_intra_prediction(img, xBase/2,yBase/2, chromaPredMode, nT, 2);
-        }
-      }
-
-    // NOTE: disable MC-mode residuals:
-    { //if (cuPredMode == MODE_INTRA) {
-      if (cbf_luma) {
-        scale_coefficients(tctx, x0,y0, xCUBase,yCUBase, nT, 0,
-                           tctx->transform_skip_flag[0], PredMode==MODE_INTRA);
-      }
-
-      if (nT>=8) {
-        if (cbf_cb) {
-          scale_coefficients(tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 1,
-                             tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
-        }
-        if (cbf_cr) {
-          scale_coefficients(tctx, x0/2,y0/2, xCUBase/2,yCUBase/2, nT/2, 2,
-                             tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
-        }
-      }
-      else if (blkIdx==3) {
-        if (cbf_cb) {
-          scale_coefficients(tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 1,
-                             tctx->transform_skip_flag[1], PredMode==MODE_INTRA);
-        }
-        if (cbf_cr) {
-          scale_coefficients(tctx, xBase/2,yBase/2, xCUBase/2,yCUBase/2, nT, 2,
-                             tctx->transform_skip_flag[2], PredMode==MODE_INTRA);
-        }
-      }
-    }
   }
 }
 
@@ -3050,8 +4020,8 @@ void read_mvd_coding(thread_context* tctx,
   }
 
   //set_mvd(tctx->decctx, x0,y0, refList, value[0],value[1]);
-  tctx->mvd[refList][0] = value[0];
-  tctx->mvd[refList][1] = value[1];
+  tctx->motion.mvd[refList][0] = value[0];
+  tctx->motion.mvd[refList][1] = value[1];
 
   logtrace(LogSlice, "MVD[%d;%d|%d] = %d;%d\n",x0,y0,refList, value[0],value[1]);
 }
@@ -3061,23 +4031,20 @@ void read_prediction_unit_SKIP(thread_context* tctx,
                                int x0, int y0,
                                int nPbW, int nPbH)
 {
-  slice_segment_header* shdr = tctx->shdr;
-
-  int merge_idx;
-  if (shdr->MaxNumMergeCand>1) {
-    merge_idx = decode_merge_idx(tctx);
-  }
-  else {
-    merge_idx = 0;
-  }
+  int merge_idx = decode_merge_idx(tctx);
 
-  tctx->merge_idx = merge_idx;
-  tctx->merge_flag = true;
+  tctx->motion.merge_idx = merge_idx;
+  tctx->motion.merge_flag = true;
 
   logtrace(LogSlice,"prediction skip 2Nx2N, merge_idx: %d\n",merge_idx);
 }
 
 
+/* xC/yC : CB position
+   xB/yB : position offset of the PB
+   nPbW/nPbH : size of PB
+   nCS   : CB size
+ */
 void read_prediction_unit(thread_context* tctx,
                           int xC,int yC, int xB,int yB,
                           int nPbW, int nPbH,
@@ -3091,21 +4058,14 @@ void read_prediction_unit(thread_context* tctx,
   slice_segment_header* shdr = tctx->shdr;
 
   int merge_flag = decode_merge_flag(tctx);
-  tctx->merge_flag = merge_flag;
+  tctx->motion.merge_flag = merge_flag;
 
   if (merge_flag) {
-    int merge_idx;
-
-    if (shdr->MaxNumMergeCand>1) {
-      merge_idx = decode_merge_idx(tctx);
-    }
-    else {
-      merge_idx = 0;
-    }
+    int merge_idx = decode_merge_idx(tctx);
 
     logtrace(LogSlice,"prediction unit %d,%d, merge mode, index: %d\n",x0,y0,merge_idx);
 
-    tctx->merge_idx = merge_idx;
+    tctx->motion.merge_idx = merge_idx;
   }
   else { // no merge flag
     enum InterPredIdc inter_pred_idc;
@@ -3117,109 +4077,121 @@ void read_prediction_unit(thread_context* tctx,
       inter_pred_idc = PRED_L0;
     }
 
-    tctx->inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc);
+    tctx->motion.inter_pred_idc = inter_pred_idc; // set_inter_pred_idc(ctx,x0,y0, inter_pred_idc);
 
     if (inter_pred_idc != PRED_L1) {
       int ref_idx_l0 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l0_active);
 
       // NOTE: case for only one reference frame is handles in decode_ref_idx_lX()
-      tctx->refIdx[0] = ref_idx_l0;
+      tctx->motion.refIdx[0] = ref_idx_l0;
 
       read_mvd_coding(tctx,x0,y0, 0);
 
       int mvp_l0_flag = decode_mvp_lx_flag(tctx); // l0
-      tctx->mvp_lX_flag[0] = mvp_l0_flag;
+      tctx->motion.mvp_l0_flag = mvp_l0_flag;
 
       logtrace(LogSlice,"prediction unit %d,%d, L0, refIdx=%d mvp_l0_flag:%d\n",
-               x0,y0, tctx->refIdx[0], mvp_l0_flag);
+               x0,y0, tctx->motion.refIdx[0], mvp_l0_flag);
     }
 
     if (inter_pred_idc != PRED_L0) {
       int ref_idx_l1 = decode_ref_idx_lX(tctx, shdr->num_ref_idx_l1_active);
 
       // NOTE: case for only one reference frame is handles in decode_ref_idx_lX()
-      tctx->refIdx[1] = ref_idx_l1;
+      tctx->motion.refIdx[1] = ref_idx_l1;
 
       if (shdr->mvd_l1_zero_flag &&
           inter_pred_idc == PRED_BI) {
-        tctx->mvd[1][0] = 0;
-        tctx->mvd[1][1] = 0;
+        tctx->motion.mvd[1][0] = 0;
+        tctx->motion.mvd[1][1] = 0;
       }
       else {
         read_mvd_coding(tctx,x0,y0, 1);
       }
 
       int mvp_l1_flag = decode_mvp_lx_flag(tctx); // l1
-      tctx->mvp_lX_flag[1] = mvp_l1_flag;
+      tctx->motion.mvp_l1_flag = mvp_l1_flag;
 
       logtrace(LogSlice,"prediction unit %d,%d, L1, refIdx=%d mvp_l1_flag:%d\n",
-               x0,y0, tctx->refIdx[1], mvp_l1_flag);
+               x0,y0, tctx->motion.refIdx[1], mvp_l1_flag);
     }
   }
 
 
 
-  decode_prediction_unit(tctx, xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx);
+  decode_prediction_unit(tctx->decctx, tctx->shdr, tctx->img, tctx->motion,
+                         xC,yC,xB,yB, nCS, nPbW,nPbH, partIdx);
 }
 
 
 
 
-static void read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSize)
+template <class pixel_t>  // pixel_t: sample storage type; read_pcm_samples instantiates uint8_t and uint16_t
+void read_pcm_samples_internal(thread_context* tctx, int x0, int y0, int log2CbSize,
+                               int cIdx, bitreader& br)  // reads the PCM samples of one plane (cIdx) of a coding block from br
 {
-  bitreader br;
-  br.data            = tctx->cabac_decoder.bitstream_curr;
-  br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr;
-  br.nextbits = 0;
-  br.nextbits_cnt = 0;
-
   const seq_parameter_set* sps = &tctx->img->sps;
-  //fprintf(stderr,"PCM pos: %d %d (POC=%d)\n",x0,y0,tctx->decctx->img->PicOrderCntVal);
 
-  int nBitsY = sps->pcm_sample_bit_depth_luma;
-  int nBitsC = sps->pcm_sample_bit_depth_chroma;
+  int nPcmBits;
+  int bitDepth;
+
+  int w = 1<<log2CbSize;
+  int h = 1<<log2CbSize;
+
+  if (cIdx>0) {  // planes 1 and 2: scale block size and position by the chroma subsampling factors
+    w /= tctx->img->sps.SubWidthC;
+    h /= tctx->img->sps.SubHeightC;
 
-  int wY = 1<<log2CbSize;
-  int wC = 1<<(log2CbSize-1);
+    x0 /= tctx->img->sps.SubWidthC;
+    y0 /= tctx->img->sps.SubHeightC;
+
+    nPcmBits = sps->pcm_sample_bit_depth_chroma;
+    bitDepth = sps->BitDepth_C;
+  }
+  else {  // plane 0: luma bit depths, block size and position used as given
+    nPcmBits = sps->pcm_sample_bit_depth_luma;
+    bitDepth = sps->BitDepth_Y;
+  }
 
-  uint8_t* yPtr;
-  uint8_t* cbPtr;
-  uint8_t* crPtr;
+  pixel_t* ptr;
   int stride;
-  int chroma_stride;
-  yPtr  = tctx->img->get_image_plane(0);
-  cbPtr = tctx->img->get_image_plane(1);
-  crPtr = tctx->img->get_image_plane(2);
-  stride = tctx->img->get_image_stride(0);
-  chroma_stride = tctx->img->get_image_stride(1);
-
-  yPtr  = &yPtr [y0*stride + x0];
-  cbPtr = &cbPtr[y0/2*chroma_stride + x0/2];
-  crPtr = &crPtr[y0/2*chroma_stride + x0/2];
-
-  int shiftY = sps->BitDepth_Y - nBitsY;
-  int shiftC = sps->BitDepth_C - nBitsC;
-
-  for (int y=0;y<wY;y++)
-    for (int x=0;x<wY;x++)
-      {
-        int value = get_bits(&br, nBitsY);
-        yPtr[y*stride+x] = value << shiftY;
-      }
+  ptr    = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx,x0,y0);
+  stride = tctx->img->get_image_stride(cIdx);  // used below as an offset in pixel_t elements
 
-  for (int y=0;y<wC;y++)
-    for (int x=0;x<wC;x++)
-      {
-        int value = get_bits(&br, nBitsC);
-        cbPtr[y*chroma_stride+x] = value << shiftC;
-      }
+  int shift = bitDepth - nPcmBits;  // NOTE(review): assumes nPcmBits <= bitDepth; a negative shift count would be undefined - confirm SPS validation
 
-  for (int y=0;y<wC;y++)
-    for (int x=0;x<wC;x++)
+  for (int y=0;y<h;y++)
+    for (int x=0;x<w;x++)
       {
-        int value = get_bits(&br, nBitsC);
-        crPtr[y*chroma_stride+x] = value << shiftC;
+        int value = get_bits(&br, nPcmBits);  // fixed-length read of nPcmBits bits from the bitreader
+        ptr[y*stride+x] = value << shift;  // scale the PCM sample up to the plane's bit depth
       }
+}
+
+static void read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSize)
+{
+  bitreader br;
+  br.data            = tctx->cabac_decoder.bitstream_curr;
+  br.bytes_remaining = tctx->cabac_decoder.bitstream_end - tctx->cabac_decoder.bitstream_curr;
+  br.nextbits = 0;
+  br.nextbits_cnt = 0;
+
+
+  if (tctx->img->high_bit_depth(0)) {
+    read_pcm_samples_internal<uint16_t>(tctx,x0,y0,log2CbSize,0,br);
+  } else {
+    read_pcm_samples_internal<uint8_t>(tctx,x0,y0,log2CbSize,0,br);
+  }
+
+  if (tctx->img->sps.ChromaArrayType != CHROMA_MONO) {
+    if (tctx->img->high_bit_depth(1)) {
+      read_pcm_samples_internal<uint16_t>(tctx,x0,y0,log2CbSize,1,br);
+      read_pcm_samples_internal<uint16_t>(tctx,x0,y0,log2CbSize,2,br);
+    } else {
+      read_pcm_samples_internal<uint8_t>(tctx,x0,y0,log2CbSize,1,br);
+      read_pcm_samples_internal<uint8_t>(tctx,x0,y0,log2CbSize,2,br);
+    }
+  }
 
   prepare_for_CABAC(&br);
   tctx->cabac_decoder.bitstream_curr = br.data;
@@ -3227,6 +4199,35 @@ static void read_pcm_samples(thread_context* tctx, int x0, int y0, int log2CbSiz
 }
 
 
+int map_chroma_pred_mode(int intra_chroma_pred_mode, int IntraPredMode)
+{  // Maps intra_chroma_pred_mode (valid inputs 0..4) and the given IntraPredMode to the chroma intra prediction mode.
+  if (intra_chroma_pred_mode==4) {  // mode 4: chroma reuses IntraPredMode unchanged
+    return IntraPredMode;
+  }
+  else {
+    static const enum IntraPredMode IntraPredModeCCand[4] = {  // candidates for intra_chroma_pred_mode 0..3
+      INTRA_PLANAR,
+      INTRA_ANGULAR_26, // vertical
+      INTRA_ANGULAR_10, // horizontal
+      INTRA_DC
+    };
+
+    int IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode];  // no range check: callers must pass 0..3 on this path
+    if (IntraPredModeC == IntraPredMode) {  // candidate equals IntraPredMode: substitute angular mode 34
+      return INTRA_ANGULAR_34;
+    }
+    else {
+      return IntraPredModeC;
+    }
+  }
+}
+
+// h.265-V2 Table 8-3; 35 entries - presumably indexed by intra prediction mode 0..34 for 4:2:2 chroma (usage not visible here, confirm)
+static const uint8_t map_chroma_422[35] = {
+  0,1,2, 2, 2, 2, 3, 5, 7, 8,10,12,13,15,17,18,19,20,
+  21,22,23,23,24,24,25,25,26,27,27,28,28,29,29,30,31
+};
+
 void read_coding_unit(thread_context* tctx,
                       int x0, int y0,  // position of coding unit in frame
                       int log2CbSize,
@@ -3240,7 +4241,15 @@ void read_coding_unit(thread_context* tctx,
   logtrace(LogSlice,"- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<<log2CbSize);
 
 
-  img->set_log2CbSize(x0,y0, log2CbSize);
+  //QQprintf("- read_coding_unit %d;%d cbsize:%d\n",x0,y0,1<<log2CbSize);
+
+  img->set_log2CbSize(x0,y0, log2CbSize, true);
+
+  /* This is only required on corrupted input streams.
+     It may happen that there are several slices in the image that overlap.
+     In this case, flags would accumulate from both slices.
+  */
+  img->clear_split_transform_flags(x0,y0, log2CbSize);
 
   int nCbS = 1<<log2CbSize; // number of coding block samples
 
@@ -3283,7 +4292,8 @@ void read_coding_unit(thread_context* tctx,
     // DECODE
 
     int nCS_L = 1<<log2CbSize;
-    decode_prediction_unit(tctx,x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0);
+    decode_prediction_unit(tctx->decctx,tctx->shdr,tctx->img,tctx->motion,
+                           x0,y0, 0,0, nCS_L, nCS_L,nCS_L, 0);
   }
   else /* not skipped */ {
     if (shdr->slice_type != SLICE_TYPE_I) {
@@ -3349,6 +4359,9 @@ void read_coding_unit(thread_context* tctx,
         int mpm_idx[4], rem_intra_luma_pred_mode[4];
         idx=0;
 
+        int availableA0 = check_CTB_available(img, x0,y0, x0-1,y0);
+        int availableB0 = check_CTB_available(img, x0,y0, x0,y0-1);
+
         for (int j=0;j<nCbS;j+=pbOffset)
           for (int i=0;i<nCbS;i+=pbOffset)
             {
@@ -3367,77 +4380,17 @@ void read_coding_unit(thread_context* tctx,
 
               int IntraPredMode;
 
-              int availableA = check_CTB_available(img, shdr, x,y, x-1,y);
-              int availableB = check_CTB_available(img, shdr, x,y, x,y-1);
-
-              int PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
-
-              // block on left side
-
-              enum IntraPredMode candIntraPredModeA, candIntraPredModeB;
-              if (availableA==false) {
-                candIntraPredModeA=INTRA_DC;
-              }
-              else if (img->get_pred_mode(x-1,y) != MODE_INTRA ||
-                       img->get_pcm_flag (x-1,y)) {
-                candIntraPredModeA=INTRA_DC;
-              }
-              else {
-                candIntraPredModeA = img->get_IntraPredMode_atIndex(PUidx-1);
-              }
+              int availableA = availableA0 || (i>0); // left candidate always available for right blk
+              int availableB = availableB0 || (j>0); // top candidate always available for bottom blk
 
-              // block above
 
-              if (availableB==false) {
-                candIntraPredModeB=INTRA_DC;
-              }
-              else if (img->get_pred_mode(x,y-1) != MODE_INTRA ||
-                       img->get_pcm_flag (x,y-1)) {
-                candIntraPredModeB=INTRA_DC;
-              }
-              else if (y-1 < ((y >> sps->Log2CtbSizeY) << sps->Log2CtbSizeY)) {
-                candIntraPredModeB=INTRA_DC;
-              }
-              else {
-                candIntraPredModeB = img->get_IntraPredMode_atIndex(PUidx-sps->PicWidthInMinPUs);
-              }
 
-              // build candidate list
+              int PUidx = (x>>sps->Log2MinPUSize) + (y>>sps->Log2MinPUSize)*sps->PicWidthInMinPUs;
 
               int candModeList[3];
 
-              logtrace(LogSlice,"availableA:%d candA:%d & availableB:%d candB:%d\n",
-                       availableA, candIntraPredModeA,
-                       availableB, candIntraPredModeB);
-
-              if (candIntraPredModeA == candIntraPredModeB) {
-                if (candIntraPredModeA < 2) {
-                  candModeList[0] = INTRA_PLANAR;
-                  candModeList[1] = INTRA_DC;
-                  candModeList[2] = INTRA_ANGULAR_26; 
-                }
-                else {
-                  candModeList[0] = candIntraPredModeA;
-                  candModeList[1] = 2 + ((candIntraPredModeA-2 -1 +32) % 32);
-                  candModeList[2] = 2 + ((candIntraPredModeA-2 +1    ) % 32);
-                }
-              }
-              else {
-                candModeList[0] = candIntraPredModeA;
-                candModeList[1] = candIntraPredModeB;
-
-                if (candIntraPredModeA != INTRA_PLANAR &&
-                    candIntraPredModeB != INTRA_PLANAR) {
-                  candModeList[2] = INTRA_PLANAR;
-                }
-                else if (candIntraPredModeA != INTRA_DC &&
-                         candIntraPredModeB != INTRA_DC) {
-                  candModeList[2] = INTRA_DC;
-                }
-                else {
-                  candModeList[2] = INTRA_ANGULAR_26; 
-                }
-              }
+              fillIntraPredModeCandidates(candModeList,x,y,PUidx,
+                                          availableA, availableB, img);
 
               for (int i=0;i<3;i++)
                 logtrace(LogSlice,"candModeList[%d] = %d\n", i, candModeList[i]);
@@ -3449,13 +4402,13 @@ void read_coding_unit(thread_context* tctx,
                 // sort candModeList
 
                 if (candModeList[0] > candModeList[1]) {
-                  int t = candModeList[0]; candModeList[0]=candModeList[1]; candModeList[1]=t;
+                  std::swap(candModeList[0],candModeList[1]);
                 }
                 if (candModeList[0] > candModeList[2]) {
-                  int t = candModeList[0]; candModeList[0]=candModeList[2]; candModeList[2]=t;
+                  std::swap(candModeList[0],candModeList[2]);
                 }
                 if (candModeList[1] > candModeList[2]) {
-                  int t = candModeList[1]; candModeList[1]=candModeList[2]; candModeList[2]=t;
+                  std::swap(candModeList[1],candModeList[2]);
                 }
 
                 // skip modes in the list
@@ -3477,32 +4430,45 @@ void read_coding_unit(thread_context* tctx,
 
         // set chroma intra prediction mode
 
-        int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx);
+        if (sps->ChromaArrayType == CHROMA_444) {
+          // chroma 4:4:4
+
+          idx = 0;
+          for (int j=0;j<nCbS;j+=pbOffset)
+            for (int i=0;i<nCbS;i+=pbOffset) {
+              int x = x0+i;
+              int y = y0+j;
+
+              int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx);
+              int IntraPredMode = img->get_IntraPredMode(x,y);
 
-        int IntraPredMode = img->get_IntraPredMode(x0,y0);
-        logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode);
+              int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode);
 
-        int IntraPredModeC;
-        if (intra_chroma_pred_mode==4) {
-          IntraPredModeC = IntraPredMode;
-        }
-        else {
-          static enum IntraPredMode IntraPredModeCCand[4] = {
-            INTRA_PLANAR,
-            INTRA_ANGULAR_26, // vertical
-            INTRA_ANGULAR_10, // horizontal
-            INTRA_DC
-          };
-
-          IntraPredModeC = IntraPredModeCCand[intra_chroma_pred_mode];
-          if (IntraPredModeC == IntraPredMode) {
-            IntraPredModeC = INTRA_ANGULAR_34;
-          }
+              logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d (blksize:%d)\n",x,y,IntraPredModeC,
+                       1<<log2IntraPredSize);
+
+              img->set_IntraPredModeC(x,y, log2IntraPredSize,
+                                      (enum IntraPredMode)IntraPredModeC,
+                                      intra_chroma_pred_mode == 4);
+              idx++;
+            }
         }
+        else if (sps->ChromaArrayType != CHROMA_MONO) {
+          // chroma 4:2:0 and 4:2:2
 
-        logtrace(LogSlice,"IntraPredModeC[%d][%d]: %d\n",x0,y0,IntraPredModeC);
+          int intra_chroma_pred_mode = decode_intra_chroma_pred_mode(tctx);
+          int IntraPredMode = img->get_IntraPredMode(x0,y0);
+          logtrace(LogSlice,"IntraPredMode: %d\n",IntraPredMode);
+          int IntraPredModeC = map_chroma_pred_mode(intra_chroma_pred_mode, IntraPredMode);
 
-        tctx->IntraPredModeC = (enum IntraPredMode) IntraPredModeC;
+          if (sps->ChromaArrayType == CHROMA_422) {
+            IntraPredModeC = map_chroma_422[ IntraPredModeC ];
+          }
+
+          img->set_IntraPredModeC(x0,y0, log2CbSize,
+                                  (enum IntraPredMode)IntraPredModeC,
+                                  intra_chroma_pred_mode == 4);
+        }
       }
     }
     else { // INTER
@@ -3552,7 +4518,7 @@ void read_coding_unit(thread_context* tctx,
     if (!pcm_flag) { // !pcm
       bool rqt_root_cbf;
 
-      uint8_t merge_flag = tctx->merge_flag; // !!get_merge_flag(ctx,x0,y0);
+      uint8_t merge_flag = tctx->motion.merge_flag; // !!get_merge_flag(ctx,x0,y0);
 
       if (cuPredMode != MODE_INTRA &&
           !(PartMode == PART_2Nx2N && merge_flag)) {
@@ -3560,6 +4526,11 @@ void read_coding_unit(thread_context* tctx,
         rqt_root_cbf = !!decode_rqt_root_cbf(tctx);
       }
       else {
+        /* rqt_root_cbf=1 is inferred for Inter blocks with 2Nx2N, merge mode.
+           There must be some residual data, because otherwise, the CB could
+           also be coded in SKIP mode.
+         */
+
         rqt_root_cbf = true;
       }
 
@@ -3577,8 +4548,14 @@ void read_coding_unit(thread_context* tctx,
 
         logtrace(LogSlice,"MaxTrafoDepth: %d\n",MaxTrafoDepth);
 
+        uint8_t initial_chroma_cbf = 1;
+        if (sps->ChromaArrayType == CHROMA_MONO) {
+          initial_chroma_cbf = 0;
+        }
+
         read_transform_tree(tctx, x0,y0, x0,y0, x0,y0, log2CbSize, 0,0,
-                            MaxTrafoDepth, IntraSplitFlag, cuPredMode, 1,1);
+                            MaxTrafoDepth, IntraSplitFlag, cuPredMode,
+                            initial_chroma_cbf, initial_chroma_cbf);
       }
     } // !pcm
   }
@@ -3625,6 +4602,12 @@ void read_coding_quadtree(thread_context* tctx,
       // shdr->CuQpDelta = 0; // TODO check: is this the right place to set to default value ?
     }
 
+
+  if (tctx->shdr->cu_chroma_qp_offset_enabled_flag &&
+      log2CbSize >= img->pps.Log2MinCuChromaQpOffsetSize) {
+    tctx->IsCuChromaQpOffsetCoded = 0;
+  }
+
   if (split_flag) {
     int x1 = x0 + (1<<(log2CbSize-1));
     int y1 = y0 + (1<<(log2CbSize-1));
@@ -3675,6 +4658,8 @@ enum DecodeResult decode_substream(thread_context* tctx,
 
   const int startCtbY = tctx->CtbY;
 
+  //printf("start decoding substream at %d;%d\n",tctx->CtbX,tctx->CtbY);
+
   // in WPP mode: initialize CABAC model with stored model from row above
 
   if ((!first_independent_substream || tctx->CtbY != startCtbY) &&
@@ -3682,17 +4667,22 @@ enum DecodeResult decode_substream(thread_context* tctx,
       tctx->CtbY>=1 && tctx->CtbX==0)
     {
       if (sps->PicWidthInCtbsY>1) {
+        if ((tctx->CtbY-1) >= tctx->imgunit->ctx_models.size()) {
+          return Decode_Error;
+        }
+
+        //printf("CTX wait on %d/%d\n",1,tctx->CtbY-1);
+
         // we have to wait until the context model data is there
         tctx->img->wait_for_progress(tctx->task, 1,tctx->CtbY-1,CTB_PROGRESS_PREFILTER);
 
         // copy CABAC model from previous CTB row
-        memcpy(tctx->ctx_model,
-               &tctx->imgunit->ctx_models[(tctx->CtbY-1) * CONTEXT_MODEL_TABLE_LENGTH],
-               CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
+        tctx->ctx_model = tctx->imgunit->ctx_models[(tctx->CtbY-1)];
+        tctx->imgunit->ctx_models[(tctx->CtbY-1)].release(); // not used anymore
       }
       else {
         tctx->img->wait_for_progress(tctx->task, 0,tctx->CtbY-1,CTB_PROGRESS_PREFILTER);
-        initialize_CABAC(tctx);
+        initialize_CABAC_models(tctx);
       }
     }
 
@@ -3701,17 +4691,33 @@ enum DecodeResult decode_substream(thread_context* tctx,
     const int ctbx = tctx->CtbX;
     const int ctby = tctx->CtbY;
 
+    if (ctbx+ctby*ctbW >= tctx->img->pps.CtbAddrRStoTS.size()) {
+        return Decode_Error;
+    }
+
+    if (ctbx >= tctx->img->sps.PicWidthInCtbsY ||
+        ctby >= tctx->img->sps.PicHeightInCtbsY) {
+        return Decode_Error;
+    }
+
     if (block_wpp && ctby>0 && ctbx < ctbW-1) {
-      //printf("wait on %d/%d\n",ctbx+1,ctby-1);
+
+      // TODO: if we are in tiles mode and at the right border, do not wait for x+1,y-1
+
+      //printf("wait on %d/%d (%d)\n",ctbx+1,ctby-1, ctbx+1+(ctby-1)*sps->PicWidthInCtbsY);
 
       tctx->img->wait_for_progress(tctx->task, ctbx+1,ctby-1, CTB_PROGRESS_PREFILTER);
     }
 
-    //printf("%p: decode %d;%d\n", tctx, tctx->CtbY,tctx->CtbX);
+    //printf("%p: decode %d;%d\n", tctx, tctx->CtbX,tctx->CtbY);
 
 
     // read and decode CTB
 
+    if (tctx->ctx_model.empty() == false) {
+      return Decode_Error;
+    }
+
     read_coding_tree_unit(tctx);
 
 
@@ -3721,26 +4727,30 @@ enum DecodeResult decode_substream(thread_context* tctx,
         ctbx == 1 &&
         ctby < sps->PicHeightInCtbsY-1)
       {
-        context_model* ctx_store = &tctx->imgunit->ctx_models[ctby * CONTEXT_MODEL_TABLE_LENGTH];
+        // no storage for context table has been allocated
+        if (tctx->imgunit->ctx_models.size() <= ctby) {
+          return Decode_Error;
+        }
 
-        memcpy(ctx_store,
-               &tctx->ctx_model,
-               CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
+        tctx->imgunit->ctx_models[ctby] = tctx->ctx_model;
+        tctx->imgunit->ctx_models[ctby].decouple(); // store an independent copy
       }
 
 
     // end of slice segment ?
 
     int end_of_slice_segment_flag = decode_CABAC_term_bit(&tctx->cabac_decoder);
+    //printf("end-of-slice flag: %d\n", end_of_slice_segment_flag);
 
     if (end_of_slice_segment_flag) {
       // at the end of the slice segment, we store the CABAC model if we need it
       // because a dependent slice may follow
 
       if (pps->dependent_slice_segments_enabled_flag) {
-        memcpy(tctx->shdr->ctx_model_storage,
-               tctx->ctx_model,
-               CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
+        tctx->shdr->ctx_model_storage = tctx->ctx_model;
+        tctx->shdr->ctx_model_storage.decouple(); // store an independent copy
+
+        tctx->shdr->ctx_model_storage_defined = true;
       }
     }
 
@@ -3766,6 +4776,17 @@ enum DecodeResult decode_substream(thread_context* tctx,
 
 
     if (end_of_slice_segment_flag) {
+      /* corrupted inputs may send the end_of_slice_segment_flag even if not all
+         CTBs in a row have been coded. Hence, we mark all of them as finished.
+       */
+
+      /*
+      for (int x = ctbx+1 ; x<sps->PicWidthInCtbsY; x++) {
+        printf("mark skipped %d;%d\n",x,ctby);
+        tctx->img->ctb_progress[x+ctby*ctbW].set_progress(CTB_PROGRESS_PREFILTER);
+      }
+      */
+
       return Decode_EndOfSliceSegment;
     }
 
@@ -3795,7 +4816,7 @@ enum DecodeResult decode_substream(thread_context* tctx,
 
 
 
-void initialize_CABAC_at_slice_segment_start(thread_context* tctx)
+bool initialize_CABAC_at_slice_segment_start(thread_context* tctx)
 {
   de265_image* img = tctx->img;
   const pic_parameter_set* pps = &img->pps;
@@ -3805,45 +4826,93 @@ void initialize_CABAC_at_slice_segment_start(thread_context* tctx)
   if (shdr->dependent_slice_segment_flag) {
     int prevCtb = pps->CtbAddrTStoRS[ pps->CtbAddrRStoTS[shdr->slice_segment_address] -1 ];
 
-    slice_segment_header* prevCtbHdr = img->slices[ img->get_SliceHeaderIndex_atIndex(prevCtb) ];
+    int sliceIdx = img->get_SliceHeaderIndex_atIndex(prevCtb);
+    if (sliceIdx >= img->slices.size()) {
+      return false;
+    }
+    slice_segment_header* prevCtbHdr = img->slices[ sliceIdx ];
 
     if (pps->is_tile_start_CTB(shdr->slice_segment_address % sps->PicWidthInCtbsY,
                                shdr->slice_segment_address / sps->PicWidthInCtbsY
                                )) {
-      initialize_CABAC(tctx);
+      initialize_CABAC_models(tctx);
     }
     else {
+      // wait for previous slice to finish decoding
+
+      //printf("wait for previous slice to finish decoding\n");
+
+
+      slice_unit* prevSliceSegment = tctx->imgunit->get_prev_slice_segment(tctx->sliceunit);
+      //assert(prevSliceSegment);
+      if (prevSliceSegment==NULL) {
+        return false;
+      }
+
+      prevSliceSegment->finished_threads.wait_for_progress(prevSliceSegment->nThreads);
+
+
+      /*
+      printf("wait for %d,%d (init)\n",
+             prevCtb / sps->PicWidthInCtbsY,
+             prevCtb % sps->PicWidthInCtbsY);
       tctx->img->wait_for_progress(tctx->task, prevCtb, CTB_PROGRESS_PREFILTER);
+      */
+
+      if (!prevCtbHdr->ctx_model_storage_defined) {
+        return false;
+      }
 
-      memcpy(tctx->ctx_model,
-             prevCtbHdr->ctx_model_storage,
-             CONTEXT_MODEL_TABLE_LENGTH * sizeof(context_model));
+      tctx->ctx_model = prevCtbHdr->ctx_model_storage;
+      prevCtbHdr->ctx_model_storage.release();
     }
   }
   else {
-    initialize_CABAC(tctx);
+    initialize_CABAC_models(tctx);
   }
+
+  return true;
+}
+
+
+std::string thread_task_ctb_row::name() const {
+  char buf[100];
+  sprintf(buf,"ctb-row-%d",debug_startCtbRow);
+  return buf;
+}
+
+
+std::string thread_task_slice_segment::name() const {
+  char buf[100];
+  sprintf(buf,"slice-segment-%d;%d",debug_startCtbX,debug_startCtbY);
+  return buf;
 }
 
 
 void thread_task_slice_segment::work()
 {
-  struct thread_task_slice_segment* data = this;
+  thread_task_slice_segment* data = this;
   thread_context* tctx = data->tctx;
   de265_image* img = tctx->img;
 
   state = Running;
-  img->thread_run();
+  img->thread_run(this);
 
   setCtbAddrFromTS(tctx);
 
   //printf("%p: A start decoding at %d/%d\n", tctx, tctx->CtbX,tctx->CtbY);
 
   if (data->firstSliceSubstream) {
-    initialize_CABAC_at_slice_segment_start(tctx);
+    bool success = initialize_CABAC_at_slice_segment_start(tctx);
+    if (!success) {
+      state = Finished;
+      tctx->sliceunit->finished_threads.increase_progress(1);
+      img->thread_finishes(this);
+      return;
+    }
   }
   else {
-    initialize_CABAC(tctx);
+    initialize_CABAC_models(tctx);
   }
 
   init_CABAC_decoder_2(&tctx->cabac_decoder);
@@ -3851,7 +4920,8 @@ void thread_task_slice_segment::work()
   /*enum DecodeResult result =*/ decode_substream(tctx, false, data->firstSliceSubstream);
 
   state = Finished;
-  img->thread_finishes();
+  tctx->sliceunit->finished_threads.increase_progress(1);
+  img->thread_finishes(this);
 
   return; // DE265_OK;
 }
@@ -3859,7 +4929,7 @@ void thread_task_slice_segment::work()
 
 void thread_task_ctb_row::work()
 {
-  struct thread_task_ctb_row* data = this;
+  thread_task_ctb_row* data = this;
   thread_context* tctx = data->tctx;
   de265_image* img = tctx->img;
 
@@ -3867,17 +4937,28 @@ void thread_task_ctb_row::work()
   int ctbW = sps->PicWidthInCtbsY;
 
   state = Running;
-  img->thread_run();
+  img->thread_run(this);
 
   setCtbAddrFromTS(tctx);
 
   int ctby = tctx->CtbAddrInRS / ctbW;
   int myCtbRow = ctby;
 
-  // printf("start decoding at %d/%d\n", ctbx,ctby);
+  //printf("start CTB-row decoding at row %d\n", ctby);
 
   if (data->firstSliceSubstream) {
-    initialize_CABAC_at_slice_segment_start(tctx);
+    bool success = initialize_CABAC_at_slice_segment_start(tctx);
+    if (!success) {
+      // could not decode this row, mark whole row as finished
+      for (int x=0;x<ctbW;x++) {
+        img->ctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER);
+      }
+
+      state = Finished;
+      tctx->sliceunit->finished_threads.increase_progress(1);
+      img->thread_finishes(this);
+      return;
+    }
     //initialize_CABAC(tctx);
   }
 
@@ -3893,17 +4974,20 @@ void thread_task_ctb_row::work()
 
   // TODO: what about slices that end properly in the middle of a CTB row?
 
-#if 1
   if (tctx->CtbY == myCtbRow) {
     int lastCtbX = sps->PicWidthInCtbsY; // assume no tiles when WPP is on
     for (int x = tctx->CtbX; x<lastCtbX ; x++) {
-      img->ctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER);
+
+      if (x        < img->sps.PicWidthInCtbsY &&
+          myCtbRow < img->sps.PicHeightInCtbsY) {
+        img->ctb_progress[myCtbRow*ctbW + x].set_progress(CTB_PROGRESS_PREFILTER);
+      }
     }
   }
-#endif
 
   state = Finished;
-  img->thread_finishes();
+  tctx->sliceunit->finished_threads.increase_progress(1);
+  img->thread_finishes(this);
 }
 
 
@@ -3916,11 +5000,14 @@ de265_error read_slice_segment_data(thread_context* tctx)
   const seq_parameter_set* sps = &img->sps;
   slice_segment_header* shdr = tctx->shdr;
 
-  initialize_CABAC_at_slice_segment_start(tctx);
+  bool success = initialize_CABAC_at_slice_segment_start(tctx);
+  if (!success) {
+    return DE265_ERROR_UNSPECIFIED_DECODING_ERROR;
+  }
 
   init_CABAC_decoder_2(&tctx->cabac_decoder);
 
-  // printf("-----\n");
+  //printf("-----\n");
 
   bool first_slice_substream = !shdr->dependent_slice_segment_flag;
 
@@ -3942,7 +5029,7 @@ de265_error read_slice_segment_data(thread_context* tctx)
     }
 
     substream++;
-        
+
 
     result = decode_substream(tctx, false, first_slice_substream);
 
@@ -3955,7 +5042,7 @@ de265_error read_slice_segment_data(thread_context* tctx)
     first_slice_substream = false;
 
     if (pps->tiles_enabled_flag) {
-      initialize_CABAC(tctx);
+      initialize_CABAC_models(tctx);
     }
   } while (true);
 
diff --git a/libde265/slice.h b/libde265/slice.h
index 3b898bd..2f2034c 100644
--- a/libde265/slice.h
+++ b/libde265/slice.h
@@ -29,18 +29,28 @@
 #include "libde265/util.h"
 #include "libde265/refpic.h"
 #include "libde265/threads.h"
+#include "contextmodel.h"
 
 #include <vector>
+#include <string.h>
 
 #define MAX_NUM_REF_PICS    16
 
-#define SLICE_TYPE_B 0
-#define SLICE_TYPE_P 1
-#define SLICE_TYPE_I 2
+class decoder_context;
+class thread_context;
+class error_queue;
+class seq_parameter_set;
+class pic_parameter_set;
 
+enum SliceType
+  {
+    SLICE_TYPE_B = 0,
+    SLICE_TYPE_P = 1,
+    SLICE_TYPE_I = 2
+  };
 
 /*
-        2Nx2N           2NxN             Nx2N            NxN          
+        2Nx2N           2NxN             Nx2N            NxN
       +-------+       +-------+       +---+---+       +---+---+
       |       |       |       |       |   |   |       |   |   |
       |       |       |_______|       |   |   |       |___|___|
@@ -48,7 +58,7 @@
       |       |       |       |       |   |   |       |   |   |
       +-------+       +-------+       +---+---+       +---+---+
 
-        2NxnU           2NxnD           nLx2N           nRx2N        
+        2NxnU           2NxnD           nLx2N           nRx2N
       +-------+       +-------+       +-+-----+       +-----+-+
       |_______|       |       |       | |     |       |     | |
       |       |       |       |       | |     |       |     | |
@@ -89,57 +99,49 @@ enum IntraPredMode
     INTRA_ANGULAR_22 = 22,  INTRA_ANGULAR_23 = 23,  INTRA_ANGULAR_24 = 24,  INTRA_ANGULAR_25 = 25,
     INTRA_ANGULAR_26 = 26,  INTRA_ANGULAR_27 = 27,  INTRA_ANGULAR_28 = 28,  INTRA_ANGULAR_29 = 29,
     INTRA_ANGULAR_30 = 30,  INTRA_ANGULAR_31 = 31,  INTRA_ANGULAR_32 = 32,  INTRA_ANGULAR_33 = 33,
-    INTRA_ANGULAR_34 = 34,
-    INTRA_CHROMA_EQ_LUMA = 100  // chroma := luma
+    INTRA_ANGULAR_34 = 34
+  };
+
+
+enum IntraChromaPredMode
+  {
+    INTRA_CHROMA_PLANAR_OR_34     = 0,
+    INTRA_CHROMA_ANGULAR_26_OR_34 = 1,
+    INTRA_CHROMA_ANGULAR_10_OR_34 = 2,
+    INTRA_CHROMA_DC_OR_34         = 3,
+    INTRA_CHROMA_LIKE_LUMA  = 4
   };
 
+
 enum InterPredIdc
   {
-    PRED_L0=0,
-    PRED_L1=1,
-    PRED_BI=2
+    // note: values have to match the decoding function decode_inter_pred_idc()
+    PRED_L0=1,
+    PRED_L1=2,
+    PRED_BI=3
   };
 
-enum context_model_indices {
-  CONTEXT_MODEL_SAO_MERGE_FLAG = 0,
-  CONTEXT_MODEL_SAO_TYPE_IDX   = CONTEXT_MODEL_SAO_MERGE_FLAG +1,
-  CONTEXT_MODEL_SPLIT_CU_FLAG  = CONTEXT_MODEL_SAO_TYPE_IDX + 1,
-  CONTEXT_MODEL_CU_SKIP_FLAG   = CONTEXT_MODEL_SPLIT_CU_FLAG + 3,
-  CONTEXT_MODEL_PART_MODE      = CONTEXT_MODEL_CU_SKIP_FLAG + 3,
-  CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG = CONTEXT_MODEL_PART_MODE + 4,
-  CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE    = CONTEXT_MODEL_PREV_INTRA_LUMA_PRED_FLAG + 1,
-  CONTEXT_MODEL_CBF_LUMA                  = CONTEXT_MODEL_INTRA_CHROMA_PRED_MODE + 1,
-  CONTEXT_MODEL_CBF_CHROMA                = CONTEXT_MODEL_CBF_LUMA + 2,
-  CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG      = CONTEXT_MODEL_CBF_CHROMA + 4,
-  CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX = CONTEXT_MODEL_SPLIT_TRANSFORM_FLAG + 3,
-  CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_X_PREFIX + 18,
-  CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG          = CONTEXT_MODEL_LAST_SIGNIFICANT_COEFFICIENT_Y_PREFIX + 18,
-  CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG        = CONTEXT_MODEL_CODED_SUB_BLOCK_FLAG + 4,
-  CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG = CONTEXT_MODEL_SIGNIFICANT_COEFF_FLAG + 42,
-  CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER1_FLAG + 24,
-  CONTEXT_MODEL_CU_QP_DELTA_ABS        = CONTEXT_MODEL_COEFF_ABS_LEVEL_GREATER2_FLAG + 6,
-  CONTEXT_MODEL_TRANSFORM_SKIP_FLAG    = CONTEXT_MODEL_CU_QP_DELTA_ABS + 2,
-  CONTEXT_MODEL_MERGE_FLAG             = CONTEXT_MODEL_TRANSFORM_SKIP_FLAG + 2,
-  CONTEXT_MODEL_MERGE_IDX              = CONTEXT_MODEL_MERGE_FLAG + 1,
-  CONTEXT_MODEL_PRED_MODE_FLAG         = CONTEXT_MODEL_MERGE_IDX + 1,
-  CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG = CONTEXT_MODEL_PRED_MODE_FLAG + 1,
-  CONTEXT_MODEL_MVP_LX_FLAG            = CONTEXT_MODEL_ABS_MVD_GREATER01_FLAG + 2,
-  CONTEXT_MODEL_RQT_ROOT_CBF           = CONTEXT_MODEL_MVP_LX_FLAG + 1,
-  CONTEXT_MODEL_REF_IDX_LX             = CONTEXT_MODEL_RQT_ROOT_CBF + 1,
-  CONTEXT_MODEL_INTER_PRED_IDC         = CONTEXT_MODEL_REF_IDX_LX + 2,
-  CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG = CONTEXT_MODEL_INTER_PRED_IDC + 5,
-  CONTEXT_MODEL_TABLE_LENGTH           = CONTEXT_MODEL_CU_TRANSQUANT_BYPASS_FLAG + 1
-};
 
 
-typedef struct slice_segment_header {
-  slice_segment_header() { }
+class slice_segment_header {
+public:
+  slice_segment_header() {
+    reset();
+  }
+
+  de265_error read(bitreader* br, decoder_context*, bool* continueDecoding);
+  de265_error write(error_queue*, CABAC_encoder&,
+                    const seq_parameter_set* sps,
+                    const pic_parameter_set* pps,
+                    uint8_t nal_unit_type);
 
-  de265_error read(bitreader* br, struct decoder_context*, bool* continueDecoding);
   void dump_slice_segment_header(const decoder_context*, int fd) const;
 
+  void set_defaults();
+  void reset();
 
-  int  slice_index; // index through all slices in a picture
+
+  int  slice_index; // index through all slices in a picture  (internal only)
 
   char first_slice_segment_in_pic_flag;
   char no_output_of_prior_pics_flag;
@@ -203,6 +205,8 @@ typedef struct slice_segment_header {
   int  slice_cb_qp_offset;
   int  slice_cr_qp_offset;
 
+  char cu_chroma_qp_offset_enabled_flag;
+
   char deblocking_filter_override_flag;
   char slice_deblocking_filter_disabled_flag;
   int  slice_beta_offset; // = pps->beta_offset if undefined
@@ -219,17 +223,23 @@ typedef struct slice_segment_header {
 
   // --- derived data ---
 
-  int SliceAddrRS;  // start of last independent slice
   int SliceQPY;
-
   int initType;
 
-  int MaxNumMergeCand;
+  void compute_derived_values(const pic_parameter_set* pps);
+
+
+  // --- data for external modules ---
+
+  int SliceAddrRS;  // slice_segment_address of last independent slice
+
+  int MaxNumMergeCand;  // directly derived from 'five_minus_max_num_merge_cand'
   int CurrRpsIdx;
   ref_pic_set CurrRps;  // the active reference-picture set
   int NumPocTotalCurr;
 
-  int RefPicList[2][MAX_NUM_REF_PICS]; // contains indices into DPB
+  // number of entries: num_ref_idx_l0_active / num_ref_idx_l1_active
+  int RefPicList[2][MAX_NUM_REF_PICS]; // contains buffer IDs (D:indices into DPB/E:frame number)
   int RefPicList_POC[2][MAX_NUM_REF_PICS];
   int RefPicList_PicState[2][MAX_NUM_REF_PICS]; /* We have to save the PicState because the decoding
                                                    of an image may be delayed and the PicState can
@@ -240,11 +250,12 @@ typedef struct slice_segment_header {
                                                is a long-term picture. */
 
   // context storage for dependent slices (stores CABAC model at end of slice segment)
-  context_model ctx_model_storage[CONTEXT_MODEL_TABLE_LENGTH];
+  context_model_table ctx_model_storage;
+  bool ctx_model_storage_defined; // whether there is valid data in ctx_model_storage
 
   std::vector<int> RemoveReferencesList; // images that can be removed from the DPB before decoding this slice
 
-} slice_segment_header;
+};
 
 
 
@@ -255,13 +266,13 @@ typedef struct {
   unsigned char SaoEoClass; // use with (SaoTypeIdx>>(2*cIdx)) & 0x3
 
   uint8_t sao_band_position[3];
-  int8_t  saoOffsetVal[3][4]; // index with [][idx-1] as saoOffsetVal[][0]==0 always  
+  int8_t  saoOffsetVal[3][4]; // index with [][idx-1] as saoOffsetVal[][0]==0 always
 } sao_info;
 
 
 
 
-de265_error read_slice_segment_data(struct thread_context* tctx);
+de265_error read_slice_segment_data(thread_context* tctx);
 
 bool alloc_and_init_significant_coeff_ctxIdx_lookupTable();
 void free_significant_coeff_ctxIdx_lookupTable();
@@ -271,18 +282,26 @@ class thread_task_ctb_row : public thread_task
 {
 public:
   bool   firstSliceSubstream;
-  struct thread_context* tctx;
+  int    debug_startCtbRow;
+  thread_context* tctx;
 
   virtual void work();
+  virtual std::string name() const;
 };
 
 class thread_task_slice_segment : public thread_task
 {
 public:
   bool   firstSliceSubstream;
-  struct thread_context* tctx;
+  int    debug_startCtbX, debug_startCtbY;
+  thread_context* tctx;
 
   virtual void work();
+  virtual std::string name() const;
 };
 
+
+int check_CTB_available(const de265_image* img,
+                        int xC,int yC, int xN,int yN);
+
 #endif
diff --git a/libde265/sps.cc b/libde265/sps.cc
index 7fd02b3..98142f7 100644
--- a/libde265/sps.cc
+++ b/libde265/sps.cc
@@ -29,7 +29,7 @@
 
 #define READ_VLC_OFFSET(variable, vlctype, offset)   \
   if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) {   \
-    ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);  \
+    errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);  \
     return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; \
   } \
   variable = vlc + offset;
@@ -46,7 +46,7 @@ static int SubHeightC_tab[] = { -1,2,1,1 };
 
 
 // TODO: should be in some header-file of refpic.c
-extern bool read_short_term_ref_pic_set(decoder_context* ctx,
+extern bool read_short_term_ref_pic_set(error_queue* errqueue,
                                         const seq_parameter_set* sps,
                                         bitreader* br,
                                         ref_pic_set* out_set,
@@ -54,6 +54,29 @@ extern bool read_short_term_ref_pic_set(decoder_context* ctx,
                                         const std::vector<ref_pic_set>& sets,
                                         bool sliceRefPicSet);
 
+extern bool write_short_term_ref_pic_set(error_queue* errqueue,
+                                         const seq_parameter_set* sps,
+                                         CABAC_encoder& out,
+                                         const ref_pic_set* in_set, // which set to write
+                                         int idxRps,  // index of the set to be written
+                                         const std::vector<ref_pic_set>& sets, // previously read sets
+                                         bool sliceRefPicSet); // is this in the slice header?
+
+
+sps_range_extension::sps_range_extension()
+{
+  transform_skip_rotation_enabled_flag = 0;
+  transform_skip_context_enabled_flag  = 0;
+  implicit_rdpcm_enabled_flag = 0;
+  explicit_rdpcm_enabled_flag = 0;
+  extended_precision_processing_flag = 0;
+  intra_smoothing_disabled_flag = 0;
+  high_precision_offsets_enabled_flag = 0;
+  persistent_rice_adaptation_enabled_flag = 0;
+  cabac_bypass_alignment_enabled_flag = 0;
+}
+
+
 seq_parameter_set::seq_parameter_set()
 {
   // TODO: this is dangerous
@@ -70,7 +93,109 @@ seq_parameter_set::~seq_parameter_set()
 }
 
 
-de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
+void seq_parameter_set::set_defaults(enum PresetSet)
+{
+  video_parameter_set_id = 0;
+  sps_max_sub_layers = 1;
+  sps_temporal_id_nesting_flag = 1;
+
+  profile_tier_level_.general.set_defaults(Profile_Main, 6,2); // TODO
+
+  seq_parameter_set_id = 0;
+  chroma_format_idc = 1;
+  ChromaArrayType = chroma_format_idc;
+
+  separate_colour_plane_flag = 0;
+  pic_width_in_luma_samples = 0;
+  pic_height_in_luma_samples = 0;
+  conformance_window_flag = 0;
+
+  conf_win_left_offset   = 0;
+  conf_win_right_offset  = 0;
+  conf_win_top_offset    = 0;
+  conf_win_bottom_offset = 0;
+
+  bit_depth_luma  =8;
+  bit_depth_chroma=8;
+
+  log2_max_pic_order_cnt_lsb = 8;
+  sps_sub_layer_ordering_info_present_flag = 0;
+
+  sps_max_dec_pic_buffering[0] = 1;
+  sps_max_num_reorder_pics[0]  = 0;
+  sps_max_latency_increase_plus1[0] = 0;
+
+  set_CB_log2size_range(4,4);
+  set_TB_log2size_range(3,4);
+  max_transform_hierarchy_depth_inter = 1;
+  max_transform_hierarchy_depth_intra = 1;
+
+  scaling_list_enable_flag = 0;
+  sps_scaling_list_data_present_flag = 0;
+
+  // TODO struct scaling_list_data scaling_list;
+
+  amp_enabled_flag = 0;
+  sample_adaptive_offset_enabled_flag = 0;
+  pcm_enabled_flag = 0;
+
+  pcm_sample_bit_depth_luma = 8;
+  pcm_sample_bit_depth_chroma = 8;
+  // TODO log2_min_pcm_luma_coding_block_size;
+  // TODO log2_diff_max_min_pcm_luma_coding_block_size;
+  pcm_loop_filter_disable_flag = 1;
+
+  // num_short_term_ref_pic_sets = 0;
+  // std::vector<ref_pic_set> ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) )
+  ref_pic_sets.clear();
+
+  long_term_ref_pics_present_flag = 0;
+
+  num_long_term_ref_pics_sps = 0;
+
+  /* TODO
+  int  lt_ref_pic_poc_lsb_sps[MAX_NUM_LT_REF_PICS_SPS];
+  char used_by_curr_pic_lt_sps_flag[MAX_NUM_LT_REF_PICS_SPS];
+  */
+
+  sps_temporal_mvp_enabled_flag = 0;
+  strong_intra_smoothing_enable_flag = 0;
+  vui_parameters_present_flag = 0;
+
+  /*
+    if( vui_parameters_present_flag )
+      vui_parameters()
+  */
+
+  sps_extension_present_flag = 0;
+  sps_range_extension_flag = 0;
+  sps_multilayer_extension_flag = 0;
+  sps_extension_6bits = 0;
+}
+
+
+void seq_parameter_set::set_CB_log2size_range(int mini,int maxi)
+{
+  log2_min_luma_coding_block_size = mini;
+  log2_diff_max_min_luma_coding_block_size = maxi-mini;
+}
+
+
+void seq_parameter_set::set_TB_log2size_range(int mini,int maxi)
+{
+  log2_min_transform_block_size = mini;
+  log2_diff_max_min_transform_block_size = maxi-mini;
+}
+
+
+void seq_parameter_set::set_resolution(int w,int h)
+{
+  pic_width_in_luma_samples  = w;
+  pic_height_in_luma_samples = h;
+}
+
+
+de265_error seq_parameter_set::read(error_queue* errqueue, bitreader* br)
 {
   int vlc;
 
@@ -82,9 +207,12 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
 
   sps_temporal_id_nesting_flag = get_bits(br,1);
 
-  read_profile_tier_level(br,&profile_tier_level, sps_max_sub_layers);
+  profile_tier_level_.read(br, sps_max_sub_layers);
 
   READ_VLC(seq_parameter_set_id, uvlc);
+  if (seq_parameter_set_id >= DE265_MAX_SPS_SETS) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
 
 
   // --- decode chroma type ---
@@ -107,19 +235,26 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
 
   if (chroma_format_idc<0 ||
       chroma_format_idc>3) {
-    ctx->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false);
+    errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false);
     return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
   }
 
-  SubWidthC  = SubWidthC_tab [chroma_format_idc];
-  SubHeightC = SubHeightC_tab[chroma_format_idc];
-
 
   // --- picture size ---
 
   READ_VLC(pic_width_in_luma_samples,  uvlc);
   READ_VLC(pic_height_in_luma_samples, uvlc);
 
+  if (pic_width_in_luma_samples  == 0 ||
+      pic_height_in_luma_samples == 0) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (pic_width_in_luma_samples > MAX_PICTURE_WIDTH ||
+      pic_height_in_luma_samples> MAX_PICTURE_HEIGHT) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
   conformance_window_flag = get_bits(br,1);
 
   if (conformance_window_flag) {
@@ -135,16 +270,6 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
     conf_win_bottom_offset= 0;
   }
 
-  if (ChromaArrayType==0) {
-    WinUnitX = 1;
-    WinUnitY = 1;
-  }
-  else {
-    WinUnitX = SubWidthC_tab [chroma_format_idc];
-    WinUnitY = SubHeightC_tab[chroma_format_idc];
-  }
-
-
   READ_VLC_OFFSET(bit_depth_luma,  uvlc, 8);
   READ_VLC_OFFSET(bit_depth_chroma,uvlc, 8);
 
@@ -166,7 +291,7 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
     vlc=get_uvlc(br);
     if (vlc == UVLC_ERROR ||
         vlc+1 > MAX_NUM_REF_PICS) {
-      ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
       return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
     }
 
@@ -205,6 +330,12 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
   READ_VLC(log2_diff_max_min_transform_block_size, uvlc);
   READ_VLC(max_transform_hierarchy_depth_inter, uvlc);
   READ_VLC(max_transform_hierarchy_depth_intra, uvlc);
+
+  if (log2_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+  if (log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size > 6) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+  if (log2_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+  if (log2_min_transform_block_size + log2_diff_max_min_transform_block_size > 5) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+
   scaling_list_enable_flag = get_bits(br,1);
 
   if (scaling_list_enable_flag) {
@@ -240,10 +371,11 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
     pcm_loop_filter_disable_flag = 0;
   }
 
+  int num_short_term_ref_pic_sets;
   READ_VLC(num_short_term_ref_pic_sets, uvlc);
   if (num_short_term_ref_pic_sets < 0 ||
       num_short_term_ref_pic_sets > 64) {
-    ctx->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false);
+    errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false);
     return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
   }
 
@@ -255,7 +387,7 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
 
   for (int i = 0; i < num_short_term_ref_pic_sets; i++) {
 
-    bool success = read_short_term_ref_pic_set(ctx,this,br,
+    bool success = read_short_term_ref_pic_set(errqueue,this,br,
                                                &ref_pic_sets[i], i,
                                                ref_pic_sets,
                                                false);
@@ -287,36 +419,63 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
 
   sps_temporal_mvp_enabled_flag = get_bits(br,1);
   strong_intra_smoothing_enable_flag = get_bits(br,1);
-  vui_parameters_present_flag = get_bits(br,1);
 
-#if 0
+  vui_parameters_present_flag = get_bits(br,1);
   if (vui_parameters_present_flag) {
-    assert(false);
-    /*
-      vui_parameters()
+    vui.read(errqueue, br, this);
+  }
 
-        sps_extension_flag
-        u(1)
-        if( sps_extension_flag )
 
-          while( more_rbsp_data() )
+  sps_extension_present_flag = get_bits(br,1);
+  if (sps_extension_present_flag) {
+    sps_range_extension_flag = get_bits(br,1);
+    sps_multilayer_extension_flag = get_bits(br,1);
+    sps_extension_6bits = get_bits(br,6);
+  }
+  else {
+    sps_range_extension_flag = 0;
+  }
 
-            sps_extension_data_flag
-              u(1)
-              rbsp_trailing_bits()
-    */
+  if (sps_range_extension_flag) {
+    de265_error err = range_extension.read(errqueue, br);
+    if (err != DE265_OK) { return err; }
   }
 
+  /*
   sps_extension_flag = get_bits(br,1);
   if (sps_extension_flag) {
     assert(false);
   }
+  */
 
-  check_rbsp_trailing_bits(br);
-#endif
 
+  de265_error err = compute_derived_values();
+  if (err != DE265_OK) { return err; }
+
+  sps_read = true;
+
+  return DE265_OK;
+}
+
+
+de265_error seq_parameter_set::compute_derived_values()
+{
   // --- compute derived values ---
 
+  SubWidthC  = SubWidthC_tab [chroma_format_idc];
+  SubHeightC = SubHeightC_tab[chroma_format_idc];
+
+  if (ChromaArrayType==0) {
+    WinUnitX = 1;
+    WinUnitY = 1;
+  }
+  else {
+    WinUnitX = SubWidthC_tab [chroma_format_idc];
+    WinUnitY = SubHeightC_tab[chroma_format_idc];
+  }
+
+
+
   BitDepth_Y   = bit_depth_luma;
   QpBdOffset_Y = 6*(bit_depth_luma-8);
   BitDepth_C   = bit_depth_chroma;
@@ -326,9 +485,10 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
   Log2CtbSizeY = Log2MinCbSizeY + log2_diff_max_min_luma_coding_block_size;
   MinCbSizeY = 1 << Log2MinCbSizeY;
   CtbSizeY = 1 << Log2CtbSizeY;
-  PicWidthInMinCbsY = pic_width_in_luma_samples / MinCbSizeY;
+
+  PicWidthInMinCbsY = ceil_div(pic_width_in_luma_samples, MinCbSizeY);
   PicWidthInCtbsY   = ceil_div(pic_width_in_luma_samples, CtbSizeY);
-  PicHeightInMinCbsY = pic_height_in_luma_samples / MinCbSizeY;
+  PicHeightInMinCbsY = ceil_div(pic_height_in_luma_samples, MinCbSizeY);
   PicHeightInCtbsY   = ceil_div(pic_height_in_luma_samples,CtbSizeY);
   PicSizeInMinCbsY   = PicWidthInMinCbsY * PicHeightInMinCbsY;
   PicSizeInCtbsY = PicWidthInCtbsY * PicHeightInCtbsY;
@@ -346,6 +506,9 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
   Log2MinTrafoSize = log2_min_transform_block_size;
   Log2MaxTrafoSize = log2_min_transform_block_size + log2_diff_max_min_transform_block_size;
 
+  if (max_transform_hierarchy_depth_inter > Log2CtbSizeY - Log2MinTrafoSize) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+  if (max_transform_hierarchy_depth_intra > Log2CtbSizeY - Log2MinTrafoSize) { return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; }
+
   Log2MinPUSize = Log2MinCbSizeY-1;
   PicWidthInMinPUs  = PicWidthInCtbsY  << (Log2CtbSizeY - Log2MinPUSize);
   PicHeightInMinPUs = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinPUSize);
@@ -359,6 +522,47 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
   PicHeightInTbsY = PicHeightInCtbsY << (Log2CtbSizeY - Log2MinTrafoSize);
   PicSizeInTbsY = PicWidthInTbsY * PicHeightInTbsY;
 
+
+  if (range_extension.high_precision_offsets_enabled_flag) {
+    WpOffsetBdShiftY = 0;
+    WpOffsetBdShiftC = 0;
+    WpOffsetHalfRangeY = 1 << (BitDepth_Y - 1);
+    WpOffsetHalfRangeC = 1 << (BitDepth_C - 1);
+  }
+  else {
+    WpOffsetBdShiftY = ( BitDepth_Y - 8 );
+    WpOffsetBdShiftC = ( BitDepth_C - 8 );
+    WpOffsetHalfRangeY = 1 << 7;
+    WpOffsetHalfRangeC = 1 << 7;
+  }
+
+
+  // --- check SPS sanity ---
+
+  if (pic_width_in_luma_samples  % MinCbSizeY != 0 ||
+      pic_height_in_luma_samples % MinCbSizeY != 0) {
+    // TODO: warn that image size is coded wrong in bitstream (must be multiple of MinCbSizeY)
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (Log2MinTrafoSize > Log2MinCbSizeY) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (Log2MaxTrafoSize > libde265_min(Log2CtbSizeY,5)) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+
+  if (BitDepth_Y < 8 || BitDepth_Y > 16) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (BitDepth_C < 8 || BitDepth_C > 16) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+
   sps_read = true;
 
   return DE265_OK;
@@ -366,7 +570,7 @@ de265_error seq_parameter_set::read(decoder_context* ctx, bitreader* br)
 
 
 
-void seq_parameter_set::dump_sps(int fd) const
+void seq_parameter_set::dump(int fd) const
 {
   //#if (_MSC_VER >= 1500)
   //#define LOG0(t) loginfo(LogHeaders, t)
@@ -390,10 +594,11 @@ void seq_parameter_set::dump_sps(int fd) const
   LOG1("sps_max_sub_layers      : %d\n", sps_max_sub_layers);
   LOG1("sps_temporal_id_nesting_flag : %d\n", sps_temporal_id_nesting_flag);
 
-  dump_profile_tier_level(&profile_tier_level, sps_max_sub_layers, fh);
+  profile_tier_level_.dump(sps_max_sub_layers, fh);
 
   LOG1("seq_parameter_set_id    : %d\n", seq_parameter_set_id);
   LOG2("chroma_format_idc       : %d (%s)\n", chroma_format_idc,
+       chroma_format_idc == 0 ? "monochrome" :
        chroma_format_idc == 1 ? "4:2:0" :
        chroma_format_idc == 2 ? "4:2:2" :
        chroma_format_idc == 3 ? "4:4:4" : "unknown");
@@ -460,9 +665,9 @@ void seq_parameter_set::dump_sps(int fd) const
     LOG1("pcm_loop_filter_disable_flag  : %d\n", pcm_loop_filter_disable_flag);
   }
 
-  LOG1("num_short_term_ref_pic_sets : %d\n", num_short_term_ref_pic_sets);
+  LOG1("num_short_term_ref_pic_sets : %d\n", ref_pic_sets.size());
 
-  for (int i = 0; i < num_short_term_ref_pic_sets; i++) {
+  for (int i = 0; i < ref_pic_sets.size(); i++) {
     LOG1("ref_pic_set[ %2d ]: ",i);
     dump_compact_short_term_ref_pic_set(&ref_pic_sets[i], 16, fh);
   }
@@ -483,32 +688,28 @@ void seq_parameter_set::dump_sps(int fd) const
   LOG1("strong_intra_smoothing_enable_flag : %d\n", strong_intra_smoothing_enable_flag);
   LOG1("vui_parameters_present_flag        : %d\n", vui_parameters_present_flag);
 
+  LOG1("sps_extension_present_flag    : %d\n", sps_extension_present_flag);
+  LOG1("sps_range_extension_flag      : %d\n", sps_range_extension_flag);
+  LOG1("sps_multilayer_extension_flag : %d\n", sps_multilayer_extension_flag);
+  LOG1("sps_extension_6bits           : %d\n", sps_extension_6bits);
+
   LOG1("CtbSizeY     : %d\n", CtbSizeY);
   LOG1("MinCbSizeY   : %d\n", MinCbSizeY);
   LOG1("MaxCbSizeY   : %d\n", 1<<(log2_min_luma_coding_block_size + log2_diff_max_min_luma_coding_block_size));
   LOG1("MinTBSizeY   : %d\n", 1<<log2_min_transform_block_size);
   LOG1("MaxTBSizeY   : %d\n", 1<<(log2_min_transform_block_size + log2_diff_max_min_transform_block_size));
 
+  LOG1("PicWidthInCtbsY         : %d\n", PicWidthInCtbsY);
+  LOG1("PicHeightInCtbsY        : %d\n", PicHeightInCtbsY);
   LOG1("SubWidthC               : %d\n", SubWidthC);
   LOG1("SubHeightC              : %d\n", SubHeightC);
 
-  return;
+  if (sps_range_extension_flag) {
+    range_extension.dump(fd);
+  }
 
   if (vui_parameters_present_flag) {
-    assert(false);
-    /*
-      vui_parameters()
-
-        sps_extension_flag
-        u(1)
-        if( sps_extension_flag )
-
-          while( more_rbsp_data() )
-
-            sps_extension_data_flag
-              u(1)
-              rbsp_trailing_bits()
-    */
+    vui.dump(fd);
   }
 #undef LOG0
 #undef LOG1
@@ -646,7 +847,7 @@ de265_error read_scaling_list(bitreader* br, const seq_parameter_set* sps,
       char scaling_list_pred_mode_flag = get_bits(br,1);
       if (!scaling_list_pred_mode_flag) {
         int scaling_list_pred_matrix_id_delta = get_uvlc(br);
-        if (scaling_list_pred_matrix_id_delta < 0 ||
+        if (scaling_list_pred_matrix_id_delta == UVLC_ERROR ||
             scaling_list_pred_matrix_id_delta > matrixId) {
           return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
         }
@@ -743,6 +944,16 @@ de265_error read_scaling_list(bitreader* br, const seq_parameter_set* sps,
 }
 
 
+de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps,
+                              scaling_list_data* sclist, bool inPPS)
+{
+  assert(false);
+  // TODO: placeholder, writing scaling lists is not implemented. No argument is
+  // used. If assert() is compiled out, DE265_OK is returned with 'out' unchanged.
+  return DE265_OK;
+}
+
+
 void set_default_scaling_lists(scaling_list_data* sclist)
 {
   // 4x4
@@ -778,3 +989,268 @@ void set_default_scaling_lists(scaling_list_data* sclist)
                       default_ScalingList_8x8_inter, 3);
 }
 
+
+// Serialize this sequence parameter set into 'out'.
+//
+// Every range check on the SPS fields themselves is carried out before the
+// first bit is emitted, so a rejected parameter set leaves 'out' untouched.
+// Failures reported by write_scaling_list() or write_short_term_ref_pic_set()
+// can still occur after part of the SPS has been written.
+//
+//   errqueue : receives a warning for some of the rejected parameters
+//   out      : bit sink receiving the coded SPS
+//
+// Returns DE265_OK on success, DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE if a
+// field cannot be coded, DE265_WARNING_SPS_HEADER_INVALID if a short-term
+// reference picture set could not be written, or the error returned by
+// write_scaling_list().
+de265_error seq_parameter_set::write(error_queue* errqueue, CABAC_encoder& out)
+{
+  // --- validate everything that can be checked up-front ---
+
+  // sps_max_sub_layers-1 is coded with 3 bits and sps_max_sub_layers-1 is also
+  // the highest index used for the per-layer sps_max_* arrays below.
+  if (sps_max_sub_layers<1 ||
+      sps_max_sub_layers>7) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (chroma_format_idc<0 ||
+      chroma_format_idc>3) {
+    errqueue->add_warning(DE265_WARNING_INVALID_CHROMA_FORMAT, false);
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  // Fields coded relative to a fixed offset must not lie below that offset,
+  // otherwise a negative value would be handed to write_uvlc().
+  if (bit_depth_luma   < 8 ||
+      bit_depth_chroma < 8 ||
+      log2_max_pic_order_cnt_lsb      < 4 ||
+      log2_min_luma_coding_block_size < 3 ||
+      log2_min_transform_block_size   < 2) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  // This function has no code that emits vui_parameters(). Signalling them as
+  // present without the payload would produce an SPS that read() misparses.
+  if (vui_parameters_present_flag) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  // Without per-layer ordering info, only the highest sub-layer is coded.
+  const int firstLayer = (sps_sub_layer_ordering_info_present_flag ?
+                          0 : sps_max_sub_layers-1 );
+
+  for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) {
+    // coded as sps_max_dec_pic_buffering[i]-1, which must not become negative
+    if (sps_max_dec_pic_buffering[i] < 1 ||
+        sps_max_dec_pic_buffering[i] > MAX_NUM_REF_PICS) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+    }
+  }
+
+  const int numRefPicSets = ref_pic_sets.size();
+  if (numRefPicSets < 0 ||
+      numRefPicSets > 64) {
+    errqueue->add_warning(DE265_WARNING_NUMBER_OF_SHORT_TERM_REF_PIC_SETS_OUT_OF_RANGE, false);
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  if (long_term_ref_pics_present_flag &&
+      num_long_term_ref_pics_sps > MAX_NUM_LT_REF_PICS_SPS) {
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+
+  // --- header ---
+
+  out.write_bits(video_parameter_set_id, 4);
+  out.write_bits(sps_max_sub_layers-1, 3);
+
+  out.write_bit(sps_temporal_id_nesting_flag);
+
+  profile_tier_level_.write(out, sps_max_sub_layers);
+
+  out.write_uvlc(seq_parameter_set_id);
+
+
+  // --- encode chroma type ---
+
+  out.write_uvlc(chroma_format_idc);
+
+  if (chroma_format_idc == 3) {
+    out.write_bit(separate_colour_plane_flag);
+  }
+
+
+  // --- picture size ---
+
+  out.write_uvlc(pic_width_in_luma_samples);
+  out.write_uvlc(pic_height_in_luma_samples);
+
+  out.write_bit(conformance_window_flag);
+
+  if (conformance_window_flag) {
+    out.write_uvlc(conf_win_left_offset);
+    out.write_uvlc(conf_win_right_offset);
+    out.write_uvlc(conf_win_top_offset);
+    out.write_uvlc(conf_win_bottom_offset);
+  }
+
+
+  // --- bit depths, POC ---
+
+  out.write_uvlc(bit_depth_luma-8);
+  out.write_uvlc(bit_depth_chroma-8);
+
+  out.write_uvlc(log2_max_pic_order_cnt_lsb-4);
+
+
+  // --- sub_layer_ordering_info ---
+
+  out.write_bit(sps_sub_layer_ordering_info_present_flag);
+
+  for (int i=firstLayer ; i <= sps_max_sub_layers-1; i++ ) {
+
+    // sps_max_dec_pic_buffering[i] (range checked above)
+
+    out.write_uvlc(sps_max_dec_pic_buffering[i]-1);
+
+    // sps_max_num_reorder_pics[i]
+
+    out.write_uvlc(sps_max_num_reorder_pics[i]);
+
+    // sps_max_latency_increase[i]
+
+    out.write_uvlc(sps_max_latency_increase_plus1[i]);
+  }
+
+
+  // --- block sizes ---
+
+  out.write_uvlc(log2_min_luma_coding_block_size-3);
+  out.write_uvlc(log2_diff_max_min_luma_coding_block_size);
+  out.write_uvlc(log2_min_transform_block_size-2);
+  out.write_uvlc(log2_diff_max_min_transform_block_size);
+  out.write_uvlc(max_transform_hierarchy_depth_inter);
+  out.write_uvlc(max_transform_hierarchy_depth_intra);
+
+
+  // --- scaling lists ---
+
+  out.write_bit(scaling_list_enable_flag);
+
+  if (scaling_list_enable_flag) {
+
+    out.write_bit(sps_scaling_list_data_present_flag);
+    if (sps_scaling_list_data_present_flag) {
+
+      de265_error err;
+      if ((err=write_scaling_list(out,this, &scaling_list, false)) != DE265_OK) {
+        return err;
+      }
+    }
+  }
+
+
+  // --- coding tools and PCM ---
+
+  out.write_bit(amp_enabled_flag);
+  out.write_bit(sample_adaptive_offset_enabled_flag);
+  out.write_bit(pcm_enabled_flag);
+  if (pcm_enabled_flag) {
+    out.write_bits(pcm_sample_bit_depth_luma  -1,4);
+    out.write_bits(pcm_sample_bit_depth_chroma-1,4);
+    out.write_uvlc(log2_min_pcm_luma_coding_block_size-3);
+    out.write_uvlc(log2_diff_max_min_pcm_luma_coding_block_size);
+    out.write_bit(pcm_loop_filter_disable_flag);
+  }
+
+
+  // --- short-term reference picture sets ---
+
+  out.write_uvlc(numRefPicSets);
+
+  for (int i = 0; i < numRefPicSets; i++) {
+
+    bool success = write_short_term_ref_pic_set(errqueue,this,out,
+                                                &ref_pic_sets[i], i,
+                                                ref_pic_sets,
+                                                false);
+
+    if (!success) {
+      return DE265_WARNING_SPS_HEADER_INVALID;
+    }
+  }
+
+
+  // --- long-term reference pictures ---
+
+  out.write_bit(long_term_ref_pics_present_flag);
+
+  if (long_term_ref_pics_present_flag) {
+
+    out.write_uvlc(num_long_term_ref_pics_sps);
+
+    for (int i = 0; i < num_long_term_ref_pics_sps; i++ ) {
+      out.write_bits(lt_ref_pic_poc_lsb_sps[i], log2_max_pic_order_cnt_lsb);
+      out.write_bit (used_by_curr_pic_lt_sps_flag[i]);
+    }
+  }
+
+
+  // --- remaining flags ---
+
+  out.write_bit(sps_temporal_mvp_enabled_flag);
+  out.write_bit(strong_intra_smoothing_enable_flag);
+  out.write_bit(vui_parameters_present_flag);  // always 0, see check above
+
+  // NOTE(review): after a set sps_extension_present_flag, read() expects
+  // sps_range_extension_flag, sps_multilayer_extension_flag and
+  // sps_extension_6bits. They are not written here, so the flag presumably
+  // has to be 0 or the caller has to append them -- confirm with the encoder.
+  out.write_bit(sps_extension_present_flag);
+
+  return DE265_OK;
+}
+
+
+de265_error sps_range_extension::read(error_queue* errqueue, bitreader* br)
+{ // Fills the nine flags below from 'br', one bit each; 'errqueue' is not used.
+  transform_skip_rotation_enabled_flag    = get_bits(br,1);  // statement order is the
+  transform_skip_context_enabled_flag     = get_bits(br,1);  // order in which the bits
+  implicit_rdpcm_enabled_flag             = get_bits(br,1);  // are taken from 'br'
+  explicit_rdpcm_enabled_flag             = get_bits(br,1);
+  extended_precision_processing_flag      = get_bits(br,1);
+  intra_smoothing_disabled_flag           = get_bits(br,1);
+  high_precision_offsets_enabled_flag     = get_bits(br,1);
+  persistent_rice_adaptation_enabled_flag = get_bits(br,1);
+  cabac_bypass_alignment_enabled_flag     = get_bits(br,1);
+  // The get_bits() results are stored unchecked, so there is no error path.
+  return DE265_OK;
+}
+
+
+// LOG0/LOG1 print through the local variable 'fh' of the enclosing function.
+#define LOG0(t) log2fh(fh, t)
+#define LOG1(t,d) log2fh(fh, t,d)
+// Print all range-extension flags to stdout (fd==1) or stderr (fd==2).
+void sps_range_extension::dump(int fd) const
+{
+  if (fd!=1 && fd!=2) { return; }  // any other descriptor: print nothing
+  FILE* const fh = (fd==1 ? stdout : stderr);
+
+  LOG0("----------------- SPS-range-extension -----------------\n");
+  LOG1("transform_skip_rotation_enabled_flag    : %d\n", transform_skip_rotation_enabled_flag);
+  LOG1("transform_skip_context_enabled_flag     : %d\n", transform_skip_context_enabled_flag);
+  LOG1("implicit_rdpcm_enabled_flag             : %d\n", implicit_rdpcm_enabled_flag);
+  LOG1("explicit_rdpcm_enabled_flag             : %d\n", explicit_rdpcm_enabled_flag);
+  LOG1("extended_precision_processing_flag      : %d\n", extended_precision_processing_flag);
+  LOG1("intra_smoothing_disabled_flag           : %d\n", intra_smoothing_disabled_flag);
+  LOG1("high_precision_offsets_enabled_flag     : %d\n", high_precision_offsets_enabled_flag);
+  LOG1("persistent_rice_adaptation_enabled_flag : %d\n", persistent_rice_adaptation_enabled_flag);
+  LOG1("cabac_bypass_alignment_enabled_flag     : %d\n", cabac_bypass_alignment_enabled_flag);
+}
+#undef LOG1
+#undef LOG0
diff --git a/libde265/sps.h b/libde265/sps.h
index f5e58c4..ea41808 100644
--- a/libde265/sps.h
+++ b/libde265/sps.h
@@ -22,15 +22,22 @@
 #define DE265_SPS_H
 
 #include "libde265/vps.h"
+#include "libde265/vui.h"
 #include "libde265/bitstream.h"
 #include "libde265/refpic.h"
 #include "libde265/de265.h"
+#include "libde265/cabac.h"
 
 #include <vector>
 
-#define MAX_REF_PIC_SETS 64  // maximum according to standard
+class error_queue;
+
+// #define MAX_REF_PIC_SETS 64  // maximum according to standard
 #define MAX_NUM_LT_REF_PICS_SPS 32
 
+// this is just a safety range
+#define MAX_PICTURE_WIDTH  70000
+#define MAX_PICTURE_HEIGHT 70000
 
 enum {
   CHROMA_MONO = 0,
@@ -51,22 +58,54 @@ typedef struct scaling_list_data {
 } scaling_list_data;
 
 
-struct seq_parameter_set {
+enum PresetSet {
+  Preset_Default   // sole preset; default argument of seq_parameter_set::set_defaults()
+};
+
+
+class sps_range_extension
+{ // Holds the flags of the SPS range extension (member 'range_extension' of seq_parameter_set).
+ public:
+  sps_range_extension();
+  // Each flag below is filled from a single bit by read(), in declaration order.
+  uint8_t transform_skip_rotation_enabled_flag;
+  uint8_t transform_skip_context_enabled_flag;
+  uint8_t implicit_rdpcm_enabled_flag;
+  uint8_t explicit_rdpcm_enabled_flag;
+  uint8_t extended_precision_processing_flag;
+  uint8_t intra_smoothing_disabled_flag;
+  uint8_t high_precision_offsets_enabled_flag;  // selects how seq_parameter_set derives WpOffset*
+  uint8_t persistent_rice_adaptation_enabled_flag;
+  uint8_t cabac_bypass_alignment_enabled_flag;
+  // read() always returns DE265_OK; dump() prints only for fd 1 (stdout) or 2 (stderr).
+  de265_error read(error_queue*, bitreader*);
+  void dump(int fd) const;
+};
+
+
+class seq_parameter_set {
+public:
   seq_parameter_set();
   ~seq_parameter_set();
 
-  de265_error read(struct decoder_context*, bitreader*);
+  de265_error read(error_queue*, bitreader*);
+  de265_error write(error_queue*, CABAC_encoder&);
+
+  void dump(int fd) const;
 
-  void dump_sps(int fd) const;
+  void set_defaults(enum PresetSet = Preset_Default);
+  void set_CB_log2size_range(int mini,int maxi);
+  void set_TB_log2size_range(int mini,int maxi);
+  void set_resolution(int w,int h);
 
   bool sps_read; // whether the sps has been read from the bitstream
 
 
   char video_parameter_set_id;
-  char sps_max_sub_layers;
+  char sps_max_sub_layers;            // [1;7]
   char sps_temporal_id_nesting_flag;
 
-  struct profile_tier_level profile_tier_level;
+  profile_tier_level profile_tier_level_;
 
   int seq_parameter_set_id;
   int chroma_format_idc;
@@ -91,10 +130,10 @@ struct seq_parameter_set {
   int sps_max_num_reorder_pics[7];
   int sps_max_latency_increase_plus1[7];
 
-  int  log2_min_luma_coding_block_size;
-  int  log2_diff_max_min_luma_coding_block_size;
-  int  log2_min_transform_block_size;
-  int  log2_diff_max_min_transform_block_size;
+  int  log2_min_luma_coding_block_size;             // smallest CB size [3;6]
+  int  log2_diff_max_min_luma_coding_block_size;    // largest  CB size
+  int  log2_min_transform_block_size;               // smallest TB size [2;5]
+  int  log2_diff_max_min_transform_block_size;      // largest  TB size
   int  max_transform_hierarchy_depth_inter;
   int  max_transform_hierarchy_depth_intra;
 
@@ -114,14 +153,9 @@ struct seq_parameter_set {
   int  log2_diff_max_min_pcm_luma_coding_block_size;
   char pcm_loop_filter_disable_flag;
 
-  int num_short_term_ref_pic_sets;
+  int num_short_term_ref_pic_sets() const { return ref_pic_sets.size(); }
   std::vector<ref_pic_set> ref_pic_sets; // [0 ; num_short_term_ref_pic_set (<=MAX_REF_PIC_SETS) )
 
-  /*
-    for( i = 0; i < num_short_term_ref_pic_sets; i++)
-    short_term_ref_pic_set(i)
-  */
-
   char long_term_ref_pics_present_flag;
 
   int num_long_term_ref_pics_sps;
@@ -131,14 +165,16 @@ struct seq_parameter_set {
 
   char sps_temporal_mvp_enabled_flag;
   char strong_intra_smoothing_enable_flag;
+
   char vui_parameters_present_flag;
+  video_usability_information vui;
 
-  /*
-    if( vui_parameters_present_flag )
-      vui_parameters()
-  */
+  char sps_extension_present_flag;
+  char sps_range_extension_flag;
+  char sps_multilayer_extension_flag;
+  char sps_extension_6bits;
 
-  char sps_extension_flag;
+  sps_range_extension range_extension;
 
   /*
     if( sps_extension_flag )
@@ -148,6 +184,11 @@ struct seq_parameter_set {
     rbsp_trailing_bits()
   */
 
+
+  // --- derived values ---
+
+  de265_error compute_derived_values();
+
   int BitDepth_Y;
   int QpBdOffset_Y;
   int BitDepth_C;
@@ -158,7 +199,7 @@ struct seq_parameter_set {
   int WinUnitX, WinUnitY;
 
   int MaxPicOrderCntLsb;
-  
+
   int Log2MinCbSizeY;
   int Log2CtbSizeY;
   int MinCbSizeY;
@@ -188,9 +229,29 @@ struct seq_parameter_set {
   int Log2MaxIpcmCbSizeY;
 
   int SpsMaxLatencyPictures[7]; // [temporal layer]
+
+  uint8_t WpOffsetBdShiftY;
+  uint8_t WpOffsetBdShiftC;
+  int32_t WpOffsetHalfRangeY;
+  int32_t WpOffsetHalfRangeC;
+
+
+  int getPUIndexRS(int pixelX,int pixelY) const {
+    return (pixelX>>Log2MinPUSize) + (pixelY>>Log2MinPUSize)*PicWidthInMinPUs;
+  }
+
+  int get_bit_depth(int cIdx) const {
+    if (cIdx==0) return BitDepth_Y;
+    else         return BitDepth_C;
+  }
+
+  int get_chroma_shift_W(int cIdx) const { return cIdx ? SubWidthC -1 : 0; }
+  int get_chroma_shift_H(int cIdx) const { return cIdx ? SubHeightC-1 : 0; }
 };
 
 de265_error read_scaling_list(bitreader*, const seq_parameter_set*, scaling_list_data*, bool inPPS);
+de265_error write_scaling_list(CABAC_encoder& out, const seq_parameter_set* sps,
+                               scaling_list_data* sclist, bool inPPS);
 void set_default_scaling_lists(scaling_list_data*);
 
 #endif
diff --git a/libde265/threads.cc b/libde265/threads.cc
index 0490211..9b3b668 100644
--- a/libde265/threads.cc
+++ b/libde265/threads.cc
@@ -35,6 +35,8 @@
 #define THREAD_RESULT       void*
 #define THREAD_PARAM        void*
 
+#include <stdio.h>
+
 int  de265_thread_create(de265_thread* t, void *(*start_routine) (void *), void *arg) { return pthread_create(t,NULL,start_routine,arg); }
 void de265_thread_join(de265_thread t) { pthread_join(t,NULL); }
 void de265_thread_destroy(de265_thread* t) { }
@@ -121,6 +123,16 @@ void de265_progress_lock::set_progress(int progress)
   de265_mutex_unlock(&mutex);
 }
 
+void de265_progress_lock::increase_progress(int progress)
+{
+  de265_mutex_lock(&mutex);
+  // Relative update: 'progress' is added to the current value, not stored as-is.
+  mProgress += progress;
+  de265_cond_broadcast(&cond, &mutex);
+  // The broadcast above is issued while the mutex is still held.
+  de265_mutex_unlock(&mutex);
+}
+
 int  de265_progress_lock::get_progress() const
 {
   return mProgress;
diff --git a/libde265/threads.h b/libde265/threads.h
index 25c5d59..6cca80c 100644
--- a/libde265/threads.h
+++ b/libde265/threads.h
@@ -32,6 +32,7 @@
 #endif
 
 #include <deque>
+#include <string>
 
 #ifndef _WIN32
 #include <pthread.h>
@@ -43,7 +44,16 @@ typedef pthread_cond_t   de265_cond;
 #else // _WIN32
 #include <windows.h>
 #include "../extra/win32cond.h"
+#if _MSC_VER > 1310
 #include <intrin.h>
+#else
+extern "C"
+{
+   LONG  __cdecl _InterlockedExchangeAdd(long volatile *Addend, LONG Value);
+}
+#pragma intrinsic (_InterlockedExchangeAdd)
+#define InterlockedExchangeAdd _InterlockedExchangeAdd
+#endif
 
 typedef HANDLE              de265_thread;
 typedef HANDLE              de265_mutex;
@@ -100,6 +110,7 @@ public:
 
   void wait_for_progress(int progress);
   void set_progress(int progress);
+  void increase_progress(int progress);
   int  get_progress() const;
   void reset(int value=0) { mProgress=value; }
 
@@ -123,6 +134,8 @@ public:
   enum { Queued, Running, Blocked, Finished } state;
 
   virtual void work() = 0;
+
+  virtual std::string name() const { return "noname"; }
 };
 
 
diff --git a/libde265/transform.cc b/libde265/transform.cc
index a217c46..c9e5d09 100644
--- a/libde265/transform.cc
+++ b/libde265/transform.cc
@@ -68,7 +68,7 @@ void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
   int qPY_PRED;
 
   // first QG in CTB row ?
-  
+
   int ctbLSBMask = ((1<<sps->Log2CtbSizeY)-1);
   bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0));
 
@@ -148,20 +148,45 @@ void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
 
   tctx->qPYPrime = QPY + sps->QpBdOffset_Y;
 
-  int qPiCb = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset);
-  int qPiCr = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset);
+  int qPiCb = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb);
+  int qPiCr = Clip3(-sps->QpBdOffset_C,57, QPY+pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr);
 
   logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
            qPiCb, pps->pic_cb_qp_offset, shdr->slice_cb_qp_offset,
            qPiCr, pps->pic_cr_qp_offset, shdr->slice_cr_qp_offset);
 
-  int qPCb = table8_22(qPiCb);
-  int qPCr = table8_22(qPiCr);
+  int qPCb,qPCr;
+
+  if (sps->ChromaArrayType == CHROMA_420) {
+    qPCb = table8_22(qPiCb);
+    qPCr = table8_22(qPiCr);
+  }
+  else {
+    qPCb = qPiCb;
+    qPCr = qPiCr;
+  }
+
+  //printf("q: %d %d\n",qPiCb, qPCb);
 
   tctx->qPCbPrime = qPCb + sps->QpBdOffset_C;
   tctx->qPCrPrime = qPCr + sps->QpBdOffset_C;
 
+  /*
+  printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY,
+         sps->QpBdOffset_Y,
+         pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset,
+         pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset,
+         sps->QpBdOffset_C, sps->QpBdOffset_C,
+         tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime);
+  */
+
   int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase);
+
+  // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why.
+  // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit.
+  // id:000163,sig:06,src:002041,op:havoc,rep:16.bin
+  if (log2CbSize<3) { log2CbSize=3; }
+
   tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY);
   tctx->currentQPY = QPY;
 
@@ -176,34 +201,160 @@ void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
 
 
 
-void transform_coefficients(decoder_context* ctx,
-                            int16_t* coeff, int coeffStride, int nT, int trType, int postShift,
-                            uint8_t* dst, int dstStride)
+template <class pixel_t>
+void transform_coefficients(acceleration_functions* acceleration,
+                            int16_t* coeff, int coeffStride, int nT, int trType,
+                            pixel_t* dst, int dstStride, int bit_depth)
 {
   logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
 
+
   if (trType==1) {
 
-    ctx->acceleration.transform_4x4_luma_add_8(dst, coeff, dstStride);
+    acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);
 
   } else {
 
-    /**/ if (nT==4)  { ctx->acceleration.transform_4x4_add_8(dst,coeff,dstStride); }
-    else if (nT==8)  { ctx->acceleration.transform_8x8_add_8(dst,coeff,dstStride); }
-    else if (nT==16) { ctx->acceleration.transform_16x16_add_8(dst,coeff,dstStride); }
-    else             { ctx->acceleration.transform_32x32_add_8(dst,coeff,dstStride); }
+    /**/ if (nT==4)  { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
+    else if (nT==8)  { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
+    else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
+    else             { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
   }
+
+#if 0
+  printf("decoded pixels:\n");
+  for (int y=0;y<nT;y++,printf("\n"))
+    for (int x=0;x<nT;x++) {
+      printf("%02x ",dst[y*dstStride+x]);
+    }
+#endif
+}
+
+
+// TODO: make this an accelerated function
+void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT)
+{
+  const int BitDepthC = tctx->img->sps.BitDepth_C;
+  const int BitDepthY = tctx->img->sps.BitDepth_Y;
+
+  for (int y=0;y<nT;y++)
+    for (int x=0;x<nT;x++) {
+      /* TODO: the most usual case is definitely BitDepthY == BitDepthC, in which case
+         we could just omit two shifts. The second most common case is probably
+         BitDepthY>BitDepthC, for which we could also eliminate one shift. The remaining
+         case is also one shift only.
+      */
+
+      residual[y*nT+x] += (tctx->ResScaleVal *
+                           ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3;
+    }
 }
 
 
+template <class pixel_t>
+void transform_coefficients_explicit(thread_context* tctx,
+                                     int16_t* coeff, int coeffStride, int nT, int trType,
+                                     pixel_t* dst, int dstStride, int bit_depth, int cIdx)
+{
+  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
+
+  const acceleration_functions* acceleration = &tctx->decctx->acceleration;
+
+  int32_t residual_buffer[32*32];
+  int32_t* residual;
+  if (cIdx==0) {
+    residual = tctx->residual_luma;
+  }
+  else {
+    residual = residual_buffer;
+  }
+
+
+  // TODO
+  int bdShift = 20 - bit_depth;
+  int max_coeff_bits = 15;
+
+  if (trType==1) {
+
+    acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);
+
+  } else {
+
+    /**/ if (nT==4)  { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
+    else if (nT==8)  { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
+    else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
+    else             { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
+  }
+
+
+  //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
+  //printBlk("residual",residual,nT,nT);
+
+  if (cIdx != 0) {
+    if (tctx->ResScaleVal != 0) {
+      cross_comp_pred(tctx, residual, nT);
+    }
+
+    //printBlk("cross-comp-pred modified residual",residual,nT,nT);
+  }
+
+  acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
+}
+
+
+void inv_transform(acceleration_functions* acceleration,
+                   uint8_t* dst, int dstStride, int16_t* coeff,
+                   int log2TbSize, int trType)
+{
+  if (trType==1) {
+    assert(log2TbSize==2);
+
+    acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride);
+
+  } else {
+    acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride);
+  }
+
+
+#if 0
+  int nT = 1<<log2TbSize;
+  printf("decoded pixels:\n");
+  for (int y=0;y<nT;y++,printf("\n"))
+    for (int x=0;x<nT;x++) {
+  printf("%02x ",dst[y*dstStride+x]);
+}
+#endif
+}
+
+
+void fwd_transform(acceleration_functions* acceleration,
+                   int16_t* coeff, int coeffStride, int log2TbSize, int trType,
+                   const int16_t* src, int srcStride)
+{
+  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize);
+
+  if (trType==1) {
+    // DST 4x4
+
+    acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride);
+  } else {
+    // DCT 4x4, 8x8, 16x16, 32x32
+
+    acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride);
+  }
+}
+
+
+
 static const int levelScale[] = { 40,45,51,57,64,72 };
 
 // (8.6.2) and (8.6.3)
-void scale_coefficients(thread_context* tctx,
-                        int xT,int yT, // position of TU in frame (chroma adapted)
-                        int x0,int y0, // position of CU in frame (chroma adapted)
-                        int nT, int cIdx,
-                        bool transform_skip_flag, bool intra)
+template <class pixel_t>
+void scale_coefficients_internal(thread_context* tctx,
+                                 int xT,int yT, // position of TU in frame (chroma adapted)
+                                 int x0,int y0, // position of CU in frame (chroma adapted)
+                                 int nT, int cIdx,
+                                 bool transform_skip_flag, bool intra, int rdpcmMode)
 {
   seq_parameter_set* sps = &tctx->img->sps;
   pic_parameter_set* pps = &tctx->img->pps;
@@ -218,8 +369,6 @@ void scale_coefficients(thread_context* tctx,
 
   logtrace(LogTransform,"qP: %d\n",qP);
 
-  //printf("residual %d;%d cIdx=%d qp=%d\n",xT * (cIdx?2:1),yT * (cIdx?2:1),cIdx,qP);
-
 
   int16_t* coeff;
   int      coeffStride;
@@ -231,37 +380,62 @@ void scale_coefficients(thread_context* tctx,
 
 
 
-  uint8_t* pred;
+  pixel_t* pred;
   int      stride;
-  pred = tctx->img->get_image_plane_at_pos(cIdx, xT,yT);
+  pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
   stride = tctx->img->get_image_stride(cIdx);
 
-  //fprintf(stderr,"POC=%d pred: %p (%d;%d stride=%d)\n",ctx->img->PicOrderCntVal,pred,xT,yT,stride);
-
-  /*
-  int x,y;
-  for (y=0;y<nT;y++)
-    {
-      printf("P: ");
+  // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
+  // can optimize away a lot of code for 8-bit pixels.
+  const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps->get_bit_depth(cIdx));
 
-      for (x=0;x<nT;x++)
-        {
-          printf("%02x ",pred[x+y*stride]);
-        }
+  //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
+  int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);
 
-      printf("\n");
-    }
-  */
+  bool rotateCoeffs = (sps->range_extension.transform_skip_rotation_enabled_flag &&
+                       nT == 4 &&
+                       cuPredModeIntra);
 
   if (tctx->cu_transquant_bypass_flag) {
-    //assert(false); // TODO
 
+    int32_t residual_buffer[32*32];
+
+    int32_t* residual;
+    if (cIdx==0) residual = tctx->residual_luma;
+    else         residual = residual_buffer;
+
+
+    // TODO: we could fold the coefficient rotation into the coefficient expansion here:
     for (int i=0;i<tctx->nCoeff[cIdx];i++) {
-      int32_t currCoeff  = tctx->coeffList[cIdx][i];
+      int32_t currCoeff = tctx->coeffList[cIdx][i];
       tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
     }
 
-    tctx->decctx->acceleration.transform_bypass_8(pred, coeff, nT, stride);
+    if (rotateCoeffs) {
+      tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
+    }
+
+    if (rdpcmMode) {
+      if (rdpcmMode==2)
+        tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
+      else
+        tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
+    }
+    else {
+      tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
+    }
+
+    if (cIdx != 0) {
+      if (tctx->ResScaleVal != 0) {
+        cross_comp_pred(tctx, residual, nT);
+      }
+    }
+
+    tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
+
+    if (rotateCoeffs) {
+      memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
+    }
   }
   else {
     // (8.6.3)
@@ -272,6 +446,9 @@ void scale_coefficients(thread_context* tctx,
 
     logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
 
+
+    // --- inverse quantization ---
+
     if (sps->scaling_list_enable_flag==0) {
 
       //const int m_x_y = 16;
@@ -286,9 +463,14 @@ void scale_coefficients(thread_context* tctx,
         // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
         int32_t currCoeff  = tctx->coeffList[cIdx][i];
 
+        //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
+        //tctx->coeffList[cIdx][i]);
+
         currCoeff = Clip3(-32768,32767,
                           ( (currCoeff * fact + offset ) >> bdShift));
 
+        //logtrace(LogTransform," -> %d\n",currCoeff);
+
         tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
       }
     }
@@ -327,6 +509,9 @@ void scale_coefficients(thread_context* tctx,
       }
     }
 
+
+    // --- do transform or skip ---
+
     logtrace(LogTransform,"coefficients OUT:\n");
     for (int y=0;y<nT;y++) {
       logtrace(LogTransform,"  ");
@@ -345,20 +530,77 @@ void scale_coefficients(thread_context* tctx,
 
     if (transform_skip_flag) {
 
-      tctx->decctx->acceleration.transform_skip_8(pred, coeff, stride);
+      int extended_precision_processing_flag = 0;
+      int Log2nTbS = Log2(nT);
+      int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
+      int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
+        + Log2nTbS;
+
+      if (rotateCoeffs) {
+        tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
+      }
+
+      int32_t residual_buffer[32*32];
+
+      int32_t* residual;
+      if (cIdx==0) residual = tctx->residual_luma;
+      else         residual = residual_buffer;
+
+      if (rdpcmMode) {
+        /*
+        if (rdpcmMode==2)
+          tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
+        else
+          tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
+        */
+
+        if (rdpcmMode==2)
+          tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
+        else
+          tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
+      }
+      else {
+        //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);
+
+        tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
+      }
+
+      if (cIdx != 0) {
+        if (tctx->ResScaleVal != 0) {
+          cross_comp_pred(tctx, residual, nT);
+        }
+      }
+
+      tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
+
+      if (rotateCoeffs) {
+        memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
+      }
     }
     else {
       int trType;
 
-      if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
+      //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
+      if (nT==4 && cIdx==0 && cuPredModeIntra) {
         trType=1;
       }
       else {
         trType=0;
       }
 
-      transform_coefficients(tctx->decctx, coeff, coeffStride, nT, trType, bdShift2,
-                             pred, stride);
+      assert(rdpcmMode==0);
+
+
+      if (tctx->img->pps.range_extension.cross_component_prediction_enabled_flag) {
+        // cross-component-prediction: transform to residual buffer and add in a separate step
+
+        transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
+                                        pred, stride, bit_depth, cIdx);
+      }
+      else {
+        transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
+                               pred, stride, bit_depth);
+      }
     }
   }
 
@@ -366,42 +608,122 @@ void scale_coefficients(thread_context* tctx,
   logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
 
   for (int y=0;y<nT;y++) {
-    logtrace(LogTransform,"RECO-%d-%d-%d ",xT,yT+y,cIdx);
+    logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);
 
     for (int x=0;x<nT;x++) {
-      logtrace(LogTransform,"*%02x ", pred[x+y*stride]);
+      logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
     }
 
     logtrace(LogTransform,"*\n");
-  }  
+  }
 
-  /*
-  for (y=0;y<nT;y++)
-    {
-      printf("C: ");
+  // zero out scrap coefficient buffer again
 
-      for (x=0;x<nT;x++)
-        {
-          printf("%4d ",coeff[x+y*nT]);
-        }
+  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
+    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
+  }
+}
 
-      printf("\n");
-    }
 
-  for (y=0;y<nT;y++)
-    {
-      for (x=0;x<nT;x++)
-        {
-          printf("%02x ",pred[x+y*stride]);
-        }
+void scale_coefficients(thread_context* tctx,
+                        int xT,int yT, // position of TU in frame (chroma adapted)
+                        int x0,int y0, // position of CU in frame (chroma adapted)
+                        int nT, int cIdx,
+                        bool transform_skip_flag, bool intra,
+                        int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical
+                        )
+{
+  if (tctx->img->high_bit_depth(cIdx)) {
+    scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
+                                          rdpcmMode);
+  } else {
+    scale_coefficients_internal<uint8_t> (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
+                                          rdpcmMode);
+  }
+}
 
-      printf("\n");
+
+//#define QUANT_IQUANT_SHIFT    20 // Q(QP%6) * IQ(QP%6) = 2^20
+#define QUANT_SHIFT           14 // Q(4) = 2^14
+//#define SCALE_BITS            15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ
+#define MAX_TR_DYNAMIC_RANGE  15 // Maximum transform dynamic range (excluding sign bit)
+
+
+const static uint16_t g_quantScales[6] = {
+  26214,23302,20560,18396,16384,14564
+};
+
+void quant_coefficients(//encoder_context* ectx,
+                        int16_t* out_coeff,
+                        const int16_t* in_coeff,
+                        int log2TrSize, int qp,
+                        bool intra)
+{
+  const int qpDiv6 = qp / 6;
+  const int qpMod6 = qp % 6;
+
+  //int uiLog2TrSize = xLog2( iWidth - 1);
+
+  int uiQ = g_quantScales[qpMod6];
+  int bitDepth = 8;
+  int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize;  // Represents scaling through forward transform
+  int qBits = QUANT_SHIFT + qpDiv6 + transformShift;
+
+  /* TODO: originally, this was checking for intra slices, why not for intra mode ?
+   */
+  int rnd = (intra ? 171 : 85) << (qBits-9);
+
+  int x, y;
+  int uiAcSum = 0;
+
+  int nStride = (1<<log2TrSize);
+
+  for (y=0; y < (1<<log2TrSize) ; y++) {
+    for (x=0; x < (1<<log2TrSize) ; x++) {
+      int level;
+      int sign;
+      int blockPos = y * nStride + x;
+      level  = in_coeff[blockPos];
+      //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level);
+      sign   = (level < 0 ? -1: 1);
+
+      level = (abs_value(level) * uiQ + rnd ) >> qBits;
+      uiAcSum += level;
+      level *= sign;
+      out_coeff[blockPos] = Clip3(-32768, 32767, level);
+      //logtrace(LogTransform,"%d\n", out_coeff[blockPos]);
     }
-  */
+  }
+}
 
-  // zero out scrap coefficient buffer again
 
-  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
-    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
+void dequant_coefficients(int16_t* out_coeff,
+                          const int16_t* in_coeff,
+                          int log2TrSize, int qP)
+{
+  const int m_x_y = 1;
+  int bitDepth = 8;
+  int bdShift = bitDepth + log2TrSize - 5;
+  bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
+
+  const int offset = (1<<(bdShift-1));
+  const int fact = m_x_y * levelScale[qP%6] << (qP/6);
+
+  int blkSize = (1<<log2TrSize);
+  int nCoeff  = (1<<(log2TrSize<<1));
+
+  for (int i=0;i<nCoeff;i++) {
+
+    // usually, this needs to be 64bit, but because we modify the shift above, we can use 32 bit
+    int32_t currCoeff  = in_coeff[i];
+
+    //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff);
+
+    currCoeff = Clip3(-32768,32767,
+                      ( (currCoeff * fact + offset ) >> bdShift));
+
+    //logtrace(LogTransform," -> %d\n",currCoeff);
+
+    out_coeff[i] = currCoeff;
   }
 }
diff --git a/libde265/transform.h b/libde265/transform.h
index 04a7714..6f19049 100644
--- a/libde265/transform.h
+++ b/libde265/transform.h
@@ -42,6 +42,24 @@ void scale_coefficients(thread_context* tctx,
                         int xT,int yT, // position of TU in frame (chroma adapted)
                         int x0,int y0, // position of CU in frame (chroma adapted)
                         int nT, int cIdx,
-                        bool transform_skip_flag, bool intra);
+                        bool transform_skip_flag, bool intra, int rdpcmMode);
+
+
+void inv_transform(acceleration_functions* acceleration,
+                   uint8_t* dst, int dstStride, int16_t* coeff,
+                   int log2TbSize, int trType);
+
+void fwd_transform(acceleration_functions* acceleration,
+                   int16_t* coeff, int coeffStride, int log2TbSize, int trType,
+                   const int16_t* src, int srcStride);
+
+void quant_coefficients(int16_t* out_coeff,
+                        const int16_t* in_coeff,
+                        int log2TrSize, int qp,
+                        bool intra);
+
+void dequant_coefficients(int16_t* out_coeff,
+                          const int16_t* in_coeff,
+                          int log2TrSize, int qP);
 
 #endif
diff --git a/libde265/util.cc b/libde265/util.cc
index b5881e2..10b41f2 100644
--- a/libde265/util.cc
+++ b/libde265/util.cc
@@ -23,39 +23,63 @@
 
 #include <stdarg.h>
 #include <stdio.h>
+#include <string.h>
+
+
+void copy_subimage(uint8_t* dst,int dststride,
+                   const uint8_t* src,int srcstride,
+                   int w, int h)
+{
+  for (int y=0;y<h;y++) {
+    memcpy(dst, src, w);
+    dst += dststride;
+    src += srcstride;
+  }
+}
+
 
 
 #ifdef DE265_LOGGING
 static int current_poc=0;
 static int log_poc_start=-9999; // frame-numbers can be negative
-static int enable_log = 1;
+static bool disable_log[NUMBER_OF_LogModules];
 void log_set_current_POC(int poc) { current_poc=poc; }
 #endif
 
 
-static int disable_logging=0;
+static int disable_logging_OLD=0;
 static int verbosity = 0;
 
+
 LIBDE265_API void de265_disable_logging() // DEPRECATED
 {
-  disable_logging=1;
+  disable_logging_OLD=1;
 }
 
+
 LIBDE265_API void de265_set_verbosity(int level)
 {
   verbosity = level;
 }
 
 #if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_INFO)
-void enablelog() { enable_log=1; }
+void enable_logging(enum LogModule module)
+{
+  disable_log[module]=false;
+}
+void disable_logging(enum LogModule module)
+{
+  disable_log[module]=true;
+}
 #endif
 
+static long logcnt[10];
+
 #ifdef DE265_LOG_ERROR
 void logerror(enum LogModule module, const char* string, ...)
 {
-  if (disable_logging) return;
   if (current_poc < log_poc_start) { return; }
-  if (!enable_log) return;
+  if (disable_log[module]) return;
 
   va_list va;
 
@@ -72,9 +96,8 @@ void logerror(enum LogModule module, const char* string, ...)
 void loginfo (enum LogModule module, const char* string, ...)
 {
   if (verbosity<1) return;
-  if (disable_logging) return;
   if (current_poc < log_poc_start) { return; }
-  if (!enable_log) return;
+  if (disable_log[module]) return;
 
   va_list va;
 
@@ -91,9 +114,8 @@ void loginfo (enum LogModule module, const char* string, ...)
 void logdebug(enum LogModule module, const char* string, ...)
 {
   if (verbosity<2) return;
-  if (disable_logging) return;
   if (current_poc < log_poc_start) { return; }
-  if (!enable_log) return;
+  if (disable_log[module]) return;
 
   va_list va;
 
@@ -110,14 +132,24 @@ void logdebug(enum LogModule module, const char* string, ...)
 void logtrace(enum LogModule module, const char* string, ...)
 {
   if (verbosity<3) return;
-  if (disable_logging) return;
   if (current_poc < log_poc_start) { return; }
-  if (!enable_log) return;
+  if (disable_log[module]) return;
+
+  //if (module != LogSymbols /*&& module != LogCABAC*/) { return; }
+  //if (logcnt<319500) return;
 
   //if (module != LogCABAC) return;
 
   va_list va;
 
+  if (string[0]=='$') {
+    int id = string[1]-'0';
+    logcnt[id]++;
+    fprintf(stdout, "[%ld] ",logcnt[id]);
+
+    string += 3;
+  }
+
   int noPrefix = (string[0]=='*');
   if (!noPrefix) { } // fprintf(stdout, "ERR: ");
   va_start(va, string);
@@ -138,3 +170,70 @@ void log2fh(FILE* fh, const char* string, ...)
   va_end(va);
   fflush(stdout);
 }
+
+
+
+void printBlk(const char* title, const int16_t* data, int blksize, int stride)
+{
+  printf("%s:\n",title);
+
+  for (int y=0;y<blksize;y++) {
+    //logtrace(LogTransform,"  ");
+    printf("  ");
+    for (int x=0;x<blksize;x++) {
+      //logtrace(LogTransform,"*%3d ", data[x+y*stride]);
+      printf("%4d ", data[x+y*stride]);
+    }
+    //logtrace(LogTransform,"*\n");
+    printf("\n");
+  }
+}
+
+
+void printBlk(const char* title, const int32_t* data, int blksize, int stride)
+{
+  printf("%s:\n",title);
+
+  for (int y=0;y<blksize;y++) {
+    //logtrace(LogTransform,"  ");
+    printf("  ");
+    for (int x=0;x<blksize;x++) {
+      //logtrace(LogTransform,"*%3d ", data[x+y*stride]);
+      printf("%4d ", data[x+y*stride]);
+    }
+    //logtrace(LogTransform,"*\n");
+    printf("\n");
+  }
+}
+
+
+void printBlk(const char* title, const uint8_t* data, int blksize, int stride)
+{
+  printf("%s:\n",title);
+
+  for (int y=0;y<blksize;y++) {
+    //logtrace(LogTransform,"  ");
+    printf("  ");
+    for (int x=0;x<blksize;x++) {
+      //logtrace(LogTransform,"*%3d ", data[x+y*stride]);
+      printf("%02x ", data[x+y*stride]);
+    }
+    //logtrace(LogTransform,"*\n");
+    printf("\n");
+  }
+}
+
+
+static void (*debug_image_output_func)(const struct de265_image*, int slot) = NULL;
+
+void debug_set_image_output(void (*func)(const struct de265_image*, int slot))
+{
+  debug_image_output_func = func;
+}
+
+void debug_show_image(const struct de265_image* img, int slot)
+{
+  if (debug_image_output_func) {
+    debug_image_output_func(img,slot);
+  }
+}
diff --git a/libde265/util.h b/libde265/util.h
index 5af72a4..679c225 100644
--- a/libde265/util.h
+++ b/libde265/util.h
@@ -29,6 +29,9 @@
 
 #include "libde265/de265.h"
 
+#ifdef __GNUC__
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
 
 #ifdef _MSC_VER
 #define LIBDE265_DECLARE_ALIGNED( var, n ) __declspec(align(n)) var
@@ -40,15 +43,51 @@
 #define unlikely(x)    __builtin_expect(!!(x), 0)
 #endif
 
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#define LIBDE265_CHECK_RESULT __attribute__ ((warn_unused_result))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1700)
+#define LIBDE265_CHECK_RESULT _Check_return_
+#else
+#define LIBDE265_CHECK_RESULT
+#endif
+
 #define ALIGNED_32( var ) LIBDE265_DECLARE_ALIGNED( var, 32 )
 #define ALIGNED_16( var ) LIBDE265_DECLARE_ALIGNED( var, 16 )
 #define ALIGNED_8( var )  LIBDE265_DECLARE_ALIGNED( var, 8 )
 #define ALIGNED_4( var )  LIBDE265_DECLARE_ALIGNED( var, 4 )
 
+// C++11 specific features
+#if defined(_MSC_VER) || (!__clang__ && __GNUC__ && GCC_VERSION < 40600)
+#define FOR_LOOP(type, var, list)   for each (type var in list)
+#undef FOR_LOOP_AUTO_SUPPORT
+#else
+#define FOR_LOOP(type, var, list)   for (type var : list)
+#define FOR_LOOP_AUTO_SUPPORT 1
+#endif
 
+#ifdef USE_STD_TR1_NAMESPACE
+#include <tr1/memory>
+namespace std { using namespace std::tr1; }
+#endif
+
+#if __GNUC__ && GCC_VERSION < 40600
+// nullptr was introduced in gcc 4.6, a simple alias should be fine for our use case
+#define nullptr NULL
+#endif
+
+#ifdef _MSC_VER
+  #ifdef _CPPRTTI
+  #define RTTI_ENABLED
+  #endif
+#else
+  #ifdef __GXX_RTTI
+  #define RTTI_ENABLED
+  #endif
+#endif
 
 //inline uint8_t Clip1_8bit(int16_t value) { if (value<=0) return 0; else if (value>=255) return 255; else return value; }
 #define Clip1_8bit(value) ((value)<0 ? 0 : (value)>255 ? 255 : (value))
+#define Clip_BitDepth(value, bit_depth) ((value)<0 ? 0 : (value)>((1<<bit_depth)-1) ? ((1<<bit_depth)-1) : (value))
 #define Clip3(low,high,value) ((value)<(low) ? (low) : (value)>(high) ? (high) : (value))
 #define Sign(value) (((value)<0) ? -1 : ((value)>0) ? 1 : 0)
 #define abs_value(a) (((a)<0) ? -(a) : (a))
@@ -60,6 +99,7 @@ LIBDE265_INLINE static int ceil_div(int num,int denom)
   num += denom-1;
   return num/denom;
 }
+
 LIBDE265_INLINE static int ceil_log2(int val)
 {
   int n=0;
@@ -69,6 +109,7 @@ LIBDE265_INLINE static int ceil_log2(int val)
 
   return n;
 }
+
 LIBDE265_INLINE static int Log2(int v)
 {
   int n=0;
@@ -80,6 +121,15 @@ LIBDE265_INLINE static int Log2(int v)
   return n;
 }
 
+LIBDE265_INLINE static int Log2SizeToArea(int v)
+{
+  return (1<<(v<<1));
+}
+
+void copy_subimage(uint8_t* dst,int dststride,
+                   const uint8_t* src,int srcstride,
+                   int w, int h);
+
 
 // === logging ===
 
@@ -95,15 +145,20 @@ enum LogModule {
   LogSEI,
   LogIntraPred,
   LogPixels,
-  LogCABAC
+  LogSymbols,
+  LogCABAC,
+  LogEncoder,
+  NUMBER_OF_LogModules
 };
 
 
 #if defined(DE265_LOG_ERROR) || defined(DE265_LOG_INFO) || defined(DE265_LOG_DEBUG) || defined(DE265_LOG_TRACE)
 # define DE265_LOGGING 1
-void enablelog();
+void enable_logging(enum LogModule);
+void disable_logging(enum LogModule);
 #else
-#define enablelog() { }
+#define enable_logging(x) { }
+#define disable_logging(x) { }
 #endif
 
 #ifdef DE265_LOGGING
@@ -138,4 +193,12 @@ void logtrace(enum LogModule module, const char* string, ...);
 
 void log2fh(FILE* fh, const char* string, ...);
 
+
+void printBlk(const char* title,const int32_t* data, int blksize, int stride);
+void printBlk(const char* title,const int16_t* data, int blksize, int stride);
+void printBlk(const char* title,const uint8_t* data, int blksize, int stride);
+
+void debug_set_image_output(void (*)(const struct de265_image*, int slot));
+void debug_show_image(const struct de265_image*, int slot);
+
 #endif
diff --git a/libde265/visualize.cc b/libde265/visualize.cc
index eb8b7f0..f033f78 100644
--- a/libde265/visualize.cc
+++ b/libde265/visualize.cc
@@ -292,7 +292,7 @@ void draw_PB_block(const de265_image* srcimg,uint8_t* img,int stride,
     tint_rect(img,stride, x0,y0,w,h, cols[predMode], pixelSize);
   }
   else if (what == PBMotionVectors) {
-    const PredVectorInfo* mvi = srcimg->get_mv_info(x0,y0);
+    const MotionVectorSpec* mvi = srcimg->get_mv_info(x0,y0);
     int x = x0+w/2;
     int y = y0+h/2;
     if (mvi->predFlag[0]) {
@@ -554,4 +554,3 @@ LIBDE265_API void draw_Tiles(const de265_image* img, uint8_t* dst, int stride, i
     }
   }
 }
-
diff --git a/libde265/vps.cc b/libde265/vps.cc
index e155c79..2633af6 100644
--- a/libde265/vps.cc
+++ b/libde265/vps.cc
@@ -25,88 +25,178 @@
 #include <assert.h>
 
 
-de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_set* vps)
+void profile_data::set_defaults(enum profile_idc profile, int level_major, int level_minor)
+{
+  profile_present_flag = 1;
+
+  profile_space = 0;
+  tier_flag = 0;
+  profile_idc = profile;
+
+  for (int i=0;i<32;i++) {
+    profile_compatibility_flag[i]=0;
+  }
+
+  switch (profile) {
+  case Profile_Main:
+    profile_compatibility_flag[Profile_Main]=1;
+    profile_compatibility_flag[Profile_Main10]=1;
+    break;
+  case Profile_Main10:
+    profile_compatibility_flag[Profile_Main10]=1;
+    break;
+  default:
+    assert(0);
+  }
+
+  progressive_source_flag = 0;
+  interlaced_source_flag  = 0;
+  non_packed_constraint_flag = 0;
+  frame_only_constraint_flag = 0;
+
+
+  // --- level ---
+
+  level_present_flag = 1;
+  level_idc = level_major*30 + level_minor*3;
+}
+
+
+void video_parameter_set::set_defaults(enum profile_idc profile, int level_major, int level_minor)
+{
+  video_parameter_set_id = 0;
+  vps_max_layers = 1; // always =1 in current version of standard
+  vps_max_sub_layers = 1; // temporal sub-layers
+  vps_temporal_id_nesting_flag = 1;
+
+  profile_tier_level_.general.set_defaults(profile,level_major,level_minor);
+
+  vps_sub_layer_ordering_info_present_flag = 0;
+  layer[0].vps_max_dec_pic_buffering = 1;
+  layer[0].vps_max_num_reorder_pics  = 0;
+  layer[0].vps_max_latency_increase  = 0;
+
+  vps_max_layer_id = 0;
+  vps_num_layer_sets = 1;
+
+  layer_id_included_flag.resize(vps_num_layer_sets);
+
+
+  // --- timing info ---
+
+  vps_timing_info_present_flag = 0;
+  vps_num_units_in_tick = 0;
+  vps_time_scale = 0;
+  vps_poc_proportional_to_timing_flag = 0;
+
+  vps_num_ticks_poc_diff_one = 0;
+  vps_num_hrd_parameters = 0;
+
+
+  // --- vps extension ---
+
+  vps_extension_flag = 0;
+}
+
+
+de265_error video_parameter_set::read(error_queue* errqueue, bitreader* reader)
 {
   int vlc;
 
-  vps->video_parameter_set_id = vlc = get_bits(reader, 4);
+  video_parameter_set_id = vlc = get_bits(reader, 4);
   if (vlc >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
 
   skip_bits(reader, 2);
-  vps->vps_max_layers = vlc = get_bits(reader,6) +1;
-  if (vlc != 1) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // TODO: out of specification
+  vps_max_layers = vlc = get_bits(reader,6) +1;
+  if (vlc > 63) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // vlc == vps_max_layers_minus1 + 1, so vps_max_layers_minus1 must be in range 0...62
 
-  vps->vps_max_sub_layers = vlc = get_bits(reader,3) +1;
+  vps_max_sub_layers = vlc = get_bits(reader,3) +1;
   if (vlc >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
 
-  vps->vps_temporal_id_nesting_flag = get_bits(reader,1);
+  vps_temporal_id_nesting_flag = get_bits(reader,1);
   skip_bits(reader, 16);
 
-  read_profile_tier_level(reader, &vps->profile_tier_level,
-                          vps->vps_max_sub_layers);
+  profile_tier_level_.read(reader, vps_max_sub_layers);
 
   /*
-  read_bit_rate_pic_rate_info(reader, &vps->bit_rate_pic_rate_info,
-                              0, vps->vps_max_sub_layers-1);
+    read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info,
+    0, vps_max_sub_layers-1);
   */
 
-  vps->vps_sub_layer_ordering_info_present_flag = get_bits(reader,1);
-  //assert(vps->vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS);
+  vps_sub_layer_ordering_info_present_flag = get_bits(reader,1);
+  //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS);
 
-  int firstLayerRead = vps->vps_sub_layer_ordering_info_present_flag ? 0 : (vps->vps_max_sub_layers-1);
+  int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 0 : (vps_max_sub_layers-1);
 
-  for (int i=firstLayerRead;i<vps->vps_max_sub_layers;i++) {
-    vps->layer[i].vps_max_dec_pic_buffering = get_uvlc(reader);
-    vps->layer[i].vps_max_num_reorder_pics  = get_uvlc(reader);
-    vps->layer[i].vps_max_latency_increase  = get_uvlc(reader);
+  for (int i=firstLayerRead;i<vps_max_sub_layers;i++) {
+    layer[i].vps_max_dec_pic_buffering = get_uvlc(reader);
+    layer[i].vps_max_num_reorder_pics  = get_uvlc(reader);
+    layer[i].vps_max_latency_increase  = get_uvlc(reader);
+
+if (layer[i].vps_max_dec_pic_buffering == UVLC_ERROR ||
+    layer[i].vps_max_num_reorder_pics  == UVLC_ERROR ||
+    layer[i].vps_max_latency_increase  == UVLC_ERROR) {
+      return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+    }
   }
 
-  if (!vps->vps_sub_layer_ordering_info_present_flag) {
+  if (!vps_sub_layer_ordering_info_present_flag) {
     assert(firstLayerRead < MAX_TEMPORAL_SUBLAYERS);
 
     for (int i=0;i<firstLayerRead;i++) {
-      vps->layer[i].vps_max_dec_pic_buffering = vps->layer[firstLayerRead].vps_max_dec_pic_buffering;
-      vps->layer[i].vps_max_num_reorder_pics  = vps->layer[firstLayerRead].vps_max_num_reorder_pics;
-      vps->layer[i].vps_max_latency_increase  = vps->layer[firstLayerRead].vps_max_latency_increase;
+      layer[i].vps_max_dec_pic_buffering = layer[firstLayerRead].vps_max_dec_pic_buffering;
+      layer[i].vps_max_num_reorder_pics  = layer[firstLayerRead].vps_max_num_reorder_pics;
+      layer[i].vps_max_latency_increase  = layer[firstLayerRead].vps_max_latency_increase;
     }
   }
 
 
-  vps->vps_max_layer_id = get_bits(reader,6);
-  vps->vps_num_layer_sets = get_uvlc(reader)+1;
+  vps_max_layer_id = get_bits(reader,6);
+  vps_num_layer_sets = get_uvlc(reader);
 
-  if (vps->vps_num_layer_sets<0 ||
-      vps->vps_num_layer_sets>=1024) {
-    ctx->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+  if (vps_num_layer_sets+1<0 ||
+      vps_num_layer_sets+1>=1024 ||
+      vps_num_layer_sets == UVLC_ERROR) {
+    errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
     return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
   }
+  vps_num_layer_sets += 1;
 
-  for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
-    for (int j=0; j <= vps->vps_max_layer_id; j++)
-      {
-        vps->layer_id_included_flag[i][j] = get_bits(reader,1);
-      }
+  layer_id_included_flag.resize(vps_num_layer_sets);
+
+  for (int i=1; i <= vps_num_layer_sets-1; i++)
+    {
+      layer_id_included_flag[i].resize(vps_max_layer_id+1);
 
-  vps->vps_timing_info_present_flag = get_bits(reader,1);
+      for (int j=0; j <= vps_max_layer_id; j++)
+        {
+          layer_id_included_flag[i][j] = get_bits(reader,1);
+        }
+    }
 
-  if (vps->vps_timing_info_present_flag) {
-    vps->vps_num_units_in_tick = get_bits(reader,32);
-    vps->vps_time_scale        = get_bits(reader,32);
-    vps->vps_poc_proportional_to_timing_flag = get_bits(reader,1);
+  vps_timing_info_present_flag = get_bits(reader,1);
 
-    if (vps->vps_poc_proportional_to_timing_flag) {
-      vps->vps_num_ticks_poc_diff_one = get_uvlc(reader)+1;
-      vps->vps_num_hrd_parameters     = get_uvlc(reader);
+  if (vps_timing_info_present_flag) {
+    vps_num_units_in_tick = get_bits(reader,32);
+    vps_time_scale        = get_bits(reader,32);
+    vps_poc_proportional_to_timing_flag = get_bits(reader,1);
 
-      if (vps->vps_num_hrd_parameters >= 1024) {
+    if (vps_poc_proportional_to_timing_flag) {
+      vps_num_ticks_poc_diff_one = get_uvlc(reader)+1;
+      vps_num_hrd_parameters     = get_uvlc(reader);
+
+      if (vps_num_hrd_parameters == UVLC_ERROR || vps_num_hrd_parameters < 0 || vps_num_hrd_parameters >= 1024) { // value comes from the bitstream and sizes the resize() calls below
-        assert(false); // TODO: return bitstream error
+        return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE; // fail the parse like the other range checks in this function; an assert is compiled out in release builds
       }
 
-      for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
-        vps->hrd_layer_set_idx[i] = get_uvlc(reader);
+      hrd_layer_set_idx .resize(vps_num_hrd_parameters);
+      cprms_present_flag.resize(vps_num_hrd_parameters);
+
+      for (int i=0; i<vps_num_hrd_parameters; i++) {
+        hrd_layer_set_idx[i] = get_uvlc(reader);
 
         if (i > 0) {
-          vps->cprms_present_flag[i] = get_bits(reader,1);
+          cprms_present_flag[i] = get_bits(reader,1);
         }
 
         //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
@@ -115,10 +205,96 @@ de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_se
       }
     }
   }
-  
-  vps->vps_extension_flag = get_bits(reader,1);
 
-  if (vps->vps_extension_flag) {
+  vps_extension_flag = get_bits(reader,1);
+
+  if (vps_extension_flag) {
+    /*
+      while( more_rbsp_data() )
+      vps_extension_data_flag u(1)
+      rbsp_trailing_bits()
+    */
+  }
+
+  return DE265_OK;
+}
+
+
+de265_error video_parameter_set::write(error_queue* errqueue, CABAC_encoder& out) const
+{
+  if (video_parameter_set_id >= DE265_MAX_VPS_SETS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  out.write_bits(video_parameter_set_id,4);
+
+  out.write_bits(0x3,2);
+  out.write_bits(vps_max_layers-1,6);
+
+  if (vps_max_sub_layers >= MAX_TEMPORAL_SUBLAYERS) return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  out.write_bits(vps_max_sub_layers-1,3);
+
+  out.write_bit(vps_temporal_id_nesting_flag);
+  out.write_bits(0xFFFF, 16);
+
+  profile_tier_level_.write(out, vps_max_sub_layers);
+
+  /*
+  read_bit_rate_pic_rate_info(reader, &bit_rate_pic_rate_info,
+                              0, vps_max_sub_layers-1);
+  */
+
+  out.write_bit(vps_sub_layer_ordering_info_present_flag);
+  //assert(vps_max_sub_layers-1 < MAX_TEMPORAL_SUBLAYERS);
+
+  int firstLayerRead = vps_sub_layer_ordering_info_present_flag ? 0 : (vps_max_sub_layers-1);
+
+  for (int i=firstLayerRead;i<vps_max_sub_layers;i++) {
+    out.write_uvlc(layer[i].vps_max_dec_pic_buffering);
+    out.write_uvlc(layer[i].vps_max_num_reorder_pics);
+    out.write_uvlc(layer[i].vps_max_latency_increase);
+  }
+
+  if (vps_num_layer_sets<0 ||
+      vps_num_layer_sets>=1024) {
+    errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+    return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;
+  }
+
+  out.write_bits(vps_max_layer_id,6);
+  out.write_uvlc(vps_num_layer_sets-1);
+
+  for (int i=1; i <= vps_num_layer_sets-1; i++)
+    for (int j=0; j <= vps_max_layer_id; j++)
+      {
+        out.write_bit(layer_id_included_flag[i][j]);
+      }
+
+  out.write_bit(vps_timing_info_present_flag);
+
+  if (vps_timing_info_present_flag) {
+    out.write_bits(vps_num_units_in_tick,32);
+    out.write_bits(vps_time_scale       ,32);
+    out.write_bit (vps_poc_proportional_to_timing_flag);
+
+    if (vps_poc_proportional_to_timing_flag) {
+      out.write_uvlc(vps_num_ticks_poc_diff_one-1);
+      out.write_uvlc(vps_num_hrd_parameters);
+
+      for (int i=0; i<vps_num_hrd_parameters; i++) {
+        out.write_uvlc(hrd_layer_set_idx[i]);
+
+        if (i > 0) {
+          out.write_bit(cprms_present_flag[i]);
+        }
+
+        //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
+
+        return DE265_OK; // TODO: decode hrd_parameters()
+      }
+    }
+  }
+
+  out.write_bit(vps_extension_flag);
+
+  if (vps_extension_flag) {
     /*
     while( more_rbsp_data() )
     vps_extension_data_flag u(1)
@@ -130,31 +306,46 @@ de265_error read_vps(decoder_context* ctx, bitreader* reader, video_parameter_se
 }
 
 
-void read_profile_tier_level(bitreader* reader,
-                             struct profile_tier_level* hdr,
-                             int max_sub_layers)
+void profile_data::read(bitreader* reader) // parses the profile and/or level fields selected by the two *_present_flag members
 {
-  hdr->general_profile_space = get_bits(reader,2);
-  hdr->general_tier_flag = get_bits(reader,1);
-  hdr->general_profile_idc = get_bits(reader,5);
+  if (profile_present_flag) { // not read here: the caller sets it first (see profile_tier_level::read)
+    profile_space = get_bits(reader,2);
+    tier_flag = get_bits(reader,1);
+    profile_idc = (enum profile_idc)get_bits(reader,5); // raw 5-bit code, cast without checking that a named enumerator exists
+
+    for (int i=0; i<32; i++) {
+      profile_compatibility_flag[i] = get_bits(reader,1); // one bit per flag, 32 flags
+    }
 
-  for (int i=0; i<32; i++) {
-    hdr->general_profile_compatibility_flag[i] = get_bits(reader,1);
+    progressive_source_flag = get_bits(reader,1);
+    interlaced_source_flag  = get_bits(reader,1);
+    non_packed_constraint_flag = get_bits(reader,1);
+    frame_only_constraint_flag = get_bits(reader,1);
+    skip_bits(reader,44); // these 44 bits are consumed but not stored
   }
 
-  hdr->general_progressive_source_flag = get_bits(reader,1);
-  hdr->general_interlaced_source_flag  = get_bits(reader,1);
-  hdr->general_non_packed_constraint_flag = get_bits(reader,1);
-  hdr->general_frame_only_constraint_flag = get_bits(reader,1);
-  skip_bits(reader,44);
+  if (level_present_flag) { // also set by the caller before read()
+    level_idc = get_bits(reader,8);
+  }
+}
+
+
+void profile_tier_level::read(bitreader* reader,
+                              int max_sub_layers)
+{
+  // --- read the general profile ---
 
-  hdr->general_level_idc = get_bits(reader,8);
+  general.profile_present_flag = true;
+  general.level_present_flag = true;
+  general.read(reader);
 
 
+  // --- read the profile/levels of the sub-layers ---
+
   for (int i=0; i<max_sub_layers-1; i++)
     {
-      hdr->profile[i].sub_layer_profile_present_flag = get_bits(reader,1);
-      hdr->profile[i].sub_layer_level_present_flag   = get_bits(reader,1);
+      sub_layer[i].profile_present_flag = get_bits(reader,1);
+      sub_layer[i].level_present_flag   = get_bits(reader,1);
     }
 
   if (max_sub_layers > 1)
@@ -167,29 +358,62 @@ void read_profile_tier_level(bitreader* reader,
 
   for (int i=0; i<max_sub_layers-1; i++)
     {
-      if (hdr->profile[i].sub_layer_profile_present_flag)
+      sub_layer[i].read(reader);
+    }
+}
+
+
+void profile_data::write(CABAC_encoder& out) const // emits the same fields, in the same order and widths, that profile_data::read() parses
+{
+  if (profile_present_flag) // the flag itself is not written here; profile_tier_level::write() emits it for sub-layers
+    {
+      out.write_bits(profile_space,2);
+      out.write_bit (tier_flag);
+      out.write_bits(profile_idc,5);
+
+      for (int j=0; j<32; j++)
         {
-          hdr->profile[i].sub_layer_profile_space = get_bits(reader,2);
-          hdr->profile[i].sub_layer_tier_flag = get_bits(reader,1);
-          hdr->profile[i].sub_layer_profile_idc = get_bits(reader,5);
-
-          for (int j=0; j<32; j++)
-            {
-              hdr->profile[i].sub_layer_profile_compatibility_flag[j] = get_bits(reader,1);
-            }
-
-          hdr->profile[i].sub_layer_progressive_source_flag = get_bits(reader,1);
-          hdr->profile[i].sub_layer_interlaced_source_flag  = get_bits(reader,1);
-          hdr->profile[i].sub_layer_non_packed_constraint_flag = get_bits(reader,1);
-          hdr->profile[i].sub_layer_frame_only_constraint_flag = get_bits(reader,1);
-          skip_bits(reader,44);
+          out.write_bit(profile_compatibility_flag[j]);
         }
 
-      if (hdr->profile[i].sub_layer_level_present_flag)
+      out.write_bit(progressive_source_flag);
+      out.write_bit(interlaced_source_flag);
+      out.write_bit(non_packed_constraint_flag);
+      out.write_bit(frame_only_constraint_flag);
+      out.skip_bits(44); // counterpart of skip_bits(reader,44) in read(); what skip_bits emits is defined by CABAC_encoder (not shown here)
+    }
+
+  if (level_present_flag)
+    {
+      out.write_bits(level_idc,8); // 8 bits, matching get_bits(reader,8) in read()
+    }
+}
+
+void profile_tier_level::write(CABAC_encoder& out, int max_sub_layers) const // mirrors profile_tier_level::read(): general data, per-sub-layer flags, padding, sub-layer data
+{
+  assert(general.profile_present_flag==true); // read() always sets both flags on 'general'
+  assert(general.level_present_flag==true);
+
+  general.write(out);
+
+  for (int i=0; i<max_sub_layers-1; i++) // only max_sub_layers-1 entries of sub_layer[] are used
+    {
+      out.write_bit(sub_layer[i].profile_present_flag);
+      out.write_bit(sub_layer[i].level_present_flag);
+    }
+
+  if (max_sub_layers > 1)
+    {
+      for (int i=max_sub_layers-1; i<8; i++) // fill the flag pairs up to 8 entries
         {
-          hdr->profile[i].sub_layer_level_idc = get_bits(reader,8);
+          out.skip_bits(2); // 2 bits for each unused entry
         }
     }
+
+  for (int i=0; i<max_sub_layers-1; i++) // each sub-layer writes only the parts its present flags select
+    {
+      sub_layer[i].write(out);
+    }
 }
 
 
@@ -224,7 +448,7 @@ void read_bit_rate_pic_rate_info(bitreader* reader,
 #define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
 #define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
 
-void dump_vps(video_parameter_set* vps, int fd)
+void video_parameter_set::dump(int fd) const
 {
   FILE* fh;
   if (fd==1) fh=stdout;
@@ -232,58 +456,58 @@ void dump_vps(video_parameter_set* vps, int fd)
   else { return; }
 
   LOG0("----------------- VPS -----------------\n");
-  LOG1("video_parameter_set_id                : %d\n", vps->video_parameter_set_id);
-  LOG1("vps_max_layers                        : %d\n", vps->vps_max_layers);
-  LOG1("vps_max_sub_layers                    : %d\n", vps->vps_max_sub_layers);
-  LOG1("vps_temporal_id_nesting_flag          : %d\n", vps->vps_temporal_id_nesting_flag);
+  LOG1("video_parameter_set_id                : %d\n", video_parameter_set_id);
+  LOG1("vps_max_layers                        : %d\n", vps_max_layers);
+  LOG1("vps_max_sub_layers                    : %d\n", vps_max_sub_layers);
+  LOG1("vps_temporal_id_nesting_flag          : %d\n", vps_temporal_id_nesting_flag);
 
-  dump_profile_tier_level(&vps->profile_tier_level, vps->vps_max_sub_layers, fh);
-  //dump_bit_rate_pic_rate_info(&vps->bit_rate_pic_rate_info, 0, vps->vps_max_sub_layers-1);
+  profile_tier_level_.dump(vps_max_sub_layers, fh);
+  //dump_bit_rate_pic_rate_info(&bit_rate_pic_rate_info, 0, vps_max_sub_layers-1);
 
   LOG1("vps_sub_layer_ordering_info_present_flag : %d\n",
-       vps->vps_sub_layer_ordering_info_present_flag);
+       vps_sub_layer_ordering_info_present_flag);
 
-  if (vps->vps_sub_layer_ordering_info_present_flag) {
-    for (int i=0;i<vps->vps_max_sub_layers;i++) {
-      LOG2("layer %d: vps_max_dec_pic_buffering = %d\n",i,vps->layer[i].vps_max_dec_pic_buffering);
-      LOG1("         vps_max_num_reorder_pics  = %d\n",vps->layer[i].vps_max_num_reorder_pics);
-      LOG1("         vps_max_latency_increase  = %d\n",vps->layer[i].vps_max_latency_increase);
+  if (vps_sub_layer_ordering_info_present_flag) {
+    for (int i=0;i<vps_max_sub_layers;i++) {
+      LOG2("layer %d: vps_max_dec_pic_buffering = %d\n",i,layer[i].vps_max_dec_pic_buffering);
+      LOG1("         vps_max_num_reorder_pics  = %d\n",layer[i].vps_max_num_reorder_pics);
+      LOG1("         vps_max_latency_increase  = %d\n",layer[i].vps_max_latency_increase);
     }
   }
   else {
-    LOG1("layer (all): vps_max_dec_pic_buffering = %d\n",vps->layer[0].vps_max_dec_pic_buffering);
-    LOG1("             vps_max_num_reorder_pics  = %d\n",vps->layer[0].vps_max_num_reorder_pics);
-    LOG1("             vps_max_latency_increase  = %d\n",vps->layer[0].vps_max_latency_increase);
+    LOG1("layer (all): vps_max_dec_pic_buffering = %d\n",layer[0].vps_max_dec_pic_buffering);
+    LOG1("             vps_max_num_reorder_pics  = %d\n",layer[0].vps_max_num_reorder_pics);
+    LOG1("             vps_max_latency_increase  = %d\n",layer[0].vps_max_latency_increase);
   }
 
 
-  LOG1("vps_max_layer_id   = %d\n", vps->vps_max_layer_id);
-  LOG1("vps_num_layer_sets = %d\n", vps->vps_num_layer_sets);
+  LOG1("vps_max_layer_id   = %d\n", vps_max_layer_id);
+  LOG1("vps_num_layer_sets = %d\n", vps_num_layer_sets);
 
-  for (int i=1; i <= vps->vps_num_layer_sets-1; i++)
-    for (int j=0; j <= vps->vps_max_layer_id; j++)
+  for (int i=1; i <= vps_num_layer_sets-1; i++)
+    for (int j=0; j <= vps_max_layer_id; j++)
       {
         LOG3("layer_id_included_flag[%d][%d] = %d\n",i,j,
-             vps->layer_id_included_flag[i][j]);
+             layer_id_included_flag[i][j]);
       }
 
   LOG1("vps_timing_info_present_flag = %d\n",
-       vps->vps_timing_info_present_flag);
+       vps_timing_info_present_flag);
 
-  if (vps->vps_timing_info_present_flag) {
-    LOG1("vps_num_units_in_tick = %d\n", vps->vps_num_units_in_tick);
-    LOG1("vps_time_scale        = %d\n", vps->vps_time_scale);
-    LOG1("vps_poc_proportional_to_timing_flag = %d\n", vps->vps_poc_proportional_to_timing_flag);
+  if (vps_timing_info_present_flag) {
+    LOG1("vps_num_units_in_tick = %d\n", vps_num_units_in_tick);
+    LOG1("vps_time_scale        = %d\n", vps_time_scale);
+    LOG1("vps_poc_proportional_to_timing_flag = %d\n", vps_poc_proportional_to_timing_flag);
 
-    if (vps->vps_poc_proportional_to_timing_flag) {
-      LOG1("vps_num_ticks_poc_diff_one = %d\n", vps->vps_num_ticks_poc_diff_one);
-      LOG1("vps_num_hrd_parameters     = %d\n", vps->vps_num_hrd_parameters);
+    if (vps_poc_proportional_to_timing_flag) {
+      LOG1("vps_num_ticks_poc_diff_one = %d\n", vps_num_ticks_poc_diff_one);
+      LOG1("vps_num_hrd_parameters     = %d\n", vps_num_hrd_parameters);
 
-      for (int i=0; i<vps->vps_num_hrd_parameters; i++) {
-        LOG2("hrd_layer_set_idx[%d] = %d\n", i, vps->hrd_layer_set_idx[i]);
+      for (int i=0; i<vps_num_hrd_parameters; i++) {
+        LOG2("hrd_layer_set_idx[%d] = %d\n", i, hrd_layer_set_idx[i]);
 
         if (i > 0) {
-          LOG2("cprms_present_flag[%d] = %d\n", i, vps->cprms_present_flag[i]);
+          LOG2("cprms_present_flag[%d] = %d\n", i, cprms_present_flag[i]);
         }
 
         //hrd_parameters(cprms_present_flag[i], vps_max_sub_layers_minus1)
@@ -292,54 +516,59 @@ void dump_vps(video_parameter_set* vps, int fd)
       }
     }
   }
-  
-  LOG1("vps_extension_flag = %d\n", vps->vps_extension_flag);
+
+  LOG1("vps_extension_flag = %d\n", vps_extension_flag);
 }
 
 
-void dump_profile_tier_level(const struct profile_tier_level* hdr,
-                             int max_sub_layers, FILE* fh)
+static const char* profile_name(profile_idc p) // label printed for the profile by profile_data::dump()
 {
-  LOG1("  general_profile_space     : %d\n", hdr->general_profile_space);
-  LOG1("  general_tier_flag         : %d\n", hdr->general_tier_flag);
-  LOG1("  general_profile_idc       : %d\n", hdr->general_profile_idc);
-
-  LOG0("  general_profile_compatibility_flags: ");
-  for (int i=0; i<32; i++) {
-    if (i) LOG0("*,");
-    LOG1("*%d",hdr->general_profile_compatibility_flag[i]);
+  switch (p) {
+  case Profile_Main: return "Main";
+  case Profile_Main10: return "Main10";
+  case Profile_MainStillPicture: return "MainStillPicture";
+  case Profile_FormatRangeExtensions: return "FormatRangeExtensions";
+  default: // any code without a named enumerator (profile_data::read() casts the raw 5-bit value)
+    return "(unknown)";
   }
-  LOG0("*\n");
+}
 
-  LOG1("  general_level_idc         : %d\n", hdr->general_level_idc);
 
-  for (int i=0; i<max_sub_layers-1; i++)
-    {
-      LOG1("  Profile/Tier/Level [Layer %d]\n",i);
+void profile_data::dump(bool general, FILE* fh) const // logs the fields that are present; 'general' only selects the name prefix
+{
+  const char* prefix = (general ? "general" : "sub_layer"); // prepended to every field name in the output
 
-      if (hdr->profile[i].sub_layer_profile_present_flag) {
+  if (profile_present_flag) { // profile fields are printed only when they are marked present
+    LOG2("  %s_profile_space     : %d\n", prefix,profile_space);
+    LOG2("  %s_tier_flag         : %d\n", prefix,tier_flag);
+    LOG2("  %s_profile_idc       : %s\n", prefix, profile_name(profile_idc)); // printed by name, not by number
 
-        LOG1("    sub_layer_profile_space : %d\n",hdr->profile[i].sub_layer_profile_space);
-        LOG1("    sub_layer_tier_flag     : %d\n",hdr->profile[i].sub_layer_tier_flag);
-        LOG1("    sub_layer_profile_idc   : %d\n",hdr->profile[i].sub_layer_profile_idc);
+    LOG1("  %s_profile_compatibility_flags: ", prefix);
+    for (int i=0; i<32; i++) { // all 32 flags on one line, separated by commas
+      if (i) LOG0("*,");
+      LOG1("*%d",profile_compatibility_flag[i]);
+    }
+    LOG0("*\n");
+    LOG2("    %s_progressive_source_flag : %d\n",prefix,progressive_source_flag);
+    LOG2("    %s_interlaced_source_flag : %d\n",prefix,interlaced_source_flag);
+    LOG2("    %s_non_packed_constraint_flag : %d\n",prefix,non_packed_constraint_flag);
+    LOG2("    %s_frame_only_constraint_flag : %d\n",prefix,frame_only_constraint_flag);
+  }
 
-        LOG0("    sub_layer_profile_compatibility_flags: ");
-        for (int j=0; j<32; j++) {
-          if (j) LOG0(",");
-          LOG1("%d",hdr->profile[i].sub_layer_profile_compatibility_flag[j]);
-        }
-        LOG0("\n");
+  if (level_present_flag) {
+    LOG3("  %s_level_idc         : %d (%4.2f)\n", prefix,level_idc, level_idc/30.0f); // raw value, then value/30 (vps.h documents level_idc as level * 30)
+  }
+}
 
-        LOG1("    sub_layer_progressive_source_flag : %d\n",hdr->profile[i].sub_layer_progressive_source_flag);
-        LOG1("    sub_layer_interlaced_source_flag : %d\n",hdr->profile[i].sub_layer_interlaced_source_flag);
-        LOG1("    sub_layer_non_packed_constraint_flag : %d\n",hdr->profile[i].sub_layer_non_packed_constraint_flag);
-        LOG1("    sub_layer_frame_only_constraint_flag : %d\n",hdr->profile[i].sub_layer_frame_only_constraint_flag);
-      }
 
+void profile_tier_level::dump(int max_sub_layers, FILE* fh) const // logs the general data, then the first max_sub_layers-1 sub-layer entries
+{
+  general.dump(true, fh); // true selects the "general" field-name prefix
 
-      if (hdr->profile[i].sub_layer_level_present_flag) {
-        LOG1("    sub_layer_level_idc   : %d\n", hdr->profile[i].sub_layer_level_idc);
-      }
+  for (int i=0; i<max_sub_layers-1; i++) // same entry count that read() and write() use
+    {
+      LOG1("  Profile/Tier/Level [Layer %d]\n",i);
+      sub_layer[i].dump(false, fh); // false selects the "sub_layer" prefix
     }
 }
 
diff --git a/libde265/vps.h b/libde265/vps.h
index 987c533..9795ca9 100644
--- a/libde265/vps.h
+++ b/libde265/vps.h
@@ -31,58 +31,68 @@
 
 #include "libde265/bitstream.h"
 #include "libde265/de265.h"
+#include "libde265/cabac.h"
 
-#define MAX_TEMPORAL_SUBLAYERS 8
+#include <vector>
 
+class error_queue;
 
-struct profile_data {
-  // --- profile ---
+#define MAX_TEMPORAL_SUBLAYERS 8
 
-  char sub_layer_profile_present_flag;
 
-  char sub_layer_profile_space;
-  char sub_layer_tier_flag;
-  char sub_layer_profile_idc;
+enum profile_idc { // numeric values are the raw 5-bit codes that profile_data::read()/write() transfer
+  Profile_Main   = 1,
+  Profile_Main10 = 2,
+  Profile_MainStillPicture = 3,
+  Profile_FormatRangeExtensions = 4 // other codes can still be stored after the cast in read(); profile_name() reports them as "(unknown)"
+};
 
-  char sub_layer_profile_compatibility_flag[32];
 
-  char sub_layer_progressive_source_flag;
-  char sub_layer_interlaced_source_flag;
-  char sub_layer_non_packed_constraint_flag;
-  char sub_layer_frame_only_constraint_flag;
+class profile_data { // one profile/level record; used for the general entry and for each sub-layer of profile_tier_level
+public:
+  void read(bitreader* reader); // parses only the parts enabled by the *_present_flag members
+  void write(CABAC_encoder& writer) const; // emits only the parts enabled by the *_present_flag members
+  void dump(bool general, FILE* fh) const; // 'general' selects the "general" or "sub_layer" name prefix
 
+  void set_defaults(enum profile_idc, int level_major, int level_minor); // defined outside this diff section
 
-  // --- level ---
+  // --- profile ---
 
-  char sub_layer_level_present_flag;
-  int  sub_layer_level_idc;
-};
+  char profile_present_flag;  // always true for general profile
 
+  char profile_space;  // currently always 0
+  char tier_flag;      // Main tier or High tier (see Table A-66/A-67)
+  enum profile_idc profile_idc; // profile, stored as the raw 5-bit code
 
-struct profile_tier_level {
-  int general_profile_space;
-  int general_tier_flag;
-  int general_profile_idc;
+  char profile_compatibility_flag[32]; // to which profile we are compatible
 
-  char general_profile_compatibility_flag[32];
+  char progressive_source_flag;
+  char interlaced_source_flag;
+  char non_packed_constraint_flag;
+  char frame_only_constraint_flag;
 
-  char general_progressive_source_flag;
-  char general_interlaced_source_flag;
-  char general_non_packed_constraint_flag;
-  char general_frame_only_constraint_flag;
 
-  int general_level_idc;
+  // --- level ---
 
-  struct profile_data profile[MAX_TEMPORAL_SUBLAYERS];
+  char level_present_flag; // always true for general level
+  int  level_idc;          // level * 30, transferred as 8 bits
 };
 
 
-void read_profile_tier_level(bitreader* reader,
-                             struct profile_tier_level* hdr,
-                             int max_sub_layers);
+class profile_tier_level // general profile/level record plus one record per temporal sub-layer
+{
+public:
+  void read(bitreader* reader, int max_sub_layers); // forces both present flags on 'general' before parsing it
+  void write(CABAC_encoder& writer, int max_sub_layers) const; // asserts that both present flags of 'general' are set
+  void dump(int max_sub_layers, FILE* fh) const;
+
+  profile_data general;
 
-void dump_profile_tier_level(const struct profile_tier_level* hdr,
-                             int max_sub_layers, FILE* fh);
+  //bool sub_layer_profile_present[MAX_TEMPORAL_SUBLAYERS];
+  //bool sub_layer_level_present[MAX_TEMPORAL_SUBLAYERS];
+
+  profile_data sub_layer[MAX_TEMPORAL_SUBLAYERS]; // read/write/dump touch entries [0, max_sub_layers-2] only
+};
 
 
 /*
@@ -110,46 +120,54 @@ void dump_bit_rate_pic_rate_info(struct bit_rate_pic_rate_info* hdr,
 
 
 typedef struct {
-  int vps_max_dec_pic_buffering;
-  int vps_max_num_reorder_pics;
-  int vps_max_latency_increase;
+  int vps_max_dec_pic_buffering; // [1 ; ]
+  int vps_max_num_reorder_pics;  // [0 ; ]
+  int vps_max_latency_increase;  // 0 -> no limit, otherwise value is (x-1)
 } layer_data;
 
-typedef struct {
+
+class video_parameter_set // in-memory form of one VPS, with its bitstream reader, writer and debug dump
+{
+public:
+  de265_error read(error_queue* errqueue, bitreader* reader); // returns DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE on rejected values
+  de265_error write(error_queue* errqueue, CABAC_encoder& out) const;
+  void dump(int fd) const; // fd 1 selects stdout; see the definition for the other accepted values
+
+  void set_defaults(enum profile_idc profile, int level_major, int level_minor); // defined outside this diff section
+
   int video_parameter_set_id;
-  int vps_max_layers;
-  int vps_max_sub_layers;
-  int vps_temporal_id_nesting_flag;
-  struct profile_tier_level profile_tier_level;
-  //struct bit_rate_pic_rate_info bit_rate_pic_rate_info;
-  int vps_sub_layer_ordering_info_present_flag;
+  int vps_max_layers;            // [1;63]  coded value + 1; read() rejects anything larger
+  int vps_max_sub_layers;        // [1;7]  number of temporal sub-layers
+  int vps_temporal_id_nesting_flag; // indicate temporal up-switching always possible
+  profile_tier_level profile_tier_level_; // trailing underscore keeps the member distinct from the class name
 
+  int vps_sub_layer_ordering_info_present_flag; // if 0, read() copies the last sub-layer's values into the lower ones
   layer_data layer[MAX_TEMPORAL_SUBLAYERS];
 
-  uint8_t vps_max_layer_id;
-  int     vps_num_layer_sets;
+  uint8_t vps_max_layer_id;   // max value for nuh_layer_id in NALs
+  int     vps_num_layer_sets; // [1;1023]  coded value + 1; read() and write() reject >= 1024
+
+  std::vector<std::vector<bool> > layer_id_included_flag; // max size = [1024][64]; read() leaves row 0 empty
 
-  char layer_id_included_flag[1024][64];
+
+  // --- timing info ---
 
   char     vps_timing_info_present_flag;
   uint32_t vps_num_units_in_tick;
   uint32_t vps_time_scale;
   char     vps_poc_proportional_to_timing_flag;
+  uint32_t vps_num_ticks_poc_diff_one; // coded value + 1 (write() subtracts the 1 again)
 
-  int vps_num_ticks_poc_diff_one;
-  int vps_num_hrd_parameters;
-
-  uint16_t hrd_layer_set_idx[1024];
-  char     cprms_present_flag[1024];
+  int vps_num_hrd_parameters;     // currently [0;1]
 
-  // hrd_parameters(cprms_present_flag[i], vps_max_sub_layers-1)
+  std::vector<uint16_t> hrd_layer_set_idx;  // max size = 1024
+  std::vector<char>     cprms_present_flag; // max size = 1024; entry 0 is never read from the bitstream
 
-  char vps_extension_flag;
 
-} video_parameter_set;
+  // --- vps extension ---
 
+  char vps_extension_flag; // the extension payload itself is not parsed
+};
 
-de265_error read_vps(struct decoder_context* ctx, bitreader* reader, video_parameter_set* vps);
-void dump_vps(video_parameter_set*, int fd);
 
 #endif
diff --git a/libde265/vui.cc b/libde265/vui.cc
new file mode 100644
index 0000000..5524fa8
--- /dev/null
+++ b/libde265/vui.cc
@@ -0,0 +1,425 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "vui.h"
+#include "decctx.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+// READ_VLC_OFFSET / READ_VLC: store get_<vlctype>(br) + offset in 'variable'. They need 'int vlc', 'br' and 'errqueue' in the calling scope; on UVLC_ERROR they queue a warning and return from the calling function.
+#define READ_VLC_OFFSET(variable, vlctype, offset) do {                       \
+    if ((vlc = get_ ## vlctype(br)) == UVLC_ERROR) {                          \
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false); \
+      return DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE;                        \
+    }                                                                         \
+    (variable) = vlc + (offset);                                              \
+  } while (0) /* single statement: safe under an unbraced if/else; the caller supplies the ';' */
+#define READ_VLC(variable, vlctype)  READ_VLC_OFFSET(variable,vlctype,0)
+
+#define NUM_SAR_PRESETS 17
+
+static const uint16_t sar_presets[NUM_SAR_PRESETS+1][2] = { // read-only table of { sar_width, sar_height }, indexed by aspect_ratio_idc
+  { 0,0 },    //  0: no usable ratio, same 0/0 that read() stores for unknown codes
+  { 1,1 },    //  1
+  { 12,11 },  //  2
+  { 10,11 },  //  3
+  { 16,11 },  //  4
+  { 40,33 },  //  5
+  { 24,11 },  //  6
+  { 20,11 },  //  7
+  { 32,11 },  //  8
+  { 80,33 },  //  9
+  { 18,11 },  // 10
+  { 15,11 },  // 11
+  { 64,33 },  // 12
+  { 160,99 }, // 13
+  { 4,3 },    // 14
+  { 3,2 },    // 15
+  { 2,1 }     // 16
+}; // 17 rows are listed; row 17 is zero-initialised to { 0,0 }, so the "<= NUM_SAR_PRESETS" test in read() stays in bounds
+
+#define EXTENDED_SAR 255
+
+
+// Returns a printable name for a VideoFormat value.
+const char* get_video_format_name(enum VideoFormat format)
+{
+  if (format == VideoFormat_Component) return "component";
+  if (format == VideoFormat_PAL)       return "PAL";
+  if (format == VideoFormat_NTSC)      return "NTSC";
+  if (format == VideoFormat_SECAM)     return "SECAM";
+  if (format == VideoFormat_MAC)       return "MAC";
+
+  // everything else, including VideoFormat_Unspecified
+  return "unspecified";
+}
+
+video_usability_information::video_usability_information() // sets every field to its "nothing signalled" default
+{
+  aspect_ratio_info_present_flag = false;
+  sar_width  = 0; // 0/0 is also what read() stores when no usable aspect ratio is coded
+  sar_height = 0;
+
+
+  // --- overscan ---
+
+  overscan_info_present_flag = false;
+  overscan_appropriate_flag  = false;
+
+
+  // --- video signal type ---
+
+  video_signal_type_present_flag = false;
+  video_format = VideoFormat_Unspecified;
+  video_full_range_flag = false;
+  colour_description_present_flag = false;
+  colour_primaries = 2; // 2 is the value read() resets to before parsing and substitutes for rejected codes
+  transfer_characteristics = 2; // same convention as colour_primaries
+  matrix_coeffs = 2; // same convention as colour_primaries
+
+  // --- chroma / interlaced ---
+
+  chroma_loc_info_present_flag = false;
+  chroma_sample_loc_type_top_field    = 0;
+  chroma_sample_loc_type_bottom_field = 0;
+
+  neutral_chroma_indication_flag = false;
+  field_seq_flag = false;
+  frame_field_info_present_flag = false;
+
+  // --- default display window ---
+
+  default_display_window_flag = false;
+  def_disp_win_left_offset   = 0;
+  def_disp_win_right_offset  = 0;
+  def_disp_win_top_offset    = 0;
+  def_disp_win_bottom_offset = 0;
+
+
+  // --- timing ---
+
+  vui_timing_info_present_flag = false;
+  vui_num_units_in_tick = 0;
+  vui_time_scale = 0;
+
+  vui_poc_proportional_to_timing_flag = false;
+  vui_num_ticks_poc_diff_one = 1; // read() stores coded value + 1, so 1 corresponds to a coded 0
+
+
+  // --- hrd parameters ---
+
+  vui_hrd_parameters_present_flag = false;
+  //hrd_parameters vui_hrd_parameters;
+
+
+  // --- bitstream restriction ---
+
+  bitstream_restriction_flag = false;
+  tiles_fixed_structure_flag = false;
+  motion_vectors_over_pic_boundaries_flag = true; // the only flag in this constructor that defaults to true
+  restricted_ref_pic_lists_flag = false;
+  min_spatial_segmentation_idc = 0;
+  max_bytes_per_pic_denom   = 2; // NOTE(review): presumably the values the standard infers when the field is absent - confirm against H.265 Annex E
+  max_bits_per_min_cu_denom = 1;
+  log2_max_mv_length_horizontal = 15;
+  log2_max_mv_length_vertical   = 15;
+}
+
+
+de265_error video_usability_information::read(error_queue* errqueue, bitreader* br,
+                                              const seq_parameter_set* sps)
+{
+  int vlc;
+
+
+  // --- sample aspect ratio (SAR) ---
+
+  aspect_ratio_info_present_flag = get_bits(br,1);
+  if (aspect_ratio_info_present_flag) {
+    int aspect_ratio_idc = get_bits(br,8);
+    if (aspect_ratio_idc <= NUM_SAR_PRESETS) {
+      sar_width  = sar_presets[aspect_ratio_idc][0];
+      sar_height = sar_presets[aspect_ratio_idc][1];
+    }
+    else if (aspect_ratio_idc == EXTENDED_SAR) {
+      sar_width  = get_bits(br,16);
+      sar_height = get_bits(br,16);
+    }
+    else {
+      sar_width  = 0;
+      sar_height = 0;
+    }
+  }
+  else {
+    sar_width  = 0;
+    sar_height = 0;
+  }
+
+
+  // --- overscan ---
+
+  overscan_info_present_flag = get_bits(br,1);
+  if (overscan_info_present_flag) {
+    overscan_appropriate_flag = get_bits(br,1);
+  }
+
+
+  // --- video signal type ---
+
+  { // defaults
+    video_format = VideoFormat_Unspecified;
+    video_full_range_flag = false;
+    colour_primaries = 2;
+    transfer_characteristics = 2;
+    matrix_coeffs = 2;
+  }
+
+  video_signal_type_present_flag = get_bits(br,1);
+  if (video_signal_type_present_flag) {
+    int video_format_idc = get_bits(br,3);
+    if (video_format_idc > 5) {
+      video_format_idc = VideoFormat_Unspecified;
+    }
+    video_format = (VideoFormat)video_format_idc;
+
+    video_full_range_flag = get_bits(br,1);
+
+    colour_description_present_flag = get_bits(br,1);
+    if (colour_description_present_flag) {
+      colour_primaries = get_bits(br,8);
+      if (colour_primaries == 0 ||
+          colour_primaries == 3 ||
+          colour_primaries >= 11) {
+        colour_primaries = 2;
+      }
+
+      transfer_characteristics = get_bits(br,8);
+      if (transfer_characteristics == 0 ||
+          transfer_characteristics == 3 ||
+          transfer_characteristics >= 18) {
+        transfer_characteristics = 2;
+      }
+
+      matrix_coeffs = get_bits(br,8);
+      if (matrix_coeffs == 0 ||
+          matrix_coeffs >= 11) {
+        matrix_coeffs = 2;
+      }
+    }
+  }
+
+
+  // --- chroma / interlaced ---
+
+  chroma_loc_info_present_flag = get_bits(br,1);
+  if (chroma_loc_info_present_flag) {
+    READ_VLC(chroma_sample_loc_type_top_field,    uvlc);
+    READ_VLC(chroma_sample_loc_type_bottom_field, uvlc);
+  }
+  else {
+    chroma_sample_loc_type_top_field    = 0;
+    chroma_sample_loc_type_bottom_field = 0;
+  }
+
+  neutral_chroma_indication_flag = get_bits(br,1);
+  field_seq_flag                 = get_bits(br,1);
+  frame_field_info_present_flag  = get_bits(br,1);
+
+
+  // --- default display window ---
+
+  default_display_window_flag = get_bits(br,1);
+  if (default_display_window_flag) {
+    READ_VLC(def_disp_win_left_offset  ,uvlc);
+    READ_VLC(def_disp_win_right_offset ,uvlc);
+    READ_VLC(def_disp_win_top_offset   ,uvlc);
+    READ_VLC(def_disp_win_bottom_offset,uvlc);
+  }
+  else {
+    def_disp_win_left_offset  =0;
+    def_disp_win_right_offset =0;
+    def_disp_win_top_offset   =0;
+    def_disp_win_bottom_offset=0;
+  }
+
+
+  // --- timing ---
+
+  vui_timing_info_present_flag = get_bits(br,1);
+  if (vui_timing_info_present_flag) {
+    vui_num_units_in_tick = get_bits(br,32);
+    vui_time_scale        = get_bits(br,32);
+  }
+
+  vui_poc_proportional_to_timing_flag = get_bits(br,1);
+  READ_VLC_OFFSET(vui_num_ticks_poc_diff_one, uvlc, 1);
+
+
+  // --- hrd parameters ---
+
+  vui_hrd_parameters_present_flag = get_bits(br,1);
+  if (vui_hrd_parameters_present_flag) {
+    return DE265_ERROR_NOT_IMPLEMENTED_YET;
+    //hrd_parameters vui_hrd_parameters;
+  }
+
+
+  // --- bitstream restriction ---
+
+  bitstream_restriction_flag = get_bits(br,1);
+  if (bitstream_restriction_flag) {
+    tiles_fixed_structure_flag = get_bits(br,1);
+    motion_vectors_over_pic_boundaries_flag = get_bits(br,1);
+    restricted_ref_pic_lists_flag = get_bits(br,1);
+
+    READ_VLC(min_spatial_segmentation_idc, uvlc);
+    if (min_spatial_segmentation_idc > 4095) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      min_spatial_segmentation_idc = 0;
+    }
+
+    READ_VLC(max_bytes_per_pic_denom, uvlc);
+    if (max_bytes_per_pic_denom > 16) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      max_bytes_per_pic_denom = 2;
+    }
+
+    READ_VLC(max_bits_per_min_cu_denom, uvlc);
+    if (max_bits_per_min_cu_denom > 16) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      max_bits_per_min_cu_denom = 1;
+    }
+
+    READ_VLC(log2_max_mv_length_horizontal, uvlc);
+    if (log2_max_mv_length_horizontal > 15) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      log2_max_mv_length_horizontal = 15;
+    }
+
+    READ_VLC(log2_max_mv_length_vertical, uvlc);
+    if (log2_max_mv_length_vertical > 15) {
+      errqueue->add_warning(DE265_ERROR_CODED_PARAMETER_OUT_OF_RANGE, false);
+      log2_max_mv_length_vertical = 15;
+    }
+  }
+  else {
+    tiles_fixed_structure_flag = false;
+    motion_vectors_over_pic_boundaries_flag = true;
+    restricted_ref_pic_lists_flag = false; // NOTE: default not specified in standard 2014/10
+
+    min_spatial_segmentation_idc = 0;
+    max_bytes_per_pic_denom   = 2;
+    max_bits_per_min_cu_denom = 1;
+    log2_max_mv_length_horizontal = 15;
+    log2_max_mv_length_vertical   = 15;
+  }
+
+  //vui_read = true;
+
+  return DE265_OK;
+}
+
+
+void video_usability_information::dump(int fd) const
+{
+  // Writes every parsed VUI field as text; fd 1 selects stdout, fd 2
+  // selects stderr and any other fd makes the call a silent no-op.
+  // Members declared uint32_t are printed with %u so that values above
+  // INT_MAX do not show up as negative numbers (log2fh is assumed to
+  // take a printf-style format string -- TODO confirm in its header).
+
+  FILE* fh;
+  if (fd==1) fh=stdout;
+  else if (fd==2) fh=stderr;
+  else { return; }
+
+#define LOG0(t) log2fh(fh, t)
+#define LOG1(t,d) log2fh(fh, t,d)
+#define LOG2(t,d1,d2) log2fh(fh, t,d1,d2)
+#define LOG3(t,d1,d2,d3) log2fh(fh, t,d1,d2,d3)
+
+  LOG0("----------------- VUI -----------------\n");
+  LOG2("sample aspect ratio        : %d:%d\n", sar_width,sar_height);
+  LOG1("overscan_info_present_flag : %d\n", overscan_info_present_flag);
+  LOG1("overscan_appropriate_flag  : %d\n", overscan_appropriate_flag);
+
+  LOG1("video_signal_type_present_flag: %d\n", video_signal_type_present_flag);
+  if (video_signal_type_present_flag) {
+    LOG1("  video_format                : %s\n", get_video_format_name(video_format));
+    LOG1("  video_full_range_flag       : %d\n", video_full_range_flag);
+    LOG1("  colour_description_present_flag : %d\n", colour_description_present_flag);
+    LOG1("  colour_primaries            : %d\n", colour_primaries);
+    LOG1("  transfer_characteristics    : %d\n", transfer_characteristics);
+    LOG1("  matrix_coeffs               : %d\n", matrix_coeffs);
+  }
+
+  LOG1("chroma_loc_info_present_flag: %d\n", chroma_loc_info_present_flag);
+  if (chroma_loc_info_present_flag) {
+    LOG1("  chroma_sample_loc_type_top_field   : %d\n", chroma_sample_loc_type_top_field);
+    LOG1("  chroma_sample_loc_type_bottom_field: %d\n", chroma_sample_loc_type_bottom_field);
+  }
+
+  LOG1("neutral_chroma_indication_flag: %d\n", neutral_chroma_indication_flag);
+  LOG1("field_seq_flag                : %d\n", field_seq_flag);
+  LOG1("frame_field_info_present_flag : %d\n", frame_field_info_present_flag);
+
+  LOG1("default_display_window_flag   : %d\n", default_display_window_flag);
+  LOG1("  def_disp_win_left_offset    : %u\n", def_disp_win_left_offset);
+  LOG1("  def_disp_win_right_offset   : %u\n", def_disp_win_right_offset);
+  LOG1("  def_disp_win_top_offset     : %u\n", def_disp_win_top_offset);
+  LOG1("  def_disp_win_bottom_offset  : %u\n", def_disp_win_bottom_offset);
+
+  LOG1("vui_timing_info_present_flag  : %d\n", vui_timing_info_present_flag);
+  if (vui_timing_info_present_flag) {
+    LOG1("  vui_num_units_in_tick       : %u\n", vui_num_units_in_tick);
+    LOG1("  vui_time_scale              : %u\n", vui_time_scale);
+  }
+
+  LOG1("vui_poc_proportional_to_timing_flag : %d\n", vui_poc_proportional_to_timing_flag);
+  LOG1("vui_num_ticks_poc_diff_one          : %u\n", vui_num_ticks_poc_diff_one);
+
+  LOG1("vui_hrd_parameters_present_flag : %d\n", vui_hrd_parameters_present_flag);
+  if (vui_hrd_parameters_present_flag) {
+    //hrd_parameters vui_hrd_parameters;   (no hrd member exists yet, nothing to print)
+  }
+
+
+  // --- bitstream restriction ---
+
+  LOG1("bitstream_restriction_flag         : %d\n", bitstream_restriction_flag);
+  if (bitstream_restriction_flag) {
+    LOG1("  tiles_fixed_structure_flag       : %d\n", tiles_fixed_structure_flag);
+    LOG1("  motion_vectors_over_pic_boundaries_flag : %d\n", motion_vectors_over_pic_boundaries_flag);
+    LOG1("  restricted_ref_pic_lists_flag    : %d\n", restricted_ref_pic_lists_flag);
+    LOG1("  min_spatial_segmentation_idc     : %d\n", min_spatial_segmentation_idc);
+    LOG1("  max_bytes_per_pic_denom          : %d\n", max_bytes_per_pic_denom);
+    LOG1("  max_bits_per_min_cu_denom        : %d\n", max_bits_per_min_cu_denom);
+    LOG1("  log2_max_mv_length_horizontal    : %d\n", log2_max_mv_length_horizontal);
+    LOG1("  log2_max_mv_length_vertical      : %d\n", log2_max_mv_length_vertical);
+  }
+
+  // The LOGn helpers are only meant for this function, so drop them again.
+#undef LOG0
+#undef LOG1
+#undef LOG2
+#undef LOG3
+}
diff --git a/libde265/vui.h b/libde265/vui.h
new file mode 100644
index 0000000..c412669
--- /dev/null
+++ b/libde265/vui.h
@@ -0,0 +1,126 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, either version 3 of
+ * the License, or (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DE265_VUI_H
+#define DE265_VUI_H
+
+#include "libde265/de265.h"
+#include "libde265/bitstream.h"
+
+#include <vector>
+
+class error_queue;
+class seq_parameter_set;
+
+
+enum VideoFormat {  // values held by video_usability_information::video_format
+  VideoFormat_Component = 0,
+  VideoFormat_PAL   = 1,
+  VideoFormat_NTSC  = 2,
+  VideoFormat_SECAM = 3,
+  VideoFormat_MAC   = 4,
+  VideoFormat_Unspecified = 5  // default in read(); coded values above 5 are mapped to this too
+};
+
+const char* get_video_format_name(enum VideoFormat);  // returns a C string; dump() prints it with %s
+
+
+class video_usability_information  // VUI syntax elements as filled in by read() and printed by dump()
+{
+ public:
+  video_usability_information();
+
+  de265_error read(error_queue*, bitreader*, const seq_parameter_set*);  // DE265_OK on success; DE265_ERROR_NOT_IMPLEMENTED_YET if hrd parameters are signalled
+  void dump(int fd) const;  // fd 1 = stdout, fd 2 = stderr, any other fd prints nothing
+
+
+  // --- sample aspect ratio (SAR) ---
+
+  bool     aspect_ratio_info_present_flag;
+  uint16_t sar_width;  // sar_width and sar_height are zero if unspecified
+  uint16_t sar_height; // both come from the preset table or, for EXTENDED_SAR, from two 16-bit fields
+
+
+  // --- overscan ---
+
+  bool     overscan_info_present_flag;
+  bool     overscan_appropriate_flag;  // read() assigns this only when overscan_info_present_flag is set
+
+
+  // --- video signal type ---
+
+  bool     video_signal_type_present_flag;
+  enum VideoFormat  video_format;             // VideoFormat_Unspecified when not signalled
+  bool     video_full_range_flag;             // false when not signalled
+  bool     colour_description_present_flag;   // read() assigns this only when video_signal_type_present_flag is set
+  uint8_t  colour_primaries;                  // 2 when not signalled; read() also maps 0, 3 and >=11 to 2
+  uint8_t  transfer_characteristics;          // 2 when not signalled; read() also maps 0, 3 and >=18 to 2
+  uint8_t  matrix_coeffs;                     // 2 when not signalled; read() also maps 0 and >=11 to 2
+
+  // --- chroma / interlaced ---
+
+  bool     chroma_loc_info_present_flag;
+  uint8_t  chroma_sample_loc_type_top_field;     // 0 when chroma_loc_info_present_flag is not set
+  uint8_t  chroma_sample_loc_type_bottom_field;  // 0 when chroma_loc_info_present_flag is not set
+
+  bool     neutral_chroma_indication_flag;
+  bool     field_seq_flag;
+  bool     frame_field_info_present_flag;
+
+  // --- default display window ---
+
+  bool     default_display_window_flag;
+  uint32_t def_disp_win_left_offset;    // all four offsets are 0 when default_display_window_flag is not set
+  uint32_t def_disp_win_right_offset;
+  uint32_t def_disp_win_top_offset;
+  uint32_t def_disp_win_bottom_offset;
+
+
+  // --- timing ---
+
+  bool     vui_timing_info_present_flag;
+  uint32_t vui_num_units_in_tick;  // 32-bit field; read() assigns it only when vui_timing_info_present_flag is set
+  uint32_t vui_time_scale;         // 32-bit field; read() assigns it only when vui_timing_info_present_flag is set
+
+  bool     vui_poc_proportional_to_timing_flag;
+  uint32_t vui_num_ticks_poc_diff_one;  // read via READ_VLC_OFFSET(..., uvlc, 1); presumably coded value plus 1 -- confirm against the macro
+
+
+  // --- hrd parameters ---
+
+  bool     vui_hrd_parameters_present_flag;  // when set, read() stops with DE265_ERROR_NOT_IMPLEMENTED_YET
+  //hrd_parameters vui_hrd_parameters;
+
+
+  // --- bitstream restriction ---
+
+  bool bitstream_restriction_flag;
+  bool tiles_fixed_structure_flag;               // false when bitstream_restriction_flag is not set
+  bool motion_vectors_over_pic_boundaries_flag;  // true when bitstream_restriction_flag is not set
+  bool restricted_ref_pic_lists_flag;            // false when bitstream_restriction_flag is not set
+  uint16_t min_spatial_segmentation_idc;         // default 0; coded values above 4095 are reset to 0 with a warning
+  uint8_t  max_bytes_per_pic_denom;              // default 2; coded values above 16 are reset to 2 with a warning
+  uint8_t  max_bits_per_min_cu_denom;            // default 1; coded values above 16 are reset to 1 with a warning
+  uint8_t  log2_max_mv_length_horizontal;        // default 15; coded values above 15 are clamped to 15 with a warning
+  uint8_t  log2_max_mv_length_vertical;          // default 15; coded values above 15 are clamped to 15 with a warning
+};
+
+
+#endif
diff --git a/libde265/x86/CMakeLists.txt b/libde265/x86/CMakeLists.txt
new file mode 100644
index 0000000..dd80a7f
--- /dev/null
+++ b/libde265/x86/CMakeLists.txt
@@ -0,0 +1,24 @@
+# x86 is built from sse.cc/sse.h; x86_sse is built from the SSE kernels,
+# which are the only sources that get SSE-specific compiler flags.
+set (x86_sources sse.cc sse.h)
+set (x86_sse_sources sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc)
+
+add_library(x86 STATIC ${x86_sources})
+add_library(x86_sse STATIC ${x86_sse_sources})
+target_link_libraries(x86 x86_sse)
+
+# -msse4.1 is applied on every non-MSVC build, like the unconditional
+# -msse4.1 of the autotools build; it must not hinge on the x86_64 check.
+set(sse_flags "")
+if(NOT MSVC)
+  set(sse_flags "${sse_flags} -msse4.1")
+endif()
+
+# As before, -fPIC is only added when the processor is reported as x86_64.
+set(pic_flags "")
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+  set(pic_flags "-fPIC")
+endif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+
+set_target_properties(x86 PROPERTIES COMPILE_FLAGS "${pic_flags}")
+set_target_properties(x86_sse PROPERTIES COMPILE_FLAGS "${pic_flags} ${sse_flags}")
diff --git a/libde265/x86/Makefile.am b/libde265/x86/Makefile.am
index 836aaea..55b3898 100644
--- a/libde265/x86/Makefile.am
+++ b/libde265/x86/Makefile.am
@@ -18,3 +18,5 @@ if HAVE_VISIBILITY
  libde265_x86_sse_la_CXXFLAGS += -DHAVE_VISIBILITY
 endif
 
+EXTRA_DIST = \
+  CMakeLists.txt
diff --git a/libde265/x86/Makefile.in b/libde265/x86/Makefile.in
index c982db6..56354ca 100644
--- a/libde265/x86/Makefile.in
+++ b/libde265/x86/Makefile.in
@@ -86,11 +86,12 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -198,6 +199,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +208,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +232,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -348,6 +352,9 @@ libde265_x86_la_LIBADD = libde265_x86_sse.la
 libde265_x86_sse_la_CXXFLAGS = -msse4.1 -I.. $(CFLAG_VISIBILITY) \
 	$(am__append_2)
 libde265_x86_sse_la_SOURCES = sse-motion.cc sse-motion.h sse-dct.h sse-dct.cc
+EXTRA_DIST = \
+  CMakeLists.txt
+
 all: all-am
 
 .SUFFIXES:
diff --git a/libde265/x86/sse-dct.cc b/libde265/x86/sse-dct.cc
index d003bf1..3a9b7ba 100644
--- a/libde265/x86/sse-dct.cc
+++ b/libde265/x86/sse-dct.cc
@@ -282,7 +282,7 @@ ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
 #define add_1st (1 << (shift_1st - 1))
 
 
-void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
+void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
 {
     uint8_t *dst = (uint8_t*)_dst;
     ptrdiff_t stride = _stride;
@@ -342,7 +342,7 @@ void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _str
 
 
 #if HAVE_SSE4_1
-void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
                                            ptrdiff_t _stride) {
 
     uint8_t shift_2nd = 12; // 20 - Bit depth
@@ -350,7 +350,7 @@ void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t stride = _stride;
-    int16_t *src = coeffs;
+    const int16_t *src = coeffs;
     __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
             m128iD;
     m128iAdd = _mm_set1_epi32(64);
@@ -496,7 +496,7 @@ void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 #endif // SSE4.1
 
 #if 0
-void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     int i,j;
     uint8_t shift_2nd = 10; // 20 - Bit depth
@@ -623,14 +623,14 @@ void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
 
 
 #if HAVE_SSE4_1
-void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     uint8_t shift_2nd = 12; // 20 - Bit depth
     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
 
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t stride = _stride;
-    int16_t *src = coeffs;
+    const int16_t *src = coeffs;
 
     __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
     S0 = _mm_load_si128((__m128i *) (src));
@@ -744,7 +744,7 @@ void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 #endif
 
 #if 0
-void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     int i;
     uint8_t shift_2nd = 10; // 20 - Bit depth
@@ -839,14 +839,14 @@ void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
 #endif
 
 #if HAVE_SSE4_1
-void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     uint8_t shift_2nd = 12; // 20 - Bit depth
     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
 
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t stride = _stride / sizeof(uint8_t);
-    int16_t *src = coeffs;
+    const int16_t *src = coeffs;
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
             m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
             E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
@@ -1174,7 +1174,7 @@ void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 #endif
 
 #if 0
-void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     int i;
     uint16_t *dst = (uint16_t*) _dst;
@@ -1489,14 +1489,14 @@ void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
 
 
 #if HAVE_SSE4_1
-void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     uint8_t shift_2nd = 12; // 20 - Bit depth
     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
     int i;
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t stride = _stride / sizeof(uint8_t);
-    int16_t *src = coeffs;
+    const int16_t *src = coeffs;
     int32_t shift;
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
@@ -2246,7 +2246,7 @@ void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 
 
 #if 0
-void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     int i;
     uint16_t *dst = (uint16_t*) _dst;
@@ -2909,7 +2909,7 @@ void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
 
 
 #if HAVE_SSE4_1
-void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     uint8_t shift_2nd = 12; // 20 - Bit depth
     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
@@ -2917,7 +2917,7 @@ void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t stride = _stride / sizeof(uint8_t);
     int shift;
-    int16_t *src = coeffs;
+    const int16_t *src = coeffs;
 
     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
@@ -5201,7 +5201,7 @@ void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, int16_t *coeffs,
 
 
 #if 0
-void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, int16_t *coeffs,
+void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
         ptrdiff_t _stride) {
     int i, j;
     uint16_t *dst = (uint16_t*) _dst;
diff --git a/libde265/x86/sse-dct.h b/libde265/x86/sse-dct.h
index f49b419..bc50ade 100644
--- a/libde265/x86/sse-dct.h
+++ b/libde265/x86/sse-dct.h
@@ -25,11 +25,11 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
-void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_4x4_add_8_sse4(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_8x8_add_8_sse4(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_16x16_add_8_sse4(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-void ff_hevc_transform_32x32_add_8_sse4(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride);
+void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_4x4_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_8x8_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_16x16_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_transform_32x32_add_8_sse4(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride);
 
 #endif
diff --git a/libde265/x86/sse-motion.cc b/libde265/x86/sse-motion.cc
index c76a003..3a929f6 100644
--- a/libde265/x86/sse-motion.cc
+++ b/libde265/x86/sse-motion.cc
@@ -86,7 +86,8 @@ void printm32(const char* prefix, unsigned char* p)
 #define BIT_DEPTH 8
 
 void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
-        int16_t *src, ptrdiff_t srcstride, int width, int height) {
+                                       const int16_t *src, ptrdiff_t srcstride,
+                                       int width, int height) {
     int x, y;
     uint8_t *dst = (uint8_t*) _dst;
     __m128i r0, r1, f0;
@@ -169,7 +170,8 @@ void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
 }
 
 void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
-        int16_t *src, ptrdiff_t srcstride, int width, int height) {
+                                     const int16_t *src, ptrdiff_t srcstride,
+                                     int width, int height) {
     int x, y;
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t dststride = _dststride / sizeof(uint8_t);
@@ -203,8 +205,9 @@ void ff_hevc_put_unweighted_pred_sse(uint8_t *_dst, ptrdiff_t _dststride,
 }
 
 void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
-        int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
-        int height) {
+                                         const int16_t *src1, const int16_t *src2,
+                                         ptrdiff_t srcstride, int width,
+                                         int height) {
     int x, y;
     uint8_t *dst = (uint8_t*) _dst;
     __m128i r0, r1, f0, r2, r3;
@@ -307,8 +310,9 @@ void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
 }
 
 void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
-        int16_t *src1, int16_t *src2, ptrdiff_t srcstride, int width,
-        int height) {
+                                       const int16_t *src1, const int16_t *src2,
+                                       ptrdiff_t srcstride, int width,
+                                       int height) {
     int x, y;
     uint8_t *dst = (uint8_t*) _dst;
     ptrdiff_t dststride = _dststride / sizeof(uint8_t);
@@ -346,8 +350,9 @@ void ff_hevc_put_weighted_pred_avg_sse(uint8_t *_dst, ptrdiff_t _dststride,
 
 #if 0
 void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-        uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
-        int width, int height) {
+                                  uint8_t *_dst, ptrdiff_t _dststride,
+                                  const int16_t *src, ptrdiff_t srcstride,
+                                  int width, int height) {
 
     int log2Wd;
     int x, y;
@@ -587,8 +592,9 @@ void ff_hevc_weighted_pred_8_sse4(uint8_t denom, int16_t wlxFlag, int16_t olxFla
 
 #if 0
 void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
-        uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride,
-        int width, int height) {
+                               uint8_t *_dst, ptrdiff_t _dststride,
+                               const int16_t *src, ptrdiff_t srcstride,
+                               int width, int height) {
 
     int log2Wd;
     int x, y;
@@ -672,9 +678,10 @@ void ff_hevc_weighted_pred_sse(uint8_t denom, int16_t wlxFlag, int16_t olxFlag,
 
 #if HAVE_SSE4_1
 void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
-        int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
-        ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
-        int width, int height) {
+                                      int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag,
+                                      uint8_t *_dst, ptrdiff_t _dststride,
+                                      const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
+                                      int width, int height) {
     int shift, shift2;
     int log2Wd;
     int o0;
@@ -869,7 +876,7 @@ void ff_hevc_weighted_pred_avg_8_sse4(uint8_t denom, int16_t wl0Flag,
 #if 0
 void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
         int16_t wl1Flag, int16_t ol0Flag, int16_t ol1Flag, uint8_t *_dst,
-        ptrdiff_t _dststride, int16_t *src1, int16_t *src2, ptrdiff_t srcstride,
+                                   ptrdiff_t _dststride, const int16_t *src1, const int16_t *src2, ptrdiff_t srcstride,
         int width, int height) {
     int shift, shift2;
     int log2Wd;
@@ -943,8 +950,9 @@ void ff_hevc_weighted_pred_avg_sse(uint8_t denom, int16_t wl0Flag,
 
 
 void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t srcstride, int width, int height, int mx,
-        int my, int16_t* mcbuffer) {
+                                        const uint8_t *_src, ptrdiff_t srcstride,
+                                        int width, int height, int mx,
+                                        int my, int16_t* mcbuffer) {
     int x, y;
     __m128i x1, x2,x3;
     uint8_t *src = (uint8_t*) _src;
@@ -1020,8 +1028,9 @@ void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #ifndef __native_client__
 void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
-        int my, int16_t* mcbuffer) {
+                                         const uint8_t *_src, ptrdiff_t _srcstride,
+                                         int width, int height, int mx,
+                                         int my, int16_t* mcbuffer) {
     int x, y;
     __m128i x2;
     uint16_t *src = (uint16_t*) _src;
@@ -1071,10 +1080,11 @@ void ff_hevc_put_hevc_epel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
 #endif
 
 void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
-        int my, int16_t* mcbuffer) {
+                                   const uint8_t *_src, ptrdiff_t _srcstride,
+                                   int width, int height, int mx,
+                                   int my, int16_t* mcbuffer, int bit_depth) {
     int x, y;
-    uint8_t *src = (uint8_t*) _src;
+    const uint8_t *src = (const uint8_t*) _src;
     ptrdiff_t srcstride = _srcstride;
     const int8_t *filter = epel_filters[mx - 1];
     __m128i r0, bshuffle1, bshuffle2, x1, x2, x3;
@@ -1168,8 +1178,9 @@ void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #ifndef __native_client__
 void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
-        int my, int16_t* mcbuffer) {
+                                    const uint8_t *_src, ptrdiff_t _srcstride,
+                                    int width, int height, int mx,
+                                    int my, int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*) _src;
     ptrdiff_t srcstride = _srcstride>>1;
@@ -1230,8 +1241,8 @@ void ff_hevc_put_hevc_epel_h_10_sse(int16_t *dst, ptrdiff_t dststride,
 
 
 void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
-        int my, int16_t* mcbuffer) {
+                                   const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
+                                   int my, int16_t* mcbuffer, int bit_depth) {
     int x, y;
     __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1;
     uint8_t *src = (uint8_t*) _src;
@@ -1369,7 +1380,7 @@ void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #ifndef __native_client__
 void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
+                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
         int my, int16_t* mcbuffer) {
     int x, y;
     __m128i x0, x1, x2, x3, t0, t1, t2, t3, r0, f0, f1, f2, f3, r1, r2, r3;
@@ -1536,8 +1547,8 @@ void ff_hevc_put_hevc_epel_v_10_sse(int16_t *dst, ptrdiff_t dststride,
 #endif
 
 void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
-		uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
-		int my, int16_t* mcbuffer) {
+                                    const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
+                                    int my, int16_t* mcbuffer, int bit_depth) {
 	int x, y;
 	uint8_t *src = (uint8_t*) _src;
 	ptrdiff_t srcstride = _srcstride;
@@ -1763,7 +1774,7 @@ void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #ifndef __native_client__
 void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height, int mx,
         int my, int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*) _src;
@@ -1925,7 +1936,7 @@ void ff_hevc_put_hevc_epel_hv_10_sse(int16_t *dst, ptrdiff_t dststride,
 #endif
 
 void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                        const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     __m128i x1, x2, x3, x0;
@@ -1999,7 +2010,7 @@ void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #ifndef __native_client__
 void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                         const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     __m128i x1, x2, x4;
@@ -2046,10 +2057,10 @@ void ff_hevc_put_hevc_qpel_pixels_10_sse(int16_t *dst, ptrdiff_t dststride,
 
 
 void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
-    uint8_t *src = _src;
+    const uint8_t *src = _src;
     ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
     __m128i x1, r0, x2, x3, x4, x5;
 
@@ -2145,7 +2156,7 @@ void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
  * @TODO : Valgrind to see if it's useful to use SSE or wait for AVX2 implementation
  */
 void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*)_src;
@@ -2179,10 +2190,10 @@ void ff_hevc_put_hevc_qpel_h_1_10_sse(int16_t *dst, ptrdiff_t dststride,
 
 
 void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
-    uint8_t *src = _src;
+    const uint8_t *src = _src;
     ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
     __m128i x1, r0, x2, x3, x4, x5;
 
@@ -2249,7 +2260,7 @@ void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #if 0
 static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = _src;
@@ -2317,7 +2328,7 @@ static void ff_hevc_put_hevc_qpel_h_2_sse(int16_t *dst, ptrdiff_t dststride,
 
 }
 static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                          const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = _src;
@@ -2384,10 +2395,10 @@ static void ff_hevc_put_hevc_qpel_h_3_sse(int16_t *dst, ptrdiff_t dststride,
 #endif
 
 void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
-    uint8_t *src = _src;
+    const uint8_t *src = _src;
     ptrdiff_t srcstride = _srcstride / sizeof(uint8_t);
     __m128i x1, r0, x2, x3, x4, x5;
 
@@ -2452,7 +2463,7 @@ void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
 
  */
 void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -2611,7 +2622,7 @@ void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #if 0
 void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*) _src;
@@ -2688,7 +2699,7 @@ void ff_hevc_put_hevc_qpel_v_1_10_sse4(int16_t *dst, ptrdiff_t dststride,
 
 
 void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -2863,7 +2874,7 @@ void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #if 0
 void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*) _src;
@@ -2954,7 +2965,7 @@ void ff_hevc_put_hevc_qpel_v_2_10_sse(int16_t *dst, ptrdiff_t dststride,
 
 #if 0
 static  void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                           const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -3121,7 +3132,7 @@ static  void ff_hevc_put_hevc_qpel_v_3_sse(int16_t *dst, ptrdiff_t dststride,
 #endif
 
 void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                     const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -3276,7 +3287,7 @@ void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
 
 #if 0
 void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                      const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint16_t *src = (uint16_t*) _src;
@@ -3352,7 +3363,7 @@ void ff_hevc_put_hevc_qpel_v_3_10_sse(int16_t *dst, ptrdiff_t dststride,
 
 
 void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t* src = (uint8_t*) _src;
@@ -3524,7 +3535,7 @@ void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -3707,7 +3718,7 @@ void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -3884,7 +3895,7 @@ void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -4055,7 +4066,7 @@ void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -4238,7 +4249,7 @@ void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -4416,7 +4427,7 @@ void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -4589,7 +4600,7 @@ void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
@@ -4775,7 +4786,7 @@ void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
     }
 }
 void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-        uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
+                                       const uint8_t *_src, ptrdiff_t _srcstride, int width, int height,
         int16_t* mcbuffer) {
     int x, y;
     uint8_t *src = (uint8_t*) _src;
diff --git a/libde265/x86/sse-motion.h b/libde265/x86/sse-motion.h
index be2157f..5950ff4 100644
--- a/libde265/x86/sse-motion.h
+++ b/libde265/x86/sse-motion.h
@@ -27,78 +27,78 @@
 
 
 void ff_hevc_put_unweighted_pred_8_sse(uint8_t *_dst, ptrdiff_t dststride,
-                                        int16_t *src, ptrdiff_t srcstride,
-                                        int width, int height);
+                                       const int16_t *src, ptrdiff_t srcstride,
+                                       int width, int height);
 
 void ff_hevc_put_weighted_pred_avg_8_sse(uint8_t *_dst, ptrdiff_t dststride,
-                                 int16_t *src1, int16_t *src2,
-                                 ptrdiff_t srcstride, int width,
-                                 int height);
+                                         const int16_t *src1, const int16_t *src2,
+                                         ptrdiff_t srcstride, int width,
+                                         int height);
 
 void ff_hevc_put_hevc_epel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                        uint8_t *_src, ptrdiff_t srcstride,
+                                        const uint8_t *_src, ptrdiff_t srcstride,
                                         int width, int height,
                                         int mx, int my, int16_t* mcbuffer);
 void ff_hevc_put_hevc_epel_h_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                   uint8_t *_src, ptrdiff_t srcstride,
+                                   const uint8_t *_src, ptrdiff_t srcstride,
                                    int width, int height,
-                                   int mx, int my, int16_t* mcbuffer);
+                                   int mx, int my, int16_t* mcbuffer, int bit_depth);
 void ff_hevc_put_hevc_epel_v_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                   uint8_t *_src, ptrdiff_t srcstride,
+                                   const uint8_t *_src, ptrdiff_t srcstride,
                                    int width, int height,
-                                   int mx, int my, int16_t* mcbuffer);
+                                   int mx, int my, int16_t* mcbuffer, int bit_depth);
 void ff_hevc_put_hevc_epel_hv_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                    uint8_t *_src, ptrdiff_t srcstride,
+                                    const uint8_t *_src, ptrdiff_t srcstride,
                                     int width, int height,
-                                    int mx, int my, int16_t* mcbuffer);
+                                    int mx, int my, int16_t* mcbuffer, int bit_depth);
 
 void ff_hevc_put_hevc_qpel_pixels_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                        const uint8_t *src, ptrdiff_t srcstride,
+                                        int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_v_1_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                     const uint8_t *src, ptrdiff_t srcstride,
+                                     int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_v_2_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                     const uint8_t *src, ptrdiff_t srcstride,
+                                     int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_v_3_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                     const uint8_t *src, ptrdiff_t srcstride,
+                                     int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_1_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                     uint8_t *src, ptrdiff_t srcstride,
+                                     const uint8_t *src, ptrdiff_t srcstride,
                                      int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_1_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_1_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_1_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_2_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                     uint8_t *src, ptrdiff_t srcstride,
+                                     const uint8_t *src, ptrdiff_t srcstride,
                                      int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_2_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_2_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_2_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_3_8_sse(int16_t *dst, ptrdiff_t dststride,
-                                     uint8_t *src, ptrdiff_t srcstride,
+                                     const uint8_t *src, ptrdiff_t srcstride,
                                      int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_3_v_1_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_3_v_2_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 void ff_hevc_put_hevc_qpel_h_3_v_3_sse(int16_t *dst, ptrdiff_t dststride,
-                                         uint8_t *src, ptrdiff_t srcstride,
-                                         int width, int height, int16_t* mcbuffer);
+                                       const uint8_t *src, ptrdiff_t srcstride,
+                                       int width, int height, int16_t* mcbuffer);
 
 #endif
diff --git a/libde265/x86/sse.cc b/libde265/x86/sse.cc
index a6707d9..2ee0f8f 100644
--- a/libde265/x86/sse.cc
+++ b/libde265/x86/sse.cc
@@ -95,9 +95,9 @@ void init_acceleration_functions_sse(struct acceleration_functions* accel)
     //accel->transform_4x4_luma_add_8 = ff_hevc_transform_4x4_luma_add_8_sse4; // SSE-4 only TODO
     //accel->transform_4x4_add_8   = ff_hevc_transform_4x4_add_8_sse4;
 
-    accel->transform_8x8_add_8   = ff_hevc_transform_8x8_add_8_sse4;
-    accel->transform_16x16_add_8 = ff_hevc_transform_16x16_add_8_sse4;
-    accel->transform_32x32_add_8 = ff_hevc_transform_32x32_add_8_sse4;
+    accel->transform_add_8[1] = ff_hevc_transform_8x8_add_8_sse4;
+    accel->transform_add_8[2] = ff_hevc_transform_16x16_add_8_sse4;
+    accel->transform_add_8[3] = ff_hevc_transform_32x32_add_8_sse4;
   }
 #endif
 }
diff --git a/m4/ax_cxx_compile_stdcxx_11.m4 b/m4/ax_cxx_compile_stdcxx_11.m4
new file mode 100644
index 0000000..88895ad
--- /dev/null
+++ b/m4/ax_cxx_compile_stdcxx_11.m4
@@ -0,0 +1,142 @@
+# ============================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html
+# ============================================================================
+#
+# SYNOPSIS
+#
+#   AX_CXX_COMPILE_STDCXX_11([ext|noext],[mandatory|optional])
+#
+# DESCRIPTION
+#
+#   Check for baseline language coverage in the compiler for the C++11
+#   standard; if necessary, add switches to CXXFLAGS to enable support.
+#
+#   The first argument, if specified, indicates whether you insist on an
+#   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
+#   -std=c++11).  If neither is specified, you get whatever works, with
+#   preference for an extended mode.
+#
+#   The second argument, if specified 'mandatory' or if left unspecified,
+#   indicates that baseline C++11 support is required and that the macro
+#   should error out if no mode with that support is found.  If specified
+#   'optional', then configuration proceeds regardless, after defining
+#   HAVE_CXX11 if and only if a supporting mode is found.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Benjamin Kosnik <bkoz at redhat.com>
+#   Copyright (c) 2012 Zack Weinberg <zackw at panix.com>
+#   Copyright (c) 2013 Roy Stogner <roystgnr at ices.utexas.edu>
+#   Copyright (c) 2014 Alexey Sokolov <sokolov at google.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 4
+
+m4_define([_AX_CXX_COMPILE_STDCXX_11_testbody], [[
+  template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+    struct Base {
+    virtual void f() {}
+    };
+    struct Child : public Base {
+    virtual void f() {} // DiFa: override {}   # override not supported in gcc 4.6
+    };
+
+    typedef check<check<bool>> right_angle_brackets;
+
+    int a;
+    decltype(a) b;
+
+    typedef check<int> check_type;
+    check_type c;
+    check_type&& cr = static_cast<check_type&&>(c);
+
+    auto d = a;
+    auto l = [](){};
+]])
+
+AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [dnl
+  m4_if([$1], [], [],
+        [$1], [ext], [],
+        [$1], [noext], [],
+        [m4_fatal([invalid argument `$1' to AX_CXX_COMPILE_STDCXX_11])])dnl
+  m4_if([$2], [], [ax_cxx_compile_cxx11_required=true],
+        [$2], [mandatory], [ax_cxx_compile_cxx11_required=true],
+        [$2], [optional], [ax_cxx_compile_cxx11_required=false],
+        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX_11])])
+  AC_LANG_PUSH([C++])dnl
+  ac_success=no
+  AC_CACHE_CHECK(whether $CXX supports C++11 features by default,
+  ax_cv_cxx_compile_cxx11,
+  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+    [ax_cv_cxx_compile_cxx11=yes],
+    [ax_cv_cxx_compile_cxx11=no])])
+  if test x$ax_cv_cxx_compile_cxx11 = xyes; then
+    ac_success=yes
+  fi
+
+  m4_if([$1], [noext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=gnu++11 -std=gnu++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+
+  m4_if([$1], [ext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=c++11 -std=c++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx11_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++11 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_11_testbody])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+  AC_LANG_POP([C++])
+  if test x$ax_cxx_compile_cxx11_required = xtrue; then
+    if test x$ac_success = xno; then
+      AC_MSG_ERROR([*** A compiler with support for C++11 language features is required.])
+    fi
+  else
+    if test x$ac_success = xno; then
+      HAVE_CXX11=0
+      AC_MSG_NOTICE([No compiler with C++11 support was found])
+    else
+      HAVE_CXX11=1
+      AC_DEFINE(HAVE_CXX11,1,
+                [define if the compiler supports basic C++11 syntax])
+    fi
+
+    AC_SUBST(HAVE_CXX11)
+  fi
+])
diff --git a/m4/visibility.m4 b/m4/visibility.m4
deleted file mode 100644
index 2ff6330..0000000
--- a/m4/visibility.m4
+++ /dev/null
@@ -1,52 +0,0 @@
-# visibility.m4 serial 1 (gettext-0.15)
-dnl Copyright (C) 2005 Free Software Foundation, Inc.
-dnl This file is free software; the Free Software Foundation
-dnl gives unlimited permission to copy and/or distribute it,
-dnl with or without modifications, as long as this notice is preserved.
-
-dnl From Bruno Haible.
-
-dnl Tests whether the compiler supports the command-line option
-dnl -fvisibility=hidden and the function and variable attributes
-dnl __attribute__((__visibility__("hidden"))) and
-dnl __attribute__((__visibility__("default"))).
-dnl Does *not* test for __visibility__("protected") - which has tricky
-dnl semantics (see the 'vismain' test in glibc) and does not exist e.g. on
-dnl MacOS X.
-dnl Does *not* test for __visibility__("internal") - which has processor
-dnl dependent semantics.
-dnl Does *not* test for #pragma GCC visibility push(hidden) - which is
-dnl "really only recommended for legacy code".
-dnl Set the variable CFLAG_VISIBILITY.
-dnl Defines and sets the variable HAVE_VISIBILITY.
-
-AC_DEFUN([gl_VISIBILITY],
-[
-  AC_REQUIRE([AC_PROG_CC])
-  CFLAG_VISIBILITY=
-  HAVE_VISIBILITY=0
-  if test -n "$GCC"; then
-    AC_MSG_CHECKING([for simple visibility declarations])
-    AC_CACHE_VAL(gl_cv_cc_visibility, [
-      gl_save_CFLAGS="$CFLAGS"
-      CFLAGS="$CFLAGS -fvisibility=hidden"
-      AC_TRY_COMPILE(
-        [extern __attribute__((__visibility__("hidden"))) int hiddenvar;
-         extern __attribute__((__visibility__("default"))) int exportedvar;
-         extern __attribute__((__visibility__("hidden"))) int hiddenfunc (void);
-         extern __attribute__((__visibility__("default"))) int exportedfunc (void);],
-        [],
-        gl_cv_cc_visibility=yes,
-        gl_cv_cc_visibility=no)
-      CFLAGS="$gl_save_CFLAGS"])
-    AC_MSG_RESULT([$gl_cv_cc_visibility])
-    if test $gl_cv_cc_visibility = yes; then
-      CFLAG_VISIBILITY="-fvisibility=hidden"
-      HAVE_VISIBILITY=1
-    fi
-  fi
-  AC_SUBST([CFLAG_VISIBILITY])
-  AC_SUBST([HAVE_VISIBILITY])
-  AC_DEFINE_UNQUOTED([HAVE_VISIBILITY], [$HAVE_VISIBILITY],
-    [Define to 1 or 0, depending whether the compiler supports simple visibility declarations.])
-])
diff --git a/sherlock265/Makefile.in b/sherlock265/Makefile.in
index 3c4dec5..99d1c46 100644
--- a/sherlock265/Makefile.in
+++ b/sherlock265/Makefile.in
@@ -89,11 +89,12 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp COPYING README
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -198,6 +199,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +208,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +232,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
diff --git a/sherlock265/VideoDecoder.cc b/sherlock265/VideoDecoder.cc
index 560cdbe..a6d486c 100644
--- a/sherlock265/VideoDecoder.cc
+++ b/sherlock265/VideoDecoder.cc
@@ -33,7 +33,8 @@ using namespace videogfx;
 
 
 VideoDecoder::VideoDecoder()
-  : ctx(NULL),
+  : mFH(NULL),
+    ctx(NULL),
     img(NULL),
     mNextBuffer(0),
     mFrameCount(0),
@@ -41,16 +42,15 @@ VideoDecoder::VideoDecoder()
     mVideoEnded(false),
     mSingleStep(false),
     mShowDecodedImage(true),
+    mShowQuantPY(false),
     mCBShowPartitioning(false),
     mTBShowPartitioning(false),
     mPBShowPartitioning(false),
-    mShowPBPredMode(false),
     mShowIntraPredMode(false),
-    mShowQuantPY(false),
+    mShowPBPredMode(false),
     mShowMotionVec(false),
-    mShowSlices(false),
     mShowTiles(false),
-    mFH(NULL)
+    mShowSlices(false)
 #ifdef HAVE_SWSCALE
     , sws(NULL)
     , width(0)
diff --git a/tools/Makefile.am b/tools/Makefile.am
new file mode 100644
index 0000000..9879576
--- /dev/null
+++ b/tools/Makefile.am
@@ -0,0 +1,46 @@
+
+bin_PROGRAMS = gen-enc-table yuv-distortion rd-curves block-rate-estim tests bjoentegaard
+
+AM_CPPFLAGS = -I../libde265
+
+gen_enc_table_DEPENDENCIES = ../libde265/libde265.la
+gen_enc_table_CXXFLAGS =
+gen_enc_table_LDFLAGS =
+gen_enc_table_LDADD = ../libde265/libde265.la -lstdc++
+gen_enc_table_SOURCES = gen-entropy-table.cc
+
+yuv_distortion_DEPENDENCIES = ../libde265/libde265.la
+yuv_distortion_CXXFLAGS =
+yuv_distortion_LDFLAGS =
+yuv_distortion_LDADD = ../libde265/libde265.la -lstdc++
+yuv_distortion_SOURCES = yuv-distortion.cc
+
+if HAVE_VIDEOGFX
+  yuv_distortion_CXXFLAGS += $(VIDEOGFX_CFLAGS)
+  yuv_distortion_LDFLAGS += $(VIDEOGFX_LIBS)
+endif
+
+rd_curves_DEPENDENCIES = ../libde265/libde265.la
+rd_curves_CXXFLAGS =
+rd_curves_LDFLAGS =
+rd_curves_LDADD = ../libde265/libde265.la -lstdc++
+rd_curves_SOURCES = rd-curves.cc
+
+block_rate_estim_DEPENDENCIES = ../libde265/libde265.la
+block_rate_estim_CXXFLAGS =
+block_rate_estim_LDFLAGS =
+block_rate_estim_LDADD = ../libde265/libde265.la -lstdc++
+block_rate_estim_SOURCES = block-rate-estim.cc
+
+tests_DEPENDENCIES = ../libde265/libde265.la
+tests_CXXFLAGS =
+tests_LDFLAGS =
+tests_LDADD = ../libde265/libde265.la -lstdc++
+tests_SOURCES = tests.cc
+
+bjoentegaard_DEPENDENCIES = ../libde265/libde265.la
+bjoentegaard_CXXFLAGS =
+bjoentegaard_LDFLAGS =
+bjoentegaard_LDADD = ../libde265/libde265.la -lstdc++
+bjoentegaard_SOURCES = bjoentegaard.cc
+
diff --git a/sherlock265/Makefile.in b/tools/Makefile.in
similarity index 56%
copy from sherlock265/Makefile.in
copy to tools/Makefile.in
index 3c4dec5..33f7fa3 100644
--- a/sherlock265/Makefile.in
+++ b/tools/Makefile.in
@@ -79,21 +79,22 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 target_triplet = @target@
-bin_PROGRAMS = sherlock265$(EXEEXT)
+bin_PROGRAMS = gen-enc-table$(EXEEXT) yuv-distortion$(EXEEXT) \
+	rd-curves$(EXEEXT) block-rate-estim$(EXEEXT) tests$(EXEEXT) \
+	bjoentegaard$(EXEEXT)
 @HAVE_VIDEOGFX_TRUE@am__append_1 = $(VIDEOGFX_CFLAGS)
 @HAVE_VIDEOGFX_TRUE@am__append_2 = $(VIDEOGFX_LIBS)
-@HAVE_SWSCALE_TRUE@am__append_3 = $(SWSCALE_CFLAGS)
-@HAVE_SWSCALE_TRUE@am__append_4 = $(SWSCALE_LIBS)
-subdir = sherlock265
+subdir = tools
 DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
-	$(top_srcdir)/depcomp COPYING README
+	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_compare_version.m4 \
+	$(top_srcdir)/m4/ax_cxx_compile_stdcxx_11.m4 \
 	$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
 	$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
 	$(top_srcdir)/m4/lt~obsolete.m4 \
 	$(top_srcdir)/m4/m4_ax_check_compile_flag.m4 \
-	$(top_srcdir)/m4/visibility.m4 $(top_srcdir)/configure.ac
+	$(top_srcdir)/configure.ac
 am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 	$(ACLOCAL_M4)
 mkinstalldirs = $(install_sh) -d
@@ -102,22 +103,44 @@ CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 am__installdirs = "$(DESTDIR)$(bindir)"
 PROGRAMS = $(bin_PROGRAMS)
-am_sherlock265_OBJECTS = sherlock265-sherlock265.$(OBJEXT) \
-	sherlock265-VideoPlayer.$(OBJEXT) \
-	sherlock265-VideoDecoder.$(OBJEXT) \
-	sherlock265-VideoWidget.$(OBJEXT)
-nodist_sherlock265_OBJECTS = sherlock265-moc_VideoPlayer.$(OBJEXT) \
-	sherlock265-moc_VideoDecoder.$(OBJEXT) \
-	sherlock265-moc_VideoWidget.$(OBJEXT)
-sherlock265_OBJECTS = $(am_sherlock265_OBJECTS) \
-	$(nodist_sherlock265_OBJECTS)
+am_bjoentegaard_OBJECTS = bjoentegaard-bjoentegaard.$(OBJEXT)
+bjoentegaard_OBJECTS = $(am_bjoentegaard_OBJECTS)
 AM_V_lt = $(am__v_lt_@AM_V@)
 am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
 am__v_lt_0 = --silent
 am__v_lt_1 = 
-sherlock265_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(sherlock265_CXXFLAGS) \
-	$(CXXFLAGS) $(sherlock265_LDFLAGS) $(LDFLAGS) -o $@
+bjoentegaard_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(bjoentegaard_CXXFLAGS) \
+	$(CXXFLAGS) $(bjoentegaard_LDFLAGS) $(LDFLAGS) -o $@
+am_block_rate_estim_OBJECTS =  \
+	block_rate_estim-block-rate-estim.$(OBJEXT)
+block_rate_estim_OBJECTS = $(am_block_rate_estim_OBJECTS)
+block_rate_estim_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(block_rate_estim_CXXFLAGS) $(CXXFLAGS) \
+	$(block_rate_estim_LDFLAGS) $(LDFLAGS) -o $@
+am_gen_enc_table_OBJECTS = gen_enc_table-gen-entropy-table.$(OBJEXT)
+gen_enc_table_OBJECTS = $(am_gen_enc_table_OBJECTS)
+gen_enc_table_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(gen_enc_table_CXXFLAGS) $(CXXFLAGS) $(gen_enc_table_LDFLAGS) \
+	$(LDFLAGS) -o $@
+am_rd_curves_OBJECTS = rd_curves-rd-curves.$(OBJEXT)
+rd_curves_OBJECTS = $(am_rd_curves_OBJECTS)
+rd_curves_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rd_curves_CXXFLAGS) \
+	$(CXXFLAGS) $(rd_curves_LDFLAGS) $(LDFLAGS) -o $@
+am_tests_OBJECTS = tests-tests.$(OBJEXT)
+tests_OBJECTS = $(am_tests_OBJECTS)
+tests_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(tests_CXXFLAGS) \
+	$(CXXFLAGS) $(tests_LDFLAGS) $(LDFLAGS) -o $@
+am_yuv_distortion_OBJECTS = yuv_distortion-yuv-distortion.$(OBJEXT)
+yuv_distortion_OBJECTS = $(am_yuv_distortion_OBJECTS)
+yuv_distortion_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(yuv_distortion_CXXFLAGS) $(CXXFLAGS) \
+	$(yuv_distortion_LDFLAGS) $(LDFLAGS) -o $@
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -152,26 +175,12 @@ AM_V_CXXLD = $(am__v_CXXLD_@AM_V@)
 am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
 am__v_CXXLD_0 = @echo "  CXXLD   " $@;
 am__v_CXXLD_1 = 
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
-	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
-	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
-	$(AM_CFLAGS) $(CFLAGS)
-AM_V_CC = $(am__v_CC_@AM_V@)
-am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
-am__v_CC_0 = @echo "  CC      " $@;
-am__v_CC_1 = 
-CCLD = $(CC)
-LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
-	$(AM_LDFLAGS) $(LDFLAGS) -o $@
-AM_V_CCLD = $(am__v_CCLD_@AM_V@)
-am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
-am__v_CCLD_0 = @echo "  CCLD    " $@;
-am__v_CCLD_1 = 
-SOURCES = $(sherlock265_SOURCES) $(nodist_sherlock265_SOURCES)
-DIST_SOURCES = $(sherlock265_SOURCES)
+SOURCES = $(bjoentegaard_SOURCES) $(block_rate_estim_SOURCES) \
+	$(gen_enc_table_SOURCES) $(rd_curves_SOURCES) $(tests_SOURCES) \
+	$(yuv_distortion_SOURCES)
+DIST_SOURCES = $(bjoentegaard_SOURCES) $(block_rate_estim_SOURCES) \
+	$(gen_enc_table_SOURCES) $(rd_curves_SOURCES) $(tests_SOURCES) \
+	$(yuv_distortion_SOURCES)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -198,6 +207,7 @@ ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
+ALLOCA = @ALLOCA@
 AMTAR = @AMTAR@
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
@@ -206,9 +216,11 @@ AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASDEPMODE = @CCASDEPMODE@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
-CFLAG_VISIBILITY = @CFLAG_VISIBILITY@
 CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 CXX = @CXX@
@@ -228,7 +240,7 @@ EGREP = @EGREP@
 EXEEXT = @EXEEXT@
 FGREP = @FGREP@
 GREP = @GREP@
-HAVE_VISIBILITY = @HAVE_VISIBILITY@
+HAVE_CXX11 = @HAVE_CXX11@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -340,37 +352,40 @@ top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 AM_CPPFLAGS = -I../libde265
-sherlock265_DEPENDENCIES = ../libde265/libde265.la
-sherlock265_CXXFLAGS = $(QT_CFLAGS) -std=c++0x -fPIC $(am__append_1) \
-	$(am__append_3)
-sherlock265_LDFLAGS = $(QT_LIBS) $(am__append_2) $(am__append_4)
-sherlock265_LDADD = ../libde265/libde265.la -lstdc++ -lpthread
-sherlock265_SOURCES = \
-  sherlock265.cc \
-  VideoPlayer.cc \
-  VideoDecoder.cc \
-  VideoWidget.cc \
-  VideoPlayer.hh \
-  VideoDecoder.hh \
-  VideoWidget.hh
-
-nodist_sherlock265_SOURCES = \
-  moc_VideoPlayer.cpp \
-  moc_VideoDecoder.cpp \
-  moc_VideoWidget.cpp
-
-CLEANFILES = \
-  moc_VideoPlayer.cpp \
-  moc_VideoDecoder.cpp \
-  moc_VideoWidget.cpp
-
-EXTRA_DIST = \
-  README
-
+gen_enc_table_DEPENDENCIES = ../libde265/libde265.la
+gen_enc_table_CXXFLAGS = 
+gen_enc_table_LDFLAGS = 
+gen_enc_table_LDADD = ../libde265/libde265.la -lstdc++
+gen_enc_table_SOURCES = gen-entropy-table.cc
+yuv_distortion_DEPENDENCIES = ../libde265/libde265.la
+yuv_distortion_CXXFLAGS = $(am__append_1)
+yuv_distortion_LDFLAGS = $(am__append_2)
+yuv_distortion_LDADD = ../libde265/libde265.la -lstdc++
+yuv_distortion_SOURCES = yuv-distortion.cc
+rd_curves_DEPENDENCIES = ../libde265/libde265.la
+rd_curves_CXXFLAGS = 
+rd_curves_LDFLAGS = 
+rd_curves_LDADD = ../libde265/libde265.la -lstdc++
+rd_curves_SOURCES = rd-curves.cc
+block_rate_estim_DEPENDENCIES = ../libde265/libde265.la
+block_rate_estim_CXXFLAGS = 
+block_rate_estim_LDFLAGS = 
+block_rate_estim_LDADD = ../libde265/libde265.la -lstdc++
+block_rate_estim_SOURCES = block-rate-estim.cc
+tests_DEPENDENCIES = ../libde265/libde265.la
+tests_CXXFLAGS = 
+tests_LDFLAGS = 
+tests_LDADD = ../libde265/libde265.la -lstdc++
+tests_SOURCES = tests.cc
+bjoentegaard_DEPENDENCIES = ../libde265/libde265.la
+bjoentegaard_CXXFLAGS = 
+bjoentegaard_LDFLAGS = 
+bjoentegaard_LDADD = ../libde265/libde265.la -lstdc++
+bjoentegaard_SOURCES = bjoentegaard.cc
 all: all-am
 
 .SUFFIXES:
-.SUFFIXES: .cc .cpp .lo .o .obj
+.SUFFIXES: .cc .lo .o .obj
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
@@ -380,9 +395,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu sherlock265/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu tools/Makefile'; \
 	$(am__cd) $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu sherlock265/Makefile
+	  $(AUTOMAKE) --gnu tools/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -451,9 +466,29 @@ clean-binPROGRAMS:
 	echo " rm -f" $$list; \
 	rm -f $$list
 
-sherlock265$(EXEEXT): $(sherlock265_OBJECTS) $(sherlock265_DEPENDENCIES) $(EXTRA_sherlock265_DEPENDENCIES) 
-	@rm -f sherlock265$(EXEEXT)
-	$(AM_V_CXXLD)$(sherlock265_LINK) $(sherlock265_OBJECTS) $(sherlock265_LDADD) $(LIBS)
+bjoentegaard$(EXEEXT): $(bjoentegaard_OBJECTS) $(bjoentegaard_DEPENDENCIES) $(EXTRA_bjoentegaard_DEPENDENCIES) 
+	@rm -f bjoentegaard$(EXEEXT)
+	$(AM_V_CXXLD)$(bjoentegaard_LINK) $(bjoentegaard_OBJECTS) $(bjoentegaard_LDADD) $(LIBS)
+
+block-rate-estim$(EXEEXT): $(block_rate_estim_OBJECTS) $(block_rate_estim_DEPENDENCIES) $(EXTRA_block_rate_estim_DEPENDENCIES) 
+	@rm -f block-rate-estim$(EXEEXT)
+	$(AM_V_CXXLD)$(block_rate_estim_LINK) $(block_rate_estim_OBJECTS) $(block_rate_estim_LDADD) $(LIBS)
+
+gen-enc-table$(EXEEXT): $(gen_enc_table_OBJECTS) $(gen_enc_table_DEPENDENCIES) $(EXTRA_gen_enc_table_DEPENDENCIES) 
+	@rm -f gen-enc-table$(EXEEXT)
+	$(AM_V_CXXLD)$(gen_enc_table_LINK) $(gen_enc_table_OBJECTS) $(gen_enc_table_LDADD) $(LIBS)
+
+rd-curves$(EXEEXT): $(rd_curves_OBJECTS) $(rd_curves_DEPENDENCIES) $(EXTRA_rd_curves_DEPENDENCIES) 
+	@rm -f rd-curves$(EXEEXT)
+	$(AM_V_CXXLD)$(rd_curves_LINK) $(rd_curves_OBJECTS) $(rd_curves_LDADD) $(LIBS)
+
+tests$(EXEEXT): $(tests_OBJECTS) $(tests_DEPENDENCIES) $(EXTRA_tests_DEPENDENCIES) 
+	@rm -f tests$(EXEEXT)
+	$(AM_V_CXXLD)$(tests_LINK) $(tests_OBJECTS) $(tests_LDADD) $(LIBS)
+
+yuv-distortion$(EXEEXT): $(yuv_distortion_OBJECTS) $(yuv_distortion_DEPENDENCIES) $(EXTRA_yuv_distortion_DEPENDENCIES) 
+	@rm -f yuv-distortion$(EXEEXT)
+	$(AM_V_CXXLD)$(yuv_distortion_LINK) $(yuv_distortion_OBJECTS) $(yuv_distortion_LDADD) $(LIBS)
 
 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@@ -461,13 +496,12 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-VideoDecoder.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-VideoPlayer.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-VideoWidget.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-moc_VideoDecoder.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-moc_VideoPlayer.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-moc_VideoWidget.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sherlock265-sherlock265.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bjoentegaard-bjoentegaard.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/block_rate_estim-block-rate-estim.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen_enc_table-gen-entropy-table.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rd_curves-rd-curves.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tests-tests.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/yuv_distortion-yuv-distortion.Po@am__quote@
 
 .cc.o:
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@@ -490,124 +524,89 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
-sherlock265-sherlock265.o: sherlock265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-sherlock265.o -MD -MP -MF $(DEPDIR)/sherlock265-sherlock265.Tpo -c -o sherlock265-sherlock265.o `test -f 'sherlock265.cc' || echo '$(srcdir)/'`sherlock265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-sherlock265.Tpo $(DEPDIR)/sherlock265-sherlock265.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sherlock265.cc' object='sherlock265-sherlock265.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-sherlock265.o `test -f 'sherlock265.cc' || echo '$(srcdir)/'`sherlock265.cc
-
-sherlock265-sherlock265.obj: sherlock265.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-sherlock265.obj -MD -MP -MF $(DEPDIR)/sherlock265-sherlock265.Tpo -c -o sherlock265-sherlock265.obj `if test -f 'sherlock265.cc'; then $(CYGPATH_W) 'sherlock265.cc'; else $(CYGPATH_W) '$(srcdir)/sherlock265.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-sherlock265.Tpo $(DEPDIR)/sherlock265-sherlock265.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='sherlock265.cc' object='sherlock265-sherlock265.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-sherlock265.obj `if test -f 'sherlock265.cc'; then $(CYGPATH_W) 'sherlock265.cc'; else $(CYGPATH_W) '$(srcdir)/sherlock265.cc'; fi`
-
-sherlock265-VideoPlayer.o: VideoPlayer.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoPlayer.o -MD -MP -MF $(DEPDIR)/sherlock265-VideoPlayer.Tpo -c -o sherlock265-VideoPlayer.o `test -f 'VideoPlayer.cc' || echo '$(srcdir)/'`VideoPlayer.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoPlayer.Tpo $(DEPDIR)/sherlock265-VideoPlayer.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoPlayer.cc' object='sherlock265-VideoPlayer.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoPlayer.o `test -f 'VideoPlayer.cc' || echo '$(srcdir)/'`VideoPlayer.cc
-
-sherlock265-VideoPlayer.obj: VideoPlayer.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoPlayer.obj -MD -MP -MF $(DEPDIR)/sherlock265-VideoPlayer.Tpo -c -o sherlock265-VideoPlayer.obj `if test -f 'VideoPlayer.cc'; then $(CYGPATH_W) 'VideoPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/VideoPlayer.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoPlayer.Tpo $(DEPDIR)/sherlock265-VideoPlayer.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoPlayer.cc' object='sherlock265-VideoPlayer.obj' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoPlayer.obj `if test -f 'VideoPlayer.cc'; then $(CYGPATH_W) 'VideoPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/VideoPlayer.cc'; fi`
-
-sherlock265-VideoDecoder.o: VideoDecoder.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoDecoder.o -MD -MP -MF $(DEPDIR)/sherlock265-VideoDecoder.Tpo -c -o sherlock265-VideoDecoder.o `test -f 'VideoDecoder.cc' || echo '$(srcdir)/'`VideoDecoder.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoDecoder.Tpo $(DEPDIR)/sherlock265-VideoDecoder.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoDecoder.cc' object='sherlock265-VideoDecoder.o' libtool=no @AMDEPBACKSLASH@
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoDecoder.o `test -f 'VideoDecoder.cc' || echo '$(srcdir)/'`VideoDecoder.cc
-
-sherlock265-VideoDecoder.obj: VideoDecoder.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoDecoder.obj -MD -MP -MF $(DEPDIR)/sherlock265-VideoDecoder.Tpo -c -o sherlock265-VideoDecoder.obj `if test -f 'VideoDecoder.cc'; then $(CYGPATH_W) 'VideoDecoder.cc'; else $(CYGPATH_W) '$(srcdir)/VideoDecoder.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoDecoder.Tpo $(DEPDIR)/sherlock265-VideoDecoder.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoDecoder.cc' object='sherlock265-VideoDecoder.obj' libtool=no @AMDEPBACKSLASH@
+bjoentegaard-bjoentegaard.o: bjoentegaard.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bjoentegaard_CXXFLAGS) $(CXXFLAGS) -MT bjoentegaard-bjoentegaard.o -MD -MP -MF $(DEPDIR)/bjoentegaard-bjoentegaard.Tpo -c -o bjoentegaard-bjoentegaard.o `test -f 'bjoentegaard.cc' || echo '$(srcdir)/'`bjoentegaard.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/bjoentegaard-bjoentegaard.Tpo $(DEPDIR)/bjoentegaard-bjoentegaard.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='bjoentegaard.cc' object='bjoentegaard-bjoentegaard.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoDecoder.obj `if test -f 'VideoDecoder.cc'; then $(CYGPATH_W) 'VideoDecoder.cc'; else $(CYGPATH_W) '$(srcdir)/VideoDecoder.cc'; fi`
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bjoentegaard_CXXFLAGS) $(CXXFLAGS) -c -o bjoentegaard-bjoentegaard.o `test -f 'bjoentegaard.cc' || echo '$(srcdir)/'`bjoentegaard.cc
 
-sherlock265-VideoWidget.o: VideoWidget.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoWidget.o -MD -MP -MF $(DEPDIR)/sherlock265-VideoWidget.Tpo -c -o sherlock265-VideoWidget.o `test -f 'VideoWidget.cc' || echo '$(srcdir)/'`VideoWidget.cc
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoWidget.Tpo $(DEPDIR)/sherlock265-VideoWidget.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoWidget.cc' object='sherlock265-VideoWidget.o' libtool=no @AMDEPBACKSLASH@
+bjoentegaard-bjoentegaard.obj: bjoentegaard.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bjoentegaard_CXXFLAGS) $(CXXFLAGS) -MT bjoentegaard-bjoentegaard.obj -MD -MP -MF $(DEPDIR)/bjoentegaard-bjoentegaard.Tpo -c -o bjoentegaard-bjoentegaard.obj `if test -f 'bjoentegaard.cc'; then $(CYGPATH_W) 'bjoentegaard.cc'; else $(CYGPATH_W) '$(srcdir)/bjoentegaard.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/bjoentegaard-bjoentegaard.Tpo $(DEPDIR)/bjoentegaard-bjoentegaard.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='bjoentegaard.cc' object='bjoentegaard-bjoentegaard.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoWidget.o `test -f 'VideoWidget.cc' || echo '$(srcdir)/'`VideoWidget.cc
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(bjoentegaard_CXXFLAGS) $(CXXFLAGS) -c -o bjoentegaard-bjoentegaard.obj `if test -f 'bjoentegaard.cc'; then $(CYGPATH_W) 'bjoentegaard.cc'; else $(CYGPATH_W) '$(srcdir)/bjoentegaard.cc'; fi`
 
-sherlock265-VideoWidget.obj: VideoWidget.cc
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-VideoWidget.obj -MD -MP -MF $(DEPDIR)/sherlock265-VideoWidget.Tpo -c -o sherlock265-VideoWidget.obj `if test -f 'VideoWidget.cc'; then $(CYGPATH_W) 'VideoWidget.cc'; else $(CYGPATH_W) '$(srcdir)/VideoWidget.cc'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-VideoWidget.Tpo $(DEPDIR)/sherlock265-VideoWidget.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='VideoWidget.cc' object='sherlock265-VideoWidget.obj' libtool=no @AMDEPBACKSLASH@
+block_rate_estim-block-rate-estim.o: block-rate-estim.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(block_rate_estim_CXXFLAGS) $(CXXFLAGS) -MT block_rate_estim-block-rate-estim.o -MD -MP -MF $(DEPDIR)/block_rate_estim-block-rate-estim.Tpo -c -o block_rate_estim-block-rate-estim.o `test -f 'block-rate-estim.cc' || echo '$(srcdir)/'`block-rate-estim.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/block_rate_estim-block-rate-estim.Tpo $(DEPDIR)/block_rate_estim-block-rate-estim.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='block-rate-estim.cc' object='block_rate_estim-block-rate-estim.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-VideoWidget.obj `if test -f 'VideoWidget.cc'; then $(CYGPATH_W) 'VideoWidget.cc'; else $(CYGPATH_W) '$(srcdir)/VideoWidget.cc'; fi`
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(block_rate_estim_CXXFLAGS) $(CXXFLAGS) -c -o block_rate_estim-block-rate-estim.o `test -f 'block-rate-estim.cc' || echo '$(srcdir)/'`block-rate-estim.cc
 
-sherlock265-moc_VideoPlayer.o: moc_VideoPlayer.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoPlayer.o -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoPlayer.Tpo -c -o sherlock265-moc_VideoPlayer.o `test -f 'moc_VideoPlayer.cpp' || echo '$(srcdir)/'`moc_VideoPlayer.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoPlayer.Tpo $(DEPDIR)/sherlock265-moc_VideoPlayer.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoPlayer.cpp' object='sherlock265-moc_VideoPlayer.o' libtool=no @AMDEPBACKSLASH@
+block_rate_estim-block-rate-estim.obj: block-rate-estim.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(block_rate_estim_CXXFLAGS) $(CXXFLAGS) -MT block_rate_estim-block-rate-estim.obj -MD -MP -MF $(DEPDIR)/block_rate_estim-block-rate-estim.Tpo -c -o block_rate_estim-block-rate-estim.obj `if test -f 'block-rate-estim.cc'; then $(CYGPATH_W) 'block-rate-estim.cc'; else $(CYGPATH_W) '$(srcdir)/block-rate-estim.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/block_rate_estim-block-rate-estim.Tpo $(DEPDIR)/block_rate_estim-block-rate-estim.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='block-rate-estim.cc' object='block_rate_estim-block-rate-estim.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoPlayer.o `test -f 'moc_VideoPlayer.cpp' || echo '$(srcdir)/'`moc_VideoPlayer.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(block_rate_estim_CXXFLAGS) $(CXXFLAGS) -c -o block_rate_estim-block-rate-estim.obj `if test -f 'block-rate-estim.cc'; then $(CYGPATH_W) 'block-rate-estim.cc'; else $(CYGPATH_W) '$(srcdir)/block-rate-estim.cc'; fi`
 
-sherlock265-moc_VideoPlayer.obj: moc_VideoPlayer.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoPlayer.obj -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoPlayer.Tpo -c -o sherlock265-moc_VideoPlayer.obj `if test -f 'moc_VideoPlayer.cpp'; then $(CYGPATH_W) 'moc_VideoPlayer.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoPlayer.cpp'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoPlayer.Tpo $(DEPDIR)/sherlock265-moc_VideoPlayer.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoPlayer.cpp' object='sherlock265-moc_VideoPlayer.obj' libtool=no @AMDEPBACKSLASH@
+gen_enc_table-gen-entropy-table.o: gen-entropy-table.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gen_enc_table_CXXFLAGS) $(CXXFLAGS) -MT gen_enc_table-gen-entropy-table.o -MD -MP -MF $(DEPDIR)/gen_enc_table-gen-entropy-table.Tpo -c -o gen_enc_table-gen-entropy-table.o `test -f 'gen-entropy-table.cc' || echo '$(srcdir)/'`gen-entropy-table.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/gen_enc_table-gen-entropy-table.Tpo $(DEPDIR)/gen_enc_table-gen-entropy-table.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gen-entropy-table.cc' object='gen_enc_table-gen-entropy-table.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoPlayer.obj `if test -f 'moc_VideoPlayer.cpp'; then $(CYGPATH_W) 'moc_VideoPlayer.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoPlayer.cpp'; fi`
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gen_enc_table_CXXFLAGS) $(CXXFLAGS) -c -o gen_enc_table-gen-entropy-table.o `test -f 'gen-entropy-table.cc' || echo '$(srcdir)/'`gen-entropy-table.cc
 
-sherlock265-moc_VideoDecoder.o: moc_VideoDecoder.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoDecoder.o -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoDecoder.Tpo -c -o sherlock265-moc_VideoDecoder.o `test -f 'moc_VideoDecoder.cpp' || echo '$(srcdir)/'`moc_VideoDecoder.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoDecoder.Tpo $(DEPDIR)/sherlock265-moc_VideoDecoder.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoDecoder.cpp' object='sherlock265-moc_VideoDecoder.o' libtool=no @AMDEPBACKSLASH@
+gen_enc_table-gen-entropy-table.obj: gen-entropy-table.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gen_enc_table_CXXFLAGS) $(CXXFLAGS) -MT gen_enc_table-gen-entropy-table.obj -MD -MP -MF $(DEPDIR)/gen_enc_table-gen-entropy-table.Tpo -c -o gen_enc_table-gen-entropy-table.obj `if test -f 'gen-entropy-table.cc'; then $(CYGPATH_W) 'gen-entropy-table.cc'; else $(CYGPATH_W) '$(srcdir)/gen-entropy-table.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/gen_enc_table-gen-entropy-table.Tpo $(DEPDIR)/gen_enc_table-gen-entropy-table.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='gen-entropy-table.cc' object='gen_enc_table-gen-entropy-table.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoDecoder.o `test -f 'moc_VideoDecoder.cpp' || echo '$(srcdir)/'`moc_VideoDecoder.cpp
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(gen_enc_table_CXXFLAGS) $(CXXFLAGS) -c -o gen_enc_table-gen-entropy-table.obj `if test -f 'gen-entropy-table.cc'; then $(CYGPATH_W) 'gen-entropy-table.cc'; else $(CYGPATH_W) '$(srcdir)/gen-entropy-table.cc'; fi`
 
-sherlock265-moc_VideoDecoder.obj: moc_VideoDecoder.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoDecoder.obj -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoDecoder.Tpo -c -o sherlock265-moc_VideoDecoder.obj `if test -f 'moc_VideoDecoder.cpp'; then $(CYGPATH_W) 'moc_VideoDecoder.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoDecoder.cpp'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoDecoder.Tpo $(DEPDIR)/sherlock265-moc_VideoDecoder.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoDecoder.cpp' object='sherlock265-moc_VideoDecoder.obj' libtool=no @AMDEPBACKSLASH@
+rd_curves-rd-curves.o: rd-curves.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rd_curves_CXXFLAGS) $(CXXFLAGS) -MT rd_curves-rd-curves.o -MD -MP -MF $(DEPDIR)/rd_curves-rd-curves.Tpo -c -o rd_curves-rd-curves.o `test -f 'rd-curves.cc' || echo '$(srcdir)/'`rd-curves.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/rd_curves-rd-curves.Tpo $(DEPDIR)/rd_curves-rd-curves.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rd-curves.cc' object='rd_curves-rd-curves.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoDecoder.obj `if test -f 'moc_VideoDecoder.cpp'; then $(CYGPATH_W) 'moc_VideoDecoder.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoDecoder.cpp'; fi`
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rd_curves_CXXFLAGS) $(CXXFLAGS) -c -o rd_curves-rd-curves.o `test -f 'rd-curves.cc' || echo '$(srcdir)/'`rd-curves.cc
 
-sherlock265-moc_VideoWidget.o: moc_VideoWidget.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoWidget.o -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoWidget.Tpo -c -o sherlock265-moc_VideoWidget.o `test -f 'moc_VideoWidget.cpp' || echo '$(srcdir)/'`moc_VideoWidget.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoWidget.Tpo $(DEPDIR)/sherlock265-moc_VideoWidget.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoWidget.cpp' object='sherlock265-moc_VideoWidget.o' libtool=no @AMDEPBACKSLASH@
+rd_curves-rd-curves.obj: rd-curves.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rd_curves_CXXFLAGS) $(CXXFLAGS) -MT rd_curves-rd-curves.obj -MD -MP -MF $(DEPDIR)/rd_curves-rd-curves.Tpo -c -o rd_curves-rd-curves.obj `if test -f 'rd-curves.cc'; then $(CYGPATH_W) 'rd-curves.cc'; else $(CYGPATH_W) '$(srcdir)/rd-curves.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/rd_curves-rd-curves.Tpo $(DEPDIR)/rd_curves-rd-curves.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='rd-curves.cc' object='rd_curves-rd-curves.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoWidget.o `test -f 'moc_VideoWidget.cpp' || echo '$(srcdir)/'`moc_VideoWidget.cpp
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(rd_curves_CXXFLAGS) $(CXXFLAGS) -c -o rd_curves-rd-curves.obj `if test -f 'rd-curves.cc'; then $(CYGPATH_W) 'rd-curves.cc'; else $(CYGPATH_W) '$(srcdir)/rd-curves.cc'; fi`
 
-sherlock265-moc_VideoWidget.obj: moc_VideoWidget.cpp
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -MT sherlock265-moc_VideoWidget.obj -MD -MP -MF $(DEPDIR)/sherlock265-moc_VideoWidget.Tpo -c -o sherlock265-moc_VideoWidget.obj `if test -f 'moc_VideoWidget.cpp'; then $(CYGPATH_W) 'moc_VideoWidget.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoWidget.cpp'; fi`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/sherlock265-moc_VideoWidget.Tpo $(DEPDIR)/sherlock265-moc_VideoWidget.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='moc_VideoWidget.cpp' object='sherlock265-moc_VideoWidget.obj' libtool=no @AMDEPBACKSLASH@
+tests-tests.o: tests.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(tests_CXXFLAGS) $(CXXFLAGS) -MT tests-tests.o -MD -MP -MF $(DEPDIR)/tests-tests.Tpo -c -o tests-tests.o `test -f 'tests.cc' || echo '$(srcdir)/'`tests.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/tests-tests.Tpo $(DEPDIR)/tests-tests.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tests.cc' object='tests-tests.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(sherlock265_CXXFLAGS) $(CXXFLAGS) -c -o sherlock265-moc_VideoWidget.obj `if test -f 'moc_VideoWidget.cpp'; then $(CYGPATH_W) 'moc_VideoWidget.cpp'; else $(CYGPATH_W) '$(srcdir)/moc_VideoWidget.cpp'; fi`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(tests_CXXFLAGS) $(CXXFLAGS) -c -o tests-tests.o `test -f 'tests.cc' || echo '$(srcdir)/'`tests.cc
 
-.cpp.o:
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+tests-tests.obj: tests.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(tests_CXXFLAGS) $(CXXFLAGS) -MT tests-tests.obj -MD -MP -MF $(DEPDIR)/tests-tests.Tpo -c -o tests-tests.obj `if test -f 'tests.cc'; then $(CYGPATH_W) 'tests.cc'; else $(CYGPATH_W) '$(srcdir)/tests.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/tests-tests.Tpo $(DEPDIR)/tests-tests.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='tests.cc' object='tests-tests.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXXCOMPILE) -c -o $@ $<
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(tests_CXXFLAGS) $(CXXFLAGS) -c -o tests-tests.obj `if test -f 'tests.cc'; then $(CYGPATH_W) 'tests.cc'; else $(CYGPATH_W) '$(srcdir)/tests.cc'; fi`
 
-.cpp.obj:
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+yuv_distortion-yuv-distortion.o: yuv-distortion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(yuv_distortion_CXXFLAGS) $(CXXFLAGS) -MT yuv_distortion-yuv-distortion.o -MD -MP -MF $(DEPDIR)/yuv_distortion-yuv-distortion.Tpo -c -o yuv_distortion-yuv-distortion.o `test -f 'yuv-distortion.cc' || echo '$(srcdir)/'`yuv-distortion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/yuv_distortion-yuv-distortion.Tpo $(DEPDIR)/yuv_distortion-yuv-distortion.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='yuv-distortion.cc' object='yuv_distortion-yuv-distortion.o' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(yuv_distortion_CXXFLAGS) $(CXXFLAGS) -c -o yuv_distortion-yuv-distortion.o `test -f 'yuv-distortion.cc' || echo '$(srcdir)/'`yuv-distortion.cc
 
-.cpp.lo:
- at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
- at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
- at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+yuv_distortion-yuv-distortion.obj: yuv-distortion.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(yuv_distortion_CXXFLAGS) $(CXXFLAGS) -MT yuv_distortion-yuv-distortion.obj -MD -MP -MF $(DEPDIR)/yuv_distortion-yuv-distortion.Tpo -c -o yuv_distortion-yuv-distortion.obj `if test -f 'yuv-distortion.cc'; then $(CYGPATH_W) 'yuv-distortion.cc'; else $(CYGPATH_W) '$(srcdir)/yuv-distortion.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/yuv_distortion-yuv-distortion.Tpo $(DEPDIR)/yuv_distortion-yuv-distortion.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='yuv-distortion.cc' object='yuv_distortion-yuv-distortion.obj' libtool=no @AMDEPBACKSLASH@
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
- at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(yuv_distortion_CXXFLAGS) $(CXXFLAGS) -c -o yuv_distortion-yuv-distortion.obj `if test -f 'yuv-distortion.cc'; then $(CYGPATH_W) 'yuv-distortion.cc'; else $(CYGPATH_W) '$(srcdir)/yuv-distortion.cc'; fi`
 
 mostlyclean-libtool:
 	-rm -f *.lo
@@ -726,7 +725,6 @@ install-strip:
 mostlyclean-generic:
 
 clean-generic:
-	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
 
 distclean-generic:
 	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
@@ -822,15 +820,6 @@ uninstall-am: uninstall-binPROGRAMS
 	tags tags-am uninstall uninstall-am uninstall-binPROGRAMS
 
 
-moc_VideoWidget.cpp: VideoWidget.hh
-	$(QTMOC) $(DEFINES) $(INCPATH) VideoWidget.hh -o moc_VideoWidget.cpp
-
-moc_VideoPlayer.cpp: VideoPlayer.hh
-	$(QTMOC) $(DEFINES) $(INCPATH) VideoPlayer.hh -o moc_VideoPlayer.cpp
-
-moc_VideoDecoder.cpp: VideoDecoder.hh
-	$(QTMOC) $(DEFINES) $(INCPATH) VideoDecoder.hh -o moc_VideoDecoder.cpp
-
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
diff --git a/tools/bjoentegaard.cc b/tools/bjoentegaard.cc
new file mode 100644
index 0000000..7e3eb8b
--- /dev/null
+++ b/tools/bjoentegaard.cc
@@ -0,0 +1,344 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <math.h>
+#include <unistd.h>
+
+// Debug switch: when true, fitParams() and calcBjoentegaard() printf() their intermediate values.
+const bool D = false;
+
+
+/* There are numerical stability problems in the matrix inverse.
+   Switching to long double seems to provide enough accuracy.
+   TODO: in the long term, use a better regression algorithm.
+ */
+typedef long double FP;   // floating-point type used for the regression / matrix computations
+
+// One sample of a rate/distortion curve, as read from one input line by readRDFile().
+struct datapoint
+{
+  double rate;        // first value on the line; fitParams() takes its logarithm
+  double distortion;  // second value on the line; presumably a quality in dB (main() prints the delta as "dB") - confirm
+};
+// Result of fitParams(): a cubic polynomial in log(rate) plus the rate range of the fitted data.
+struct BjoentegaardParams
+{
+  // fitted distortion model: a*log^3 R + b*log^2 R + c*log R + d   (R = rate)
+  double a,b,c,d;
+
+  double minRate, maxRate;   // smallest / largest rate among the fitted data points
+};
+// The two input curves and their fitted parameters; assigned in main().
+std::vector<datapoint> curveA,curveB;
+BjoentegaardParams paramsA,paramsB;
+// Factor applied to log(rate) in fitParams(); 1 means no scaling (an alternative value is kept in the trailing comment).
+#define RATE_NORMALIZATION_FACTOR 1 //(1/1000.0)
+
+// Helper for inverseMatrix4x4(): computes the value it stores at inv[j*4+i], i.e. one entry of the
+// inverse of the 4x4 matrix m (16 values) before the division by D. Called with i and j in 0..3.
+FP invf(int i,int j,const FP* m)
+{
+  int o = 2+(j-i);
+  // offset both indices (they stay non-negative) so that e(a,b) can address neighbours modulo 4
+  i += 4+o;
+  j += 4-o;
+
+#define e(a,b) m[ ((j+b)%4)*4 + ((i+a)%4) ]
+  // sum of three positive and three negative triple products of elements of m
+    FP inv =
+      + e(+1,-1)*e(+0,+0)*e(-1,+1)
+      + e(+1,+1)*e(+0,-1)*e(-1,+0)
+      + e(-1,-1)*e(+1,+0)*e(+0,+1)
+      - e(-1,-1)*e(+0,+0)*e(+1,+1)
+      - e(-1,+1)*e(+0,-1)*e(+1,+0)
+      - e(+1,-1)*e(-1,+0)*e(+0,+1);
+  // the sign of the result depends on the parity of o, i.e. of (j-i)
+    return (o%2)?inv : -inv;
+
+    #undef e
+
+}
+// Inverts the 4x4 matrix m (16 values) into out. Returns false, leaving out untouched, when D is exactly 0.
+bool inverseMatrix4x4(const FP *m, FP *out)
+{
+  FP inv[16];
+  // unscaled inverse, one entry per invf() call
+  for(int i=0;i<4;i++)
+    for(int j=0;j<4;j++)
+      inv[j*4+i] = invf(i,j,m);
+  // D = m[0..3] times inv[0],inv[4],inv[8],inv[12]; it is the scale factor divided out below
+  FP D = 0;   // note: this local hides the global debug flag D inside this function
+
+  for(int k=0;k<4;k++) D += m[k] * inv[k*4];
+
+  if (D == 0) return false;
+
+  D = 1.0 / D;
+
+  for (int i = 0; i < 16; i++)
+    out[i] = inv[i] * D;
+
+  return true;
+
+}
+// Least-squares fit of distortion = a*x^3 + b*x^2 + c*x + d with x = log(rate) * RATE_NORMALIZATION_FACTOR
+// to the points of 'curve', computed via the pseudo-inverse (X^T*X)^-1 * X^T.
+// Returns the coefficients together with the minimum and maximum rate found in 'curve'.
+BjoentegaardParams fitParams(const std::vector<datapoint>& curve)
+{
+  // build regression matrix
+  // row i of X is [1, x, x^2, x^3]; XT holds the same values transposed (4 rows of n values)
+  int n = curve.size();
+  // NOTE(review): X, XT and XP are variable-length arrays (a compiler extension in C++); n==0 is not handled
+  FP X[4*n];  // regression matrix
+  FP XT[n*4]; // transpose of X
+
+  for (int i=0;i<n;i++) {
+    FP x = log(curve[i].rate) * RATE_NORMALIZATION_FACTOR;
+
+    X[4*i + 0] = 1;
+    X[4*i + 1] = x;
+    X[4*i + 2] = x*x;
+    X[4*i + 3] = x*x*x;
+
+    if (D) printf("%f %f %f %f ;\n",1.0,(double)x,(double)(x*x),(double)(x*x*x));
+
+    XT[i+0*n] = 1;
+    XT[i+1*n] = x;
+    XT[i+2*n] = x*x;
+    XT[i+3*n] = x*x*x;
+  }
+
+  if (D) {
+    printf("rate: ");
+    for (int i=0;i<n;i++) {
+      printf("%f ; ",curve[i].rate);
+    }
+    printf("\n");
+
+    printf("distortion: ");
+    for (int i=0;i<n;i++) {
+      printf("%f ; ",curve[i].distortion);
+    }
+    printf("\n");
+  }
+
+  // calc X^T * X
+
+  FP XTX[4*4];
+  for (int y=0;y<4;y++)
+    for (int x=0;x<4;x++) {
+      FP sum=0;
+
+      for (int i=0;i<n;i++)
+        {
+          sum += XT[y*n + i] * X[x + i*4];
+        }
+
+      XTX[y*4+x] = sum;
+    }
+
+  FP XTXinv[4*4];
+  // NOTE(review): the return value is ignored; if the inversion fails, XTXinv is used uninitialized below
+  inverseMatrix4x4(XTX, XTXinv);
+
+  if (D) {
+    for (int y=0;y<4;y++) {
+      for (int x=0;x<4;x++) {
+        printf("%f ",(double)XTXinv[y*4+x]);
+      }
+      printf("\n");
+    }
+  }
+
+  // calculate pseudo-inverse XP = (X^T * X)^-1 * X^T
+
+  FP XP[n*4];
+
+  for (int y=0;y<4;y++) {
+    for (int x=0;x<n;x++) {
+      FP sum=0;
+
+      for (int i=0;i<4;i++)
+        {
+          sum += XTXinv[y*4 + i] * XT[x + i*n];
+        }
+
+      XP[y*n+x] = sum;
+    }
+  }
+
+  // calculate regression parameters
+  // p = XP * (vector of distortions); p[k] is the coefficient of x^k
+  FP p[4];
+
+  for (int k=0;k<4;k++)
+    {
+      FP sum=0;
+
+      for (int i=0;i<n;i++) {
+        sum += XP[k*n + i] * curve[i].distortion;
+      }
+
+      p[k]=sum;
+    }
+
+
+  BjoentegaardParams param;
+  param.d = p[0];
+  param.c = p[1];
+  param.b = p[2];
+  param.a = p[3];
+
+
+  // find min and max rate
+  // NOTE(review): curve[0] is read unconditionally, so an empty curve is not supported here
+  param.minRate = curve[0].rate;
+  param.maxRate = curve[0].rate;
+
+  for (int i=1;i<n;i++) {
+    param.minRate = std::min(param.minRate, curve[i].rate);
+    param.maxRate = std::max(param.maxRate, curve[i].rate);
+  }
+
+  return param;
+}
+// Antiderivative (with respect to x) of d + c*log(x) + b*log(x)^2 + a*log(x)^3, evaluated at x.
+FP evalIntegralAt(const BjoentegaardParams& p, double x)
+{
+  FP sum = 0;
+
+  // integral of: d
+
+  sum += p.d * x;
+
+  // integral of: c*log(x)
+
+  sum += p.c * x* (log(x)-1);
+
+  // integral of: b*log(x)^2
+
+  sum += p.b * x * ((log(x)-2)*log(x)+2);
+
+  // integral of: a*log(x)^3
+
+  sum += p.a * x * (log(x)*((log(x)-3)*log(x)+6)-6);
+  // NOTE(review): log(x) is used unscaled, which matches fitParams() only while RATE_NORMALIZATION_FACTOR is 1
+  return sum;
+}
+// Mean difference (A minus B) of the two fitted curves over the rate range covered by both,
+// further narrowed by min_rate / max_rate when those are >= 0 (negative values disable the limit).
+double calcBjoentegaard(const BjoentegaardParams& paramsA,
+                        const BjoentegaardParams& paramsB,
+                        double min_rate, double max_rate)
+{
+  double mini = std::max(paramsA.minRate, paramsB.minRate);
+  double maxi = std::min(paramsA.maxRate, paramsB.maxRate);
+
+  if (min_rate >= 0) mini = std::max(mini, min_rate);
+  if (max_rate >= 0) maxi = std::min(maxi, max_rate);
+
+  if (D) printf("range: %f %f\n",mini,maxi);
+  // definite integrals of both fitted curves over [mini,maxi]
+  FP intA = evalIntegralAt(paramsA, maxi) - evalIntegralAt(paramsA, mini);
+  FP intB = evalIntegralAt(paramsB, maxi) - evalIntegralAt(paramsB, mini);
+
+  if (D) printf("int1:%f int2:%f\n",(double)intA,(double)intB);
+  // NOTE(review): maxi==mini (empty overlap) divides by zero; maxi<mini is not rejected either
+  return (intA-intB)/(maxi-mini);
+}
+// Read a rate/distortion curve from a text file with one "<rate> <distortion>" pair per line.
+// Lines starting with '#' and lines that do not parse as two numbers (e.g. blank lines) are skipped.
+std::vector<datapoint> readRDFile(const char* filename, float min_rate, float max_rate)
+{
+  // min_rate / max_rate: points outside [min_rate,max_rate] are dropped; a negative bound is disabled.
+  // Returns the accepted points in file order (empty if the file cannot be opened).
+  std::vector<datapoint> curve;
+  std::ifstream istr(filename);
+  std::string line;
+
+  // Loop on getline()'s result instead of testing eof() afterwards: the old eof() test
+  // discarded the last data line whenever the file did not end with a newline.
+  while (getline(istr, line))
+    {
+      if (line.empty() || line[0]=='#') continue;
+
+      std::stringstream sstr(line);
+      datapoint p;
+      // skip malformed lines rather than storing a point that was never parsed
+      if (!(sstr >> p.rate >> p.distortion)) continue;
+
+      if (min_rate>=0 && p.rate < min_rate) continue;
+      if (max_rate>=0 && p.rate > max_rate) continue;
+
+      curve.push_back(p);
+    }
+  return curve;
+}
+// Usage: bjoentegaard [-l min_rate] [-h max_rate] curveA [curveB]
+// Fits curve A (and optionally B) and prints the fit; with two curves it also prints the Bjoentegaard delta.
+int main(int argc, char** argv)
+{
+  float min_rate = -1;   // negative = no lower rate limit
+  float max_rate = -1;   // negative = no upper rate limit
+
+  int c;
+  while ((c=getopt(argc,argv, "l:h:")) != -1) {
+    switch (c) {
+    case 'l': min_rate = atof(optarg); break;
+    case 'h': max_rate = atof(optarg); break;
+    }
+  }
+  if (optind>=argc) {   // previously argv[optind] (a null pointer here) was passed on unchecked
+    fprintf(stderr,"usage: %s [-l min_rate] [-h max_rate] curveA [curveB]\n",argv[0]);
+    return 1;
+  }
+  curveA = readRDFile(argv[optind], min_rate, max_rate);
+  // the cubic has 4 coefficients: with fewer than 4 points X^T*X is singular and the fit is meaningless
+  if (curveA.size()<4) { fprintf(stderr,"%s: need at least 4 data points\n",argv[optind]); return 1; }
+  paramsA = fitParams(curveA);
+  printf("params A: %f %f %f %f\n",paramsA.a,paramsA.b,paramsA.c,paramsA.d);
+  printf("gnuplot: %f*log(x)**3+%f*log(x)**2+%f*log(x)+%f\n",paramsA.a,paramsA.b,paramsA.c,paramsA.d);
+
+  if (optind+1<argc) {
+    curveB = readRDFile(argv[optind+1], min_rate, max_rate);
+    if (curveB.size()<4) { fprintf(stderr,"%s: need at least 4 data points\n",argv[optind+1]); return 1; }
+    paramsB = fitParams(curveB);
+    printf("params B: %f %f %f %f\n",paramsB.a,paramsB.b,paramsB.c,paramsB.d);
+    printf("gnuplot: %f*log(x)**3+%f*log(x)**2+%f*log(x)+%f\n",paramsB.a,paramsB.b,paramsB.c,paramsB.d);
+
+    double delta = calcBjoentegaard(paramsA,paramsB, min_rate,max_rate);
+    printf("Bjoentegaard delta: %f dB   (A-B -> >0 -> first (A) is better)\n",delta);
+
+    if (delta>=0) {
+      printf("-> first is better by %f dB\n",delta);
+    }
+    else {
+      printf("-> second is better by %f dB\n",-delta);
+    }
+  }
+  return 0;
+}
diff --git a/tools/block-rate-estim.cc b/tools/block-rate-estim.cc
new file mode 100644
index 0000000..91731bf
--- /dev/null
+++ b/tools/block-rate-estim.cc
@@ -0,0 +1,127 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <vector>
+#include <string>
+#include <fstream>
+#include <math.h>
+#include <stdio.h>
+
+struct datapoint {     // one record of the input file, filled in by main()
+  int   log2blksize;   // second field of the record; print_bitestim_results() can filter on it
+  float rate;          // third field; the value whose per-bin statistics are printed
+  float estim;         // fourth field; the value that selects the histogram bin
+};
+// All records whose tag matched the command-line tag; filled by main(), read by print_bitestim_results().
+std::vector<datapoint> pts;
+// Number of equally wide 'estim' bins used by print_bitestim_results().
+#define NBINS 100
+
+/*
+ #define ESTIMDIV 100
+ #define MAXESTIM 80000
+ std::vector<float> bitestim2[MAXESTIM/ESTIMDIV];
+*/
+// Prints per-bin statistics of 'rate' for the points in pts, binned by 'estim' into NBINS equal intervals.
+// log2blksize==0 selects all points, any other value only the points with that log2blksize.
+// One line per non-empty bin: bin centre, mean, variance, mean-stddev, mean+stddev, min, max, count.
+void print_bitestim_results(int log2blksize)
+{
+  float max_estim=0;
+  // largest estim among the selected points determines the histogram range
+  for (int i=0;i<pts.size();i++) {
+    if (log2blksize==0 || pts[i].log2blksize==log2blksize) {
+      max_estim = std::max(max_estim, pts[i].estim);
+    }
+  }
+
+
+  // epsilon widens the range slightly so that the maximum itself falls into the last bin, not bin NBINS
+  float epsilon = 0.0001;
+  float interval = (max_estim+epsilon) / NBINS;
+
+  for (int b=0;b<NBINS;b++) {
+    // first pass over the points of bin b: count, sum, minimum and maximum of rate
+    int cnt=0;
+    double sum=0;
+    float mini=999999;
+    float maxi=0;
+
+    for (int i=0;i<pts.size();i++)
+      if (log2blksize==0 || pts[i].log2blksize==log2blksize) {
+        int bin = pts[i].estim/interval;
+        if (bin==b) {
+          sum += pts[i].rate;
+
+          mini = std::min(mini,pts[i].rate);
+          maxi = std::max(maxi,pts[i].rate);
+          cnt++;
+        }
+      }
+
+    if (cnt>0) {
+      double mean = sum/cnt;
+      // second pass: variance of rate around the bin mean (divided by cnt, not cnt-1)
+      double var = 0;
+
+      for (int i=0;i<pts.size();i++)
+        if (log2blksize==0 || pts[i].log2blksize==log2blksize) {
+          int bin = pts[i].estim/interval;
+          if (bin==b) {
+            var += (pts[i].rate-mean)*(pts[i].rate-mean);
+          }
+        }
+
+      var /= cnt;
+      double stddev = sqrt(var);
+
+      printf("%f  %f %f  %f %f  %f %f %d\n",
+             (b+0.5)*interval,mean,var,
+             mean-stddev,mean+stddev, mini,maxi,
+             cnt);
+    }
+  }
+}
+// Usage: block-rate-estim TAG DATAFILE
+// Loads every "<tag> <log2blksize> <rate> <estim>" record of DATAFILE whose tag equals TAG, then prints the statistics.
+int main(int argc,char** argv)
+{
+  if (argc<3) {   // argv[1] and argv[2] were previously read without checking argc
+    fprintf(stderr,"usage: %s TAG DATAFILE\n",argv[0]);
+    return 1;
+  }
+
+  std::string tag = argv[1];
+  std::ifstream istr(argv[2]);
+
+  std::string t;
+  datapoint pt;
+
+  // Testing the extraction itself ends the loop at end of file *and* on a missing file or malformed record.
+  // The old "if (istr.eof()) break" looped forever in those cases and dropped a final record not followed by whitespace.
+  while (istr >> t >> pt.log2blksize >> pt.rate >> pt.estim) {
+    if (t == tag) {
+      pts.push_back(pt);
+    }
+  }
+
+  print_bitestim_results(0);
+  return 0;
+}
diff --git a/tools/gen-entropy-table.cc b/tools/gen-entropy-table.cc
new file mode 100644
index 0000000..3014bbd
--- /dev/null
+++ b/tools/gen-entropy-table.cc
@@ -0,0 +1,477 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "libde265/cabac.h"
+#include <assert.h>
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+// Reads one line from fh into *lineptr as a NUL-terminated string; the newline itself is not stored.
+// If *lineptr is NULL, a LINESIZE-byte buffer is malloc()ed first and *linelen is set to LINESIZE.
+void simple_getline(char** lineptr,size_t* linelen,FILE* fh)
+{
+  const int LINESIZE=1000;
+  // NOTE(review): the malloc() result is not checked, and *linelen is never consulted for an existing buffer
+  if (*lineptr==NULL) {
+    *linelen = LINESIZE;
+    *lineptr = (char*)malloc(LINESIZE);
+  }
+
+  char* p = *lineptr;
+
+  for (;;) {
+    assert(p - *lineptr < LINESIZE);   // NOTE(review): the only overflow guard; it disappears when NDEBUG is defined
+
+    int c = fgetc(fh);
+    if (c == EOF || c == '\n') {
+      *p = 0;
+      break;
+    }
+
+    *p++ = c;
+  }
+}
+// Entropy-table generator driven by uniformly random symbols. The whole body is inside "#if 000", so as built
+// this function does nothing. The disabled code loops forever, printing a refined table after each pass.
+void generate_entropy_table()
+{
+#if 000
+  const int nSymbols=1000*1000*10;
+  const int oversample = 10;
+
+  double tab[64][2];
+
+  for (int i=0;i<64;i++)
+    for (int k=0;k<2;k++)
+      tab[i][k]=0;
+
+  srand(time(0));
+  //srand(123);
+
+  int cnt=1;
+  for (;;cnt++) {
+    printf("-------------------- %d --------------------\n",cnt);
+
+    for (int s=0;s<63;s++) {
+      CABAC_encoder_bitstream cabac_mix0;
+      CABAC_encoder_bitstream cabac_mix1;
+      CABAC_encoder_bitstream cabac_ref;
+      // all three encoders receive the same random (state n, bit b) symbols; m is computed but not used
+      for (int i=0;i<nSymbols*oversample;i++) {
+        int r = rand();
+        int n = (r>>2) % 63;
+        int m = (r>>1) & 1;
+        int b = r & 1;
+
+        context_model model;
+        model.MPSbit = 1;
+        model.state  = n;
+        cabac_ref.write_CABAC_bit(&model, b);
+
+        model.MPSbit = 1;
+        model.state  = n;
+        cabac_mix0.write_CABAC_bit(&model, b);
+
+        model.MPSbit = 1;
+        model.state  = n;
+        cabac_mix1.write_CABAC_bit(&model, b);
+        // once per 'oversample' symbols: additionally code one probe bit in state s (0 into mix0, 1 into mix1)
+        if (i%oversample == oversample/2) {
+          model.MPSbit = 1;
+          model.state  = s;
+          cabac_mix0.write_CABAC_bit(&model, 0);
+
+          model.MPSbit = 1;
+          model.state  = s;
+          cabac_mix1.write_CABAC_bit(&model, 1);
+
+          //b = rand() & 1;
+          //cabac_mix.write_CABAC_bypass(1);
+        }
+
+      }
+
+      cabac_ref.flush_CABAC();
+      cabac_mix0.flush_CABAC();
+      cabac_mix1.flush_CABAC();
+      // size() is multiplied by 8 to get bits
+      int bits_ref  = cabac_ref.size()*8;
+      int bits_mix0 = cabac_mix0.size()*8;
+      int bits_mix1 = cabac_mix1.size()*8;
+
+      //printf("bits: %d %d\n",bits_ref,bits_mix);
+      int bits_diff0 = bits_mix0-bits_ref;
+      int bits_diff1 = bits_mix1-bits_ref;
+      //printf("bits diff: %d\n",bits_diff);
+      // extra bits relative to the reference stream, per probe symbol (nSymbols probes were inserted)
+      double bits_per_symbol0 = bits_diff0 / double(nSymbols);
+      double bits_per_symbol1 = bits_diff1 / double(nSymbols);
+
+      tab[s][0] += bits_per_symbol0;
+      tab[s][1] += bits_per_symbol1;
+      // running average over all passes so far
+      double bps0 = tab[s][0]/cnt;
+      double bps1 = tab[s][1]/cnt;
+
+      printf("/* state=%2d */  0x%05x /* %f */,  0x%05x /* %f */,\n", s,
+             (int)(bps1*0x8000), bps1,
+             (int)(bps0*0x8000), bps0);
+    }
+
+    printf("                0x0010c, 0x3bfbb /* dummy, should never be used */\n");
+  }
+#endif
+}
+// Weights read only by the (disabled) generate_entropy_table_probTableWeighted(): entry idx is the weight of state idx/2 with bit idx&1; idx 128 means "bypass". Presumably measured symbol counts - confirm.
+int probTab[128+2] = {
+  1537234,1602970,
+  1608644,1815493,
+  1822246,2245961,
+  916773,1329391,
+  1337504,1930659,
+  1063692,1707588,
+  868294,1532108,
+  842934,1555538,
+  689043,1396941,
+  860184,1789964,
+  534165,1258482,
+  672508,1598821,
+  578782,1476240,
+  602247,1613140,
+  409393,1206638,
+  459294,1356779,
+  430124,1359893,
+  308326,1050647,
+  313100,1099956,
+  293887,1088978,
+  220901,869582,
+  214967,881695,
+  197226,856990,
+  166131,767761,
+  152514,737406,
+  128332,663998,
+  117638,632653,
+  106178,595539,
+  90898,539506,
+  83437,509231,
+  76511,492801,
+  64915,443096,
+  57847,409809,
+  52730,385395,
+  45707,354059,
+  42018,333028,
+  37086,308073,
+  33256,284497,
+  36130,299172,
+  28831,270716,
+  25365,244840,
+  22850,221896,
+  19732,201462,
+  17268,183729,
+  15252,168106,
+  13787,153979,
+  12187,141455,
+  10821,130337,
+  9896,120165,
+  8626,112273,
+  8162,103886,
+  7201,96441,
+  6413,89805,
+  5886,83733,
+  5447,78084,
+  4568,73356,
+  4388,68831,
+  3959,64688,
+  3750,60804,
+  3407,57271,
+  3109,54024,
+  2820,51099,
+  48569,1451987,
+  0,    0,
+  0*22686225,    0
+};
+// Variant of generate_entropy_table() that draws the random symbols with the weights in probTab instead of
+// uniformly. The whole body is inside "#if 000", so as built this function does nothing.
+void generate_entropy_table_probTableWeighted()
+{
+#if 000
+  int64_t probTabSum=0;
+  for (int i=0;i<130;i++)
+    probTabSum += probTab[i];
+
+
+  const int nSymbols=1000*1000*10;
+  const int oversample = 10;
+
+  double tab[64][2];
+
+  for (int i=0;i<64;i++)
+    for (int k=0;k<2;k++)
+      tab[i][k]=0;
+
+  srand(time(0));
+  //srand(123);
+
+  int cnt=1;
+  for (;;cnt++) {
+    printf("-------------------- %d --------------------\n",cnt);
+
+    for (int s=0;s<63;s++) {
+      CABAC_encoder_bitstream cabac_mix0;
+      CABAC_encoder_bitstream cabac_mix1;
+      CABAC_encoder_bitstream cabac_ref;
+
+      for (int i=0;i<nSymbols*oversample;i++) {
+        int r = rand();
+        // pick idx by walking the cumulative weights of probTab
+        r %= probTabSum;
+        int idx=0;
+        while (r>probTab[idx]) {
+          r-=probTab[idx];
+          idx++;
+        }
+
+        assert(idx<=128);
+        // idx encodes the context state, the bit value and the bypass case (idx 128)
+        int n = idx/2;
+        int b = idx&1;
+        bool bypass = (idx==128);
+
+        printf("%d %d %d\n",n,b,bypass);
+
+        context_model model;
+        model.MPSbit = 1;
+        model.state  = n;
+        if (bypass) cabac_ref.write_CABAC_bypass(1);
+        else        cabac_ref.write_CABAC_bit(&model, b);
+
+        model.MPSbit = 1;
+        model.state  = n;
+        if (bypass) cabac_mix0.write_CABAC_bypass(1);
+        else        cabac_mix0.write_CABAC_bit(&model, b);
+
+        model.MPSbit = 1;
+        model.state  = n;
+        if (bypass) cabac_mix1.write_CABAC_bypass(1);
+        else        cabac_mix1.write_CABAC_bit(&model, b);
+        // once per 'oversample' symbols: additionally code one probe bit in state s (0 into mix0, 1 into mix1)
+        if (i%oversample == oversample/2) {
+          model.MPSbit = 1;
+          model.state  = s;
+          cabac_mix0.write_CABAC_bit(&model, 0);
+
+          model.MPSbit = 1;
+          model.state  = s;
+          cabac_mix1.write_CABAC_bit(&model, 1);
+
+          //b = rand() & 1;
+          //cabac_mix.write_CABAC_bypass(1);
+        }
+
+      }
+
+      cabac_ref.flush_CABAC();
+      cabac_mix0.flush_CABAC();
+      cabac_mix1.flush_CABAC();
+
+      int bits_ref  = cabac_ref.size()*8;
+      int bits_mix0 = cabac_mix0.size()*8;
+      int bits_mix1 = cabac_mix1.size()*8;
+
+      //printf("bits: %d %d\n",bits_ref,bits_mix);
+      int bits_diff0 = bits_mix0-bits_ref;
+      int bits_diff1 = bits_mix1-bits_ref;
+      //printf("bits diff: %d\n",bits_diff);
+      // extra bits relative to the reference stream, per probe symbol
+      double bits_per_symbol0 = bits_diff0 / double(nSymbols);
+      double bits_per_symbol1 = bits_diff1 / double(nSymbols);
+
+      tab[s][0] += bits_per_symbol0;
+      tab[s][1] += bits_per_symbol1;
+      // running average over all passes so far
+      double bps0 = tab[s][0]/cnt;
+      double bps1 = tab[s][1]/cnt;
+
+      printf("/* state=%2d */  0x%05x /* %f */,  0x%05x /* %f */,\n", s,
+             (int)(bps1*0x8000), bps1,
+             (int)(bps0*0x8000), bps0);
+    }
+
+    printf("                0x0010c, 0x3bfbb /* dummy, should never be used */\n");
+  }
+#endif
+}
+// Variant of generate_entropy_table() that replays "<state> <bit>" lines from the file "streamdump-paris-intra"
+// instead of random symbols (one pass per state s). The whole body is inside "#if 000": as built, a no-op.
+void generate_entropy_table_replay()
+{
+#if 000
+  const int oversample = 10;
+
+  char* lineptr = NULL;
+  size_t linelen = 0;
+
+  for (int s=0;s<63;s++) {
+    CABAC_encoder_bitstream cabac_mix0;
+    CABAC_encoder_bitstream cabac_mix1;
+    CABAC_encoder_bitstream cabac_ref;
+
+    int nSymbols = 0;
+    // NOTE(review): the fopen() result is not checked before it is used
+    FILE* fh = fopen("streamdump-paris-intra","r");
+
+    for (int i=0;i<80000000;i++) {
+      simple_getline(&lineptr,&linelen,fh);
+      if (feof(fh))
+        break;
+
+      int n,b;
+      sscanf(lineptr,"%d %d",&n,&b);
+      // state number 64 in the dump marks a bypass-coded bit
+      bool bypass = (n==64);
+
+      if ((i%10000)==0)
+        { printf("%d  %d %d    \r",i,n,b);
+        }
+
+      //printf("%d %d %d\n",n,b,bypass);
+
+      context_model model;
+      model.MPSbit = 1;
+      model.state  = n;
+      if (bypass) cabac_ref.write_CABAC_bypass(1);
+      else        cabac_ref.write_CABAC_bit(&model, b);
+
+      model.MPSbit = 1;
+      model.state  = n;
+      if (bypass) cabac_mix0.write_CABAC_bypass(1);
+      else        cabac_mix0.write_CABAC_bit(&model, b);
+
+      model.MPSbit = 1;
+      model.state  = n;
+      if (bypass) cabac_mix1.write_CABAC_bypass(1);
+      else        cabac_mix1.write_CABAC_bit(&model, b);
+      // once per 'oversample' symbols: additionally code one probe bit in state s (0 into mix0, 1 into mix1)
+      if (i%oversample == oversample/2) {
+        model.MPSbit = 1;
+        model.state  = s;
+        cabac_mix0.write_CABAC_bit(&model, 0);
+
+        model.MPSbit = 1;
+        model.state  = s;
+        cabac_mix1.write_CABAC_bit(&model, 1);
+
+        nSymbols++;
+
+        //b = rand() & 1;
+        //cabac_mix.write_CABAC_bypass(1);
+      }
+    }
+
+    fclose(fh);
+
+    cabac_ref.flush_CABAC();
+    cabac_mix0.flush_CABAC();
+    cabac_mix1.flush_CABAC();
+
+    int bits_ref  = cabac_ref.size()*8;
+    int bits_mix0 = cabac_mix0.size()*8;
+    int bits_mix1 = cabac_mix1.size()*8;
+
+    //printf("bits: %d %d\n",bits_ref,bits_mix);
+    int bits_diff0 = bits_mix0-bits_ref;
+    int bits_diff1 = bits_mix1-bits_ref;
+    //printf("bits diff: %d\n",bits_diff);
+    // extra bits relative to the reference stream, per inserted probe symbol
+    double bits_per_symbol0 = bits_diff0 / double(nSymbols);
+    double bits_per_symbol1 = bits_diff1 / double(nSymbols);
+
+    double bps0 = bits_per_symbol0;
+    double bps1 = bits_per_symbol1;
+
+    printf("/* state=%2d */  0x%05x /* %f */,  0x%05x /* %f */,\n", s,
+           (int)(bps1*0x8000), bps1,
+           (int)(bps0*0x8000), bps0);
+  }
+
+  printf("                0x0010c, 0x3bfbb /* dummy, should never be used */\n");
+#endif
+}
+// Replays the symbol dump "streamdump-paris-intra" through a CABAC_encoder_bitstream and a CABAC_encoder_estim
+// and prints both size() results. The whole body is inside "#if 000", so as built this function does nothing.
+void test_entropy_table_replay()
+{
+#if 000
+  char* lineptr = NULL;
+  size_t linelen = 0;
+
+
+  CABAC_encoder_bitstream cabac_bs;
+  CABAC_encoder_estim     cabac_estim;
+
+  //FILE* fh = fopen("y","r");
+  //FILE* fh = fopen("own-dump","r");
+  //FILE* fh = fopen("rawstream-dump","r");
+  //FILE* fh = fopen("johnny-stream-dump","r");
+  FILE* fh = fopen("streamdump-paris-intra","r");   // NOTE(review): result is not checked
+
+  for (int i=0;i<80000000;i++) {
+    simple_getline(&lineptr,&linelen,fh);
+    if (feof(fh))
+      break;
+    // each line holds "<state> <bit>"; the bit is inverted before coding and state 64 marks a bypass bit
+    int n,b;
+    sscanf(lineptr,"%d %d",&n,&b);
+    b=!b;
+    bool bypass = (n==64);
+
+    if ((i%10000)==0)
+      { printf("%d  %d %d    \n",i,n,b);
+      }
+
+    context_model model;
+    model.MPSbit = 1;
+    model.state  = n;
+    if (bypass) cabac_bs.write_CABAC_bypass(1);
+    else        cabac_bs.write_CABAC_bit(&model, b);
+
+    model.MPSbit = 1;
+    model.state  = n;
+    if (bypass) cabac_estim.write_CABAC_bypass(1);
+    else        cabac_estim.write_CABAC_bit(&model, b);
+  }
+
+  fclose(fh);
+
+  printf("bs:%d estim:%d\n",cabac_bs.size(),cabac_estim.size());
+#endif
+}
+// Entry point: selects which experiment to run by (un)commenting the calls below; arguments are ignored.
+// Only test_entropy_table_replay() is called, and its body is currently disabled with "#if 000".
+int main(int argc, char** argv)
+{
+  //generate_entropy_table();
+  //generate_entropy_table_replay();
+
+  test_entropy_table_replay();
+
+  return 0;
+}
diff --git a/tools/rd-curves.cc b/tools/rd-curves.cc
new file mode 100644
index 0000000..71fd3e2
--- /dev/null
+++ b/tools/rd-curves.cc
@@ -0,0 +1,1075 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+#include <getopt.h>
+
+#include <sys/time.h>
+
+#ifndef WIN32
+#include <sys/times.h>
+#endif
+
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <vector>
+
+
+
+static struct {
+  const char* name;   // placeholder text searched for (as a substring) by replace_variables()
+  const char* value;  // replacement text; may itself contain further placeholders
+} variables[] = {
+  { "$HOME"    , "/home/domain/farindk" },
+  { "$ROOT"    , "/home/domain/farindk/prog/h265" },
+  { "$ENC265"  , "$ROOT/libde265/enc265/enc265" },
+  { "$DEC265"  , "$ROOT/libde265/dec265/dec265" },
+  { "$YUVDIST" , "$ROOT/libde265/tools/yuv-distortion" }, // must stay above "$YUV": first table hit wins
+  { "$YUVTMP"  , "/mnt/temp/dirk/yuv/ftp.tnt.uni-hannover.de/testsequences" }, // ditto
+  { "$YUV"     , "/storage/users/farindk/yuv" },
+  { "$HMENC"   , "HM13enc" },
+  { "$HM13CFG" , "$ROOT/HM/HM-13.0-dev/cfg" },
+  { "$HMSCCENC", "HM-SCC-enc" },
+  { "$HMSCCCFG", "$ROOT/HM/HM-SCC-extensions/cfg" },
+  { "$X265ENC" , "$ROOT/x265/build/linux/x265" },
+  { "$X264"    , "x264" },
+  { "$FFMPEG"  , "ffmpeg" },
+  { "$F265"    , "$ROOT/f265/build/f265cli" },
+  { 0,0 } // terminator: the lookup loop stops at name==0
+};
+
+
+bool keepStreams = false;
+int  maxFrames = 0;
+std::string encoderParameters;
+
+
+// Expand every placeholder listed in variables[], rescanning after each substitution.
+std::string replace_variables(std::string str)
+{
+  int idx = 0;
+  while (variables[idx].name != NULL) {
+    const std::string::size_type hit = str.find(variables[idx].name);
+    if (hit == std::string::npos) { idx++; continue; }
+
+    // substitute one occurrence, then start over at the first table entry so
+    // that placeholders contained in the inserted value are expanded as well
+    str.replace(hit, strlen(variables[idx].name), variables[idx].value);
+    idx = 0;
+  }
+  return str;
+}
+
+
+// ---------------------------------------------------------------------------
+
+struct Preset  // one encoder configuration, given as a command-line fragment per encoder
+{
+  const int ID;       // numeric ID selected on the rd-curves command line
+  const char* name;   // short name used in file names; NULL terminates preset[]
+  const char* descr;  // free-text description (shown in usage, written to the .rd header)
+
+  const char* options_de265;
+  const char* options_hm;
+  const char* options_hm_scc;
+  const char* options_x265;
+  const char* options_f265;         // may be 0 (see preset "lowdelay")
+  const char* options_x264;
+  const char* options_x264_ffmpeg;  // options for H.264 encoding through ffmpeg
+  const char* options_ffmpeg_mpeg2; // options for MPEG-2 encoding through ffmpeg
+
+  //int nFrames;
+};
+
+
+Preset preset[] = {
+  { 1, "pre01-intra-noLF", "intra, no LF, no SBH, CTB-size 32, min CB=8",
+    /* de265  */ "--sop-structure intra",
+    /* HM     */ "-c $HM13CFG/encoder_intra_main.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_intra_main_scc.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* x265   */ "--no-lft -I 1 --no-signhide",
+    /* f265   */ "key-frame-spacing=1",
+    /* x264   */ "-I 1",
+    /* ffmpeg */ "-g 1",
+    /* mpeg-2 */ "-g 1"
+    // 0 // all frames
+  },
+
+  { 2, "pre02-fastIntra", "intra, no LF, no SBH, CTB-size 32, min CB=8",
+    /* de265  */ "--sop-structure intra --TB-IntraPredMode minSSD",
+    /* HM     */ "-c $HM13CFG/encoder_intra_main.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_intra_main_scc.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* x265   */ "--no-lft -I 1 --no-signhide",
+    /* f265   */ "key-frame-spacing=1",
+    /* x264   */ "-I 1",
+    /* ffmpeg */ "-g 1",
+    /* mpeg-2 */ "-g 1"
+    // 0 // all frames
+  },
+
+  { 3, "pre03-fastIntra", "pre02, but fast-brute",
+    /* de265  */ "--sop-structure intra --TB-IntraPredMode fast-brute",
+    /* HM     */ "-c $HM13CFG/encoder_intra_main.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_intra_main_scc.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* x265   */ "--no-lft -I 1 --no-signhide",
+    /* f265   */ "key-frame-spacing=1",
+    /* x264   */ "-I 1",
+    /* ffmpeg */ "-g 1",
+    /* mpeg-2 */ "-g 1"
+    // 0 // all frames
+  },
+
+  { 50, "cb-auto16", "(development test)",
+    /* de265  */ "--max-cb-size 16 --min-cb-size 8",
+    /* HM     */ "-c $HM13CFG/encoder_intra_main.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_intra_main_scc.cfg -SBH 0 --SAO=0 --LoopFilterDisable --DeblockingFilterControlPresent --MaxCUSize=32 --MaxPartitionDepth=2",
+    /* x265   */ "--no-lft -I 1 --no-signhide",
+    /* f265   */ "key-frame-spacing=1",
+    /* x264   */ "-I 1",
+    /* ffmpeg */ "-g 1",
+    /* mpeg-2 */ "-g 1"
+    // 0 // all frames
+  },
+
+  { 80, "lowdelay", "default (low-default) encoder parameters",
+    "--MEMode search --max-cb-size 32 --min-cb-size 8 --min-tb-size 4 --CB-IntraPartMode-Fixed-partMode 2Nx2N --CB-IntraPartMode fixed --TB-IntraPredMode min-residual --PB-MV-TestMode zero",
+    /* the line above holds the de265 options; older variant kept for reference: */ //"--sop-structure low-delay --MEMode search --max-cb-size 32 --min-cb-size 8 --min-tb-size 4 --CB-IntraPartMode fixed --TB-IntraPredMode min-residual",
+    /* HM     */ "-c $HM13CFG/encoder_lowdelay_main.cfg -ip 248",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_lowdelay_main_scc.cfg -ip 248",
+    /* x265   */ "-I 248 --no-wpp --bframes 0", // GOP size: 248
+    /* f265   */ 0, //"key-frame-spacing=248",
+    /* x264   */ "",
+    /* ffmpeg */ "-g 248 -bf 0",
+    /* mpeg-2 */ "" // GOP size 248 does not make sense here
+    // 0 // all frames
+  },
+
+  { 98, "best", "default (random-access) encoder parameters",
+    /* de265  */ "--max-cb-size 16 --min-cb-size 8",
+    /* HM     */ "-c $HM13CFG/encoder_randomaccess_main.cfg",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_randomaccess_main_scc.cfg",
+    /* x265   */ "",
+    /* f265   */ "",
+    /* x264   */ "",
+    /* ffmpeg */ "",
+    /* mpeg-2 */ ""
+    // 0 // all frames
+  },
+
+  { 99, "besteq", "default (random-access) encoder parameters, I-frame distance = 248",
+    /* de265  */ "",
+    /* HM     */ "-c $HM13CFG/encoder_randomaccess_main.cfg -ip 248",
+    /* HM SCC */ "-c $HMSCCCFG/encoder_randomaccess_main_scc.cfg -ip 248",
+    /* x265   */ "-I 248 --no-wpp", // GOP size: 248
+    /* f265   */ "key-frame-spacing=248",
+    /* x264   */ "",
+    /* ffmpeg */ "-g 248",
+    /* mpeg-2 */ "" // GOP size 248 does not make sense here
+    // 0 // all frames
+  },
+
+  { 0, NULL }
+};
+
+// ---------------------------------------------------------------------------
+
+class Input  // raw YUV source description plus the matching CLI fragment for each encoder
+{
+public:
+  Input() {
+    width=height=0;
+    maxFrames=0; mFPS=0; // mFPS used to stay uninitialized until setInput() was called
+  }
+
+  void setInput(const char* yuvfilename,int w,int h, float fps) {
+    mInputFilename = yuvfilename;
+    width = w;
+    height = h;
+    mFPS = fps;
+  }
+
+  void setMaxFrames(int n) { maxFrames=n; } // 0 = do not pass a frame limit
+
+  std::string options_de265() const { // " -i FILE --width W --height H [--frames N]"
+    std::stringstream sstr;
+    sstr << " -i " << mInputFilename << " --width " << width << " --height " << height;
+    if (maxFrames) sstr << " --frames " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string options_HM() const { // "-i FILE -wdt W -hgt H -fr FPS [-f N]"
+    std::stringstream sstr;
+    sstr << "-i " << mInputFilename << " -wdt " << width << " -hgt " << height
+         << " -fr " << mFPS;
+    if (maxFrames) sstr << " -f " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string options_x265() const { // "FILE --input-res WxH --fps FPS [-f N]"
+    std::stringstream sstr;
+    sstr << mInputFilename << " --input-res " << width << "x" << height
+         << " --fps " << mFPS;
+    if (maxFrames) sstr << " -f " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string options_x264() const { // "FILE --input-res WxH --fps 25 [--frames N]"
+    std::stringstream sstr;
+    sstr << mInputFilename << " --input-res " << width << "x" << height;
+    sstr << " --fps 25"; // TODO: check why crf/qp rate-control freaks out when fps is != 25
+    if (maxFrames) sstr << " --frames " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string options_ffmpeg() const { // raw yuv420p input; the frame rate is not passed
+    std::stringstream sstr;
+    sstr << "-f rawvideo -vcodec rawvideo -s " << width << "x" << height; // << " -r " << mFPS
+    sstr << " -pix_fmt yuv420p -i " << mInputFilename;
+    if (maxFrames) sstr << " -vframes " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string options_f265() const { // "FILE -w W:H [-c N]"
+    std::stringstream sstr;
+    sstr << mInputFilename << " -w " << width << ":" << height;
+    if (maxFrames) sstr << " -c " << maxFrames;
+
+    return sstr.str();
+  }
+
+  std::string getFilename() const { return mInputFilename; }
+  float getFPS() const { return mFPS; }
+  int   getNFrames() const { return maxFrames; }
+  int   getWidth() const { return width; }
+  int   getHeight() const { return height; }
+
+private:
+  std::string mInputFilename; // stored as given, i.e. possibly with "$..." placeholders
+  int width, height;
+  int maxFrames;              // member; hides the global option variable of the same name
+  float mFPS;
+};
+
+Input input;
+
+struct InputSpec  // table of named input sequences selectable on the command line
+{
+  const char* name;     // key matched by setInput(); NULL terminates the table
+  const char* filename; // may contain "$..." placeholders from variables[]
+  int width,height, nFrames; // nFrames is handed to Input::setMaxFrames()
+  float fps;
+} inputSpec[] = {
+  { "paris",       "$YUV/paris_cif.yuv",352,288,1065, 30.0 },
+  { "paris10",     "$YUV/paris_cif.yuv",352,288,  10, 30.0 },
+  { "paris100",    "$YUV/paris_cif.yuv",352,288, 100, 30.0 },
+  { "johnny",      "$YUV/Johnny_1280x720_60.yuv",1280,720,600,60.0 },
+  { "johnny10",    "$YUV/Johnny_1280x720_60.yuv",1280,720, 10,60.0 },
+  { "johnny100",   "$YUV/Johnny_1280x720_60.yuv",1280,720,100,60.0 },
+  { "cactus",      "$YUV/Cactus_1920x1080_50.yuv",1920,1080,500,50.0 },
+  { "cactus10",    "$YUV/Cactus_1920x1080_50.yuv",1920,1080, 10,50.0 },
+  { "4people",     "$YUVTMP/FourPeople_1280x720_60.yuv",1280,720,600,60.0 },
+  { "4people100",  "$YUVTMP/FourPeople_1280x720_60.yuv",1280,720,100,60.0 },
+  { "slideedit",   "$YUVTMP/SlideEditing_1280x720_30.yuv",1280,720,300,30.0 },
+  { "slideedit100","$YUVTMP/SlideEditing_1280x720_30.yuv",1280,720,100,30.0 },
+  { "slideshow",   "$YUVTMP/SlideShow_1280x720_20.yuv",1280,720,500,20.0 },
+  { "slideshow100","$YUVTMP/SlideShow_1280x720_20.yuv",1280,720,100,20.0 },
+  { "screensharing","$HOME/test-screensharing-encoding/Screensharing.yuv",1360,768,4715,60.0 },
+  { NULL } // terminator (remaining fields are zero-initialized)
+};
+
+
+// Configure the global 'input' from the inputSpec[] entry called 'input_preset';
+// prints a message and exits with status 5 when no entry has that name.
+void setInput(const char* input_preset)
+{
+  const InputSpec* match = NULL;
+
+  for (const InputSpec* spec = inputSpec; spec->name != NULL; spec++) {
+    if (strcmp(input_preset, spec->name) != 0) continue;
+    match = spec;
+    break;
+  }
+
+  if (match == NULL) {
+    fprintf(stderr,"no input preset '%s'\n",input_preset);
+    exit(5);
+  }
+
+  input.setInput(match->filename,
+                 match->width, match->height, match->fps);
+  input.setMaxFrames(match->nFrames);
+}
+
+
+// Average bit rate (bits per second) of 'filename' for the current input; 0 if unavailable.
+float bitrate(const char* filename)
+{
+  struct stat s;
+  if (stat(filename,&s) != 0) return 0; // no file (encoder failed): 's' would be garbage
+
+  int   frames = input.getNFrames();
+  float fps    = input.getFPS();
+  if (frames<=0 || fps<=0) return 0;    // the former assert() vanishes under NDEBUG
+
+  // compute in double: 'long' may be 32 bit, in which case size*8 can overflow
+  return float(double(s.st_size)*8.0 / (double(frames)/fps));
+}
+
+
+// ---------------------------------------------------------------------------
+
+class Quality  // PSNR/SSIM of a coded result relative to the current global 'input'
+{
+public:
+  virtual ~Quality() { }
+
+  virtual void measure(const char* h265filename);    // runs "$DEC265 ... -m <input>"
+  virtual void measure_yuv(const char* yuvfilename); // runs "$YUVDIST <input> <yuv> W H"
+
+  float psnr, ssim; // values parsed by the most recent measure*() call
+};
+
+
+// Run dec265 on 'h265filename' (original input passed via -m) and parse the "total"
+// line of its output into psnr/ssim. Both are 0 when nothing could be parsed.
+void Quality::measure(const char* h265filename)
+{
+  std::stringstream sstr;
+  sstr << "$DEC265 " << h265filename << " -q -t6 -m " << input.getFilename() << " | grep total "
+    //"| awk '{print $2}' "
+    ">/tmp/xtmp";
+
+  psnr = ssim = 0; // 'quality' is shared: never report the previous stream's values on failure
+  if (system(replace_variables(sstr.str()).c_str()) == -1) { perror("system"); }
+  // NOTE: fixed temp path; concurrent rd-curves runs would overwrite each other's result
+  std::ifstream istr("/tmp/xtmp");
+  std::string dummy;
+  istr >> dummy >> psnr >> dummy >> dummy >> ssim; // PSNR is field 2, SSIM field 5
+  unlink("/tmp/xtmp");
+}
+
+
+// Compare 'yuvfilename' with the original input using the yuv-distortion tool and
+// parse the "total" line of its output into psnr/ssim. Both are 0 on failure.
+void Quality::measure_yuv(const char* yuvfilename)
+{
+  std::stringstream sstr;
+  sstr << "$YUVDIST " << input.getFilename() << " " << yuvfilename
+       << " " << input.getWidth() << " " << input.getHeight()
+       << "|grep total "
+    //"|awk '{print $2}' "
+    ">/tmp/ytmp";
+
+  psnr = ssim = 0; // 'quality' is shared: never report the previous stream's values on failure
+  if (system(replace_variables(sstr.str()).c_str()) == -1) { perror("system"); }
+  // NOTE: fixed temp path; concurrent rd-curves runs would overwrite each other's result
+  std::ifstream istr("/tmp/ytmp");
+  std::string dummy;
+  istr >> dummy >> psnr >> ssim; // PSNR is field 2, SSIM field 3
+
+  unlink("/tmp/ytmp");
+}
+
+Quality quality;
+
+// ---------------------------------------------------------------------------
+
+long ticks_per_second;
+
+void init_clock()  // caches the clock-tick rate that get_cpu_time() divides by
+{
+#ifndef WIN32
+  ticks_per_second = sysconf(_SC_CLK_TCK);
+#endif
+}
+
+double get_cpu_time()  // user CPU seconds accumulated by waited-for child processes
+{
+#ifdef WIN32
+  return 0; // not supported on windows (TODO)
+#else
+  struct tms usage;
+  times(&usage);
+  return static_cast<double>(usage.tms_cutime)/ticks_per_second; // ticks -> seconds
+#endif
+}
+
+double get_wall_time()  // seconds since the Epoch, including the microsecond fraction
+{
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+
+  // Divide in double: the former 1000000.0f literal made this a single-precision
+  // division, which rounded the fraction to float accuracy before it was added.
+  return double(tv.tv_sec) + double(tv.tv_usec)/1000000.0;
+}
+
+
+// One point of a rate-distortion curve together with the time the encode took.
+struct RDPoint
+{
+  float  rate;      // value returned by bitrate() for the stream
+  float  psnr, ssim;
+  double cpu_time;  // seconds; holds the start reading until end_timer() is called
+  double wall_time; // seconds; same convention as cpu_time
+
+  RDPoint() { }
+
+  // Rate from the stream file, quality by running the decoder on the stream.
+  void compute_from_h265(std::string stream_name) {
+    const char* stream = stream_name.c_str();
+    rate = bitrate(stream);
+    quality.measure(stream);
+    psnr = quality.psnr;  ssim = quality.ssim;
+  }
+
+  // Rate from the stream file, quality from an already decoded YUV file.
+  void compute_from_yuv(std::string stream_name, std::string yuv_name) {
+    rate = bitrate(stream_name.c_str());
+    quality.measure_yuv(yuv_name.c_str());
+    psnr = quality.psnr;  ssim = quality.ssim;
+  }
+
+  void start_timer() {
+    cpu_time  = get_cpu_time();
+    wall_time = get_wall_time();
+  }
+
+  void end_timer() {
+    cpu_time  = get_cpu_time()  - cpu_time;   // start reading -> elapsed
+    wall_time = get_wall_time() - wall_time;
+  }
+};
+
+
+FILE* output_fh;
+
+void write_rd_line(RDPoint p)  // append one curve point to output_fh and flush it
+{
+  const float  rate_k  = p.rate/1024;  // rate is stored unscaled; the file uses rate/1024
+  const double cpu_min = p.cpu_time/60, wall_min = p.wall_time/60; // seconds -> minutes
+  fprintf(output_fh,"%9.2f %6.4f %5.3f %5.4f %5.4f\n", rate_k, p.psnr, p.ssim, cpu_min, wall_min);
+  fflush(output_fh);
+}
+
+
+
+
+class Encoder
+{
+public:
+  virtual ~Encoder() { }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const = 0;
+
+private:
+};
+
+
+class Encoder_de265 : public Encoder
+{
+public:
+  Encoder_de265();
+  void setQPRange(int low,int high,int step) { mQPLow=low; mQPHigh=high; mQPStep=step; }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int qp) const;
+
+  int mQPLow,mQPHigh,mQPStep;
+};
+
+
+Encoder_de265::Encoder_de265()
+{
+  mQPLow = 14;
+  mQPHigh= 40;
+  mQPStep=  2;
+}
+
+
+std::vector<RDPoint> Encoder_de265::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  for (int qp=mQPHigh ; qp>=mQPLow ; qp-=mQPStep) {
+    curve.push_back(encode(preset, qp));
+  }
+
+  return curve;
+}
+
+
+// Encode the current input once with enc265 at quantizer 'qp' and measure the result.
+RDPoint Encoder_de265::encode(const Preset& preset,int qp) const
+{
+  std::stringstream nameBuilder;
+  nameBuilder << "de265-" << preset.name << "-" << qp << ".265";
+  const std::string bitstream = nameBuilder.str();
+
+  std::stringstream cmdBuilder;
+  cmdBuilder << "$ENC265 " << input.options_de265() << " " << preset.options_de265;
+  cmdBuilder << " -q " << qp << " -o " << bitstream;
+  cmdBuilder << " " << encoderParameters;
+
+  // resolve "$ENC265" and any placeholder contained in the option strings
+  const std::string shellCmd = replace_variables(cmdBuilder.str());
+  printf("cmdline: %s\n",shellCmd.c_str());
+
+  RDPoint point;
+  point.start_timer();                 // the timers bracket only the encoder run
+  int retval = system(shellCmd.c_str());
+  point.end_timer();
+
+  point.compute_from_h265(bitstream);
+  if (!keepStreams) { unlink(bitstream.c_str()); }
+
+  write_rd_line(point);
+
+  return point;
+}
+
+
+
+
+class Encoder_HM : public Encoder
+{
+public:
+  Encoder_HM();
+
+  void enableSCC(bool flag=true) { useSCC = flag; }
+  void setQPRange(int low,int high,int step) { mQPLow=low; mQPHigh=high; mQPStep=step; }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int qp) const;
+
+  bool useSCC;
+  int mQPLow,mQPHigh,mQPStep;
+};
+
+
+Encoder_HM::Encoder_HM()
+{
+  mQPLow = 14;
+  mQPHigh= 40;
+  mQPStep=  2;
+
+  useSCC = false;
+}
+
+
+std::vector<RDPoint> Encoder_HM::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  for (int qp=mQPHigh ; qp>=mQPLow ; qp-=mQPStep) {
+    curve.push_back(encode(preset, qp));
+  }
+
+  return curve;
+}
+
+
+RDPoint Encoder_HM::encode(const Preset& preset,int qp) const  // one HM (or HM-SCC) run at 'qp'
+{
+  std::stringstream streamname;
+  streamname << (useSCC ? "hmscc-" : "hm-") << preset.name << "-" << qp << ".265";
+
+  char recoyuv_prefix[] = "/tmp/reco-XXXXXX";
+  char *tempfile = mktemp(recoyuv_prefix); // fills in the XXXXXX in place; creates no file
+  assert(tempfile != NULL && tempfile[0] != 0);
+  std::string recoyuv = std::string(recoyuv_prefix) + ".yuv"; // NOTE: the path actually used is name+".yuv"
+
+  std::stringstream cmd1;
+  cmd1 << (useSCC ? "$HMSCCENC " : "$HMENC ")
+       << input.options_HM()
+       << " " << (useSCC ? preset.options_hm_scc : preset.options_hm)
+       << " -q " << qp << " -o " << recoyuv << " -b " << streamname.str() // -o: YUV measured below, -b: stream used for the rate
+       << " " << encoderParameters << " >&2";
+
+  std::string cmd2 = replace_variables(cmd1.str());
+
+  std::cout << "CMD: '" << cmd2 << "'\n";
+  RDPoint rd;
+  rd.start_timer();
+  int retval = system(cmd2.c_str()); // exit status is not evaluated
+  rd.end_timer();
+
+  rd.compute_from_yuv(streamname.str(), recoyuv); // quality from the encoder's own YUV, no decoder run
+  if (!keepStreams) { unlink(streamname.str().c_str()); }
+  unlink(recoyuv.c_str());
+
+  write_rd_line(rd);
+
+  return rd;
+}
+
+
+
+class Encoder_x265 : public Encoder
+{
+public:
+  Encoder_x265();
+  void setQPRange(int low,int high,int step) { mQPLow=low; mQPHigh=high; mQPStep=step; }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int qp) const;
+
+  int mQPLow,mQPHigh,mQPStep;
+};
+
+
+Encoder_x265::Encoder_x265()
+{
+  /* CRF
+  mQPLow =  4;
+  mQPHigh= 34;
+  mQPStep=  2;
+  */
+
+  mQPLow = 14;
+  mQPHigh= 40;
+  mQPStep=  2;
+}
+
+
+std::vector<RDPoint> Encoder_x265::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  for (int qp=mQPHigh ; qp>=mQPLow ; qp-=mQPStep) {
+    curve.push_back(encode(preset, qp));
+  }
+
+  return curve;
+}
+
+
+RDPoint Encoder_x265::encode(const Preset& preset,int qp) const
+{
+  std::stringstream streamname;
+  streamname << "x265-" << preset.name << "-" << qp << ".265";
+
+  std::stringstream cmd1;
+  cmd1 << "$X265ENC " << input.options_x265()
+       << " " << preset.options_x265
+       << " --qp " << qp << " " << streamname.str()
+       << " " << encoderParameters
+       << " >&2";
+
+  std::string cmd2 = replace_variables(cmd1.str());
+
+  //std::cout << "CMD: '" << cmd2 << "'\n";
+  RDPoint rd;
+  rd.start_timer();
+  int retval = system(cmd2.c_str());
+  rd.end_timer();
+
+  rd.compute_from_h265(streamname.str());
+  if (!keepStreams) { unlink(streamname.str().c_str()); }
+
+  write_rd_line(rd);
+
+  return rd;
+}
+
+
+
+
+class Encoder_f265 : public Encoder
+{
+public:
+  Encoder_f265();
+  void setQPRange(int low,int high,int step) { mQPLow=low; mQPHigh=high; mQPStep=step; }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int qp) const;
+
+  int mQPLow,mQPHigh,mQPStep;
+};
+
+
+Encoder_f265::Encoder_f265()
+{
+  mQPLow = 14;
+  mQPHigh= 40;
+  mQPStep=  2;
+}
+
+
+std::vector<RDPoint> Encoder_f265::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  for (int qp=mQPHigh ; qp>=mQPLow ; qp-=mQPStep) {
+    curve.push_back(encode(preset, qp));
+  }
+
+  return curve;
+}
+
+
+// Encode the current input once with f265cli at quantizer 'qp' and measure the result.
+RDPoint Encoder_f265::encode(const Preset& preset,int qp) const
+{
+  // A preset may carry no f265 options (0, e.g. "lowdelay"). Streaming a null char*
+  // is undefined; in practice it sets badbit and truncates the command string.
+  const char* presetOptions = preset.options_f265 ? preset.options_f265 : "";
+
+  std::stringstream cmd1;
+  cmd1 << "$F265 " << input.options_f265()
+       << " f265.out -v -p\"" << presetOptions << " qp=" << qp
+       << " " << encoderParameters << "\" >&2";
+
+  std::string cmd2 = replace_variables(cmd1.str());
+  std::cout << "CMD: '" << cmd2 << "'\n";
+  RDPoint rd;
+  rd.start_timer();
+  int retval = system(cmd2.c_str());
+  rd.end_timer();
+  rd.compute_from_h265("f265.out"); // output name is fixed, i.e. not per preset/QP
+  if (!keepStreams) { unlink("f265.out"); }
+  write_rd_line(rd);
+  return rd;
+}
+
+
+
+class Encoder_x264 : public Encoder
+{
+public:
+  Encoder_x264();
+  //void setCRFRange(int low,int high,int step) { mCRFLow=low; mCRFHigh=high; mCRFStep=step; }
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int crf) const;
+
+  int mCRFLow,mCRFMid,mCRFHigh;
+  int mCRFStepHigh, mCRFStepLow;
+};
+
+
+Encoder_x264::Encoder_x264()
+{
+  // "High"/"Low" refer to the bit rate, not to the CRF value: CRF [low;mid) is the upper
+  // bit-rate range (step 'StepHigh'), CRF [mid;high] the lower bit-rate range (step 'StepLow')
+
+  mCRFLow = 10;
+  mCRFMid = 20;
+  mCRFHigh= 36;
+  mCRFStepHigh= 2;
+  mCRFStepLow = 1;
+}
+
+
+std::vector<RDPoint> Encoder_x264::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  for (int crf=mCRFLow ; crf<mCRFMid ; crf+=mCRFStepHigh) {
+    curve.push_back(encode(preset, crf));
+  }
+
+  for (int crf=mCRFMid ; crf<=mCRFHigh ; crf+=mCRFStepLow) {
+    curve.push_back(encode(preset, crf));
+  }
+
+  return curve;
+}
+
+
+RDPoint Encoder_x264::encode(const Preset& preset,int qp_crf) const  // one H.264 run at CRF 'qp_crf'
+{
+  std::stringstream streamname;
+  streamname << "x264-" << preset.name << "-" << qp_crf << ".264";
+
+  std::stringstream cmd1;
+#if 0
+  cmd1 << "$X264 " << input.options_x264()
+       << " " << preset.options_x264
+       << " --crf " << qp_crf
+       << " -o " << streamname.str();
+#else
+  cmd1 << "$FFMPEG " << input.options_ffmpeg() // active path: encode through ffmpeg, not the x264 CLI
+       << " " << preset.options_x264_ffmpeg
+       << " -crf " << qp_crf
+       << " -threads 6"
+       << " -f h264 " << streamname.str()
+       << " " << encoderParameters;
+#endif
+
+  std::string cmd2 = replace_variables(cmd1.str());
+
+  std::cerr << "-----------------------------\n";
+
+  std::cerr << "CMD: '" << cmd2 << "'\n";
+
+  RDPoint rd;
+  rd.start_timer();
+  int retval = system(cmd2.c_str());
+  rd.end_timer(); // only the encode is timed; the decode below is not
+
+  char tmpyuv_prefix[] = "/tmp/rdout-XXXXXX";
+  char *tempfile = mktemp(tmpyuv_prefix); // fills in the XXXXXX in place; creates no file
+  assert(tempfile != NULL && tempfile[0] != 0);
+  std::string tmpyuv = std::string(tmpyuv_prefix) + ".yuv"; // NOTE: the path actually used is name+".yuv"
+
+  std::string cmd3 = "ffmpeg -i " + streamname.str() + " -threads 6 " + tmpyuv; // literal "ffmpeg", not "$FFMPEG"
+
+  retval = system(cmd3.c_str()); // decode the stream to YUV for the distortion measurement
+
+  rd.compute_from_yuv(streamname.str(), tmpyuv);
+
+  unlink(tmpyuv.c_str());
+  if (!keepStreams) { unlink(streamname.str().c_str()); }
+
+  write_rd_line(rd);
+
+  return rd;
+}
+
+
+class Encoder_mpeg2 : public Encoder
+{
+public:
+  Encoder_mpeg2();
+
+  virtual std::vector<RDPoint> encode_curve(const Preset& preset) const;
+
+private:
+  RDPoint encode(const Preset& preset,int bitrate) const;
+};
+
+
+Encoder_mpeg2::Encoder_mpeg2()
+{
+}
+
+
+std::vector<RDPoint> Encoder_mpeg2::encode_curve(const Preset& preset) const
+{
+  std::vector<RDPoint> curve;
+
+  int bitrates[] = { 250,500,750,1000,1250,1500,1750,2000,2500,3000,3500,4000,4500,5000,
+                     6000,7000,8000,9000,10000,12000,14000,16000,18000,20000,25000,30000,
+                     -1 };
+
+  for (int i=0; bitrates[i]>0; i++) {
+    curve.push_back(encode(preset, bitrates[i]));
+  }
+
+  return curve;
+}
+
+
+// Encode the current input once as MPEG-2 (via ffmpeg) at 'br' kbit/s and measure it.
+RDPoint Encoder_mpeg2::encode(const Preset& preset,int br) const
+{
+  std::stringstream streamname;
+  streamname << "mpeg2-" << preset.name << "-"
+             << std::setfill('0') << std::setw(5) << br << ".mp2";
+
+  std::stringstream cmd1;
+  // Use the preset's MPEG-2 column. This used to take options_x264_ffmpeg, so the
+  // H.264 GOP settings (e.g. "-g 248") leaked into the MPEG-2 runs.
+  cmd1 << "$FFMPEG " << input.options_ffmpeg()
+       << " " << preset.options_ffmpeg_mpeg2
+       << " -b " << br << "k "
+       << " -threads 6"
+       << " -f mpeg2video " << streamname.str()
+       << " " << encoderParameters;
+
+  std::string cmd2 = replace_variables(cmd1.str());
+
+  std::cerr << "-----------------------------\n";
+  std::cerr << "CMD: '" << cmd2 << "'\n";
+
+  RDPoint rd;
+  rd.start_timer();
+  int retval = system(cmd2.c_str());
+  rd.end_timer(); // only the encode is timed; the decode below is not
+
+  char tmpyuv_prefix[] = "/tmp/rdout-XXXXXX";
+  char *tempfile = mktemp(tmpyuv_prefix);
+  assert(tempfile != NULL && tempfile[0] != 0);
+  std::string tmpyuv = std::string(tmpyuv_prefix) + ".yuv";
+
+  // decode the stream to YUV for the distortion measurement
+  std::string cmd3 = "ffmpeg -i " + streamname.str() + " -threads 6 " + tmpyuv;
+  retval = system(cmd3.c_str());
+  rd.compute_from_yuv(streamname.str(), tmpyuv);
+
+  unlink(tmpyuv.c_str());
+  if (!keepStreams) { unlink(streamname.str().c_str()); }
+
+  write_rd_line(rd);
+
+  return rd;
+}
+
+
+Encoder_de265 enc_de265;
+Encoder_HM enc_hm;
+Encoder_x265 enc_x265;
+Encoder_f265 enc_f265;
+Encoder_x264 enc_x264;
+Encoder_mpeg2 enc_mpeg2;
+
+// ---------------------------------------------------------------------------
+
+static struct option long_options[] = {
+  {"keep-streams",      no_argument,       0, 'k' },
+  //{"write-bytestream", required_argument,0, 'B' },
+  {0,         0,                 0,  0 }
+};
+
+
+void show_usage()  // prints synopsis (including options), preset table and input table to stderr
+{
+  fprintf(stderr,
+          "usage: rd-curves [-k|--keep-streams] [-f maxframes] [-p encoder_params] 'preset_id' 'input_preset' 'encoder'\n"
+          "supported encoders: de265 / hm / hmscc / x265 / f265 / x264 / mpeg2\n");
+  fprintf(stderr,
+          "presets:\n");
+
+  for (int i=0;preset[i].name!=NULL;i++) {
+    fprintf(stderr,
+            " %2d %-20s %s\n",preset[i].ID,preset[i].name,preset[i].descr);
+  }
+
+  fprintf(stderr,
+          "\ninput presets:\n");
+  for (int i=0;inputSpec[i].name;i++) {
+    fprintf(stderr,
+            " %-12s %-30s %4dx%4d, %4d frames, %5.2f fps\n",
+            inputSpec[i].name,
+            inputSpec[i].filename,
+            inputSpec[i].width,
+            inputSpec[i].height,
+            inputSpec[i].nFrames,
+            inputSpec[i].fps);
+  }
+}
+
+// Entry point: rd-curves [options] preset_id input_preset encoder  (see show_usage()).
+int main(int argc, char** argv)
+{
+  init_clock();
+
+  while (1) {
+    int option_index = 0;
+    int c = getopt_long(argc, argv, "kf:p:",
+                        long_options, &option_index);
+    if (c == -1)
+      break;
+
+    switch (c) {
+    case 'k': keepStreams=true; break;
+    case 'f': maxFrames=atoi(optarg); break;
+    case 'p': encoderParameters=optarg; break;
+    default:  show_usage(); exit(5); // '?': unknown option or missing argument
+    }
+  }
+
+  if (optind != argc-3) {
+    show_usage();
+    exit(5);
+  }
+
+  int presetID = atoi( argv[optind] );
+  const char* inputName = argv[optind+1];
+  const char* encoderName = argv[optind+2];
+
+  int presetIdx = -1;
+
+  for (int i=0;preset[i].name != NULL;i++) {
+    if (preset[i].ID == presetID) {
+      presetIdx = i;
+      break;
+    }
+  }
+
+  if (presetIdx == -1) {
+    fprintf(stderr,"preset ID %d does not exist\n",presetID);
+    exit(5);
+  }
+
+  setInput(inputName);
+  if (maxFrames) input.setMaxFrames(maxFrames); // -f overrides the input preset's frame count
+
+
+  Encoder* enc = NULL;
+  /**/ if (strcmp(encoderName,"de265")==0) { enc = &enc_de265; }
+  else if (strcmp(encoderName,"hm"   )==0) { enc = &enc_hm;   }
+  else if (strcmp(encoderName,"hmscc")==0) { enc = &enc_hm;   enc_hm.enableSCC(); }
+  else if (strcmp(encoderName,"x265" )==0) { enc = &enc_x265; }
+  else if (strcmp(encoderName,"f265" )==0) { enc = &enc_f265; }
+  else if (strcmp(encoderName,"x264" )==0) { enc = &enc_x264; }
+  else if (strcmp(encoderName,"mpeg2")==0) { enc = &enc_mpeg2; }
+
+  if (enc==NULL) {
+    fprintf(stderr, "unknown encoder '%s'\n", encoderName);
+    exit(5);
+  }
+
+
+  std::stringstream data_filename;
+  data_filename << encoderName << "-" << inputName << "-" << preset[presetIdx].name << ".rd";
+  output_fh = fopen(data_filename.str().c_str(), "wb");
+  if (output_fh==NULL) { perror(data_filename.str().c_str()); exit(5); }
+
+  fprintf(output_fh,"# %s\n", preset[presetIdx].descr);
+  fprintf(output_fh,"# 1:rate 2:psnr 3:ssim 4:cputime(min) 5:walltime(min)\n");
+
+  // Each point is written to output_fh by the encoder's encode() as soon as it has
+  // been measured, so the returned curve itself is not needed here.
+  enc->encode_curve(preset[presetIdx]);
+
+  fclose(output_fh);
+
+  return 0;
+}
diff --git a/tools/tests.cc b/tools/tests.cc
new file mode 100644
index 0000000..f728c67
--- /dev/null
+++ b/tools/tests.cc
@@ -0,0 +1,100 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <iostream>
+#include <string.h>
+
+
+class Test  // base class of self-registering tests; instances form a singly linked list
+{
+public:
+  Test() { next=s_firstTest; s_firstTest=this; } // push this test onto the global list
+  virtual ~Test() { }
+
+  virtual const char* getName() const { return "noname"; }
+  virtual const char* getDescription() const { return "no description"; }
+  virtual bool work(bool quiet=false) = 0; // returns false when the test failed
+
+  static void runTest(const char* name) {
+    for (Test* t = s_firstTest; t; t=t->next) {
+      if (strcmp(t->getName(), name)==0) {
+        if (t->work() == false) { printf("*** FAILED ***\n"); }
+        return;
+      }
+    }
+    // no registered test carries this name: say so instead of silently doing nothing
+    fprintf(stderr,"unknown test '%s'\n",name);
+  }
+
+  static void runAllTests() {
+    Test* t = s_firstTest;
+    while (t) {
+      printf("%s ... ",t->getName());
+      fflush(stdout);
+      if (t->work(true) == false) {
+        printf("*** FAILED ***\n");
+      }
+      else {
+        printf("passed\n");
+      }
+
+      t=t->next;
+    }
+  }
+
+public:
+  Test* next;
+  static Test* s_firstTest; // head of the list; most recently constructed test first
+};
+
+Test* Test::s_firstTest = NULL;
+
+
+class ListTests : public Test  // pseudo-test "list": prints every registered test
+{
+public:
+  const char* getName() const { return "list"; }
+  const char* getDescription() const { return "list all available tests"; }
+
+  // Always succeeds; prints nothing when invoked with quiet==true.
+  bool work(bool quiet) {
+    if (quiet) { return true; }
+
+    for (Test* entry = s_firstTest; entry != NULL; entry = entry->next) {
+      printf("- %s: %s\n",entry->getName(), entry->getDescription());
+    }
+    return true;
+  }
+} listtest;
+
+
+
+int main(int argc,char** argv)  // with an argument: run that test; without: run all
+{
+  const bool haveTestName = (argc>=2);
+  if (!haveTestName) {
+    Test::runAllTests();
+    return 0;
+  }
+
+  Test::runTest(argv[1]);
+  return 0;
+}
diff --git a/tools/yuv-distortion.cc b/tools/yuv-distortion.cc
new file mode 100644
index 0000000..ca08a99
--- /dev/null
+++ b/tools/yuv-distortion.cc
@@ -0,0 +1,113 @@
+/*
+ * H.265 video codec.
+ * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin at struktur.de>
+ *
+ * This file is part of libde265.
+ *
+ * libde265 is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * libde265 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <libde265/quality.h>
+
+#if HAVE_VIDEOGFX
+#include <libvideogfx.hh>
+using namespace videogfx;
+#endif
+
+
+float ssim(const uint8_t* img1,
+           const uint8_t* img2,
+           int width, int height)
+{
+#if !HAVE_VIDEOGFX
+  // Built without libvideogfx: SSIM is unavailable, always report 0.
+  return 0;
+#else
+  // Copy both planes (rows are 'width' bytes apart) into libvideogfx bitmaps.
+  Bitmap<Pixel> ref, coded;
+  ref  .Create(width, height);
+  coded.Create(width, height);
+  for (int row=0; row<height; row++) {
+    memcpy(coded[row], img1 + row*width, width); // img1 is used as the coded image
+    memcpy(ref[row],   img2 + row*width, width); // img2 is used as the reference
+  }
+  SSIM metric;
+  return metric.calcMSSIM(ref,coded);
+#endif
+}
+
+
+// Compares the luma planes of two raw YUV files frame by frame and prints the
+// PSNR and SSIM of each frame, followed by the totals over all frames.
+int main(int argc, char** argv)
+{
+  if (argc != 5) {
+    fprintf(stderr,"need two YUV files and image size as input: FILE1 FILE2 WIDTH HEIGHT\n");
+    exit(5);
+  }
+  int width  = atoi(argv[3]);
+  int height = atoi(argv[4]);
+  if (width <= 0 || height <= 0) {
+    fprintf(stderr,"invalid image size %s x %s\n",argv[3],argv[4]);
+    exit(5);
+  }
+
+  FILE* fh_ref = fopen(argv[1],"rb");
+  FILE* fh_cmp = fopen(argv[2],"rb");
+  if (fh_ref==NULL || fh_cmp==NULL) {
+    fprintf(stderr,"cannot open input file '%s'\n", fh_ref==NULL ? argv[1] : argv[2]);
+    exit(5);
+  }
+
+  const size_t lumaSize = (size_t)width * (size_t)height; // size_t: no int overflow
+  uint8_t* yp_ref = (uint8_t*)malloc(lumaSize);
+  uint8_t* yp_cmp = (uint8_t*)malloc(lumaSize);
+  if (yp_ref==NULL || yp_cmp==NULL) {
+    fprintf(stderr,"out of memory\n");
+    exit(5);
+  }
+
+  double mse_y=0.0, ssim_y=0.0;
+  int nFrames=0;
+
+  // stop at the first frame that cannot be read completely from both files
+  while (fread(yp_ref,1,lumaSize,fh_ref) == lumaSize &&
+         fread(yp_cmp,1,lumaSize,fh_cmp) == lumaSize) {
+    fprintf(stderr,"yuv-distortion processing frame %d\r",nFrames+1);
+    // skip the chroma data: half the luma size, i.e. presumably 4:2:0 input
+    fseek(fh_ref,(long)(lumaSize/2),SEEK_CUR);
+    fseek(fh_cmp,(long)(lumaSize/2),SEEK_CUR);
+
+    double curr_mse_y  = MSE(yp_ref, width,  yp_cmp, width,  width, height);
+    double curr_ssim_y = ssim(yp_ref, yp_cmp, width, height);
+    mse_y  += curr_mse_y;
+    ssim_y += curr_ssim_y;
+    printf("%4d %f %f\n",nFrames,PSNR(curr_mse_y),curr_ssim_y);
+    nFrames++;
+  }
+
+  if (nFrames>0) printf("total: %f %f\n",PSNR(mse_y/nFrames),ssim_y/nFrames);
+  else fprintf(stderr,"no complete frame could be read\n");
+  fprintf(stderr,"\n");
+  free(yp_ref); free(yp_cmp);
+  fclose(fh_ref); fclose(fh_cmp);
+  return 0;
+}

-- 
libde265 packaging