[SCM] blender/upstream: New upstream version 2.78.b+dfsg0

Sat Jun 3 20:27:20 UTC 2017

The following commit has been merged in the upstream branch:
commit 65b83c264576a3d26265ff00c11e24d8eca37d71
Author: Matteo F. Vescovi <mfv at debian.org>
Date:   Sat Jun 3 22:09:20 2017 +0200

    New upstream version 2.78.b+dfsg0

diff --git a/build_files/cmake/platform/platform_win32_msvc.cmake b/build_files/cmake/platform/platform_win32_msvc.cmake
index 5efda52..ecdba59 100644
--- a/build_files/cmake/platform/platform_win32_msvc.cmake
+++ b/build_files/cmake/platform/platform_win32_msvc.cmake
@@ -236,14 +236,14 @@ if(WITH_CODEC_FFMPEG)
 	windows_find_package(FFMPEG)
 	if(NOT FFMPEG_FOUND)
 		warn_hardcoded_paths(ffmpeg)
-		set(FFMPEG_LIBRARY_VERSION 55)
-		set(FFMPEG_LIBRARY_VERSION_AVU 52)
+		set(FFMPEG_LIBRARY_VERSION 57)
+		set(FFMPEG_LIBRARY_VERSION_AVU 55)
 		set(FFMPEG_LIBRARIES
-			${LIBDIR}/ffmpeg/lib/avcodec-${FFMPEG_LIBRARY_VERSION}.lib
-			${LIBDIR}/ffmpeg/lib/avformat-${FFMPEG_LIBRARY_VERSION}.lib
-			${LIBDIR}/ffmpeg/lib/avdevice-${FFMPEG_LIBRARY_VERSION}.lib
-			${LIBDIR}/ffmpeg/lib/avutil-${FFMPEG_LIBRARY_VERSION_AVU}.lib
-			${LIBDIR}/ffmpeg/lib/swscale-2.lib
+			${LIBDIR}/ffmpeg/lib/avcodec.lib
+			${LIBDIR}/ffmpeg/lib/avformat.lib
+			${LIBDIR}/ffmpeg/lib/avdevice.lib
+			${LIBDIR}/ffmpeg/lib/avutil.lib
+			${LIBDIR}/ffmpeg/lib/swscale.lib
 			)
 	endif()
 endif()
@@ -378,6 +378,7 @@ if(WITH_OPENIMAGEIO)
 	set(OPENCOLORIO_DEFINITIONS "-DOCIO_STATIC_BUILD")
 	set(OPENIMAGEIO_IDIFF "${OPENIMAGEIO}/bin/idiff.exe")
 	add_definitions(-DOIIO_STATIC_BUILD)
+	add_definitions(-DOIIO_NO_SSE=1)
 endif()
 
 if(WITH_LLVM)
diff --git a/intern/atomic/atomic_ops.h b/intern/atomic/atomic_ops.h
index 0bc7905..1a139d7 100644
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -77,13 +77,13 @@
 /* Function prototypes. */
 
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x);
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x);
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x);
 ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new);
 #endif
 
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x);
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x);
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x);
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new);
 
 ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x);
@@ -91,18 +91,18 @@ ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint8_t atomic_fetch_and_or_uint8(uint8_t *p, uint8_t b);
 ATOMIC_INLINE uint8_t atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b);
 
-ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x);
-ATOMIC_INLINE size_t atomic_sub_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_add_and_fetch_z(size_t *p, size_t x);
+ATOMIC_INLINE size_t atomic_sub_and_fetch_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new);
 
-ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x);
 ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);
 
 /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
  *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
  *          working on the same pointer at the same time is very low). */
-ATOMIC_INLINE float atomic_add_fl(float *p, const float x);
+ATOMIC_INLINE float atomic_add_and_fetch_fl(float *p, const float x);
 
 /******************************************************************************/
 /* Include system-dependent implementations. */
diff --git a/intern/atomic/intern/atomic_ops_ext.h b/intern/atomic/intern/atomic_ops_ext.h
index 4065299..74ed327 100644
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -56,25 +56,25 @@
 
 /******************************************************************************/
 /* size_t operations. */
-ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x)
+ATOMIC_INLINE size_t atomic_add_and_fetch_z(size_t *p, size_t x)
 {
 	assert(sizeof(size_t) == LG_SIZEOF_PTR);
 
 #if (LG_SIZEOF_PTR == 8)
-	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (size_t)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_PTR == 4)
-	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (size_t)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE size_t atomic_sub_z(size_t *p, size_t x)
+ATOMIC_INLINE size_t atomic_sub_and_fetch_z(size_t *p, size_t x)
 {
 	assert(sizeof(size_t) == LG_SIZEOF_PTR);
 
 #if (LG_SIZEOF_PTR == 8)
-	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (size_t)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_PTR == 4)
-	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (size_t)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
@@ -91,25 +91,25 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)
 
 /******************************************************************************/
 /* unsigned operations. */
-ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x)
 {
 	assert(sizeof(unsigned) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x)
 {
 	assert(sizeof(unsigned) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
@@ -127,7 +127,7 @@ ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
 /******************************************************************************/
 /* float operations. */
 
-ATOMIC_INLINE float atomic_add_fl(float *p, const float x)
+ATOMIC_INLINE float atomic_add_and_fetch_fl(float *p, const float x)
 {
 	assert(sizeof(float) == sizeof(uint32_t));
 
diff --git a/intern/atomic/intern/atomic_ops_msvc.h b/intern/atomic/intern/atomic_ops_msvc.h
index 15ddda2..c6a4bef 100644
--- a/intern/atomic/intern/atomic_ops_msvc.h
+++ b/intern/atomic/intern/atomic_ops_msvc.h
@@ -43,12 +43,12 @@
 /******************************************************************************/
 /* 64-bit operations. */
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return InterlockedExchangeAdd64((int64_t *)p, (int64_t)x) + x;
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return InterlockedExchangeAdd64((int64_t *)p, -((int64_t)x)) - x;
 }
@@ -61,12 +61,12 @@ ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _ne
 
 /******************************************************************************/
 /* 32-bit operations. */
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return InterlockedExchangeAdd(p, x) + x;
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return InterlockedExchangeAdd(p, -((int32_t)x)) - x;
 }
diff --git a/intern/atomic/intern/atomic_ops_unix.h b/intern/atomic/intern/atomic_ops_unix.h
index 55c0002..ad6fe74 100644
--- a/intern/atomic/intern/atomic_ops_unix.h
+++ b/intern/atomic/intern/atomic_ops_unix.h
@@ -58,12 +58,12 @@
 /* 64-bit operations. */
 #if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
 #  if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return __sync_add_and_fetch(p, x);
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	return __sync_sub_and_fetch(p, x);
 }
@@ -73,7 +73,7 @@ ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _ne
 	return __sync_val_compare_and_swap(v, old, _new);
 }
 #  elif (defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_add_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	asm volatile (
 	    "lock; xaddq %0, %1;"
@@ -83,7 +83,7 @@ ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
 	return x;
 }
 
-ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+ATOMIC_INLINE uint64_t atomic_sub_and_fetch_uint64(uint64_t *p, uint64_t x)
 {
 	x = (uint64_t)(-(int64_t)x);
 	asm volatile (
@@ -112,12 +112,12 @@ ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _ne
 /******************************************************************************/
 /* 32-bit operations. */
 #if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return __sync_add_and_fetch(p, x);
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	return __sync_sub_and_fetch(p, x);
 }
@@ -127,7 +127,7 @@ ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _ne
    return __sync_val_compare_and_swap(v, old, _new);
 }
 #elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_add_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	uint32_t ret = x;
 	asm volatile (
@@ -138,7 +138,7 @@ ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
 	return ret+x;
 }
 
-ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+ATOMIC_INLINE uint32_t atomic_sub_and_fetch_uint32(uint32_t *p, uint32_t x)
 {
 	ret = (uint32_t)(-(int32_t)x);
 	asm volatile (
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 97854a8..79c1c3e 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -74,7 +74,6 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
@@ -90,7 +89,6 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
 
 if(CXX_HAS_SSE)
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index e8168bc..9816d61 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -72,20 +72,17 @@ static void session_print(const string& str)
 
 static void session_print_status()
 {
-	int sample, tile;
-	double total_time, sample_time, render_time;
 	string status, substatus;
 
 	/* get status */
-	sample = options.session->progress.get_sample();
-	options.session->progress.get_tile(tile, total_time, sample_time, render_time);
+	float progress = options.session->progress.get_progress();
 	options.session->progress.get_status(status, substatus);
 
 	if(substatus != "")
 		status += ": " + substatus;
 
 	/* print status */
-	status = string_printf("Sample %d   %s", sample, status.c_str());
+	status = string_printf("Progress %05.2f   %s", (double) progress*100, status.c_str());
 	session_print(status);
 }
 
@@ -167,13 +164,12 @@ static void display_info(Progress& progress)
 	latency = (elapsed - last);
 	last = elapsed;
 
-	int sample, tile;
-	double total_time, sample_time, render_time;
+	double total_time, sample_time;
 	string status, substatus;
 
-	sample = progress.get_sample();
-	progress.get_tile(tile, total_time, sample_time, render_time);
+	progress.get_time(total_time, sample_time);
 	progress.get_status(status, substatus);
+	float progress_val = progress.get_progress();
 
 	if(substatus != "")
 		status += ": " + substatus;
@@ -184,10 +180,10 @@ static void display_info(Progress& progress)
 	        "%s"
 	        "        Time: %.2f"
 	        "        Latency: %.4f"
-	        "        Sample: %d"
+	        "        Progress: %05.2f"
 	        "        Average: %.4f"
 	        "        Interactive: %s",
-	        status.c_str(), total_time, latency, sample, sample_time, interactive.c_str());
+	        status.c_str(), total_time, latency, (double) progress_val*100, sample_time, interactive.c_str());
 
 	view_display_info(str.c_str());
 
@@ -337,7 +333,7 @@ static void options_parse(int argc, const char **argv)
 
 	/* device names */
 	string device_names = "";
-	string devicename = "cpu";
+	string devicename = "CPU";
 	bool list = false;
 
 	vector<DeviceType>& types = Device::available_types();
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 8a3eb98..35a30ae 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -210,17 +210,6 @@ static void xml_read_camera(XMLReadState& state, pugi::xml_node node)
 
 /* Shader */
 
-static string xml_socket_name(const char *name)
-{
-	string sname = name;
-	size_t i;
-
-	while((i = sname.find(" ")) != string::npos)
-		sname.replace(i, 1, "");
-	
-	return sname;
-}
-
 static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml_node graph_node)
 {
 	xml_read_node(state, shader, graph_node);
@@ -255,7 +244,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 					ShaderNode *fromnode = (ShaderNode*)graph_reader.node_map[from_node_name];
 
 					foreach(ShaderOutput *out, fromnode->outputs)
-						if(string_iequals(xml_socket_name(out->name().c_str()), from_socket_name.c_str()))
+						if(string_iequals(out->socket_type.name.string(), from_socket_name.string()))
 							output = out;
 
 					if(!output)
@@ -268,7 +257,7 @@ static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml
 					ShaderNode *tonode = (ShaderNode*)graph_reader.node_map[to_node_name];
 
 					foreach(ShaderInput *in, tonode->inputs)
-						if(string_iequals(xml_socket_name(in->name().c_str()), to_socket_name.c_str()))
+						if(string_iequals(in->socket_type.name.string(), to_socket_name.string()))
 							input = in;
 
 					if(!input)
@@ -406,7 +395,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 	int shader = 0;
 	bool smooth = state.smooth;
 
-	/* read vertices and polygons, RIB style */
+	/* read vertices and polygons */
 	vector<float3> P;
 	vector<float> UV;
 	vector<int> verts, nverts;
@@ -532,8 +521,12 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		sdparams.objecttoworld = state.tfm;
 	}
 
-	/* temporary for test compatibility */
-	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
+	/* we don't yet support arbitrary attributes, for now add vertex
+	 * coordinates as generated coordinates if requested */
+	if(mesh->need_attribute(state.scene, ATTR_STD_GENERATED)) {
+		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
+		memcpy(attr->data_float3(), mesh->verts.data(), sizeof(float3)*mesh->verts.size());
+	}
 }
 
 /* Light */
diff --git a/intern/cycles/blender/CCL_api.h b/intern/cycles/blender/CCL_api.h
index d3a68c4..233ffc8 100644
--- a/intern/cycles/blender/CCL_api.h
+++ b/intern/cycles/blender/CCL_api.h
@@ -21,17 +21,6 @@
 extern "C" {
 #endif
 
-/* returns a list of devices for selection, array is empty identifier
- * terminated and must not be freed */
-
-typedef struct CCLDeviceInfo {
-	char identifier[128];
-	char name[512];
-	int value;
-} CCLDeviceInfo;
-
-CCLDeviceInfo *CCL_compute_device_list(int device_type);
-
 /* create python module _cycles used by addon */
 
 void *CCL_python_module_init(void);
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index a79deca..b57502b 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SRC
 	blender_camera.cpp
 	blender_mesh.cpp
 	blender_object.cpp
+	blender_object_cull.cpp
 	blender_particles.cpp
 	blender_curves.cpp
 	blender_logging.cpp
@@ -35,6 +36,7 @@ set(SRC
 	blender_texture.cpp
 
 	CCL_api.h
+	blender_object_cull.h
 	blender_sync.h
 	blender_session.h
 	blender_texture.h
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 2938831..1fc3758 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -28,6 +28,20 @@ bl_info = {
     "support": 'OFFICIAL',
     "category": "Render"}
 
+# Support 'reload' case: "bpy" is only in locals() here if this module body already ran once (its import is below), so reload every submodule name that is already bound.
+if "bpy" in locals():
+    import importlib
+    if "engine" in locals():
+        importlib.reload(engine)
+    if "version_update" in locals():
+        importlib.reload(version_update)
+    if "ui" in locals():
+        importlib.reload(ui)
+    if "properties" in locals():
+        importlib.reload(properties)
+    if "presets" in locals():
+        importlib.reload(presets)
+
 import bpy
 
 from . import (
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 977d7f7..802b9b7 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -21,7 +21,8 @@ from bpy.props import (BoolProperty,
                        EnumProperty,
                        FloatProperty,
                        IntProperty,
-                       PointerProperty)
+                       PointerProperty,
+                       StringProperty)
 
 # enums
 
@@ -29,7 +30,7 @@ import _cycles
 
 enum_devices = (
     ('CPU', "CPU", "Use CPU for rendering"),
-    ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in user preferences"),
+    ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
     )
 
 if _cycles.with_network:
@@ -122,6 +123,22 @@ enum_volume_interpolation = (
     ('CUBIC', "Cubic", "Smoothed high quality interpolation, but slower")
     )
 
+enum_device_type = (  # items of CyclesDeviceSettings.type; presumably (identifier, name, description, number) -- confirm against bpy.props.EnumProperty
+    ('CPU', "CPU", "CPU", 0),
+    ('CUDA', "CUDA", "CUDA", 1),
+    ('OPENCL', "OpenCL", "OpenCL", 2)
+    )
+
+enum_texture_limit = (  # shared items of the texture_limit and texture_limit_render properties
+    ('OFF', "No Limit", "No texture size limit", 0),
+    ('128', "128", "Limit texture size to 128 pixels", 1),
+    ('256', "256", "Limit texture size to 256 pixels", 2),
+    ('512', "512", "Limit texture size to 512 pixels", 3),
+    ('1024', "1024", "Limit texture size to 1024 pixels", 4),
+    ('2048', "2048", "Limit texture size to 2048 pixels", 5),
+    ('4096', "4096", "Limit texture size to 4096 pixels", 6),
+    ('8192', "8192", "Limit texture size to 8192 pixels", 7),
+    )
 
 class CyclesRenderSettings(bpy.types.PropertyGroup):
     @classmethod
@@ -266,6 +283,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Sample all lights (for indirect samples), rather than randomly picking one",
                 default=True,
                 )
+        cls.light_sampling_threshold = FloatProperty(
+                name="Light Sampling Threshold",
+                description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
+                            "Zero disables the test and never ignores lights",
+                min=0.0, max=1.0,
+                default=0.01,
+                )
 
         cls.caustics_reflective = BoolProperty(
                 name="Reflective Caustics",
@@ -504,6 +528,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Use special type BVH optimized for hair (uses more ram but renders faster)",
                 default=True,
                 )
+        cls.debug_bvh_time_steps = IntProperty(
+                name="BVH Time Steps",
+                description="Split BVH primitives by this number of time steps to speed up render time in cost of memory",
+                default=0,
+                min=0, max=16,
+                )
         cls.tile_order = EnumProperty(
                 name="Tile Order",
                 description="Tile order for rendering",
@@ -552,6 +582,19 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 min=0.0, max=5.0
                 )
 
+        cls.use_distance_cull = BoolProperty(
+                name="Use Distance Cull",
+                description="Allow objects to be culled based on the distance from camera",
+                default=False,
+                )
+
+        cls.distance_cull_margin = FloatProperty(
+                name="Cull Distance",
+                description="Cull objects which are further away from camera than this distance",
+                default=50,
+                min=0.0
+                )
+
         cls.motion_blur_position = EnumProperty(
             name="Motion Blur Position",
             default='CENTER',
@@ -581,6 +624,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             min=0.0, max=1.0,
             )
 
+        cls.texture_limit = EnumProperty(
+            name="Viewport Texture Limit",
+            default='OFF',
+            description="Limit texture size used by viewport rendering",
+            items=enum_texture_limit
+            )
+
+        cls.texture_limit_render = EnumProperty(
+            name="Render Texture Limit",
+            default='OFF',
+            description="Limit texture size used by final rendering",
+            items=enum_texture_limit
+            )
+
         # Various fine-tuning debug flags
 
         def devices_update_callback(self, context):
@@ -1002,6 +1059,12 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
                 default=False,
                 )
 
+        cls.use_distance_cull = BoolProperty(
+                name="Use Distance Cull",
+                description="Allow this object and its duplicators to be culled by distance from camera",
+                default=False,
+                )
+
         cls.use_adaptive_subdivision = BoolProperty(
                 name="Use Adaptive Subdivision",
                 description="Use adaptive render time subdivision",
@@ -1123,6 +1186,107 @@ class CyclesCurveSettings(bpy.types.PropertyGroup):
         del bpy.types.ParticleSettings.cycles
 
 
+class CyclesDeviceSettings(bpy.types.PropertyGroup):  # element type of CyclesPreferences.devices
+    @classmethod
+    def register(cls):
+        cls.id = StringProperty(name="ID")  # filled from device[2] (persistent ID) in CyclesPreferences.get_devices()
+        cls.name = StringProperty(name="Name")  # filled from device[0] in CyclesPreferences.get_devices()
+        cls.use = BoolProperty(name="Use", default=True)  # shown as a toggle in draw_impl(); read by get_num_gpu_devices()
+        cls.type = EnumProperty(name="Type", items=enum_device_type, default='CUDA')  # filled from device[1] in get_devices()
+
+
+class CyclesPreferences(bpy.types.AddonPreferences):
+    """Add-on preferences: the Cycles compute device type plus a per-device enable toggle."""
+    bl_idname = __package__
+
+    def get_device_types(self, context):
+        """Return the enum items for compute_device_type, based on _cycles.get_device_types()."""
+        import _cycles
+        has_cuda, has_opencl = _cycles.get_device_types()
+        items = [('NONE', "None", "Don't use compute device", 0)]
+        if has_cuda:
+            items.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
+        if has_opencl:
+            items.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
+        return items
+
+    compute_device_type = EnumProperty(
+            name="Compute Device Type",
+            description="Device to use for computation (rendering with Cycles)",
+            items=get_device_types,
+            )
+
+    devices = bpy.props.CollectionProperty(type=CyclesDeviceSettings)
+
+    def get_devices(self):
+        """Return (cuda_devices, opencl_devices); adds an entry to self.devices for each new device."""
+        import _cycles
+        # Layout of the device tuples: (Name, Type, Persistent ID)
+        device_list = _cycles.available_devices()
+
+        cuda_devices = []
+        opencl_devices = []
+        for device in device_list:
+            if device[1] not in {'CUDA', 'OPENCL'}:
+                continue
+
+            entry = None
+            # Try to find existing Device entry
+            for dev in self.devices:
+                if dev.id == device[2] and dev.type == device[1]:
+                    entry = dev
+                    break
+            # Create new entry if no existing one was found
+            if entry is None:
+                entry = self.devices.add()
+                entry.id = device[2]
+                entry.name = device[0]
+                entry.type = device[1]
+
+            # Sort entries into lists
+            if entry.type == 'CUDA':
+                cuda_devices.append(entry)
+            elif entry.type == 'OPENCL':
+                opencl_devices.append(entry)
+        return cuda_devices, opencl_devices
+
+    def get_num_gpu_devices(self):
+        """Count enabled self.devices entries that match an available device of the selected type."""
+        import _cycles
+        device_list = _cycles.available_devices()
+        num = 0
+        for device in device_list:
+            if device[1] != self.compute_device_type:
+                continue
+            for dev in self.devices:
+                # Compare type as well as id, the same way get_devices() identifies an entry.
+                if dev.use and dev.id == device[2] and dev.type == device[1]:
+                    num += 1
+        return num
+
+    def has_active_device(self):
+        """Return True when get_num_gpu_devices() finds at least one enabled device."""
+        return self.get_num_gpu_devices() > 0
+
+    def draw_impl(self, layout, context):
+        """Draw the compute device type selector and a "use" toggle per device of that type."""
+        layout.label(text="Cycles Compute Device:")
+        layout.row().prop(self, "compute_device_type", expand=True)
+
+        cuda_devices, opencl_devices = self.get_devices()
+        row = layout.row()
+
+        shown = {'CUDA': cuda_devices, 'OPENCL': opencl_devices}.get(self.compute_device_type)
+        if shown:
+            col = row.column(align=True)
+            for device in shown:
+                col.prop(device, "use", text=device.name, toggle=True)
+
+    def draw(self, context):
+        """Draw callback; forwards to draw_impl() with self.layout."""
+        self.draw_impl(self.layout, context)
+
+
 def register():
     bpy.utils.register_class(CyclesRenderSettings)
     bpy.utils.register_class(CyclesCameraSettings)
@@ -1134,6 +1298,8 @@ def register():
     bpy.utils.register_class(CyclesObjectSettings)
     bpy.utils.register_class(CyclesCurveRenderSettings)
     bpy.utils.register_class(CyclesCurveSettings)
+    bpy.utils.register_class(CyclesDeviceSettings)
+    bpy.utils.register_class(CyclesPreferences)
 
 
 def unregister():
@@ -1147,3 +1313,5 @@ def unregister():
     bpy.utils.unregister_class(CyclesVisibilitySettings)
     bpy.utils.unregister_class(CyclesCurveRenderSettings)
     bpy.utils.unregister_class(CyclesCurveSettings)
+    bpy.utils.unregister_class(CyclesDeviceSettings)
+    bpy.utils.unregister_class(CyclesPreferences)
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 52872d2..ddcefaf 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -53,25 +53,26 @@ class CyclesButtonsPanel:
         return rd.engine in cls.COMPAT_ENGINES
 
 
+def get_device_type(context):  # returns the compute_device_type stored in this add-on's preferences
+    return context.user_preferences.addons[__package__].preferences.compute_device_type
+
+
 def use_cpu(context):
     cscene = context.scene.cycles
-    device_type = context.user_preferences.system.compute_device_type
 
-    return (device_type == 'NONE' or cscene.device == 'CPU')
+    return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
 
 
 def use_opencl(context):
     cscene = context.scene.cycles
-    device_type = context.user_preferences.system.compute_device_type
 
-    return (device_type == 'OPENCL' and cscene.device == 'GPU')
+    return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU')
 
 
 def use_cuda(context):
     cscene = context.scene.cycles
-    device_type = context.user_preferences.system.compute_device_type
 
-    return (device_type == 'CUDA' and cscene.device == 'GPU')
+    return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
 
 
 def use_branched_path(context):
@@ -85,6 +86,14 @@ def use_sample_all_lights(context):
 
     return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect
 
+
+def show_device_selection(context):
+    """Return True for 'NETWORK', or for 'CUDA'/'OPENCL' when the preferences report an active device."""
+    device_type = get_device_type(context)
+    if device_type in {'CUDA', 'OPENCL'}:
+        return context.user_preferences.addons[__package__].preferences.has_active_device()
+    return device_type == 'NETWORK'
+
 
 def draw_samples_info(layout, context):
     cscene = context.scene.cycles
@@ -141,7 +150,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         scene = context.scene
         cscene = scene.cycles
-        device_type = context.user_preferences.system.compute_device_type
 
         row = layout.row(align=True)
         row.menu("CYCLES_MT_sampling_presets", text=bpy.types.CYCLES_MT_sampling_presets.bl_label)
@@ -150,7 +158,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         row = layout.row()
         sub = row.row()
-        sub.active = device_type != 'OPENCL' or use_cpu(context)
+        sub.active = get_device_type(context) != 'OPENCL' or use_cpu(context)
         sub.prop(cscene, "progressive", text="")
         row.prop(cscene, "use_square_samples")
 
@@ -166,6 +174,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
 
         sub.prop(cscene, "sample_clamp_direct")
         sub.prop(cscene, "sample_clamp_indirect")
+        sub.prop(cscene, "light_sampling_threshold")
 
         if cscene.progressive == 'PATH' or use_branched_path(context) is False:
             col = split.column()
@@ -208,7 +217,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         draw_samples_info(layout, context)
 
 
-class CyclesRender_PT_geometery(CyclesButtonsPanel, Panel):
+class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
     bl_label = "Geometry"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -217,6 +226,7 @@ class CyclesRender_PT_geometery(CyclesButtonsPanel, Panel):
 
         scene = context.scene
         cscene = scene.cycles
+        ccscene = scene.cycles_curves
 
         if cscene.feature_set == 'EXPERIMENTAL':
             split = layout.split()
@@ -243,6 +253,25 @@ class CyclesRender_PT_geometery(CyclesButtonsPanel, Panel):
             row.prop(cscene, "volume_step_size")
             row.prop(cscene, "volume_max_steps")
 
+        layout.prop(ccscene, "use_curves", text="Use Hair")
+        col = layout.column()
+        col.active = ccscene.use_curves
+
+        col.prop(ccscene, "primitive", text="Primitive")
+        col.prop(ccscene, "shape", text="Shape")
+
+        if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
+            col.prop(ccscene, "cull_backfacing", text="Cull back-faces")
+
+        if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
+            col.prop(ccscene, "resolution", text="Resolution")
+        elif ccscene.primitive == 'CURVE_SEGMENTS':
+            col.prop(ccscene, "subdivisions", text="Curve subdivisions")
+
+        row = col.row()
+        row.prop(ccscene, "minimum_width", text="Min Pixels")
+        row.prop(ccscene, "maximum_width", text="Max Ext.")
+
 
 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
     bl_label = "Light Paths"
@@ -403,6 +432,10 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         col.prop(cscene, "debug_use_spatial_splits")
         col.prop(cscene, "debug_use_hair_bvh")
 
+        row = col.row()
+        row.active = not cscene.debug_use_spatial_splits
+        row.prop(cscene, "debug_bvh_time_steps")
+
 
 class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
     bl_label = "Layer"
@@ -758,8 +791,13 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
         col = layout.column()
         col.label(text="Performance:")
         row = col.row()
-        row.active = scene.render.use_simplify and cscene.use_camera_cull
-        row.prop(cob, "use_camera_cull")
+        sub = row.row()
+        sub.active = scene.render.use_simplify and cscene.use_camera_cull
+        sub.prop(cob, "use_camera_cull")
+
+        sub = row.row()
+        sub.active = scene.render.use_simplify and cscene.use_distance_cull
+        sub.prop(cob, "use_distance_cull")
 
 
 class CYCLES_OT_use_shading_nodes(Operator):
@@ -1380,43 +1418,6 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
             layout.template_ID(slot, "texture", new="texture.new")
 
 
-class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
-    bl_label = "Cycles Hair Rendering"
-    bl_context = "particle"
-
-    @classmethod
-    def poll(cls, context):
-        psys = context.particle_system
-        return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR'
-
-    def draw_header(self, context):
-        ccscene = context.scene.cycles_curves
-        self.layout.prop(ccscene, "use_curves", text="")
-
-    def draw(self, context):
-        layout = self.layout
-
-        scene = context.scene
-        ccscene = scene.cycles_curves
-
-        layout.active = ccscene.use_curves
-
-        layout.prop(ccscene, "primitive", text="Primitive")
-        layout.prop(ccscene, "shape", text="Shape")
-
-        if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
-            layout.prop(ccscene, "cull_backfacing", text="Cull back-faces")
-
-        if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
-            layout.prop(ccscene, "resolution", text="Resolution")
-        elif ccscene.primitive == 'CURVE_SEGMENTS':
-            layout.prop(ccscene, "subdivisions", text="Curve subdivisions")
-
-        row = layout.row()
-        row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
-
-
 class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
     bl_label = "Bake"
     bl_context = "render"
@@ -1576,24 +1577,40 @@ class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
         cscene = scene.cycles
 
         layout.active = rd.use_simplify
-        split = layout.split()
 
-        col = split.column()
-        col.label(text="Viewport:")
-        col.prop(rd, "simplify_subdivision", text="Subdivision")
-        col.prop(rd, "simplify_child_particles", text="Child Particles")
+        col = layout.column(align=True)
+        col.label(text="Subdivision")
+        row = col.row(align=True)
+        row.prop(rd, "simplify_subdivision", text="Viewport")
+        row.prop(rd, "simplify_subdivision_render", text="Render")
 
-        col = split.column()
-        col.label(text="Render:")
-        col.prop(rd, "simplify_subdivision_render", text="Subdivision")
-        col.prop(rd, "simplify_child_particles_render", text="Child Particles")
+        col = layout.column(align=True)
+        col.label(text="Child Particles")
+        row = col.row(align=True)
+        row.prop(rd, "simplify_child_particles", text="Viewport")
+        row.prop(rd, "simplify_child_particles_render", text="Render")
 
-        col = layout.column()
+        col = layout.column(align=True)
+        split = col.split()
+        sub = split.column()
+        sub.label(text="Texture Limit Viewport")
+        sub.prop(cscene, "texture_limit", text="")
+        sub = split.column()
+        sub.label(text="Texture Limit Render")
+        sub.prop(cscene, "texture_limit_render", text="")
+
+        split = layout.split()
+        col = split.column()
         col.prop(cscene, "use_camera_cull")
-        subsub = col.column()
-        subsub.active = cscene.use_camera_cull
-        subsub.prop(cscene, "camera_cull_margin")
+        row = col.row()
+        row.active = cscene.use_camera_cull
+        row.prop(cscene, "camera_cull_margin")
 
+        col = split.column()
+        col.prop(cscene, "use_distance_cull")
+        row = col.row()
+        row.active = cscene.use_distance_cull
+        row.prop(cscene, "distance_cull_margin", text="Distance")
 
 def draw_device(self, context):
     scene = context.scene
@@ -1605,9 +1622,11 @@ def draw_device(self, context):
 
         layout.prop(cscene, "feature_set")
 
-        device_type = context.user_preferences.system.compute_device_type
-        if device_type in {'CUDA', 'OPENCL', 'NETWORK'}:
-            layout.prop(cscene, "device")
+        split = layout.split(percentage=1/3)
+        split.label("Device:")
+        row = split.row()
+        row.active = show_device_selection(context)
+        row.prop(cscene, "device", text="")
 
         if engine.with_osl() and use_cpu(context):
             layout.prop(cscene, "shading_system")
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 830723d..b2a7455 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -172,6 +172,24 @@ def custom_bake_remap(scene):
 
 @persistent
 def do_versions(self):
+    if bpy.context.user_preferences.version <= (2, 78, 1):
+        prop = bpy.context.user_preferences.addons[__package__].preferences
+        system = bpy.context.user_preferences.system
+        if not prop.is_property_set("compute_device_type"):
+            # Device might not currently be available so this can fail
+            try:
+                if system.legacy_compute_device_type == 1:
+                    prop.compute_device_type = 'OPENCL'
+                elif system.legacy_compute_device_type == 2:
+                    prop.compute_device_type = 'CUDA'
+                else:
+                    prop.compute_device_type = 'NONE'
+            except:
+                pass
+
+            # Init device list for UI
+            prop.get_devices()
+
     # We don't modify startup file because it assumes to
     # have all the default values only.
     if not bpy.data.is_saved:
@@ -278,3 +296,9 @@ def do_versions(self):
                     cscene.pixel_filter_type = cscene.filter_type
                 if cscene.filter_type == 'BLACKMAN_HARRIS':
                     cscene.filter_type = 'GAUSSIAN'
+
+    if bpy.data.version <= (2, 78, 2):
+        for scene in bpy.data.scenes:
+            cscene = scene.cycles
+            if not cscene.is_property_set("light_sampling_threshold"):
+                cscene.light_sampling_threshold = 0.0
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index 378ae67..e42ff5d 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -29,24 +29,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Utilities */
-
-/* Hair curve functions */
-
-void curveinterp_v3_v3v3v3v3(float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4]);
-void interp_weights(float t, float data[4]);
-float shaperadius(float shape, float root, float tip, float time);
-void InterpolateKeySegments(int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData);
-bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int uv_num);
-bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num);
-bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background);
-void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData);
-void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
-                               float3 RotCam, bool is_ortho);
-void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution);
-void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
-void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
-
 ParticleCurveData::ParticleCurveData()
 {
 }
@@ -55,7 +37,7 @@ ParticleCurveData::~ParticleCurveData()
 {
 }
 
-void interp_weights(float t, float data[4])
+static void interp_weights(float t, float data[4])
 {
 	/* Cardinal curve interpolation */
 	float t2 = t * t;
@@ -68,17 +50,19 @@ void interp_weights(float t, float data[4])
 	data[3] =  fc          * t3  - fc * t2;
 }
 
-void curveinterp_v3_v3v3v3v3(float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4])
+static void curveinterp_v3_v3v3v3v3(float3 *p,
+                                    float3 *v1, float3 *v2, float3 *v3, float3 *v4,
+                                    const float w[4])
 {
 	p->x = v1->x * w[0] + v2->x * w[1] + v3->x * w[2] + v4->x * w[3];
 	p->y = v1->y * w[0] + v2->y * w[1] + v3->y * w[2] + v4->y * w[3];
 	p->z = v1->z * w[0] + v2->z * w[1] + v3->z * w[2] + v4->z * w[3];
 }
 
-float shaperadius(float shape, float root, float tip, float time)
+static float shaperadius(float shape, float root, float tip, float time)
 {
 	float radius = 1.0f - time;
-	
+
 	if(shape != 0.0f) {
 		if(shape < 0.0f)
 			radius = powf(radius, 1.0f + shape);
@@ -90,7 +74,13 @@ float shaperadius(float shape, float root, float tip, float time)
 
 /* curve functions */
 
-void InterpolateKeySegments(int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData)
+static void InterpolateKeySegments(int seg,
+                                   int segno,
+                                   int key,
+                                   int curve,
+                                   float3 *keyloc,
+                                   float *time,
+                                   ParticleCurveData *CData)
 {
 	float3 ckey_loc1 = CData->curvekey_co[key];
 	float3 ckey_loc2 = ckey_loc1;
@@ -119,7 +109,11 @@ void InterpolateKeySegments(int seg, int segno, int key, int curve, float3 *keyl
 		curveinterp_v3_v3v3v3v3(keyloc, &ckey_loc1, &ckey_loc2, &ckey_loc3, &ckey_loc4, t);
 }
 
-bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
+static bool ObtainCacheParticleData(Mesh *mesh,
+                                    BL::Mesh *b_mesh,
+                                    BL::Object *b_ob,
+                                    ParticleCurveData *CData,
+                                    bool background)
 {
 	int curvenum = 0;
 	int keyno = 0;
@@ -143,7 +137,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-				
+
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
@@ -161,7 +155,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				CData->psys_shader.push_back_slow(shader);
 
 				float radius = get_float(cpsys, "radius_scale") * 0.5f;
-	
+
 				CData->psys_rootradius.push_back_slow(radius * get_float(cpsys, "root_width"));
 				CData->psys_tipradius.push_back_slow(radius * get_float(cpsys, "tip_width"));
 				CData->psys_shape.push_back_slow(get_float(cpsys, "shape"));
@@ -181,7 +175,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				for(; pa_no < totparts+totchild; pa_no++) {
 					int keynum = 0;
 					CData->curve_firstkey.push_back_slow(keyno);
-					
+
 					float curve_length = 0.0f;
 					float3 pcKey;
 					for(int step_no = 0; step_no < ren_step; step_no++) {
@@ -213,7 +207,12 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 	return true;
 }
 
-bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int uv_num)
+static bool ObtainCacheParticleUV(Mesh *mesh,
+                                  BL::Mesh *b_mesh,
+                                  BL::Object *b_ob,
+                                  ParticleCurveData *CData,
+                                  bool background,
+                                  int uv_num)
 {
 	if(!(mesh && b_mesh && b_ob && CData))
 		return false;
@@ -231,7 +230,7 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-				
+
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
@@ -267,7 +266,12 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
 	return true;
 }
 
-bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num)
+static bool ObtainCacheParticleVcol(Mesh *mesh,
+                                    BL::Mesh *b_mesh,
+                                    BL::Object *b_ob,
+                                    ParticleCurveData *CData,
+                                    bool background,
+                                    int vcol_num)
 {
 	if(!(mesh && b_mesh && b_ob && CData))
 		return false;
@@ -285,7 +289,7 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-				
+
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;
 
@@ -333,16 +337,16 @@ static void set_resolution(BL::Object *b_ob, BL::Scene *scene, bool render)
 	}
 }
 
-void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
-                               float3 RotCam, bool is_ortho)
+static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
+                                      float3 RotCam, bool is_ortho)
 {
 	int vertexno = mesh->verts.size();
 	int vertexindex = vertexno;
 	int numverts = 0, numtris = 0;
 
 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -354,8 +358,8 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
 
 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -380,7 +384,7 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 
 				if(curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
 					v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[max(curvekey - 1, CData->curve_firstkey[curve])];
-				else 
+				else
 					v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey - 1];
 
 				time = CData->curvekey_time[curvekey]/CData->curve_length[curve];
@@ -416,15 +420,17 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 	/* texture coords still needed */
 }
 
-void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution)
+static void ExportCurveTriangleGeometry(Mesh *mesh,
+                                        ParticleCurveData *CData,
+                                        int resolution)
 {
 	int vertexno = mesh->verts.size();
 	int vertexindex = vertexno;
 	int numverts = 0, numtris = 0;
 
 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -436,8 +442,8 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);
 
 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -548,7 +554,7 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 	/* texture coords still needed */
 }
 
-void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
+static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 {
 	int num_keys = 0;
 	int num_curves = 0;
@@ -557,13 +563,13 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		return;
 
 	Attribute *attr_intercept = NULL;
-	
+
 	if(mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
 		attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT);
 
 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -582,8 +588,8 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 	num_curves = 0;
 
 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -677,8 +683,13 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 	/* in case of new attribute, we verify if there really was any motion */
 	if(new_attribute) {
 		if(i != numkeys || !have_motion) {
-			/* no motion, remove attributes again */
-			VLOG(1) << "No motion, removing attribute";
+			/* No motion or hair "topology" changed, remove attributes again. */
+			if(i != numkeys) {
+				VLOG(1) << "Hair topology changed, removing attribute.";
+			}
+			else {
+				VLOG(1) << "No motion, removing attribute.";
+			}
 			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 		}
 		else if(time_index > 0) {
@@ -698,7 +709,10 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 	}
 }
 
-void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
+static void ExportCurveTriangleUV(ParticleCurveData *CData,
+                                  int vert_offset,
+                                  int resol,
+                                  float3 *uvdata)
 {
 	if(uvdata == NULL)
 		return;
@@ -708,8 +722,8 @@ void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol,
 
 	int vertexindex = vert_offset;
 
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -743,15 +757,18 @@ void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol,
 	}
 }
 
-void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
+static void ExportCurveTriangleVcol(ParticleCurveData *CData,
+                                    int vert_offset,
+                                    int resol,
+                                    uchar4 *cdata)
 {
 	if(cdata == NULL)
 		return;
 
 	int vertexindex = vert_offset;
 
-	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
@@ -1044,4 +1061,3 @@ void BlenderSync::sync_curves(Mesh *mesh,
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 7c2049d..85117cf 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -597,8 +597,8 @@ static void create_mesh(Scene *scene,
                         Mesh *mesh,
                         BL::Mesh& b_mesh,
                         const vector<Shader*>& used_shaders,
-                        bool subdivision=false,
-                        bool subdivide_uvs=true)
+                        bool subdivision = false,
+                        bool subdivide_uvs = true)
 {
 	/* count vertices and faces */
 	int numverts = b_mesh.vertices.length();
@@ -671,28 +671,10 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 			bool smooth = f->use_smooth() || use_loop_normals;
 
-			/* split vertices if normal is different
+			/* Create triangles.
 			 *
-			 * note all vertex attributes must have been set here so we can split
-			 * and copy attributes in split_vertex without remapping later */
-			if(use_loop_normals) {
-				BL::Array<float, 12> loop_normals = f->split_normals();
-
-				for(int i = 0; i < n; i++) {
-					float3 loop_N = make_float3(loop_normals[i * 3], loop_normals[i * 3 + 1], loop_normals[i * 3 + 2]);
-
-					if(N[vi[i]] != loop_N) {
-						int new_vi = mesh->split_vertex(vi[i]);
-
-						/* set new normal and vertex index */
-						N = attr_N->data_float3();
-						N[new_vi] = loop_N;
-						vi[i] = new_vi;
-					}
-				}
-			}
-
-			/* create triangles */
+			 * NOTE: Autosmooth is already taken care of.
+			 */
 			if(n == 4) {
 				if(is_zero(cross(mesh->verts[vi[1]] - mesh->verts[vi[0]], mesh->verts[vi[2]] - mesh->verts[vi[0]])) ||
 				   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
@@ -724,24 +706,8 @@ static void create_mesh(Scene *scene,
 
 			vi.reserve(n);
 			for(int i = 0; i < n; i++) {
+				/* NOTE: Autosmooth is already taken care of. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
-
-				/* split vertices if normal is different
-				 *
-				 * note all vertex attributes must have been set here so we can split
-				 * and copy attributes in split_vertex without remapping later */
-				if(use_loop_normals) {
-					float3 loop_N = get_float3(b_mesh.loops[p->loop_start() + i].normal());
-
-					if(N[vi[i]] != loop_N) {
-						int new_vi = mesh->split_vertex(vi[i]);
-
-						/* set new normal and vertex index */
-						N = attr_N->data_float3();
-						N[new_vi] = loop_N;
-						vi[i] = new_vi;
-					}
-				}
 			}
 
 			/* create subd faces */
@@ -847,7 +813,7 @@ static void sync_mesh_fluid_motion(BL::Object& b_ob, Scene *scene, Mesh *mesh)
 
 	/* Only export previous and next frame, we don't have any in between data. */
 	float motion_times[2] = {-1.0f, 1.0f};
-	for (int step = 0; step < 2; step++) {
+	for(int step = 0; step < 2; step++) {
 		float relative_time = motion_times[step] * scene->motion_shutter_time() * 0.5f;
 		float3 *mP = attr_mP->data_float3() + step*mesh->verts.size();
 
@@ -961,7 +927,20 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 
 		mesh->subdivision_type = object_subdivision_type(b_ob, preview, experimental);
 
-		BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, need_undeformed, mesh->subdivision_type);
+		/* Disable adaptive subdivision while baking as the baking system
+		 * currently doesn't support the topology and will crash.
+		 */
+		if(scene->bake_manager->get_baking()) {
+			mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
+		}
+
+		BL::Mesh b_mesh = object_to_mesh(b_data,
+		                                 b_ob,
+		                                 b_scene,
+		                                 true,
+		                                 !preview,
+		                                 need_undeformed,
+		                                 mesh->subdivision_type);
 
 		if(b_mesh) {
 			if(render_layer.use_surfaces && !hide_tris) {
@@ -1086,7 +1065,13 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 
 	if(ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
 		/* get derived mesh */
-		b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false, false);
+		b_mesh = object_to_mesh(b_data,
+		                        b_ob,
+		                        b_scene,
+		                        true,
+		                        !preview,
+		                        false,
+		                        Mesh::SUBDIVISION_NONE);
 	}
 
 	if(!b_mesh) {
@@ -1157,10 +1142,12 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 			{
 				/* no motion, remove attributes again */
 				if(b_mesh.vertices.length() != numverts) {
-					VLOG(1) << "Topology differs, disabling motion blur.";
+					VLOG(1) << "Topology differs, disabling motion blur for object "
+					        << b_ob.name();
 				}
 				else {
-					VLOG(1) << "No actual deformation motion for object " << b_ob.name();
+					VLOG(1) << "No actual deformation motion for object "
+					        << b_ob.name();
 				}
 				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 				if(attr_mN)
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index f7f77df..637cf7a 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -25,6 +25,7 @@
 #include "particles.h"
 #include "shader.h"
 
+#include "blender_object_cull.h"
 #include "blender_sync.h"
 #include "blender_util.h"
 
@@ -153,6 +154,7 @@ void BlenderSync::sync_light(BL::Object& b_parent,
 	/* location and (inverted!) direction */
 	light->co = transform_get_column(&tfm, 3);
 	light->dir = -transform_get_column(&tfm, 2);
+	light->tfm = tfm;
 
 	/* shader */
 	vector<Shader*> used_shaders;
@@ -234,55 +236,6 @@ void BlenderSync::sync_background_light(bool use_portal)
 
 /* Object */
 
-/* TODO(sergey): Not really optimal, consider approaches based on k-DOP in order
- * to reduce number of objects which are wrongly considered visible.
- */
-static bool object_boundbox_clip(Scene *scene,
-                                 BL::Object& b_ob,
-                                 Transform& tfm,
-                                 float margin)
-{
-	Camera *cam = scene->camera;
-	Transform& worldtondc = cam->worldtondc;
-	BL::Array<float, 24> boundbox = b_ob.bound_box();
-	float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
-	       bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
-	bool all_behind = true;
-	for(int i = 0; i < 8; ++i) {
-		float3 p = make_float3(boundbox[3 * i + 0],
-		                       boundbox[3 * i + 1],
-		                       boundbox[3 * i + 2]);
-		p = transform_point(&tfm, p);
-
-		float4 b = make_float4(p.x, p.y, p.z, 1.0f);
-		float4 c = make_float4(dot(worldtondc.x, b),
-		                       dot(worldtondc.y, b),
-		                       dot(worldtondc.z, b),
-		                       dot(worldtondc.w, b));
-		p = float4_to_float3(c / c.w);
-		if(c.z < 0.0f) {
-			p.x = 1.0f - p.x;
-			p.y = 1.0f - p.y;
-		}
-		if(c.z >= -margin) {
-			all_behind = false;
-		}
-		bb_min = min(bb_min, p);
-		bb_max = max(bb_max, p);
-	}
-	if(!all_behind) {
-		if(bb_min.x >= 1.0f + margin ||
-		   bb_min.y >= 1.0f + margin ||
-		   bb_max.x <= -margin ||
-		   bb_max.y <= -margin)
-		{
-			return true;
-		}
-		return false;
-	}
-	return true;
-}
-
 Object *BlenderSync::sync_object(BL::Object& b_parent,
                                  int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
                                  BL::DupliObject& b_dupli_ob,
@@ -290,8 +243,7 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
                                  uint layer_flag,
                                  float motion_time,
                                  bool hide_tris,
-                                 bool use_camera_cull,
-                                 float camera_cull_margin,
+                                 BlenderObjectCulling& culling,
                                  bool *use_portal)
 {
 	BL::Object b_ob = (b_dupli_ob ? b_dupli_ob.object() : b_parent);
@@ -307,11 +259,12 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	}
 
 	/* only interested in object that we can create meshes from */
-	if(!object_is_mesh(b_ob))
+	if(!object_is_mesh(b_ob)) {
 		return NULL;
+	}
 
-	/* Perform camera space culling. */
-	if(use_camera_cull && object_boundbox_clip(scene, b_ob, tfm, camera_cull_margin)) {
+	/* Perform object culling. */
+	if(culling.test(scene, b_ob, tfm)) {
 		return NULL;
 	}
 
@@ -547,17 +500,8 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 		mesh_motion_synced.clear();
 	}
 
-	bool allow_camera_cull = false;
-	float camera_cull_margin = 0.0f;
-	if(b_scene.render().use_simplify()) {
-		PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-		allow_camera_cull = scene->camera->type != CAMERA_PANORAMA &&
-		                    !b_scene.render().use_multiview() &&
-		                    get_boolean(cscene, "use_camera_cull");
-		if(allow_camera_cull) {
-			camera_cull_margin = get_float(cscene, "camera_cull_margin");
-		}
-	}
+	/* initialize culling */
+	BlenderObjectCulling culling(scene, b_scene);
 
 	/* object loop */
 	BL::Scene::object_bases_iterator b_base;
@@ -589,12 +533,9 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 			if(!hide) {
 				progress.set_sync_status("Synchronizing object", b_ob.name());
 
-				PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
-				bool use_camera_cull = allow_camera_cull && get_boolean(cobject, "use_camera_cull");
-				if(use_camera_cull) {
-					/* Need to have proper projection matrix. */
-					scene->camera->update();
-				}
+				/* load per-object culling data */
+				culling.init_object(scene, b_ob);
+
 				if(b_ob.is_duplicator() && !object_render_hide_duplis(b_ob)) {
 					/* dupli objects */
 					b_ob.dupli_list_create(b_scene, dupli_settings);
@@ -621,8 +562,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 							                             ob_layer,
 							                             motion_time,
 							                             hide_tris,
-							                             use_camera_cull,
-							                             camera_cull_margin,
+							                             culling,
 							                             &use_portal);
 
 							/* sync possible particle data, note particle_id
@@ -651,8 +591,7 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 					            ob_layer,
 					            motion_time,
 					            hide_tris,
-					            use_camera_cull,
-					            camera_cull_margin,
+					            culling,
 					            &use_portal);
 				}
 			}
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
new file mode 100644
index 0000000..08918dd
--- /dev/null
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdlib>
+
+#include "camera.h"
+
+#include "blender_object_cull.h"
+
+CCL_NAMESPACE_BEGIN
+
+BlenderObjectCulling::BlenderObjectCulling(Scene *scene, BL::Scene& b_scene)
+        : use_scene_camera_cull_(false),
+          use_camera_cull_(false),
+          camera_cull_margin_(0.0f),
+          use_scene_distance_cull_(false),
+          use_distance_cull_(false),
+          distance_cull_margin_(0.0f)
+{
+	/* Culling settings are only read when scene simplification is on. */
+	if(!b_scene.render().use_simplify()) {
+		return;
+	}
+
+	PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+	/* Neither mode is enabled for panorama cameras or multiview renders. */
+	const bool culling_allowed = scene->camera->type != CAMERA_PANORAMA &&
+	                             !b_scene.render().use_multiview();
+
+	camera_cull_margin_ = get_float(cscene, "camera_cull_margin");
+	distance_cull_margin_ = get_float(cscene, "distance_cull_margin");
+	use_scene_camera_cull_ = culling_allowed && get_boolean(cscene, "use_camera_cull");
+	/* A zero distance margin switches distance culling off. */
+	use_scene_distance_cull_ = culling_allowed && distance_cull_margin_ != 0.0f &&
+	                           get_boolean(cscene, "use_distance_cull");
+}
+
+void BlenderObjectCulling::init_object(Scene *scene, BL::Object& b_ob)
+{
+	/* Nothing to load when the scene has both culling modes disabled. */
+	const bool scene_culls = use_scene_camera_cull_ || use_scene_distance_cull_;
+	if(scene_culls) {
+		PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+		/* A mode is active only if both the scene and the object enable it. */
+		use_camera_cull_ = use_scene_camera_cull_ && get_boolean(cobject, "use_camera_cull");
+		use_distance_cull_ = use_scene_distance_cull_ && get_boolean(cobject, "use_distance_cull");
+
+		if(use_camera_cull_ || use_distance_cull_) {
+			/* Need to have proper projection matrix. */
+			scene->camera->update();
+		}
+	}
+}
+
+bool BlenderObjectCulling::test(Scene *scene, BL::Object& b_ob, Transform& tfm)
+{
+	/* With no culling mode active for this object it is always kept. */
+	if(!(use_camera_cull_ || use_distance_cull_)) {
+		return false;
+	}
+
+	/* Bring the eight bounding box corners into world space via tfm. */
+	BL::Array<float, 24> boundbox = b_ob.bound_box();
+	float3 corners[8];
+	for(int corner = 0; corner < 8; ++corner) {
+		const int offset = 3 * corner;
+		float3 local = make_float3(boundbox[offset], boundbox[offset + 1], boundbox[offset + 2]);
+		corners[corner] = transform_point(&tfm, local);
+	}
+
+	const bool camera_culled = use_camera_cull_ && test_camera(scene, corners);
+	const bool distance_culled = use_distance_cull_ && test_distance(scene, corners);
+
+	/* Every enabled test has to agree before the object is dropped. */
+	return (camera_culled || !use_camera_cull_) &&
+	       (distance_culled || !use_distance_cull_);
+}
+
+/* TODO(sergey): Not really optimal, consider approaches based on k-DOP in order
+ * to reduce number of objects which are wrongly considered visible.
+ */
+bool BlenderObjectCulling::test_camera(Scene *scene, float3 bb[8])
+{
+	const float margin = camera_cull_margin_;
+	Transform& worldtondc = scene->camera->worldtondc;
+	float3 ndc_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
+	float3 ndc_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+	int num_in_front = 0;
+	for(int corner = 0; corner < 8; ++corner) {
+		/* Apply all four rows of worldtondc to the homogeneous corner. */
+		float4 b = make_float4(bb[corner].x, bb[corner].y, bb[corner].z, 1.0f);
+		float4 c = make_float4(dot(worldtondc.x, b),
+		                       dot(worldtondc.y, b),
+		                       dot(worldtondc.z, b),
+		                       dot(worldtondc.w, b));
+		float3 p = float4_to_float3(c / c.w);
+		if(c.z < 0.0f) {
+			p.x = 1.0f - p.x;
+			p.y = 1.0f - p.y;
+		}
+		if(c.z >= -margin) {
+			++num_in_front;
+		}
+		ndc_min = min(ndc_min, p);
+		ndc_max = max(ndc_max, p);
+	}
+	if(num_in_front == 0) {
+		return true;
+	}
+	/* Cull when the projected bounds miss the unit square grown by the margin. */
+	const float upper = 1.0f + margin, lower = -margin;
+	return (ndc_min.x >= upper || ndc_min.y >= upper ||
+	        ndc_max.x <= lower || ndc_max.y <= lower);
+}
+
+bool BlenderObjectCulling::test_distance(Scene *scene, float3 bb[8])
+{
+	/* Axis aligned bounds of the eight corners. */
+	float3 lower = make_float3(FLT_MAX, FLT_MAX, FLT_MAX);
+	float3 upper = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+	for(int corner = 0; corner < 8; ++corner) {
+		lower = min(lower, bb[corner]);
+		upper = max(upper, bb[corner]);
+	}
+
+	/* Clamping the camera position to the bounds gives the nearest point of the box. */
+	const float3 camera_position = transform_get_column(&scene->camera->matrix, 3);
+	const float3 nearest = max(min(upper, camera_position), lower);
+	const float3 offset = camera_position - nearest;
+	const float max_distance_sq = distance_cull_margin_ * distance_cull_margin_;
+	return len_squared(offset) > max_distance_sq;
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/render/background.h b/intern/cycles/blender/blender_object_cull.h
similarity index 50%
copy from intern/cycles/render/background.h
copy to intern/cycles/blender/blender_object_cull.h
index 8029c6a..b6f0ca5 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011-2013 Blender Foundation
+ * Copyright 2011-2016 Blender Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,47 +14,36 @@
  * limitations under the License.
  */
 
-#ifndef __BACKGROUND_H__
-#define __BACKGROUND_H__
-
-#include "node.h"
+#ifndef __BLENDER_OBJECT_CULL_H__
+#define __BLENDER_OBJECT_CULL_H__
 
+#include "blender_sync.h"
 #include "util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
-class Device;
-class DeviceScene;
 class Scene;
-class Shader;
 
-class Background : public Node {
+class BlenderObjectCulling
+{
 public:
-	NODE_DECLARE;
-
-	float ao_factor;
-	float ao_distance;
-
-	bool use_shader;
-	bool use_ao;
+	BlenderObjectCulling(Scene *scene, BL::Scene& b_scene);
 
-	uint visibility;
-	Shader *shader;
+	void init_object(Scene *scene, BL::Object& b_ob);
+	bool test(Scene *scene, BL::Object& b_ob, Transform& tfm);
 
-	bool transparent;
-	bool need_update;
+private:
+	bool test_camera(Scene *scene, float3 bb[8]);
+	bool test_distance(Scene *scene, float3 bb[8]);
 
-	Background();
-	~Background();
-
-	void device_update(Device *device, DeviceScene *dscene, Scene *scene);
-	void device_free(Device *device, DeviceScene *dscene);
-
-	bool modified(const Background& background);
-	void tag_update(Scene *scene);
+	bool use_scene_camera_cull_;
+	bool use_camera_cull_;
+	float camera_cull_margin_;
+	bool use_scene_distance_cull_;
+	bool use_distance_cull_;
+	float distance_cull_margin_;
 };
 
 CCL_NAMESPACE_END
 
-#endif /* __BACKGROUND_H__ */
-
+#endif /* __BLENDER_OBJECT_CULL_H__ */
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index a50f5ed..438abc4 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -40,10 +40,6 @@ CCL_NAMESPACE_BEGIN
 
 namespace {
 
-/* Device list stored static (used by compute_device_list()). */
-static ccl::vector<CCLDeviceInfo> device_list;
-static ccl::DeviceType device_type = DEVICE_NONE;
-
 /* Flag describing whether debug flags were synchronized from scene. */
 bool debug_flags_set = false;
 
@@ -195,7 +191,6 @@ static PyObject *exit_func(PyObject * /*self*/, PyObject * /*args*/)
 	ShaderManager::free_memory();
 	TaskScheduler::free_memory();
 	Device::free_memory();
-	device_list.free_memory();
 	Py_RETURN_NONE;
 }
 
@@ -389,7 +384,12 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject * /*args*/
 
 	for(size_t i = 0; i < devices.size(); i++) {
 		DeviceInfo& device = devices[i];
-		PyTuple_SET_ITEM(ret, i, PyUnicode_FromString(device.description.c_str()));
+		string type_name = Device::string_from_type(device.type);
+		PyObject *device_tuple = PyTuple_New(3);
+		PyTuple_SET_ITEM(device_tuple, 0, PyUnicode_FromString(device.description.c_str()));
+		PyTuple_SET_ITEM(device_tuple, 1, PyUnicode_FromString(type_name.c_str()));
+		PyTuple_SET_ITEM(device_tuple, 2, PyUnicode_FromString(device.id.c_str()));
+		PyTuple_SET_ITEM(ret, i, device_tuple);
 	}
 
 	return ret;
@@ -676,6 +676,20 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }
 
+static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
+{
+	/* Report as a (has_cuda, has_opencl) tuple which device types are available. */
+	vector<DeviceInfo>& devices = Device::available_devices();
+	bool has_cuda = false, has_opencl = false;
+	for(size_t i = 0; i < devices.size(); i++) {
+		has_cuda   |= (devices[i].type == DEVICE_CUDA);
+		has_opencl |= (devices[i].type == DEVICE_OPENCL);
+	}
+	/* Py_BuildValue() returns NULL on failure, so a failed tuple allocation is
+	 * propagated to the caller; "N" hands over the references to the new bools. */
+	return Py_BuildValue("(NN)", PyBool_FromLong(has_cuda), PyBool_FromLong(has_opencl));
+}
+
 static PyMethodDef methods[] = {
 	{"init", init_func, METH_VARARGS, ""},
 	{"exit", exit_func, METH_VARARGS, ""},
@@ -703,6 +717,9 @@ static PyMethodDef methods[] = {
 	/* Resumable render */
 	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},
 
+	/* Compute Device selection */
+	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
+
 	{NULL, NULL, 0, NULL},
 };
 
@@ -715,47 +732,6 @@ static struct PyModuleDef module = {
 	NULL, NULL, NULL, NULL
 };
 
-static CCLDeviceInfo *compute_device_list(DeviceType type)
-{
-	/* create device list if it's not already done */
-	if(type != device_type) {
-		ccl::vector<DeviceInfo>& devices = ccl::Device::available_devices();
-
-		device_type = type;
-		device_list.clear();
-
-		/* add devices */
-		int i = 0;
-
-		foreach(DeviceInfo& info, devices) {
-			if(info.type == type ||
-			   (info.type == DEVICE_MULTI && info.multi_devices[0].type == type))
-			{
-				CCLDeviceInfo cinfo;
-
-				strncpy(cinfo.identifier, info.id.c_str(), sizeof(cinfo.identifier));
-				cinfo.identifier[info.id.length()] = '\0';
-
-				strncpy(cinfo.name, info.description.c_str(), sizeof(cinfo.name));
-				cinfo.name[info.description.length()] = '\0';
-
-				cinfo.value = i++;
-
-				device_list.push_back(cinfo);
-			}
-		}
-
-		/* null terminate */
-		if(!device_list.empty()) {
-			CCLDeviceInfo cinfo = {"", "", 0};
-			device_list.push_back(cinfo);
-		}
-	}
-
-	return (device_list.empty())? NULL: &device_list[0];
-}
-
-
 CCL_NAMESPACE_END
 
 void *CCL_python_module_init()
@@ -794,24 +770,3 @@ void *CCL_python_module_init()
 
 	return (void*)mod;
 }
-
-CCLDeviceInfo *CCL_compute_device_list(int device_type)
-{
-	ccl::DeviceType type;
-	switch(device_type) {
-		case 0:
-			type = ccl::DEVICE_CUDA;
-			break;
-		case 1:
-			type = ccl::DEVICE_OPENCL;
-			break;
-		case 2:
-			type = ccl::DEVICE_NETWORK;
-			break;
-		default:
-			type = ccl::DEVICE_NONE;
-			break;
-	}
-	return ccl::compute_device_list(type);
-}
-
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index c250a54..2f30cbd 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -32,6 +32,7 @@
 #include "util_color.h"
 #include "util_foreach.h"
 #include "util_function.h"
+#include "util_hash.h"
 #include "util_logging.h"
 #include "util_progress.h"
 #include "util_time.h"
@@ -125,8 +126,8 @@ void BlenderSession::create_session()
 
 	/* setup callbacks for builtin image support */
 	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
-	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3);
-	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3);
+	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4);
+	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4);
 
 	/* create session */
 	session = new Session(session_params);
@@ -304,12 +305,16 @@ static PassType get_pass_type(BL::RenderPass& b_pass)
 #ifdef WITH_CYCLES_DEBUG
 		case BL::RenderPass::type_DEBUG:
 		{
-			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS)
-				return PASS_BVH_TRAVERSAL_STEPS;
-			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES)
-				return PASS_BVH_TRAVERSED_INSTANCES;
-			if(b_pass.debug_type() == BL::RenderPass::debug_type_RAY_BOUNCES)
-				return PASS_RAY_BOUNCES;
+			switch(b_pass.debug_type()) {
+				case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES:
+					return PASS_BVH_TRAVERSED_NODES;
+				case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES:
+					return PASS_BVH_TRAVERSED_INSTANCES;
+				case BL::RenderPass::debug_type_BVH_INTERSECTIONS:
+					return PASS_BVH_INTERSECTIONS;
+				case BL::RenderPass::debug_type_RAY_BOUNCES:
+					return PASS_RAY_BOUNCES;
+			}
 			break;
 		}
 #endif
@@ -498,7 +503,8 @@ void BlenderSession::render()
 		scene->film->tag_update(scene);
 		scene->integrator->tag_update(scene);
 
-		for(b_rr.views.begin(b_view_iter); b_view_iter != b_rr.views.end(); ++b_view_iter) {
+		int view_index = 0;
+		for(b_rr.views.begin(b_view_iter); b_view_iter != b_rr.views.end(); ++b_view_iter, ++view_index) {
 			b_rview_name = b_view_iter->name();
 
 			/* set the current view */
@@ -514,6 +520,12 @@ void BlenderSession::render()
 			                &python_thread_state,
 			                b_rlay_name.c_str());
 
+			/* Make sure all views have different noise patterns. - hardcoded value just to make it random */
+			if(view_index != 0) {
+				scene->integrator->seed += hash_int_2d(scene->integrator->seed, hash_int(view_index * 0xdeadbeef));
+				scene->integrator->tag_update(scene);
+			}
+
 			/* Update number of samples per layer. */
 			int samples = sync->get_layer_samples();
 			bool bound_samples = sync->get_layer_bound_samples();
@@ -572,7 +584,7 @@ static void populate_bake_data(BakeData *data, const
 	BL::BakePixel bp = pixel_array;
 
 	int i;
-	for(i=0; i < num_pixels; i++) {
+	for(i = 0; i < num_pixels; i++) {
 		if(bp.object_id() == object_id) {
 			data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
 		} else {
@@ -922,38 +934,13 @@ void BlenderSession::get_status(string& status, string& substatus)
 
 void BlenderSession::get_progress(float& progress, double& total_time, double& render_time)
 {
-	double tile_time;
-	int tile, sample, samples_per_tile;
-	int tile_total = session->tile_manager.state.num_tiles;
-	int samples = session->tile_manager.state.sample + 1;
-	int total_samples = session->tile_manager.get_num_effective_samples();
-
-	session->progress.get_tile(tile, total_time, render_time, tile_time);
-
-	sample = session->progress.get_sample();
-	samples_per_tile = session->tile_manager.get_num_effective_samples();
-
-	if(background && samples_per_tile && tile_total)
-		progress = ((float)sample / (float)(tile_total * samples_per_tile));
-	else if(!background && samples > 0 && total_samples != INT_MAX)
-		progress = ((float)samples) / total_samples;
-	else
-		progress = 0.0;
+	session->progress.get_time(total_time, render_time);
+	progress = session->progress.get_progress();
 }
 
 void BlenderSession::update_bake_progress()
 {
-	float progress;
-	int sample, samples_per_task, parts_total;
-
-	sample = session->progress.get_sample();
-	samples_per_task = scene->bake_manager->num_samples;
-	parts_total = scene->bake_manager->num_parts;
-
-	if(samples_per_task)
-		progress = ((float)sample / (float)(parts_total * samples_per_task));
-	else
-		progress = 0.0;
+	float progress = session->progress.get_progress();
 
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1072,7 +1059,13 @@ int BlenderSession::builtin_image_frame(const string &builtin_name)
 	return atoi(builtin_name.substr(last + 1, builtin_name.size() - last - 1).c_str());
 }
 
-void BlenderSession::builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels)
+void BlenderSession::builtin_image_info(const string &builtin_name,
+                                        void *builtin_data,
+                                        bool &is_float,
+                                        int &width,
+                                        int &height,
+                                        int &depth,
+                                        int &channels)
 {
 	/* empty image */
 	is_float = false;
@@ -1150,60 +1143,67 @@ void BlenderSession::builtin_image_info(const string &builtin_name, void *builti
 	}
 }
 
-bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels)
+bool BlenderSession::builtin_image_pixels(const string &builtin_name,
+                                          void *builtin_data,
+                                          unsigned char *pixels,
+                                          const size_t pixels_size)
 {
-	if(!builtin_data)
+	if(!builtin_data) {
 		return false;
+	}
 
-	int frame = builtin_image_frame(builtin_name);
+	const int frame = builtin_image_frame(builtin_name);
 
 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
 	BL::Image b_image(ptr);
 
-	int width = b_image.size()[0];
-	int height = b_image.size()[1];
-	int channels = b_image.channels();
+	const int width = b_image.size()[0];
+	const int height = b_image.size()[1];
+	const int channels = b_image.channels();
 
-	unsigned char *image_pixels;
-	image_pixels = image_get_pixels_for_frame(b_image, frame);
-	size_t num_pixels = ((size_t)width) * height;
+	unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame);
+	const size_t num_pixels = ((size_t)width) * height;
 
-	if(image_pixels) {
-		memcpy(pixels, image_pixels, num_pixels * channels * sizeof(unsigned char));
+	if(image_pixels && num_pixels * channels == pixels_size) {
+		memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
 		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
-			memset(pixels, 0, num_pixels * sizeof(unsigned char));
+			memset(pixels, 0, pixels_size * sizeof(unsigned char));
 		}
 		else {
+			const size_t num_pixels_safe = pixels_size / channels;
 			unsigned char *cp = pixels;
-			for(size_t i = 0; i < num_pixels; i++, cp += channels) {
+			for(size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
 				cp[0] = 255;
 				cp[1] = 0;
 				cp[2] = 255;
-				if(channels == 4)
+				if(channels == 4) {
 					cp[3] = 255;
+				}
 			}
 		}
 	}
-
-	/* premultiply, byte images are always straight for blender */
+	/* Premultiply, byte images are always straight for Blender. */
 	unsigned char *cp = pixels;
 	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 		cp[0] = (cp[0] * cp[3]) >> 8;
 		cp[1] = (cp[1] * cp[3]) >> 8;
 		cp[2] = (cp[2] * cp[3]) >> 8;
 	}
-
 	return true;
 }
 
-bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels)
+bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
+                                                void *builtin_data,
+                                                float *pixels,
+                                                const size_t pixels_size)
 {
-	if(!builtin_data)
+	if(!builtin_data) {
 		return false;
+	}
 
 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
@@ -1214,16 +1214,16 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 		BL::Image b_image(b_id);
 		int frame = builtin_image_frame(builtin_name);
 
-		int width = b_image.size()[0];
-		int height = b_image.size()[1];
-		int channels = b_image.channels();
+		const int width = b_image.size()[0];
+		const int height = b_image.size()[1];
+		const int channels = b_image.channels();
 
 		float *image_pixels;
 		image_pixels = image_get_float_pixels_for_frame(b_image, frame);
-		size_t num_pixels = ((size_t)width) * height;
+		const size_t num_pixels = ((size_t)width) * height;
 
-		if(image_pixels) {
-			memcpy(pixels, image_pixels, num_pixels * channels * sizeof(float));
+		if(image_pixels && num_pixels * channels == pixels_size) {
+			memcpy(pixels, image_pixels, pixels_size * sizeof(float));
 			MEM_freeN(image_pixels);
 		}
 		else {
@@ -1231,13 +1231,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 				memset(pixels, 0, num_pixels * sizeof(float));
 			}
 			else {
+				const size_t num_pixels_safe = pixels_size / channels;
 				float *fp = pixels;
-				for(int i = 0; i < num_pixels; i++, fp += channels) {
+				for(int i = 0; i < num_pixels_safe; i++, fp += channels) {
 					fp[0] = 1.0f;
 					fp[1] = 0.0f;
 					fp[2] = 1.0f;
-					if(channels == 4)
+					if(channels == 4) {
 						fp[3] = 1.0f;
+					}
 				}
 			}
 		}
@@ -1249,8 +1251,9 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 		BL::Object b_ob(b_id);
 		BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);
 
-		if(!b_domain)
+		if(!b_domain) {
 			return false;
+		}
 
 		int3 resolution = get_int3(b_domain.domain_resolution());
 		int length, amplify = (b_domain.use_high_resolution())? b_domain.amplify() + 1: 1;
@@ -1262,10 +1265,10 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void
 			amplify = 1;
 		}
 
-		int width = resolution.x * amplify;
-		int height = resolution.y * amplify;
-		int depth = resolution.z * amplify;
-		size_t num_pixels = ((size_t)width) * height * depth;
+		const int width = resolution.x * amplify;
+		const int height = resolution.y * amplify;
+		const int depth = resolution.z * amplify;
+		const size_t num_pixels = ((size_t)width) * height * depth;
 
 		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
 			SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
@@ -1349,6 +1352,9 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 	VLOG(1) << "Samples range start is " << range_start_sample << ", "
 	        << "number of samples to render is " << range_num_samples;
 
+	scene->integrator->start_sample = range_start_sample;
+	scene->integrator->tag_update(scene);
+
 	session->tile_manager.range_start_sample = range_start_sample;
 	session->tile_manager.range_num_samples = range_num_samples;
 }
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 66a6945..82fe218 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -145,9 +145,21 @@ protected:
 	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
 
 	int builtin_image_frame(const string &builtin_name);
-	void builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels);
-	bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels);
-	bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels);
+	void builtin_image_info(const string &builtin_name,
+	                        void *builtin_data,
+	                        bool &is_float,
+	                        int &width,
+	                        int &height,
+	                        int &depth,
+	                        int &channels);
+	bool builtin_image_pixels(const string &builtin_name,
+	                          void *builtin_data,
+	                          unsigned char *pixels,
+	                          const size_t pixels_size);
+	bool builtin_image_float_pixels(const string &builtin_name,
+	                                void *builtin_data,
+	                                float *pixels,
+	                                const size_t pixels_size);
 
 	/* Update tile manager to reflect resumable render settings. */
 	void update_resumable_tile_manager(int num_samples);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 4ca202a..f8f2303 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -255,8 +255,17 @@ void BlenderSync::sync_integrator()
 	integrator->filter_glossy = get_float(cscene, "blur_glossy");
 
 	integrator->seed = get_int(cscene, "seed");
-	if(get_boolean(cscene, "use_animated_seed"))
-		integrator->seed = hash_int_2d(b_scene.frame_current(), get_int(cscene, "seed"));
+	if(get_boolean(cscene, "use_animated_seed")) {
+		integrator->seed = hash_int_2d(b_scene.frame_current(),
+		                               get_int(cscene, "seed"));
+		if(b_scene.frame_subframe() != 0.0f) {
+			/* TODO(sergey): Ideally should be some sort of hash_merge,
+			 * but this is good enough for now.
+			 */
+			integrator->seed += hash_int_2d((int)(b_scene.frame_subframe() * (float)INT_MAX),
+			                                get_int(cscene, "seed"));
+		}
+	}
 
 	integrator->sampling_pattern = (SamplingPattern)get_enum(
 	        cscene,
@@ -284,6 +293,7 @@ void BlenderSync::sync_integrator()
 
 	integrator->sample_all_lights_direct = get_boolean(cscene, "sample_all_lights_direct");
 	integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
+	integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");
 
 	int diffuse_samples = get_int(cscene, "diffuse_samples");
 	int glossy_samples = get_int(cscene, "glossy_samples");
@@ -488,12 +498,27 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 
 	params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
 	params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
+	params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps");
 
 	if(background && params.shadingsystem != SHADINGSYSTEM_OSL)
 		params.persistent_data = r.use_persistent_data();
 	else
 		params.persistent_data = false;
 
+	int texture_limit;
+	if(background) {
+		texture_limit = RNA_enum_get(&cscene, "texture_limit_render");
+	}
+	else {
+		texture_limit = RNA_enum_get(&cscene, "texture_limit");
+	}
+	if(texture_limit > 0 && b_scene.render().use_simplify()) {
+		params.texture_limit = 1 << (texture_limit + 6);
+	}
+	else {
+		params.texture_limit = 0;
+	}
+
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 	if(is_cpu) {
 		params.use_qbvh = DebugFlags().cpu.qbvh && system_cpu_support_sse2();
@@ -530,7 +555,12 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	vector<DeviceInfo>& devices = Device::available_devices();
 	
 	/* device default CPU */
-	params.device = devices[0];
+	foreach(DeviceInfo& device, devices) {
+		if(device.type == DEVICE_CPU) {
+			params.device = device;
+			break;
+		}
+	}
 
 	if(get_enum(cscene, "device") == 2) {
 		/* find network device */
@@ -539,17 +569,39 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 				params.device = info;
 	}
 	else if(get_enum(cscene, "device") == 1) {
-		/* find GPU device with given id */
-		PointerRNA systemptr = b_userpref.system().ptr;
-		PropertyRNA *deviceprop = RNA_struct_find_property(&systemptr, "compute_device");
-		int device_id = b_userpref.system().compute_device();
+		PointerRNA b_preferences;
 
-		const char *id;
+		BL::UserPreferences::addons_iterator b_addon_iter;
+		for(b_userpref.addons.begin(b_addon_iter); b_addon_iter != b_userpref.addons.end(); ++b_addon_iter) {
+			if(b_addon_iter->module() == "cycles") {
+				b_preferences = b_addon_iter->preferences().ptr;
+				break;
+			}
+		}
 
-		if(RNA_property_enum_identifier(NULL, &systemptr, deviceprop, device_id, &id)) {
-			foreach(DeviceInfo& info, devices)
-				if(info.id == id)
-					params.device = info;
+		int compute_device = get_enum(b_preferences, "compute_device_type");
+
+		if(compute_device != 0) {
+			vector<DeviceInfo> used_devices;
+			RNA_BEGIN(&b_preferences, device, "devices") {
+				if(get_enum(device, "type") == compute_device && get_boolean(device, "use")) {
+					string id = get_string(device, "id");
+					foreach(DeviceInfo& info, devices) {
+						if(info.id == id) {
+							used_devices.push_back(info);
+							break;
+						}
+					}
+				}
+			} RNA_END
+
+			if(used_devices.size() == 1) {
+				params.device = used_devices[0];
+			}
+			else if(used_devices.size() > 1) {
+				params.device = Device::get_multi_device(used_devices);
+			}
+			/* Else keep using the CPU device that was set before. */
 		}
 	}
 
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 9a01b4f..6984cbd 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -35,6 +35,7 @@
 CCL_NAMESPACE_BEGIN
 
 class Background;
+class BlenderObjectCulling;
 class Camera;
 class Film;
 class Light;
@@ -122,8 +123,7 @@ private:
 	                    uint layer_flag,
 	                    float motion_time,
 	                    bool hide_tris,
-	                    bool use_camera_cull,
-	                    float camera_cull_margin,
+	                    BlenderObjectCulling& culling,
 	                    bool *use_portal);
 	void sync_light(BL::Object& b_parent,
 	                int persistent_id[OBJECT_PERSISTENT_ID_SIZE],
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index f17a61f..b67834c 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -48,12 +48,12 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
                                       bool apply_modifiers,
                                       bool render,
                                       bool calc_undeformed,
-                                      bool subdivision)
+                                      Mesh::SubdivisionType subdivision_type)
 {
 	bool subsurf_mod_show_render;
 	bool subsurf_mod_show_viewport;
 
-	if(subdivision) {
+	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
 
 		subsurf_mod_show_render = subsurf_mod.show_render();
@@ -65,7 +65,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 
 	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);
 
-	if(subdivision) {
+	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
 
 		subsurf_mod.show_render(subsurf_mod_show_render);
@@ -74,9 +74,14 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 
 	if((bool)me) {
 		if(me.use_auto_smooth()) {
-			me.calc_normals_split();
+			if(subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK) {
+				me.calc_normals_split();
+			}
+			else {
+				me.split_faces();
+			}
 		}
-		if(!subdivision) {
+		if(subdivision_type == Mesh::SUBDIVISION_NONE) {
 			me.calc_tessface(true);
 		}
 	}
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 1570520..874a424 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -597,7 +597,7 @@ void RegularBVH::pack_nodes(const BVHNode *root)
 		else {
 			/* innner node */
 			int idx[2];
-			for (int i = 0; i < 2; ++i) {
+			for(int i = 0; i < 2; ++i) {
 				if(e.node->get_child(i)->is_leaf()) {
 					idx[i] = nextLeafNodeIdx++;
 				}
@@ -835,13 +835,39 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
                               const BVHStackEntry *en,
                               int num)
 {
+	BoundBox bounds[4];
+	int child[4];
+	for(int i = 0; i < num; ++i) {
+		bounds[i] = en[i].node->m_bounds;
+		child[i] = en[i].encodeIdx();
+	}
+	pack_aligned_node(e.idx,
+	                  bounds,
+	                  child,
+	                  e.node->m_visibility,
+	                  e.node->m_time_from,
+	                  e.node->m_time_to,
+	                  num);
+}
+
+void QBVH::pack_aligned_node(int idx,
+                             const BoundBox *bounds,
+                             const int *child,
+                             const uint visibility,
+                             const float time_from,
+                             const float time_to,
+                             const int num)
+{
 	float4 data[BVH_QNODE_SIZE];
 	memset(data, 0, sizeof(data));
 
-	data[0].x = __uint_as_float(e.node->m_visibility & ~PATH_RAY_NODE_UNALIGNED);
+	data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
+	data[0].y = time_from;
+	data[0].z = time_to;
+
 	for(int i = 0; i < num; i++) {
-		float3 bb_min = en[i].node->m_bounds.min;
-		float3 bb_max = en[i].node->m_bounds.max;
+		float3 bb_min = bounds[i].min;
+		float3 bb_max = bounds[i].max;
 
 		data[1][i] = bb_min.x;
 		data[2][i] = bb_max.x;
@@ -850,7 +876,7 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 		data[5][i] = bb_min.z;
 		data[6][i] = bb_max.z;
 
-		data[7][i] = __int_as_float(en[i].encodeIdx());
+		data[7][i] = __int_as_float(child[i]);
 	}
 
 	for(int i = num; i < 4; i++) {
@@ -869,22 +895,51 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 		data[7][i] = __int_as_float(0);
 	}
 
-	memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_QNODE_SIZE);
+	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_QNODE_SIZE);
 }
 
 void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
                                 const BVHStackEntry *en,
                                 int num)
 {
+	Transform aligned_space[4];
+	BoundBox bounds[4];
+	int child[4];
+	for(int i = 0; i < num; ++i) {
+		aligned_space[i] = en[i].node->get_aligned_space();
+		bounds[i] = en[i].node->m_bounds;
+		child[i] = en[i].encodeIdx();
+	}
+	pack_unaligned_node(e.idx,
+	                    aligned_space,
+	                    bounds,
+	                    child,
+	                    e.node->m_visibility,
+	                    e.node->m_time_from,
+	                    e.node->m_time_to,
+	                    num);
+}
+
+void QBVH::pack_unaligned_node(int idx,
+                               const Transform *aligned_space,
+                               const BoundBox *bounds,
+                               const int *child,
+                               const uint visibility,
+                               const float time_from,
+                               const float time_to,
+                               const int num)
+{
 	float4 data[BVH_UNALIGNED_QNODE_SIZE];
 	memset(data, 0, sizeof(data));
 
-	data[0].x = __uint_as_float(e.node->m_visibility | PATH_RAY_NODE_UNALIGNED);
+	data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
+	data[0].y = time_from;
+	data[0].z = time_to;
 
 	for(int i = 0; i < num; i++) {
 		Transform space = BVHUnaligned::compute_node_transform(
-		        en[i].node->m_bounds,
-		        en[i].node->get_aligned_space());
+		        bounds[i],
+		        aligned_space[i]);
 
 		data[1][i] = space.x.x;
 		data[2][i] = space.x.y;
@@ -902,7 +957,7 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 		data[11][i] = space.y.w;
 		data[12][i] = space.z.w;
 
-		data[13][i] = __int_as_float(en[i].encodeIdx());
+		data[13][i] = __int_as_float(child[i]);
 	}
 
 	for(int i = num; i < 4; i++) {
@@ -929,7 +984,7 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 		data[13][i] = __int_as_float(0);
 	}
 
-	memcpy(&pack.nodes[e.idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
+	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
 }
 
 /* Quad SIMD Nodes */
@@ -1155,61 +1210,28 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 			}
 		}
 
-		/* TODO(sergey): To be de-duplicated with pack_inner(),
-		 * but for that need some sort of pack_node(). which operates with
-		 * direct data, not stack element.
-		 */
 		if(is_unaligned) {
-			Transform aligned_space = transform_identity();
-			float4 inner_data[BVH_UNALIGNED_QNODE_SIZE];
-			inner_data[0] = make_float4(
-			        __int_as_float(visibility | PATH_RAY_NODE_UNALIGNED),
-			        0.0f,
-			        0.0f,
-			        0.0f);
-			for(int i = 0; i < 4; ++i) {
-				Transform space = BVHUnaligned::compute_node_transform(
-				        child_bbox[i],
-				        aligned_space);
-				inner_data[1][i] = space.x.x;
-				inner_data[2][i] = space.x.y;
-				inner_data[3][i] = space.x.z;
-
-				inner_data[4][i] = space.y.x;
-				inner_data[5][i] = space.y.y;
-				inner_data[6][i] = space.y.z;
-
-				inner_data[7][i] = space.z.x;
-				inner_data[8][i] = space.z.y;
-				inner_data[9][i] = space.z.z;
-
-				inner_data[10][i] = space.x.w;
-				inner_data[11][i] = space.y.w;
-				inner_data[12][i] = space.z.w;
-
-				inner_data[13][i] = __int_as_float(c[i]);
-			}
-			memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_UNALIGNED_QNODE_SIZE);
+			Transform aligned_space[4] = {transform_identity(),
+			                              transform_identity(),
+			                              transform_identity(),
+			                              transform_identity()};
+			pack_unaligned_node(idx,
+			                    aligned_space,
+			                    child_bbox,
+			                    &c[0],
+			                    visibility,
+			                    0.0f,
+			                    1.0f,
+			                    4);
 		}
 		else {
-			float4 inner_data[BVH_QNODE_SIZE];
-			inner_data[0] = make_float4(
-			        __int_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED),
-			        0.0f,
-			        0.0f,
-			        0.0f);
-			for(int i = 0; i < 4; ++i) {
-				float3 bb_min = child_bbox[i].min;
-				float3 bb_max = child_bbox[i].max;
-				inner_data[1][i] = bb_min.x;
-				inner_data[2][i] = bb_max.x;
-				inner_data[3][i] = bb_min.y;
-				inner_data[4][i] = bb_max.y;
-				inner_data[5][i] = bb_min.z;
-				inner_data[6][i] = bb_max.z;
-				inner_data[7][i] = __int_as_float(c[i]);
-			}
-			memcpy(&pack.nodes[idx], inner_data, sizeof(float4)*BVH_QNODE_SIZE);
+			pack_aligned_node(idx,
+			                  child_bbox,
+			                  &c[0],
+			                  visibility,
+			                  0.0f,
+			                  1.0f,
+			                  4);
 		}
 	}
 }
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 1675207..35f4d30 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -171,9 +171,25 @@ protected:
 	void pack_aligned_inner(const BVHStackEntry& e,
 	                        const BVHStackEntry *en,
 	                        int num);
+	void pack_aligned_node(int idx,
+	                       const BoundBox *bounds,
+	                       const int *child,
+	                       const uint visibility,
+	                       const float time_from,
+	                       const float time_to,
+	                       const int num);
+
 	void pack_unaligned_inner(const BVHStackEntry& e,
 	                          const BVHStackEntry *en,
 	                          int num);
+	void pack_unaligned_node(int idx,
+	                         const Transform *aligned_space,
+	                         const BoundBox *bounds,
+	                         const int *child,
+	                         const uint visibility,
+	                         const float time_from,
+	                         const float time_to,
+	                         const int num);
 
 	/* refit */
 	void refit_nodes();
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 14f66ac..a2f8b33 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -26,6 +26,7 @@
 #include "scene.h"
 #include "curves.h"
 
+#include "util_algorithm.h"
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_logging.h"
@@ -112,81 +113,237 @@ BVHBuild::~BVHBuild()
 
 /* Adding References */
 
-void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
+void BVHBuild::add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
 {
-	if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
-		Attribute *attr_mP = NULL;
-
-		if(mesh->has_motion_blur())
-			attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-		size_t num_triangles = mesh->num_triangles();
-		for(uint j = 0; j < num_triangles; j++) {
-			Mesh::Triangle t = mesh->get_triangle(j);
+	const Attribute *attr_mP = NULL;
+	if(mesh->has_motion_blur()) {
+		attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+	}
+	const size_t num_triangles = mesh->num_triangles();
+	for(uint j = 0; j < num_triangles; j++) {
+		Mesh::Triangle t = mesh->get_triangle(j);
+		const float3 *verts = &mesh->verts[0];
+		if(attr_mP == NULL) {
 			BoundBox bounds = BoundBox::empty;
-			PrimitiveType type = PRIMITIVE_TRIANGLE;
-
-			t.bounds_grow(&mesh->verts[0], bounds);
-
-			/* motion triangles */
-			if(attr_mP) {
-				size_t mesh_size = mesh->verts.size();
-				size_t steps = mesh->motion_steps - 1;
-				float3 *vert_steps = attr_mP->data_float3();
-
-				for(size_t i = 0; i < steps; i++)
-					t.bounds_grow(vert_steps + i*mesh_size, bounds);
-
-				type = PRIMITIVE_MOTION_TRIANGLE;
+			t.bounds_grow(verts, bounds);
+			if(bounds.valid()) {
+				references.push_back(BVHReference(bounds,
+				                                  j,
+				                                  i,
+				                                  PRIMITIVE_TRIANGLE));
+				root.grow(bounds);
+				center.grow(bounds.center2());
+			}
+		}
+		else if(params.num_motion_triangle_steps == 0 || params.use_spatial_split) {
+			/* Motion triangles, simple case: single node for the whole
+			 * primitive. Lowest memory footprint and faster BVH build but
+			 * least optimal ray-tracing.
+			 */
+			/* TODO(sergey): Support motion steps for spatially split BVH. */
+			const size_t num_verts = mesh->verts.size();
+			const size_t num_steps = mesh->motion_steps;
+			const float3 *vert_steps = attr_mP->data_float3();
+			BoundBox bounds = BoundBox::empty;
+			t.bounds_grow(verts, bounds);
+			for(size_t step = 0; step < num_steps - 1; step++) {
+				t.bounds_grow(vert_steps + step*num_verts, bounds);
 			}
-
 			if(bounds.valid()) {
-				references.push_back(BVHReference(bounds, j, i, type));
+				references.push_back(
+				        BVHReference(bounds,
+				                     j,
+				                     i,
+				                     PRIMITIVE_MOTION_TRIANGLE));
 				root.grow(bounds);
 				center.grow(bounds.center2());
 			}
 		}
+		else {
+			/* Motion triangles, trace optimized case:  we split triangle
+			 * primitives into separate nodes for each of the time steps.
+			 * This way we minimize overlap of neighbor triangle primitives.
+			 */
+			const int num_bvh_steps = params.num_motion_triangle_steps * 2 + 1;
+			const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
+			const size_t num_verts = mesh->verts.size();
+			const size_t num_steps = mesh->motion_steps;
+			const float3 *vert_steps = attr_mP->data_float3();
+			/* Calculate bounding box of the previous time step.
+			 * Will be reused later to avoid duplicated work on
+			 * calculating BVH time step boundbox.
+			 */
+			float3 prev_verts[3];
+			t.motion_verts(verts,
+			               vert_steps,
+			               num_verts,
+			               num_steps,
+			               0.0f,
+			               prev_verts);
+			BoundBox prev_bounds = BoundBox::empty;
+			prev_bounds.grow(prev_verts[0]);
+			prev_bounds.grow(prev_verts[1]);
+			prev_bounds.grow(prev_verts[2]);
+			/* Create all primitive time steps. */
+			for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
+				const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
+				float3 curr_verts[3];
+				t.motion_verts(verts,
+				               vert_steps,
+				               num_verts,
+				               num_steps,
+				               curr_time,
+				               curr_verts);
+				BoundBox curr_bounds = BoundBox::empty;
+				curr_bounds.grow(curr_verts[0]);
+				curr_bounds.grow(curr_verts[1]);
+				curr_bounds.grow(curr_verts[2]);
+				BoundBox bounds = prev_bounds;
+				bounds.grow(curr_bounds);
+				if(bounds.valid()) {
+					const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
+					references.push_back(
+					        BVHReference(bounds,
+					                     j,
+					                     i,
+					                     PRIMITIVE_MOTION_TRIANGLE,
+					                     prev_time,
+					                     curr_time));
+					root.grow(bounds);
+					center.grow(bounds.center2());
+				}
+				/* Current time boundbox becomes previous one for the
+				 * next time step.
+				 */
+				prev_bounds = curr_bounds;
+			}
+		}
 	}
+}
 
-	if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-		Attribute *curve_attr_mP = NULL;
-
-		if(mesh->has_motion_blur())
-			curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-
-		size_t num_curves = mesh->num_curves();
-		for(uint j = 0; j < num_curves; j++) {
-			Mesh::Curve curve = mesh->get_curve(j);
-			PrimitiveType type = PRIMITIVE_CURVE;
-
-			for(int k = 0; k < curve.num_keys - 1; k++) {
+void BVHBuild::add_reference_curves(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
+{
+	const Attribute *curve_attr_mP = NULL;
+	if(mesh->has_motion_blur()) {
+		curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+	}
+	const size_t num_curves = mesh->num_curves();
+	for(uint j = 0; j < num_curves; j++) {
+		const Mesh::Curve curve = mesh->get_curve(j);
+		const float *curve_radius = &mesh->curve_radius[0];
+		for(int k = 0; k < curve.num_keys - 1; k++) {
+			if(curve_attr_mP == NULL) {
+				/* Really simple logic for static hair. */
 				BoundBox bounds = BoundBox::empty;
-				curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds);
-
-				/* motion curve */
-				if(curve_attr_mP) {
-					size_t mesh_size = mesh->curve_keys.size();
-					size_t steps = mesh->motion_steps - 1;
-					float3 *key_steps = curve_attr_mP->data_float3();
-
-					for(size_t i = 0; i < steps; i++)
-						curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds);
-
-					type = PRIMITIVE_MOTION_CURVE;
-				}
-
+				curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
 				if(bounds.valid()) {
-					int packed_type = PRIMITIVE_PACK_SEGMENT(type, k);
-
+					int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
 					references.push_back(BVHReference(bounds, j, i, packed_type));
 					root.grow(bounds);
 					center.grow(bounds.center2());
 				}
 			}
+			else if(params.num_motion_curve_steps == 0 || params.use_spatial_split) {
+				/* Simple case of motion curves: single node for the whole
+				 * shutter time. Lowest memory usage but less optimal
+				 * rendering.
+				 */
+				/* TODO(sergey): Support motion steps for spatially split BVH. */
+				BoundBox bounds = BoundBox::empty;
+				curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
+				const size_t num_keys = mesh->curve_keys.size();
+				const size_t num_steps = mesh->motion_steps;
+				const float3 *key_steps = curve_attr_mP->data_float3();
+				for(size_t step = 0; step < num_steps - 1; step++) {
+					curve.bounds_grow(k,
+					                  key_steps + step*num_keys,
+					                  curve_radius,
+					                  bounds);
+				}
+				if(bounds.valid()) {
+					int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+					references.push_back(BVHReference(bounds,
+					                                  j,
+					                                  i,
+					                                  packed_type));
+					root.grow(bounds);
+					center.grow(bounds.center2());
+				}
+			}
+			else {
+				/* Motion curves, trace optimized case:  we split curve keys
+				 * primitives into separate nodes for each of the time steps.
+				 * This way we minimize overlap of neighbor curve primitives.
+				 */
+				const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
+				const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
+				const size_t num_steps = mesh->motion_steps;
+				const float3 *curve_keys = &mesh->curve_keys[0];
+				const float3 *key_steps = curve_attr_mP->data_float3();
+				const size_t num_keys = mesh->curve_keys.size();
+				/* Calculate bounding box of the previous time step.
+				 * Will be reused later to avoid duplicated work on
+				 * calculating BVH time step boundbox.
+				 */
+				float4 prev_keys[4];
+				curve.cardinal_motion_keys(curve_keys,
+				                           curve_radius,
+				                           key_steps,
+				                           num_keys,
+				                           num_steps,
+				                           0.0f,
+				                           k - 1, k, k + 1, k + 2,
+				                           prev_keys);
+				BoundBox prev_bounds = BoundBox::empty;
+				curve.bounds_grow(prev_keys, prev_bounds);
+				/* Create all primitive time steps. */
+				for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
+					const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
+					float4 curr_keys[4];
+					curve.cardinal_motion_keys(curve_keys,
+					                           curve_radius,
+					                           key_steps,
+					                           num_keys,
+					                           num_steps,
+					                           curr_time,
+					                           k - 1, k, k + 1, k + 2,
+					                           curr_keys);
+					BoundBox curr_bounds = BoundBox::empty;
+					curve.bounds_grow(curr_keys, curr_bounds);
+					BoundBox bounds = prev_bounds;
+					bounds.grow(curr_bounds);
+					if(bounds.valid()) {
+						const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
+						int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
+						references.push_back(BVHReference(bounds,
+						                                  j,
+						                                  i,
+						                                  packed_type,
+						                                  prev_time,
+						                                  curr_time));
+						root.grow(bounds);
+						center.grow(bounds.center2());
+					}
+					/* Current time boundbox becomes previous one for the
+					 * next time step.
+					 */
+					prev_bounds = curr_bounds;
+				}
+			}
 		}
 	}
 }
 
+void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
+{
+	if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
+		add_reference_triangles(root, center, mesh, i);
+	}
+	if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
+		add_reference_curves(root, center, mesh, i);
+	}
+}
+
 void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i)
 {
 	references.push_back(BVHReference(ob->bounds, -1, i, 0));
@@ -200,7 +357,7 @@ static size_t count_curve_segments(Mesh *mesh)
 
 	for(size_t i = 0; i < num_curves; i++)
 		num += mesh->get_curve(i).num_keys - 1;
-	
+
 	return num;
 }
 
@@ -344,6 +501,7 @@ BVHNode* BVHBuild::run()
 		else {
 			/*rotate(rootnode, 4, 5);*/
 			rootnode->update_visibility();
+			rootnode->update_time();
 		}
 		if(rootnode != NULL) {
 			VLOG(1) << "BVH build statistics:\n"
@@ -371,7 +529,7 @@ void BVHBuild::progress_update()
 {
 	if(time_dt() - progress_start_time < 0.25)
 		return;
-	
+
 	double progress_start = (double)progress_count/(double)progress_total;
 	double duplicates = (double)(progress_total - progress_original_total)/(double)progress_total;
 
@@ -379,7 +537,7 @@ void BVHBuild::progress_update()
 	                           progress_start * 100.0, duplicates * 100.0);
 
 	progress.set_substatus(msg);
-	progress_start_time = time_dt(); 
+	progress_start_time = time_dt();
 }
 
 void BVHBuild::thread_build_node(InnerNode *inner,
@@ -435,6 +593,7 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange& range,
 		return false;
 
 	size_t num_triangles = 0;
+	size_t num_motion_triangles = 0;
 	size_t num_curves = 0;
 	size_t num_motion_curves = 0;
 
@@ -445,13 +604,16 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange& range,
 			num_curves++;
 		if(ref.prim_type() & PRIMITIVE_MOTION_CURVE)
 			num_motion_curves++;
-		else if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE)
+		else if(ref.prim_type() & PRIMITIVE_TRIANGLE)
 			num_triangles++;
+		else if(ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE)
+			num_motion_triangles++;
 	}
 
-	return (num_triangles < params.max_triangle_leaf_size) &&
-	       (num_curves < params.max_curve_leaf_size) &&
-	       (num_motion_curves < params.max_curve_leaf_size);
+	return (num_triangles <= params.max_triangle_leaf_size) &&
+	       (num_motion_triangles <= params.max_motion_triangle_leaf_size) &&
+	       (num_curves <= params.max_curve_leaf_size) &&
+	       (num_motion_curves <= params.max_motion_curve_leaf_size);
 }
 
 /* multithreaded binning builder */
@@ -689,18 +851,24 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		prim_object[start] = ref->prim_object();
 
 		uint visibility = objects[ref->prim_object()]->visibility;
-		return new LeafNode(ref->bounds(), visibility, start, start+1);
+		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
+		leaf_node->m_time_from = ref->time_from();
+		leaf_node->m_time_to = ref->time_to();
+		return leaf_node;
 	}
 	else {
 		int mid = num/2;
-		BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid); 
-		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid); 
+		BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid);
+		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
 
 		BoundBox bounds = BoundBox::empty;
 		bounds.grow(leaf0->m_bounds);
 		bounds.grow(leaf1->m_bounds);
 
-		return new InnerNode(bounds, leaf0, leaf1);
+		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
+		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
+		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
+		return inner_node;
 	}
 }
 
@@ -804,6 +972,16 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			                                   visibility[i],
 			                                   start_index,
 			                                   start_index + num);
+			if(true) {
+				float time_from = 1.0f, time_to = 0.0f;
+				for(int j = 0; j < num; ++j) {
+					const BVHReference &ref = p_ref[i][j];
+					time_from = min(time_from, ref.time_from());
+					time_to = max(time_to, ref.time_to());
+				}
+				leaf_node->m_time_from = time_from;
+				leaf_node->m_time_to = time_to;
+			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
 				leaf_node->m_bounds = BoundBox::empty;
@@ -918,7 +1096,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
-		/* Shpuld be doing more branches if more primitive types added. */
+		/* Should be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
 		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
 		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
@@ -951,7 +1129,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	/* nothing to rotate if we reached a leaf node. */
 	if(node->is_leaf() || max_depth < 0)
 		return;
-	
+
 	InnerNode *parent = (InnerNode*)node;
 
 	/* rotate all children first */
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index 6418034..ee3cde6 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -63,6 +63,8 @@ protected:
 	friend class BVHObjectBinning;
 
 	/* Adding references. */
+	void add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
+	void add_reference_curves(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
 	void add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
 	void add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i);
 	void add_references(BVHRange& root);
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index f5cd699..67580e1 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -176,6 +176,19 @@ uint BVHNode::update_visibility()
 	return m_visibility;
 }
 
+void BVHNode::update_time()
+{
+	if(!is_leaf()) {
+		InnerNode *inner = (InnerNode*)this;
+		BVHNode *child0 = inner->children[0];
+		BVHNode *child1 = inner->children[1];
+		child0->update_time();
+		child1->update_time();
+		m_time_from = min(child0->m_time_from, child1->m_time_from);
+		m_time_to =  max(child0->m_time_to, child1->m_time_to);
+	}
+}
+
 /* Inner Node */
 
 void InnerNode::print(int depth) const
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 2faa40a..090c426 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -47,7 +47,9 @@ class BVHNode
 {
 public:
 	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL)
+	            m_aligned_space(NULL),
+	            m_time_from(0.0f),
+	            m_time_to(1.0f)
 	{
 	}
 
@@ -91,12 +93,15 @@ public:
 	void deleteSubtree();
 
 	uint update_visibility();
+	void update_time();
 
 	bool m_is_unaligned;
 
 	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
 	// utilities and type defines in util_transform first.
 	Transform *m_aligned_space;
+
+	float m_time_from, m_time_to;
 };
 
 class InnerNode : public BVHNode
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2e698a8..65f9da1 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -43,7 +43,9 @@ public:
 	/* number of primitives in leaf */
 	int min_leaf_size;
 	int max_triangle_leaf_size;
+	int max_motion_triangle_leaf_size;
 	int max_curve_leaf_size;
+	int max_motion_curve_leaf_size;
 
 	/* object or mesh level bvh */
 	bool top_level;
@@ -59,6 +61,17 @@ public:
 	 */
 	bool use_unaligned_nodes;
 
+	/* Split time range into this number of steps and create a leaf node for
+	 * each of these time steps.
+	 *
+	 * Speeds up rendering of motion curve primitives at the cost of higher
+	 * memory usage.
+	 */
+	int num_motion_curve_steps;
+
+	/* Same as above, but for triangle primitives. */
+	int num_motion_triangle_steps;
+
 	/* fixed parameters */
 	enum {
 		MAX_DEPTH = 64,
@@ -80,13 +93,17 @@ public:
 
 		min_leaf_size = 1;
 		max_triangle_leaf_size = 8;
-		max_curve_leaf_size = 2;
+		max_motion_triangle_leaf_size = 8;
+		max_curve_leaf_size = 1;
+		max_motion_curve_leaf_size = 4;
 
 		top_level = false;
 		use_qbvh = false;
 		use_unaligned_nodes = false;
 
 		primitive_mask = PRIMITIVE_ALL;
+		num_motion_curve_steps = 0;
+		num_motion_triangle_steps = 0;
 	}
 
 	/* SAH costs */
@@ -113,8 +130,15 @@ class BVHReference
 public:
 	__forceinline BVHReference() {}
 
-	__forceinline BVHReference(const BoundBox& bounds_, int prim_index_, int prim_object_, int prim_type)
-	: rbounds(bounds_)
+	__forceinline BVHReference(const BoundBox& bounds_,
+	                           int prim_index_,
+	                           int prim_object_,
+	                           int prim_type,
+	                           float time_from = 0.0f,
+	                           float time_to = 1.0f)
+	        : rbounds(bounds_),
+	          time_from_(time_from),
+	          time_to_(time_to)
 	{
 		rbounds.min.w = __int_as_float(prim_index_);
 		rbounds.max.w = __int_as_float(prim_object_);
@@ -125,6 +149,9 @@ public:
 	__forceinline int prim_index() const { return __float_as_int(rbounds.min.w); }
 	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
 	__forceinline int prim_type() const { return type; }
+	__forceinline float time_from() const { return time_from_; }
+	__forceinline float time_to() const { return time_to_; }
+
 
 	BVHReference& operator=(const BVHReference &arg) {
 		if(&arg != this) {
@@ -133,9 +160,11 @@ public:
 		return *this;
 	}
 
+
 protected:
 	BoundBox rbounds;
 	uint type;
+	float time_from_, time_to_;
 };
 
 /* BVH Range
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 616dd94..403a054 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -44,6 +44,10 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
 	else()
 		message(STATUS "CUDA compiler not found, disabling WITH_CYCLES_CUDA_BINARIES")
 		set(WITH_CYCLES_CUDA_BINARIES OFF)
+		if(NOT WITH_CUDA_DYNLOAD)
+			message(STATUS "Additionally falling back to dynamic CUDA load")
+			set(WITH_CUDA_DYNLOAD ON)
+		endif()
 	endif()
 endif()
 
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index c34677e..966ff5e 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,6 +36,15 @@ set(SRC
 	device_task.cpp
 )
 
+set(SRC_OPENCL
+	opencl/opencl.h
+
+	opencl/opencl_base.cpp
+	opencl/opencl_mega.cpp
+	opencl/opencl_split.cpp
+	opencl/opencl_util.cpp
+)
+
 if(WITH_CYCLES_NETWORK)
 	list(APPEND SRC
 		device_network.cpp
@@ -67,4 +76,4 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-add_library(cycles_device ${SRC} ${SRC_HEADERS})
+add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 909ec7a..31c99f4 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -64,6 +64,8 @@ std::ostream& operator <<(std::ostream &os,
 	   << string_from_bool(requested_features.use_integrator_branched) << std::endl;
 	os << "Use Patch Evaluation: "
 	   << string_from_bool(requested_features.use_patch_evaluation) << std::endl;
+	os << "Use Transparent Shadows: "
+	   << string_from_bool(requested_features.use_transparent) << std::endl;
 	return os;
 }
 
@@ -258,33 +260,33 @@ Device *Device::create(DeviceInfo& info, Stats &stats, bool background)
 
 DeviceType Device::type_from_string(const char *name)
 {
-	if(strcmp(name, "cpu") == 0)
+	if(strcmp(name, "CPU") == 0)
 		return DEVICE_CPU;
-	else if(strcmp(name, "cuda") == 0)
+	else if(strcmp(name, "CUDA") == 0)
 		return DEVICE_CUDA;
-	else if(strcmp(name, "opencl") == 0)
+	else if(strcmp(name, "OPENCL") == 0)
 		return DEVICE_OPENCL;
-	else if(strcmp(name, "network") == 0)
+	else if(strcmp(name, "NETWORK") == 0)
 		return DEVICE_NETWORK;
-	else if(strcmp(name, "multi") == 0)
+	else if(strcmp(name, "MULTI") == 0)
 		return DEVICE_MULTI;
-	
+
 	return DEVICE_NONE;
 }
 
 string Device::string_from_type(DeviceType type)
 {
 	if(type == DEVICE_CPU)
-		return "cpu";
+		return "CPU";
 	else if(type == DEVICE_CUDA)
-		return "cuda";
+		return "CUDA";
 	else if(type == DEVICE_OPENCL)
-		return "opencl";
+		return "OPENCL";
 	else if(type == DEVICE_NETWORK)
-		return "network";
+		return "NETWORK";
 	else if(type == DEVICE_MULTI)
-		return "multi";
-	
+		return "MULTI";
+
 	return "";
 }
 
@@ -307,9 +309,6 @@ vector<DeviceType>& Device::available_types()
 #ifdef WITH_NETWORK
 		types.push_back(DEVICE_NETWORK);
 #endif
-#ifdef WITH_MULTI
-		types.push_back(DEVICE_MULTI);
-#endif
 
 		need_types_update = false;
 	}
@@ -331,10 +330,6 @@ vector<DeviceInfo>& Device::available_devices()
 			device_opencl_info(devices);
 #endif
 
-#ifdef WITH_MULTI
-		device_multi_info(devices);
-#endif
-
 		device_cpu_info(devices);
 
 #ifdef WITH_NETWORK
@@ -368,6 +363,29 @@ string Device::device_capabilities()
 	return capabilities;
 }
 
+DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
+{
+	assert(subdevices.size() > 1);
+
+	DeviceInfo info;
+	info.type = DEVICE_MULTI;
+	info.id = "MULTI";
+	info.description = "Multi Device";
+	info.multi_devices = subdevices;
+	info.num = 0;
+
+	info.has_bindless_textures = true;
+	info.pack_images = false;
+	foreach(DeviceInfo &device, subdevices) {
+		assert(device.type == info.multi_devices[0].type);
+
+		info.pack_images |= device.pack_images;
+		info.has_bindless_textures &= device.has_bindless_textures;
+	}
+
+	return info;
+}
+
 void Device::tag_update()
 {
 	need_types_update = true;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 77dc1fa..ccee25a 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -49,7 +49,7 @@ class DeviceInfo {
 public:
 	DeviceType type;
 	string description;
-	string id;
+	string id; /* used for user preferences, should stay fixed with changing hardware config */
 	int num;
 	bool display_device;
 	bool advanced_shading;
@@ -69,6 +69,12 @@ public:
 		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
+
+	bool operator==(const DeviceInfo &info) const {
+		/* Equality is decided by id alone; devices sharing an id must also agree on type, num and description. */
+		assert(id != info.id || (type == info.type && num == info.num && description == info.description));
+		return id == info.id;
+	}
 };
 
 class DeviceRequestedFeatures {
@@ -111,6 +117,9 @@ public:
 
 	/* Use OpenSubdiv patch evaluation */
 	bool use_patch_evaluation;
+	
+	/* Use Transparent shadows */
+	bool use_transparent;
 
 	DeviceRequestedFeatures()
 	{
@@ -127,6 +136,7 @@ public:
 		use_volume = false;
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
+		use_transparent = false;
 	}
 
 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -142,7 +152,8 @@ public:
 		         use_subsurface == requested_features.use_subsurface &&
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
-		         use_patch_evaluation == requested_features.use_patch_evaluation);
+		         use_patch_evaluation == requested_features.use_patch_evaluation &&
+		         use_transparent == requested_features.use_transparent);
 	}
 
 	/* Convert the requested features structure to a build options,
@@ -183,6 +194,9 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
+		if(!use_transparent) {
+			build_options += " -D__NO_TRANSPARENT__";
+		}
 		return build_options;
 	}
 };
@@ -214,6 +228,7 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual bool show_samples() const { return false; }
 
 	/* statistics */
 	Stats &stats;
@@ -282,6 +297,7 @@ public:
 	static vector<DeviceType>& available_types();
 	static vector<DeviceInfo>& available_devices();
 	static string device_capabilities();
+	static DeviceInfo get_multi_device(vector<DeviceInfo> subdevices);
 
 	/* Tag devices lists for update. */
 	static void tag_update();
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index aed86d8..c8e001e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -112,6 +112,11 @@ public:
 		task_pool.stop();
 	}
 
+	virtual bool show_samples() const
+	{
+		return (TaskScheduler::num_threads() == 1);
+	}
+
 	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		mem.device_pointer = mem.data_pointer;
@@ -275,7 +280,7 @@ public:
 
 				tile.sample = sample + 1;
 
-				task.update_progress(&tile);
+				task.update_progress(&tile, tile.w*tile.h);
 			}
 
 			task.release_tile(tile);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index a968a81..233f94b 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <climits>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -114,6 +115,12 @@ public:
 		return path_exists(cubins_path);
 	}
 
+	virtual bool show_samples() const
+	{
+		/* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+		return true;
+	}
+
 /*#ifdef NDEBUG
 #define cuda_abort()
 #else
@@ -213,7 +220,8 @@ public:
 			return;
 
 		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
 		cuda_pop_context();
@@ -233,7 +241,8 @@ public:
 	bool support_device(const DeviceRequestedFeatures& /*requested_features*/)
 	{
 		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 
 		/* We only support sm_20 and above */
 		if(major < 2) {
@@ -315,7 +324,8 @@ public:
 	{
 		/* Compute cubin name. */
 		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
@@ -343,7 +353,7 @@ public:
 		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
-		const string cubin = path_user_get(path_join("cache", cubin_file));
+		const string cubin = path_cache_get(path_join("kernels", cubin_file));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
 		if(path_exists(cubin)) {
 			VLOG(1) << "Using locally compiled kernel.";
@@ -1263,7 +1273,7 @@ public:
 
 					tile.sample = sample + 1;
 
-					task->update_progress(&tile);
+					task->update_progress(&tile, tile.w*tile.h);
 				}
 
 				task->release_tile(tile);
@@ -1394,8 +1404,8 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 			continue;
 
-		int major, minor;
-		cuDeviceComputeCapability(&major, &minor, num);
+		int major;
+		cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
 		if(major < 2) {
 			continue;
 		}
@@ -1404,13 +1414,22 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.type = DEVICE_CUDA;
 		info.description = string(name);
-		info.id = string_printf("CUDA_%d", num);
 		info.num = num;
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
 		info.pack_images = false;
 
+		int pci_location[3] = {0, 0, 0};
+		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
+		cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
+		cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
+		info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
+		                        name,
+		                        (unsigned int)pci_location[0],
+		                        (unsigned int)pci_location[1],
+		                        (unsigned int)pci_location[2]);
+
 		/* if device has a kernel timeout, assume it is used for display */
 		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
 			info.description += " (Display)";
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 47584ae..de48764 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -33,7 +33,6 @@ void device_cpu_info(vector<DeviceInfo>& devices);
 void device_opencl_info(vector<DeviceInfo>& devices);
 void device_cuda_info(vector<DeviceInfo>& devices);
 void device_network_info(vector<DeviceInfo>& devices);
-void device_multi_info(vector<DeviceInfo>& devices);
 
 string device_cpu_capabilities(void);
 string device_opencl_capabilities(void);
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index ef25735..31b8006 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -89,6 +89,14 @@ public:
 		return error_msg;
 	}
 
+	virtual bool show_samples() const
+	{
+		if(devices.size() != 1) {  /* several sub-devices, or none: front() must not be called on an empty container */
+			return false;
+		}
+		return devices.front().device->show_samples();
+	}
+
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		foreach(SubDevice& sub, devices)
@@ -350,120 +358,5 @@ Device *device_multi_create(DeviceInfo& info, Stats &stats, bool background)
 	return new MultiDevice(info, stats, background);
 }
 
-static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool with_display, bool with_advanced_shading, const char *id_fmt, int num)
-{
-	DeviceInfo info;
-
-	/* create map to find duplicate descriptions */
-	map<string, int> dupli_map;
-	map<string, int>::iterator dt;
-	int num_added = 0, num_display = 0;
-
-	info.advanced_shading = with_advanced_shading;
-	info.pack_images = false;
-	info.has_bindless_textures = true;
-
-	foreach(DeviceInfo& subinfo, devices) {
-		if(subinfo.type == type) {
-			if(subinfo.advanced_shading != info.advanced_shading)
-				continue;
-			if(subinfo.display_device) {
-				if(with_display)
-					num_display++;
-				else
-					continue;
-			}
-
-			string key = subinfo.description;
-
-			if(dupli_map.find(key) == dupli_map.end())
-				dupli_map[key] = 1;
-			else
-				dupli_map[key]++;
-
-			info.multi_devices.push_back(subinfo);
-			if(subinfo.display_device)
-				info.display_device = true;
-			info.pack_images = info.pack_images || subinfo.pack_images;
-			info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures;
-			num_added++;
-		}
-	}
-
-	if(num_added <= 1 || (with_display && num_display == 0))
-		return false;
-
-	/* generate string */
-	stringstream desc;
-	vector<string> last_tokens;
-	bool first = true;
-
-	for(dt = dupli_map.begin(); dt != dupli_map.end(); dt++) {
-		if(!first) desc << " + ";
-		first = false;
-
-		/* get name and count */
-		string name = dt->first;
-		int count = dt->second;
-
-		/* strip common prefixes */
-		vector<string> tokens;
-		string_split(tokens, dt->first);
-
-		if(tokens.size() > 1) {
-			int i;
-
-			for(i = 0; i < tokens.size() && i < last_tokens.size(); i++)
-				if(tokens[i] != last_tokens[i])
-					break;
-
-			name = "";
-			for(; i < tokens.size(); i++) {
-				name += tokens[i];
-				if(i != tokens.size() - 1)
-					name += " ";
-			}
-		}
-
-		last_tokens = tokens;
-
-		/* add */
-		if(count > 1)
-			desc << name << " (" << count << "x)";
-		else
-			desc << name;
-	}
-
-	/* add info */
-	info.type = DEVICE_MULTI;
-	info.description = desc.str();
-	info.id = string_printf(id_fmt, num);
-	info.display_device = with_display;
-	info.num = 0;
-
-	if(with_display)
-		devices.push_back(info);
-	else
-		devices.insert(devices.begin(), info);
-	
-	return true;
-}
-
-void device_multi_info(vector<DeviceInfo>& devices)
-{
-	int num = 0;
-
-	if(!device_multi_add(devices, DEVICE_CUDA, false, true, "CUDA_MULTI_%d", num++))
-		device_multi_add(devices, DEVICE_CUDA, false, false, "CUDA_MULTI_%d", num++);
-	if(!device_multi_add(devices, DEVICE_CUDA, true, true, "CUDA_MULTI_%d", num++))
-		device_multi_add(devices, DEVICE_CUDA, true, false, "CUDA_MULTI_%d", num++);
-
-	num = 0;
-	if(!device_multi_add(devices, DEVICE_OPENCL, false, true, "OPENCL_MULTI_%d", num++))
-		device_multi_add(devices, DEVICE_OPENCL, false, false, "OPENCL_MULTI_%d", num++);
-	if(!device_multi_add(devices, DEVICE_OPENCL, true, true, "OPENCL_MULTI_%d", num++))
-		device_multi_add(devices, DEVICE_OPENCL, true, false, "OPENCL_MULTI_%d", num++);
-}
-
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 3eb5ad2..53eef6c 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -51,6 +51,11 @@ public:
 
 	thread_mutex rpc_lock;
 
+	virtual bool show_samples() const
+	{
+		return false;
+	}
+
 	NetworkDevice(DeviceInfo& info, Stats &stats, const char *address)
 	: Device(info, stats, true), socket(io_service)
 	{
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index dce1d37..ba94c59 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,3275 +16,29 @@
 
 #ifdef WITH_OPENCL
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "opencl/opencl.h"
 
-#include "clew.h"
-
-#include "device.h"
-#include "device_intern.h"
-
-#include "buffers.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_math.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-/* Macro declarations used with split kernel */
-
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
-
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
-
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
-
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
-
-struct OpenCLPlatformDevice {
-	OpenCLPlatformDevice(cl_platform_id platform_id,
-	                     const string& platform_name,
-	                     cl_device_id device_id,
-	                     cl_device_type device_type,
-	                     const string& device_name)
-	  : platform_id(platform_id),
-	    platform_name(platform_name),
-	    device_id(device_id),
-	    device_type(device_type),
-	    device_name(device_name) {}
-	cl_platform_id platform_id;
-	string platform_name;
-	cl_device_id device_id;
-	cl_device_type device_type;
-	string device_name;
-};
-
-namespace {
-
-cl_device_type opencl_device_type()
-{
-	switch(DebugFlags().opencl.device_type)
-	{
-		case DebugFlags::OpenCL::DEVICE_NONE:
-			return 0;
-		case DebugFlags::OpenCL::DEVICE_ALL:
-			return CL_DEVICE_TYPE_ALL;
-		case DebugFlags::OpenCL::DEVICE_DEFAULT:
-			return CL_DEVICE_TYPE_DEFAULT;
-		case DebugFlags::OpenCL::DEVICE_CPU:
-			return CL_DEVICE_TYPE_CPU;
-		case DebugFlags::OpenCL::DEVICE_GPU:
-			return CL_DEVICE_TYPE_GPU;
-		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
-			return CL_DEVICE_TYPE_ACCELERATOR;
-		default:
-			return CL_DEVICE_TYPE_ALL;
-	}
-}
-
-inline bool opencl_kernel_use_debug()
-{
-	return DebugFlags().opencl.debug;
-}
-
-bool opencl_kernel_use_advanced_shading(const string& platform)
-{
-	/* keep this in sync with kernel_types.h! */
-	if(platform == "NVIDIA CUDA")
-		return true;
-	else if(platform == "Apple")
-		return true;
-	else if(platform == "AMD Accelerated Parallel Processing")
-		return true;
-	else if(platform == "Intel(R) OpenCL")
-		return true;
-	/* Make sure officially unsupported OpenCL platforms
-	 * does not set up to use advanced shading.
-	 */
-	return false;
-}
-
-bool opencl_kernel_use_split(const string& platform_name,
-                             const cl_device_type device_type)
-{
-	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) {
-		VLOG(1) << "Forcing split kernel to use.";
-		return true;
-	}
-	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) {
-		VLOG(1) << "Forcing mega kernel to use.";
-		return false;
-	}
-	/* TODO(sergey): Replace string lookups with more enum-like API,
-	 * similar to device/vendor checks blender's gpu.
-	 */
-	if(platform_name == "AMD Accelerated Parallel Processing" &&
-	   device_type == CL_DEVICE_TYPE_GPU)
-	{
-		return true;
-	}
-	return false;
-}
-
-bool opencl_device_supported(const string& platform_name,
-                             const cl_device_id device_id)
-{
-	cl_device_type device_type;
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_TYPE,
-	                sizeof(cl_device_type),
-	                &device_type,
-	                NULL);
-	if(platform_name == "AMD Accelerated Parallel Processing" &&
-	   device_type == CL_DEVICE_TYPE_GPU)
-	{
-		return true;
-	}
-	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
-		return true;
-	}
-	return false;
-}
-
-bool opencl_platform_version_check(cl_platform_id platform,
-                                   string *error = NULL)
-{
-	const int req_major = 1, req_minor = 1;
-	int major, minor;
-	char version[256];
-	clGetPlatformInfo(platform,
-	                  CL_PLATFORM_VERSION,
-	                  sizeof(version),
-	                  &version,
-	                  NULL);
-	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
-		if(error != NULL) {
-			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
-		}
-		return false;
-	}
-	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-		if(error != NULL) {
-			*error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
-		}
-		return false;
-	}
-	if(error != NULL) {
-		*error = "";
-	}
-	return true;
-}
-
-bool opencl_device_version_check(cl_device_id device,
-                                 string *error = NULL)
-{
-	const int req_major = 1, req_minor = 1;
-	int major, minor;
-	char version[256];
-	clGetDeviceInfo(device,
-	                CL_DEVICE_OPENCL_C_VERSION,
-	                sizeof(version),
-	                &version,
-	                NULL);
-	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
-		if(error != NULL) {
-			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
-		}
-		return false;
-	}
-	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
-		if(error != NULL) {
-			*error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
-		}
-		return false;
-	}
-	if(error != NULL) {
-		*error = "";
-	}
-	return true;
-}
-
-void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
-	const bool force_all_platforms =
-		(DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
-	const cl_device_type device_type = opencl_device_type();
-	static bool first_time = true;
-#define FIRST_VLOG(severity) if(first_time) VLOG(severity)
-
-	usable_devices->clear();
-
-	if(device_type == 0) {
-		FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
-		first_time = false;
-		return;
-	}
-
-	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
-	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;
-
-	/* Get devices. */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
-	   num_platforms == 0)
-	{
-		FIRST_VLOG(2) << "No OpenCL platforms were found.";
-		first_time = false;
-		return;
-	}
-	platform_ids.resize(num_platforms);
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
-		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
-		first_time = false;
-		return;
-	}
-	/* Devices are numbered consecutively across platforms. */
-	for(int platform = 0; platform < num_platforms; platform++) {
-		cl_platform_id platform_id = platform_ids[platform];
-		char pname[256];
-		if(clGetPlatformInfo(platform_id,
-		                     CL_PLATFORM_NAME,
-		                     sizeof(pname),
-		                     &pname,
-		                     NULL) != CL_SUCCESS)
-		{
-			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
-			continue;
-		}
-		string platform_name = pname;
-		FIRST_VLOG(2) << "Enumerating devices for platform "
-		              << platform_name << ".";
-		if(!opencl_platform_version_check(platform_id)) {
-			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << " due to too old compiler version.";
-			continue;
-		}
-		num_devices = 0;
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices) != CL_SUCCESS || num_devices == 0)
-		{
-			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch number of devices.";
-			continue;
-		}
-		device_ids.resize(num_devices);
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  num_devices,
-		                  &device_ids[0],
-		                  NULL) != CL_SUCCESS)
-		{
-			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch devices list.";
-			continue;
-		}
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char device_name[1024] = "\0";
-			if(clGetDeviceInfo(device_id,
-			                   CL_DEVICE_NAME,
-			                   sizeof(device_name),
-			                   &device_name,
-			                   NULL) != CL_SUCCESS)
-			{
-				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
-				continue;
-			}
-			if(!opencl_device_version_check(device_id)) {
-				FIRST_VLOG(2) << "Ignoring device " << device_name
-				              << " due to old compiler version.";
-				continue;
-			}
-			if(force_all_platforms ||
-			   opencl_device_supported(platform_name, device_id))
-			{
-				cl_device_type device_type;
-				if(clGetDeviceInfo(device_id,
-				                   CL_DEVICE_TYPE,
-				                   sizeof(cl_device_type),
-				                   &device_type,
-				                   NULL) != CL_SUCCESS)
-				{
-					FIRST_VLOG(2) << "Ignoring device " << device_name
-					              << ", failed to fetch device type.";
-					continue;
-				}
-				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
-				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
-				                                               platform_name,
-				                                               device_id,
-				                                               device_type,
-				                                               device_name));
-			}
-			else {
-				FIRST_VLOG(2) << "Ignoring device " << device_name
-				              << ", not officially supported yet.";
-			}
-		}
-	}
-	first_time = false;
-}
-
-}  /* namespace */
-
-/* Thread safe cache for contexts and programs.
- *
- * TODO(sergey): Make it more generous, so it can contain any type of program
- * without hardcoding possible program types in the slot.
- */
-class OpenCLCache
-{
-	struct Slot
-	{
-		thread_mutex *mutex;
-		cl_context context;
-		/* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
-		cl_program ocl_dev_base_program;
-		/* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
-		cl_program ocl_dev_megakernel_program;
-
-		Slot() : mutex(NULL),
-		         context(NULL),
-		         ocl_dev_base_program(NULL),
-		         ocl_dev_megakernel_program(NULL) {}
-
-		Slot(const Slot& rhs)
-		    : mutex(rhs.mutex),
-		      context(rhs.context),
-		      ocl_dev_base_program(rhs.ocl_dev_base_program),
-		      ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
-		{
-			/* copy can only happen in map insert, assert that */
-			assert(mutex == NULL);
-		}
-
-		~Slot()
-		{
-			delete mutex;
-			mutex = NULL;
-		}
-	};
-
-	/* key is combination of platform ID and device ID */
-	typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
-	/* map of Slot objects */
-	typedef map<PlatformDevicePair, Slot> CacheMap;
-	CacheMap cache;
-
-	thread_mutex cache_lock;
-
-	/* lazy instantiate */
-	static OpenCLCache &global_instance()
-	{
-		static OpenCLCache instance;
-		return instance;
-	}
-
-	OpenCLCache()
-	{
-	}
-
-	~OpenCLCache()
-	{
-		/* Intel OpenCL bug raises SIGABRT due to pure virtual call
-		 * so this is disabled. It's not necessary to free objects
-		 * at process exit anyway.
-		 * http://software.intel.com/en-us/forums/topic/370083#comments */
-
-		//flush();
-	}
-
-	/* lookup something in the cache. If this returns NULL, slot_locker
-	 * will be holding a lock for the cache. slot_locker should refer to a
-	 * default constructed thread_scoped_lock */
-	template<typename T>
-	static T get_something(cl_platform_id platform,
-	                       cl_device_id device,
-	                       T Slot::*member,
-	                       thread_scoped_lock& slot_locker)
-	{
-		assert(platform != NULL);
-
-		OpenCLCache& self = global_instance();
-
-		thread_scoped_lock cache_lock(self.cache_lock);
-
-		pair<CacheMap::iterator,bool> ins = self.cache.insert(
-			CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
-		Slot &slot = ins.first->second;
-
-		/* create slot lock only while holding cache lock */
-		if(!slot.mutex)
-			slot.mutex = new thread_mutex;
-
-		/* need to unlock cache before locking slot, to allow store to complete */
-		cache_lock.unlock();
-
-		/* lock the slot */
-		slot_locker = thread_scoped_lock(*slot.mutex);
-
-		/* If the thing isn't cached */
-		if(slot.*member == NULL) {
-			/* return with the caller's lock holder holding the slot lock */
-			return NULL;
-		}
-
-		/* the item was already cached, release the slot lock */
-		slot_locker.unlock();
-
-		return slot.*member;
-	}
-
-	/* store something in the cache. you MUST have tried to get the item before storing to it */
-	template<typename T>
-	static void store_something(cl_platform_id platform,
-	                            cl_device_id device,
-	                            T thing,
-	                            T Slot::*member,
-	                            thread_scoped_lock& slot_locker)
-	{
-		assert(platform != NULL);
-		assert(device != NULL);
-		assert(thing != NULL);
-
-		OpenCLCache &self = global_instance();
-
-		thread_scoped_lock cache_lock(self.cache_lock);
-		CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
-		cache_lock.unlock();
-
-		Slot &slot = i->second;
-
-		/* sanity check */
-		assert(i != self.cache.end());
-		assert(slot.*member == NULL);
-
-		slot.*member = thing;
-
-		/* unlock the slot */
-		slot_locker.unlock();
-	}
-
-public:
-
-	enum ProgramName {
-		OCL_DEV_BASE_PROGRAM,
-		OCL_DEV_MEGAKERNEL_PROGRAM,
-	};
-
-	/* see get_something comment */
-	static cl_context get_context(cl_platform_id platform,
-	                              cl_device_id device,
-	                              thread_scoped_lock& slot_locker)
-	{
-		cl_context context = get_something<cl_context>(platform,
-		                                               device,
-		                                               &Slot::context,
-		                                               slot_locker);
-
-		if(!context)
-			return NULL;
-
-		/* caller is going to release it when done with it, so retain it */
-		cl_int ciErr = clRetainContext(context);
-		assert(ciErr == CL_SUCCESS);
-		(void)ciErr;
-
-		return context;
-	}
-
-	/* see get_something comment */
-	static cl_program get_program(cl_platform_id platform,
-	                              cl_device_id device,
-	                              ProgramName program_name,
-	                              thread_scoped_lock& slot_locker)
-	{
-		cl_program program = NULL;
-
-		switch(program_name) {
-			case OCL_DEV_BASE_PROGRAM:
-				/* Get program related to OpenCLDeviceBase */
-				program = get_something<cl_program>(platform,
-				                                    device,
-				                                    &Slot::ocl_dev_base_program,
-				                                    slot_locker);
-				break;
-			case OCL_DEV_MEGAKERNEL_PROGRAM:
-				/* Get program related to megakernel */
-				program = get_something<cl_program>(platform,
-				                                    device,
-				                                    &Slot::ocl_dev_megakernel_program,
-				                                    slot_locker);
-				break;
-		default:
-			assert(!"Invalid program name");
-		}
-
-		if(!program)
-			return NULL;
-
-		/* caller is going to release it when done with it, so retain it */
-		cl_int ciErr = clRetainProgram(program);
-		assert(ciErr == CL_SUCCESS);
-		(void)ciErr;
-
-		return program;
-	}
-
-	/* see store_something comment */
-	static void store_context(cl_platform_id platform,
-	                          cl_device_id device,
-	                          cl_context context,
-	                          thread_scoped_lock& slot_locker)
-	{
-		store_something<cl_context>(platform,
-		                            device,
-		                            context,
-		                            &Slot::context,
-		                            slot_locker);
-
-		/* increment reference count in OpenCL.
-		 * The caller is going to release the object when done with it. */
-		cl_int ciErr = clRetainContext(context);
-		assert(ciErr == CL_SUCCESS);
-		(void)ciErr;
-	}
-
-	/* see store_something comment */
-	static void store_program(cl_platform_id platform,
-	                          cl_device_id device,
-	                          cl_program program,
-	                          ProgramName program_name,
-	                          thread_scoped_lock& slot_locker)
-	{
-		switch(program_name) {
-			case OCL_DEV_BASE_PROGRAM:
-				store_something<cl_program>(platform,
-				                            device,
-				                            program,
-				                            &Slot::ocl_dev_base_program,
-				                            slot_locker);
-				break;
-			case OCL_DEV_MEGAKERNEL_PROGRAM:
-				store_something<cl_program>(platform,
-				                            device,
-				                            program,
-				                            &Slot::ocl_dev_megakernel_program,
-				                            slot_locker);
-				break;
-			default:
-				assert(!"Invalid program name\n");
-				return;
-		}
-
-		/* Increment reference count in OpenCL.
-		 * The caller is going to release the object when done with it.
-		 */
-		cl_int ciErr = clRetainProgram(program);
-		assert(ciErr == CL_SUCCESS);
-		(void)ciErr;
-	}
-
-	/* Discard all cached contexts and programs.  */
-	static void flush()
-	{
-		OpenCLCache &self = global_instance();
-		thread_scoped_lock cache_lock(self.cache_lock);
-
-		foreach(CacheMap::value_type &item, self.cache) {
-			if(item.second.ocl_dev_base_program != NULL)
-				clReleaseProgram(item.second.ocl_dev_base_program);
-			if(item.second.ocl_dev_megakernel_program != NULL)
-				clReleaseProgram(item.second.ocl_dev_megakernel_program);
-			if(item.second.context != NULL)
-				clReleaseContext(item.second.context);
-		}
-
-		self.cache.clear();
-	}
-};
-
-class OpenCLDeviceBase : public Device
-{
-public:
-	DedicatedTaskPool task_pool;
-	cl_context cxContext;
-	cl_command_queue cqCommandQueue;
-	cl_platform_id cpPlatform;
-	cl_device_id cdDevice;
-	cl_program cpProgram;
-	cl_kernel ckFilmConvertByteKernel;
-	cl_kernel ckFilmConvertHalfFloatKernel;
-	cl_kernel ckShaderKernel;
-	cl_kernel ckBakeKernel;
-	cl_int ciErr;
-
-	typedef map<string, device_vector<uchar>*> ConstMemMap;
-	typedef map<string, device_ptr> MemMap;
-
-	ConstMemMap const_mem_map;
-	MemMap mem_map;
-	device_ptr null_mem;
-
-	bool device_initialized;
-	string platform_name;
-
-	bool opencl_error(cl_int err)
-	{
-		if(err != CL_SUCCESS) {
-			string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
-			if(error_msg == "")
-				error_msg = message;
-			fprintf(stderr, "%s\n", message.c_str());
-			return true;
-		}
-
-		return false;
-	}
-
-	void opencl_error(const string& message)
-	{
-		if(error_msg == "")
-			error_msg = message;
-		fprintf(stderr, "%s\n", message.c_str());
-	}
-
-#define opencl_assert(stmt) \
-	{ \
-		cl_int err = stmt; \
-		\
-		if(err != CL_SUCCESS) { \
-			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
-			if(error_msg == "") \
-				error_msg = message; \
-			fprintf(stderr, "%s\n", message.c_str()); \
-		} \
-	} (void)0
-
-	void opencl_assert_err(cl_int err, const char* where)
-	{
-		if(err != CL_SUCCESS) {
-			string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
-			if(error_msg == "")
-				error_msg = message;
-			fprintf(stderr, "%s\n", message.c_str());
-#ifndef NDEBUG
-			abort();
-#endif
-		}
-	}
-
-	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
-	: Device(info, stats, background_)
-	{
-		cpPlatform = NULL;
-		cdDevice = NULL;
-		cxContext = NULL;
-		cqCommandQueue = NULL;
-		cpProgram = NULL;
-		ckFilmConvertByteKernel = NULL;
-		ckFilmConvertHalfFloatKernel = NULL;
-		ckShaderKernel = NULL;
-		ckBakeKernel = NULL;
-		null_mem = 0;
-		device_initialized = false;
-
-		vector<OpenCLPlatformDevice> usable_devices;
-		opencl_get_usable_devices(&usable_devices);
-		if(usable_devices.size() == 0) {
-			opencl_error("OpenCL: no devices found.");
-			return;
-		}
-		assert(info.num < usable_devices.size());
-		OpenCLPlatformDevice& platform_device = usable_devices[info.num];
-		cpPlatform = platform_device.platform_id;
-		cdDevice = platform_device.device_id;
-		platform_name = platform_device.platform_name;
-		VLOG(2) << "Creating new Cycles device for OpenCL platform "
-		        << platform_name << ", device "
-		        << platform_device.device_name << ".";
-
-		{
-			/* try to use cached context */
-			thread_scoped_lock cache_locker;
-			cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
-			if(cxContext == NULL) {
-				/* create context properties array to specify platform */
-				const cl_context_properties context_props[] = {
-					CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
-					0, 0
-				};
-
-				/* create context */
-				cxContext = clCreateContext(context_props, 1, &cdDevice,
-					context_notify_callback, cdDevice, &ciErr);
-
-				if(opencl_error(ciErr)) {
-					opencl_error("OpenCL: clCreateContext failed");
-					return;
-				}
-
-				/* cache it */
-				OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
-			}
-		}
-
-		cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-		if(opencl_error(ciErr))
-			return;
-
-		null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-		if(opencl_error(ciErr))
-			return;
-
-		fprintf(stderr, "Device init success\n");
-		device_initialized = true;
-	}
-
-	static void CL_CALLBACK context_notify_callback(const char *err_info,
-		const void * /*private_info*/, size_t /*cb*/, void *user_data)
-	{
-		char name[256];
-		clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-
-		fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
-	}
-
-	bool opencl_version_check()
-	{
-		string error;
-		if(!opencl_platform_version_check(cpPlatform, &error)) {
-			opencl_error(error);
-			return false;
-		}
-		if(!opencl_device_version_check(cdDevice, &error)) {
-			opencl_error(error);
-			return false;
-		}
-		return true;
-	}
-
-	bool load_binary(const string& /*kernel_path*/,
-	                 const string& clbin,
-	                 const string& custom_kernel_build_options,
-	                 cl_program *program,
-	                 const string *debug_src = NULL)
-	{
-		/* read binary into memory */
-		vector<uint8_t> binary;
-
-		if(!path_read_binary(clbin, binary)) {
-			opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
-			return false;
-		}
-
-		/* create program */
-		cl_int status;
-		size_t size = binary.size();
-		const uint8_t *bytes = &binary[0];
-
-		*program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
-			&size, &bytes, &status, &ciErr);
-
-		if(opencl_error(status) || opencl_error(ciErr)) {
-			opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
-			return false;
-		}
-
-		if(!build_kernel(program, custom_kernel_build_options, debug_src))
-			return false;
-
-		return true;
-	}
-
-	bool save_binary(cl_program *program, const string& clbin)
-	{
-		size_t size = 0;
-		clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
-		if(!size)
-			return false;
-
-		vector<uint8_t> binary(size);
-		uint8_t *bytes = &binary[0];
-
-		clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
-
-		if(!path_write_binary(clbin, binary)) {
-			opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
-			return false;
-		}
-
-		return true;
-	}
-
-	bool build_kernel(cl_program *kernel_program,
-	                  const string& custom_kernel_build_options,
-	                  const string *debug_src = NULL)
-	{
-		string build_options;
-		build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
-
-		ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
-
-		/* show warnings even if build is successful */
-		size_t ret_val_size = 0;
-
-		clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
-		if(ret_val_size > 1) {
-			vector<char> build_log(ret_val_size + 1);
-			clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
-			build_log[ret_val_size] = '\0';
-			/* Skip meaningless empty output from the NVidia compiler. */
-			if(!(ret_val_size == 2 && build_log[0] == '\n')) {
-				fprintf(stderr, "OpenCL kernel build output:\n");
-				fprintf(stderr, "%s\n", &build_log[0]);
-			}
-		}
-
-		if(ciErr != CL_SUCCESS) {
-			opencl_error("OpenCL build failed: errors in console");
-			fprintf(stderr, "Build error: %s\n", clewErrorString(ciErr));
-			return false;
-		}
-
-		return true;
-	}
-
-	bool compile_kernel(const string& kernel_name,
-	                    const string& kernel_path,
-	                    const string& source,
-	                    const string& custom_kernel_build_options,
-	                    cl_program *kernel_program,
-	                    const string *debug_src = NULL)
-	{
-		/* We compile kernels consisting of many files. unfortunately OpenCL
-		 * kernel caches do not seem to recognize changes in included files.
-		 * so we force recompile on changes by adding the md5 hash of all files.
-		 */
-		string inlined_source = path_source_replace_includes(source,
-		                                                     kernel_path);
-
-		if(debug_src) {
-			path_write_text(*debug_src, inlined_source);
-		}
-
-		size_t source_len = inlined_source.size();
-		const char *source_str = inlined_source.c_str();
-
-		*kernel_program = clCreateProgramWithSource(cxContext,
-		                                            1,
-		                                            &source_str,
-		                                            &source_len,
-		                                            &ciErr);
-
-		if(opencl_error(ciErr)) {
-			return false;
-		}
-
-		double starttime = time_dt();
-		printf("Compiling %s OpenCL kernel ...\n", kernel_name.c_str());
-		/* TODO(sergey): Report which kernel is being compiled
-		 * as well (megakernel or which of split kernels etc..).
-		 */
-		printf("Build flags: %s\n", custom_kernel_build_options.c_str());
-
-		if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
-			return false;
-
-		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
-		return true;
-	}
-
-	string device_md5_hash(string kernel_custom_build_options = "")
-	{
-		MD5Hash md5;
-		char version[256], driver[256], name[256], vendor[256];
-
-		clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
-		clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
-		clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-		clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
-		md5.append((uint8_t*)vendor, strlen(vendor));
-		md5.append((uint8_t*)version, strlen(version));
-		md5.append((uint8_t*)name, strlen(name));
-		md5.append((uint8_t*)driver, strlen(driver));
-
-		string options = kernel_build_options();
-		options += kernel_custom_build_options;
-		md5.append((uint8_t*)options.c_str(), options.size());
-
-		return md5.get_hex();
-	}
-
-	bool load_kernels(const DeviceRequestedFeatures& requested_features)
-	{
-		/* Verify if device was initialized. */
-		if(!device_initialized) {
-			fprintf(stderr, "OpenCL: failed to initialize device.\n");
-			return false;
-		}
-
-		/* Try to use cached kernel. */
-		thread_scoped_lock cache_locker;
-		cpProgram = load_cached_kernel(requested_features,
-		                               OpenCLCache::OCL_DEV_BASE_PROGRAM,
-		                               cache_locker);
-
-		if(!cpProgram) {
-			VLOG(2) << "No cached OpenCL kernel.";
-
-			/* Verify we have right opencl version. */
-			if(!opencl_version_check())
-				return false;
-
-			string build_flags = build_options_for_base_program(requested_features);
-
-			/* Calculate md5 hashes to detect changes. */
-			string kernel_path = path_get("kernel");
-			string kernel_md5 = path_files_md5_hash(kernel_path);
-			string device_md5 = device_md5_hash(build_flags);
-
-			/* Path to cached binary.
-			 *
-			 * TODO(sergey): Seems we could de-duplicate all this string_printf()
-			 * calls with some utility function which will give file name for a
-			 * given hashes..
-			 */
-			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
-			                             device_md5.c_str(),
-			                             kernel_md5.c_str());
-			clbin = path_user_get(path_join("cache", clbin));
-
-			/* path to preprocessed source for debugging */
-			string clsrc, *debug_src = NULL;
-
-			if(opencl_kernel_use_debug()) {
-				clsrc = string_printf("cycles_kernel_%s_%s.cl",
-				                      device_md5.c_str(),
-				                      kernel_md5.c_str());
-				clsrc = path_user_get(path_join("cache", clsrc));
-				debug_src = &clsrc;
-			}
-
-			/* If binary kernel exists already, try use it. */
-			if(path_exists(clbin) && load_binary(kernel_path,
-			                                     clbin,
-			                                     build_flags,
-			                                     &cpProgram))
-			{
-				/* Kernel loaded from binary, nothing to do. */
-				VLOG(2) << "Loaded kernel from " << clbin << ".";
-			}
-			else {
-				VLOG(2) << "Kernel file " << clbin << " either doesn't exist or failed to be loaded by driver.";
-				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
-
-				/* If does not exist or loading binary failed, compile kernel. */
-				if(!compile_kernel("base_kernel",
-				                   kernel_path,
-				                   init_kernel_source,
-				                   build_flags,
-				                   &cpProgram,
-				                   debug_src))
-				{
-					return false;
-				}
-
-				/* Save binary for reuse. */
-				if(!save_binary(&cpProgram, clbin)) {
-					return false;
-				}
-			}
-
-			/* Cache the program. */
-			store_cached_kernel(cpPlatform,
-			                    cdDevice,
-			                    cpProgram,
-			                    OpenCLCache::OCL_DEV_BASE_PROGRAM,
-			                    cache_locker);
-		}
-		else {
-			VLOG(2) << "Found cached OpenCL kernel.";
-		}
-
-		/* Find kernels. */
-#define FIND_KERNEL(kernel_var, kernel_name) \
-		do { \
-			kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
-			if(opencl_error(ciErr)) \
-				return false; \
-		} while(0)
-
-		FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
-		FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
-		FIND_KERNEL(ckShaderKernel, "shader");
-		FIND_KERNEL(ckBakeKernel, "bake");
-
-#undef FIND_KERNEL
-		return true;
-	}
-
-	~OpenCLDeviceBase()
-	{
-		task_pool.stop();
-
-		if(null_mem)
-			clReleaseMemObject(CL_MEM_PTR(null_mem));
-
-		ConstMemMap::iterator mt;
-		for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
-			mem_free(*(mt->second));
-			delete mt->second;
-		}
-
-		if(ckFilmConvertByteKernel)
-			clReleaseKernel(ckFilmConvertByteKernel);
-		if(ckFilmConvertHalfFloatKernel)
-			clReleaseKernel(ckFilmConvertHalfFloatKernel);
-		if(ckShaderKernel)
-			clReleaseKernel(ckShaderKernel);
-		if(ckBakeKernel)
-			clReleaseKernel(ckBakeKernel);
-		if(cpProgram)
-			clReleaseProgram(cpProgram);
-		if(cqCommandQueue)
-			clReleaseCommandQueue(cqCommandQueue);
-		if(cxContext)
-			clReleaseContext(cxContext);
-	}
-
-	void mem_alloc(device_memory& mem, MemoryType type)
-	{
-		size_t size = mem.memory_size();
-
-		cl_mem_flags mem_flag;
-		void *mem_ptr = NULL;
-
-		if(type == MEM_READ_ONLY)
-			mem_flag = CL_MEM_READ_ONLY;
-		else if(type == MEM_WRITE_ONLY)
-			mem_flag = CL_MEM_WRITE_ONLY;
-		else
-			mem_flag = CL_MEM_READ_WRITE;
-
-		/* Zero-size allocation might be invoked by render, but not really
-		 * supported by OpenCL. Using NULL as device pointer also doesn't really
-		 * work for some reason, so for the time being we'll use special case
-		 * will null_mem buffer.
-		 */
-		if(size != 0) {
-			mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
-			                                                mem_flag,
-			                                                size,
-			                                                mem_ptr,
-			                                                &ciErr);
-			opencl_assert_err(ciErr, "clCreateBuffer");
-		}
-		else {
-			mem.device_pointer = null_mem;
-		}
-
-		stats.mem_alloc(size);
-		mem.device_size = size;
-	}
-
-	void mem_copy_to(device_memory& mem)
-	{
-		/* this is blocking */
-		size_t size = mem.memory_size();
-		if(size != 0) {
-			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
-			                                   CL_MEM_PTR(mem.device_pointer),
-			                                   CL_TRUE,
-			                                   0,
-			                                   size,
-			                                   (void*)mem.data_pointer,
-			                                   0,
-			                                   NULL, NULL));
-		}
-	}
-
-	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
-	{
-		size_t offset = elem*y*w;
-		size_t size = elem*w*h;
-		assert(size != 0);
-		opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
-		                                  CL_MEM_PTR(mem.device_pointer),
-		                                  CL_TRUE,
-		                                  offset,
-		                                  size,
-		                                  (uchar*)mem.data_pointer + offset,
-		                                  0,
-		                                  NULL, NULL));
-	}
-
-	void mem_zero(device_memory& mem)
-	{
-		if(mem.device_pointer) {
-			memset((void*)mem.data_pointer, 0, mem.memory_size());
-			mem_copy_to(mem);
-		}
-	}
-
-	void mem_free(device_memory& mem)
-	{
-		if(mem.device_pointer) {
-			if(mem.device_pointer != null_mem) {
-				opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
-			}
-			mem.device_pointer = 0;
-
-			stats.mem_free(mem.device_size);
-			mem.device_size = 0;
-		}
-	}
-
-	void const_copy_to(const char *name, void *host, size_t size)
-	{
-		ConstMemMap::iterator i = const_mem_map.find(name);
-
-		if(i == const_mem_map.end()) {
-			device_vector<uchar> *data = new device_vector<uchar>();
-			data->copy((uchar*)host, size);
-
-			mem_alloc(*data, MEM_READ_ONLY);
-			i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
-		}
-		else {
-			device_vector<uchar> *data = i->second;
-			data->copy((uchar*)host, size);
-		}
-
-		mem_copy_to(*i->second);
-	}
-
-	void tex_alloc(const char *name,
-	               device_memory& mem,
-	               InterpolationType /*interpolation*/,
-	               ExtensionType /*extension*/)
-	{
-		VLOG(1) << "Texture allocate: " << name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-		mem_alloc(mem, MEM_READ_ONLY);
-		mem_copy_to(mem);
-		assert(mem_map.find(name) == mem_map.end());
-		mem_map.insert(MemMap::value_type(name, mem.device_pointer));
-	}
-
-	void tex_free(device_memory& mem)
-	{
-		if(mem.device_pointer) {
-			foreach(const MemMap::value_type& value, mem_map) {
-				if(value.second == mem.device_pointer) {
-					mem_map.erase(value.first);
-					break;
-				}
-			}
-
-			mem_free(mem);
-		}
-	}
-
-	size_t global_size_round_up(int group_size, int global_size)
-	{
-		int r = global_size % group_size;
-		return global_size + ((r == 0)? 0: group_size - r);
-	}
-
-	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
-	{
-		size_t workgroup_size, max_work_items[3];
-
-		clGetKernelWorkGroupInfo(kernel, cdDevice,
-			CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
-		clGetDeviceInfo(cdDevice,
-			CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
-
-		/* Try to divide evenly over 2 dimensions. */
-		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
-		size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
-
-		/* Some implementations have max size 1 on 2nd dimension. */
-		if(local_size[1] > max_work_items[1]) {
-			local_size[0] = workgroup_size/max_work_items[1];
-			local_size[1] = max_work_items[1];
-		}
-
-		size_t global_size[2] = {global_size_round_up(local_size[0], w),
-		                         global_size_round_up(local_size[1], h)};
-
-		/* Vertical size of 1 is coming from bake/shade kernels where we should
-		 * not round anything up because otherwise we'll either be doing too
-		 * much work per pixel (if we don't check global ID on Y axis) or will
-		 * be checking for global ID to always have Y of 0.
-		 */
-		if(h == 1) {
-			global_size[h] = 1;
-		}
-
-		/* run kernel */
-		opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
-		opencl_assert(clFlush(cqCommandQueue));
-	}
-
-	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-	{
-		cl_mem ptr;
-
-		MemMap::iterator i = mem_map.find(name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-		}
-		else {
-			/* work around NULL not working, even though the spec says otherwise */
-			ptr = CL_MEM_PTR(null_mem);
-		}
-		
-		opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
-	}
-
-	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
-	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
-		cl_mem d_buffer = CL_MEM_PTR(buffer);
-		cl_int d_x = task.x;
-		cl_int d_y = task.y;
-		cl_int d_w = task.w;
-		cl_int d_h = task.h;
-		cl_float d_sample_scale = 1.0f/(task.sample + 1);
-		cl_int d_offset = task.offset;
-		cl_int d_stride = task.stride;
-
-
-		cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
-
-		cl_uint start_arg_index =
-			kernel_set_args(ckFilmConvertKernel,
-			                0,
-			                d_data,
-			                d_rgba,
-			                d_buffer);
-
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-
-		start_arg_index += kernel_set_args(ckFilmConvertKernel,
-		                                   start_arg_index,
-		                                   d_sample_scale,
-		                                   d_x,
-		                                   d_y,
-		                                   d_w,
-		                                   d_h,
-		                                   d_offset,
-		                                   d_stride);
-
-		enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-	}
-
-	void shader(DeviceTask& task)
-	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_input = CL_MEM_PTR(task.shader_input);
-		cl_mem d_output = CL_MEM_PTR(task.shader_output);
-		cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
-		cl_int d_shader_eval_type = task.shader_eval_type;
-		cl_int d_shader_filter = task.shader_filter;
-		cl_int d_shader_x = task.shader_x;
-		cl_int d_shader_w = task.shader_w;
-		cl_int d_offset = task.offset;
-
-		cl_kernel kernel;
-
-		if(task.shader_eval_type >= SHADER_EVAL_BAKE)
-			kernel = ckBakeKernel;
-		else
-			kernel = ckShaderKernel;
-
-		cl_uint start_arg_index =
-			kernel_set_args(kernel,
-			                0,
-			                d_data,
-			                d_input,
-			                d_output);
-
-		if(task.shader_eval_type < SHADER_EVAL_BAKE) {
-			start_arg_index += kernel_set_args(kernel,
-			                                   start_arg_index,
-			                                   d_output_luma);
-		}
-
-#define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-
-		start_arg_index += kernel_set_args(kernel,
-		                                   start_arg_index,
-		                                   d_shader_eval_type);
-		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-			start_arg_index += kernel_set_args(kernel,
-			                                   start_arg_index,
-			                                   d_shader_filter);
-		}
-		start_arg_index += kernel_set_args(kernel,
-		                                   start_arg_index,
-		                                   d_shader_x,
-		                                   d_shader_w,
-		                                   d_offset);
-
-		for(int sample = 0; sample < task.num_samples; sample++) {
-
-			if(task.get_cancel())
-				break;
-
-			kernel_set_args(kernel, start_arg_index, sample);
-
-			enqueue_kernel(kernel, task.shader_w, 1);
-
-			clFinish(cqCommandQueue);
-
-			task.update_progress(NULL);
-		}
-	}
-
-	class OpenCLDeviceTask : public DeviceTask {
-	public:
-		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
-		: DeviceTask(task)
-		{
-			run = function_bind(&OpenCLDeviceBase::thread_run,
-			                    device,
-			                    this);
-		}
-	};
-
-	int get_split_task_count(DeviceTask& /*task*/)
-	{
-		return 1;
-	}
-
-	void task_add(DeviceTask& task)
-	{
-		task_pool.push(new OpenCLDeviceTask(this, task));
-	}
-
-	void task_wait()
-	{
-		task_pool.wait();
-	}
-
-	void task_cancel()
-	{
-		task_pool.cancel();
-	}
-
-	virtual void thread_run(DeviceTask * /*task*/) = 0;
-
-protected:
-	string kernel_build_options(const string *debug_src = NULL)
-	{
-		string build_options = "-cl-fast-relaxed-math ";
-
-		if(platform_name == "NVIDIA CUDA") {
-			build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
-			                 "-cl-nv-maxrregcount=32 "
-			                 "-cl-nv-verbose ";
-
-			uint compute_capability_major, compute_capability_minor;
-			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
-			                sizeof(cl_uint), &compute_capability_major, NULL);
-			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
-			                sizeof(cl_uint), &compute_capability_minor, NULL);
-
-			build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
-			                               compute_capability_major * 100 +
-			                               compute_capability_minor * 10);
-		}
-
-		else if(platform_name == "Apple")
-			build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
-		else if(platform_name == "AMD Accelerated Parallel Processing")
-			build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
-		else if(platform_name == "Intel(R) OpenCL") {
-			build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
-			/* Options for gdb source level kernel debugging.
-			 * this segfaults on linux currently.
-			 */
-			if(opencl_kernel_use_debug() && debug_src)
-				build_options += "-g -s \"" + *debug_src + "\" ";
-		}
-
-		if(opencl_kernel_use_debug())
-			build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
-
-#ifdef WITH_CYCLES_DEBUG
-		build_options += "-D__KERNEL_DEBUG__ ";
-#endif
-
-		return build_options;
-	}
-
-	class ArgumentWrapper {
-	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
-		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
-		ArgumentWrapper(int argument) : size(sizeof(int)),
-		                                int_value(argument),
-		                                pointer(&int_value) { }
-		ArgumentWrapper(float argument) : size(sizeof(float)),
-		                                  float_value(argument),
-		                                  pointer(&float_value) { }
-		size_t size;
-		int int_value;
-		float float_value;
-		void *pointer;
-	};
-
-	/* TODO(sergey): In the future we can use variadic templates, once
-	 * C++0x is allowed. Should allow to clean this up a bit.
-	 */
-	int kernel_set_args(cl_kernel kernel,
-	                    int start_argument_index,
-	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
-	                    const ArgumentWrapper& arg33 = ArgumentWrapper())
-	{
-		int current_arg_index = 0;
-#define FAKE_VARARG_HANDLE_ARG(arg) \
-		do { \
-			if(arg.pointer != NULL) { \
-				opencl_assert(clSetKernelArg( \
-					kernel, \
-					start_argument_index + current_arg_index, \
-					arg.size, arg.pointer)); \
-				++current_arg_index; \
-			} \
-			else { \
-				return current_arg_index; \
-			} \
-		} while(false)
-		FAKE_VARARG_HANDLE_ARG(arg1);
-		FAKE_VARARG_HANDLE_ARG(arg2);
-		FAKE_VARARG_HANDLE_ARG(arg3);
-		FAKE_VARARG_HANDLE_ARG(arg4);
-		FAKE_VARARG_HANDLE_ARG(arg5);
-		FAKE_VARARG_HANDLE_ARG(arg6);
-		FAKE_VARARG_HANDLE_ARG(arg7);
-		FAKE_VARARG_HANDLE_ARG(arg8);
-		FAKE_VARARG_HANDLE_ARG(arg9);
-		FAKE_VARARG_HANDLE_ARG(arg10);
-		FAKE_VARARG_HANDLE_ARG(arg11);
-		FAKE_VARARG_HANDLE_ARG(arg12);
-		FAKE_VARARG_HANDLE_ARG(arg13);
-		FAKE_VARARG_HANDLE_ARG(arg14);
-		FAKE_VARARG_HANDLE_ARG(arg15);
-		FAKE_VARARG_HANDLE_ARG(arg16);
-		FAKE_VARARG_HANDLE_ARG(arg17);
-		FAKE_VARARG_HANDLE_ARG(arg18);
-		FAKE_VARARG_HANDLE_ARG(arg19);
-		FAKE_VARARG_HANDLE_ARG(arg20);
-		FAKE_VARARG_HANDLE_ARG(arg21);
-		FAKE_VARARG_HANDLE_ARG(arg22);
-		FAKE_VARARG_HANDLE_ARG(arg23);
-		FAKE_VARARG_HANDLE_ARG(arg24);
-		FAKE_VARARG_HANDLE_ARG(arg25);
-		FAKE_VARARG_HANDLE_ARG(arg26);
-		FAKE_VARARG_HANDLE_ARG(arg27);
-		FAKE_VARARG_HANDLE_ARG(arg28);
-		FAKE_VARARG_HANDLE_ARG(arg29);
-		FAKE_VARARG_HANDLE_ARG(arg30);
-		FAKE_VARARG_HANDLE_ARG(arg31);
-		FAKE_VARARG_HANDLE_ARG(arg32);
-		FAKE_VARARG_HANDLE_ARG(arg33);
-#undef FAKE_VARARG_HANDLE_ARG
-		return current_arg_index;
-	}
-
-	inline void release_kernel_safe(cl_kernel kernel)
-	{
-		if(kernel) {
-			clReleaseKernel(kernel);
-		}
-	}
-
-	inline void release_mem_object_safe(cl_mem mem)
-	{
-		if(mem != NULL) {
-			clReleaseMemObject(mem);
-		}
-	}
-
-	inline void release_program_safe(cl_program program)
-	{
-		if(program) {
-			clReleaseProgram(program);
-		}
-	}
-
-	/* ** Those guys are for workign around some compiler-specific bugs ** */
-
-	virtual cl_program load_cached_kernel(
-	        const DeviceRequestedFeatures& /*requested_features*/,
-	        OpenCLCache::ProgramName program_name,
-	        thread_scoped_lock& cache_locker)
-	{
-		return OpenCLCache::get_program(cpPlatform,
-		                                cdDevice,
-		                                program_name,
-		                                cache_locker);
-	}
-
-	virtual void store_cached_kernel(cl_platform_id platform,
-	                                 cl_device_id device,
-	                                 cl_program program,
-	                                 OpenCLCache::ProgramName program_name,
-	                                 thread_scoped_lock& cache_locker)
-	{
-		OpenCLCache::store_program(platform,
-		                           device,
-		                           program,
-		                           program_name,
-		                           cache_locker);
-	}
-
-	virtual string build_options_for_base_program(
-	        const DeviceRequestedFeatures& /*requested_features*/)
-	{
-		/* TODO(sergey): By default we compile all features, meaning
-		 * mega kernel is not getting feature-based optimizations.
-		 *
-		 * Ideally we need always compile kernel with as less features
-		 * enabled as possible to keep performance at it's max.
-		 */
-		return "";
-	}
-};
-
-class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
-{
-public:
-	cl_kernel ckPathTraceKernel;
-	cl_program path_trace_program;
-
-	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
-	: OpenCLDeviceBase(info, stats, background_)
-	{
-		ckPathTraceKernel = NULL;
-		path_trace_program = NULL;
-	}
-
-	bool load_kernels(const DeviceRequestedFeatures& requested_features)
-	{
-		/* Get Shader, bake and film convert kernels.
-		 * It'll also do verification of OpenCL actually initialized.
-		 */
-		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
-			return false;
-		}
-
-		/* Try to use cached kernel. */
-		thread_scoped_lock cache_locker;
-		path_trace_program = OpenCLCache::get_program(cpPlatform,
-		                                              cdDevice,
-		                                              OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
-		                                              cache_locker);
-
-		if(!path_trace_program) {
-			/* Verify we have right opencl version. */
-			if(!opencl_version_check())
-				return false;
-
-			/* Calculate md5 hash to detect changes. */
-			string kernel_path = path_get("kernel");
-			string kernel_md5 = path_files_md5_hash(kernel_path);
-			string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
-			string device_md5 = device_md5_hash(custom_kernel_build_options);
-
-			/* Path to cached binary. */
-			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
-			                             device_md5.c_str(),
-			                             kernel_md5.c_str());
-			clbin = path_user_get(path_join("cache", clbin));
-
-			/* Path to preprocessed source for debugging. */
-			string clsrc, *debug_src = NULL;
-			if(opencl_kernel_use_debug()) {
-				clsrc = string_printf("cycles_kernel_%s_%s.cl",
-				                      device_md5.c_str(),
-				                      kernel_md5.c_str());
-				clsrc = path_user_get(path_join("cache", clsrc));
-				debug_src = &clsrc;
-			}
-
-			/* If exists already, try use it. */
-			if(path_exists(clbin) && load_binary(kernel_path,
-			                                     clbin,
-			                                     custom_kernel_build_options,
-			                                     &path_trace_program,
-			                                     debug_src))
-			{
-				/* Kernel loaded from binary, nothing to do. */
-			}
-			else {
-				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
-				                            kernel_md5 + "\n";
-				/* If does not exist or loading binary failed, compile kernel. */
-				if(!compile_kernel("mega_kernel",
-				                   kernel_path,
-				                   init_kernel_source,
-				                   custom_kernel_build_options,
-				                   &path_trace_program,
-				                   debug_src))
-				{
-					return false;
-				}
-				/* Save binary for reuse. */
-				if(!save_binary(&path_trace_program, clbin)) {
-					return false;
-				}
-			}
-			/* Cache the program. */
-			OpenCLCache::store_program(cpPlatform,
-			                           cdDevice,
-			                           path_trace_program,
-			                           OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
-			                           cache_locker);
-		}
-
-		/* Find kernels. */
-		ckPathTraceKernel = clCreateKernel(path_trace_program,
-		                                   "kernel_ocl_path_trace",
-		                                   &ciErr);
-		if(opencl_error(ciErr))
-			return false;
-		return true;
-	}
-
-	~OpenCLDeviceMegaKernel()
-	{
-		task_pool.stop();
-		release_kernel_safe(ckPathTraceKernel);
-		release_program_safe(path_trace_program);
-	}
-
-	void path_trace(RenderTile& rtile, int sample)
-	{
-		/* Cast arguments to cl types. */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* Sample arguments. */
-		cl_int d_sample = sample;
-
-		cl_uint start_arg_index =
-			kernel_set_args(ckPathTraceKernel,
-			                0,
-			                d_data,
-			                d_buffer,
-			                d_rng_state);
-
-#define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-
-		start_arg_index += kernel_set_args(ckPathTraceKernel,
-		                                   start_arg_index,
-		                                   d_sample,
-		                                   d_x,
-		                                   d_y,
-		                                   d_w,
-		                                   d_h,
-		                                   d_offset,
-		                                   d_stride);
-
-		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
-	}
-
-	void thread_run(DeviceTask *task)
-	{
-		if(task->type == DeviceTask::FILM_CONVERT) {
-			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-		}
-		else if(task->type == DeviceTask::SHADER) {
-			shader(*task);
-		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-			/* Keep rendering tiles until done. */
-			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
-
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
-					}
-
-					path_trace(tile, sample);
-
-					tile.sample = sample + 1;
-
-					task->update_progress(&tile);
-				}
-
-				/* Complete kernel execution before release tile */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
-				task->release_tile(tile);
-			}
-		}
-	}
-};
-
-/* TODO(sergey): This is to keep tile split on OpenCL level working
- * for now, since without this view-port render does not work as it
- * should.
- *
- * Ideally it'll be done on the higher level, but we need to get ready
- * for merge rather soon, so let's keep split logic private here in
- * the file.
- */
-class SplitRenderTile : public RenderTile {
-public:
-	SplitRenderTile()
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0) {}
-
-	explicit SplitRenderTile(RenderTile& tile)
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0)
-	{
-		x = tile.x;
-		y = tile.y;
-		w = tile.w;
-		h = tile.h;
-		start_sample = tile.start_sample;
-		num_samples = tile.num_samples;
-		sample = tile.sample;
-		resolution = tile.resolution;
-		offset = tile.offset;
-		stride = tile.stride;
-		buffer = tile.buffer;
-		rng_state = tile.rng_state;
-		buffers = tile.buffers;
-	}
-
-	/* Split kernel is device global memory constrained;
-	 * hence split kernel cant render big tile size's in
-	 * one go. If the user sets a big tile size (big tile size
-	 * is a term relative to the available device global memory),
-	 * we split the tile further and then call path_trace on
-	 * each of those split tiles. The following variables declared,
-	 * assist in achieving that purpose
-	 */
-	int buffer_offset_x;
-	int buffer_offset_y;
-	int rng_state_offset_x;
-	int rng_state_offset_y;
-	int buffer_rng_state_stride;
-};
-
-/* OpenCLDeviceSplitKernel's declaration/definition. */
-class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
-{
-public:
-	/* Kernel declaration. */
-	cl_kernel ckPathTraceKernel_data_init;
-	cl_kernel ckPathTraceKernel_scene_intersect;
-	cl_kernel ckPathTraceKernel_lamp_emission;
-	cl_kernel ckPathTraceKernel_queue_enqueue;
-	cl_kernel ckPathTraceKernel_background_buffer_update;
-	cl_kernel ckPathTraceKernel_shader_eval;
-	cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
-	cl_kernel ckPathTraceKernel_direct_lighting;
-	cl_kernel ckPathTraceKernel_shadow_blocked;
-	cl_kernel ckPathTraceKernel_next_iteration_setup;
-	cl_kernel ckPathTraceKernel_sum_all_radiance;
-
-	/* cl_program declaration. */
-	cl_program data_init_program;
-	cl_program scene_intersect_program;
-	cl_program lamp_emission_program;
-	cl_program queue_enqueue_program;
-	cl_program background_buffer_update_program;
-	cl_program shader_eval_program;
-	cl_program holdout_emission_blurring_pathtermination_ao_program;
-	cl_program direct_lighting_program;
-	cl_program shadow_blocked_program;
-	cl_program next_iteration_setup_program;
-	cl_program sum_all_radiance_program;
-
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-	cl_mem rng_coop;
-	cl_mem throughput_coop;
-	cl_mem L_transparent_coop;
-	cl_mem PathRadiance_coop;
-	cl_mem Ray_coop;
-	cl_mem PathState_coop;
-	cl_mem Intersection_coop;
-	cl_mem kgbuffer;  /* KernelGlobals buffer. */
-
-	/* Global buffers for ShaderData. */
-	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
-	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
-	                        * shadow_blocked kernel.
-	                        */
-
-	/* Global memory required for shadow blocked and accum_radiance. */
-	cl_mem BSDFEval_coop;
-	cl_mem ISLamp_coop;
-	cl_mem LightRay_coop;
-	cl_mem AOAlpha_coop;
-	cl_mem AOBSDF_coop;
-	cl_mem AOLightRay_coop;
-	cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
-	/* DebugData memory */
-	cl_mem debugdata_coop;
-#endif
-
-	/* Global state array that tracks ray state. */
-	cl_mem ray_state;
-
-	/* Per sample buffers. */
-	cl_mem per_sample_output_buffers;
-
-	/* Denotes which sample each ray is being processed for. */
-	cl_mem work_array;
-
-	/* Queue */
-	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
-	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
-	                     * Tracks the size of each queue.
-	                     */
-
-	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	cl_mem use_queues_flag;
-
-	/* Amount of memory in output buffer associated with one pixel/thread. */
-	size_t per_thread_output_buffer_size;
-
-	/* Total allocatable available device memory. */
-	size_t total_allocatable_memory;
-
-	/* host version of ray_state; Used in checking host path-iteration
-	 * termination.
-	 */
-	char *hostRayStateArray;
-
-	/* Number of path-iterations to be done in one shot. */
-	unsigned int PathIteration_times;
-
-#ifdef __WORK_STEALING__
-	/* Work pool with respect to each work group. */
-	cl_mem work_pool_wgs;
-
-	/* Denotes the maximum work groups possible w.r.t. current tile size. */
-	unsigned int max_work_groups;
-#endif
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
-	/* Marked True in constructor and marked false at the end of path_trace(). */
-	bool first_tile;
-
-	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
-	: OpenCLDeviceBase(info, stats, background_)
-	{
-		background = background_;
-
-		/* Initialize kernels. */
-		ckPathTraceKernel_data_init = NULL;
-		ckPathTraceKernel_scene_intersect = NULL;
-		ckPathTraceKernel_lamp_emission = NULL;
-		ckPathTraceKernel_background_buffer_update = NULL;
-		ckPathTraceKernel_shader_eval = NULL;
-		ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
-		ckPathTraceKernel_direct_lighting = NULL;
-		ckPathTraceKernel_shadow_blocked = NULL;
-		ckPathTraceKernel_next_iteration_setup = NULL;
-		ckPathTraceKernel_sum_all_radiance = NULL;
-		ckPathTraceKernel_queue_enqueue = NULL;
-
-		/* Initialize program. */
-		data_init_program = NULL;
-		scene_intersect_program = NULL;
-		lamp_emission_program = NULL;
-		queue_enqueue_program = NULL;
-		background_buffer_update_program = NULL;
-		shader_eval_program = NULL;
-		holdout_emission_blurring_pathtermination_ao_program = NULL;
-		direct_lighting_program = NULL;
-		shadow_blocked_program = NULL;
-		next_iteration_setup_program = NULL;
-		sum_all_radiance_program = NULL;
-
-		/* Initialize cl_mem variables. */
-		kgbuffer = NULL;
-		sd = NULL;
-		sd_DL_shadow = NULL;
-
-		rng_coop = NULL;
-		throughput_coop = NULL;
-		L_transparent_coop = NULL;
-		PathRadiance_coop = NULL;
-		Ray_coop = NULL;
-		PathState_coop = NULL;
-		Intersection_coop = NULL;
-		ray_state = NULL;
-
-		AOAlpha_coop = NULL;
-		AOBSDF_coop = NULL;
-		AOLightRay_coop = NULL;
-		BSDFEval_coop = NULL;
-		ISLamp_coop = NULL;
-		LightRay_coop = NULL;
-		Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
-		debugdata_coop = NULL;
-#endif
-
-		work_array = NULL;
-
-		/* Queue. */
-		Queue_data = NULL;
-		Queue_index = NULL;
-		use_queues_flag = NULL;
-
-		per_sample_output_buffers = NULL;
-
-		per_thread_output_buffer_size = 0;
-		hostRayStateArray = NULL;
-		PathIteration_times = PATH_ITER_INC_FACTOR;
-#ifdef __WORK_STEALING__
-		work_pool_wgs = NULL;
-		max_work_groups = 0;
-#endif
-		current_max_closure = -1;
-		first_tile = true;
-
-		/* Get device's maximum memory that can be allocated. */
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-		                        sizeof(size_t),
-		                        &total_allocatable_memory,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(platform_name == "AMD Accelerated Parallel Processing") {
-			/* This value is tweak-able; AMD platform does not seem to
-			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
-			 * is considered for further computation.
-			 */
-			total_allocatable_memory /= 2;
-		}
-	}
-
-	/* TODO(sergey): Seems really close to load_kernel(),
-	 * could it be de-duplicated?
-	 */
-	bool load_split_kernel(const string& kernel_name,
-	                       const string& kernel_path,
-	                       const string& kernel_init_source,
-	                       const string& clbin,
-	                       const string& custom_kernel_build_options,
-	                       cl_program *program,
-	                       const string *debug_src = NULL)
-	{
-		if(!opencl_version_check()) {
-			return false;
-		}
-
-		string cache_clbin = path_user_get(path_join("cache", clbin));
-
-		/* If exists already, try use it. */
-		if(path_exists(cache_clbin) && load_binary(kernel_path,
-		                                           cache_clbin,
-		                                           custom_kernel_build_options,
-		                                           program,
-		                                           debug_src))
-		{
-			/* Kernel loaded from binary. */
-		}
-		else {
-			/* If does not exist or loading binary failed, compile kernel. */
-			if(!compile_kernel(kernel_name,
-			                   kernel_path,
-			                   kernel_init_source,
-			                   custom_kernel_build_options,
-			                   program,
-			                   debug_src))
-			{
-				return false;
-			}
-			/* Save binary for reuse. */
-			if(!save_binary(program, cache_clbin)) {
-				return false;
-			}
-		}
-		return true;
-	}
-
-	/* Split kernel utility functions. */
-	size_t get_tex_size(const char *tex_name)
-	{
-		cl_mem ptr;
-		size_t ret_size = 0;
-		MemMap::iterator i = mem_map.find(tex_name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-			ciErr = clGetMemObjectInfo(ptr,
-			                           CL_MEM_SIZE,
-			                           sizeof(ret_size),
-			                           &ret_size,
-			                           NULL);
-			assert(ciErr == CL_SUCCESS);
-		}
-		return ret_size;
-	}
-
-	size_t get_shader_data_size(size_t max_closure)
-	{
-		/* ShaderData size with variable size ShaderClosure array */
-		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
-	}
-
-	/* Returns size of KernelGlobals structure associated with OpenCL. */
-	size_t get_KernelGlobals_size()
-	{
-		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
-		 * fetch its size.
-		 */
-		typedef struct KernelGlobals {
-			ccl_constant KernelData *data;
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-			void *sd_input;
-			void *isect_shadow;
-		} KernelGlobals;
-
-		return sizeof(KernelGlobals);
-	}
-
-	bool load_kernels(const DeviceRequestedFeatures& requested_features)
-	{
-		/* Get Shader, bake and film_convert kernels.
-		 * It'll also do verification of OpenCL actually initialized.
-		 */
-		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
-			return false;
-		}
-
-		string kernel_path = path_get("kernel");
-		string kernel_md5 = path_files_md5_hash(kernel_path);
-		string device_md5;
-		string kernel_init_source;
-		string clbin;
-		string clsrc, *debug_src = NULL;
-
-		string build_options = "-D__SPLIT_KERNEL__ ";
-#ifdef __WORK_STEALING__
-		build_options += "-D__WORK_STEALING__ ";
-#endif
-		build_options += requested_features.get_build_options();
-
-		/* Set compute device build option. */
-		cl_device_type device_type;
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_TYPE,
-		                        sizeof(cl_device_type),
-		                        &device_type,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(device_type == CL_DEVICE_TYPE_GPU) {
-			build_options += " -D__COMPUTE_DEVICE_GPU__";
-		}
-
-#define GLUE(a, b) a ## b
-#define LOAD_KERNEL(name) \
-	do { \
-		kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
-		                     kernel_md5 + "\n"; \
-		device_md5 = device_md5_hash(build_options); \
-		clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
-		                      device_md5.c_str(), kernel_md5.c_str()); \
-		if(opencl_kernel_use_debug()) { \
-			clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
-			                      device_md5.c_str(), kernel_md5.c_str()); \
-			clsrc = path_user_get(path_join("cache", clsrc)); \
-			debug_src = &clsrc; \
-		} \
-		if(!load_split_kernel(#name, \
-		                      kernel_path, \
-		                      kernel_init_source, \
-		                      clbin, \
-		                      build_options, \
-		                      &GLUE(name, _program), \
-		                      debug_src)) \
-		{ \
-			fprintf(stderr, "Faled to compile %s\n", #name); \
-			return false; \
-		} \
-	} while(false)
-
-		LOAD_KERNEL(data_init);
-		LOAD_KERNEL(scene_intersect);
-		LOAD_KERNEL(lamp_emission);
-		LOAD_KERNEL(queue_enqueue);
-		LOAD_KERNEL(background_buffer_update);
-		LOAD_KERNEL(shader_eval);
-		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
-		LOAD_KERNEL(direct_lighting);
-		LOAD_KERNEL(shadow_blocked);
-		LOAD_KERNEL(next_iteration_setup);
-		LOAD_KERNEL(sum_all_radiance);
-
-#undef LOAD_KERNEL
-
-#define FIND_KERNEL(name) \
-	do { \
-		GLUE(ckPathTraceKernel_, name) = \
-			clCreateKernel(GLUE(name, _program), \
-			               "kernel_ocl_path_trace_"  #name, &ciErr); \
-		if(opencl_error(ciErr)) { \
-			fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
-			return false; \
-		} \
-	} while(false)
-
-		FIND_KERNEL(data_init);
-		FIND_KERNEL(scene_intersect);
-		FIND_KERNEL(lamp_emission);
-		FIND_KERNEL(queue_enqueue);
-		FIND_KERNEL(background_buffer_update);
-		FIND_KERNEL(shader_eval);
-		FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
-		FIND_KERNEL(direct_lighting);
-		FIND_KERNEL(shadow_blocked);
-		FIND_KERNEL(next_iteration_setup);
-		FIND_KERNEL(sum_all_radiance);
-#undef FIND_KERNEL
-#undef GLUE
-
-		current_max_closure = requested_features.max_closure;
-
-		return true;
-	}
-
-	~OpenCLDeviceSplitKernel()
-	{
-		task_pool.stop();
-
-		/* Release kernels */
-		release_kernel_safe(ckPathTraceKernel_data_init);
-		release_kernel_safe(ckPathTraceKernel_scene_intersect);
-		release_kernel_safe(ckPathTraceKernel_lamp_emission);
-		release_kernel_safe(ckPathTraceKernel_queue_enqueue);
-		release_kernel_safe(ckPathTraceKernel_background_buffer_update);
-		release_kernel_safe(ckPathTraceKernel_shader_eval);
-		release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
-		release_kernel_safe(ckPathTraceKernel_direct_lighting);
-		release_kernel_safe(ckPathTraceKernel_shadow_blocked);
-		release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
-		release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
-
-		/* Release global memory */
-		release_mem_object_safe(rng_coop);
-		release_mem_object_safe(throughput_coop);
-		release_mem_object_safe(L_transparent_coop);
-		release_mem_object_safe(PathRadiance_coop);
-		release_mem_object_safe(Ray_coop);
-		release_mem_object_safe(PathState_coop);
-		release_mem_object_safe(Intersection_coop);
-		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(sd);
-		release_mem_object_safe(sd_DL_shadow);
-		release_mem_object_safe(ray_state);
-		release_mem_object_safe(AOAlpha_coop);
-		release_mem_object_safe(AOBSDF_coop);
-		release_mem_object_safe(AOLightRay_coop);
-		release_mem_object_safe(BSDFEval_coop);
-		release_mem_object_safe(ISLamp_coop);
-		release_mem_object_safe(LightRay_coop);
-		release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
-		release_mem_object_safe(debugdata_coop);
-#endif
-		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_data);
-		release_mem_object_safe(Queue_index);
-		release_mem_object_safe(work_array);
-#ifdef __WORK_STEALING__
-		release_mem_object_safe(work_pool_wgs);
-#endif
-		release_mem_object_safe(per_sample_output_buffers);
-
-		/* Release programs */
-		release_program_safe(data_init_program);
-		release_program_safe(scene_intersect_program);
-		release_program_safe(lamp_emission_program);
-		release_program_safe(queue_enqueue_program);
-		release_program_safe(background_buffer_update_program);
-		release_program_safe(shader_eval_program);
-		release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
-		release_program_safe(direct_lighting_program);
-		release_program_safe(shadow_blocked_program);
-		release_program_safe(next_iteration_setup_program);
-		release_program_safe(sum_all_radiance_program);
-
-		if(hostRayStateArray != NULL) {
-			free(hostRayStateArray);
-		}
-	}
-
-	void path_trace(DeviceTask *task,
-	                SplitRenderTile& rtile,
-	                int2 max_render_feasible_tile_size)
-	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* Make sure that set render feasible tile size is a multiple of local
-		 * work size dimensions.
-		 */
-		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
-		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
-
-		size_t global_size[2];
-		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
-		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
-
-		/* Set the range of samples to be processed for every ray in
-		 * path-regeneration logic.
-		 */
-		cl_int start_sample = rtile.start_sample;
-		cl_int end_sample = rtile.start_sample + rtile.num_samples;
-		cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_parallel_samples = 1;
-#else
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_threads = max_render_feasible_tile_size.x *
-		                           max_render_feasible_tile_size.y;
-		unsigned int num_tile_columns_possible = num_threads / global_size[1];
-		/* Estimate number of parallel samples that can be
-		 * processed in parallel.
-		 */
-		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
-		                                        rtile.num_samples);
-		/* Wavefront size in AMD is 64.
-		 * TODO(sergey): What about other platforms?
-		 */
-		if(num_parallel_samples >= 64) {
-			/* TODO(sergey): Could use generic round-up here. */
-			num_parallel_samples = (num_parallel_samples / 64) * 64;
-		}
-		assert(num_parallel_samples != 0);
-
-		global_size[0] = d_w * num_parallel_samples;
-#endif  /* __WORK_STEALING__ */
-
-		assert(global_size[0] * global_size[1] <=
-		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
-		/* Allocate all required global memory once. */
-		if(first_tile) {
-			size_t num_global_elements = max_render_feasible_tile_size.x *
-			                             max_render_feasible_tile_size.y;
-			/* TODO(sergey): This will actually over-allocate if
-			 * particular kernel does not support multiclosure.
-			 */
-			size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
-#ifdef __WORK_STEALING__
-			/* Calculate max groups */
-			size_t max_global_size[2];
-			size_t tile_x = max_render_feasible_tile_size.x;
-			size_t tile_y = max_render_feasible_tile_size.y;
-			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
-			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
-			max_work_groups = (max_global_size[0] * max_global_size[1]) /
-			                  (local_size[0] * local_size[1]);
-			/* Allocate work_pool_wgs memory. */
-			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
-#endif  /* __WORK_STEALING__ */
-
-			/* Allocate queue_index memory only once. */
-			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
-			use_queues_flag = mem_alloc(sizeof(char));
-			kgbuffer = mem_alloc(get_KernelGlobals_size());
-
-			/* Create global buffers for ShaderData. */
-			sd = mem_alloc(num_global_elements * shaderdata_size);
-			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
-			/* Creation of global memory buffers which are shared among
-			 * the kernels.
-			 */
-			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
-			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
-			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
-			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
-			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
-			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
-			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
-			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
-			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
-			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
-			ray_state = mem_alloc(num_global_elements * sizeof(char));
-
-			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
-			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
-			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
-			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
-			per_sample_output_buffers = mem_alloc(num_global_elements *
-			                                      per_thread_output_buffer_size);
-		}
-
-		cl_int dQueue_size = global_size[0] * global_size[1];
-
-		cl_uint start_arg_index =
-			kernel_set_args(ckPathTraceKernel_data_init,
-			                0,
-			                kgbuffer,
-			                sd_DL_shadow,
-			                d_data,
-			                per_sample_output_buffers,
-			                d_rng_state,
-			                rng_coop,
-			                throughput_coop,
-			                L_transparent_coop,
-			                PathRadiance_coop,
-			                Ray_coop,
-			                PathState_coop,
-			                Intersection_coop_shadow,
-			                ray_state);
-
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-
-		start_arg_index +=
-			kernel_set_args(ckPathTraceKernel_data_init,
-			                start_arg_index,
-			                start_sample,
-			                d_x,
-			                d_y,
-			                d_w,
-			                d_h,
-			                d_offset,
-			                d_stride,
-			                rtile.rng_state_offset_x,
-			                rtile.rng_state_offset_y,
-			                rtile.buffer_rng_state_stride,
-			                Queue_data,
-			                Queue_index,
-			                dQueue_size,
-			                use_queues_flag,
-			                work_array,
-#ifdef __WORK_STEALING__
-			                work_pool_wgs,
-			                num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-			                debugdata_coop,
-#endif
-			                num_parallel_samples);
-
-		kernel_set_args(ckPathTraceKernel_scene_intersect,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
-		                debugdata_coop,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(ckPathTraceKernel_lamp_emission,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-		                num_parallel_samples);
-
-		kernel_set_args(ckPathTraceKernel_queue_enqueue,
-		                0,
-		                Queue_data,
-		                Queue_index,
-		                ray_state,
-		                dQueue_size);
-
-		kernel_set_args(ckPathTraceKernel_background_buffer_update,
-		                 0,
-		                 kgbuffer,
-		                 d_data,
-		                 per_sample_output_buffers,
-		                 d_rng_state,
-		                 rng_coop,
-		                 throughput_coop,
-		                 PathRadiance_coop,
-		                 Ray_coop,
-		                 PathState_coop,
-		                 L_transparent_coop,
-		                 ray_state,
-		                 d_w,
-		                 d_h,
-		                 d_x,
-		                 d_y,
-		                 d_stride,
-		                 rtile.rng_state_offset_x,
-		                 rtile.rng_state_offset_y,
-		                 rtile.buffer_rng_state_stride,
-		                 work_array,
-		                 Queue_data,
-		                 Queue_index,
-		                 dQueue_size,
-		                 end_sample,
-		                 start_sample,
-#ifdef __WORK_STEALING__
-		                 work_pool_wgs,
-		                 num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-		                 debugdata_coop,
-#endif
-		                 num_parallel_samples);
-
-		kernel_set_args(ckPathTraceKernel_shader_eval,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                per_sample_output_buffers,
-		                rng_coop,
-		                throughput_coop,
-		                L_transparent_coop,
-		                PathRadiance_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                AOAlpha_coop,
-		                AOBSDF_coop,
-		                AOLightRay_coop,
-		                d_w,
-		                d_h,
-		                d_x,
-		                d_y,
-		                d_stride,
-		                ray_state,
-		                work_array,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-#ifdef __WORK_STEALING__
-		                start_sample,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(ckPathTraceKernel_direct_lighting,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                PathState_coop,
-		                ISLamp_coop,
-		                LightRay_coop,
-		                BSDFEval_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(ckPathTraceKernel_shadow_blocked,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                PathState_coop,
-		                LightRay_coop,
-		                AOLightRay_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(ckPathTraceKernel_next_iteration_setup,
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                LightRay_coop,
-		                ISLamp_coop,
-		                BSDFEval_coop,
-		                AOLightRay_coop,
-		                AOBSDF_coop,
-		                AOAlpha_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag);
-
-		kernel_set_args(ckPathTraceKernel_sum_all_radiance,
-		                0,
-		                d_data,
-		                d_buffer,
-		                per_sample_output_buffers,
-		                num_parallel_samples,
-		                d_w,
-		                d_h,
-		                d_stride,
-		                rtile.buffer_offset_x,
-		                rtile.buffer_offset_y,
-		                rtile.buffer_rng_state_stride,
-		                start_sample);
-
-		/* Macro for Enqueuing split kernels. */
-#define GLUE(a, b) a ## b
-#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
-		{ \
-			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
-			                               GLUE(ckPathTraceKernel_, \
-			                                    kernelName), \
-			                               2, \
-			                               NULL, \
-			                               globalSize, \
-			                               localSize, \
-			                               0, \
-			                               NULL, \
-			                               NULL); \
-			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
-			if(ciErr != CL_SUCCESS) { \
-				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
-				                               clewErrorString(ciErr)); \
-				opencl_error(message); \
-				return; \
-			} \
-		} (void) 0
-
-		/* Enqueue ckPathTraceKernel_data_init kernel. */
-		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
-		bool activeRaysAvailable = true;
-
-		/* Record number of time host intervention has been made */
-		unsigned int numHostIntervention = 0;
-		unsigned int numNextPathIterTimes = PathIteration_times;
-		bool canceled = false;
-		while(activeRaysAvailable) {
-			/* Twice the global work size of other kernels for
-			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
-			size_t global_size_shadow_blocked[2];
-			global_size_shadow_blocked[0] = global_size[0] * 2;
-			global_size_shadow_blocked[1] = global_size[1];
-
-			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
-			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
-				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
-				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-				if(task->get_cancel()) {
-					canceled = true;
-					break;
-				}
-			}
-
-			/* Read ray-state into Host memory to decide if we should exit
-			 * path-iteration in host.
-			 */
-			ciErr = clEnqueueReadBuffer(cqCommandQueue,
-			                            ray_state,
-			                            CL_TRUE,
-			                            0,
-			                            global_size[0] * global_size[1] * sizeof(char),
-			                            hostRayStateArray,
-			                            0,
-			                            NULL,
-			                            NULL);
-			assert(ciErr == CL_SUCCESS);
-
-			activeRaysAvailable = false;
-
-			for(int rayStateIter = 0;
-			    rayStateIter < global_size[0] * global_size[1];
-			    ++rayStateIter)
-			{
-				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
-					/* Not all rays are RAY_INACTIVE. */
-					activeRaysAvailable = true;
-					break;
-				}
-			}
-
-			if(activeRaysAvailable) {
-				numHostIntervention++;
-				PathIteration_times = PATH_ITER_INC_FACTOR;
-				/* Host intervention done before all rays become RAY_INACTIVE;
-				 * Set do more initial iterations for the next tile.
-				 */
-				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
-			}
-			if(task->get_cancel()) {
-				canceled = true;
-				break;
-			}
-		}
-
-		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
-		 * per_sample_output_buffers into RenderTile's output buffer.
-		 */
-		if(!canceled) {
-			size_t sum_all_radiance_local_size[2] = {16, 16};
-			size_t sum_all_radiance_global_size[2];
-			sum_all_radiance_global_size[0] =
-				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
-				sum_all_radiance_local_size[0];
-			sum_all_radiance_global_size[1] =
-				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
-				sum_all_radiance_local_size[1];
-			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-			                     sum_all_radiance_global_size,
-			                     sum_all_radiance_local_size);
-		}
-
-#undef ENQUEUE_SPLIT_KERNEL
-#undef GLUE
-
-		if(numHostIntervention == 0) {
-			/* This means that we are executing kernel more than required
-			 * Must avoid this for the next sample/tile.
-			 */
-			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
-			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
-		}
-		else {
-			/* Number of path-iterations done for this tile is set as
-			 * Initial path-iteration times for the next tile
-			 */
-			PathIteration_times = numNextPathIterTimes;
-		}
-
-		first_tile = false;
-	}
-
-	/* Calculates the amount of memory that has to be always
-	 * allocated in order for the split kernel to function.
-	 * This memory is tile/scene-property invariant (meaning,
-	 * the value returned by this function does not depend
-	 * on the user set tile size or scene properties.
-	 */
-	size_t get_invariable_mem_allocated()
-	{
-		size_t total_invariable_mem_allocated = 0;
-		size_t KernelGlobals_size = 0;
-
-		KernelGlobals_size = get_KernelGlobals_size();
-
-		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
-		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
-		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
-		return total_invariable_mem_allocated;
-	}
-
-	/* Calculate the memory that has-to-be/has-been allocated for
-	 * the split kernel to function.
-	 */
-	size_t get_tile_specific_mem_allocated(const int2 tile_size)
-	{
-		size_t tile_specific_mem_allocated = 0;
-
-		/* Get required tile info */
-		unsigned int user_set_tile_w = tile_size.x;
-		unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
-		/* Calculate memory to be allocated for work_pools in
-		 * case of work_stealing.
-		 */
-		size_t max_global_size[2];
-		size_t max_num_work_pools = 0;
-		max_global_size[0] =
-			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		max_global_size[1] =
-			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		max_num_work_pools =
-			(max_global_size[0] * max_global_size[1]) /
-			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
-		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
-#endif
-
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * sizeof(RNG);
-
-		return tile_specific_mem_allocated;
-	}
-
-	/* Calculates the texture memories and KernelData (d_data) memory
-	 * that has been allocated.
-	 */
-	size_t get_scene_specific_mem_allocated(cl_mem d_data)
-	{
-		size_t scene_specific_mem_allocated = 0;
-		/* Calculate texture memories. */
-#define KERNEL_TEX(type, ttype, name) \
-	scene_specific_mem_allocated += get_tex_size(#name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-		size_t d_data_size;
-		ciErr = clGetMemObjectInfo(d_data,
-		                           CL_MEM_SIZE,
-		                           sizeof(d_data_size),
-		                           &d_data_size,
-		                           NULL);
-		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
-		scene_specific_mem_allocated += d_data_size;
-		return scene_specific_mem_allocated;
-	}
-
-	/* Calculate the memory required for one thread in split kernel. */
-	size_t get_per_thread_memory()
-	{
-		size_t shaderdata_size = 0;
-		/* TODO(sergey): This will actually over-allocate if
-		 * particular kernel does not support multiclosure.
-		 */
-		shaderdata_size = get_shader_data_size(current_max_closure);
-		size_t retval = sizeof(RNG)
-			+ sizeof(float3)          /* Throughput size */
-			+ sizeof(float)           /* L transparent size */
-			+ sizeof(char)            /* Ray state size */
-			+ sizeof(unsigned int)    /* Work element size */
-			+ sizeof(int)             /* ISLamp_size */
-			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
-			+ sizeof(Intersection)    /* Overall isect */
-			+ sizeof(Intersection)    /* Instersection_coop_AO */
-			+ sizeof(Intersection)    /* Intersection coop DL */
-			+ shaderdata_size         /* Overall ShaderData */
-			+ (shaderdata_size * 2)   /* ShaderData : DL and shadow */
-			+ sizeof(Ray) + sizeof(BsdfEval)
-			+ sizeof(float3)          /* AOAlpha size */
-			+ sizeof(float3)          /* AOBSDF size */
-			+ sizeof(Ray)
-			+ (sizeof(int) * NUM_QUEUES)
-			+ per_thread_output_buffer_size;
-		return retval;
-	}
-
-	/* Considers the total memory available in the device and
-	 * and returns the maximum global work size possible.
-	 */
-	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
-	{
-		/* Calculate invariably allocated memory. */
-		size_t invariable_mem_allocated = get_invariable_mem_allocated();
-		/* Calculate tile specific allocated memory. */
-		size_t tile_specific_mem_allocated =
-			get_tile_specific_mem_allocated(tile_size);
-		/* Calculate scene specific allocated memory. */
-		size_t scene_specific_mem_allocated =
-			get_scene_specific_mem_allocated(d_data);
-		/* Calculate total memory available for the threads in global work size. */
-		size_t available_memory = total_allocatable_memory
-			- invariable_mem_allocated
-			- tile_specific_mem_allocated
-			- scene_specific_mem_allocated
-			- DATA_ALLOCATION_MEM_FACTOR;
-		size_t per_thread_memory_required = get_per_thread_memory();
-		return (available_memory / per_thread_memory_required);
-	}
-
-	/* Checks if the device has enough memory to render the whole tile;
-	 * If not, we should split single tile into multiple tiles of small size
-	 * and process them all.
-	 */
-	bool need_to_split_tile(unsigned int d_w,
-	                        unsigned int d_h,
-	                        int2 max_render_feasible_tile_size)
-	{
-		size_t global_size_estimate[2];
-		/* TODO(sergey): Such round-ups are in quite few places, need to replace
-		 * them with an utility macro.
-		 */
-		global_size_estimate[0] =
-			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		global_size_estimate[1] =
-			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if((global_size_estimate[0] * global_size_estimate[1]) >
-		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
-		{
-			return true;
-		}
-		else {
-			return false;
-		}
-	}
-
-	/* Considers the scene properties, global memory available in the device
-	 * and returns a rectanglular tile dimension (approx the maximum)
-	 * that should render on split kernel.
-	 */
-	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
-	{
-		int2 max_render_feasible_tile_size;
-		int square_root_val = (int)sqrt(feasible_global_work_size);
-		max_render_feasible_tile_size.x = square_root_val;
-		max_render_feasible_tile_size.y = square_root_val;
-		/* Ciel round-off max_render_feasible_tile_size. */
-		int2 ceil_render_feasible_tile_size;
-		ceil_render_feasible_tile_size.x =
-			(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		ceil_render_feasible_tile_size.y =
-			(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
-		   feasible_global_work_size)
-		{
-			return ceil_render_feasible_tile_size;
-		}
-		/* Floor round-off max_render_feasible_tile_size. */
-		int2 floor_render_feasible_tile_size;
-		floor_render_feasible_tile_size.x =
-			(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		floor_render_feasible_tile_size.y =
-			(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		return floor_render_feasible_tile_size;
-	}
-
-	/* Try splitting the current tile into multiple smaller
-	 * almost-square-tiles.
-	 */
-	int2 get_split_tile_size(RenderTile rtile,
-	                         int2 max_render_feasible_tile_size)
-	{
-		int2 split_tile_size;
-		int num_global_threads = max_render_feasible_tile_size.x *
-		                         max_render_feasible_tile_size.y;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		/* Ceil round off d_w and d_h */
-		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		while(d_w * d_h > num_global_threads) {
-			/* Halve the longer dimension. */
-			if(d_w >= d_h) {
-				d_w = d_w / 2;
-				d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_X;
-			}
-			else {
-				d_h = d_h / 2;
-				d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_Y;
-			}
-		}
-		split_tile_size.x = d_w;
-		split_tile_size.y = d_h;
-		return split_tile_size;
-	}
-
-	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
-	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
-	{
-		vector<SplitRenderTile> to_path_trace_rtile;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
-		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
-		/* Buffer and rng_state offset calc. */
-		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
-		size_t offset_x = offset_index % rtile.stride;
-		size_t offset_y = offset_index / rtile.stride;
-		/* Resize to_path_trace_rtile. */
-		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
-		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
-			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
-				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
-				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
-				to_path_trace_rtile[rtile_index].sample = rtile.sample;
-				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
-				to_path_trace_rtile[rtile_index].offset = rtile.offset;
-				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
-				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
-				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
-				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
-				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
-				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
-				/* Fill width and height of the new render tile. */
-				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
-					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
-					: split_tile_size.x;
-				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
-					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
-					: split_tile_size.y;
-				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
-			}
-		}
-		return to_path_trace_rtile;
-	}
-
-	void thread_run(DeviceTask *task)
-	{
-		if(task->type == DeviceTask::FILM_CONVERT) {
-			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-		}
-		else if(task->type == DeviceTask::SHADER) {
-			shader(*task);
-		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-			bool initialize_data_and_check_render_feasibility = false;
-			bool need_to_split_tiles_further = false;
-			int2 max_render_feasible_tile_size;
-			size_t feasible_global_work_size;
-			const int2 tile_size = task->requested_tile_size;
-			/* Keep rendering tiles until done. */
-			while(task->acquire_tile(this, tile)) {
-				if(!initialize_data_and_check_render_feasibility) {
-					/* Initialize data. */
-					/* Calculate per_thread_output_buffer_size. */
-					size_t output_buffer_size = 0;
-					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
-					                           CL_MEM_SIZE,
-					                           sizeof(output_buffer_size),
-					                           &output_buffer_size,
-					                           NULL);
-					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
-					/* This value is different when running on AMD and NV. */
-					if(background) {
-						/* In offline render the number of buffer elements
-						 * associated with tile.buffer is the current tile size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.w * tile.h);
-					}
-					else {
-						/* interactive rendering, unlike offline render, the number of buffer elements
-						 * associated with tile.buffer is the entire viewport size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.buffers->params.width *
-							                      tile.buffers->params.height);
-					}
-					/* Check render feasibility. */
-					feasible_global_work_size = get_feasible_global_work_size(
-						tile_size,
-						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
-					max_render_feasible_tile_size =
-						get_max_render_feasible_tile_size(
-							feasible_global_work_size);
-					need_to_split_tiles_further =
-						need_to_split_tile(tile_size.x,
-						                   tile_size.y,
-						                   max_render_feasible_tile_size);
-					initialize_data_and_check_render_feasibility = true;
-				}
-				if(need_to_split_tiles_further) {
-					int2 split_tile_size =
-						get_split_tile_size(tile,
-						                    max_render_feasible_tile_size);
-					vector<SplitRenderTile> to_path_trace_render_tiles =
-						split_tiles(tile, split_tile_size);
-					/* Print message to console */
-					if(background && (to_path_trace_render_tiles.size() > 1)) {
-						fprintf(stderr, "Message : Tiles need to be split "
-						        "further inside path trace (due to insufficient "
-						        "device-global-memory for split kernel to "
-						        "function) \n"
-						        "The current tile of dimensions %dx%d is split "
-						        "into tiles of dimension %dx%d for render \n",
-						        tile.w, tile.h,
-						        split_tile_size.x,
-						        split_tile_size.y);
-					}
-					/* Process all split tiles. */
-					for(int tile_iter = 0;
-					    tile_iter < to_path_trace_render_tiles.size();
-					    ++tile_iter)
-					{
-						path_trace(task,
-						           to_path_trace_render_tiles[tile_iter],
-						           max_render_feasible_tile_size);
-					}
-				}
-				else {
-					/* No splitting required; process the entire tile at once. */
-					/* Render feasible tile size is user-set-tile-size itself. */
-					max_render_feasible_tile_size.x =
-						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_X;
-					max_render_feasible_tile_size.y =
-						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_Y;
-					/* buffer_rng_state_stride is stride itself. */
-					SplitRenderTile split_tile(tile);
-					split_tile.buffer_rng_state_stride = tile.stride;
-					path_trace(task, split_tile, max_render_feasible_tile_size);
-				}
-				tile.sample = tile.start_sample + tile.num_samples;
-
-				/* Complete kernel execution before release tile. */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
-				task->release_tile(tile);
-			}
-		}
-	}
-
-protected:
-	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
-	{
-		cl_mem ptr;
-		assert(bufsize != 0);
-		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
-		opencl_assert_err(ciErr, "clCreateBuffer");
-		return ptr;
-	}
-
-	/* ** Those guys are for workign around some compiler-specific bugs ** */
-
-	cl_program load_cached_kernel(
-	        const DeviceRequestedFeatures& /*requested_features*/,
-	        OpenCLCache::ProgramName /*program_name*/,
-	        thread_scoped_lock /*cache_locker*/)
-	{
-		VLOG(2) << "Skip loading kernel from cache, "
-		        << "not supported by split kernel.";
-		return NULL;
-	}
+#include "device_intern.h"
 
-	void store_cached_kernel(cl_platform_id /*platform*/,
-	                         cl_device_id /*device*/,
-	                         cl_program /*program*/,
-	                         OpenCLCache::ProgramName /*program_name*/,
-	                         thread_scoped_lock& /*slot_locker*/)
-	{
-		VLOG(2) << "Skip storing kernel in cache, "
-		        << "not supported by split kernel.";
-	}
+#include "util_foreach.h"
+#include "util_logging.h"
 
-	string build_options_for_base_program(
-	        const DeviceRequestedFeatures& requested_features)
-	{
-		return requested_features.get_build_options();
-	}
-};
+CCL_NAMESPACE_BEGIN
 
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	vector<OpenCLPlatformDevice> usable_devices;
-	opencl_get_usable_devices(&usable_devices);
+	OpenCLInfo::get_usable_devices(&usable_devices);
 	assert(info.num < usable_devices.size());
 	const OpenCLPlatformDevice& platform_device = usable_devices[info.num];
 	const string& platform_name = platform_device.platform_name;
 	const cl_device_type device_type = platform_device.device_type;
-	if(opencl_kernel_use_split(platform_name, device_type)) {
+	if(OpenCLInfo::kernel_use_split(platform_name, device_type)) {
 		VLOG(1) << "Using split kernel.";
-		return new OpenCLDeviceSplitKernel(info, stats, background);
+		return opencl_create_split_device(info, stats, background);
 	} else {
 		VLOG(1) << "Using mega kernel.";
-		return new OpenCLDeviceMegaKernel(info, stats, background);
+		return opencl_create_mega_device(info, stats, background);
 	}
 }
 
@@ -3298,7 +52,7 @@ bool device_opencl_init(void)
 
 	initialized = true;
 
-	if(opencl_device_type() != 0) {
+	if(OpenCLInfo::device_type() != 0) {
 		int clew_result = clewInit();
 		if(clew_result == CLEW_SUCCESS) {
 			VLOG(1) << "CLEW initialization succeeded.";
@@ -3322,24 +76,29 @@ bool device_opencl_init(void)
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
 	vector<OpenCLPlatformDevice> usable_devices;
-	opencl_get_usable_devices(&usable_devices);
+	OpenCLInfo::get_usable_devices(&usable_devices);
 	/* Devices are numbered consecutively across platforms. */
 	int num_devices = 0;
 	foreach(OpenCLPlatformDevice& platform_device, usable_devices) {
 		const string& platform_name = platform_device.platform_name;
 		const cl_device_type device_type = platform_device.device_type;
 		const string& device_name = platform_device.device_name;
+		string hardware_id = platform_device.hardware_id;
+		if(hardware_id == "") {
+			hardware_id = string_printf("ID_%d", num_devices);
+		}
+
 		DeviceInfo info;
 		info.type = DEVICE_OPENCL;
 		info.description = string_remove_trademark(string(device_name));
 		info.num = num_devices;
-		info.id = string_printf("OPENCL_%d", info.num);
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
-		info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
+		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
 		info.pack_images = true;
-		info.use_split_kernel = opencl_kernel_use_split(platform_name,
-		                                                device_type);
+		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
+		                                                     device_type);
+		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
 		devices.push_back(info);
 		num_devices++;
 	}
@@ -3347,7 +106,7 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 
 string device_opencl_capabilities(void)
 {
-	if(opencl_device_type() == 0) {
+	if(OpenCLInfo::device_type() == 0) {
 		return "All OpenCL devices are forced to be OFF";
 	}
 	string result = "";
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 1f1128a..48d1803 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -19,6 +19,8 @@
 
 #include "device_task.h"
 
+#include "buffers.h"
+
 #include "util_algorithm.h"
 #include "util_time.h"
 
@@ -99,14 +101,18 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 	}
 }
 
-void DeviceTask::update_progress(RenderTile *rtile)
+void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
 {
 	if((type != PATH_TRACE) &&
 	   (type != SHADER))
 		return;
 
-	if(update_progress_sample)
-		update_progress_sample();
+	if(update_progress_sample) {
+		if(pixel_samples == -1) {
+			pixel_samples = shader_w;
+		}
+		update_progress_sample(pixel_samples, rtile? rtile->sample : 0);
+	}
 
 	if(update_tile_sample) {
 		double current_time = time_dt();
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8423e83..8bd54c3 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -56,10 +56,10 @@ public:
 	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);
 
-	void update_progress(RenderTile *rtile);
+	void update_progress(RenderTile *rtile, int pixel_samples = -1);
 
 	function<bool(Device *device, RenderTile&)> acquire_tile;
-	function<void(void)> update_progress_sample;
+	function<void(long, int)> update_progress_sample;
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
 	function<bool(void)> get_cancel;
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
new file mode 100644
index 0000000..4023ba8
--- /dev/null
+++ b/intern/cycles/device/opencl/opencl.h
@@ -0,0 +1,408 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "device.h"
+
+#include "util_map.h"
+#include "util_param.h"
+#include "util_string.h"
+
+#include "clew.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
+ * ray-bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes per chunk,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+struct OpenCLPlatformDevice {  /* One (platform, device) pair plus the names describing it. */
+	OpenCLPlatformDevice(cl_platform_id platform_id,
+	                     const string& platform_name,
+	                     cl_device_id device_id,
+	                     cl_device_type device_type,
+	                     const string& device_name,
+	                     const string& hardware_id)
+	  : platform_id(platform_id),
+	    platform_name(platform_name),
+	    device_id(device_id),
+	    device_type(device_type),
+	    device_name(device_name),
+	    hardware_id(hardware_id) {}
+	cl_platform_id platform_id;  /* OpenCL platform handle. */
+	string platform_name;        /* Compared against vendor strings when picking build options. */
+	cl_device_id device_id;      /* OpenCL device handle. */
+	cl_device_type device_type;  /* OpenCL device type value for this device. */
+	string device_name;          /* Used for the user-visible device description. */
+	string hardware_id;          /* May be empty; device_opencl_info() then substitutes "ID_<n>". */
+};
+
+/* Contains all static OpenCL helper functions. */
+class OpenCLInfo
+{
+public:
+	static cl_device_type device_type();
+	static bool use_debug();
+	static bool kernel_use_advanced_shading(const string& platform_name);
+	static bool kernel_use_split(const string& platform_name,
+	                             const cl_device_type device_type);
+	static bool device_supported(const string& platform_name,
+	                             const cl_device_id device_id);
+	static bool platform_version_check(cl_platform_id platform,
+	                                   string *error = NULL);
+	static bool device_version_check(cl_device_id device,
+	                                 string *error = NULL);
+	static string get_hardware_id(string platform_name,
+	                              cl_device_id device_id);
+	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
+	                               bool force_all = false);
+};
+
+/* Thread safe cache for contexts and programs.
+ */
+class OpenCLCache
+{
+	struct Slot
+	{
+		struct ProgramEntry
+		{
+			ProgramEntry();
+			ProgramEntry(const ProgramEntry& rhs);
+			~ProgramEntry();
+			cl_program program;
+			thread_mutex *mutex;
+		};
+
+		Slot();
+		Slot(const Slot& rhs);
+		~Slot();
+
+		thread_mutex *context_mutex;
+		cl_context context;
+		typedef map<ustring, ProgramEntry> EntryMap;
+		EntryMap programs;
+
+	};
+
+	/* key is combination of platform ID and device ID */
+	typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
+
+	/* map of Slot objects */
+	typedef map<PlatformDevicePair, Slot> CacheMap;
+	CacheMap cache;
+
+	/* MD5 hash of the kernel source. */
+	string kernel_md5;
+
+	thread_mutex cache_lock;
+	thread_mutex kernel_md5_lock;
+
+	/* lazy instantiate */
+	static OpenCLCache& global_instance();
+
+public:
+
+	enum ProgramName {
+		OCL_DEV_BASE_PROGRAM,
+		OCL_DEV_MEGAKERNEL_PROGRAM,
+	};
+
+	/* Lookup context in the cache. If this returns NULL, slot_locker
+	 * will be holding a lock for the cache. slot_locker should refer to a
+	 * default constructed thread_scoped_lock. */
+	static cl_context get_context(cl_platform_id platform,
+	                              cl_device_id device,
+	                              thread_scoped_lock& slot_locker);
+	/* Same as above. */
+	static cl_program get_program(cl_platform_id platform,
+	                              cl_device_id device,
+	                              ustring key,
+	                              thread_scoped_lock& slot_locker);
+
+	/* Store context in the cache. You MUST have tried to get the item before storing to it. */
+	static void store_context(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_context context,
+	                          thread_scoped_lock& slot_locker);
+	/* Same as above. */
+	static void store_program(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_program program,
+	                          ustring key,
+	                          thread_scoped_lock& slot_locker);
+
+	static string get_kernel_md5();
+};
+
+#define opencl_assert(stmt) \
+	{ \
+		cl_int err = stmt; \
+		\
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+			if(error_msg == "") \
+				error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
+class OpenCLDeviceBase : public Device
+{
+public:
+	DedicatedTaskPool task_pool;
+	cl_context cxContext;
+	cl_command_queue cqCommandQueue;
+	cl_platform_id cpPlatform;
+	cl_device_id cdDevice;
+	cl_int ciErr;
+
+	class OpenCLProgram {
+	public:
+		OpenCLProgram() : loaded(false), device(NULL) {}
+		OpenCLProgram(OpenCLDeviceBase *device,
+		              string program_name,
+		              string kernel_name,
+		              string kernel_build_options,
+		              bool use_stdout = true);
+		~OpenCLProgram();
+
+		void add_kernel(ustring name);
+		void load();
+
+		bool is_loaded()    { return loaded; }
+		string get_log()    { return log; }
+		void report_error();
+
+		cl_kernel operator()();
+		cl_kernel operator()(ustring name);
+
+		void release();
+
+	private:
+		bool build_kernel(const string *debug_src);
+		bool compile_kernel(const string *debug_src);
+		bool load_binary(const string& clbin, const string *debug_src = NULL);
+		bool save_binary(const string& clbin);
+
+		void add_log(string msg, bool is_debug);
+		void add_error(string msg);
+
+		bool loaded;
+		cl_program program;
+		OpenCLDeviceBase *device;
+
+		/* Used for the OpenCLCache key. */
+		string program_name;
+
+		string kernel_file, kernel_build_options, device_md5;
+
+		bool use_stdout;
+		string log, error_msg;
+		string compile_output;
+
+		map<ustring, cl_kernel> kernels;
+	};
+
+	OpenCLProgram base_program;
+
+	typedef map<string, device_vector<uchar>*> ConstMemMap;
+	typedef map<string, device_ptr> MemMap;
+
+	ConstMemMap const_mem_map;
+	MemMap mem_map;
+	device_ptr null_mem;
+
+	bool device_initialized;
+	string platform_name;
+
+	bool opencl_error(cl_int err);
+	void opencl_error(const string& message);
+	void opencl_assert_err(cl_int err, const char* where);
+
+	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_);
+	~OpenCLDeviceBase();
+
+	static void CL_CALLBACK context_notify_callback(const char *err_info,
+		const void * /*private_info*/, size_t /*cb*/, void *user_data);
+
+	bool opencl_version_check();
+
+	string device_md5_hash(string kernel_custom_build_options = "");
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+
+	/* Has to be implemented by the real device classes.
+	 * The base device will then load all these programs. */
+	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLProgram*> &programs) = 0;
+
+	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_copy_to(device_memory& mem);
+	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
+	void mem_zero(device_memory& mem);
+	void mem_free(device_memory& mem);
+	void const_copy_to(const char *name, void *host, size_t size);
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType /*interpolation*/,
+	               ExtensionType /*extension*/);
+	void tex_free(device_memory& mem);
+
+	size_t global_size_round_up(int group_size, int global_size);
+	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+
+	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
+	void shader(DeviceTask& task);
+
+	class OpenCLDeviceTask : public DeviceTask {
+	public:
+		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+		: DeviceTask(task)
+		{
+			run = function_bind(&OpenCLDeviceBase::thread_run,
+			                    device,
+			                    this);
+		}
+	};
+
+	int get_split_task_count(DeviceTask& /*task*/)
+	{
+		return 1;
+	}
+
+	void task_add(DeviceTask& task)
+	{
+		task_pool.push(new OpenCLDeviceTask(this, task));
+	}
+
+	void task_wait()
+	{
+		task_pool.wait();
+	}
+
+	void task_cancel()
+	{
+		task_pool.cancel();
+	}
+
+	virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+	string kernel_build_options(const string *debug_src = NULL);
+
+	class ArgumentWrapper {
+	public:
+		ArgumentWrapper() : size(0), pointer(NULL) {}
+		template <typename T>
+		ArgumentWrapper(T& argument) : size(sizeof(argument)),
+		                               pointer(&argument) { }
+		ArgumentWrapper(int argument) : size(sizeof(int)),
+		                                int_value(argument),
+		                                pointer(&int_value) { }
+		ArgumentWrapper(float argument) : size(sizeof(float)),
+		                                  float_value(argument),
+		                                  pointer(&float_value) { }
+		size_t size;
+		int int_value;
+		float float_value;
+		void *pointer;
+	};
+
+	/* TODO(sergey): In the future we can use variadic templates, once
+	 * C++0x is allowed. Should allow to clean this up a bit.
+	 */
+	int kernel_set_args(cl_kernel kernel,
+	                    int start_argument_index,
+	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg33 = ArgumentWrapper());
+
+	void release_kernel_safe(cl_kernel kernel);
+	void release_mem_object_safe(cl_mem mem);
+	void release_program_safe(cl_program program);
+
+	/* ** These functions are for working around some compiler-specific bugs ** */
+
+	virtual cl_program load_cached_kernel(
+	        ustring key,
+	        thread_scoped_lock& cache_locker);
+
+	virtual void store_cached_kernel(
+	        cl_program program,
+	        ustring key,
+	        thread_scoped_lock& cache_locker);
+
+	virtual string build_options_for_base_program(
+	        const DeviceRequestedFeatures& /*requested_features*/);
+};
+
+Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
+Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background);
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
new file mode 100644
index 0000000..a2b9003
--- /dev/null
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -0,0 +1,741 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "kernel_types.h"
+
+#include "util_foreach.h"
+#include "util_logging.h"
+#include "util_md5.h"
+#include "util_path.h"
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool OpenCLDeviceBase::opencl_error(cl_int err)
+{
+	/* Returns true on failure. Only the first failure is kept in error_msg. */
+	if(err == CL_SUCCESS)
+		return false;
+
+	const string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
+	if(error_msg == "")
+		error_msg = message;
+	fprintf(stderr, "%s\n", message.c_str());
+	return true;
+}
+
+void OpenCLDeviceBase::opencl_error(const string& message)
+{
+	if(error_msg == "")  /* Keep only the first reported error. */
+		error_msg = message;
+	fprintf(stderr, "%s\n", message.c_str());  /* Every message is still echoed to stderr. */
+}
+
+void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
+{
+	if(err == CL_SUCCESS)  /* Successful calls need no handling. */
+		return;
+	const string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
+	if(error_msg == "")
+		error_msg = message;
+	fprintf(stderr, "%s\n", message.c_str());
+#ifndef NDEBUG
+	abort();  /* Builds without NDEBUG stop at the first failing call. */
+#endif
+}
+
+OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
+: Device(info, stats, background_)
+{
+	cpPlatform = NULL;
+	cdDevice = NULL;
+	cxContext = NULL;
+	cqCommandQueue = NULL;
+	null_mem = 0;
+	device_initialized = false;
+	/* Resolve info.num against the usable devices; a bad index is an error, not only an assert. */
+	vector<OpenCLPlatformDevice> usable_devices;
+	OpenCLInfo::get_usable_devices(&usable_devices);
+	if((size_t)info.num >= usable_devices.size()) {
+		opencl_error(usable_devices.empty()? "OpenCL: no devices found."
+		                                   : "OpenCL: device index out of range.");
+		return;
+	}
+	OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+	cpPlatform = platform_device.platform_id;
+	cdDevice = platform_device.device_id;
+	platform_name = platform_device.platform_name;
+	VLOG(2) << "Creating new Cycles device for OpenCL platform "
+	        << platform_name << ", device "
+	        << platform_device.device_name << ".";
+
+	{
+		/* try to use cached context */
+		thread_scoped_lock cache_locker;
+		cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
+
+		if(cxContext == NULL) {
+			/* create context properties array to specify platform */
+			const cl_context_properties context_props[] = {
+				CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
+				0, 0
+			};
+
+			/* create context; cdDevice is handed to the notify callback as user data */
+			cxContext = clCreateContext(context_props, 1, &cdDevice,
+				context_notify_callback, cdDevice, &ciErr);
+
+			if(opencl_error(ciErr)) {
+				opencl_error("OpenCL: clCreateContext failed");
+				return;
+			}
+
+			/* cache it */
+			OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
+		}
+	}
+
+	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
+	if(opencl_error(ciErr))
+		return;
+	/* 1-byte placeholder buffer used wherever a zero-sized or missing buffer is needed. */
+	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
+	if(opencl_error(ciErr))
+		return;
+
+	fprintf(stderr, "Device init success\n");
+	device_initialized = true;
+}
+
+OpenCLDeviceBase::~OpenCLDeviceBase()
+{
+	task_pool.stop();
+
+	if(null_mem)
+		clReleaseMemObject(CL_MEM_PTR(null_mem));
+
+	/* Free each constant block's device buffer, then its host-side vector. */
+	for(ConstMemMap::iterator it = const_mem_map.begin(); it != const_mem_map.end(); ++it) {
+		mem_free(*(it->second));
+		delete it->second;
+	}
+
+	base_program.release();
+	if(cqCommandQueue)
+		clReleaseCommandQueue(cqCommandQueue);
+	if(cxContext)
+		clReleaseContext(cxContext);
+}
+
+void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
+	const void * /*private_info*/, size_t /*cb*/, void *user_data)
+{
+	/* user_data is the device handle; name starts empty so a failed query prints no garbage. */
+	char name[256] = "";
+	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+}
+
+bool OpenCLDeviceBase::opencl_version_check()
+{
+	/* The platform is checked first; the device check only runs if it passes. */
+	string error;
+	const bool supported = OpenCLInfo::platform_version_check(cpPlatform, &error) &&
+	                       OpenCLInfo::device_version_check(cdDevice, &error);
+
+	if(!supported) {
+		opencl_error(error);
+	}
+
+	return supported;
+}
+
+string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
+{
+	MD5Hash md5;
+	/* Buffers start empty so a failed query hashes "" instead of reading uninitialized memory. */
+	char version[256] = "", driver[256] = "", name[256] = "", vendor[256] = "";
+	clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
+	clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+	clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+	clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
+
+	md5.append((uint8_t*)vendor, strlen(vendor));
+	md5.append((uint8_t*)version, strlen(version));
+	md5.append((uint8_t*)name, strlen(name));
+	md5.append((uint8_t*)driver, strlen(driver));
+	/* The build options (common plus custom) are part of the hash as well. */
+	string options = kernel_build_options();
+	options += kernel_custom_build_options;
+	md5.append((uint8_t*)options.c_str(), options.size());
+
+	return md5.get_hex();
+}
+
+bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+	/* Verify that the constructor finished initializing the device. */
+	if(!device_initialized) {
+		fprintf(stderr, "OpenCL: failed to initialize device.\n");
+		return false;
+	}
+
+	/* Verify that platform and device report a supported OpenCL version. */
+	if(!opencl_version_check())
+		return false;
+	/* The base program holds the film conversion, shader and bake kernels used by this class. */
+	base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
+	base_program.add_kernel(ustring("convert_to_byte"));
+	base_program.add_kernel(ustring("convert_to_half_float"));
+	base_program.add_kernel(ustring("shader"));
+	base_program.add_kernel(ustring("bake"));
+
+	vector<OpenCLProgram*> programs;
+	programs.push_back(&base_program);
+	/* Call actual class to fill the vector with its programs. */
+	load_kernels(requested_features, programs);
+
+	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
+	 * serialize the calls internally, so it's not much use right now.
+	 * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
+	 * should be set to false as well. */
+#if 0
+	TaskPool task_pool;
+	foreach(OpenCLProgram *program, programs) {
+		task_pool.push(function_bind(&OpenCLProgram::load, program));
+	}
+	task_pool.wait_work();
+
+	foreach(OpenCLProgram *program, programs) {
+		VLOG(2) << program->get_log();
+		if(!program->is_loaded()) {
+			program->report_error();
+			return false;
+		}
+	}
+#else
+	foreach(OpenCLProgram *program, programs) {  /* Sequential path: stop at the first program that fails. */
+		program->load();
+		if(!program->is_loaded()) {
+			return false;
+		}
+	}
+#endif
+
+	return true;
+}
+
+void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+{
+	const size_t size = mem.memory_size();
+
+	/* Translate the Cycles memory type into OpenCL access flags;
+	 * anything that is not read-only or write-only becomes read-write. */
+	cl_mem_flags mem_flag = CL_MEM_READ_WRITE;
+	if(type == MEM_READ_ONLY)
+		mem_flag = CL_MEM_READ_ONLY;
+	else if(type == MEM_WRITE_ONLY)
+		mem_flag = CL_MEM_WRITE_ONLY;
+
+	/* Zero-size allocation might be invoked by render, but is not really
+	 * supported by OpenCL. Using NULL as device pointer also doesn't really
+	 * work for some reason, so for the time being such requests are served
+	 * with the shared null_mem buffer.
+	 */
+	if(size == 0) {
+		mem.device_pointer = null_mem;
+	}
+	else {
+		/* No host pointer is supplied; contents are uploaded separately (see mem_copy_to()). */
+		mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
+		                                                mem_flag,
+		                                                size,
+		                                                NULL,
+		                                                &ciErr);
+		opencl_assert_err(ciErr, "clCreateBuffer");
+	}
+
+	/* Record the size so mem_free() can undo the accounting. */
+	stats.mem_alloc(size);
+	mem.device_size = size;
+}
+
+void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
+{
+	const size_t size = mem.memory_size();
+	if(size == 0)
+		return;
+	/* Blocking write (CL_TRUE) of the whole host buffer; zero-sized memory was skipped above. */
+	opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+	                                   CL_MEM_PTR(mem.device_pointer),
+	                                   CL_TRUE,
+	                                   0,
+	                                   size,
+	                                   (void*)mem.data_pointer,
+	                                   0,
+	                                   NULL, NULL));
+}
+
+void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+{
+	size_t offset = (size_t)elem*y*w;  /* Byte offset of row y; widened first so int math cannot overflow. */
+	size_t size = (size_t)elem*w*h;    /* Byte count of h rows of w elements, elem bytes each. */
+	assert(size != 0);
+	opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
+	                                  CL_MEM_PTR(mem.device_pointer),
+	                                  CL_TRUE,
+	                                  offset,
+	                                  size,
+	                                  (uchar*)mem.data_pointer + offset,
+	                                  0,
+	                                  NULL, NULL));
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+	if(mem.device_pointer) {  /* Only memory that has a device pointer is cleared. */
+		memset((void*)mem.data_pointer, 0, mem.memory_size());  /* Zero the host copy... */
+		mem_copy_to(mem);  /* ...then upload it to the device. */
+	}
+}
+
+void OpenCLDeviceBase::mem_free(device_memory& mem)
+{
+	if(!mem.device_pointer)
+		return;
+	/* The shared null_mem placeholder is released by the destructor, never here. */
+	if(mem.device_pointer != null_mem) {
+		opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+	}
+	mem.device_pointer = 0;
+	stats.mem_free(mem.device_size);
+	mem.device_size = 0;
+}
+
+void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
+{
+	ConstMemMap::iterator entry = const_mem_map.find(name);
+	const bool is_new = (entry == const_mem_map.end());
+
+	/* First use of this name creates the host-side vector that backs it. */
+	device_vector<uchar> *data = is_new ? new device_vector<uchar>() : entry->second;
+	data->copy((uchar*)host, size);
+
+	/* NOTE(review): an existing entry is not re-allocated on the device - confirm size never changes. */
+	if(is_new) {
+		mem_alloc(*data, MEM_READ_ONLY);
+		entry = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
+	}
+
+	/* New or existing, push the refreshed contents to the device. */
+	mem_copy_to(*entry->second);
+}
+
+void OpenCLDeviceBase::tex_alloc(const char *name,
+               device_memory& mem,
+               InterpolationType /*interpolation*/,
+               ExtensionType /*extension*/)
+{
+	VLOG(1) << "Texture allocate: " << name << ", "
+	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+	        << string_human_readable_size(mem.memory_size()) << ")";
+	mem_alloc(mem, MEM_READ_ONLY);  /* Textures are allocated read-only on the device. */
+	mem_copy_to(mem);
+	assert(mem_map.find(name) == mem_map.end());  /* A name is expected to be registered only once. */
+	mem_map.insert(MemMap::value_type(name, mem.device_pointer));  /* Looked up by set_kernel_arg_mem(). */
+}
+
+void OpenCLDeviceBase::tex_free(device_memory& mem)
+{
+	if(!mem.device_pointer)
+		return;
+	/* Drop the first name registered for this device pointer, if any, then free the memory. */
+	for(MemMap::iterator it = mem_map.begin(); it != mem_map.end(); ++it) {
+		if(it->second == mem.device_pointer) {
+			mem_map.erase(it);
+			break;
+		}
+	}
+	mem_free(mem);
+}
+
+size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
+{
+	const int remainder = global_size % group_size;  /* 0 when already a multiple of group_size. */
+	return (remainder == 0)? global_size: global_size + (group_size - remainder);
+}
+
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+{
+	size_t workgroup_size, max_work_items[3];
+	/* Enqueue `kernel` over a 2D range covering at least w x h work items, then flush the queue. */
+	clGetKernelWorkGroupInfo(kernel, cdDevice,
+		CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
+	clGetDeviceInfo(cdDevice,
+		CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+
+	/* Try to divide evenly over 2 dimensions. */
+	size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+	size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+
+	/* Some implementations have max size 1 on 2nd dimension. */
+	if(local_size[1] > max_work_items[1]) {
+		local_size[0] = workgroup_size/max_work_items[1];
+		local_size[1] = max_work_items[1];
+	}
+
+	size_t global_size[2] = {global_size_round_up(local_size[0], w),
+	                         global_size_round_up(local_size[1], h)};
+
+	/* Vertical size of 1 is coming from bake/shade kernels where we should
+	 * not round anything up because otherwise we'll either be doing too
+	 * much work per pixel (if we don't check global ID on Y axis) or will
+	 * be checking for global ID to always have Y of 0.
+	 */
+	if(h == 1) {
+		global_size[1] = 1;  /* Index 1 is the Y axis; do not index with the height value itself. */
+	}
+
+	/* Run kernel. local_size is only used for rounding above; NULL is passed as the local work size. */
+	opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+	opencl_assert(clFlush(cqCommandQueue));
+}
+
+void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
+{
+	/* Look up the buffer registered under this name by tex_alloc(). */
+	MemMap::iterator entry = mem_map.find(name);
+
+	/* Work around NULL not working, even though the spec says otherwise:
+	 * names without a buffer are bound to the null_mem placeholder instead.
+	 */
+	cl_mem ptr = (entry != mem_map.end())? CL_MEM_PTR(entry->second)
+	                                     : CL_MEM_PTR(null_mem);
+
+	/* Bind at the caller's current argument index and advance it
+	 * so the next call sets the following argument. */
+	opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
+}
+
+void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
+{
+	/* Cast arguments to cl types; the output is rgba_byte when set, rgba_half otherwise. */
+	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+	cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
+	cl_mem d_buffer = CL_MEM_PTR(buffer);
+	cl_int d_x = task.x;
+	cl_int d_y = task.y;
+	cl_int d_w = task.w;
+	cl_int d_h = task.h;
+	cl_float d_sample_scale = 1.0f/(task.sample + 1);
+	cl_int d_offset = task.offset;
+	cl_int d_stride = task.stride;
+
+	/* The kernel is chosen to match the output format selected above. */
+	cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
+
+	cl_uint start_arg_index =
+		kernel_set_args(ckFilmConvertKernel,
+		                0,
+		                d_data,
+		                d_rgba,
+		                d_buffer);
+	/* Presumably kernel_textures.h invokes KERNEL_TEX once per texture; each use binds one memory argument. */
+#define KERNEL_TEX(type, ttype, name) \
+set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+	/* Remaining scalar arguments follow the texture arguments. */
+	start_arg_index += kernel_set_args(ckFilmConvertKernel,
+	                                   start_arg_index,
+	                                   d_sample_scale,
+	                                   d_x,
+	                                   d_y,
+	                                   d_w,
+	                                   d_h,
+	                                   d_offset,
+	                                   d_stride);
+
+	enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
+}
+
+void OpenCLDeviceBase::shader(DeviceTask& task)
+{
+	/* cast arguments to cl types */
+	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+	cl_mem d_input = CL_MEM_PTR(task.shader_input);
+	cl_mem d_output = CL_MEM_PTR(task.shader_output);
+	cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
+	cl_int d_shader_eval_type = task.shader_eval_type;
+	cl_int d_shader_filter = task.shader_filter;
+	cl_int d_shader_x = task.shader_x;
+	cl_int d_shader_w = task.shader_w;
+	cl_int d_offset = task.offset;
+
+	cl_kernel kernel;
+	/* Eval types from SHADER_EVAL_BAKE upwards use the "bake" kernel, all others "shader". */
+	if(task.shader_eval_type >= SHADER_EVAL_BAKE)
+		kernel = base_program(ustring("bake"));
+	else
+		kernel = base_program(ustring("shader"));
+
+	cl_uint start_arg_index =
+		kernel_set_args(kernel,
+		                0,
+		                d_data,
+		                d_input,
+		                d_output);
+	/* Only the "shader" kernel is given the extra luma output argument. */
+	if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_output_luma);
+	}
+	/* Presumably kernel_textures.h invokes KERNEL_TEX once per texture; each use binds one memory argument. */
+#define KERNEL_TEX(type, ttype, name) \
+	set_kernel_arg_mem(kernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+	start_arg_index += kernel_set_args(kernel,
+	                                   start_arg_index,
+	                                   d_shader_eval_type);
+	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {  /* Only the "bake" kernel is given the filter argument. */
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_shader_filter);
+	}
+	start_arg_index += kernel_set_args(kernel,
+	                                   start_arg_index,
+	                                   d_shader_x,
+	                                   d_shader_w,
+	                                   d_offset);
+	/* One kernel launch per sample; start_arg_index now points at the sample argument. */
+	for(int sample = 0; sample < task.num_samples; sample++) {
+
+		if(task.get_cancel())
+			break;
+
+		kernel_set_args(kernel, start_arg_index, sample);  /* Index is not advanced: re-set on each pass. */
+
+		enqueue_kernel(kernel, task.shader_w, 1);
+
+		clFinish(cqCommandQueue);  /* Wait for this sample before reporting progress. */
+
+		task.update_progress(NULL);
+	}
+}
+
+string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
+{
+	/* Assemble the OpenCL compiler flags for this device's platform; debug_src,
+	 * when non-NULL, is the source path given to "-s" on the Intel platform.
+	 */
+	string build_options = "-cl-fast-relaxed-math ";
+
+	if(platform_name == "NVIDIA CUDA") {
+		build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
+		                 "-cl-nv-maxrregcount=32 "
+		                 "-cl-nv-verbose ";
+
+		/* Start from 0 so a failed query yields a defined value, not garbage. */
+		cl_uint compute_capability_major = 0, compute_capability_minor = 0;
+		clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+		                sizeof(cl_uint), &compute_capability_major, NULL);
+		clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+		                sizeof(cl_uint), &compute_capability_minor, NULL);
+
+		build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
+		                               compute_capability_major * 100 +
+		                               compute_capability_minor * 10);
+	}
+	else if(platform_name == "Apple")
+		build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+	else if(platform_name == "AMD Accelerated Parallel Processing")
+		build_options += "-D__KERNEL_OPENCL_AMD__ ";
+	else if(platform_name == "Intel(R) OpenCL") {
+		build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
+
+		/* Options for gdb source level kernel debugging.
+		 * this segfaults on linux currently.
+		 */
+		if(OpenCLInfo::use_debug() && debug_src)
+			build_options += "-g -s \"" + *debug_src + "\" ";
+	}
+
+	if(OpenCLInfo::use_debug())
+		build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+
+#ifdef WITH_CYCLES_DEBUG
+	build_options += "-D__KERNEL_DEBUG__ ";
+#endif
+	return build_options;
+}
+
+/* Sets consecutive kernel arguments from start_argument_index until the first
+ * wrapper with a NULL pointer and returns how many were set. TODO(sergey): use
+ * variadic templates once C++0x is allowed; should allow to clean this up. */
+int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
+                    int start_argument_index,
+                    const ArgumentWrapper& arg1,
+                    const ArgumentWrapper& arg2,
+                    const ArgumentWrapper& arg3,
+                    const ArgumentWrapper& arg4,
+                    const ArgumentWrapper& arg5,
+                    const ArgumentWrapper& arg6,
+                    const ArgumentWrapper& arg7,
+                    const ArgumentWrapper& arg8,
+                    const ArgumentWrapper& arg9,
+                    const ArgumentWrapper& arg10,
+                    const ArgumentWrapper& arg11,
+                    const ArgumentWrapper& arg12,
+                    const ArgumentWrapper& arg13,
+                    const ArgumentWrapper& arg14,
+                    const ArgumentWrapper& arg15,
+                    const ArgumentWrapper& arg16,
+                    const ArgumentWrapper& arg17,
+                    const ArgumentWrapper& arg18,
+                    const ArgumentWrapper& arg19,
+                    const ArgumentWrapper& arg20,
+                    const ArgumentWrapper& arg21,
+                    const ArgumentWrapper& arg22,
+                    const ArgumentWrapper& arg23,
+                    const ArgumentWrapper& arg24,
+                    const ArgumentWrapper& arg25,
+                    const ArgumentWrapper& arg26,
+                    const ArgumentWrapper& arg27,
+                    const ArgumentWrapper& arg28,
+                    const ArgumentWrapper& arg29,
+                    const ArgumentWrapper& arg30,
+                    const ArgumentWrapper& arg31,
+                    const ArgumentWrapper& arg32,
+                    const ArgumentWrapper& arg33)
+{
+	int current_arg_index = 0;  /* Count of arguments set so far; also the offset of the next slot. */
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+	do { \
+		if(arg.pointer != NULL) { \
+			opencl_assert(clSetKernelArg( \
+				kernel, \
+				start_argument_index + current_arg_index, \
+				arg.size, arg.pointer)); \
+			++current_arg_index; \
+		} \
+		else { \
+			return current_arg_index; \
+		} \
+	} while(false)
+	FAKE_VARARG_HANDLE_ARG(arg1);
+	FAKE_VARARG_HANDLE_ARG(arg2);
+	FAKE_VARARG_HANDLE_ARG(arg3);
+	FAKE_VARARG_HANDLE_ARG(arg4);
+	FAKE_VARARG_HANDLE_ARG(arg5);
+	FAKE_VARARG_HANDLE_ARG(arg6);
+	FAKE_VARARG_HANDLE_ARG(arg7);
+	FAKE_VARARG_HANDLE_ARG(arg8);
+	FAKE_VARARG_HANDLE_ARG(arg9);
+	FAKE_VARARG_HANDLE_ARG(arg10);
+	FAKE_VARARG_HANDLE_ARG(arg11);
+	FAKE_VARARG_HANDLE_ARG(arg12);
+	FAKE_VARARG_HANDLE_ARG(arg13);
+	FAKE_VARARG_HANDLE_ARG(arg14);
+	FAKE_VARARG_HANDLE_ARG(arg15);
+	FAKE_VARARG_HANDLE_ARG(arg16);
+	FAKE_VARARG_HANDLE_ARG(arg17);
+	FAKE_VARARG_HANDLE_ARG(arg18);
+	FAKE_VARARG_HANDLE_ARG(arg19);
+	FAKE_VARARG_HANDLE_ARG(arg20);
+	FAKE_VARARG_HANDLE_ARG(arg21);
+	FAKE_VARARG_HANDLE_ARG(arg22);
+	FAKE_VARARG_HANDLE_ARG(arg23);
+	FAKE_VARARG_HANDLE_ARG(arg24);
+	FAKE_VARARG_HANDLE_ARG(arg25);
+	FAKE_VARARG_HANDLE_ARG(arg26);
+	FAKE_VARARG_HANDLE_ARG(arg27);
+	FAKE_VARARG_HANDLE_ARG(arg28);
+	FAKE_VARARG_HANDLE_ARG(arg29);
+	FAKE_VARARG_HANDLE_ARG(arg30);
+	FAKE_VARARG_HANDLE_ARG(arg31);
+	FAKE_VARARG_HANDLE_ARG(arg32);
+	FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+	return current_arg_index;
+}
+
+void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
+{
+	/* A NULL handle has nothing to release; skip the OpenCL call for it. */
+	if(kernel != NULL)
+		clReleaseKernel(kernel);
+}
+
+void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
+{
+	/* A NULL handle has nothing to release; skip the OpenCL call for it. */
+	if(mem)
+		clReleaseMemObject(mem);
+}
+
+void OpenCLDeviceBase::release_program_safe(cl_program program)
+{
+	/* A NULL handle has nothing to release; skip the OpenCL call for it. */
+	if(program != NULL)
+		clReleaseProgram(program);
+}
+
+/* ** These helpers are for working around some compiler-specific bugs ** */
+
+cl_program OpenCLDeviceBase::load_cached_kernel(
+        ustring key,
+        thread_scoped_lock& cache_locker)
+{
+	/* Ask OpenCLCache for the program stored under key for this
+	 * platform/device pair; cache_locker is forwarded unchanged. */
+	cl_program cached = OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
+	return cached;
+}
+
+void OpenCLDeviceBase::store_cached_kernel(
+        cl_program program,
+        ustring key,
+        thread_scoped_lock& cache_locker)
+{
+	/* Hand the program to OpenCLCache under key for this platform/device
+	 * pair; cache_locker is forwarded unchanged.
+	 */
+	OpenCLCache::store_program(
+	        cpPlatform, cdDevice, program, key, cache_locker);
+}
+
+string OpenCLDeviceBase::build_options_for_base_program(
+        const DeviceRequestedFeatures& /*requested_features*/)
+{
+	/* TODO(sergey): All features are compiled in by default, so the
+	 * mega kernel gets no feature-based optimizations.
+	 *
+	 * Ideally the kernel would always be compiled with as few features
+	 * enabled as possible, to keep performance at its maximum.
+	 */
+	return string();
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
new file mode 100644
index 0000000..6ea7619
--- /dev/null
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "buffers.h"
+
+#include "kernel_types.h"
+
+#include "util_md5.h"
+#include "util_path.h"
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase  /* OpenCL device that renders tiles with the single "path_trace" kernel. */
+{
+public:
+	OpenCLProgram path_trace_program;  /* "megakernel" program built from kernel.cl. */
+
+	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_),
+	  path_trace_program(this, "megakernel", "kernel.cl", "-D__COMPILE_ONLY_MEGAKERNEL__ ")
+	{
+	}
+
+	virtual bool show_samples() const {
+		return true;
+	}
+
+	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	                          vector<OpenCLProgram*> &programs)
+	{
+		path_trace_program.add_kernel(ustring("path_trace"));  /* The only kernel this program exposes. */
+		programs.push_back(&path_trace_program);  /* Appended to the caller's program list. */
+	}
+
+	~OpenCLDeviceMegaKernel()
+	{
+		task_pool.stop();  /* Stopped first, before the program is released below. */
+		path_trace_program.release();
+	}
+
+	void path_trace(RenderTile& rtile, int sample)
+	{
+		/* Cast arguments to cl types. */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Sample arguments. */
+		cl_int d_sample = sample;
+
+		cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));
+
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel,
+			                0,
+			                d_data,
+			                d_buffer,
+			                d_rng_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(ckPathTraceKernel,
+		                                   start_arg_index,
+		                                   d_sample,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
+
+		enqueue_kernel(ckPathTraceKernel, d_w, d_h);  /* Enqueue only; thread_run() calls clFinish() once per tile. */
+	}
+
+	void thread_run(DeviceTask *task)
+	{
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				int start_sample = tile.start_sample;
+				int end_sample = tile.start_sample + tile.num_samples;
+
+				for(int sample = start_sample; sample < end_sample; sample++) {
+					if(task->get_cancel()) {
+						if(task->need_finish_queue == false)  /* On cancel, keep sampling only if the queue must be finished. */
+							break;
+					}
+
+					path_trace(tile, sample);
+
+					tile.sample = sample + 1;
+
+					task->update_progress(&tile, tile.w*tile.h);
+				}
+
+				/* Complete kernel execution before releasing the tile. */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * the next tile.
+				 */
+				clFinish(cqCommandQueue);
+
+				task->release_tile(tile);
+			}
+		}
+	}
+};
+
+Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
+{
+	return new OpenCLDeviceMegaKernel(info, stats, background);  /* New megakernel device, returned through the Device base pointer. */
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
new file mode 100644
index 0000000..3c3c215
--- /dev/null
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -0,0 +1,1311 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "buffers.h"
+
+#include "kernel_types.h"
+
+#include "util_md5.h"
+#include "util_path.h"
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO(sergey): This is to keep tile split on OpenCL level working
+ * for now, since without this view-port render does not work as it
+ * should.
+ *
+ * Ideally it'll be done on the higher level, but we need to get ready
+ * for merge rather soon, so let's keep split logic private here in
+ * the file.
+ */
+class SplitRenderTile : public RenderTile {
+public:
+	/* Default tile: all sub-tile offsets and the stride start at zero. */
+	SplitRenderTile()
+		: RenderTile(), buffer_offset_x(0), buffer_offset_y(0),
+		  rng_state_offset_x(0), rng_state_offset_y(0),
+		  buffer_rng_state_stride(0)
+	{
+	}
+
+	/* Copies the RenderTile fields listed below; offsets and stride stay zero. */
+	explicit SplitRenderTile(RenderTile& tile)
+		: RenderTile(), buffer_offset_x(0), buffer_offset_y(0),
+		  rng_state_offset_x(0), rng_state_offset_y(0),
+		  buffer_rng_state_stride(0)
+	{
+		/* Sample range and resolution. */
+		start_sample = tile.start_sample;
+		num_samples = tile.num_samples;
+		sample = tile.sample;
+		resolution = tile.resolution;
+		/* Rectangle, offset and stride. */
+		x = tile.x;
+		y = tile.y;
+		w = tile.w;
+		h = tile.h;
+		offset = tile.offset;
+		stride = tile.stride;
+		/* Buffer handles. */
+		buffer = tile.buffer;
+		rng_state = tile.rng_state;
+		buffers = tile.buffers;
+	}
+
+	/* The split kernel is constrained by device global memory, so it
+	 * can't render a big tile in one go ("big" being relative to the
+	 * available device global memory). When the user sets such a tile
+	 * size, the tile is split further and path_trace is called on each
+	 * of the resulting tiles. The variables below assist in achieving
+	 * that.
+	 */
+	int buffer_offset_x;
+	int buffer_offset_y;
+	int rng_state_offset_x;
+	int rng_state_offset_y;
+	int buffer_rng_state_stride;
+};
+
+/* OpenCLDeviceSplitKernel's declaration/definition. */
+class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
+{
+public:
+	/* Kernel declaration. */
+	OpenCLProgram program_data_init;
+	OpenCLProgram program_scene_intersect;
+	OpenCLProgram program_lamp_emission;
+	OpenCLProgram program_queue_enqueue;
+	OpenCLProgram program_background_buffer_update;
+	OpenCLProgram program_shader_eval;
+	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
+	OpenCLProgram program_direct_lighting;
+	OpenCLProgram program_shadow_blocked;
+	OpenCLProgram program_next_iteration_setup;
+	OpenCLProgram program_sum_all_radiance;
+
+	/* Global memory variables [porting]; This memory is used for
+	 * co-operation between different kernels; Data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	cl_mem rng_coop;
+	cl_mem throughput_coop;
+	cl_mem L_transparent_coop;
+	cl_mem PathRadiance_coop;
+	cl_mem Ray_coop;
+	cl_mem PathState_coop;
+	cl_mem Intersection_coop;
+	cl_mem kgbuffer;  /* KernelGlobals buffer. */
+
+	/* Global buffers for ShaderData. */
+	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
+	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
+	                        * shadow_blocked kernel.
+	                        */
+
+	/* Global memory required for shadow blocked and accum_radiance. */
+	cl_mem BSDFEval_coop;
+	cl_mem ISLamp_coop;
+	cl_mem LightRay_coop;
+	cl_mem AOAlpha_coop;
+	cl_mem AOBSDF_coop;
+	cl_mem AOLightRay_coop;
+	cl_mem Intersection_coop_shadow;
+
+#ifdef WITH_CYCLES_DEBUG
+	/* DebugData memory */
+	cl_mem debugdata_coop;
+#endif
+
+	/* Global state array that tracks ray state. */
+	cl_mem ray_state;
+
+	/* Per sample buffers. */
+	cl_mem per_sample_output_buffers;
+
+	/* Denotes which sample each ray is being processed for. */
+	cl_mem work_array;
+
+	/* Queue */
+	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
+	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
+	                     * Tracks the size of each queue.
+	                     */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	cl_mem use_queues_flag;
+
+	/* Amount of memory in output buffer associated with one pixel/thread. */
+	size_t per_thread_output_buffer_size;
+
+	/* Total allocatable available device memory. */
+	size_t total_allocatable_memory;
+
+	/* host version of ray_state; Used in checking host path-iteration
+	 * termination.
+	 */
+	char *hostRayStateArray;
+
+	/* Number of path-iterations to be done in one shot. */
+	unsigned int PathIteration_times;
+
+#ifdef __WORK_STEALING__
+	/* Work pool with respect to each work group. */
+	cl_mem work_pool_wgs;
+
+	/* Denotes the maximum work groups possible w.r.t. current tile size. */
+	unsigned int max_work_groups;
+#endif
+
+	/* clos_max value for which the kernels have been loaded currently. */
+	int current_max_closure;
+
+	/* Marked True in constructor and marked false at the end of path_trace(). */
+	bool first_tile;
+
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_)
+	{
+		/* Starts with no device buffers allocated (path_trace() creates them on
+		 * the first tile) and queries the device's maximum allocation size. */
+		background = background_;
+
+		/* Initialize cl_mem variables. */
+		kgbuffer = NULL;
+		sd = NULL;
+		sd_DL_shadow = NULL;
+
+		rng_coop = NULL;
+		throughput_coop = NULL;
+		L_transparent_coop = NULL;
+		PathRadiance_coop = NULL;
+		Ray_coop = NULL;
+		PathState_coop = NULL;
+		Intersection_coop = NULL;
+		ray_state = NULL;
+
+		AOAlpha_coop = NULL;
+		AOBSDF_coop = NULL;
+		AOLightRay_coop = NULL;
+		BSDFEval_coop = NULL;
+		ISLamp_coop = NULL;
+		LightRay_coop = NULL;
+		Intersection_coop_shadow = NULL;
+#ifdef WITH_CYCLES_DEBUG
+		debugdata_coop = NULL;
+#endif
+		work_array = NULL;
+
+		/* Queue. */
+		Queue_data = NULL;
+		Queue_index = NULL;
+		use_queues_flag = NULL;
+
+		per_sample_output_buffers = NULL;
+		per_thread_output_buffer_size = 0;
+		hostRayStateArray = NULL;
+		PathIteration_times = PATH_ITER_INC_FACTOR;
+#ifdef __WORK_STEALING__
+		work_pool_wgs = NULL;
+		max_work_groups = 0;
+#endif
+		current_max_closure = -1;
+		first_tile = true;
+
+		/* Get device's maximum memory that can be allocated. The query reports
+		 * a cl_ulong, so read it as one and clamp to what size_t can hold. */
+		cl_ulong max_alloc_size = 0;
+		ciErr = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+		                        sizeof(max_alloc_size), &max_alloc_size, NULL);
+		assert(ciErr == CL_SUCCESS);
+		total_allocatable_memory = (max_alloc_size > (cl_ulong)~(size_t)0) ?
+		                           ~(size_t)0 : (size_t)max_alloc_size;
+		if(platform_name == "AMD Accelerated Parallel Processing") {
+			/* This value is tweak-able; AMD platform does not seem to
+			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
+			 * is considered for further computation.
+			 */
+			total_allocatable_memory /= 2;
+		}
+	}
+
+	virtual bool show_samples() const {
+		return false;  /* Unlike OpenCLDeviceMegaKernel in opencl_mega.cpp, which returns true. */
+	}
+
+	/* Split kernel utility functions. */
+	size_t get_tex_size(const char *tex_name)
+	{
+		/* Size reported by CL_MEM_SIZE for the cl_mem registered under
+		 * tex_name in mem_map, or 0 when no such entry exists.
+		 */
+		MemMap::iterator entry = mem_map.find(tex_name);
+		if(entry == mem_map.end()) {
+			return 0;
+		}
+		size_t tex_bytes = 0;
+		cl_mem tex_mem = CL_MEM_PTR(entry->second);
+		ciErr = clGetMemObjectInfo(tex_mem, CL_MEM_SIZE,
+		                           sizeof(tex_bytes), &tex_bytes, NULL);
+		assert(ciErr == CL_SUCCESS);
+		return tex_bytes;
+	}
+
+	size_t get_shader_data_size(size_t max_closure)
+	{
+		/* ShaderData size with the ShaderClosure array cut from MAX_CLOSURE down to max_closure entries. */
+		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));  /* unsigned math: assumes max_closure <= MAX_CLOSURE - TODO confirm */
+	}
+
+	/* Returns the size of the KernelGlobals structure associated with OpenCL. */
+	size_t get_KernelGlobals_size()
+	{
+		/* Local stand-in for the OpenCL KernelGlobals from kernel_globals.h,
+		 * declared only to fetch its size. NOTE(review): the layout presumably
+		 * has to be kept matching that header by hand - confirm. */
+		typedef struct KernelGlobals {
+			ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+	ccl_global type *name;
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+			void *sd_input;
+			void *isect_shadow;
+		} KernelGlobals;
+
+		return sizeof(KernelGlobals);  /* Size of the local struct above. */
+	}
+
+	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLProgram*> &programs)
+	{
+		/* Builds the shared option string, then creates one OpenCLProgram per
+		 * split kernel stage and appends each of them to programs. */
+		string build_options = "-D__SPLIT_KERNEL__ ";
+#ifdef __WORK_STEALING__
+		build_options += "-D__WORK_STEALING__ ";
+#endif
+		build_options += requested_features.get_build_options();
+
+		/* Set compute device build option; 0 keeps the value defined if the query fails. */
+		cl_device_type device_type = 0;
+		ciErr = clGetDeviceInfo(cdDevice, CL_DEVICE_TYPE,
+		                        sizeof(cl_device_type), &device_type, NULL);
+		assert(ciErr == CL_SUCCESS);
+		if(device_type == CL_DEVICE_TYPE_GPU) {
+			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		}
+
+#define GLUE(a, b) a ## b
+#define LOAD_KERNEL(name) \
+	do { \
+		GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
+		GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
+		programs.push_back(&GLUE(program_, name)); \
+	} while(false)
+
+		LOAD_KERNEL(data_init);
+		LOAD_KERNEL(scene_intersect);
+		LOAD_KERNEL(lamp_emission);
+		LOAD_KERNEL(queue_enqueue);
+		LOAD_KERNEL(background_buffer_update);
+		LOAD_KERNEL(shader_eval);
+		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+		LOAD_KERNEL(direct_lighting);
+		LOAD_KERNEL(shadow_blocked);
+		LOAD_KERNEL(next_iteration_setup);
+		LOAD_KERNEL(sum_all_radiance);
+
+		/* Undefine both helper macros defined above so they do not leak further. */
+#undef LOAD_KERNEL
+#undef GLUE
+
+		current_max_closure = requested_features.max_closure;
+	}
+
+	~OpenCLDeviceSplitKernel()
+	{
+		task_pool.stop();  /* Stopped first, before any program or buffer is released below. */
+
+		/* Release the per-stage programs set up in load_kernels(). */
+		program_data_init.release();
+		program_scene_intersect.release();
+		program_lamp_emission.release();
+		program_queue_enqueue.release();
+		program_background_buffer_update.release();
+		program_shader_eval.release();
+		program_holdout_emission_blurring_pathtermination_ao.release();
+		program_direct_lighting.release();
+		program_shadow_blocked.release();
+		program_next_iteration_setup.release();
+		program_sum_all_radiance.release();
+
+		/* Release global memory; handles that are still NULL are skipped by the helper. */
+		release_mem_object_safe(rng_coop);
+		release_mem_object_safe(throughput_coop);
+		release_mem_object_safe(L_transparent_coop);
+		release_mem_object_safe(PathRadiance_coop);
+		release_mem_object_safe(Ray_coop);
+		release_mem_object_safe(PathState_coop);
+		release_mem_object_safe(Intersection_coop);
+		release_mem_object_safe(kgbuffer);
+		release_mem_object_safe(sd);
+		release_mem_object_safe(sd_DL_shadow);
+		release_mem_object_safe(ray_state);
+		release_mem_object_safe(AOAlpha_coop);
+		release_mem_object_safe(AOBSDF_coop);
+		release_mem_object_safe(AOLightRay_coop);
+		release_mem_object_safe(BSDFEval_coop);
+		release_mem_object_safe(ISLamp_coop);
+		release_mem_object_safe(LightRay_coop);
+		release_mem_object_safe(Intersection_coop_shadow);
+#ifdef WITH_CYCLES_DEBUG
+		release_mem_object_safe(debugdata_coop);
+#endif
+		release_mem_object_safe(use_queues_flag);
+		release_mem_object_safe(Queue_data);
+		release_mem_object_safe(Queue_index);
+		release_mem_object_safe(work_array);
+#ifdef __WORK_STEALING__
+		release_mem_object_safe(work_pool_wgs);
+#endif
+		release_mem_object_safe(per_sample_output_buffers);
+
+		if(hostRayStateArray != NULL) {  /* Host-side array allocated with calloc() in path_trace(). */
+			free(hostRayStateArray);
+		}
+	}
+
+	void path_trace(DeviceTask *task,
+	                SplitRenderTile& rtile,
+	                int2 max_render_feasible_tile_size)
+	{
+		/* cast arguments to cl types */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Make sure that set render feasible tile size is a multiple of local
+		 * work size dimensions.
+		 */
+		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
+		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+
+		size_t global_size[2];
+		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
+		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+
+		/* Set the range of samples to be processed for every ray in
+		 * path-regeneration logic.
+		 */
+		cl_int start_sample = rtile.start_sample;
+		cl_int end_sample = rtile.start_sample + rtile.num_samples;
+		cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_parallel_samples = 1;
+#else
+		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+		unsigned int num_threads = max_render_feasible_tile_size.x *
+		                           max_render_feasible_tile_size.y;
+		unsigned int num_tile_columns_possible = num_threads / global_size[1];
+		/* Estimate number of parallel samples that can be
+		 * processed in parallel.
+		 */
+		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
+		                                        rtile.num_samples);
+		/* Wavefront size in AMD is 64.
+		 * TODO(sergey): What about other platforms?
+		 */
+		if(num_parallel_samples >= 64) {
+			/* TODO(sergey): Could use generic round-up here. */
+			num_parallel_samples = (num_parallel_samples / 64) * 64;
+		}
+		assert(num_parallel_samples != 0);
+
+		global_size[0] = d_w * num_parallel_samples;
+#endif  /* __WORK_STEALING__ */
+
+		assert(global_size[0] * global_size[1] <=
+		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
+
+		/* Allocate all required global memory once. */
+		if(first_tile) {
+			size_t num_global_elements = max_render_feasible_tile_size.x *
+			                             max_render_feasible_tile_size.y;
+			/* TODO(sergey): This will actually over-allocate if
+			 * particular kernel does not support multiclosure.
+			 */
+			size_t shaderdata_size = get_shader_data_size(current_max_closure);
+
+#ifdef __WORK_STEALING__
+			/* Calculate max groups */
+			size_t max_global_size[2];
+			size_t tile_x = max_render_feasible_tile_size.x;
+			size_t tile_y = max_render_feasible_tile_size.y;
+			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
+			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
+			max_work_groups = (max_global_size[0] * max_global_size[1]) /
+			                  (local_size[0] * local_size[1]);
+			/* Allocate work_pool_wgs memory. */
+			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+#endif  /* __WORK_STEALING__ */
+
+			/* Allocate queue_index memory only once. */
+			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
+			use_queues_flag = mem_alloc(sizeof(char));
+			kgbuffer = mem_alloc(get_KernelGlobals_size());
+
+			/* Create global buffers for ShaderData. */
+			sd = mem_alloc(num_global_elements * shaderdata_size);
+			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
+
+			/* Creation of global memory buffers which are shared among
+			 * the kernels.
+			 */
+			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
+			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
+			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
+			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
+			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
+			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
+			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
+			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
+			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
+			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
+
+#ifdef WITH_CYCLES_DEBUG
+			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
+#endif
+
+			ray_state = mem_alloc(num_global_elements * sizeof(char));
+
+			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
+			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+
+			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
+			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
+			per_sample_output_buffers = mem_alloc(num_global_elements *
+			                                      per_thread_output_buffer_size);
+		}
+
+		cl_int dQueue_size = global_size[0] * global_size[1];
+
+		cl_uint start_arg_index =
+			kernel_set_args(program_data_init(),
+			                0,
+			                kgbuffer,
+			                sd_DL_shadow,
+			                d_data,
+			                per_sample_output_buffers,
+			                d_rng_state,
+			                rng_coop,
+			                throughput_coop,
+			                L_transparent_coop,
+			                PathRadiance_coop,
+			                Ray_coop,
+			                PathState_coop,
+			                Intersection_coop_shadow,
+			                ray_state);
+
+/* TODO(sergey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index +=
+			kernel_set_args(program_data_init(),
+			                start_arg_index,
+			                start_sample,
+			                d_x,
+			                d_y,
+			                d_w,
+			                d_h,
+			                d_offset,
+			                d_stride,
+			                rtile.rng_state_offset_x,
+			                rtile.rng_state_offset_y,
+			                rtile.buffer_rng_state_stride,
+			                Queue_data,
+			                Queue_index,
+			                dQueue_size,
+			                use_queues_flag,
+			                work_array,
+#ifdef __WORK_STEALING__
+			                work_pool_wgs,
+			                num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+			                debugdata_coop,
+#endif
+			                num_parallel_samples);
+
+		kernel_set_args(program_scene_intersect(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+#ifdef WITH_CYCLES_DEBUG
+		                debugdata_coop,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(program_lamp_emission(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                d_w,
+		                d_h,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag,
+		                num_parallel_samples);
+
+		kernel_set_args(program_queue_enqueue(),
+		                0,
+		                Queue_data,
+		                Queue_index,
+		                ray_state,
+		                dQueue_size);
+
+		kernel_set_args(program_background_buffer_update(),
+		                 0,
+		                 kgbuffer,
+		                 d_data,
+		                 per_sample_output_buffers,
+		                 d_rng_state,
+		                 rng_coop,
+		                 throughput_coop,
+		                 PathRadiance_coop,
+		                 Ray_coop,
+		                 PathState_coop,
+		                 L_transparent_coop,
+		                 ray_state,
+		                 d_w,
+		                 d_h,
+		                 d_x,
+		                 d_y,
+		                 d_stride,
+		                 rtile.rng_state_offset_x,
+		                 rtile.rng_state_offset_y,
+		                 rtile.buffer_rng_state_stride,
+		                 work_array,
+		                 Queue_data,
+		                 Queue_index,
+		                 dQueue_size,
+		                 end_sample,
+		                 start_sample,
+#ifdef __WORK_STEALING__
+		                 work_pool_wgs,
+		                 num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+		                 debugdata_coop,
+#endif
+		                 num_parallel_samples);
+
+		kernel_set_args(program_shader_eval(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                per_sample_output_buffers,
+		                rng_coop,
+		                throughput_coop,
+		                L_transparent_coop,
+		                PathRadiance_coop,
+		                PathState_coop,
+		                Intersection_coop,
+		                AOAlpha_coop,
+		                AOBSDF_coop,
+		                AOLightRay_coop,
+		                d_w,
+		                d_h,
+		                d_x,
+		                d_y,
+		                d_stride,
+		                ray_state,
+		                work_array,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+#ifdef __WORK_STEALING__
+		                start_sample,
+#endif
+		                num_parallel_samples);
+
+		kernel_set_args(program_direct_lighting(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                PathState_coop,
+		                ISLamp_coop,
+		                LightRay_coop,
+		                BSDFEval_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(program_shadow_blocked(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                PathState_coop,
+		                LightRay_coop,
+		                AOLightRay_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size);
+
+		kernel_set_args(program_next_iteration_setup(),
+		                0,
+		                kgbuffer,
+		                d_data,
+		                sd,
+		                rng_coop,
+		                throughput_coop,
+		                PathRadiance_coop,
+		                Ray_coop,
+		                PathState_coop,
+		                LightRay_coop,
+		                ISLamp_coop,
+		                BSDFEval_coop,
+		                AOLightRay_coop,
+		                AOBSDF_coop,
+		                AOAlpha_coop,
+		                ray_state,
+		                Queue_data,
+		                Queue_index,
+		                dQueue_size,
+		                use_queues_flag);
+
+		kernel_set_args(program_sum_all_radiance(),
+		                0,
+		                d_data,
+		                d_buffer,
+		                per_sample_output_buffers,
+		                num_parallel_samples,
+		                d_w,
+		                d_h,
+		                d_stride,
+		                rtile.buffer_offset_x,
+		                rtile.buffer_offset_y,
+		                rtile.buffer_rng_state_stride,
+		                start_sample);
+
+		/* Macro for Enqueuing split kernels. */
+#define GLUE(a, b) a ## b
+#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
+		{ \
+			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
+			                               GLUE(program_, \
+			                                    kernelName)(), \
+			                               2, \
+			                               NULL, \
+			                               globalSize, \
+			                               localSize, \
+			                               0, \
+			                               NULL, \
+			                               NULL); \
+			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
+			if(ciErr != CL_SUCCESS) { \
+				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
+				                               clewErrorString(ciErr)); \
+				opencl_error(message); \
+				return; \
+			} \
+		} (void) 0
+
+		/* Enqueue ckPathTraceKernel_data_init kernel. */
+		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
+		bool activeRaysAvailable = true;
+
+		/* Record number of time host intervention has been made */
+		unsigned int numHostIntervention = 0;
+		unsigned int numNextPathIterTimes = PathIteration_times;
+		bool canceled = false;
+		while(activeRaysAvailable) {
+			/* Twice the global work size of other kernels for
+			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
+			size_t global_size_shadow_blocked[2];
+			global_size_shadow_blocked[0] = global_size[0] * 2;
+			global_size_shadow_blocked[1] = global_size[1];
+
+			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
+			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+
+				if(task->get_cancel()) {
+					canceled = true;
+					break;
+				}
+			}
+
+			/* Read ray-state into Host memory to decide if we should exit
+			 * path-iteration in host.
+			 */
+			ciErr = clEnqueueReadBuffer(cqCommandQueue,
+			                            ray_state,
+			                            CL_TRUE,
+			                            0,
+			                            global_size[0] * global_size[1] * sizeof(char),
+			                            hostRayStateArray,
+			                            0,
+			                            NULL,
+			                            NULL);
+			assert(ciErr == CL_SUCCESS);
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0;
+			    rayStateIter < global_size[0] * global_size[1];
+			    ++rayStateIter)
+			{
+				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(activeRaysAvailable) {
+				numHostIntervention++;
+				PathIteration_times = PATH_ITER_INC_FACTOR;
+				/* Host intervention done before all rays become RAY_INACTIVE;
+				 * Set do more initial iterations for the next tile.
+				 */
+				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
+			}
+
+			if(task->get_cancel()) {
+				canceled = true;
+				break;
+			}
+		}
+
+		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
+		 * per_sample_output_buffers into RenderTile's output buffer.
+		 */
+		if(!canceled) {
+			size_t sum_all_radiance_local_size[2] = {16, 16};
+			size_t sum_all_radiance_global_size[2];
+			sum_all_radiance_global_size[0] =
+				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
+				sum_all_radiance_local_size[0];
+			sum_all_radiance_global_size[1] =
+				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
+				sum_all_radiance_local_size[1];
+			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
+			                     sum_all_radiance_global_size,
+			                     sum_all_radiance_local_size);
+		}
+
+#undef ENQUEUE_SPLIT_KERNEL
+#undef GLUE
+
+		if(numHostIntervention == 0) {
+			/* This means that we are executing kernel more than required
+			 * Must avoid this for the next sample/tile.
+			 */
+			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
+			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+		}
+		else {
+			/* Number of path-iterations done for this tile is set as
+			 * Initial path-iteration times for the next tile
+			 */
+			PathIteration_times = numNextPathIterTimes;
+		}
+
+		first_tile = false;
+	}
+
+	/* Returns the number of bytes that the split kernel always needs,
+	 * regardless of tile size or scene contents: the KernelGlobals
+	 * structure, one unsigned int index per queue and the one-byte
+	 * use_queues_flag. The result does not depend on the user-set
+	 * tile size or on scene properties.
+	 */
+	size_t get_invariable_mem_allocated()
+	{
+		/* KernelGlobals size. */
+		const size_t kernel_globals_bytes = get_KernelGlobals_size();
+		/* Queue index size. */
+		const size_t queue_index_bytes = NUM_QUEUES * sizeof(unsigned int);
+		/* use_queues_flag size. */
+		const size_t use_queues_flag_bytes = sizeof(char);
+
+		return kernel_globals_bytes +
+		       queue_index_bytes +
+		       use_queues_flag_bytes;
+	}
+
+	/* Calculate the memory that has-to-be/has-been allocated for
+	 * the split kernel to function and that scales with the tile
+	 * size requested by the user.
+	 */
+	size_t get_tile_specific_mem_allocated(const int2 tile_size)
+	{
+		/* Requested tile dimensions. */
+		const unsigned int tile_w = tile_size.x;
+		const unsigned int tile_h = tile_size.y;
+		size_t mem_bytes = 0;
+
+#ifdef __WORK_STEALING__
+		/* Work stealing needs one unsigned int work pool per
+		 * work-group, counted on the tile size rounded up to the
+		 * local work size.
+		 */
+		const size_t padded_w =
+			(((tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		const size_t padded_h =
+			(((tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		const size_t num_work_pools =
+			(padded_w * padded_h) /
+			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
+		mem_bytes += num_work_pools * sizeof(unsigned int);
+#endif
+
+		/* Per-pixel share of the output buffer. */
+		mem_bytes += tile_w * tile_h * per_thread_output_buffer_size;
+
+		/* Per-pixel RNG state. */
+		mem_bytes += tile_w * tile_h * sizeof(RNG);
+
+		return mem_bytes;
+	}
+
+	/* Returns the memory allocated for the scene: the size of every
+	 * texture listed in kernel_textures.h plus the KernelData (d_data).
+	 */
+	size_t get_scene_specific_mem_allocated(cl_mem d_data)
+	{
+		size_t scene_specific_mem_allocated = 0;
+		/* Calculate texture memories: one get_tex_size() per texture name. */
+#define KERNEL_TEX(type, ttype, name) \
+	scene_specific_mem_allocated += get_tex_size(#name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+		size_t d_data_size = 0;  /* Never read uninitialized, even if the query fails. */
+		ciErr = clGetMemObjectInfo(d_data,
+		                           CL_MEM_SIZE,
+		                           sizeof(d_data_size),
+		                           &d_data_size,
+		                           NULL);
+		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
+		scene_specific_mem_allocated += d_data_size;
+		return scene_specific_mem_allocated;
+	}
+
+	/* Calculate the memory required for one thread in split kernel,
+	 * i.e. the summed size of all of its per-thread buffer elements.
+	 */
+	size_t get_per_thread_memory()
+	{
+		/* TODO(sergey): This will actually over-allocate if
+		 * particular kernel does not support multiclosure.
+		 */
+		const size_t shaderdata_size = get_shader_data_size(current_max_closure);
+		return sizeof(RNG)
+			+ sizeof(float3)          /* Throughput */
+			+ sizeof(float)           /* L transparent */
+			+ sizeof(char)            /* Ray state */
+			+ sizeof(unsigned int)    /* Work element */
+			+ sizeof(int)             /* ISLamp */
+			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+			+ sizeof(Intersection)    /* Overall intersection */
+			+ sizeof(Intersection)    /* Intersection coop AO */
+			+ sizeof(Intersection)    /* Intersection coop DL */
+			+ shaderdata_size         /* Overall ShaderData */
+			+ (shaderdata_size * 2)   /* ShaderData: DL and shadow */
+			+ sizeof(Ray) + sizeof(BsdfEval)
+			+ sizeof(float3)          /* AOAlpha */
+			+ sizeof(float3)          /* AOBSDF */
+			+ sizeof(Ray)
+			+ (sizeof(int) * NUM_QUEUES)
+			+ per_thread_output_buffer_size;
+	}
+
+	/* Considers the total memory available in the device and
+	 * returns the maximum global work size possible.
+	 */
+	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
+	{
+		/* Memory that is allocated anyway: the invariable part, the
+		 * tile specific part and the scene specific part.
+		 */
+		const size_t invariable_bytes = get_invariable_mem_allocated();
+		const size_t tile_bytes = get_tile_specific_mem_allocated(tile_size);
+		const size_t scene_bytes = get_scene_specific_mem_allocated(d_data);
+
+		/* What is left for the threads in the global work size. */
+		const size_t free_bytes = total_allocatable_memory
+			- invariable_bytes
+			- tile_bytes
+			- scene_bytes
+			- DATA_ALLOCATION_MEM_FACTOR;
+
+		/* Divide by the memory that a single thread requires. */
+		return free_bytes / get_per_thread_memory();
+	}
+
+	/* Checks if the device has enough memory to render the whole tile;
+	 * If not, we should split single tile into multiple tiles of small size
+	 * and process them all.
+	 */
+	bool need_to_split_tile(unsigned int d_w,
+	                        unsigned int d_h,
+	                        int2 max_render_feasible_tile_size)
+	{
+		/* TODO(sergey): Such round-ups are in quite few places, need to replace
+		 * them with an utility macro.
+		 */
+		const size_t rounded_w =
+			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		const size_t rounded_h =
+			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+
+		const size_t required_threads = rounded_w * rounded_h;
+		const size_t feasible_threads = max_render_feasible_tile_size.x *
+		                                max_render_feasible_tile_size.y;
+
+		/* Split when the rounded-up tile needs more threads than the
+		 * feasible tile size provides.
+		 */
+		return required_threads > feasible_threads;
+	}
+
+	/* Considers the scene properties, global memory available in the device
+	 * and returns a rectangular tile dimension (approx the maximum)
+	 * that should render on split kernel.
+	 */
+	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
+	{
+		/* Side of the largest square within the feasible work size. */
+		const int side = (int)sqrt(feasible_global_work_size);
+
+		/* Ceil round-off of the side to the local work size. */
+		int2 rounded_up;
+		rounded_up.x =
+			(((side - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		rounded_up.y =
+			(((side - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		const bool rounded_up_fits =
+			(rounded_up.x * rounded_up.y <= feasible_global_work_size);
+		if(rounded_up_fits) {
+			return rounded_up;
+		}
+
+		/* Floor round-off of the side to the local work size. */
+		int2 rounded_down;
+		rounded_down.x =
+			(side / SPLIT_KERNEL_LOCAL_SIZE_X) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		rounded_down.y =
+			(side / SPLIT_KERNEL_LOCAL_SIZE_Y) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		return rounded_down;
+	}
+
+	/* Try splitting the current tile into multiple smaller
+	 * almost-square-tiles. Gives up once no dimension can shrink anymore.
+	 */
+	int2 get_split_tile_size(RenderTile rtile,
+	                         int2 max_render_feasible_tile_size)
+	{
+		const int num_global_threads = max_render_feasible_tile_size.x * max_render_feasible_tile_size.y;
+		/* Ceil round off d_w and d_h */
+		int d_w = (((rtile.w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_X;
+		int d_h = (((rtile.h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+			SPLIT_KERNEL_LOCAL_SIZE_Y;
+		while(d_w * d_h > num_global_threads) {
+			/* Halve the longer dimension; use the other one when rounding leaves it unchanged. */
+			const int half_w = ((((d_w / 2) - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * SPLIT_KERNEL_LOCAL_SIZE_X;
+			const int half_h = ((((d_h / 2) - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * SPLIT_KERNEL_LOCAL_SIZE_Y;
+			const bool can_halve_w = (half_w > 0 && half_w < d_w);
+			const bool can_halve_h = (half_h > 0 && half_h < d_h);
+			if(can_halve_w && (d_w >= d_h || !can_halve_h)) {
+				d_w = half_w;
+			}
+			else if(can_halve_h) {
+				d_h = half_h;
+			}
+			else {
+				break;  /* Smallest possible tile reached; looping on would never end. */
+			}
+		}
+		int2 split_tile_size;
+		split_tile_size.x = d_w;
+		split_tile_size.y = d_h;
+		return split_tile_size;
+	}
+
+	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
+	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
+	{
+		const int tile_w = rtile.w;
+		const int tile_h = rtile.h;
+		const int num_tiles_x = (((tile_w - 1) / split_tile_size.x) + 1);
+		const int num_tiles_y = (((tile_h - 1) / split_tile_size.y) + 1);
+		/* Buffer and rng_state offset calc. */
+		const size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
+		const size_t offset_x = offset_index % rtile.stride;
+		const size_t offset_y = offset_index / rtile.stride;
+		/* One entry per split tile, stored row after row. */
+		vector<SplitRenderTile> split_rtiles;
+		split_rtiles.resize(num_tiles_x * num_tiles_y);
+		for(int iy = 0; iy < num_tiles_y; iy++) {
+			const int shift_y = iy * split_tile_size.y;
+			for(int ix = 0; ix < num_tiles_x; ix++) {
+				const int shift_x = ix * split_tile_size.x;
+				SplitRenderTile& split = split_rtiles[iy * num_tiles_x + ix];
+				/* Properties taken over from the tile that is being split. */
+				split.start_sample = rtile.start_sample;
+				split.num_samples = rtile.num_samples;
+				split.sample = rtile.sample;
+				split.resolution = rtile.resolution;
+				split.offset = rtile.offset;
+				split.buffers = rtile.buffers;
+				split.buffer = rtile.buffer;
+				split.rng_state = rtile.rng_state;
+				split.buffer_rng_state_stride = rtile.stride;
+				/* Position of the split tile and of its buffer/rng_state data. */
+				split.x = rtile.x + shift_x;
+				split.y = rtile.y + shift_y;
+				split.rng_state_offset_x = offset_x + shift_x;
+				split.rng_state_offset_y = offset_y + shift_y;
+				split.buffer_offset_x = offset_x + shift_x;
+				split.buffer_offset_y = offset_y + shift_y;
+				/* Border tiles get whatever is left of the width and height. */
+				split.w = (ix == (num_tiles_x - 1)) ? (tile_w - shift_x) : split_tile_size.x;
+				split.h = (iy == (num_tiles_y - 1)) ? (tile_h - shift_y) : split_tile_size.y;
+				split.stride = split.w;
+			}
+		}
+		return split_rtiles;
+	}
+
+	/* Runs one device task: film conversion, shader evaluation or path
+	 * tracing of every tile the task hands out.
+	 */
+	void thread_run(DeviceTask *task)
+	{
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+			bool initialize_data_and_check_render_feasibility = false;
+			bool need_to_split_tiles_further = false;
+			int2 max_render_feasible_tile_size;
+			size_t feasible_global_work_size;
+			const int2 tile_size = task->requested_tile_size;
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				if(!initialize_data_and_check_render_feasibility) {
+					/* Initialize data (done once, for the first acquired tile). */
+					/* Calculate per_thread_output_buffer_size. */
+					size_t output_buffer_size = 0;
+					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
+					                           CL_MEM_SIZE,
+					                           sizeof(output_buffer_size),
+					                           &output_buffer_size,
+					                           NULL);
+					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
+					/* This value is different when running on AMD and NV. */
+					if(background) {
+						/* In offline render the number of buffer elements
+						 * associated with tile.buffer is the current tile size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.w * tile.h);
+					}
+					else {
+						/* In interactive rendering, unlike offline render, the number of
+						 * buffer elements associated with tile.buffer is the entire viewport size.
+						 */
+						per_thread_output_buffer_size =
+							output_buffer_size / (tile.buffers->params.width *
+							                      tile.buffers->params.height);
+					}
+					/* Check render feasibility for the requested tile size. */
+					feasible_global_work_size = get_feasible_global_work_size(
+						tile_size,
+						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
+					max_render_feasible_tile_size =
+						get_max_render_feasible_tile_size(
+							feasible_global_work_size);
+					need_to_split_tiles_further =
+						need_to_split_tile(tile_size.x,
+						                   tile_size.y,
+						                   max_render_feasible_tile_size);
+					initialize_data_and_check_render_feasibility = true;
+				}
+				if(need_to_split_tiles_further) {
+					int2 split_tile_size =
+						get_split_tile_size(tile,
+						                    max_render_feasible_tile_size);
+					vector<SplitRenderTile> to_path_trace_render_tiles =
+						split_tiles(tile, split_tile_size);
+					/* Print message to console (offline render only). */
+					if(background && (to_path_trace_render_tiles.size() > 1)) {
+						fprintf(stderr, "Message : Tiles need to be split "
+						        "further inside path trace (due to insufficient "
+						        "device-global-memory for split kernel to "
+						        "function) \n"
+						        "The current tile of dimensions %dx%d is split "
+						        "into tiles of dimension %dx%d for render \n",
+						        tile.w, tile.h,
+						        split_tile_size.x,
+						        split_tile_size.y);
+					}
+					/* Process all split tiles, one after the other. */
+					for(int tile_iter = 0;
+					    tile_iter < to_path_trace_render_tiles.size();
+					    ++tile_iter)
+					{
+						path_trace(task,
+						           to_path_trace_render_tiles[tile_iter],
+						           max_render_feasible_tile_size);
+					}
+				}
+				else {
+					/* No splitting required; process the entire tile at once. */
+					/* Render feasible tile size is user-set-tile-size itself, rounded up. */
+					max_render_feasible_tile_size.x =
+						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_X;
+					max_render_feasible_tile_size.y =
+						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+						SPLIT_KERNEL_LOCAL_SIZE_Y;
+					/* buffer_rng_state_stride is stride itself. */
+					SplitRenderTile split_tile(tile);
+					split_tile.buffer_rng_state_stride = tile.stride;
+					path_trace(task, split_tile, max_render_feasible_tile_size);
+				}
+				tile.sample = tile.start_sample + tile.num_samples;
+
+				/* Complete kernel execution before releasing the tile.
+				 * This helps in multi-device render: release_tile is a critical
+				 * section, so a slow device waiting inside it for its kernels to
+				 * complete would stall a fast device from proceeding to render
+				 * its next tile.
+				 */
+				clFinish(cqCommandQueue);
+
+				task->release_tile(tile);
+			}
+		}
+	}
+
+protected:
+	/* Creates a buffer of bufsize bytes (asserted to be non-zero) in cxContext. */
+	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+	{
+		assert(bufsize != 0);
+		cl_mem buffer = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
+		opencl_assert_err(ciErr, "clCreateBuffer");
+		return buffer;
+	}
+
+	/* ** Those guys are for working around some compiler-specific bugs ** */
+
+	string build_options_for_base_program(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		return requested_features.get_build_options();  /* Passed through unmodified. */
+	}
+};
+
+Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
+{
+	return new OpenCLDeviceSplitKernel(info, stats, background);  /* Allocated with new. */
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
new file mode 100644
index 0000000..82e1640
--- /dev/null
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -0,0 +1,822 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "util_logging.h"
+#include "util_path.h"
+#include "util_time.h"
+
+using std::cerr;
+using std::endl;
+
+CCL_NAMESPACE_BEGIN
+
+/* An entry starts out without a program and without a mutex. */
+OpenCLCache::Slot::ProgramEntry::ProgramEntry()
+ : program(NULL), mutex(NULL)
+{
+}
+
+/* Copies the program handle only; the copy gets no mutex of its own. */
+OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs)
+ : program(rhs.program), mutex(NULL)
+{
+}
+
+OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
+{
+	delete mutex;  /* No-op when no mutex was created. */
+}
+
+/* A fresh slot has neither a context nor a context mutex. */
+OpenCLCache::Slot::Slot()
+ : context_mutex(NULL), context(NULL)
+{
+}
+
+/* Copies only the program entries; context and mutex start out as NULL. */
+OpenCLCache::Slot::Slot(const Slot& rhs)
+ : context_mutex(NULL), context(NULL), programs(rhs.programs)
+{
+}
+
+/* Frees the context mutex, if one was created. */
+OpenCLCache::Slot::~Slot()
+{
+	delete context_mutex;
+}
+
+OpenCLCache& OpenCLCache::global_instance()
+{
+	static OpenCLCache instance;  /* Function-local static; every call returns this same object. */
+	return instance;
+}
+
+/* Looks up the cached OpenCL context of a platform/device pair.
+ * Returns the context with an extra reference when one is cached.
+ * Otherwise returns NULL and leaves slot_locker holding the slot lock
+ * (store_context() is what releases it again).
+ */
+cl_context OpenCLCache::get_context(cl_platform_id platform,
+                                    cl_device_id device,
+                                    thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+
+	OpenCLCache& instance = global_instance();
+
+	/* Find or create the slot, and create its lock, only while holding
+	 * the cache lock.
+	 */
+	thread_scoped_lock cache_guard(instance.cache_lock);
+	Slot &slot = instance.cache[PlatformDevicePair(platform, device)];
+	if(slot.context_mutex == NULL) {
+		slot.context_mutex = new thread_mutex;
+	}
+
+	/* Need to unlock cache before locking slot, to allow store to complete. */
+	cache_guard.unlock();
+	slot_locker = thread_scoped_lock(*slot.context_mutex);
+
+	if(slot.context != NULL) {
+		/* The item was already cached, release the slot lock. */
+		slot_locker.unlock();
+
+		cl_int ciErr = clRetainContext(slot.context);
+		assert(ciErr == CL_SUCCESS);
+		(void)ciErr;
+
+		return slot.context;
+	}
+
+	/* Not cached: return with the caller's lock holder holding the slot lock. */
+	return NULL;
+}
+
+/* Looks up the cached program stored under key for a platform/device pair.
+ * Returns the program with an extra reference when one is cached.
+ * Otherwise returns NULL and leaves slot_locker holding the lock of the
+ * program entry (store_program() is what releases it again).
+ */
+cl_program OpenCLCache::get_program(cl_platform_id platform,
+                                    cl_device_id device,
+                                    ustring key,
+                                    thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+
+	OpenCLCache& instance = global_instance();
+
+	/* Find or create the slot and the program entry, and create the
+	 * entry lock, only while holding the cache lock.
+	 */
+	thread_scoped_lock cache_guard(instance.cache_lock);
+
+	Slot &slot = instance.cache[PlatformDevicePair(platform, device)];
+	Slot::ProgramEntry &entry = slot.programs[key];
+
+	if(entry.mutex == NULL) {
+		entry.mutex = new thread_mutex;
+	}
+
+	/* Need to unlock cache before locking the entry, to allow store to complete. */
+	cache_guard.unlock();
+
+	/* Lock the program entry. */
+	slot_locker = thread_scoped_lock(*entry.mutex);
+
+	if(entry.program != NULL) {
+		/* The item was already cached, release the entry lock. */
+		slot_locker.unlock();
+
+		cl_int ciErr = clRetainProgram(entry.program);
+		assert(ciErr == CL_SUCCESS);
+		(void)ciErr;
+
+		return entry.program;
+	}
+
+	/* Not cached: return with the caller's lock holder holding the entry lock. */
+	return NULL;
+}
+
+/* Stores a newly created context in the slot of a platform/device pair,
+ * releases the slot lock held by slot_locker and retains the context. */
+void OpenCLCache::store_context(cl_platform_id platform,
+                                cl_device_id device,
+                                cl_context context,
+                                thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+	assert(device != NULL);
+	assert(context != NULL);
+
+	OpenCLCache &self = global_instance();
+
+	/* Find the slot; the iterator is checked before it is dereferenced. */
+	thread_scoped_lock cache_lock(self.cache_lock);
+	CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+	assert(i != self.cache.end());
+	Slot &slot = i->second;
+	cache_lock.unlock();
+
+	assert(slot.context == NULL);
+	slot.context = context;
+
+	/* unlock the slot */
+	slot_locker.unlock();
+
+	/* increment reference count in OpenCL.
+	 * The caller is going to release the object when done with it. */
+	cl_int ciErr = clRetainContext(context);
+	assert(ciErr == CL_SUCCESS);
+	(void)ciErr;
+}
+
+/* Stores a newly built program in the entry registered under key for a
+ * platform/device pair, releases the entry lock held by slot_locker and
+ * retains the program. */
+void OpenCLCache::store_program(cl_platform_id platform,
+                                cl_device_id device,
+                                cl_program program,
+                                ustring key,
+                                thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+	assert(device != NULL);
+	assert(program != NULL);
+
+	OpenCLCache &instance = global_instance();
+
+	thread_scoped_lock cache_guard(instance.cache_lock);
+
+	/* The slot and the entry are expected to exist already (asserted below). */
+	CacheMap::iterator slot_it = instance.cache.find(PlatformDevicePair(platform, device));
+	assert(slot_it != instance.cache.end());
+	Slot::EntryMap::iterator entry_it = slot_it->second.programs.find(key);
+	assert(entry_it != slot_it->second.programs.end());
+	Slot::ProgramEntry &entry = entry_it->second;
+	assert(entry.program == NULL);
+
+	cache_guard.unlock();
+	entry.program = program;
+
+	/* Unlock the entry. */
+	slot_locker.unlock();
+
+	/* Increment reference count in OpenCL.
+	 * The caller is going to release the object when done with it.
+	 */
+	cl_int ciErr = clRetainProgram(program);
+	assert(ciErr == CL_SUCCESS);
+	(void)ciErr;
+}
+
+/* Return the MD5 hash of the files under the "kernel" path.
+ *
+ * The hash is computed on first use and stored on the global cache instance;
+ * access is guarded by kernel_md5_lock.
+ */
+string OpenCLCache::get_kernel_md5()
+{
+	OpenCLCache &cache_instance = global_instance();
+	thread_scoped_lock md5_guard(cache_instance.kernel_md5_lock);
+
+	string &stored_md5 = cache_instance.kernel_md5;
+	if(stored_md5.empty()) {
+		/* First request: hash the kernel sources now. */
+		stored_md5 = path_files_md5_hash(path_get("kernel"));
+	}
+	return stored_md5;
+}
+
+/* Describe a program to be built for the given device.
+ *
+ * Only records the parameters: no OpenCL program is created here, so the
+ * object starts out with a NULL program handle and loaded set to false.
+ */
+OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
+                                               string program_name,
+                                               string kernel_file,
+                                               string kernel_build_options,
+                                               bool use_stdout)
+ : device(device),
+   program_name(program_name),
+   kernel_file(kernel_file),
+   kernel_build_options(kernel_build_options),
+   use_stdout(use_stdout)
+{
+	/* Nothing has been built or fetched from a cache yet. */
+	program = NULL;
+	loaded = false;
+}
+
+/* Release the kernels and the program held by this object (see release()). */
+OpenCLDeviceBase::OpenCLProgram::~OpenCLProgram()
+{
+	release();
+}
+
+/* Release every created kernel and the program handle, resetting each to
+ * NULL so that calling this again is harmless. The registered kernel names
+ * themselves stay in the map.
+ */
+void OpenCLDeviceBase::OpenCLProgram::release()
+{
+	typedef map<ustring, cl_kernel>::iterator KernelIterator;
+
+	for(KernelIterator it = kernels.begin(); it != kernels.end(); ++it) {
+		cl_kernel &handle = it->second;
+		if(!handle) {
+			/* Registered but never created. */
+			continue;
+		}
+		clReleaseKernel(handle);
+		handle = NULL;
+	}
+
+	if(!program) {
+		return;
+	}
+	clReleaseProgram(program);
+	program = NULL;
+}
+
+/* Record a log message.
+ *
+ * Without use_stdout the message is appended to the internal log, followed
+ * by a newline. With use_stdout, debug messages go to VLOG(2) and all other
+ * messages are printed to stdout.
+ */
+void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
+{
+	if(!use_stdout) {
+		log += msg + "\n";
+		return;
+	}
+
+	if(debug) {
+		VLOG(2) << msg;
+		return;
+	}
+
+	printf("%s\n", msg.c_str());
+}
+
+/* Record an error message.
+ *
+ * With use_stdout the message is printed to stderr right away. In every case
+ * it is also appended to error_msg, with a newline separating it from any
+ * message recorded earlier.
+ */
+void OpenCLDeviceBase::OpenCLProgram::add_error(string msg)
+{
+	if(use_stdout) {
+		fprintf(stderr, "%s\n", msg.c_str());
+	}
+	/* Only insert the separator between messages; the previous check was
+	 * inverted and glued all messages after the first one together. */
+	if(!error_msg.empty()) {
+		error_msg += "\n";
+	}
+	error_msg += msg;
+}
+
+/* Register a kernel name with a NULL handle. A name which is already
+ * registered is left untouched, including its current handle.
+ */
+void OpenCLDeviceBase::OpenCLProgram::add_kernel(ustring name)
+{
+	if(kernels.find(name) != kernels.end()) {
+		return;
+	}
+	kernels[name] = NULL;
+}
+
+/* Build the current program with the device build options followed by the
+ * program specific ones.
+ *
+ * The build log is fetched and logged regardless of the build result, so
+ * warnings of successful builds show up as well (as debug output).
+ *
+ * Returns true when clBuildProgram() reported CL_SUCCESS.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
+{
+	const string options = device->kernel_build_options(debug_src) + kernel_build_options;
+
+	const cl_int build_status = clBuildProgram(program, 0, NULL, options.c_str(), NULL, NULL);
+	const bool build_ok = (build_status == CL_SUCCESS);
+
+	/* First query only asks for the size of the build log. */
+	size_t log_size = 0;
+	clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+	if(!build_ok) {
+		add_error(string("OpenCL build failed with error ") + clewErrorString(build_status) + ", errors in console.");
+	}
+
+	if(log_size > 1) {
+		/* One extra byte so the text is terminated no matter what. */
+		vector<char> log_text(log_size + 1);
+		clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, log_size, &log_text[0], NULL);
+		log_text[log_size] = '\0';
+
+		/* The NVidia compiler emits a meaningless lone newline, skip that. */
+		const bool only_newline = (log_size == 2 && log_text[0] == '\n');
+		if(!only_newline) {
+			add_log(string("OpenCL program ") + program_name + " build output: " + string(&log_text[0]), build_ok);
+		}
+	}
+
+	return build_ok;
+}
+
+/* Create the program from source and build it.
+ *
+ * When debug_src is not NULL the preprocessed source is also written to that
+ * path. Returns false when either program creation or the build fails; the
+ * reason is recorded via add_error().
+ */
+bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
+{
+	/* The program consists of many files, and OpenCL kernel caches do not
+	 * seem to recognize changes in included files. Appending the md5 hash of
+	 * all kernel files as a comment forces a recompile when any of them
+	 * changes.
+	 */
+	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	source = path_source_replace_includes(source, path_get("kernel"));
+
+	if(debug_src != NULL) {
+		path_write_text(*debug_src, source);
+	}
+
+	const char *source_text = source.c_str();
+	size_t source_size = source.size();
+	cl_int create_status;
+
+	program = clCreateProgramWithSource(device->cxContext,
+	                                   1,
+	                                   &source_text,
+	                                   &source_size,
+	                                   &create_status);
+
+	if(create_status != CL_SUCCESS) {
+		add_error(string("OpenCL program creation failed: ") + clewErrorString(create_status));
+		return false;
+	}
+
+	const double build_start = time_dt();
+	add_log(string("Compiling OpenCL program ") + program_name.c_str(), false);
+	add_log(string("Build flags: ") + kernel_build_options, true);
+
+	const bool build_ok = build_kernel(debug_src);
+	if(build_ok) {
+		add_log(string("Kernel compilation of ") + program_name + " finished in " + string_printf("%.2lfs.\n", time_dt() - build_start), false);
+	}
+
+	return build_ok;
+}
+
+/* Create the program from a binary stored at clbin and build it.
+ *
+ * Returns true on success. On any failure the reason is recorded via
+ * add_error(), a partially created program is released again and the
+ * program handle is left NULL, so a following compile from source does not
+ * leak it.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::load_binary(const string& clbin,
+                                                  const string *debug_src)
+{
+	/* read binary into memory */
+	vector<uint8_t> binary;
+
+	/* An empty file is useless, and taking &binary[0] of it is not valid. */
+	if(!path_read_binary(clbin, binary) || binary.empty()) {
+		add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
+		return false;
+	}
+
+	/* create program */
+	/* Initialized because the per-device status is not guaranteed to be
+	 * written when the call itself fails. */
+	cl_int status = CL_SUCCESS, ciErr = CL_SUCCESS;
+	size_t size = binary.size();
+	const uint8_t *bytes = &binary[0];
+
+	program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice,
+		&size, &bytes, &status, &ciErr);
+
+	bool ok = (status == CL_SUCCESS && ciErr == CL_SUCCESS);
+	if(!ok) {
+		add_error(string("OpenCL failed create program from cached binary ") + clbin + ": "
+		                 + clewErrorString(status) + " " + clewErrorString(ciErr));
+	}
+	else {
+		ok = build_kernel(debug_src);
+	}
+
+	if(!ok && program != NULL) {
+		/* Do not keep a handle of a program which can not be used. */
+		clReleaseProgram(program);
+		program = NULL;
+	}
+
+	return ok;
+}
+
+/* Write the binary of the current program to clbin.
+ *
+ * Queries a single binary size and a single binary pointer.
+ * NOTE(review): this presumably relies on the program being built for
+ * exactly one device - confirm.
+ *
+ * Returns false when the binary can not be queried, is empty, or can not be
+ * written; nothing is written when a query fails.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::save_binary(const string& clbin)
+{
+	size_t size = 0;
+	if(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL) != CL_SUCCESS) {
+		return false;
+	}
+
+	if(!size)
+		return false;
+
+	vector<uint8_t> binary(size);
+	uint8_t *bytes = &binary[0];
+
+	/* Avoid writing a file full of garbage when the driver refuses to hand
+	 * out the binary. */
+	if(clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL) != CL_SUCCESS) {
+		return false;
+	}
+
+	return path_write_binary(clbin, binary);
+}
+
+/* Obtain a usable program and create all registered kernels from it.
+ *
+ * The program is looked up in this order:
+ *  - the cache queried through device->load_cached_kernel(),
+ *  - a binary (.clbin) file in the "kernels" cache directory,
+ *  - compilation from source, after which the binary is saved for reuse.
+ *
+ * A program obtained in one of the last two ways is handed to
+ * device->store_cached_kernel(). Afterwards a cl_kernel is created for every
+ * registered name, using the "kernel_ocl_" prefix.
+ *
+ * loaded is only set to true when all of the above succeeded; on failure the
+ * function returns early and the reason is available through add_error().
+ */
+void OpenCLDeviceBase::OpenCLProgram::load()
+{
+	assert(device);
+
+	loaded = false;
+
+	string device_md5 = device->device_md5_hash(kernel_build_options);
+
+	/* Try to use cached kernel. */
+	/* NOTE(review): cache_locker is presumably acquired by load_cached_kernel()
+	 * when the program is not cached yet, and released by store_cached_kernel()
+	 * or by its destructor on early return - confirm against the device code. */
+	thread_scoped_lock cache_locker;
+	ustring cache_key(program_name + device_md5);
+	program = device->load_cached_kernel(cache_key,
+	                                     cache_locker);
+
+	if(!program) {
+		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
+
+		/* The file name depends on the program name, the device hash and the
+		 * hash of the kernel sources. */
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		basename = path_cache_get(path_join("kernels", basename));
+		string clbin = basename + ".clbin";
+
+		/* path to preprocessed source for debugging */
+		string clsrc, *debug_src = NULL;
+
+		if(OpenCLInfo::use_debug()) {
+			clsrc = basename + ".cl";
+			debug_src = &clsrc;
+		}
+
+		/* If binary kernel exists already, try use it. */
+		if(path_exists(clbin) && load_binary(clbin)) {
+			/* Kernel loaded from binary, nothing to do. */
+			add_log(string("Loaded program from ") + clbin + ".", true);
+		}
+		else {
+			add_log(string("Kernel file ") + clbin + " either doesn't exist or failed to be loaded by driver.", true);
+
+			/* If does not exist or loading binary failed, compile kernel. */
+			if(!compile_kernel(debug_src)) {
+				return;
+			}
+
+			/* Save binary for reuse. */
+			/* Failing to save is not fatal, the compiled program is still used. */
+			if(!save_binary(clbin)) {
+				add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
+			}
+		}
+
+		/* Cache the program. */
+		device->store_cached_kernel(program,
+		                            cache_key,
+		                            cache_locker);
+	}
+	else {
+		add_log(string("Found cached OpenCL program ") + program_name + ".", true);
+	}
+
+	/* Create a kernel object for every name registered with add_kernel(). */
+	for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) {
+		assert(kernel->second == NULL);
+		cl_int ciErr;
+		string name = "kernel_ocl_" + kernel->first.string();
+		kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
+		if(device->opencl_error(ciErr)) {
+			add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr));
+			return;
+		}
+	}
+
+	loaded = true;
+}
+
+/* Print the recorded error message, and the compile output when there is
+ * any, to cerr.
+ *
+ * Nothing is printed when the program was loaded successfully, or when
+ * use_stdout is set since the error was already reported in that case.
+ */
+void OpenCLDeviceBase::OpenCLProgram::report_error()
+{
+	const bool nothing_to_report = loaded || use_stdout;
+	if(nothing_to_report) {
+		return;
+	}
+
+	cerr << error_msg << endl;
+
+	if(compile_output.empty()) {
+		return;
+	}
+	cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
+	cerr << compile_output << endl;
+}
+
+/* Return the kernel of a program which has exactly one kernel registered
+ * (asserted). The handle is NULL when the kernel has not been created. */
+cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()()
+{
+	assert(kernels.size() == 1);
+	return kernels.begin()->second;
+}
+
+/* Return the kernel registered under the given name.
+ *
+ * The name is expected to be registered (asserted). Without assertions an
+ * unknown name gets inserted into the map by operator[].
+ */
+cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()(ustring name)
+{
+	assert(kernels.count(name));
+	return kernels[name];
+}
+
+/* Translate the device type requested in the debug flags to the matching
+ * OpenCL device type. DEVICE_NONE maps to 0, unknown values map to
+ * CL_DEVICE_TYPE_ALL.
+ */
+cl_device_type OpenCLInfo::device_type()
+{
+	switch(DebugFlags().opencl.device_type)
+	{
+		case DebugFlags::OpenCL::DEVICE_NONE:
+			return 0;
+		case DebugFlags::OpenCL::DEVICE_DEFAULT:
+			return CL_DEVICE_TYPE_DEFAULT;
+		case DebugFlags::OpenCL::DEVICE_CPU:
+			return CL_DEVICE_TYPE_CPU;
+		case DebugFlags::OpenCL::DEVICE_GPU:
+			return CL_DEVICE_TYPE_GPU;
+		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
+			return CL_DEVICE_TYPE_ACCELERATOR;
+		case DebugFlags::OpenCL::DEVICE_ALL:
+		default:
+			/* Both the explicit request and anything unknown use all devices. */
+			return CL_DEVICE_TYPE_ALL;
+	}
+}
+
+/* Return the OpenCL debug setting of the debug flags. */
+bool OpenCLInfo::use_debug()
+{
+	return DebugFlags().opencl.debug;
+}
+
+/* Tell whether advanced shading is used for the given platform name.
+ *
+ * Only the platforms listed here use it, so that officially unsupported
+ * OpenCL platforms are never set up for advanced shading.
+ */
+bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
+{
+	/* keep this in sync with kernel_types.h! */
+	static const char *const advanced_platforms[] = {
+		"NVIDIA CUDA",
+		"Apple",
+		"AMD Accelerated Parallel Processing",
+		"Intel(R) OpenCL",
+	};
+	const size_t num_platforms = sizeof(advanced_platforms) / sizeof(advanced_platforms[0]);
+
+	for(size_t index = 0; index < num_platforms; index++) {
+		if(platform == advanced_platforms[index]) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Tell whether the split kernel is used for the given platform and device
+ * type.
+ *
+ * A kernel type forced through the debug flags always wins. Otherwise the
+ * split kernel is only used for GPU devices of the AMD platform.
+ */
+bool OpenCLInfo::kernel_use_split(const string& platform_name,
+                                  const cl_device_type device_type)
+{
+	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) {
+		VLOG(1) << "Forcing split kernel to use.";
+		return true;
+	}
+	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) {
+		VLOG(1) << "Forcing mega kernel to use.";
+		return false;
+	}
+	/* TODO(sergey): Use a more enum-like API instead of comparing strings,
+	 * similar to the device/vendor checks in blender's gpu module.
+	 */
+	const bool is_amd_platform = (platform_name == "AMD Accelerated Parallel Processing");
+	const bool is_gpu_device = (device_type == CL_DEVICE_TYPE_GPU);
+	return is_amd_platform && is_gpu_device;
+}
+
+/* Tell whether the device is officially supported.
+ *
+ * Only GPU devices of the AMD and Apple platforms are. A device whose type
+ * can not be queried is reported as unsupported.
+ */
+bool OpenCLInfo::device_supported(const string& platform_name,
+                                  const cl_device_id device_id)
+{
+	cl_device_type device_type = 0;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_TYPE,
+	                   sizeof(cl_device_type),
+	                   &device_type,
+	                   NULL) != CL_SUCCESS)
+	{
+		/* Without this check an indeterminate value would be compared. */
+		return false;
+	}
+	if(platform_name == "AMD Accelerated Parallel Processing" &&
+	   device_type == CL_DEVICE_TYPE_GPU)
+	{
+		return true;
+	}
+	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
+		return true;
+	}
+	return false;
+}
+
+/* Check that the platform reports OpenCL version 1.1 or later.
+ *
+ * Returns true when it does. When error is not NULL it receives a
+ * description of the problem, or an empty string on success. A version
+ * string which can not be queried is handled as one which can not be parsed.
+ */
+bool OpenCLInfo::platform_version_check(cl_platform_id platform,
+                                        string *error)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	char version[256] = "";
+	if(clGetPlatformInfo(platform,
+	                     CL_PLATFORM_VERSION,
+	                     sizeof(version),
+	                     &version,
+	                     NULL) != CL_SUCCESS)
+	{
+		/* Do not parse whatever the failed query left in the buffer. */
+		version[0] = '\0';
+	}
+	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: platform version %d.%d or later required, found %d.%d",
+			                       req_major, req_minor, major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
+
+/* Check that the device reports OpenCL C version 1.1 or later.
+ *
+ * Returns true when it does. When error is not NULL it receives a
+ * description of the problem, or an empty string on success. A version
+ * string which can not be queried is handled as one which can not be parsed.
+ */
+bool OpenCLInfo::device_version_check(cl_device_id device,
+                                      string *error)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	char version[256] = "";
+	if(clGetDeviceInfo(device,
+	                   CL_DEVICE_OPENCL_C_VERSION,
+	                   sizeof(version),
+	                   &version,
+	                   NULL) != CL_SUCCESS)
+	{
+		/* Do not parse whatever the failed query left in the buffer. */
+		version[0] = '\0';
+	}
+	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: C version %d.%d or later required, found %d.%d",
+			                       req_major, req_minor, major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
+
+/* Build a hardware identifier string of the form "xx:xx.x" for the device.
+ *
+ * Only implemented for the AMD, Apple and NVIDIA platforms, through vendor
+ * specific device queries. An empty string is returned for other platforms
+ * and when the queries fail.
+ */
+string OpenCLInfo::get_hardware_id(string platform_name, cl_device_id device_id)
+{
+	if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
+		/* Use cl_amd_device_topology extension. */
+		/* NOTE(review): bytes 21..23 are presumably the PCIe bus, device and
+		 * function of the topology structure - confirm against the extension. */
+		cl_char topology[24];
+		if(clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && topology[0] == 1) {
+			/* cl_char is signed: go through unsigned char first, otherwise
+			 * values of 0x80 and above get sign extended and are printed
+			 * as ffffffxx. */
+			return string_printf("%02x:%02x.%01x",
+			                     (unsigned int)(unsigned char)topology[21],
+			                     (unsigned int)(unsigned char)topology[22],
+			                     (unsigned int)(unsigned char)topology[23]);
+		}
+	}
+	else if(platform_name == "NVIDIA CUDA") {
+		/* Use two undocumented options of the cl_nv_device_attribute_query extension. */
+		cl_int bus_id, slot_id;
+		if(clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id,  NULL) == CL_SUCCESS &&
+		   clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
+			return string_printf("%02x:%02x.%01x",
+			                     (unsigned int)(bus_id),
+			                     (unsigned int)(slot_id >> 3),
+			                     (unsigned int)(slot_id & 0x7));
+		}
+	}
+	/* No general way to get a hardware ID from OpenCL => give up. */
+	return "";
+}
+
+/* Fill usable_devices with the OpenCL devices which can be used.
+ *
+ * The list is cleared first. Platforms and devices are skipped when their
+ * information can not be queried or when their OpenCL version is too old.
+ * Devices which are not officially supported are skipped as well, unless
+ * force_all is set or a kernel type is forced through the debug flags.
+ *
+ * Verbose logging only happens during the first call.
+ */
+void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
+                                    bool force_all)
+{
+	const bool force_all_platforms = force_all ||
+		(DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
+	/* Device types to enumerate, as requested by the debug flags. */
+	const cl_device_type device_type = OpenCLInfo::device_type();
+	static bool first_time = true;
+#define FIRST_VLOG(severity) if(first_time) VLOG(severity)
+
+	usable_devices->clear();
+
+	if(device_type == 0) {
+		FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
+		first_time = false;
+		return;
+	}
+
+	vector<cl_device_id> device_ids;
+	cl_uint num_devices = 0;
+	vector<cl_platform_id> platform_ids;
+	cl_uint num_platforms = 0;
+
+	/* Get devices. */
+	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
+	   num_platforms == 0)
+	{
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+		first_time = false;
+		return;
+	}
+	platform_ids.resize(num_platforms);
+	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
+		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+		first_time = false;
+		return;
+	}
+	/* Devices are numbered consecutively across platforms. */
+	for(cl_uint platform = 0; platform < num_platforms; platform++) {
+		cl_platform_id platform_id = platform_ids[platform];
+		char pname[256];
+		if(clGetPlatformInfo(platform_id,
+		                     CL_PLATFORM_NAME,
+		                     sizeof(pname),
+		                     &pname,
+		                     NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
+			continue;
+		}
+		string platform_name = pname;
+		FIRST_VLOG(2) << "Enumerating devices for platform "
+		              << platform_name << ".";
+		if(!platform_version_check(platform_id)) {
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << " due to too old compiler version.";
+			continue;
+		}
+		/* First ask for the number of devices, then for the devices. */
+		num_devices = 0;
+		cl_int ciErr;
+		if((ciErr = clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  0,
+		                  NULL,
+		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			continue;
+		}
+		device_ids.resize(num_devices);
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  num_devices,
+		                  &device_ids[0],
+		                  NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch devices list.";
+			continue;
+		}
+		for(cl_uint num = 0; num < num_devices; num++) {
+			cl_device_id device_id = device_ids[num];
+			char device_name[1024] = "\0";
+			if(clGetDeviceInfo(device_id,
+			                   CL_DEVICE_NAME,
+			                   sizeof(device_name),
+			                   &device_name,
+			                   NULL) != CL_SUCCESS)
+			{
+				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+				continue;
+			}
+			if(!device_version_check(device_id)) {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << " due to old compiler version.";
+				continue;
+			}
+			if(force_all_platforms ||
+			   device_supported(platform_name, device_id))
+			{
+				/* Type of this very device, as opposed to the device_type
+				 * mask above which was used for the enumeration. */
+				cl_device_type actual_device_type;
+				if(clGetDeviceInfo(device_id,
+				                   CL_DEVICE_TYPE,
+				                   sizeof(cl_device_type),
+				                   &actual_device_type,
+				                   NULL) != CL_SUCCESS)
+				{
+					FIRST_VLOG(2) << "Ignoring device " << device_name
+					              << ", failed to fetch device type.";
+					continue;
+				}
+				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				string hardware_id = get_hardware_id(platform_name, device_id);
+				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
+				                                               platform_name,
+				                                               device_id,
+				                                               actual_device_type,
+				                                               device_name,
+				                                               hardware_id));
+			}
+			else {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << ", not officially supported yet.";
+			}
+		}
+	}
+	first_time = false;
+
+	/* The helper macro is only meant for this function. */
+#undef FIRST_VLOG
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index e4341c8..29e0f44 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -57,6 +57,7 @@ set(SRC_HEADERS
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
+	kernel_image_opencl.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
@@ -163,6 +164,8 @@ set(SRC_GEOM_HEADERS
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
+	geom/geom_motion_triangle_intersect.h
+	geom/geom_motion_triangle_shader.h
 	geom/geom_object.h
 	geom/geom_patch.h
 	geom/geom_primitive.h
@@ -176,6 +179,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
 	../util/util_half.h
+	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
 	../util/util_static_assert.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 71c925a..3679898 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -72,13 +72,13 @@ CCL_NAMESPACE_BEGIN
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "bvh_subsurface.h"
-#endif
 
-#if defined(__SUBSURFACE__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
-#  define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#  include "bvh_subsurface.h"
-#endif
+#  if defined(__OBJECT_MOTION__)
+#    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
+#    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
+#    include "bvh_subsurface.h"
+#  endif
+#endif  /* __SUBSURFACE__ */
 
 /* Volume BVH traversal */
 
@@ -86,19 +86,19 @@ CCL_NAMESPACE_BEGIN
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "bvh_volume.h"
-#endif
-
-#if defined(__VOLUME__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "bvh_volume.h"
-#endif
 
-#if defined(__VOLUME__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#  include "bvh_volume.h"
-#endif
+#  if defined(__INSTANCING__)
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#    include "bvh_volume.h"
+#  endif
+
+#  if defined(__OBJECT_MOTION__)
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+#    include "bvh_volume.h"
+#  endif
+#endif  /* __VOLUME__ */
 
 /* Record all intersections - Shadow BVH traversal */
 
@@ -106,31 +106,31 @@ CCL_NAMESPACE_BEGIN
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
 #  include "bvh_shadow_all.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_shadow_all.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "bvh_shadow_all.h"
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_shadow_all.h"
-#endif
 
-#if defined(__SHADOW_RECORD_ALL__) && defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#  include "bvh_shadow_all.h"
-#endif
+#  if defined(__INSTANCING__)
+#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING
+#    include "bvh_shadow_all.h"
+#  endif
+
+#  if defined(__HAIR__)
+#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#    include "bvh_shadow_all.h"
+#  endif
+
+#  if defined(__OBJECT_MOTION__)
+#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
+#    include "bvh_shadow_all.h"
+#  endif
+
+#  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
+#    include "bvh_shadow_all.h"
+#  endif
+#endif  /* __SHADOW_RECORD_ALL__ */
 
 /* Record all intersections - Volume BVH traversal  */
 
@@ -138,19 +138,19 @@ CCL_NAMESPACE_BEGIN
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
 #  include "bvh_volume_all.h"
-#endif
-
-#if defined(__VOLUME_RECORD_ALL__) && defined(__INSTANCING__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#  include "bvh_volume_all.h"
-#endif
 
-#if defined(__VOLUME_RECORD_ALL__) && defined(__OBJECT_MOTION__)
-#  define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#  include "bvh_volume_all.h"
-#endif
+#  if defined(__INSTANCING__)
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
+#    include "bvh_volume_all.h"
+#  endif
+
+#  if defined(__OBJECT_MOTION__)
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
+#    include "bvh_volume_all.h"
+#  endif
+#endif  /* __VOLUME_RECORD_ALL__ */
 
 #undef BVH_FEATURE
 #undef BVH_NAME_JOIN
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index e9eeff3..df33a86 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -108,7 +108,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		do {
 			/* traverse internal nodes */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-				int node_addr_ahild1, traverse_mask;
+				int node_addr_child1, traverse_mask;
 				float dist[2];
 				float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
 
@@ -141,25 +141,25 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #endif // __KERNEL_SSE2__
 
 				node_addr = __float_as_int(cnodes.z);
-				node_addr_ahild1 = __float_as_int(cnodes.w);
+				node_addr_child1 = __float_as_int(cnodes.w);
 
 				if(traverse_mask == 3) {
 					/* Both children were intersected, push the farther one. */
 					bool is_closest_child1 = (dist[1] < dist[0]);
 					if(is_closest_child1) {
 						int tmp = node_addr;
-						node_addr = node_addr_ahild1;
-						node_addr_ahild1 = tmp;
+						node_addr = node_addr_child1;
+						node_addr_child1 = tmp;
 					}
 
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_STACK_SIZE);
-					traversal_stack[stack_ptr] = node_addr_ahild1;
+					traversal_stack[stack_ptr] = node_addr_child1;
 				}
 				else {
 					/* One child was intersected. */
 					if(traverse_mask == 2) {
-						node_addr = node_addr_ahild1;
+						node_addr = node_addr_child1;
 					}
 					else if(traverse_mask == 0) {
 						/* Neither child was intersected. */
@@ -187,7 +187,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 					/* primitive intersection */
 					while(prim_addr < prim_addr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
 						bool hit;
 
@@ -222,6 +222,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
 									                                   isect_array,
@@ -231,7 +232,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   type,
+									                                   curve_type,
 									                                   NULL,
 									                                   0, 0);
 								}
@@ -244,7 +245,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          type,
+									                          curve_type,
 									                          NULL,
 									                          0, 0);
 								}
@@ -278,7 +279,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								shader = __float_as_int(str.z);
 							}
 #endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
 
 							/* if no transparent shadows, all light is blocked */
 							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -343,6 +344,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		if(stack_ptr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
+			/* Instance pop. */
 			if(num_hits_in_instance) {
 				float t_fac;
 
@@ -355,8 +357,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				triangle_intersect_precalc(dir, &isect_precalc);
 
 				/* scale isect->t to adjust for instancing */
-				for(int i = 0; i < num_hits_in_instance; i++)
+				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
+				}
 			}
 			else {
 				float ignore_t = FLT_MAX;
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index a0e478e..80c8f31 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -213,7 +213,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						--stack_ptr;
 					}
 				}
-				BVH_DEBUG_NEXT_STEP();
+				BVH_DEBUG_NEXT_NODE();
 			}
 
 			/* if node is leaf, fetch triangle list */
@@ -235,7 +235,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
 								                      &isect_precalc,
@@ -264,7 +264,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -296,8 +296,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+								BVH_DEBUG_NEXT_INTERSECTION();
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
@@ -308,7 +309,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   type,
+									                                   curve_type,
 									                                   lcg_state,
 									                                   difl,
 									                                   extmax);
@@ -322,7 +323,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          type,
+									                          curve_type,
 									                          lcg_state,
 									                          difl,
 									                          extmax);
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index c3abe2e..ead424a 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -50,12 +50,17 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_DEBUG__
 #  define BVH_DEBUG_INIT() \
 	do { \
-		isect->num_traversal_steps = 0; \
+		isect->num_traversed_nodes = 0; \
 		isect->num_traversed_instances = 0; \
+		isect->num_intersections = 0; \
 	} while(0)
-#  define BVH_DEBUG_NEXT_STEP() \
+#  define BVH_DEBUG_NEXT_NODE() \
 	do { \
-		++isect->num_traversal_steps; \
+		++isect->num_traversed_nodes; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_INTERSECTION() \
+	do { \
+		++isect->num_intersections; \
 	} while(0)
 #  define BVH_DEBUG_NEXT_INSTANCE() \
 	do { \
@@ -63,7 +68,8 @@ CCL_NAMESPACE_BEGIN
 	} while(0)
 #else  /* __KERNEL_DEBUG__ */
 #  define BVH_DEBUG_INIT()
-#  define BVH_DEBUG_NEXT_STEP()
+#  define BVH_DEBUG_NEXT_NODE()
+#  define BVH_DEBUG_NEXT_INTERSECTION()
 #  define BVH_DEBUG_NEXT_INSTANCE()
 #endif  /* __KERNEL_DEBUG__ */
 
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 1f6515c..529848e 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -99,7 +99,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	ssef tsplat(0.0f, 0.0f, -isect_t, -isect_t);
 
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
-#endif
+#endif  /* __KERNEL_SSE2__ */
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -334,6 +334,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 		if(stack_ptr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
+			/* Instance pop. */
 			if(num_hits_in_instance) {
 				float t_fac;
 #  if BVH_FEATURE(BVH_MOTION)
@@ -377,7 +378,7 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 			node_addr = traversal_stack[stack_ptr];
 			--stack_ptr;
 		}
-#endif  /* FEATURE(BVH_MOTION) */
+#endif  /* FEATURE(BVH_INSTANCING) */
 	} while(node_addr != ENTRYPOINT_SENTINEL);
 
 	return num_hits;
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 2ee2a39..6d22f0b 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -21,6 +21,36 @@ struct QBVHStackItem {
 	float dist;
 };
 
+ccl_device_inline void qbvh_near_far_idx_calc(const float3& idir,
+                                              int *ccl_restrict near_x,
+                                              int *ccl_restrict near_y,
+                                              int *ccl_restrict near_z,
+                                              int *ccl_restrict far_x,
+                                              int *ccl_restrict far_y,
+                                              int *ccl_restrict far_z)
+/* Per axis, write the near/far offset pair (x: 0/1, y: 2/3, z: 4/5), ordered by the sign of idir. */
+{
+#ifdef __KERNEL_SSE__
+	*near_x = 0; *far_x = 1;  /* Start from the ordering the scalar branch uses for idir >= 0. */
+	*near_y = 2; *far_y = 3;
+	*near_z = 4; *far_z = 5;
+
+	const size_t mask = movemask(ssef(idir.m128));  /* NOTE(review): presumably one sign bit per lane; if so -0.0f differs from the scalar branch -- confirm. */
+
+	const int mask_x = mask & 1;         /* Bit 0: x component. */
+	const int mask_y = (mask & 2) >> 1;  /* Bit 1: y component. */
+	const int mask_z = (mask & 4) >> 2;  /* Bit 2: z component. */
+
+	*near_x += mask_x; *far_x -= mask_x;  /* A set bit swaps near and far for that axis. */
+	*near_y += mask_y; *far_y -= mask_y;
+	*near_z += mask_z; *far_z -= mask_z;
+#else  /* !__KERNEL_SSE__ */
+	if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
+	if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
+	if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
+#endif  /* __KERNEL_SSE__ */
+}
+
 /* TOOD(sergey): Investigate if using intrinsics helps for both
  * stack item swap and float comparison.
  */
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 2e6b6b8..607295f 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -37,6 +37,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              uint *num_hits)
 {
 	/* TODO(sergey):
+	 * - Test if pushing distance on the stack helps.
 	 * - Likely and unlikely for if() statements.
 	 * - Test restrict attribute for pointers.
 	 */
@@ -74,7 +75,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	int num_hits_in_instance = 0;
 #endif
 
-	ssef tnear(0.0f), tfar(tmax);
+	ssef tnear(0.0f), tfar(isect_t);
 #if BVH_FEATURE(BVH_HAIR)
 	sse3f dir4(ssef(dir.x), ssef(dir.y), ssef(dir.z));
 #endif
@@ -91,10 +92,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -106,14 +106,20 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
 
+				if(false
 #ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
+				   || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+#endif
+#if BVH_FEATURE(BVH_MOTION)
+				   || UNLIKELY(ray->time < inodes.y)
+				   || UNLIKELY(ray->time > inodes.z)
+#endif
+				) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
 					continue;
 				}
-#endif
 
 				ssef dist;
 				int child_mask = NODE_INTERSECT(kg,
@@ -122,12 +128,12 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #ifdef __KERNEL_AVX2__
 				                                P_idir4,
 #endif
-#  if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
+#if BVH_FEATURE(BVH_HAIR) || !defined(__KERNEL_AVX2__)
 				                                org4,
-#  endif
-#  if BVH_FEATURE(BVH_HAIR)
+#endif
+#if BVH_FEATURE(BVH_HAIR)
 				                                dir4,
-#  endif
+#endif
 				                                idir4,
 				                                near_x, near_y, near_z,
 				                                far_x, far_y, far_z,
@@ -262,7 +268,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 					/* Primitive intersection. */
 					while(prim_addr < prim_addr2) {
-						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
 						bool hit;
 
@@ -297,6 +303,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
 									                                   isect_array,
@@ -306,7 +313,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   type,
+									                                   curve_type,
 									                                   NULL,
 									                                   0, 0);
 								}
@@ -319,7 +326,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          type,
+									                          curve_type,
 									                          NULL,
 									                          0, 0);
 								}
@@ -353,7 +360,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								shader = __float_as_int(str.z);
 							}
 #endif
-							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+							int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
 
 							/* if no transparent shadows, all light is blocked */
 							if(!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -391,9 +398,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;
 
-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					qbvh_near_far_idx_calc(idir,
+					                       &near_x, &near_y, &near_z,
+					                       &far_x, &far_y, &far_z);
 					tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -424,22 +431,21 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		if(stack_ptr >= 0) {
 			kernel_assert(object != OBJECT_NONE);
 
+			/* Instance pop. */
 			if(num_hits_in_instance) {
 				float t_fac;
-
 #  if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-
-				/* scale isect->t to adjust for instancing */
-				for(int i = 0; i < num_hits_in_instance; i++)
+				/* Scale isect->t to adjust for instancing. */
+				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
+				}
 			}
 			else {
 				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
 				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
 #  else
@@ -450,10 +456,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			isect_t = tmax;
 			isect_array->t = isect_t;
 
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
-			tfar = ssef(tmax);
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
+			tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
 #  endif
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
index 24aca96..ccd36df 100644
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -101,10 +101,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index a1e154d..10ae7be 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -102,10 +102,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -118,6 +117,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
 
 				if(UNLIKELY(node_dist > isect->t)
+#if BVH_FEATURE(BVH_MOTION)
+				   || UNLIKELY(ray->time < inodes.y)
+				   || UNLIKELY(ray->time > inodes.z)
+#endif
 #ifdef __VISIBILITY_FLAG__
 				   || (__float_as_uint(inodes.x) & visibility) == 0)
 #endif
@@ -132,7 +135,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				int child_mask;
 				ssef dist;
 
-				BVH_DEBUG_NEXT_STEP();
+				BVH_DEBUG_NEXT_NODE();
 
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
@@ -327,7 +330,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
 								                      &isect_precalc,
@@ -348,7 +351,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
+								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -372,8 +375,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_STEP();
-								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+								BVH_DEBUG_NEXT_INTERSECTION();
+								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
@@ -384,7 +388,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   type,
+									                                   curve_type,
 									                                   lcg_state,
 									                                   difl,
 									                                   extmax);
@@ -398,7 +402,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          type,
+									                          curve_type,
 									                          lcg_state,
 									                          difl,
 									                          extmax);
@@ -427,9 +431,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
 #  endif
 
-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					qbvh_near_far_idx_calc(idir,
+					                       &near_x, &near_y, &near_z,
+					                       &far_x, &far_y, &far_z);
 					tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -469,9 +473,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index a97bf3c..1e77d8e 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -87,10 +87,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -100,8 +99,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-#ifdef __VISIBILITY_FLAG__
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
 				if((__float_as_uint(inodes.x) & visibility) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -295,16 +295,15 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
 						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 
-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						qbvh_near_far_idx_calc(idir,
+						                       &near_x, &near_y, &near_z,
+						                       &far_x, &far_y, &far_z);
 						tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -348,9 +347,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index e1ca27a..eb48af6 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -66,7 +66,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 #ifndef __KERNEL_SSE41__
 	if(!isfinite(P.x)) {
-		return false;
+		return 0;
 	}
 #endif
 
@@ -91,10 +91,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);
 
 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -104,8 +103,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
-#ifdef __VISIBILITY_FLAG__
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+
+#ifdef __VISIBILITY_FLAG__
 				if((__float_as_uint(inodes.x) & visibility) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -353,9 +353,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif
 
-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						qbvh_near_far_idx_calc(idir,
+						                       &near_x, &near_y, &near_z,
+						                       &far_x, &far_y, &far_z);
 						tfar = ssef(isect_t);
 						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  if BVH_FEATURE(BVH_HAIR)
@@ -402,7 +402,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 				/* Scale isect->t to adjust for instancing. */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
@@ -415,12 +414,14 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			isect_t = tmax;
+			isect_array->t = isect_t;
+
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -435,8 +436,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #  endif
 
 			triangle_intersect_precalc(dir, &isect_precalc);
-			isect_t = tmax;
-			isect_array->t = isect_t;
 
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 1cd8246..b6c896c 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -143,6 +143,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 	float3 N = bsdf->N;
+	int label = LABEL_REFLECT | LABEL_GLOSSY;
 
 	float NdotI = dot(N, I);
 	if(NdotI > 0.0f) {
@@ -211,6 +212,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 			/* Some high number for MIS. */
 			*pdf = 1e6f;
 			*eval = make_float3(1e6f, 1e6f, 1e6f);
+			label = LABEL_REFLECT | LABEL_SINGULAR;
 		}
 		else {
 			/* leave the rest to eval_reflect */
@@ -224,7 +226,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}
 
-	return LABEL_REFLECT|LABEL_GLOSSY;
+	return label;
 }
 
 
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index bede5f4..daaa26d 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -267,7 +267,10 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,
 
 	*eval = make_float3(*pdf, *pdf, *pdf);
 
-	kernel_assert(dot(locy, *omega_in) < 0.0f);
+	/* TODO(sergey): Should always be negative, but seems some precision issue
+	 * is involved here.
+	 */
+	kernel_assert(dot(locy, *omega_in) < 1e-4f);
 
 	return LABEL_TRANSMIT|LABEL_GLOSSY;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index 08e580f..4a1316f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -267,7 +267,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
 	       ((!bsdf_a->extra && !bsdf_b->extra) ||
-            ((bsdf_a->extra && bsdf_b->extra) &&
+	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
 
@@ -452,6 +452,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = bsdf->N;
+	int label;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -477,6 +478,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 		/* reflection or refraction? */
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
+			label = LABEL_REFLECT | LABEL_GLOSSY;
 
 			if(cosMO > 0) {
 				/* eq. 39 - compute actual reflected direction */
@@ -487,6 +489,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
+						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray */
@@ -549,6 +552,8 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 		else {
+			label = LABEL_TRANSMIT | LABEL_GLOSSY;
+
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -576,6 +581,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
+					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -607,7 +613,10 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 	}
-	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	else {
+		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	}
+	return label;
 }
 
 /* Beckmann microfacet with Smith shadow-masking from:
@@ -815,6 +824,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = bsdf->N;
+	int label;
 
 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -839,6 +849,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 
 		/* reflection or refraction? */
 		if(!m_refractive) {
+			label = LABEL_REFLECT | LABEL_GLOSSY;
 			float cosMO = dot(m, I);
 
 			if(cosMO > 0) {
@@ -850,6 +861,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
+						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray
@@ -904,6 +916,8 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 		else {
+			label = LABEL_TRANSMIT | LABEL_GLOSSY;
+
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -931,6 +945,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
+					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -963,7 +978,10 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 	}
-	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	else {
+		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
+	}
+	return label;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 24ced93..6838e26 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -23,6 +23,8 @@
 #include "geom_subd_triangle.h"
 #include "geom_triangle_intersect.h"
 #include "geom_motion_triangle.h"
+#include "geom_motion_triangle_intersect.h"
+#include "geom_motion_triangle_shader.h"
 #include "geom_motion_curve.h"
 #include "geom_curve.h"
 #include "geom_volume.h"
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 84aaaab..9de3354 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -255,6 +255,17 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		int ka = max(k0 - 1, v00.x);
 		int kb = min(k1 + 1, v00.x + v00.y - 1);
 
+#ifdef __KERNEL_AVX2__
+		avxf P_curve_0_1, P_curve_2_3;
+		if(type & PRIMITIVE_CURVE) {
+			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
+			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+		}
+#else  /* __KERNEL_AVX2__ */
 		ssef P_curve[4];
 
 		if(type & PRIMITIVE_CURVE) {
@@ -267,6 +278,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
 			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
 		}
+#endif  /* __KERNEL_AVX2__ */
 
 		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
 		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
@@ -278,6 +290,33 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
 		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
 
+#ifdef __KERNEL_AVX2__
+		const avxf vPP = _mm256_broadcast_ps(&P.m128);
+		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_0_1 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_2_3 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+		const ssef p0 = _mm256_castps256_ps128(p01);
+		const ssef p1 = _mm256_extractf128_ps(p01, 1);
+		const ssef p2 = _mm256_castps256_ps128(p23);
+		const ssef p3 = _mm256_extractf128_ps(p23, 1);
+
+		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+		r_st = ((float4 &)P_curve_1).w;
+		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+		r_en = ((float4 &)P_curve_2).w;
+#else  /* __KERNEL_AVX2__ */
 		ssef htfm[] = { htfm0, htfm1, htfm2 };
 		ssef vP = load4f(P);
 		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
@@ -285,6 +324,10 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
 		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
 
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+#endif  /* __KERNEL_AVX2__ */
+
 		float fc = 0.71f;
 		ssef vfc = ssef(fc);
 		ssef vfcxp3 = vfc * p3;
@@ -294,8 +337,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
 		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
 
-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
 	}
 #else
 	float3 curve_coef[4];
@@ -383,8 +424,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 
 	/* begin loop */
 	while(!(tree >> (depth))) {
-		float i_st = tree * resol;
-		float i_en = i_st + (level * resol);
+		const float i_st = tree * resol;
+		const float i_en = i_st + (level * resol);
+
 #ifdef __KERNEL_SSE2__
 		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
 		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
@@ -458,13 +500,23 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 
 			if(flags & CURVE_KN_RIBBONS) {
 				float3 tg = (p_en - p_st);
+#ifdef __KERNEL_SSE__
+				const float3 tg_sq = tg * tg;
+				float w = tg_sq.x + tg_sq.y;
+#else
 				float w = tg.x * tg.x + tg.y * tg.y;
+#endif
 				if(w == 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
+#ifdef __KERNEL_SSE__
+				const float3 p_sttg = p_st * tg;
+				w = -(p_sttg.x + p_sttg.y) / w;
+#else
 				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+#endif
 				w = saturate(w);
 
 				/* compute u on the curve segment */
@@ -496,7 +548,13 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 				if(difl != 0.0f) {
 					mw_extension = min(difl * fabsf(bmaxz), extmax);
 					r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__
+					const float3 p_curr_sq = p_curr * p_curr;
+					const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128));
+					float d = dxxx.x;
+#else
 					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+#endif
 					float d0 = d - r_curr;
 					float d1 = d + r_curr;
 					float inv_mw_extension = 1.0f/mw_extension;
@@ -853,7 +911,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #  undef len3_squared
 #  undef len3
 #  undef dot3
-#  endif
+#endif
 }
 
 ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 6de5aa7..dc1388b 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -50,12 +50,12 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object,
 ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, float4 keys[2])
 {
 	if(step == numsteps) {
-		/* center step: regular vertex location */
+		/* center step: regular key location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
 	else {
-		/* center step not stored in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -97,14 +97,14 @@ ccl_device_inline void motion_curve_keys(KernelGlobals *kg, int object, int prim
 ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, int k2, int k3, float4 keys[4])
 {
 	if(step == numsteps) {
-		/* center step: regular vertex location */
+		/* center step: regular key location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 		keys[2] = kernel_tex_fetch(__curve_keys, k2);
 		keys[3] = kernel_tex_fetch(__curve_keys, k3);
 	}
 	else {
-		/* center step not store in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -118,7 +118,12 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, in
 }
 
 /* return 2 curve key locations */
-ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4])
+ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
+                                                  int object,
+                                                  int prim,
+                                                  float time,
+                                                  int k0, int k1, int k2, int k3,
+                                                  float4 keys[4])
 {
 	/* get motion info */
 	int numsteps, numkeys;
@@ -147,6 +152,65 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object,
 	keys[3] = (1.0f - t)*keys[3] + t*next_keys[3];
 }
 
+#ifdef __KERNEL_AVX2__
+/* Same as motion_cardinal_curve_keys(), but returns the interpolated keys as
+ * two AVX registers: keys 0 and 1 in out_keys_0_1, keys 2 and 3 in out_keys_2_3.
+ */
+ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg,
+                                                      int object,
+                                                      int prim,
+                                                      float time,
+                                                      int k0, int k1,
+                                                      int k2, int k3,
+                                                      avxf *out_keys_0_1,
+                                                      avxf *out_keys_2_3)
+{
+	/* Get motion info. */
+	int numsteps, numkeys;
+	object_motion_info(kg, object, &numsteps, NULL, &numkeys);
+
+	/* Figure out which steps we need to fetch and their interpolation factor. */
+	int maxstep = numsteps * 2;
+	int step = min((int)(time*maxstep), maxstep - 1);
+	float t = time*maxstep - step;
+
+	/* Find attribute. */
+	AttributeElement elem;
+	int offset = find_attribute_curve_motion(kg,
+	                                         object,
+	                                         ATTR_STD_MOTION_VERTEX_POSITION,
+	                                         &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* Fetch key coordinates for this step and for the next one. */
+	float4 next_keys[4];
+	float4 keys[4];
+	motion_cardinal_curve_keys_for_step(kg,
+	                                    offset,
+	                                    numkeys,
+	                                    numsteps,
+	                                    step,
+	                                    k0, k1, k2, k3,
+	                                    keys);
+	motion_cardinal_curve_keys_for_step(kg,
+	                                    offset,
+	                                    numkeys,
+	                                    numsteps,
+	                                    step + 1,
+	                                    k0, k1, k2, k3,
+	                                    next_keys);
+
+	const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128);
+	const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128);
+	const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128);
+	const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128);
+
+	/* Interpolate between steps, two keys per AVX operation. */
+	*out_keys_0_1 = (1.0f - t) * keys_0_1 + t*next_keys_0_1;
+	*out_keys_2_3 = (1.0f - t) * keys_2_3 + t*next_keys_2_3;
+}
+#endif  /* __KERNEL_AVX2__ */
+
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 3cbe59a..4e84aa9 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4
 		normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 	}
 	else {
-		/* center step not stored in this array */
+		/* center step is not stored in this array */
 		if(step > numsteps)
 			step--;
 
@@ -117,312 +117,4 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i
 	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
 }
 
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance. */
-
-ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-		if(UNLIKELY(t == 0.0f)) {
-			return P;
-		}
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#  else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	/* compute refined intersection distance */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	/* compute refined position */
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#  else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* Same as above, except that isect->t is assumed to be in object space for instancing */
-
-#ifdef __SUBSURFACE__
-#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
-ccl_device_noinline
-#  else
-ccl_device_inline
-#  endif
-float3 motion_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#  ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
-#    else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	/* compute refined intersection distance */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
-#    else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#  else
-	return P + D*t;
-#  endif
-}
-#endif
-
-/* Setup of motion triangle specific parts of ShaderData, moved into this one
- * function to more easily share computation of interpolated positions and
- * normals */
-
-/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
-{
-	/* get shader */
-	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
-
-	/* get motion info */
-	int numsteps, numverts;
-	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
-
-	/* figure out which steps we need to fetch and their interpolation factor */
-	int maxstep = numsteps*2;
-	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
-	float t = ccl_fetch(sd, time)*maxstep - step;
-
-	/* find attribute */
-	AttributeElement elem;
-	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
-	kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-	/* fetch vertex coordinates */
-	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
-
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
-
-	/* interpolate between steps */
-	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
-	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
-	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
-
-	/* compute refined position */
-#ifdef __SUBSURFACE__
-	if(!subsurface)
-#endif
-		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
-#ifdef __SUBSURFACE__
-	else
-		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
-#endif
-
-	/* compute face normal */
-	float3 Ng;
-	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
-		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
-	else
-		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
-
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, N) = Ng;
-
-	/* compute derivatives of P w.r.t. uv */
-#ifdef __DPDU__
-	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
-	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
-#endif
-
-	/* compute smooth normal */
-	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-		/* find attribute */
-		AttributeElement elem;
-		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
-		kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-		/* fetch vertex coordinates */
-		float3 normals[3], next_normals[3];
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
-
-		/* interpolate between steps */
-		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
-		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
-		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
-
-		/* interpolate between vertices */
-		float u = ccl_fetch(sd, u);
-		float v = ccl_fetch(sd, v);
-		float w = 1.0f - u - v;
-		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
-	}
-}
-
-/* Ray intersection. We simply compute the vertex positions at the given ray
- * time and do a ray intersection with the resulting triangle */
-
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, float time, uint visibility, int object, int triAddr)
-{
-	/* primitive index for vertex location lookup */
-	int prim = kernel_tex_fetch(__prim_index, triAddr);
-	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
-
-	/* get vertex locations for intersection */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-
-	/* ray-triangle intersection, unoptimized */
-	float t, u, v;
-
-	if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
-#ifdef __VISIBILITY_FLAG__
-		/* visibility flag test. we do it here under the assumption
-		 * that most triangles are culled by node flags */
-		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
-#endif
-		{
-			isect->t = t;
-			isect->u = u;
-			isect->v = v;
-			isect->prim = triAddr;
-			isect->object = object;
-			isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		
-			return true;
-		}
-	}
-
-	return false;
-}
-
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point. */
-
-#ifdef __SUBSURFACE__
-ccl_device_inline void motion_triangle_intersect_subsurface(
-        KernelGlobals *kg,
-        SubsurfaceIntersection *ss_isect,
-        float3 P,
-        float3 dir,
-        float time,
-        int object,
-        int triAddr,
-        float tmax,
-        uint *lcg_state,
-        int max_hits)
-{
-	/* primitive index for vertex location lookup */
-	int prim = kernel_tex_fetch(__prim_index, triAddr);
-	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
-
-	/* get vertex locations for intersection */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-
-	/* ray-triangle intersection, unoptimized */
-	float t, u, v;
-
-	if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
-		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
-			if(ss_isect->hits[i].t == t) {
-				return;
-			}
-		}
-
-		ss_isect->num_hits++;
-
-		int hit;
-
-		if(ss_isect->num_hits <= max_hits) {
-			hit = ss_isect->num_hits - 1;
-		}
-		else {
-			/* reservoir sampling: if we are at the maximum number of
-			 * hits, randomly replace element or skip it */
-			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
-
-			if(hit >= max_hits)
-				return;
-		}
-
-		/* record intersection */
-		Intersection *isect = &ss_isect->hits[hit];
-		isect->t = t;
-		isect->u = u;
-		isect->v = v;
-		isect->prim = triAddr;
-		isect->object = object;
-		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-
-		/* Record geometric normal. */
-		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
-		                                    verts[2] - verts[0]));
-	}
-}
-#endif
-
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
new file mode 100644
index 0000000..d57d74e
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Motion Triangle Primitive
+ *
+ * These are stored as regular triangles, plus extra positions and normals at
+ * times other than the frame center. Computing the triangle vertex positions
+ * or normals at a given ray time is a matter of interpolation of the two steps
+ * between which the ray time lies.
+ *
+ * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
+ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Refine triangle intersection to more precise hit point. For rays that travel
+ * far the precision is often not so good, this reintersects the primitive from
+ * a closer distance. Without __INTERSECTION_REFINE__ it is just ray->P + ray->D*t.
+ */
+
+ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
+                                                ShaderData *sd,
+                                                const Intersection *isect,
+                                                const Ray *ray,
+                                                float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#  else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_INVERSE_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	/* Compute refined intersection distance: signed distance along D from the moved P to the triangle plane. */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	/* Compute refined position. */
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#  else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else  /* __INTERSECTION_REFINE__ */
+	return P + D*t;
+#endif  /* __INTERSECTION_REFINE__ */
+}
+
+/* Same as motion_triangle_refine(), except that isect->t is assumed to be in
+ * object space for instancing: D is only normalized and t is used unchanged.
+ */
+
+#ifdef __SUBSURFACE__
+#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
+ccl_device_noinline
+#  else
+ccl_device_inline
+#  endif
+float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         const Intersection *isect,
+                                         const Ray *ray,
+                                         float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#  ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#    else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_INVERSE_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	/* Compute refined intersection distance. */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#    else
+		Transform tfm = object_fetch_transform(kg,
+		                                       isect->object,
+		                                       OBJECT_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#  else  /* __INTERSECTION_REFINE__ */
+	return P + D*t;
+#  endif  /* __INTERSECTION_REFINE__ */
+}
+#endif  /* __SUBSURFACE__ */
+
+
+/* Ray intersection. We simply compute the vertex positions at the given ray
+ * time and do a ray intersection with the resulting triangle. On an accepted
+ * hit the result is written to isect and true is returned, otherwise false.
+ */
+ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
+                                                 Intersection *isect,
+                                                 float3 P,
+                                                 float3 dir,
+                                                 float time,
+                                                 uint visibility,
+                                                 int object,
+                                                 int prim_addr)
+{
+	/* Primitive index for vertex location lookup. */
+	int prim = kernel_tex_fetch(__prim_index, prim_addr);
+	int fobject = (object == OBJECT_NONE)
+	                  ? kernel_tex_fetch(__prim_object, prim_addr)
+	                  : object;
+	/* Get vertex locations for intersection. */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+	/* Ray-triangle intersection, unoptimized. */
+	float t, u, v;
+	if(ray_triangle_intersect_uv(P,
+	                             dir,
+	                             isect->t,
+	                             verts[2], verts[0], verts[1],
+	                             &u, &v, &t))
+	{
+#ifdef __VISIBILITY_FLAG__
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+#endif
+		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_MOTION_TRIANGLE;
+			return true;
+		}
+	}
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and in case of
+ * multiple hits we pick a single random primitive as the intersection point.
+ */
+#ifdef __SUBSURFACE__
+ccl_device_inline void motion_triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        SubsurfaceIntersection *ss_isect,
+        float3 P,
+        float3 dir,
+        float time,
+        int object,
+        int prim_addr,
+        float tmax,
+        uint *lcg_state,
+        int max_hits)
+{
+	/* Primitive index for vertex location lookup. */
+	int prim = kernel_tex_fetch(__prim_index, prim_addr);
+	int fobject = (object == OBJECT_NONE)
+	                  ? kernel_tex_fetch(__prim_object, prim_addr)
+	                  : object;
+	/* Get vertex locations for intersection. */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+	/* Ray-triangle intersection, unoptimized. */
+	float t, u, v;
+	if(ray_triangle_intersect_uv(P,
+	                             dir,
+	                             tmax,
+	                             verts[2], verts[0], verts[1],
+	                             &u, &v, &t))
+	{
+		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
+			if(ss_isect->hits[i].t == t) {
+				return;  /* A hit with exactly this t is already stored. */
+			}
+		}
+		ss_isect->num_hits++;
+		int hit;
+		if(ss_isect->num_hits <= max_hits) {
+			hit = ss_isect->num_hits - 1;
+		}
+		else {
+			/* Reservoir sampling: if we are at the maximum number of
+			 * hits, randomly replace element or skip it.
+			 */
+			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
+
+			if(hit >= max_hits)
+				return;
+		}
+		/* Record intersection. */
+		Intersection *isect = &ss_isect->hits[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
+		isect->prim = prim_addr;
+		isect->object = object;
+		isect->type = PRIMITIVE_MOTION_TRIANGLE;
+		/* Record geometric normal. */
+		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
+		                                    verts[2] - verts[0]));
+	}
+}
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
new file mode 100644
index 0000000..c5dbc6a
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Motion Triangle Primitive
+ *
+ * These are stored as regular triangles, plus extra positions and normals at
+ * times other than the frame center. Computing the triangle vertex positions
+ * or normals at a given ray time is a matter of interpolation of the two steps
+ * between which the ray time lies.
+ *
+ * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
+ * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Setup of motion triangle specific parts of ShaderData, moved into this one
+ * function to more easily share computation of interpolated positions and
+ * normals */
+
+/* Writes shader, P, Ng, N and (with __DPDU__) dPdu/dPdv of the ShaderData. */
+ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
+                                                      ShaderData *sd, const
+                                                      Intersection *isect,
+                                                      const Ray *ray,
+                                                      bool subsurface)
+{
+	/* Get shader. */
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+	/* Get motion info. */
+	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
+	 * can we de-duplicate something here?
+	 */
+	int numsteps, numverts;
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+	/* Figure out which steps we need to fetch and their interpolation factor. */
+	int maxstep = numsteps*2;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;
+	/* Find attribute. */
+	AttributeElement elem;
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object),
+	                                   ATTR_STD_MOTION_VERTEX_POSITION,
+	                                   &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+	/* Fetch vertex coordinates. */
+	float3 verts[3], next_verts[3];
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
+	/* Interpolate between steps. */
+	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
+	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
+	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
+	/* Compute refined position. */
+#ifdef __SUBSURFACE__
+	if(subsurface) {
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg,
+		                                                     sd,
+		                                                     isect,
+		                                                     ray,
+		                                                     verts);
+	}
+	else
+#endif  /* __SUBSURFACE__ */
+	{
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+	}
+	/* Compute face normal. */
+	float3 Ng;
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED) {
+		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+	}
+	else {
+		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+	}
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;
+	/* Compute derivatives of P w.r.t. uv. */
+#ifdef __DPDU__
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+#endif  /* __DPDU__ */
+	/* Compute smooth normal, overriding the face normal stored in N above. */
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		/* Find attribute. */
+		AttributeElement elem;
+		int offset = find_attribute_motion(kg,
+		                                   ccl_fetch(sd, object),
+		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
+		                                   &elem);
+		kernel_assert(offset != ATTR_STD_NOT_FOUND);
+		/* Fetch vertex normals. */
+		float3 normals[3], next_normals[3];
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
+		/* Interpolate between steps. */
+		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
+		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
+		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
+		/* Interpolate between vertices. */
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
+		float w = 1.0f - u - v;
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 4f72c5b..9f0fe03 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -55,6 +55,21 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object
 	return tfm;
 }
 
+/* Lamp to world space transformation, or the matrix stored right after it when `inverse` is set (presumably its inverse). */
+
+ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+{
+	int offset = lamp*LIGHT_SIZE + (inverse? 8 : 5);  /* entries 5..7 of the lamp's record, or 8..10 when inverse */
+
+	Transform tfm;
+	tfm.x = kernel_tex_fetch(__light_data, offset + 0);
+	tfm.y = kernel_tex_fetch(__light_data, offset + 1);
+	tfm.z = kernel_tex_fetch(__light_data, offset + 2);
+	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);  /* last row is not fetched; fixed to (0, 0, 0, 1) */
+
+	return tfm;
+}
+
 /* Object to world space transformation for motion vectors */
 
 ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
@@ -147,10 +162,14 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
+		*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	}
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
-	*N = normalize(transform_direction_transposed(&tfm, *N));
+	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+		Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+		*N = normalize(transform_direction_transposed(&tfm, *N));
+	}
 #endif
 }
 
@@ -308,7 +327,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
+	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -376,15 +395,33 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 {
 	/* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */
-	float ooeps = 8.271806E-25f;
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	const ssef oopes(8.271806E-25f,8.271806E-25f,8.271806E-25f,0.0f);
+	const ssef mask = _mm_cmpgt_ps(fabs(dir), oopes);
+	const ssef signdir = signmsk(dir.m128) | oopes;
+#  ifndef __KERNEL_AVX__
+	ssef res = mask & ssef(dir);
+	res = _mm_or_ps(res,_mm_andnot_ps(mask, signdir));
+#  else
+	ssef res = _mm_blendv_ps(signdir, dir, mask);
+#  endif
+	return float3(res);
+#else  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
+	const float ooeps = 8.271806E-25f;
 	return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x),
 	                   (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y),
 	                   (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
+#endif  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
 }
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	return rcp(dir);
+#else
 	return 1.0f / dir;
+#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index dd53282..4db121d 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -59,21 +59,33 @@ void triangle_intersect_precalc(float3 dir,
                                 IsectPrecalc *isect_precalc)
 {
 	/* Calculate dimension where the ray direction is maximal. */
+#ifndef __KERNEL_SSE__
 	int kz = util_max_axis(make_float3(fabsf(dir.x),
 	                                   fabsf(dir.y),
 	                                   fabsf(dir.z)));
 	int kx = kz + 1; if(kx == 3) kx = 0;
 	int ky = kx + 1; if(ky == 3) ky = 0;
+#else
+	int kx, ky, kz;
+	/* Avoiding mispredicted branch on direction. */
+	kz = util_max_axis(fabs(dir));
+	static const char inc_xaxis[] = {1, 2, 0, 55};
+	static const char inc_yaxis[] = {2, 0, 1, 55};
+	kx = inc_xaxis[kz];
+	ky = inc_yaxis[kz];
+#endif
+
+	float dir_kz = IDX(dir, kz);
 
 	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(IDX(dir, kz) < 0.0f) {
+	if(dir_kz < 0.0f) {
 		int tmp = kx;
 		kx = ky;
 		ky = tmp;
 	}
 
 	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / IDX(dir, kz);
+	float inv_dir_z = 1.0f / dir_kz;
 	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
 	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
 	isect_precalc->Sz = inv_dir_z;
@@ -96,7 +108,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
                                           float3 P,
                                           uint visibility,
                                           int object,
-                                          int triAddr)
+                                          int prim_addr)
 {
 	const int kx = isect_precalc->kx;
 	const int ky = isect_precalc->ky;
@@ -106,7 +118,68 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	const float Sz = isect_precalc->Sz;
 
 	/* Calculate vertices relative to ray origin. */
-	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
+	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
+	const avxf avxf_P(P.m128, P.m128);
+
+	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
+	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
+
+	const avxf AB = tri_ab - avxf_P;
+	const avxf BC = tri_bc - avxf_P;
+
+	const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
+
+	const avxf AB_k = shuffle(AB, permute_mask);
+	const avxf BC_k = shuffle(BC, permute_mask);
+
+	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
+	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
+
+	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
+	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
+
+	const avxf Sxy(Sy, Sx, Sy, Sx);
+
+	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
+	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
+
+	float ABBC_kz_array[8];
+	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
+
+	const float A_kz = ABBC_kz_array[0];
+	const float B_kz = ABBC_kz_array[2];
+	const float C_kz = ABBC_kz_array[6];
+
+	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
+	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
+
+	const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
+
+	/* W           U                             V
+	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
+	 */
+	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */);
+
+	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask;
+
+	/* Calculate scaled barycentric coordinates. */
+	float WUVW_array[4];
+	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
+
+	const float W = WUVW_array[0];
+	const float U = WUVW_array[1];
+	const float V = WUVW_array[2];
+
+	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
+	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
+	                                               _mm256_setzero_ps(), 0));
+
+	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
+		return false;
+	}
+#else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
@@ -135,6 +208,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	{
 		return false;
 	}
+#endif
 
 	/* Calculate determinant. */
 	float det = U + V + W;
@@ -157,7 +231,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 #ifdef __VISIBILITY_FLAG__
 	/* visibility flag test. we do it here under the assumption
 	 * that most triangles are culled by node flags */
-	if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+	if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
 #endif
 	{
 #ifdef __KERNEL_CUDA__
@@ -167,7 +241,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 #endif
 		/* Normalize U, V, W, and T. */
 		const float inv_det = 1.0f / det;
-		isect->prim = triAddr;
+		isect->prim = prim_addr;
 		isect->object = object;
 		isect->type = PRIMITIVE_TRIANGLE;
 		isect->u = U * inv_det;
@@ -190,7 +264,7 @@ ccl_device_inline void triangle_intersect_subsurface(
         SubsurfaceIntersection *ss_isect,
         float3 P,
         int object,
-        int triAddr,
+        int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
@@ -203,10 +277,71 @@ ccl_device_inline void triangle_intersect_subsurface(
 	const float Sz = isect_precalc->Sz;
 
 	/* Calculate vertices relative to ray origin. */
-	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
+	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
+	const avxf avxf_P(P.m128, P.m128);
+
+	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
+	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
+
+	const avxf AB = tri_ab - avxf_P;
+	const avxf BC = tri_bc - avxf_P;
+
+	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
+
+	const avxf AB_k = shuffle(AB, permuteMask);
+	const avxf BC_k = shuffle(BC, permuteMask);
+
+	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
+	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
+
+	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
+	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
+
+	const avxf Sxy(Sy, Sx, Sy, Sx);
+
+	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
+	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
+
+	float ABBC_kz_array[8];
+	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
+
+	const float A_kz = ABBC_kz_array[0];
+	const float B_kz = ABBC_kz_array[2];
+	const float C_kz = ABBC_kz_array[6];
+
+	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
+	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
+
+	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
+
+	/* W           U                             V
+	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
+	 */
+	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
+
+	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
+
+	/* Calculate scaled barycentric coordinates. */
+	float WUVW_array[4];
+	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
+
+	const float W = WUVW_array[0];
+	const float U = WUVW_array[1];
+	const float V = WUVW_array[2];
+
+	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
+	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
+	                                               _mm256_setzero_ps(), 0));
+
+	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
+		return;
+	}
+#else
 	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
 	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
 	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
@@ -233,6 +368,7 @@ ccl_device_inline void triangle_intersect_subsurface(
 	{
 		return;
 	}
+#endif
 
 	/* Calculate determinant. */
 	float det = U + V + W;
@@ -279,7 +415,7 @@ ccl_device_inline void triangle_intersect_subsurface(
 
 	/* record intersection */
 	Intersection *isect = &ss_isect->hits[hit];
-	isect->prim = triAddr;
+	isect->prim = prim_addr;
 	isect->object = object;
 	isect->type = PRIMITIVE_TRIANGLE;
 	isect->u = U * inv_det;
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index fd97a63..03724c9 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Return position normalized to 0..1 in mesh bounds */
 
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300
+#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 {
 	float4 r;
@@ -42,7 +42,7 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 	}
 	return r;
 }
-#endif  /* __KERNEL_GPU__ */
+#endif  /* __KERNEL_CUDA__ */
 
 ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
                                                     const ShaderData *sd,
@@ -64,8 +64,8 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
 
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, sd->P);
-#ifdef __KERNEL_GPU__
+	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+#ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
 	float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
@@ -73,6 +73,8 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 #  else
 	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
 #  endif
+#elif defined(__KERNEL_OPENCL__)
+	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
@@ -89,14 +91,16 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, sd->P);
-#ifdef __KERNEL_GPU__
+	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+#ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
 	float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
 #  else
 	float4 r = volume_image_texture_3d(desc.offset, P.x, P.y, P.z);
 #  endif
+#elif defined(__KERNEL_OPENCL__)
+	float4 r = kernel_tex_image_interp_3d(kg, desc.offset, P.x, P.y, P.z);
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 2a21524..6c3ee6b 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -48,10 +48,10 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 			eval->scatter = value;
 	}
 	else
-		eval->diffuse = value;
-#else
-	eval->diffuse = value;
 #endif
+	{
+		eval->diffuse = value;
+	}
 }
 
 ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
@@ -72,10 +72,10 @@ ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3
 		/* skipping transparent, this function is used by for eval(), will be zero then */
 	}
 	else
-		eval->diffuse += value;
-#else
-	eval->diffuse += value;
 #endif
+	{
+		eval->diffuse += value;
+	}
 }
 
 ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
@@ -90,13 +90,32 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 			&& is_zero(eval->scatter);
 	}
 	else
+#endif
+	{
 		return is_zero(eval->diffuse);
-#else
-	return is_zero(eval->diffuse);
+	}
+}
+
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+{
+#ifdef __PASSES__
+	if(eval->use_light_pass) {  /* light passes: scale every component listed below by the scalar */
+		eval->diffuse *= value;
+		eval->glossy *= value;
+		eval->transmission *= value;
+		eval->subsurface *= value;
+		eval->scatter *= value;
+
+		/* Skipping transparent: this function is used for eval(), where transparent will be zero. */
+	}
+	else
 #endif
+	{
+		eval->diffuse *= value;  /* without light passes only diffuse is scaled */
+	}
 }
 
-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
+ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,6 +134,17 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 #endif
 }
 
+ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+{
+#ifdef __PASSES__
+	if(eval->use_light_pass) {  /* light passes: add up the components; transparent is not part of the sum */
+		return eval->diffuse + eval->glossy + eval->transmission + eval->subsurface + eval->scatter;
+	}
+	else
+#endif
+	return eval->diffuse;  /* without light passes (or without __PASSES__) only diffuse is returned */
+}
+
 /* Path Radiance
  *
  * We accumulate different render passes separately. After summing at the end
@@ -164,10 +194,10 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->mist = 0.0f;
 	}
 	else
-		L->emission = make_float3(0.0f, 0.0f, 0.0f);
-#else
-	L->emission = make_float3(0.0f, 0.0f, 0.0f);
 #endif
+	{
+		L->emission = make_float3(0.0f, 0.0f, 0.0f);
+	}
 }
 
 ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -193,16 +223,15 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
-			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent +
-						  bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf;
+			float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
 			*throughput *= sum;
 		}
 	}
 	else
-		*throughput *= bsdf_eval->diffuse*inverse_pdf;
-#else
-	*throughput *= bsdf_eval->diffuse*inverse_pdf;
 #endif
+	{
+		*throughput *= bsdf_eval->diffuse*inverse_pdf;
+	}
 }
 
 ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
@@ -217,10 +246,10 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 			L->indirect += throughput*value;
 	}
 	else
-		L->emission += throughput*value;
-#else
-	L->emission += throughput*value;
 #endif
+	{
+		L->emission += throughput*value;
+	}
 }
 
 ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
@@ -238,10 +267,10 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput
 		}
 	}
 	else
-		L->emission += throughput*bsdf*ao;
-#else
-	L->emission += throughput*bsdf*ao;
 #endif
+	{
+		L->emission += throughput*bsdf*ao;
+	}
 }
 
 ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
@@ -264,15 +293,14 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 		}
 		else {
 			/* indirectly visible lighting after BSDF bounce */
-			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter;
-			L->indirect += throughput*sum*shadow;
+			L->indirect += throughput*bsdf_eval_sum(bsdf_eval)*shadow;
 		}
 	}
 	else
-		L->emission += throughput*bsdf_eval->diffuse*shadow;
-#else
-	L->emission += throughput*bsdf_eval->diffuse*shadow;
 #endif
+	{
+		L->emission += throughput*bsdf_eval->diffuse*shadow;
+	}
 }
 
 ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
@@ -287,10 +315,10 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th
 			L->indirect += throughput*value;
 	}
 	else
-		L->emission += throughput*value;
-#else
-	L->emission += throughput*value;
 #endif
+	{
+		L->emission += throughput*value;
+	}
 }
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -433,10 +461,10 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 
 	/* No Light Passes */
 	else
-		L_sum = L->emission;
-#else
-	L_sum = L->emission;
 #endif
+	{
+		L_sum = L->emission;
+	}
 
 	/* Reject invalid value */
 	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
@@ -465,14 +493,12 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 	L->indirect_subsurface += L_sample->indirect_subsurface*fac;
 	L->indirect_scatter += L_sample->indirect_scatter*fac;
 
-	L->emission += L_sample->emission*fac;
 	L->background += L_sample->background*fac;
 	L->ao += L_sample->ao*fac;
 	L->shadow += L_sample->shadow*fac;
 	L->mist += L_sample->mist*fac;
-#else
-	L->emission += L_sample->emission * fac;
 #endif
+	L->emission += L_sample->emission * fac;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index fd9207f..c32ac6c 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -63,7 +63,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
 		}
 
 		/* sample emission */
@@ -320,7 +320,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	                         P, Ng, Ng,
 	                         shader, object, prim,
 	                         u, v, 1.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED));
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         LAMP_NONE);
 	sd.I = sd.N;
 
 	/* update differentials */
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 7b30df0..9d1f3bd 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,6 +71,20 @@ template<typename T> struct texture  {
 		return data[index];
 	}
 
+#ifdef __KERNEL_AVX__
+	/* Reads 256 bits but indexes in blocks of 128 bits to maintain
+	 * compatibility with existing indices and data structures.
+	 */
+	ccl_always_inline avxf fetch_avxf(const int index)
+	{
+		kernel_assert(index >= 0 && (index+1) < width);  /* the load also covers element index+1 */
+		ssef *ssefData = (ssef*)data;
+		ssef *ssefNodeData = &ssefData[index];
+		return _mm256_loadu_ps((float *)ssefNodeData);  /* unaligned load of 8 floats starting at element index */
+	}
+
+#endif
+
 #ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
index 24d6458..5647bba 100644
--- a/intern/cycles/kernel/kernel_debug.h
+++ b/intern/cycles/kernel/kernel_debug.h
@@ -18,8 +18,9 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void debug_data_init(DebugData *debug_data)
 {
-	debug_data->num_bvh_traversal_steps = 0;
+	debug_data->num_bvh_traversed_nodes = 0;
 	debug_data->num_bvh_traversed_instances = 0;
+	debug_data->num_bvh_intersections = 0;
 	debug_data->num_ray_bounces = 0;
 }
 
@@ -30,16 +31,21 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
                                                  int sample)
 {
 	int flag = kernel_data.film.pass_flag;
-	if(flag & PASS_BVH_TRAVERSAL_STEPS) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversal_steps,
+	if(flag & PASS_BVH_TRAVERSED_NODES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
 		                        sample,
-		                        debug_data->num_bvh_traversal_steps);
+		                        debug_data->num_bvh_traversed_nodes);
 	}
 	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
 		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
 		                        sample,
 		                        debug_data->num_bvh_traversed_instances);
 	}
+	if(flag & PASS_BVH_INTERSECTIONS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
+		                        sample,
+		                        debug_data->num_bvh_intersections);
+	}
 	if(flag & PASS_RAY_BOUNCES) {
 		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
 		                        sample,
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 457887f..8c7c651 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -29,6 +29,8 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	/* setup shading at emitter */
 	float3 eval;
 
+	int shader_flag = kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE);
+
 #ifdef __BACKGROUND_MIS__
 	if(ls->type == LIGHT_BACKGROUND) {
 		Ray ray;
@@ -49,11 +51,21 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	}
 	else
 #endif
+	if(shader_flag & SD_HAS_CONSTANT_EMISSION)
+	{
+		eval.x = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 2));
+		eval.y = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 3));
+		eval.z = __int_as_float(kernel_tex_fetch(__shader_flag, (ls->shader & SHADER_MASK)*SHADER_SIZE + 4));
+		if((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
+			ls->Ng = -ls->Ng;
+		}
+	}
+	else
 	{
 		shader_setup_from_sample(kg, emission_sd,
 		                         ls->P, ls->Ng, I,
 		                         ls->shader, ls->object, ls->prim,
-		                         ls->u, ls->v, t, time, false);
+		                         ls->u, ls->v, t, time, false, ls->lamp);
 
 		ls->Ng = ccl_fetch(emission_sd, Ng);
 
@@ -82,7 +94,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
                                          ccl_addr_space PathState *state,
                                          Ray *ray,
                                          BsdfEval *eval,
-                                         bool *is_lamp)
+                                         bool *is_lamp,
+                                         float rand_terminate)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@@ -122,7 +135,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 #endif
 
-	bsdf_eval_mul(eval, light_eval/ls->pdf);
+	bsdf_eval_mul3(eval, light_eval/ls->pdf);
 
 #ifdef __PASSES__
 	/* use visibility flag to skip lights */
@@ -143,6 +156,16 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+		if(probability < 1.0f) {
+			if(rand_terminate >= probability) {
+				return false;
+			}
+			bsdf_eval_mul(eval, 1.0f / probability);
+		}
+	}
+
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
 		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
new file mode 100644
index 0000000..0352c58
--- /dev/null
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright 2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/* For OpenCL all images are packed in a single array, and we do manual lookup
+ * and interpolation. */
+
+ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
+{
+	/* The image id range selects which packed array holds the texel. Float4: returned as stored. */
+	if(id < TEX_START_BYTE4_OPENCL) {
+		return kernel_tex_fetch(__tex_image_float4_packed, offset);
+	}
+	/* Byte4: each channel is scaled by 1/255. */
+	else if(id < TEX_START_FLOAT_OPENCL) {
+		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
+		float f = 1.0f/255.0f;
+		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
+	}
+	/* Float: the value is replicated to x, y, z and w is set to 1. */
+	else if(id < TEX_START_BYTE_OPENCL) {
+		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
+		return make_float4(f, f, f, 1.0f);
+	}
+	/* Byte: scaled by 1/255, replicated to x, y, z and w is set to 1. */
+	else {
+		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
+		float f = r * (1.0f/255.0f);
+		return make_float4(f, f, f, 1.0f);
+	}
+}
+
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+	x %= width;  /* the remainder keeps the sign of x, so it can be negative here */
+	if(x < 0)  /* shift negative remainders up so the result lies in [0, width) */
+		x += width;
+	return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+	return clamp(x, 0, width-1);  /* limit the index to the range 0 .. width-1 */
+}
+
+ccl_device_inline float svm_image_texture_frac(float x, int *ix)
+{
+	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);  /* one lower for negative x; assumes float_to_int truncates toward zero - TODO confirm */
+	*ix = i;  /* integer part is handed back through ix */
+	return x - (float)i;  /* remainder of x relative to that integer part */
+}
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+{
+	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);  /* info entries are indexed by id*2; this one holds size, offset and option bits */
+	uint width = info.x;
+	uint height = info.y;
+	uint offset = info.z;  /* added to every texel index passed to svm_image_texture_read */
+
+	/* Image options from info.w: bit 0 = closest interpolation (else linear), bit 1 = repeat, bit 2 = extend, neither = clip. */
+	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+	uint extension;
+	if(info.w & (1 << 1))
+		extension = EXTENSION_REPEAT;
+	else if(info.w & (1 << 2))
+		extension = EXTENSION_EXTEND;
+	else
+		extension = EXTENSION_CLIP;
+
+	float4 r;
+	int ix, iy, nix, niy;
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		svm_image_texture_frac(x*width, &ix);  /* return value ignored: only the integer texel index is used */
+		svm_image_texture_frac(y*height, &iy);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			/* Fall through: coordinates inside the clip range are handled like EXTENSION_EXTEND. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+		}
+
+		r = svm_image_texture_read(kg, id, offset + ix + iy*width);
+	}
+	else { /* INTERPOLATION_LINEAR */
+		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);  /* tx, ty are the blend weights towards the next texel */
+		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+
+			nix = svm_image_texture_wrap_periodic(ix+1, width);
+			niy = svm_image_texture_wrap_periodic(iy+1, height);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				}
+			}
+			nix = svm_image_texture_wrap_clamp(ix+1, width);  /* neighbours use the unclamped ix/iy, so compute them first */
+			niy = svm_image_texture_wrap_clamp(iy+1, height);
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+		}
+
+		r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width);  /* weighted sum of the four surrounding texels */
+		r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width);
+		r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
+		r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
+	}
+
+	return r;
+}
+
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
+{	/* Sample packed 3D image `id` at (x, y, z): one texel for closest interpolation, otherwise a trilinear blend of eight texels. */
+	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);  /* Image description is read from entry id*2 of the packed info table. */
+	uint width = info.x;
+	uint height = info.y;
+	uint offset = info.z;  /* Added to every texel index handed to svm_image_texture_read(). */
+	uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;  /* Depth comes from the following entry, id*2+1. */
+
+	/* Image options are bit flags in info.w: bit 0 selects closest (else linear) interpolation; bit 1 selects repeat, bit 2 extend, neither clip. */
+	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+	uint extension;
+	if(info.w & (1 << 1))  /* Bit 1 wins over bit 2 when both are set. */
+		extension = EXTENSION_REPEAT;
+	else if(info.w & (1 << 2))
+		extension = EXTENSION_EXTEND;
+	else
+		extension = EXTENSION_CLIP;
+
+	float4 r;
+	int ix, iy, iz, nix, niy, niz;  /* (ix, iy, iz): base texel; (nix, niy, niz): neighbour texel, used by the linear path only. */
+	if(interpolation == INTERPOLATION_CLOSEST) {
+		svm_image_texture_frac(x*width, &ix);  /* Only ix (passed by pointer) is used afterwards; the return value is discarded. */
+		svm_image_texture_frac(y*height, &iy);
+		svm_image_texture_frac(z*depth, &iz);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			iz = svm_image_texture_wrap_periodic(iz, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP) {
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				 {
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Outside [0, 1] in clip mode: all-zero result. */
+				}
+			}
+			/* Fall through: in-range clip lookups share the clamping below with EXTENSION_EXTEND. */
+			/* EXTENSION_EXTEND */
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+			iz = svm_image_texture_wrap_clamp(iz, depth);
+		}
+		r = svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);  /* Texel index: ix, plus iy rows of `width`, plus iz slices of width*height. */
+	}
+	else { /* INTERPOLATION_LINEAR */
+		float tx = svm_image_texture_frac(x*(float)width - 0.5f, &ix);  /* tx, ty, tz: blend weights toward the (nix, niy, niz) neighbours. */
+		float ty = svm_image_texture_frac(y*(float)height - 0.5f, &iy);
+		float tz = svm_image_texture_frac(z*(float)depth - 0.5f, &iz);
+
+		if(extension == EXTENSION_REPEAT) {
+			ix = svm_image_texture_wrap_periodic(ix, width);
+			iy = svm_image_texture_wrap_periodic(iy, height);
+			iz = svm_image_texture_wrap_periodic(iz, depth);
+
+			nix = svm_image_texture_wrap_periodic(ix+1, width);  /* Derived from the already wrapped ix/iy/iz. */
+			niy = svm_image_texture_wrap_periodic(iy+1, height);
+			niz = svm_image_texture_wrap_periodic(iz+1, depth);
+		}
+		else {
+			if(extension == EXTENSION_CLIP)  /* No braces: this guards only the nested if statement below. */
+				if(x < 0.0f || y < 0.0f || z < 0.0f ||
+				   x > 1.0f || y > 1.0f || z > 1.0f)
+				{
+					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* Outside [0, 1] in clip mode: all-zero result. */
+				}
+			/* Fall through: in-range clip lookups share the clamping below with EXTENSION_EXTEND. */
+			/* EXTENSION_EXTEND: neighbours are taken from the unclamped indices, which are clamped afterwards. */
+			nix = svm_image_texture_wrap_clamp(ix+1, width);
+			niy = svm_image_texture_wrap_clamp(iy+1, height);
+			niz = svm_image_texture_wrap_clamp(iz+1, depth);
+
+			ix = svm_image_texture_wrap_clamp(ix, width);
+			iy = svm_image_texture_wrap_clamp(iy, height);
+			iz = svm_image_texture_wrap_clamp(iz, depth);
+		}
+
+		r  = (1.0f - tz)*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + iz*width*height);  /* First four terms: the iz slice, weighted by (1 - tz). */
+		r += (1.0f - tz)*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + iz*width*height);
+		r += (1.0f - tz)*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + iz*width*height);
+		r += (1.0f - tz)*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + iz*width*height);
+
+		r += tz*(1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width + niz*width*height);  /* Last four terms: the niz slice, weighted by tz. */
+		r += tz*(1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width + niz*width*height);
+		r += tz*ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width + niz*width*height);
+		r += tz*ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width + niz*width*height);
+
+	}
+
+	return r;
+}
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index aec7bc3..6754613 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -149,6 +149,15 @@ ccl_device_inline uint cmj_hash(uint i, uint p)
 	return i;
 }
 
+ccl_device_inline uint cmj_hash_simple(uint i, uint p)
+{	/* Four-step integer mix of `i` with `p`; returns the mixed value. */
+	i = (i ^ 61) ^ p;  /* Fold p into i. */
+	i += i << 3;  /* Add a left-shifted copy of i to itself. */
+	i ^= i >> 4;  /* Fold higher bits back into the lower ones. */
+	i *= 0x27d4eb2d;  /* Final multiply by an odd constant. */
+	return i;
+}
+
 ccl_device_inline float cmj_randfloat(uint i, uint p)
 {
 	return cmj_hash(i, p) * (1.0f / 4294967808.0f);
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 1e99f2c..d4cc36d 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -297,7 +297,7 @@ ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
 		float3 axisu = make_float3(data1.y, data1.z, data1.w);
 		float3 axisv = make_float3(data2.y, data2.z, data2.w);
 
-		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL))
+		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL))
 			continue;
 
 		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
@@ -510,7 +510,7 @@ ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3
 	return t*t/cos_pi;
 }
 
-ccl_device_inline void lamp_light_sample(KernelGlobals *kg,
+ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
                                          int lamp,
                                          float randu, float randv,
                                          float3 P,
@@ -581,7 +581,14 @@ ccl_device_inline void lamp_light_sample(KernelGlobals *kg,
 				/* spot light attenuation */
 				float4 data2 = kernel_tex_fetch(__light_data, lamp*LIGHT_SIZE + 2);
 				ls->eval_fac *= spot_light_attenuation(data1, data2, ls);
+				if(ls->eval_fac == 0.0f) {
+					return false;
+				}
 			}
+			float2 uv = map_to_sphere(ls->Ng);
+			ls->u = uv.x;
+			ls->v = uv.y;
+
 			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
@@ -593,23 +600,31 @@ ccl_device_inline void lamp_light_sample(KernelGlobals *kg,
 			float3 axisv = make_float3(data2.y, data2.z, data2.w);
 			float3 D = make_float3(data3.y, data3.z, data3.w);
 
+			if(dot(ls->P - P, D) > 0.0f) {
+				return false;
+			}
+
+			float3 inplane = ls->P;
 			ls->pdf = area_light_sample(P, &ls->P,
 			                          axisu, axisv,
 			                          randu, randv,
 			                          true);
 
+			inplane = ls->P - inplane;
+			ls->u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu)) + 0.5f;
+			ls->v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv)) + 0.5f;
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);
 
 			float invarea = data2.x;
 			ls->eval_fac = 0.25f*invarea;
-
-			if(dot(ls->D, D) > 0.0f)
-				ls->pdf = 0.0f;
 		}
 
 		ls->eval_fac *= kernel_data.integrator.inv_pdf_lights;
 	}
+
+	return (ls->pdf > 0.0f);
 }
 
 ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
@@ -700,6 +715,9 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+		float2 uv = map_to_sphere(ls->Ng);
+		ls->u = uv.x;
+		ls->v = uv.y;
 
 		/* compute pdf */
 		if(ls->t != FLT_MAX)
@@ -724,8 +742,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 		float3 light_P = make_float3(data0.y, data0.z, data0.w);
 
-		if(!ray_quad_intersect(P, D, 0.0f, t,
-		                       light_P, axisu, axisv, Ng, &ls->P, &ls->t))
+		if(!ray_quad_intersect(P, D, 0.0f, t, light_P,
+		                       axisu, axisv, Ng,
+		                       &ls->P, &ls->t,
+		                       &ls->u, &ls->v))
 		{
 			return false;
 		}
@@ -836,7 +856,7 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i
 	return (bounce > __float_as_int(data4.x));
 }
 
-ccl_device_noinline void light_sample(KernelGlobals *kg,
+ccl_device_noinline bool light_sample(KernelGlobals *kg,
                                       float randt,
                                       float randu,
                                       float randv,
@@ -857,21 +877,20 @@ ccl_device_noinline void light_sample(KernelGlobals *kg,
 		int shader_flag = __float_as_int(l.z);
 
 		triangle_light_sample(kg, prim, object, randu, randv, time, ls);
-
 		/* compute incoming direction, distance and pdf */
 		ls->D = normalize_len(ls->P - P, &ls->t);
 		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		ls->shader |= shader_flag;
+		return (ls->pdf > 0.0f);
 	}
 	else {
 		int lamp = -prim-1;
 
 		if(UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
-			ls->pdf = 0.0f;
-			return;
+			return false;
 		}
 
-		lamp_light_sample(kg, lamp, randu, randv, P, ls);
+		return lamp_light_sample(kg, lamp, randu, randv, P, ls);
 	}
 }
 
@@ -882,4 +901,3 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 20cf3fa..7aec47e 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -20,7 +20,7 @@ ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sam
 {
 	ccl_global float *buf = buffer;
 #if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
-	atomic_add_float(buf, value);
+	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf = (sample == 0)? value: *buf + value;
 #endif // __SPLIT_KERNEL__ && __WORK_STEALING__
@@ -33,9 +33,9 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
 
-	atomic_add_float(buf_x, value.x);
-	atomic_add_float(buf_y, value.y);
-	atomic_add_float(buf_z, value.z);
+	atomic_add_and_fetch_float(buf_x, value.x);
+	atomic_add_and_fetch_float(buf_y, value.y);
+	atomic_add_and_fetch_float(buf_z, value.z);
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
@@ -50,10 +50,10 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 	ccl_global float *buf_z = buffer + 2;
 	ccl_global float *buf_w = buffer + 3;
 
-	atomic_add_float(buf_x, value.x);
-	atomic_add_float(buf_y, value.y);
-	atomic_add_float(buf_z, value.z);
-	atomic_add_float(buf_w, value.w);
+	atomic_add_and_fetch_float(buf_x, value.x);
+	atomic_add_and_fetch_float(buf_y, value.y);
+	atomic_add_and_fetch_float(buf_z, value.z);
+	atomic_add_and_fetch_float(buf_w, value.w);
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 7558fb9..e25f259 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -53,6 +53,47 @@
 
 CCL_NAMESPACE_BEGIN
 
+ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
+                                        ShaderData *sd,
+                                        ShaderData *emission_sd,
+                                        PathRadiance *L,
+                                        PathState *state,
+                                        RNG *rng,
+                                        float3 throughput,
+                                        float3 ao_alpha)  /* ao_alpha is forwarded unchanged to path_radiance_accum_ao(). */
+{	/* Ambient occlusion: sample one direction around ao_N, trace a ray of length ao_distance, and accumulate AO into L when it is not blocked. */
+	/* todo: solve correlation */
+	float bsdf_u, bsdf_v;
+
+	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);  /* 2D random sample for the AO direction, requested with PRNG_BSDF_U. */
+
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);  /* ao_N is passed by pointer and used below as the sampling normal. */
+	float3 ao_D;
+	float ao_pdf;
+
+	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);  /* Outputs: direction ao_D and its pdf ao_pdf. */
+
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {  /* Skip directions not on the Ng side of the surface, and zero-pdf samples. */
+		Ray light_ray;
+		float3 ao_shadow;
+
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));  /* Ray origin: P passed through ray_offset() together with Ng. */
+		light_ray.D = ao_D;
+		light_ray.t = kernel_data.background.ao_distance;  /* Ray length is limited to the AO distance. */
+#ifdef __OBJECT_MOTION__
+		light_ray.time = ccl_fetch(sd, time);
+#endif  /* __OBJECT_MOTION__ */
+		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.dD = differential3_zero();
+
+		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {  /* ao_shadow is passed on to the accumulation below. */
+			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
@@ -97,9 +138,13 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				                             state->bounce);
 			}
 		}
-#endif
+#endif  /* __LAMP_MIS__ */
 
 #ifdef __VOLUME__
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state->volume_stack);
+		}
 		/* volume attenuation, emission, scatter */
 		if(state->volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = *ray;
@@ -198,7 +243,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				}
 			}
 			else
-#  endif
+#  endif  /* __VOLUME_DECOUPLED__ */
 			{
 				/* integrate along volume segment with distance sampling */
 				VolumeIntegrateResult result = kernel_volume_integrate(
@@ -230,10 +275,10 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 						break;
 					}
 				}
-#  endif
+#  endif  /* __VOLUME_SCATTER__ */
 			}
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 #ifdef __BACKGROUND__
@@ -243,7 +288,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                               throughput,
 			                               L_background,
 			                               state->bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -257,7 +302,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
 		shader_merge_closures(sd);
-#endif
+#endif  /* __BRANCHED_PATH__ */
 
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
@@ -280,7 +325,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                              state->ray_pdf);
 			path_radiance_accum_emission(L, throughput, emission, state->bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
@@ -305,42 +350,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			float3 ao_D;
-			float ao_pdf;
-			float3 ao_alpha = make_float3(0.0f, 0.0f, 0.0f);
-
-			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-			if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-				Ray light_ray;
-				float3 ao_shadow;
-
-				light_ray.P = ray_offset(sd->P, sd->Ng);
-				light_ray.D = ao_D;
-				light_ray.t = kernel_data.background.ao_distance;
-#  ifdef __OBJECT_MOTION__
-				light_ray.time = sd->time;
-#  endif
-				light_ray.dP = sd->dP;
-				light_ray.dD = differential3_zero();
-
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-					path_radiance_accum_ao(L,
-					                       throughput,
-					                       ao_alpha,
-					                       ao_bsdf,
-					                       ao_shadow,
-					                       state->bounce);
-				}
-			}
+			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -372,7 +384,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				                        false);
 			}
 		}
-#endif
+#endif  /* __SUBSURFACE__ */
 
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
@@ -387,53 +399,13 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			                                           L,
 			                                           all);
 		}
-#endif
+#endif  /* defined(__EMISSION__) && defined(__BRANCHED_PATH__) */
 
 		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
 			break;
 	}
 }
 
-ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
-                                        ShaderData *sd,
-                                        ShaderData *emission_sd,
-                                        PathRadiance *L,
-                                        PathState *state,
-                                        RNG *rng,
-                                        float3 throughput)
-{
-	/* todo: solve correlation */
-	float bsdf_u, bsdf_v;
-
-	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-	float ao_factor = kernel_data.background.ao_factor;
-	float3 ao_N;
-	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-	float3 ao_D;
-	float ao_pdf;
-	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
-	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
-		Ray light_ray;
-		float3 ao_shadow;
-
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
-		light_ray.D = ao_D;
-		light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
-#endif
-		light_ray.dP = ccl_fetch(sd, dP);
-		light_ray.dD = differential3_zero();
-
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
-			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
-	}
-}
-
 #ifdef __SUBSURFACE__
 #  ifndef __KERNEL_CUDA__
 ccl_device
@@ -481,7 +453,7 @@ bool kernel_path_subsurface_scatter(
 		ss_indirect->need_update_volume_stack =
 		        kernel_data.integrator.use_volumes &&
 		        ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif
+#  endif  /* __VOLUME__ */
 
 		/* compute lighting with the BSDF closure */
 		for(int hit = 0; hit < num_hits; hit++) {
@@ -524,7 +496,7 @@ bool kernel_path_subsurface_scatter(
 			{
 #  ifdef __LAMP_MIS__
 				hit_state->ray_t = 0.0f;
-#  endif
+#  endif  /* __LAMP_MIS__ */
 
 #  ifdef __VOLUME__
 				if(ss_indirect->need_update_volume_stack) {
@@ -539,7 +511,7 @@ bool kernel_path_subsurface_scatter(
 					    &volume_ray,
 					    hit_state->volume_stack);
 				}
-#  endif
+#  endif  /* __VOLUME__ */
 				path_radiance_reset_indirect(L);
 				ss_indirect->num_rays++;
 			}
@@ -625,14 +597,14 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
 	debug_data_init(&debug_data);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
 	kernel_path_subsurface_init_indirect(&ss_indirect);
 
 	for(;;) {
-#endif
+#endif  /* __SUBSURFACE__ */
 
 	/* path iteration */
 	for(;;) {
@@ -658,15 +630,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
 		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
 		if(state.flag & PATH_RAY_CAMERA) {
-			debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+			debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
 			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+			debug_data.num_bvh_intersections += isect.num_intersections;
 		}
 		debug_data.num_ray_bounces++;
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
@@ -687,9 +660,13 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
 				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __LAMP_MIS__ */
 
 #ifdef __VOLUME__
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state.volume_stack);
+		}
 		/* volume attenuation, emission, scatter */
 		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
@@ -751,7 +728,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 				}
 			}
 			else
-#  endif
+#  endif  /* __VOLUME_DECOUPLED__ */
 			{
 				/* integrate along volume segment with distance sampling */
 				VolumeIntegrateResult result = kernel_volume_integrate(
@@ -768,10 +745,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 					else
 						break;
 				}
-#  endif
+#  endif  /* __VOLUME_SCATTER__ */
 			}
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 			/* eval background shader if nothing hit */
@@ -780,7 +757,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 #ifdef __PASSES__
 				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
+#endif  /* __PASSES__ */
 					break;
 			}
 
@@ -788,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -816,7 +793,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(sd.flag & SD_HOLDOUT_MASK)
 				break;
 		}
-#endif
+#endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
 		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
@@ -839,7 +816,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
 			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
@@ -851,7 +828,6 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		}
 		else if(probability != 1.0f) {
 			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
-
 			if(terminate >= probability)
 				break;
 
@@ -861,9 +837,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
@@ -918,7 +894,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index cdb07db..72a8d98 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -51,7 +51,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 			light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
 			light_ray.time = ccl_fetch(sd, time);
-#endif
+#endif  /* __OBJECT_MOTION__ */
 			light_ray.dP = ccl_fetch(sd, dP);
 			light_ray.dD = differential3_zero();
 
@@ -169,7 +169,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
 			                                ccl_fetch(sd, flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#endif
+#endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
 			for(int hit = 0; hit < num_hits; hit++) {
@@ -200,7 +200,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					    &volume_ray,
 					    hit_state.volume_stack);
 				}
-#endif
+#endif  /* __VOLUME__ */
 
 #ifdef __EMISSION__
 				/* direct light */
@@ -217,7 +217,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					        L,
 					        all);
 				}
-#endif
+#endif  /* __EMISSION__ */
 
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
@@ -234,7 +234,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 		}
 	}
 }
-#endif
+#endif  /* __SUBSURFACE__ */
 
 ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
 {
@@ -256,7 +256,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
 	debug_data_init(&debug_data);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
@@ -285,15 +285,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
 		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+#endif  /* __HAIR__ */
 
 #ifdef __KERNEL_DEBUG__
-		debug_data.num_bvh_traversal_steps += isect.num_traversal_steps;
+		debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
 		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data.num_bvh_intersections += isect.num_intersections;
 		debug_data.num_ray_bounces++;
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __VOLUME__
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state.volume_stack);
+		}
 		/* volume attenuation, emission, scatter */
 		if(state.volume_stack[0].shader != SHADER_NONE) {
 			Ray volume_ray = ray;
@@ -432,14 +437,14 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 						path_radiance_reset_indirect(&L);
 					}
 				}
-#endif
+#endif  /* __VOLUME_SCATTER__ */
 			}
 
 			/* todo: avoid this calculation using decoupled ray marching */
 			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
-#endif
+#endif  /* __VOLUME_DECOUPLED__ */
 		}
-#endif
+#endif  /* __VOLUME__ */
 
 		if(!hit) {
 			/* eval background shader if nothing hit */
@@ -448,7 +453,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __PASSES__
 				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
+#endif  /* __PASSES__ */
 					break;
 			}
 
@@ -456,7 +461,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
-#endif
+#endif  /* __BACKGROUND__ */
 
 			break;
 		}
@@ -484,7 +489,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			if(sd.flag & SD_HOLDOUT_MASK)
 				break;
 		}
-#endif
+#endif  /* __HOLDOUT__ */
 
 		/* holdout mask objects do not write data passes */
 		kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
@@ -495,7 +500,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
 			path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
-#endif
+#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
@@ -522,7 +527,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
 			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
 		}
-#endif
+#endif  /* __AO__ */
 
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object */
@@ -530,7 +535,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
 			                                        &L, &state, rng, &ray, throughput);
 		}
-#endif
+#endif  /* __SUBSURFACE__ */
 
 		if(!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 			PathState hit_state = state;
@@ -542,7 +547,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 				kernel_branched_path_surface_connect_light(kg, rng,
 					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
 			}
-#endif
+#endif  /* __EMISSION__ */
 
 			/* indirect light */
 			kernel_branched_path_surface_indirect_light(kg, rng,
@@ -567,12 +572,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		ray.dP = sd.dP;
 		ray.dD.dx = -sd.dI.dx;
 		ray.dD.dy = -sd.dI.dy;
-#endif
+#endif  /* __RAY_DIFFERENTIALS__ */
 
 #ifdef __VOLUME__
 		/* enter/exit volume */
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-#endif
+#endif  /* __VOLUME__ */
 	}
 
 	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
@@ -581,7 +586,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 
 #ifdef __KERNEL_DEBUG__
 	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif
+#endif  /* __KERNEL_DEBUG__ */
 
 	return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
 }
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 1912dfa..7b90355 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "util_hash.h"
+
 CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
@@ -28,6 +30,10 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
 
 	int num_samples = kernel_data.integrator.aa_samples;
 
+	if(sample == kernel_data.integrator.start_sample) {
+		*rng_state = hash_int_2d(x, y);
+	}
+
 	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
 
 	/* sample camera ray */
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index 250b8e9..fea503d 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -46,23 +46,26 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
 
-			if(kernel_data.integrator.pdf_triangles != 0.0f)
-				num_samples_inv *= 0.5f;
-
 			for(int j = 0; j < num_samples; j++) {
 				float light_u, light_v;
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
 
 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);
-
-				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+					/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
+					 * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
+					if(kernel_data.integrator.pdf_triangles != 0.0f)
+						ls.pdf *= 2.0f;
+
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
 					}
 				}
 			}
@@ -73,28 +76,30 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 			int num_samples = ceil_to_int(num_samples_adjust*kernel_data.integrator.mesh_light_samples);
 			float num_samples_inv = num_samples_adjust/num_samples;
 
-			if(kernel_data.integrator.num_all_lights)
-				num_samples_inv *= 0.5f;
-
 			for(int j = 0; j < num_samples; j++) {
 				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
 				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
-
-				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+				if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
+					if(kernel_data.integrator.num_all_lights)
+						ls.pdf *= 2.0f;
+
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
 					}
 				}
 			}
@@ -105,18 +110,19 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+		float terminate = path_state_rng_light_termination(kg, rng, state);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
-
-		/* sample random light */
-		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-			/* trace shadow ray */
-			float3 shadow;
-
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-				/* accumulate */
-				path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+			/* sample random light */
+			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				/* trace shadow ray */
+				float3 shadow;
+
+				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+					/* accumulate */
+					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
+				}
 			}
 		}
 	}
@@ -206,15 +212,16 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 #endif
 
 	LightSample ls;
-	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
-
-	if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-		/* trace shadow ray */
-		float3 shadow;
+	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+		float terminate = path_state_rng_light_termination(kg, rng, state);
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+			/* trace shadow ray */
+			float3 shadow;
 
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-			/* accumulate */
-			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				/* accumulate */
+				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			}
 		}
 	}
 #endif
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 5fd4f2f..3d3b738 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -46,17 +46,17 @@ ccl_device_inline void kernel_path_volume_connect_light(
 	light_ray.time = sd->time;
 #  endif
 
-	light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
-	if(ls.pdf == 0.0f)
-		return;
-	
-	if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-		/* trace shadow ray */
-		float3 shadow;
+	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
+	{
+		float terminate = path_state_rng_light_termination(kg, rng, state);
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+			/* trace shadow ray */
+			float3 shadow;
 
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-			/* accumulate */
-			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				/* accumulate */
+				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+			}
 		}
 	}
 #endif
@@ -137,16 +137,13 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
 			RNG lamp_rng = cmj_hash(*rng, i);
 
-			if(kernel_data.integrator.pdf_triangles != 0.0f)
-				num_samples_inv *= 0.5f;
-
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on given light */
 				float light_u, light_v;
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
-				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);		
+				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
 
 				float3 tp = throughput;
 
@@ -156,23 +153,24 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
-					
+
 				(void)result;
 				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls);
-
-				if(ls.pdf == 0.0f)
-					continue;
-
-				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
+					if(kernel_data.integrator.pdf_triangles != 0.0f)
+						ls.pdf *= 2.0f;
+
+					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
 					}
 				}
 			}
@@ -183,9 +181,6 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 			int num_samples = kernel_data.integrator.mesh_light_samples;
 			float num_samples_inv = 1.0f/num_samples;
 
-			if(kernel_data.integrator.num_all_lights)
-				num_samples_inv *= 0.5f;
-
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on random triangle */
 				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
@@ -212,18 +207,19 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
-
-				if(ls.pdf == 0.0f)
-					continue;
-
-				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-					/* trace shadow ray */
-					float3 shadow;
-
-					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-						/* accumulate */
-						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+					if(kernel_data.integrator.num_all_lights)
+						ls.pdf *= 2.0f;
+
+					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+						/* trace shadow ray */
+						float3 shadow;
+
+						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+							/* accumulate */
+							path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
+						}
 					}
 				}
 			}
@@ -251,19 +247,17 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		kernel_assert(result == VOLUME_PATH_SCATTERED);
 
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls);
-
-		if(ls.pdf == 0.0f)
-			return;
-
-		/* sample random light */
-		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
-			/* trace shadow ray */
-			float3 shadow;
-
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
-				/* accumulate */
-				path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+			/* sample random light */
+			float terminate = path_state_rng_light_termination(kg, rng, state);
+			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				/* trace shadow ray */
+				float3 shadow;
+
+				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+					/* accumulate */
+					path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
+				}
 			}
 		}
 	}
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index ba714b6..9a2b088 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -235,7 +235,7 @@ ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P,
 	if(kernel_data.cam.pole_merge_angle_to > 0.0f) {
 		const float pole_merge_angle_from = kernel_data.cam.pole_merge_angle_from,
 		            pole_merge_angle_to = kernel_data.cam.pole_merge_angle_to;
-		float altitude = fabsf(safe_asinf(D->z));
+		float altitude = fabsf(safe_asinf((*D).z));
 		if(altitude > pole_merge_angle_to) {
 			interocular_offset = 0.0f;
 		}
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 4a76ffd..e773753 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -120,13 +120,11 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *
 	/* Cranly-Patterson rotation using rng seed */
 	float shift;
 
-	/* using the same *rng value to offset seems to give correlation issues,
-	 * we could hash it with the dimension but this has a performance impact,
-	 * we need to find a solution for this */
-	if(dimension & 1)
-		shift = (*rng >> 16) * (1.0f/(float)0xFFFF);
-	else
-		shift = (*rng & 0xFFFF) * (1.0f/(float)0xFFFF);
+	/* Hash rng with dimension to solve correlation issues.
+	 * See T38710, T50116.
+	 */
+	RNG tmp_rng = cmj_hash_simple(dimension, *rng);
+	shift = tmp_rng * (1.0f/(float)0xFFFFFFFF);
 
 	return r + shift - floorf(r + shift);
 #endif
@@ -300,6 +298,23 @@ ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RN
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
 
+/* Utility functions to get light termination value, since it might not be needed in many cases. */
+ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state)
+{
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {  /* a random number is only requested when the threshold is positive */
+		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+	}
+	return 0.0f;  /* threshold is not positive: skip the RNG call and return a constant */
+}
+
+ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+{
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {  /* same guard as the non-branched variant */
+		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);  /* branch/num_branches are forwarded unchanged */
+	}
+	return 0.0f;  /* threshold is not positive: skip the RNG call and return a constant */
+}
+
 ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
 {
 	/* path is splitting into a branch, adjust so that each branch
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index daf8c20..c1b3153 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -111,7 +111,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 
 	ccl_fetch(sd, I) = -ray->D;
 
-	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
@@ -195,7 +195,7 @@ void shader_setup_from_subsurface(
 		motion_triangle_shader_setup(kg, sd, isect, ray, true);
 	}
 
-	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 
 #  ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
@@ -242,7 +242,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 int shader, int object, int prim,
                                                 float u, float v, float t,
                                                 float time,
-                                                bool object_space)
+                                                bool object_space,
+                                                int lamp)
 {
 	/* vectors */
 	ccl_fetch(sd, P) = P;
@@ -250,7 +251,12 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 	ccl_fetch(sd, Ng) = Ng;
 	ccl_fetch(sd, I) = I;
 	ccl_fetch(sd, shader) = shader;
-	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	if(prim != PRIM_NONE)
+		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+	else if(lamp != LAMP_NONE)
+		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+	else
+		ccl_fetch(sd, type) = PRIMITIVE_NONE;
 
 	/* primitive */
 #ifdef __INSTANCING__
@@ -264,18 +270,19 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 #endif
 	ccl_fetch(sd, ray_length) = t;
 
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
 	if(ccl_fetch(sd, object) != OBJECT_NONE) {
 		ccl_fetch(sd, flag) |= kernel_tex_fetch(__object_flag, ccl_fetch(sd, object));
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
+		ccl_fetch(sd, time) = time;
 	}
-
-	ccl_fetch(sd, time) = time;
-#else
-	}
+	else if(lamp != LAMP_NONE) {
+		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
+		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
 #endif
+	}
 
 	/* transform into world space */
 	if(object_space) {
@@ -357,7 +364,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	                         P, Ng, I,
 	                         shader, object, prim,
 	                         u, v, 0.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED));
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         LAMP_NONE);
 }
 
 /* ShaderData setup from ray into background */
@@ -370,7 +378,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 	ccl_fetch(sd, Ng) = -ray->D;
 	ccl_fetch(sd, I) = -ray->D;
 	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2);
+	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
 #ifdef __OBJECT_MOTION__
 	ccl_fetch(sd, time) = ray->time;
 #endif
@@ -561,7 +569,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
 		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, make_float3(weight, weight, weight));
+			bsdf_eval_mul(eval, weight);
 		}
 	}
 }
@@ -1027,7 +1035,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 		sd->shader = stack[i].shader;
 
 		sd->flag &= ~(SD_SHADER_FLAGS|SD_OBJECT_FLAGS);
-		sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2);
+		sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 
 		if(sd->object != OBJECT_NONE) {
 			sd->flag |= kernel_tex_fetch(__object_flag, sd->object);
@@ -1100,7 +1108,7 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 		shader = __float_as_int(str.z);
 	}
 #endif
-	int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*2);
+	int flag = kernel_tex_fetch(__shader_flag, (shader & SHADER_MASK)*SHADER_SIZE);
 
 	return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
 }
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 169f03f..2981f6a 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -125,14 +125,14 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 
 				/* stop if all light is blocked */
 				if(is_zero(throughput)) {
-					/* free dynamic storage */
 					return true;
 				}
 
 				/* move ray forward */
 				ray->P = shadow_sd->P;
-				if(ray->t != FLT_MAX)
+				if(ray->t != FLT_MAX) {
 					ray->D = normalize_len(Pend - ray->P, &ray->t);
+				}
 
 #ifdef __VOLUME__
 				/* exit/enter volume */
@@ -234,8 +234,9 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 					return false;
 				}
 
-				if(!shader_transparent_shadow(kg, isect))
+				if(!shader_transparent_shadow(kg, isect)) {
 					return true;
+				}
 
 #ifdef __VOLUME__
 				/* attenuation between last surface and next surface */
@@ -258,13 +259,16 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 					throughput *= shader_bsdf_transparency(kg, shadow_sd);
 				}
 
-				if(is_zero(throughput))
+				/* stop if all light is blocked */
+				if(is_zero(throughput)) {
 					return true;
+				}
 
 				/* move ray forward */
 				ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
-				if(ray->t != FLT_MAX)
+				if(ray->t != FLT_MAX) {
 					ray->D = normalize_len(Pend - ray->P, &ray->t);
+				}
 
 #ifdef __VOLUME__
 				/* exit/enter volume */
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 41bc9db..52c05b8 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -220,7 +220,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
  */
 ccl_device_inline int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
-        SubsurfaceIntersection* ss_isect,
+        SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
         ShaderClosure *sc,
         uint *lcg_state,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 1156b41..358db9e 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -37,11 +37,12 @@ CCL_NAMESPACE_BEGIN
 /* constants */
 #define OBJECT_SIZE 		12
 #define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE			5
+#define LIGHT_SIZE		11
 #define FILTER_TABLE_SIZE	1024
 #define RAMP_TABLE_SIZE		256
 #define SHUTTER_TABLE_SIZE		256
 #define PARTICLE_SIZE 		5
+#define SHADER_SIZE		5
 
 #define BSSRDF_MIN_RADIUS			1e-8f
 #define BSSRDF_MAX_HITS				4
@@ -191,6 +192,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_PATCH_EVAL__
 #  undef __PATCH_EVAL__
 #endif
+#ifdef __NO_TRANSPARENT__
+#  undef __TRANSPARENT_SHADOWS__
+#endif
 
 /* Random Numbers */
 
@@ -249,7 +253,7 @@ enum PathTraceDimension {
 	PRNG_LIGHT = 3,
 	PRNG_LIGHT_U = 4,
 	PRNG_LIGHT_V = 5,
-	PRNG_UNUSED_3 = 6,
+	PRNG_LIGHT_TERMINATE = 6,
 	PRNG_TERMINATE = 7,
 
 #ifdef __VOLUME__
@@ -341,9 +345,10 @@ typedef enum PassType {
 	PASS_SUBSURFACE_COLOR = (1 << 24),
 	PASS_LIGHT = (1 << 25), /* no real pass, used to force use_light_pass */
 #ifdef __KERNEL_DEBUG__
-	PASS_BVH_TRAVERSAL_STEPS = (1 << 26),
+	PASS_BVH_TRAVERSED_NODES = (1 << 26),
 	PASS_BVH_TRAVERSED_INSTANCES = (1 << 27),
-	PASS_RAY_BOUNCES = (1 << 28),
+	PASS_BVH_INTERSECTIONS = (1 << 28),
+	PASS_RAY_BOUNCES = (1 << 29),
 #endif
 } PassType;
 
@@ -538,33 +543,38 @@ typedef ccl_addr_space struct Intersection {
 	int type;
 
 #ifdef __KERNEL_DEBUG__
-	int num_traversal_steps;
+	int num_traversed_nodes;
 	int num_traversed_instances;
+	int num_intersections;
 #endif
 } Intersection;
 
 /* Primitives */
 
 typedef enum PrimitiveType {
-	PRIMITIVE_NONE = 0,
-	PRIMITIVE_TRIANGLE = 1,
-	PRIMITIVE_MOTION_TRIANGLE = 2,
-	PRIMITIVE_CURVE = 4,
-	PRIMITIVE_MOTION_CURVE = 8,
+	PRIMITIVE_NONE            = 0,
+	PRIMITIVE_TRIANGLE        = (1 << 0),
+	PRIMITIVE_MOTION_TRIANGLE = (1 << 1),
+	PRIMITIVE_CURVE           = (1 << 2),
+	PRIMITIVE_MOTION_CURVE    = (1 << 3),
+	/* Lamp primitive is not included below on purpose,
+	 * since it is no real traceable primitive.
+	 */
+	PRIMITIVE_LAMP            = (1 << 4),
 
 	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
 	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
 	PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE|PRIMITIVE_MOTION_CURVE),
 	PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE|PRIMITIVE_ALL_CURVE),
 
-	/* Total number of different primitives.
+	/* Total number of different traceable primitives.
 	 * NOTE: This is an actual value, not a bitflag.
 	 */
 	PRIMITIVE_NUM_TOTAL = 4,
 } PrimitiveType;
 
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << 16) | type)
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> 16)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) (((segment) << PRIMITIVE_NUM_TOTAL) | (type))  /* segment goes above the low PRIMITIVE_NUM_TOTAL bits; args parenthesized so expression arguments bind correctly */
+#define PRIMITIVE_UNPACK_SEGMENT(type) ((type) >> PRIMITIVE_NUM_TOTAL)  /* inverse shift of PRIMITIVE_PACK_SEGMENT; arg parenthesized for the same reason */
 
 /* Attributes */
 
@@ -714,20 +724,21 @@ enum ShaderDataFlag {
 	SD_VOLUME_CUBIC           = (1 << 20),  /* use cubic interpolation for voxels */
 	SD_HAS_BUMP               = (1 << 21),  /* has data connected to the displacement input */
 	SD_HAS_DISPLACEMENT       = (1 << 22),  /* has true displacement */
+	SD_HAS_CONSTANT_EMISSION  = (1 << 23),  /* has constant emission (value stored in __shader_flag) */
 
 	SD_SHADER_FLAGS = (SD_USE_MIS|SD_HAS_TRANSPARENT_SHADOW|SD_HAS_VOLUME|
 	                   SD_HAS_ONLY_VOLUME|SD_HETEROGENEOUS_VOLUME|
 	                   SD_HAS_BSSRDF_BUMP|SD_VOLUME_EQUIANGULAR|SD_VOLUME_MIS|
-	                   SD_VOLUME_CUBIC|SD_HAS_BUMP|SD_HAS_DISPLACEMENT),
+	                   SD_VOLUME_CUBIC|SD_HAS_BUMP|SD_HAS_DISPLACEMENT|SD_HAS_CONSTANT_EMISSION),
 
 	/* object flags */
-	SD_HOLDOUT_MASK             = (1 << 23),  /* holdout for camera rays */
-	SD_OBJECT_MOTION            = (1 << 24),  /* has object motion blur */
-	SD_TRANSFORM_APPLIED        = (1 << 25),  /* vertices have transform applied */
-	SD_NEGATIVE_SCALE_APPLIED   = (1 << 26),  /* vertices have negative scale applied */
-	SD_OBJECT_HAS_VOLUME        = (1 << 27),  /* object has a volume shader */
-	SD_OBJECT_INTERSECTS_VOLUME = (1 << 28),  /* object intersects AABB of an object with volume shader */
-	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 29),  /* has position for motion vertices */
+	SD_HOLDOUT_MASK             = (1 << 24),  /* holdout for camera rays */
+	SD_OBJECT_MOTION            = (1 << 25),  /* has object motion blur */
+	SD_TRANSFORM_APPLIED        = (1 << 26),  /* vertices have transform applied */
+	SD_NEGATIVE_SCALE_APPLIED   = (1 << 27),  /* vertices have negative scale applied */
+	SD_OBJECT_HAS_VOLUME        = (1 << 28),  /* object has a volume shader */
+	SD_OBJECT_INTERSECTS_VOLUME = (1 << 29),  /* object intersects AABB of an object with volume shader */
+	SD_OBJECT_HAS_VERTEX_MOTION = (1 << 30),  /* has position for motion vertices */
 
 	SD_OBJECT_FLAGS = (SD_HOLDOUT_MASK|SD_OBJECT_MOTION|SD_TRANSFORM_APPLIED|
 	                   SD_NEGATIVE_SCALE_APPLIED|SD_OBJECT_HAS_VOLUME|
@@ -827,7 +838,7 @@ typedef ccl_addr_space struct ShaderData {
 	ccl_soa_member(differential3, ray_dP);
 
 #ifdef __OSL__
-	struct KernelGlobals * osl_globals;
+	struct KernelGlobals *osl_globals;
 	struct PathState *osl_path_state;
 #endif
 } ShaderData;
@@ -1031,10 +1042,10 @@ typedef struct KernelFilm {
 	float mist_falloff;
 
 #ifdef __KERNEL_DEBUG__
-	int pass_bvh_traversal_steps;
+	int pass_bvh_traversed_nodes;
 	int pass_bvh_traversed_instances;
+	int pass_bvh_intersections;
 	int pass_ray_bounces;
-	int pass_pad3;
 #endif
 } KernelFilm;
 static_assert_align(KernelFilm, 16);
@@ -1119,8 +1130,9 @@ typedef struct KernelIntegrator {
 	float volume_step_size;
 	int volume_samples;
 
-	int pad1;
-	int pad2;
+	float light_inv_rr_threshold;
+
+	int start_sample;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1178,10 +1190,9 @@ static_assert_align(KernelData, 16);
  * really important here.
  */
 typedef ccl_addr_space struct DebugData {
-	// Total number of BVH node traversal steps and primitives intersections
-	// for the camera rays.
-	int num_bvh_traversal_steps;
+	int num_bvh_traversed_nodes;
 	int num_bvh_traversed_instances;
+	int num_bvh_intersections;
 	int num_ray_bounces;
 } DebugData;
 #endif
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 4ab51b8..c7cb29b 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -115,7 +115,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
 {
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
-		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
+		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
 
 		if(shader_flag & SD_HETEROGENEOUS_VOLUME)
 			return true;
@@ -132,7 +132,7 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 	int method = -1;
 
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
-		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*2);
+		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
 
 		if(shader_flag & SD_VOLUME_MIS) {
 			return SD_VOLUME_MIS;
@@ -245,11 +245,18 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa
 	float t = ray->t;
 
 	float delta = dot((light_P - ray->P) , ray->D);
-	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	if(UNLIKELY(D == 0.0f)) {
+		*pdf = 0.0f;
+		return 0.0f;
+	}
 	float theta_a = -atan2f(delta, D);
 	float theta_b = atan2f(t - delta, D);
 	float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
-
+	if(UNLIKELY(theta_b == theta_a)) {
+		*pdf = 0.0f;
+		return 0.0f;
+	}
 	*pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
 
 	return min(t, delta + t_); /* min is only for float precision errors */
@@ -258,13 +265,19 @@ ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, floa
 ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
 {
 	float delta = dot((light_P - ray->P) , ray->D);
-	float D = sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+	if(UNLIKELY(D == 0.0f)) {
+		return 0.0f;
+	}
 
 	float t = ray->t;
 	float t_ = sample_t - delta;
 
 	float theta_a = -atan2f(delta, D);
 	float theta_b = atan2f(t - delta, D);
+	if(UNLIKELY(theta_b == theta_a)) {
+		return 0.0f;
+	}
 
 	float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
 
@@ -569,17 +582,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
 	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
 {
-	/* workaround to fix correlation bug in T38710, can find better solution
-	 * in random number generator later, for now this is done here to not impact
-	 * performance of rendering without volumes */
-	RNG tmp_rng = cmj_hash(*rng, state->rng_offset);
-
 	shader_setup_from_volume(kg, sd, ray);
 
 	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, &tmp_rng);
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, &tmp_rng, true);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
 }
 
 /* Decoupled Volume Sampling
@@ -958,6 +966,9 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
+	if(sample_t < 1e-6f) {
+		return VOLUME_PATH_SCATTERED;
+	}
 
 	/* compute transmittance up to this step */
 	if(step != segment->steps)
@@ -1251,4 +1262,30 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
 }
 #endif
 
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so at the time when ray
+ * hits nothing (for example, it is a last bounce which goes to environment) the
+ * only expected volume in the stack is the world's one. All other volume
+ * entries should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which
+ * could lead us to an infinite non-world volume in the stack, causing render
+ * artifacts.
+ *
+ * Use this function after the last bounce to drop every volume entry except
+ * the world's one, terminating the stack with SHADER_NONE.
+ */
+ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
+                                                 VolumeStack *volume_stack)
+{
+	if(kernel_data.background.volume_shader != SHADER_NONE) {
+		/* Keep the world's volume (first entry) and end the stack right after it. */
+		volume_stack[1].shader = SHADER_NONE;
+	}
+	else {
+		volume_stack[0].shader = SHADER_NONE;
+	}
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index f11c85d..72dbbd9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -42,9 +42,11 @@
 #    define __KERNEL_SSE41__
 #  endif
 #  ifdef __AVX__
+#    define __KERNEL_SSE__
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
+#    define __KERNEL_SSE__
 #    define __KERNEL_AVX2__
 #  endif
 #endif
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 533ab46..1350d9e 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -20,6 +20,7 @@
  
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #  define __KERNEL_SSE3__
 #  define __KERNEL_SSSE3__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 7351e2b..1a416e7 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -20,6 +20,7 @@
 
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #  define __KERNEL_SSE3__
 #  define __KERNEL_SSSE3__
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 37907cd..a68f978 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -20,6 +20,7 @@
 #include "../../kernel_math.h"
 #include "../../kernel_types.h"
 #include "../../kernel_globals.h"
+#include "../../kernel_image_opencl.h"
 
 #include "../../kernel_film.h"
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 0f3edcb..eeccf9a 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -102,6 +102,8 @@ ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 #endif
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
+ustring OSLRenderServices::u_path_diffuse_depth("path:diffuse_depth");
+ustring OSLRenderServices::u_path_glossy_depth("path:glossy_depth");
 ustring OSLRenderServices::u_path_transparent_depth("path:transparent_depth");
 ustring OSLRenderServices::u_path_transmission_depth("path:transmission_depth");
 ustring OSLRenderServices::u_trace("trace");
@@ -168,6 +170,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 
 			return true;
 		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_tfm);
+			COPY_MATRIX44(&result, &tfm);
+
+			return true;
+		}
 	}
 
 	return false;
@@ -198,6 +206,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 
 			return true;
 		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_itfm);
+			COPY_MATRIX44(&result, &tfm);
+
+			return true;
+		}
 	}
 
 	return false;
@@ -287,6 +301,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 
 			return true;
 		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_tfm);
+			COPY_MATRIX44(&result, &tfm);
+
+			return true;
+		}
 	}
 
 	return false;
@@ -312,6 +332,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 
 			return true;
 		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_itfm);
+			COPY_MATRIX44(&result, &tfm);
+
+			return true;
+		}
 	}
 
 	return false;
@@ -735,6 +761,24 @@ bool OSLRenderServices::get_background_attribute(KernelGlobals *kg, ShaderData *
 		int f = state->bounce;
 		return set_attribute_int(f, type, derivatives, val);
 	}
+	else if(name == u_path_diffuse_depth) {
+		/* Diffuse Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->diffuse_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
+	else if(name == u_path_glossy_depth) {
+		/* Glossy Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->glossy_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
+	else if(name == u_path_transmission_depth) {
+		/* Transmission Ray Depth */
+		PathState *state = sd->osl_path_state;
+		int f = state->transmission_bounce;
+		return set_attribute_int(f, type, derivatives, val);
+	}
 	else if(name == u_path_transparent_depth) {
 		/* Transparent Ray Depth */
 		PathState *state = sd->osl_path_state;
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 0f2e02c..ec34ca7 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -165,6 +165,8 @@ public:
 	static ustring u_curve_tangent_normal;
 	static ustring u_path_ray_length;
 	static ustring u_path_ray_depth;
+	static ustring u_path_diffuse_depth;
+	static ustring u_path_glossy_depth;
 	static ustring u_path_transparent_depth;
 	static ustring u_path_transmission_depth;
 	static ustring u_trace;
diff --git a/intern/cycles/kernel/shaders/node_brick_texture.osl b/intern/cycles/kernel/shaders/node_brick_texture.osl
index d5e0a7d..c303594 100644
--- a/intern/cycles/kernel/shaders/node_brick_texture.osl
+++ b/intern/cycles/kernel/shaders/node_brick_texture.osl
@@ -28,7 +28,7 @@ float brick_noise(int n) /* fast integer noise */
 	return 0.5 * ((float)nn / 1073741824.0);
 }
 
-float brick(point p, float mortar_size, float bias,
+float brick(point p, float mortar_size, float mortar_smooth, float bias,
 	float BrickWidth, float row_height, float offset_amount, int offset_frequency,
 	float squash_amount, int squash_frequency, float tint)
 {
@@ -51,9 +51,17 @@ float brick(point p, float mortar_size, float bias,
 
 	tint = clamp((brick_noise((rownum << 16) + (bricknum & 65535)) + bias), 0.0, 1.0);
 
-	return (x < mortar_size || y < mortar_size ||
-	        x > (brick_width - mortar_size) ||
-	        y > (row_height - mortar_size)) ? 1.0 : 0.0;
+	float min_dist = min(min(x, y), min(brick_width - x, row_height - y));
+	if(min_dist >= mortar_size) {
+		return 0.0;
+	}
+	else if(mortar_smooth == 0.0) {
+		return 1.0;
+	}
+	else {
+		min_dist = 1.0 - min_dist/mortar_size;
+		return smoothstep(0.0, mortar_smooth, min_dist);
+	}
 }
 
 shader node_brick_texture(
@@ -69,6 +77,7 @@ shader node_brick_texture(
 	color Mortar = 0.0,
 	float Scale = 5.0,
 	float MortarSize = 0.02,
+	float MortarSmooth = 0.0,
 	float Bias = 0.0,
 	float BrickWidth = 0.5,
 	float RowHeight = 0.25,
@@ -83,7 +92,7 @@ shader node_brick_texture(
 	float tint = 0.0;
 	color Col = Color1;
 	
-	Fac = brick(p * Scale, MortarSize, Bias, BrickWidth, RowHeight,
+	Fac = brick(p * Scale, MortarSize, MortarSmooth, Bias, BrickWidth, RowHeight,
 		offset, offset_frequency, squash, squash_frequency, tint);
 		
 	if (Fac != 1.0) {
@@ -91,6 +100,6 @@ shader node_brick_texture(
 		Col = facm * Color1 + tint * Color2;
 	}
 	
-	Color = (Fac == 1.0) ? Mortar : Col;
+	Color = mix(Col, Mortar, Fac);
 }
 
diff --git a/intern/cycles/kernel/shaders/node_light_path.osl b/intern/cycles/kernel/shaders/node_light_path.osl
index a021a40..64fe4c2 100644
--- a/intern/cycles/kernel/shaders/node_light_path.osl
+++ b/intern/cycles/kernel/shaders/node_light_path.osl
@@ -27,6 +27,8 @@ shader node_light_path(
 	output float IsVolumeScatterRay = 0.0,
 	output float RayLength = 0.0,
 	output float RayDepth = 0.0,
+	output float DiffuseDepth = 0.0,
+	output float GlossyDepth = 0.0,
 	output float TransparentDepth = 0.0,
 	output float TransmissionDepth = 0.0)
 {
@@ -45,6 +47,14 @@ shader node_light_path(
 	getattribute("path:ray_depth", ray_depth);
 	RayDepth = (float)ray_depth;
 
+	int diffuse_depth;
+	getattribute("path:diffuse_depth", diffuse_depth);
+	DiffuseDepth = (float)diffuse_depth;
+
+	int glossy_depth;
+	getattribute("path:glossy_depth", glossy_depth);
+	GlossyDepth = (float)glossy_depth;
+
 	int transparent_depth;
 	getattribute("path:transparent_depth", transparent_depth);
 	TransparentDepth = (float)transparent_depth;
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
index f42d0a9..9bfa71c 100644
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -232,7 +232,8 @@ ccl_device char kernel_background_buffer_update(
 #endif
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 				enqueue_flag = 1;
-			} else {
+			}
+			else {
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* Accumulate result in output buffer. */
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index e3dbc43..6e158d5 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -215,7 +215,8 @@ ccl_device void kernel_data_init(
 #ifdef __KERNEL_DEBUG__
 			debug_data_init(&debugdata_coop[ray_index]);
 #endif
-		} else {
+		}
+		else {
 			/* These rays do not participate in path-iteration. */
 			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			/* Accumulate result in output buffer. */
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index ebe9109..82ca188 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -72,32 +72,34 @@ ccl_device char kernel_direct_lighting(
 			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 			float light_u, light_v;
 			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, rng, state);
 
 			LightSample ls;
-			light_sample(kg,
-			             light_t, light_u, light_v,
-			             ccl_fetch(sd, time),
-			             ccl_fetch(sd, P),
-			             state->bounce,
-			             &ls);
+			if(light_sample(kg,
+			                light_t, light_u, light_v,
+			                ccl_fetch(sd, time),
+			                ccl_fetch(sd, P),
+			                state->bounce,
+			                &ls)) {
 
-			Ray light_ray;
+				Ray light_ray;
 #ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
+				light_ray.time = ccl_fetch(sd, time);
 #endif
 
-			BsdfEval L_light;
-			bool is_lamp;
-			if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp)) {
-				/* Write intermediate data to global memory to access from
-				 * the next kernel.
-				 */
-				LightRay_coop[ray_index] = light_ray;
-				BSDFEval_coop[ray_index] = L_light;
-				ISLamp_coop[ray_index] = is_lamp;
-				/* Mark ray state for next shadow kernel. */
-				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
-				enqueue_flag = 1;
+				BsdfEval L_light;
+				bool is_lamp;
+				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+					/* Write intermediate data to global memory to access from
+					 * the next kernel.
+					 */
+					LightRay_coop[ray_index] = light_ray;
+					BSDFEval_coop[ray_index] = L_light;
+					ISLamp_coop[ray_index] = is_lamp;
+					/* Mark ray state for next shadow kernel. */
+					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+					enqueue_flag = 1;
+				}
 			}
 		}
 #endif  /* __EMISSION__ */
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 78dada8..435d117 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -212,7 +212,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 				if(terminate >= probability) {
 					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 					*enqueue_flag = 1;
-				} else {
+				}
+				else {
 					throughput_coop[ray_index] = throughput/probability;
 				}
 			}
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 74da80b..816f3a6 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -126,7 +126,7 @@ ccl_device char kernel_next_iteration_setup(
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		ccl_global float3 *throughput = &throughput_coop[ray_index];
 		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global RNG* rng = &rng_coop[ray_index];
+		ccl_global RNG *rng = &rng_coop[ray_index];
 		state = &PathState_coop[ray_index];
 		L = &PathRadiance_coop[ray_index];
 
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index fc4b4ee..2388580 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -116,8 +116,9 @@ ccl_device void kernel_scene_intersect(
 
 #ifdef __KERNEL_DEBUG__
 	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversal_steps += isect->num_traversal_steps;
+		debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes;
 		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
+		debug_data->num_bvh_intersections += isect->num_intersections;
 	}
 	debug_data->num_ray_bounces++;
 #endif
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 88d6dab..2135ee2 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -21,6 +21,7 @@
 #include "kernel_math.h"
 #include "kernel_types.h"
 #include "kernel_globals.h"
+#include "kernel_image_opencl.h"
 
 #include "util_atomic.h"
 
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 9b0cf5a..14245cf 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -27,7 +27,7 @@ ccl_device_noinline float brick_noise(int n) /* fast integer noise */
 	return 0.5f * ((float)nn / 1073741824.0f);
 }
 
-ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
+ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float mortar_smooth, float bias,
 	float brick_width, float row_height, float offset_amount, int offset_frequency,
 	float squash_amount, int squash_frequency)
 {
@@ -47,30 +47,41 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
 	x = (p.x+offset) - brick_width*bricknum;
 	y = p.y - row_height*rownum;
 
-	return make_float2(
-		saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)),
+	float tint = saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias));
+	float min_dist = min(min(x, y), min(brick_width - x, row_height - y));
 
-		(x < mortar_size || y < mortar_size ||
-		x > (brick_width - mortar_size) ||
-		y > (row_height - mortar_size)) ? 1.0f : 0.0f);
+	float mortar;
+	if(min_dist >= mortar_size) {
+		mortar = 0.0f;
+	}
+	else if(mortar_smooth == 0.0f) {
+		mortar = 1.0f;
+	}
+	else {
+		min_dist = 1.0f - min_dist/mortar_size;
+		mortar = (min_dist < mortar_smooth)? smoothstepf(min_dist / mortar_smooth) : 1.0f;
+	}
+
+	return make_float2(tint, mortar);
 }
 
 ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {	
 	uint4 node2 = read_node(kg, offset);
 	uint4 node3 = read_node(kg, offset);
+	uint4 node4 = read_node(kg, offset);
 	
 	/* Input and Output Sockets */
 	uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
 	uint mortar_size_offset, bias_offset, brick_width_offset, row_height_offset;
-	uint color_offset, fac_offset;
+	uint color_offset, fac_offset, mortar_smooth_offset;
 	
 	/* RNA properties */
 	uint offset_frequency, squash_frequency;
 	
 	decode_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &mortar_offset);
 	decode_node_uchar4(node.z, &scale_offset, &mortar_size_offset, &bias_offset, &brick_width_offset);
-	decode_node_uchar4(node.w, &row_height_offset, &color_offset, &fac_offset, NULL);
+	decode_node_uchar4(node.w, &row_height_offset, &color_offset, &fac_offset, &mortar_smooth_offset);
 	
 	decode_node_uchar4(node2.x, &offset_frequency, &squash_frequency, NULL, NULL);
 
@@ -82,13 +93,14 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	
 	float scale = stack_load_float_default(stack, scale_offset, node2.y);
 	float mortar_size = stack_load_float_default(stack, mortar_size_offset, node2.z);
+	float mortar_smooth = stack_load_float_default(stack, mortar_smooth_offset, node4.x);
 	float bias = stack_load_float_default(stack, bias_offset, node2.w);
 	float brick_width = stack_load_float_default(stack, brick_width_offset, node3.x);
 	float row_height = stack_load_float_default(stack, row_height_offset, node3.y);
 	float offset_amount = __int_as_float(node3.z);
 	float squash_amount = __int_as_float(node3.w);
 	
-	float2 f2 = svm_brick(co*scale, mortar_size, bias, brick_width, row_height,
+	float2 f2 = svm_brick(co*scale, mortar_size, mortar_smooth, bias, brick_width, row_height,
 		offset_amount, offset_frequency, squash_amount, squash_frequency);
 
 	float tint = f2.x;
@@ -100,7 +112,7 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	}
 
 	if(stack_valid(color_offset))
-		stack_store_float3(stack, color_offset, (f == 1.0f)? mortar: color1);
+		stack_store_float3(stack, color_offset, color1*(1.0f-f) + mortar*f);
 	if(stack_valid(fac_offset))
 		stack_store_float(stack, fac_offset, f);
 }
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 378ce65..2afdf61 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -29,147 +29,6 @@ CCL_NAMESPACE_BEGIN
 #  define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_OPENCL
 #endif
 
-#ifdef __KERNEL_OPENCL__
-
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
-
-ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
-{
-	/* Float4 */
-	if(id < TEX_START_BYTE4_OPENCL) {
-		return kernel_tex_fetch(__tex_image_float4_packed, offset);
-	}
-	/* Byte4 */
-	else if(id < TEX_START_FLOAT_OPENCL) {
-		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
-		float f = 1.0f/255.0f;
-		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
-	}
-	/* Float */
-	else if(id < TEX_START_BYTE_OPENCL) {
-		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
-		return make_float4(f, f, f, 1.0f);
-	}
-	/* Byte */
-	else {
-		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
-		float f = r * (1.0f/255.0f);
-		return make_float4(f, f, f, 1.0f);
-	}
-}
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
-	x %= width;
-	if(x < 0)
-		x += width;
-	return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
-	return clamp(x, 0, width-1);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
-	int i = float_to_int(x) - ((x < 0.0f)? 1: 0);
-	*ix = i;
-	return x - (float)i;
-}
-
-ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
-{
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-
-	/* Image Options */
-	uint interpolation = (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
-	uint extension;
-	if(info.w & (1 << 1))
-		extension = EXTENSION_REPEAT;
-	else if(info.w & (1 << 2))
-		extension = EXTENSION_EXTEND;
-	else
-		extension = EXTENSION_CLIP;
-
-	float4 r;
-	int ix, iy, nix, niy;
-	if(interpolation == INTERPOLATION_CLOSEST) {
-		svm_image_texture_frac(x*width, &ix);
-		svm_image_texture_frac(y*height, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-		}
-		else if(extension == EXTENSION_CLIP) {
-			if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f)
-				return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-		}
-		else { /* EXTENSION_EXTEND */
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = svm_image_texture_read(kg, id, offset + ix + iy*width);
-	}
-	else { /* INTERPOLATION_LINEAR */
-		float tx = svm_image_texture_frac(x*width - 0.5f, &ix);
-		float ty = svm_image_texture_frac(y*height - 0.5f, &iy);
-
-		if(extension == EXTENSION_REPEAT) {
-			ix = svm_image_texture_wrap_periodic(ix, width);
-			iy = svm_image_texture_wrap_periodic(iy, height);
-
-			nix = svm_image_texture_wrap_periodic(ix+1, width);
-			niy = svm_image_texture_wrap_periodic(iy+1, height);
-		}
-		else {
-			if(extension == EXTENSION_CLIP) {
-				if(x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-					return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				}
-			}
-			nix = svm_image_texture_wrap_clamp(ix+1, width);
-			niy = svm_image_texture_wrap_clamp(iy+1, height);
-			ix = svm_image_texture_wrap_clamp(ix, width);
-			iy = svm_image_texture_wrap_clamp(iy, height);
-		}
-
-		r = (1.0f - ty)*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + iy*width);
-		r += (1.0f - ty)*tx*svm_image_texture_read(kg, id, offset + nix + iy*width);
-		r += ty*(1.0f - tx)*svm_image_texture_read(kg, id, offset + ix + niy*width);
-		r += ty*tx*svm_image_texture_read(kg, id, offset + nix + niy*width);
-	}
-
-	if(use_alpha && r.w != 1.0f && r.w != 0.0f) {
-		float invw = 1.0f/r.w;
-		r.x *= invw;
-		r.y *= invw;
-		r.z *= invw;
-
-		if(id >= TEX_NUM_FLOAT4_IMAGES) {
-			r.x = min(r.x, 1.0f);
-			r.y = min(r.y, 1.0f);
-			r.z = min(r.z, 1.0f);
-		}
-	}
-
-	if(srgb) {
-		r.x = color_srgb_to_scene_linear(r.x);
-		r.y = color_srgb_to_scene_linear(r.y);
-		r.z = color_srgb_to_scene_linear(r.z);
-	}
-
-	return r;
-}
-
-#else
-
 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
 #ifdef __KERNEL_CPU__
@@ -180,6 +39,8 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 #  else
 	float4 r = kernel_tex_image_interp(id, x, y);
 #  endif
+#elif defined(__KERNEL_OPENCL__)
+	float4 r = kernel_tex_image_interp(kg, id, x, y);
 #else
 	float4 r;
 
@@ -339,8 +200,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 	return r;
 }
 
-#endif
-
 /* Remap coordnate from 0..1 box to -1..-1 */
 ccl_device_inline float3 texco_remap_square(float3 co)
 {
@@ -382,8 +241,7 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	float3 N = ccl_fetch(sd, N);
 
 	N = ccl_fetch(sd, N);
-	if(ccl_fetch(sd, object) != OBJECT_NONE)
-		object_inverse_normal_transform(kg, sd, &N);
+	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
 	N.x = fabsf(N.x);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index f35ea05..04f6f62 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -34,6 +34,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
 		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
 		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
 		case NODE_LP_ray_depth: info = (float)state->bounce; break;
+		case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
+		case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
 		case NODE_LP_ray_transparent: info = (float)state->transparent_bounce; break;
 		case NODE_LP_ray_transmission: info = (float)state->transmission_bounce; break;
 	}
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 6d13a0d..01547b6 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -164,6 +164,9 @@ ccl_device float3 svm_math_blackbody_color(float t) {
 
 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
 {
+	if(gamma == 0.0f)
+		return make_float3(1.0f, 1.0f, 1.0f);
+
 	if(color.x > 0.0f)
 		color.x = powf(color.x, gamma);
 	if(color.y > 0.0f)
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 6ea2539..c0b0126 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -49,8 +49,7 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				object_inverse_normal_transform(kg, sd, &data);
+			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
@@ -131,8 +130,7 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				object_inverse_normal_transform(kg, sd, &data);
+			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
@@ -216,8 +214,7 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				object_inverse_normal_transform(kg, sd, &data);
+			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 5adf7d3..47209dd 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -188,6 +188,8 @@ typedef enum NodeLightPath {
 	NODE_LP_backfacing,
 	NODE_LP_ray_length,
 	NODE_LP_ray_depth,
+	NODE_LP_ray_diffuse,
+	NODE_LP_ray_glossy,
 	NODE_LP_ray_transparent,
 	NODE_LP_ray_transmission,
 } NodeLightPath;
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index f54f4e8..a8b3604 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -43,7 +43,7 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 		co = transform_point(&tfm, co);
 	}
 	float4 r;
-#  if defined(__KERNEL_GPU__)
+#  if defined(__KERNEL_CUDA__)
 #    if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
 	if(id < 2048) /* TODO(dingto): Make this a variable */
@@ -55,9 +55,11 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 #    else /* __CUDA_ARCH__ >= 300 */
 	r = volume_image_texture_3d(id, co.x, co.y, co.z);
 #    endif
-#  else /* __KERNEL_GPU__ */
+#  elif defined(__KERNEL_OPENCL__)
+	r = kernel_tex_image_interp_3d(kg, id, co.x, co.y, co.z);
+#  else
 	r = kernel_tex_image_interp_3d(id, co.x, co.y, co.z);
-#  endif
+#  endif /* __KERNEL_CUDA__ */
 #else
 	float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 #endif
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 13310a6..d9a2970 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -135,20 +135,16 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 {
 	size_t num_pixels = bake_data->size();
 
-	progress.reset_sample();
-	this->num_parts = 0;
+	int num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1;
 
-	/* calculate the total parts for the progress bar */
+	/* calculate the total pixel samples for the progress bar */
+	total_pixel_samples = 0;
 	for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
 		size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
-
-		DeviceTask task(DeviceTask::SHADER);
-		task.shader_w = shader_size;
-
-		this->num_parts += device->get_split_task_count(task);
+		total_pixel_samples += shader_size * num_samples;
 	}
-
-	this->num_samples = is_aa_pass(shader_type)? scene->integrator->aa_samples : 1;
+	progress.reset_sample();
+	progress.set_total_pixel_samples(total_pixel_samples);
 
 	for(size_t shader_offset = 0; shader_offset < num_pixels; shader_offset += m_shader_limit) {
 		size_t shader_size = (size_t)fminf(num_pixels - shader_offset, m_shader_limit);
@@ -187,9 +183,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		task.shader_x = 0;
 		task.offset = shader_offset;
 		task.shader_w = d_output.size();
-		task.num_samples = this->num_samples;
+		task.num_samples = num_samples;
 		task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-		task.update_progress_sample = function_bind(&Progress::increment_sample_update, &progress);
+		task.update_progress_sample = function_bind(&Progress::add_samples_update, &progress, _1, _2);
 
 		device->task_add(task);
 		device->task_wait();
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 8377e38..25f5eb3 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -73,8 +73,7 @@ public:
 
 	bool need_update;
 
-	int num_samples;
-	int num_parts;
+	int total_pixel_samples;
 
 private:
 	BakeData *m_bake_data;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index 1e170d3..f169271 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -135,15 +135,7 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	/* allocate rng state */
 	rng_state.resize(params.width, params.height);
 
-	uint *init_state = rng_state.resize(params.width, params.height);
-	int x, y, width = params.width, height = params.height;
-	
-	for(y = 0; y < height; y++)
-		for(x = 0; x < width; x++)
-			init_state[y*width + x] = hash_int_2d(params.full_x+x, params.full_y+y);
-
 	device->mem_alloc(rng_state, MEM_READ_WRITE);
-	device->mem_copy_to(rng_state);
 }
 
 bool RenderBuffers::copy_from_device()
@@ -193,13 +185,11 @@ bool RenderBuffers::get_pass_rect(PassType type, float exposure, int sample, int
 				}
 			}
 #ifdef WITH_CYCLES_DEBUG
-			else if(type == PASS_BVH_TRAVERSAL_STEPS) {
-				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
-					float f = *in;
-					pixels[0] = f*scale;
-				}
-			}
-			else if(type == PASS_RAY_BOUNCES) {
+			else if(type == PASS_BVH_TRAVERSED_NODES ||
+			        type == PASS_BVH_TRAVERSED_INSTANCES ||
+			        type == PASS_BVH_INTERSECTIONS ||
+			        type == PASS_RAY_BOUNCES)
+			{
 				for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
 					float f = *in;
 					pixels[0] = f*scale;
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index 200a4c4..b7f2566 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -89,6 +89,19 @@ void ConstantFolder::make_zero() const
 	}
 }
 
+void ConstantFolder::make_one() const
+{
+	if(output->type() == SocketType::FLOAT) {  /* scalar output */
+		make_constant(1.0f);
+		return;
+	}
+	if(SocketType::is_float3(output->type())) {  /* three-component output */
+		make_constant(make_float3(1.0f, 1.0f, 1.0f));
+		return;
+	}
+	assert(0);  /* any other output type is unexpected here */
+}
+
 void ConstantFolder::bypass(ShaderOutput *new_output) const
 {
 	assert(new_output);
@@ -321,6 +334,16 @@ void ConstantFolder::fold_math(NodeMath type, bool clamp) const
 				make_zero();
 			}
 			break;
+		case NODE_MATH_POWER:
+			/* 1 ^ X == X ^ 0 == 1 */
+			if(is_one(value1_in) || is_zero(value2_in)) {
+				make_one();
+			}
+			/* X ^ 1 == X */
+			else if(is_one(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			break;
 		default:
 			break;
 	}
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 2b31c2a..7962698 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -43,6 +43,7 @@ public:
 	void make_constant_clamp(float value, bool clamp) const;
 	void make_constant_clamp(float3 value, bool clamp) const;
 	void make_zero() const;
+	void make_one() const;
 
 	/* Bypass node, relinking to another output socket. */
 	void bypass(ShaderOutput *output) const;
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index e10a938..923252b 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -154,14 +154,9 @@ void Pass::add(PassType type, array<Pass>& passes)
 			pass.components = 0;
 			break;
 #ifdef WITH_CYCLES_DEBUG
-		case PASS_BVH_TRAVERSAL_STEPS:
-			pass.components = 1;
-			pass.exposure = false;
-			break;
+		case PASS_BVH_TRAVERSED_NODES:
 		case PASS_BVH_TRAVERSED_INSTANCES:
-			pass.components = 1;
-			pass.exposure = false;
-			break;
+		case PASS_BVH_INTERSECTIONS:
 		case PASS_RAY_BOUNCES:
 			pass.components = 1;
 			pass.exposure = false;
@@ -421,12 +416,15 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 				break;
 
 #ifdef WITH_CYCLES_DEBUG
-			case PASS_BVH_TRAVERSAL_STEPS:
-				kfilm->pass_bvh_traversal_steps = kfilm->pass_stride;
+			case PASS_BVH_TRAVERSED_NODES:
+				kfilm->pass_bvh_traversed_nodes = kfilm->pass_stride;
 				break;
 			case PASS_BVH_TRAVERSED_INSTANCES:
 				kfilm->pass_bvh_traversed_instances = kfilm->pass_stride;
 				break;
+			case PASS_BVH_INTERSECTIONS:
+				kfilm->pass_bvh_intersections = kfilm->pass_stride;
+				break;
 			case PASS_RAY_BOUNCES:
 				kfilm->pass_ray_bounces = kfilm->pass_stride;
 				break;
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index 83d69e9..ab830b1 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -19,6 +19,7 @@
 #include "scene.h"
 
 #include "util_foreach.h"
+#include "util_logging.h"
 #include "util_path.h"
 #include "util_progress.h"
 #include "util_texture.h"
@@ -280,6 +281,8 @@ int ImageManager::add_image(const string& filename,
 
 	ImageDataType type = get_image_metadata(filename, builtin_data, is_linear);
 
+	thread_scoped_lock device_lock(device_mutex);
+
 	/* Do we have a float? */
 	if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
 		is_float = true;
@@ -469,133 +472,51 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 	return true;
 }
 
-template<typename T>
-bool ImageManager::file_load_byte_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
+template<TypeDesc::BASETYPE FileFormat,
+         typename StorageType,
+         typename DeviceType>
+bool ImageManager::file_load_image(Image *img,
+                                   ImageDataType type,
+                                   int texture_limit,
+                                   device_vector<DeviceType>& tex_img)
 {
+	const StorageType alpha_one = (FileFormat == TypeDesc::UINT8)? 255 : 1;
 	ImageInput *in = NULL;
 	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	uchar *pixels = (uchar*)tex_img.resize(width, height, depth);
-	if(pixels == NULL) {
+	if(!file_load_image_generic(img, &in, width, height, depth, components)) {
 		return false;
 	}
-	bool cmyk = false;
-
-	if(in) {
-		if(depth <= 1) {
-			int scanlinesize = width*components*sizeof(uchar);
-
-			in->read_image(TypeDesc::UINT8,
-			               (uchar*)pixels + (((size_t)height)-1)*scanlinesize,
-			               AutoStride,
-			               -scanlinesize,
-			               AutoStride);
-		}
-		else {
-			in->read_image(TypeDesc::UINT8, (uchar*)pixels);
-		}
-
-		cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
-
-		in->close();
-		delete in;
+	/* Read RGBA pixels. */
+	vector<StorageType> pixels_storage;
+	StorageType *pixels;
+	const size_t max_size = max(max(width, height), depth);
+	if(texture_limit > 0 && max_size > texture_limit) {
+		pixels_storage.resize(((size_t)width)*height*depth*4);
+		pixels = &pixels_storage[0];
 	}
 	else {
-		builtin_image_pixels_cb(img->filename, img->builtin_data, pixels);
-	}
-
-	/* Check if we actually have a byte4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_BYTE4) {
-		size_t num_pixels = ((size_t)width) * height * depth;
-		if(cmyk) {
-			/* CMYK */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
-				pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
-				pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
-				pixels[i*4+3] = 255;
-			}
-		}
-		else if(components == 2) {
-			/* grayscale + alpha */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = pixels[i*2+1];
-				pixels[i*4+2] = pixels[i*2+0];
-				pixels[i*4+1] = pixels[i*2+0];
-				pixels[i*4+0] = pixels[i*2+0];
-			}
-		}
-		else if(components == 3) {
-			/* RGB */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-				pixels[i*4+2] = pixels[i*3+2];
-				pixels[i*4+1] = pixels[i*3+1];
-				pixels[i*4+0] = pixels[i*3+0];
-			}
-		}
-		else if(components == 1) {
-			/* grayscale */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-				pixels[i*4+2] = pixels[i];
-				pixels[i*4+1] = pixels[i];
-				pixels[i*4+0] = pixels[i];
-			}
-		}
-
-		if(img->use_alpha == false) {
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
-			}
-		}
-	}
-
-	return true;
-}
-
-template<typename T>
-bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
-{
-	ImageInput *in = NULL;
-	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	float *pixels = (float*)tex_img.resize(width, height, depth);
-	if(pixels == NULL) {
-		return false;
+		pixels = (StorageType*)tex_img.resize(width, height, depth);
 	}
 	bool cmyk = false;
-
+	const size_t num_pixels = ((size_t)width) * height * depth;
 	if(in) {
-		float *readpixels = pixels;
-		vector<float> tmppixels;
-
+		StorageType *readpixels = pixels;
+		vector<StorageType> tmppixels;
 		if(components > 4) {
 			tmppixels.resize(((size_t)width)*height*components);
 			readpixels = &tmppixels[0];
 		}
-
 		if(depth <= 1) {
-			size_t scanlinesize = ((size_t)width)*components*sizeof(float);
-			in->read_image(TypeDesc::FLOAT,
+			size_t scanlinesize = ((size_t)width)*components*sizeof(StorageType);
+			in->read_image(FileFormat,
 			               (uchar*)readpixels + (height-1)*scanlinesize,
 			               AutoStride,
 			               -scanlinesize,
 			               AutoStride);
 		}
 		else {
-			in->read_image(TypeDesc::FLOAT, (uchar*)readpixels);
+			in->read_image(FileFormat, (uchar*)readpixels);
 		}
-
 		if(components > 4) {
 			size_t dimensions = ((size_t)width)*height;
 			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
@@ -604,30 +525,43 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 				pixels[i*4+1] = tmppixels[i*components+1];
 				pixels[i*4+0] = tmppixels[i*components+0];
 			}
-
 			tmppixels.clear();
 		}
-
 		cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4;
-
 		in->close();
 		delete in;
 	}
 	else {
-		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
+		if(FileFormat == TypeDesc::FLOAT) {
+			builtin_image_float_pixels_cb(img->filename,
+			                              img->builtin_data,
+			                              (float*)&pixels[0],
+			                              num_pixels * components);
+		}
+		else if(FileFormat == TypeDesc::UINT8) {
+			builtin_image_pixels_cb(img->filename,
+			                        img->builtin_data,
+			                        (uchar*)&pixels[0],
+			                        num_pixels * components);
+		}
+		else {
+			/* TODO(dingto): Support half for ImBuf. */
+		}
 	}
-
-	/* Check if we actually have a float4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_FLOAT4) {
-		size_t num_pixels = ((size_t)width) * height * depth;
+	/* Check if we actually have an RGBA slot (float4, half4 or byte4), in case
+	 * components == 1, but device doesn't support single channel textures.
+	 */
+	bool is_rgba = (type == IMAGE_DATA_TYPE_FLOAT4 ||
+	                type == IMAGE_DATA_TYPE_HALF4 ||
+	                type == IMAGE_DATA_TYPE_BYTE4);
+	if(is_rgba) {
 		if(cmyk) {
 			/* CMYK */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 255;
 				pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255;
 				pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255;
 				pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255;
+				pixels[i*4+3] = alpha_one;
 			}
 		}
 		else if(components == 2) {
@@ -642,7 +576,7 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 		else if(components == 3) {
 			/* RGB */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 				pixels[i*4+2] = pixels[i*3+2];
 				pixels[i*4+1] = pixels[i*3+1];
 				pixels[i*4+0] = pixels[i*3+0];
@@ -651,128 +585,53 @@ bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_
 		else if(components == 1) {
 			/* grayscale */
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 				pixels[i*4+2] = pixels[i];
 				pixels[i*4+1] = pixels[i];
 				pixels[i*4+0] = pixels[i];
 			}
 		}
-
 		if(img->use_alpha == false) {
 			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
+				pixels[i*4+3] = alpha_one;
 			}
 		}
 	}
-
-	return true;
-}
-
-template<typename T>
-bool ImageManager::file_load_half_image(Image *img, ImageDataType type, device_vector<T>& tex_img)
-{
-	ImageInput *in = NULL;
-	int width, height, depth, components;
-
-	if(!file_load_image_generic(img, &in, width, height, depth, components))
-		return false;
-
-	/* read RGBA pixels */
-	half *pixels = (half*)tex_img.resize(width, height, depth);
-	if(pixels == NULL) {
-		return false;
-	}
-
-	if(in) {
-		half *readpixels = pixels;
-		vector<half> tmppixels;
-
-		if(components > 4) {
-			tmppixels.resize(((size_t)width)*height*components);
-			readpixels = &tmppixels[0];
-		}
-
-		if(depth <= 1) {
-			size_t scanlinesize = ((size_t)width)*components*sizeof(half);
-			in->read_image(TypeDesc::HALF,
-			               (uchar*)readpixels + (height-1)*scanlinesize,
-			               AutoStride,
-			               -scanlinesize,
-			               AutoStride);
-		}
-		else {
-			in->read_image(TypeDesc::HALF, (uchar*)readpixels);
-		}
-
-		if(components > 4) {
-			size_t dimensions = ((size_t)width)*height;
-			for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) {
-				pixels[i*4+3] = tmppixels[i*components+3];
-				pixels[i*4+2] = tmppixels[i*components+2];
-				pixels[i*4+1] = tmppixels[i*components+1];
-				pixels[i*4+0] = tmppixels[i*components+0];
-			}
-
-			tmppixels.clear();
-		}
-
-		in->close();
-		delete in;
+	if(pixels_storage.size() > 0) {
+		float scale_factor = 1.0f;
+		while(max_size * scale_factor > texture_limit) {
+			scale_factor *= 0.5f;
+		}
+		VLOG(1) << "Scaling image " << img->filename
+		        << " by a factor of " << scale_factor << ".";
+		vector<StorageType> scaled_pixels;
+		size_t scaled_width, scaled_height, scaled_depth;
+		util_image_resize_pixels(pixels_storage,
+		                         width, height, depth,
+		                         is_rgba ? 4 : 1,
+		                         scale_factor,
+		                         &scaled_pixels,
+		                         &scaled_width, &scaled_height, &scaled_depth);
+		StorageType *texture_pixels = (StorageType*)tex_img.resize(scaled_width,
+		                                                           scaled_height,
+		                                                           scaled_depth);
+		memcpy(texture_pixels,
+		       &scaled_pixels[0],
+		       scaled_pixels.size() * sizeof(StorageType));
 	}
-#if 0
-	/* TODO(dingto): Support half for ImBuf. */
-	else {
-		builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels);
-	}
-#endif
-
-	/* Check if we actually have a half4 slot, in case components == 1, but device
-	 * doesn't support single channel textures. */
-	if(type == IMAGE_DATA_TYPE_HALF4) {
-		size_t num_pixels = ((size_t)width) * height * depth;
-		if(components == 2) {
-			/* grayscale + alpha */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = pixels[i*2+1];
-				pixels[i*4+2] = pixels[i*2+0];
-				pixels[i*4+1] = pixels[i*2+0];
-				pixels[i*4+0] = pixels[i*2+0];
-			}
-		}
-		else if(components == 3) {
-			/* RGB */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-				pixels[i*4+2] = pixels[i*3+2];
-				pixels[i*4+1] = pixels[i*3+1];
-				pixels[i*4+0] = pixels[i*3+0];
-			}
-		}
-		else if(components == 1) {
-			/* grayscale */
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-				pixels[i*4+2] = pixels[i];
-				pixels[i*4+1] = pixels[i];
-				pixels[i*4+0] = pixels[i];
-			}
-		}
-
-		if(img->use_alpha == false) {
-			for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) {
-				pixels[i*4+3] = 1.0f;
-			}
-		}
-	}
-
 	return true;
 }
 
-void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot, Progress *progress)
+void ImageManager::device_load_image(Device *device,
+                                     DeviceScene *dscene,
+                                     Scene *scene,
+                                     ImageDataType type,
+                                     int slot,
+                                     Progress *progress)
 {
 	if(progress->get_cancel())
 		return;
-	
+
 	Image *img = images[type][slot];
 
 	if(osl_texture_system && !img->builtin_data)
@@ -781,6 +640,8 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 	string filename = path_filename(images[type][slot]->filename);
 	progress->set_status("Updating Images", "Loading " + filename);
 
+	const int texture_limit = scene->params.texture_limit;
+
 	/* Slot assignment */
 	int flat_slot = type_index_to_flattened_slot(slot, type);
 
@@ -800,7 +661,11 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_float_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::FLOAT, float>(img,
+		                                            type,
+		                                            texture_limit,
+		                                            tex_img))
+		{
 			/* on failure to load, we set a 1x1 pixels pink image */
 			float *pixels = (float*)tex_img.resize(1, 1);
 
@@ -826,7 +691,11 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_float_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::FLOAT, float>(img,
+		                                            type,
+		                                            texture_limit,
+		                                            tex_img))
+		{
 			/* on failure to load, we set a 1x1 pixels pink image */
 			float *pixels = (float*)tex_img.resize(1, 1);
 
@@ -849,7 +718,11 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_byte_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::UINT8, uchar>(img,
+		                                            type,
+		                                            texture_limit,
+		                                            tex_img))
+		{
 			/* on failure to load, we set a 1x1 pixels pink image */
 			uchar *pixels = (uchar*)tex_img.resize(1, 1);
 
@@ -875,7 +748,10 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_byte_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::UINT8, uchar>(img,
+		                                            type,
+		                                            texture_limit,
+		                                            tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			uchar *pixels = (uchar*)tex_img.resize(1, 1);
 
@@ -898,7 +774,10 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_half_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::HALF, half>(img,
+		                                          type,
+		                                          texture_limit,
+		                                          tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			half *pixels = (half*)tex_img.resize(1, 1);
 
@@ -924,7 +803,10 @@ void ImageManager::device_load_image(Device *device, DeviceScene *dscene, ImageD
 			device->tex_free(tex_img);
 		}
 
-		if(!file_load_half_image(img, type, tex_img)) {
+		if(!file_load_image<TypeDesc::HALF, half>(img,
+		                                          type,
+		                                          texture_limit,
+		                                          tex_img)) {
 			/* on failure to load, we set a 1x1 pixels pink image */
 			half *pixels = (half*)tex_img.resize(1, 1);
 
@@ -1020,7 +902,10 @@ void ImageManager::device_free_image(Device *device, DeviceScene *dscene, ImageD
 	}
 }
 
-void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress& progress)
+void ImageManager::device_update(Device *device,
+                                 DeviceScene *dscene,
+                                 Scene *scene,
+                                 Progress& progress)
 {
 	if(!need_update)
 		return;
@@ -1037,7 +922,14 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 			}
 			else if(images[type][slot]->need_load) {
 				if(!osl_texture_system || images[type][slot]->builtin_data)
-					pool.push(function_bind(&ImageManager::device_load_image, this, device, dscene, (ImageDataType)type, slot, &progress));
+					pool.push(function_bind(&ImageManager::device_load_image,
+					                        this,
+					                        device,
+					                        dscene,
+					                        scene,
+					                        (ImageDataType)type,
+					                        slot,
+					                        &progress));
 			}
 		}
 	}
@@ -1052,6 +944,7 @@ void ImageManager::device_update(Device *device, DeviceScene *dscene, Progress&
 
 void ImageManager::device_update_slot(Device *device,
                                       DeviceScene *dscene,
+                                      Scene *scene,
                                       int flat_slot,
                                       Progress *progress)
 {
@@ -1068,6 +961,7 @@ void ImageManager::device_update_slot(Device *device,
 		if(!osl_texture_system || image->builtin_data)
 			device_load_image(device,
 			                  dscene,
+			                  scene,
 			                  type,
 			                  slot,
 			                  progress);
@@ -1105,7 +999,7 @@ void ImageManager::device_pack_images(Device *device,
 
 	int info_size = tex_num_images[IMAGE_DATA_TYPE_FLOAT4] + tex_num_images[IMAGE_DATA_TYPE_BYTE4]
 	                + tex_num_images[IMAGE_DATA_TYPE_FLOAT] + tex_num_images[IMAGE_DATA_TYPE_BYTE];
-	uint4 *info = dscene->tex_image_packed_info.resize(info_size);
+	uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
 
 	/* Byte4 Textures*/
 	type = IMAGE_DATA_TYPE_BYTE4;
@@ -1128,7 +1022,9 @@ void ImageManager::device_pack_images(Device *device,
 
 		uint8_t options = pack_image_options(type, slot);
 
-		info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		int index = type_index_to_flattened_slot(slot, type) * 2;
+		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
 
 		memcpy(pixels_byte4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
 		offset += tex_img.size();
@@ -1157,7 +1053,10 @@ void ImageManager::device_pack_images(Device *device,
 		/* todo: support 3D textures, only CPU for now */
 
 		uint8_t options = pack_image_options(type, slot);
-		info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+
+		int index = type_index_to_flattened_slot(slot, type) * 2;
+		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
 
 		memcpy(pixels_float4+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
 		offset += tex_img.size();
@@ -1185,7 +1084,9 @@ void ImageManager::device_pack_images(Device *device,
 
 		uint8_t options = pack_image_options(type, slot);
 
-		info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		int index = type_index_to_flattened_slot(slot, type) * 2;
+		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
 
 		memcpy(pixels_byte+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
 		offset += tex_img.size();
@@ -1214,7 +1115,10 @@ void ImageManager::device_pack_images(Device *device,
 		/* todo: support 3D textures, only CPU for now */
 
 		uint8_t options = pack_image_options(type, slot);
-		info[type_index_to_flattened_slot(slot, type)] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+
+		int index = type_index_to_flattened_slot(slot, type) * 2;
+		info[index] = make_uint4(tex_img.data_width, tex_img.data_height, offset, options);
+		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
 
 		memcpy(pixels_float+offset, (void*)tex_img.data_pointer, tex_img.memory_size());
 		offset += tex_img.size();
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index cca71a6..47bbd92 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -30,6 +30,7 @@ CCL_NAMESPACE_BEGIN
 class Device;
 class DeviceScene;
 class Progress;
+class Scene;
 
 class ImageManager {
 public:
@@ -67,8 +68,15 @@ public:
 	                      ExtensionType extension);
 	ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear);
 
-	void device_update(Device *device, DeviceScene *dscene, Progress& progress);
-	void device_update_slot(Device *device, DeviceScene *dscene, int flat_slot, Progress *progress);
+	void device_update(Device *device,
+	                   DeviceScene *dscene,
+	                   Scene *scene,
+	                   Progress& progress);
+	void device_update_slot(Device *device,
+	                        DeviceScene *dscene,
+	                        Scene *scene,
+	                        int flat_slot,
+	                        Progress *progress);
 	void device_free(Device *device, DeviceScene *dscene);
 	void device_free_builtin(Device *device, DeviceScene *dscene);
 
@@ -78,9 +86,25 @@ public:
 
 	bool need_update;
 
-	function<void(const string &filename, void *data, bool &is_float, int &width, int &height, int &depth, int &channels)> builtin_image_info_cb;
-	function<bool(const string &filename, void *data, unsigned char *pixels)> builtin_image_pixels_cb;
-	function<bool(const string &filename, void *data, float *pixels)> builtin_image_float_pixels_cb;
+	/* NOTE: Here pixels_size is the size of the pixels storage in elements,
+	 *       which callers pass as width * height * depth * channels.
+	 *       Use this to avoid some nasty memory corruptions.
+	 */
+	function<void(const string &filename,
+	              void *data,
+	              bool &is_float,
+	              int &width,
+	              int &height,
+	              int &depth,
+	              int &channels)> builtin_image_info_cb;
+	function<bool(const string &filename,
+	              void *data,
+	              unsigned char *pixels,
+	              const size_t pixels_size)> builtin_image_pixels_cb;
+	function<bool(const string &filename,
+	              void *data,
+	              float *pixels,
+	              const size_t pixels_size)> builtin_image_float_pixels_cb;
 
 	struct Image {
 		string filename;
@@ -109,14 +133,13 @@ private:
 
 	bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components);
 
-	template<typename T>
-	bool file_load_byte_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
-
-	template<typename T>
-	bool file_load_float_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
-
-	template<typename T>
-	bool file_load_half_image(Image *img, ImageDataType type, device_vector<T>& tex_img);
+	template<TypeDesc::BASETYPE FileFormat,
+	         typename StorageType,
+	         typename DeviceType>
+	bool file_load_image(Image *img,
+	                     ImageDataType type,
+	                     int texture_limit,
+	                     device_vector<DeviceType>& tex_img);
 
 	int type_index_to_flattened_slot(int slot, ImageDataType type);
 	int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
@@ -124,10 +147,20 @@ private:
 
 	uint8_t pack_image_options(ImageDataType type, size_t slot);
 
-	void device_load_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot, Progress *progess);
-	void device_free_image(Device *device, DeviceScene *dscene, ImageDataType type, int slot);
-
-	void device_pack_images(Device *device, DeviceScene *dscene, Progress& progess);
+	/* Load the image of the given type/slot; scene supplies texture_limit. */
+	void device_load_image(Device *device,
+	                       DeviceScene *dscene,
+	                       Scene *scene,
+	                       ImageDataType type, int slot,
+	                       Progress *progress);
+	void device_free_image(Device *device,
+	                       DeviceScene *dscene,
+	                       ImageDataType type,
+	                       int slot);
+
+	void device_pack_images(Device *device,
+	                        DeviceScene *dscene,
+	                        Progress& progress);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 63914e5..d434b33 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -62,9 +62,11 @@ NODE_DEFINE(Integrator)
 	SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1);
 	SOCKET_INT(subsurface_samples, "Subsurface Samples", 1);
 	SOCKET_INT(volume_samples, "Volume Samples", 1);
+	SOCKET_INT(start_sample, "Start Sample", 0);
 
 	SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
 	SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
+	SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
 
 	static NodeEnum method_enum;
 	method_enum.insert("path", PATH);
@@ -151,6 +153,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->mesh_light_samples = mesh_light_samples;
 	kintegrator->subsurface_samples = subsurface_samples;
 	kintegrator->volume_samples = volume_samples;
+	kintegrator->start_sample = start_sample;
 
 	if(method == BRANCHED_PATH) {
 		kintegrator->sample_all_lights_direct = sample_all_lights_direct;
@@ -164,6 +167,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->sampling_pattern = sampling_pattern;
 	kintegrator->aa_samples = aa_samples;
 
+	if(light_sampling_threshold > 0.0f) {
+		kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
+	}
+	else {
+		kintegrator->light_inv_rr_threshold = 0.0f;
+	}
+
 	/* sobol directions table */
 	int max_samples = 1;
 
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 39eaaf2..e1e316d 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -64,8 +64,11 @@ public:
 	int mesh_light_samples;
 	int subsurface_samples;
 	int volume_samples;
+	int start_sample;
+
 	bool sample_all_lights_direct;
 	bool sample_all_lights_indirect;
+	float light_sampling_threshold;
 
 	enum Method {
 		BRANCHED_PATH = 0,
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index b6c45dd..2245c86 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -43,8 +43,8 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 
 	for(int y = 0; y < height; y++) {
 		for(int x = 0; x < width; x++) {
-			float u = x/(float)width;
-			float v = y/(float)height;
+			float u = (x + 0.5f)/width;
+			float v = (y + 0.5f)/height;
 
 			uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0);
 			d_input_data[x + y*width] = in;
@@ -106,6 +106,7 @@ NODE_DEFINE(Light)
 
 	static NodeEnum type_enum;
 	type_enum.insert("point", LIGHT_POINT);
+	type_enum.insert("distant", LIGHT_DISTANT);
 	type_enum.insert("background", LIGHT_BACKGROUND);
 	type_enum.insert("area", LIGHT_AREA);
 	type_enum.insert("spot", LIGHT_SPOT);
@@ -126,6 +127,8 @@ NODE_DEFINE(Light)
 	SOCKET_FLOAT(spot_angle, "Spot Angle", M_PI_4_F);
 	SOCKET_FLOAT(spot_smooth, "Spot Smooth", 0.0f);
 
+	SOCKET_TRANSFORM(tfm, "Transform", transform_identity());
+
 	SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true);
 	SOCKET_BOOLEAN(use_mis, "Use Mis", false);
 	SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
@@ -674,7 +677,6 @@ void LightManager::device_update_points(Device *device,
 			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, 0.0f);
 			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_DISTANT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -695,7 +697,6 @@ void LightManager::device_update_points(Device *device,
 			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, cosangle, invarea);
 			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_BACKGROUND) {
 			uint visibility = scene->background->visibility;
@@ -724,7 +725,6 @@ void LightManager::device_update_points(Device *device,
 			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), 0.0f, 0.0f, 0.0f);
 			light_data[light_index*LIGHT_SIZE + 2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_AREA) {
 			float3 axisu = light->axisu*(light->sizeu*light->size);
@@ -742,7 +742,6 @@ void LightManager::device_update_points(Device *device,
 			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), axisu.x, axisu.y, axisu.z);
 			light_data[light_index*LIGHT_SIZE + 2] = make_float4(invarea, axisv.x, axisv.y, axisv.z);
 			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, dir.x, dir.y, dir.z);
-			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 		else if(light->type == LIGHT_SPOT) {
 			shader_id &= ~SHADER_AREA_LIGHT;
@@ -762,9 +761,15 @@ void LightManager::device_update_points(Device *device,
 			light_data[light_index*LIGHT_SIZE + 1] = make_float4(__int_as_float(shader_id), radius, invarea, spot_angle);
 			light_data[light_index*LIGHT_SIZE + 2] = make_float4(spot_smooth, dir.x, dir.y, dir.z);
 			light_data[light_index*LIGHT_SIZE + 3] = make_float4(samples, 0.0f, 0.0f, 0.0f);
-			light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
 		}
 
+		light_data[light_index*LIGHT_SIZE + 4] = make_float4(max_bounces, 0.0f, 0.0f, 0.0f);
+
+		Transform tfm = light->tfm;
+		Transform itfm = transform_inverse(tfm);
+		memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3);
+		memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3);
+
 		light_index++;
 	}
 
@@ -791,6 +796,11 @@ void LightManager::device_update_points(Device *device,
 		light_data[light_index*LIGHT_SIZE + 3] = make_float4(-1, dir.x, dir.y, dir.z);
 		light_data[light_index*LIGHT_SIZE + 4] = make_float4(-1, 0.0f, 0.0f, 0.0f);
 
+		Transform tfm = light->tfm;
+		Transform itfm = transform_inverse(tfm);
+		memcpy(&light_data[light_index*LIGHT_SIZE + 5], &tfm, sizeof(float4)*3);
+		memcpy(&light_data[light_index*LIGHT_SIZE + 8], &itfm, sizeof(float4)*3);
+
 		light_index++;
 	}
 
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index 040a672..f56530b 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -50,6 +50,8 @@ public:
 	float3 axisv;
 	float sizev;
 
+	Transform tfm;
+
 	int map_resolution;
 
 	float spot_angle;
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index ac369a0..c42b329 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -49,6 +49,64 @@ void Mesh::Triangle::bounds_grow(const float3 *verts, BoundBox& bounds) const
 	bounds.grow(verts[v[2]]);
 }
 
+void Mesh::Triangle::motion_verts(const float3 *verts,
+                                  const float3 *vert_steps,
+                                  size_t num_verts,
+                                  size_t num_steps,
+                                  float time,
+                                  float3 r_verts[3]) const
+{
+	/* Write to r_verts this triangle's vertices at `time`, blended between two adjacent motion steps. */
+	const size_t max_step = num_steps - 1;
+	const size_t step = min((int)(time * max_step), max_step - 1); /* Capped so step + 1 never exceeds max_step (assumes num_steps >= 2). */
+	const float t = time*max_step - step; /* Blend factor; presumably time is in [0, 1] -- confirm with callers. */
+	/* Fetch the triangle's vertex coordinates at both steps. */
+	float3 curr_verts[3];
+	float3 next_verts[3];
+	verts_for_step(verts,
+	               vert_steps,
+	               num_verts,
+	               num_steps,
+	               step,
+	               curr_verts);
+	verts_for_step(verts,
+	               vert_steps,
+	               num_verts,
+	               num_steps,
+	               step + 1,
+	               next_verts);
+	/* Linearly interpolate each vertex between the two steps. */
+	r_verts[0] = (1.0f - t)*curr_verts[0] + t*next_verts[0];
+	r_verts[1] = (1.0f - t)*curr_verts[1] + t*next_verts[1];
+	r_verts[2] = (1.0f - t)*curr_verts[2] + t*next_verts[2];
+}
+
+void Mesh::Triangle::verts_for_step(const float3 *verts,
+                                    const float3 *vert_steps,
+                                    size_t num_verts,
+                                    size_t num_steps,
+                                    size_t step,
+                                    float3 r_verts[3]) const
+{
+	const size_t center_step = ((num_steps - 1) / 2); /* Step whose vertices are read from verts rather than vert_steps. */
+	if(step == center_step) {
+		/* Center step: copy the regular vertex locations from verts. */
+		r_verts[0] = verts[v[0]];
+		r_verts[1] = verts[v[1]];
+		r_verts[2] = verts[v[2]];
+	}
+	else {
+		/* The center step is not stored in vert_steps, so steps after it sit one slot earlier. */
+		if(step > center_step) {
+			step--;
+		}
+		size_t offset = step * num_verts; /* Each stored step occupies num_verts consecutive entries. */
+		r_verts[0] = vert_steps[offset + v[0]];
+		r_verts[1] = vert_steps[offset + v[1]];
+		r_verts[2] = vert_steps[offset + v[2]];
+	}
+}
+
 /* Curve */
 
 void Mesh::Curve::bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox& bounds) const
@@ -104,6 +162,205 @@ void Mesh::Curve::bounds_grow(const int k,
 	bounds.grow(upper, mr);
 }
 
+void Mesh::Curve::bounds_grow(float4 keys[4], BoundBox& bounds) const
+{
+	float3 P[4] = { /* Grow `bounds` from four keys: positions as float3 here, w handled via mr below. */
+		float4_to_float3(keys[0]),
+		float4_to_float3(keys[1]),
+		float4_to_float3(keys[2]),
+		float4_to_float3(keys[3]),
+	};
+
+	float3 lower;
+	float3 upper;
+
+	curvebounds(&lower.x, &upper.x, P, 0); /* One call per axis index: 0 -> x, 1 -> y, 2 -> z. */
+	curvebounds(&lower.y, &upper.y, P, 1);
+	curvebounds(&lower.z, &upper.z, P, 2);
+
+	float mr = max(keys[1].w, keys[2].w); /* Largest w of the two middle keys; presumably their radius -- confirm. */
+
+	bounds.grow(lower, mr); /* NOTE(review): mr presumably pads the grown point -- confirm against BoundBox::grow. */
+	bounds.grow(upper, mr);
+}
+
+void Mesh::Curve::motion_keys(const float3 *curve_keys,
+                              const float *curve_radius,
+                              const float3 *key_steps,
+                              size_t num_curve_keys,
+                              size_t num_steps,
+                              float time,
+                              size_t k0, size_t k1,
+                              float4 r_keys[2]) const
+{
+	/* Write to r_keys the keys k0 and k1 at `time`, blended between two adjacent motion steps. */
+	const size_t max_step = num_steps - 1;
+	const size_t step = min((int)(time * max_step), max_step - 1); /* Capped so step + 1 never exceeds max_step (assumes num_steps >= 2). */
+	const float t = time*max_step - step; /* Blend factor; presumably time is in [0, 1] -- confirm with callers. */
+	/* Fetch the two curve keys at both steps. */
+	float4 curr_keys[2];
+	float4 next_keys[2];
+	keys_for_step(curve_keys,
+	              curve_radius,
+	              key_steps,
+	              num_curve_keys,
+	              num_steps,
+	              step,
+	              k0, k1,
+	              curr_keys);
+	keys_for_step(curve_keys,
+	              curve_radius,
+	              key_steps,
+	              num_curve_keys,
+	              num_steps,
+	              step + 1,
+	              k0, k1,
+	              next_keys);
+	/* Linearly interpolate each key (all four components) between the two steps. */
+	r_keys[0] = (1.0f - t)*curr_keys[0] + t*next_keys[0];
+	r_keys[1] = (1.0f - t)*curr_keys[1] + t*next_keys[1];
+}
+
+void Mesh::Curve::cardinal_motion_keys(const float3 *curve_keys,
+                                       const float *curve_radius,
+                                       const float3 *key_steps,
+                                       size_t num_curve_keys,
+                                       size_t num_steps,
+                                       float time,
+                                       size_t k0, size_t k1,
+                                       size_t k2, size_t k3,
+                                       float4 r_keys[4]) const
+{
+	/* Write to r_keys the keys k0..k3 at `time`, blended between two adjacent motion steps. */
+	const size_t max_step = num_steps - 1;
+	const size_t step = min((int)(time * max_step), max_step - 1); /* Capped so step + 1 never exceeds max_step (assumes num_steps >= 2). */
+	const float t = time*max_step - step; /* Blend factor; presumably time is in [0, 1] -- confirm with callers. */
+	/* Fetch the four curve keys at both steps. */
+	float4 curr_keys[4];
+	float4 next_keys[4];
+	cardinal_keys_for_step(curve_keys,
+	                       curve_radius,
+	                       key_steps,
+	                       num_curve_keys,
+	                       num_steps,
+	                       step,
+	                       k0, k1, k2, k3,
+	                       curr_keys);
+	cardinal_keys_for_step(curve_keys,
+	                       curve_radius,
+	                       key_steps,
+	                       num_curve_keys,
+	                       num_steps,
+	                       step + 1,
+	                       k0, k1, k2, k3,
+	                       next_keys);
+	/* Linearly interpolate each key (all four components) between the two steps. */
+	r_keys[0] = (1.0f - t)*curr_keys[0] + t*next_keys[0];
+	r_keys[1] = (1.0f - t)*curr_keys[1] + t*next_keys[1];
+	r_keys[2] = (1.0f - t)*curr_keys[2] + t*next_keys[2];
+	r_keys[3] = (1.0f - t)*curr_keys[3] + t*next_keys[3];
+}
+
+void Mesh::Curve::keys_for_step(const float3 *curve_keys,
+                                const float *curve_radius,
+                                const float3 *key_steps,
+                                size_t num_curve_keys,
+                                size_t num_steps,
+                                size_t step,
+                                size_t k0, size_t k1,
+                                float4 r_keys[2]) const
+{
+	k0 = max(k0, 0); /* NOTE(review): k0 is size_t (unsigned) -- confirm this lower clamp does what is intended. */
+	k1 = min(k1, num_keys - 1); /* Upper clamp against num_keys - 1. */
+	const size_t center_step = ((num_steps - 1) / 2); /* Step read from curve_keys rather than key_steps. */
+	if(step == center_step) {
+		/* Center step: pack (x, y, z) from curve_keys with the value from curve_radius. */
+		/* TODO(sergey): Consider adding make_float4(float3, float)
+		 * function.
+		 */
+		r_keys[0] = make_float4(curve_keys[first_key + k0].x,
+		                        curve_keys[first_key + k0].y,
+		                        curve_keys[first_key + k0].z,
+		                        curve_radius[first_key + k0]);
+		r_keys[1] = make_float4(curve_keys[first_key + k1].x,
+		                        curve_keys[first_key + k1].y,
+		                        curve_keys[first_key + k1].z,
+		                        curve_radius[first_key + k1]);
+	}
+	else {
+		/* The center step is not stored in key_steps, so steps after it sit one slot earlier. */
+		if(step > center_step) {
+			step--;
+		}
+		const size_t offset = first_key + step * num_curve_keys; /* Only xyz comes from key_steps; w is still read from curve_radius. */
+		r_keys[0] = make_float4(key_steps[offset + k0].x,
+		                        key_steps[offset + k0].y,
+		                        key_steps[offset + k0].z,
+		                        curve_radius[first_key + k0]);
+		r_keys[1] = make_float4(key_steps[offset + k1].x,
+		                        key_steps[offset + k1].y,
+		                        key_steps[offset + k1].z,
+		                        curve_radius[first_key + k1]);
+	}
+}
+
+void Mesh::Curve::cardinal_keys_for_step(const float3 *curve_keys,
+                                         const float *curve_radius,
+                                         const float3 *key_steps,
+                                         size_t num_curve_keys,
+                                         size_t num_steps,
+                                         size_t step,
+                                         size_t k0, size_t k1,
+                                         size_t k2, size_t k3,
+                                         float4 r_keys[4]) const
+{
+	k0 = max(k0, 0); /* NOTE(review): k0 is size_t (unsigned) -- confirm this lower clamp does what is intended. */
+	k3 = min(k3, num_keys - 1); /* Only the outer keys k0 and k3 are clamped; k1 and k2 are used as given. */
+	const size_t center_step = ((num_steps - 1) / 2); /* Step read from curve_keys rather than key_steps. */
+	if(step == center_step) {
+		/* Center step: pack (x, y, z) from curve_keys with the value from curve_radius. */
+		r_keys[0] = make_float4(curve_keys[first_key + k0].x,
+		                        curve_keys[first_key + k0].y,
+		                        curve_keys[first_key + k0].z,
+		                        curve_radius[first_key + k0]);
+		r_keys[1] = make_float4(curve_keys[first_key + k1].x,
+		                        curve_keys[first_key + k1].y,
+		                        curve_keys[first_key + k1].z,
+		                        curve_radius[first_key + k1]);
+		r_keys[2] = make_float4(curve_keys[first_key + k2].x,
+		                        curve_keys[first_key + k2].y,
+		                        curve_keys[first_key + k2].z,
+		                        curve_radius[first_key + k2]);
+		r_keys[3] = make_float4(curve_keys[first_key + k3].x,
+		                        curve_keys[first_key + k3].y,
+		                        curve_keys[first_key + k3].z,
+		                        curve_radius[first_key + k3]);
+	}
+	else {
+		/* The center step is not stored in key_steps, so steps after it sit one slot earlier. */
+		if(step > center_step) {
+			step--;
+		}
+		const size_t offset = first_key + step * num_curve_keys; /* Only xyz comes from key_steps; w is still read from curve_radius. */
+		r_keys[0] = make_float4(key_steps[offset + k0].x,
+		                        key_steps[offset + k0].y,
+		                        key_steps[offset + k0].z,
+		                        curve_radius[first_key + k0]);
+		r_keys[1] = make_float4(key_steps[offset + k1].x,
+		                        key_steps[offset + k1].y,
+		                        key_steps[offset + k1].z,
+		                        curve_radius[first_key + k1]);
+		r_keys[2] = make_float4(key_steps[offset + k2].x,
+		                        key_steps[offset + k2].y,
+		                        key_steps[offset + k2].z,
+		                        curve_radius[first_key + k2]);
+		r_keys[3] = make_float4(key_steps[offset + k3].x,
+		                        key_steps[offset + k3].y,
+		                        key_steps[offset + k3].z,
+		                        curve_radius[first_key + k3]);
+	}
+}
+
 /* SubdFace */
 
 float3 Mesh::SubdFace::normal(const Mesh *mesh) const
@@ -394,7 +651,7 @@ void Mesh::compute_bounds()
 		if(use_motion_blur && attr) {
 			size_t steps_size = verts.size() * (motion_steps - 1);
 			float3 *vert_steps = attr->data_float3();
-	
+
 			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(vert_steps[i]);
 		}
@@ -403,7 +660,7 @@ void Mesh::compute_bounds()
 		if(use_motion_blur && curve_attr) {
 			size_t steps_size = curve_keys.size() * (motion_steps - 1);
 			float3 *key_steps = curve_attr->data_float3();
-	
+
 			for(size_t i = 0; i < steps_size; i++)
 				bnds.grow(key_steps[i]);
 		}
@@ -417,11 +674,11 @@ void Mesh::compute_bounds()
 
 			for(size_t i = 0; i < curve_keys_size; i++)
 				bnds.grow_safe(curve_keys[i], curve_radius[i]);
-			
+
 			if(use_motion_blur && attr) {
 				size_t steps_size = verts.size() * (motion_steps - 1);
 				float3 *vert_steps = attr->data_float3();
-		
+
 				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(vert_steps[i]);
 			}
@@ -429,7 +686,7 @@ void Mesh::compute_bounds()
 			if(use_motion_blur && curve_attr) {
 				size_t steps_size = curve_keys.size() * (motion_steps - 1);
 				float3 *key_steps = curve_attr->data_float3();
-		
+
 				for(size_t i = 0; i < steps_size; i++)
 					bnds.grow_safe(key_steps[i]);
 			}
@@ -464,7 +721,7 @@ void Mesh::add_face_normals()
 	/* don't compute if already there */
 	if(attributes.find(ATTR_STD_FACE_NORMAL))
 		return;
-	
+
 	/* get attributes */
 	Attribute *attr_fN = attributes.add(ATTR_STD_FACE_NORMAL);
 	float3 *fN = attr_fN->data_float3();
@@ -796,6 +1053,8 @@ void Mesh::compute_bvh(DeviceScene *dscene,
 			bparams.use_qbvh = params->use_qbvh;
 			bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 			                              params->use_bvh_unaligned_nodes;
+			bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
+			bparams.num_motion_curve_steps = params->num_bvh_time_steps;
 
 			delete bvh;
 			bvh = BVH::create(bparams, objects);
@@ -1002,7 +1261,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 
 	if(attr_map_stride == 0)
 		return;
-	
+
 	/* create attribute map */
 	uint4 *attr_map = dscene->attributes_map.resize(attr_map_stride*scene->objects.size());
 	memset(attr_map, 0, dscene->attributes_map.size()*sizeof(uint));
@@ -1084,7 +1343,7 @@ void MeshManager::update_svm_attributes(Device *device, DeviceScene *dscene, Sce
 		}
 
 		/* terminator */
-		for(int i = 0; i < ATTR_PRIM_TYPES; i++) {
+		for(int j = 0; j < ATTR_PRIM_TYPES; j++) {
 			attr_map[index].x = ATTR_STD_NONE;
 			attr_map[index].y = 0;
 			attr_map[index].z = 0;
@@ -1564,6 +1823,8 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 	bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
 	bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
 	                              scene->params.use_bvh_unaligned_nodes;
+	bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
+	bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
 
 	delete bvh;
 	bvh = BVH::create(bparams, scene->objects);
@@ -1665,6 +1926,7 @@ void MeshManager::device_update_displacement_images(Device *device,
 						 */
 						image_manager->device_update(device,
 						                             dscene,
+						                             scene,
 						                             progress);
 						return;
 					}
@@ -1682,6 +1944,7 @@ void MeshManager::device_update_displacement_images(Device *device,
 		                        image_manager,
 		                        device,
 		                        dscene,
+		                        scene,
 		                        slot,
 		                        &progress));
 	}
@@ -1944,14 +2207,14 @@ bool Mesh::need_attribute(Scene *scene, AttributeStandard std)
 {
 	if(std == ATTR_STD_NONE)
 		return false;
-	
+
 	if(scene->need_global_attribute(std))
 		return true;
 
 	foreach(Shader *shader, used_shaders)
 		if(shader->attributes.find(std))
 			return true;
-	
+
 	return false;
 }
 
@@ -1963,9 +2226,8 @@ bool Mesh::need_attribute(Scene * /*scene*/, ustring name)
 	foreach(Shader *shader, used_shaders)
 		if(shader->attributes.find(name))
 			return true;
-	
+
 	return false;
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index c0310f4..5f33e30 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -31,6 +31,7 @@
 
 CCL_NAMESPACE_BEGIN
 
+class Attribute;
 class BVH;
 class Device;
 class DeviceScene;
@@ -54,11 +55,27 @@ public:
 		int v[3];
 
 		void bounds_grow(const float3 *verts, BoundBox& bounds) const;
+
+		void motion_verts(const float3 *verts,
+		                  const float3 *vert_steps,
+		                  size_t num_verts,
+		                  size_t num_steps,
+		                  float time,
+		                  float3 r_verts[3]) const;
+
+		void verts_for_step(const float3 *verts,
+		                    const float3 *vert_steps,
+		                    size_t num_verts,
+		                    size_t num_steps,
+		                    size_t step,
+		                    float3 r_verts[3]) const;
 	};
 
 	Triangle get_triangle(size_t i) const
 	{
-		Triangle tri = {{triangles[i*3 + 0], triangles[i*3 + 1], triangles[i*3 + 2]}};
+		Triangle tri = {{triangles[i*3 + 0],
+		                 triangles[i*3 + 1],
+		                 triangles[i*3 + 2]}};
 		return tri;
 	}
 
@@ -78,11 +95,48 @@ public:
 		                 const float3 *curve_keys,
 		                 const float *curve_radius,
 		                 BoundBox& bounds) const;
+		void bounds_grow(float4 keys[4], BoundBox& bounds) const;
 		void bounds_grow(const int k,
 		                 const float3 *curve_keys,
 		                 const float *curve_radius,
 		                 const Transform& aligned_space,
 		                 BoundBox& bounds) const;
+
+		void motion_keys(const float3 *curve_keys,
+		                 const float *curve_radius,
+		                 const float3 *key_steps,
+		                 size_t num_curve_keys,
+		                 size_t num_steps,
+		                 float time,
+		                 size_t k0, size_t k1,
+		                 float4 r_keys[2]) const;
+		void cardinal_motion_keys(const float3 *curve_keys,
+		                          const float *curve_radius,
+		                          const float3 *key_steps,
+		                          size_t num_curve_keys,
+		                          size_t num_steps,
+		                          float time,
+		                          size_t k0, size_t k1,
+		                          size_t k2, size_t k3,
+		                          float4 r_keys[4]) const;
+
+		void keys_for_step(const float3 *curve_keys,
+		                   const float *curve_radius,
+		                   const float3 *key_steps,
+		                   size_t num_curve_keys,
+		                   size_t num_steps,
+		                   size_t step,
+		                   size_t k0, size_t k1,
+		                   float4 r_keys[2]) const;
+		void cardinal_keys_for_step(const float3 *curve_keys,
+		                            const float *curve_radius,
+		                            const float3 *key_steps,
+		                            size_t num_curve_keys,
+		                            size_t num_steps,
+		                            size_t step,
+		                            size_t k0, size_t k1,
+		                            size_t k2, size_t k3,
+		                            float4 r_keys[4]) const;
 	};
 
 	Curve get_curve(size_t i) const
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 913c3c7..57c76a9 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -92,7 +92,7 @@ namespace Far {
 
 			if(vert_edges.size() == 2) {
 				float sharpness = refiner.getLevel(0).getEdgeSharpness(vert_edges[0]);
-				sharpness = min(sharpness, refiner.getLevel(0).getEdgeSharpness(vert_edges[1]));
+				sharpness = ccl::min(sharpness, refiner.getLevel(0).getEdgeSharpness(vert_edges[1]));
 
 				setBaseVertexSharpness(refiner, i, sharpness);
 			}
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 405b31e..1e4a9fd 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -1257,6 +1257,7 @@ NODE_DEFINE(BrickTextureNode)
 	SOCKET_IN_COLOR(mortar, "Mortar", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_IN_FLOAT(scale, "Scale", 5.0f);
 	SOCKET_IN_FLOAT(mortar_size, "Mortar Size", 0.02f);
+	SOCKET_IN_FLOAT(mortar_smooth, "Mortar Smooth", 0.0f);
 	SOCKET_IN_FLOAT(bias, "Bias", 0.0f);
 	SOCKET_IN_FLOAT(brick_width, "Brick Width", 0.5f);
 	SOCKET_IN_FLOAT(row_height, "Row Height", 0.25f);
@@ -1280,6 +1281,7 @@ void BrickTextureNode::compile(SVMCompiler& compiler)
 	ShaderInput *mortar_in = input("Mortar");
 	ShaderInput *scale_in = input("Scale");
 	ShaderInput *mortar_size_in = input("Mortar Size");
+	ShaderInput *mortar_smooth_in = input("Mortar Smooth");
 	ShaderInput *bias_in = input("Bias");
 	ShaderInput *brick_width_in = input("Brick Width");
 	ShaderInput *row_height_in = input("Row Height");
@@ -1303,7 +1305,8 @@ void BrickTextureNode::compile(SVMCompiler& compiler)
 		compiler.encode_uchar4(
 			compiler.stack_assign_if_linked(row_height_in),
 			compiler.stack_assign_if_linked(color_out),
-			compiler.stack_assign_if_linked(fac_out)));
+			compiler.stack_assign_if_linked(fac_out),
+			compiler.stack_assign_if_linked(mortar_smooth_in)));
 			
 	compiler.add_node(compiler.encode_uchar4(offset_frequency, squash_frequency),
 		__float_as_int(scale),
@@ -1315,6 +1318,11 @@ void BrickTextureNode::compile(SVMCompiler& compiler)
 		__float_as_int(offset),
 		__float_as_int(squash));
 
+	compiler.add_node(__float_as_int(mortar_smooth),
+		SVM_STACK_INVALID,
+		SVM_STACK_INVALID,
+		SVM_STACK_INVALID);
+
 	tex_mapping.compile_end(compiler, vector_in, vector_offset);
 }
 
@@ -1434,14 +1442,14 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler)
 		else {
 			if(use_density) {
 				compiler.add_node(NODE_VALUE_F,
-								  __float_as_int(0.0f),
-								  compiler.stack_assign(density_out));
+				                  __float_as_int(0.0f),
+				                  compiler.stack_assign(density_out));
 			}
 			if(use_color) {
 				compiler.add_node(NODE_VALUE_V, compiler.stack_assign(color_out));
 				compiler.add_node(NODE_VALUE_V, make_float3(TEX_IMAGE_MISSING_R,
-															TEX_IMAGE_MISSING_G,
-															TEX_IMAGE_MISSING_B));
+				                                            TEX_IMAGE_MISSING_G,
+				                                            TEX_IMAGE_MISSING_B));
 			}
 		}
 	}
@@ -2413,7 +2421,7 @@ void BackgroundNode::compile(SVMCompiler& compiler)
 	if(color_in->link || strength_in->link) {
 		compiler.add_node(NODE_EMISSION_WEIGHT,
 		                  compiler.stack_assign(color_in),
-						  compiler.stack_assign(strength_in));
+		                  compiler.stack_assign(strength_in));
 	}
 	else
 		compiler.add_node(NODE_CLOSURE_SET_WEIGHT, color*strength);
@@ -3019,6 +3027,8 @@ NODE_DEFINE(LightPathNode)
 	SOCKET_OUT_FLOAT(is_volume_scatter_ray, "Is Volume Scatter Ray");
 	SOCKET_OUT_FLOAT(ray_length, "Ray Length");
 	SOCKET_OUT_FLOAT(ray_depth, "Ray Depth");
+	SOCKET_OUT_FLOAT(diffuse_depth, "Diffuse Depth");
+	SOCKET_OUT_FLOAT(glossy_depth, "Glossy Depth");
 	SOCKET_OUT_FLOAT(transparent_depth, "Transparent Depth");
 	SOCKET_OUT_FLOAT(transmission_depth, "Transmission Depth");
 
@@ -3085,6 +3095,16 @@ void LightPathNode::compile(SVMCompiler& compiler)
 		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_depth, compiler.stack_assign(out));
 	}
 
+	out = output("Diffuse Depth");
+	if(!out->links.empty()) {
+		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_diffuse, compiler.stack_assign(out));
+	}
+
+	out = output("Glossy Depth");
+	if(!out->links.empty()) {
+		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_glossy, compiler.stack_assign(out));
+	}
+
 	out = output("Transparent Depth");
 	if(!out->links.empty()) {
 		compiler.add_node(NODE_LIGHT_PATH, NODE_LP_ray_transparent, compiler.stack_assign(out));
@@ -3898,6 +3918,19 @@ void GammaNode::constant_fold(const ConstantFolder& folder)
 	if(folder.all_inputs_constant()) {
 		folder.make_constant(svm_math_gamma_color(color, gamma));
 	}
+	else {
+		ShaderInput *color_in = input("Color");
+		ShaderInput *gamma_in = input("Gamma");
+
+		/* 1 ^ X == X ^ 0 == 1 */
+		if(folder.is_one(color_in) || folder.is_zero(gamma_in)) {
+			folder.make_one();
+		}
+		/* X ^ 1 == X */
+		else if(folder.is_one(gamma_in)) {
+			folder.try_bypass_or_make_constant(color_in, false);
+		}
+	}
 }
 
 void GammaNode::compile(SVMCompiler& compiler)
@@ -3972,7 +4005,7 @@ NODE_DEFINE(SeparateRGBNode)
 
 	SOCKET_IN_COLOR(color, "Image", make_float3(0.0f, 0.0f, 0.0f));
 
-	SOCKET_OUT_FLOAT(g, "R");
+	SOCKET_OUT_FLOAT(r, "R");
 	SOCKET_OUT_FLOAT(g, "G");
 	SOCKET_OUT_FLOAT(b, "B");
 
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 13791c6..eb0f797 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -243,7 +243,7 @@ public:
 	int offset_frequency, squash_frequency;
 
 	float3 color1, color2, mortar;
-	float scale, mortar_size, bias, brick_width, row_height;
+	float scale, mortar_size, mortar_smooth, bias, brick_width, row_height;
 	float3 vector;
 
 	virtual int get_group() { return NODE_GROUP_LEVEL_2; }
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 8b8b988..c592b62 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -166,7 +166,7 @@ void Object::apply_transform(bool apply_to_motion)
 		float3 c0 = transform_get_column(&tfm, 0);
 		float3 c1 = transform_get_column(&tfm, 1);
 		float3 c2 = transform_get_column(&tfm, 2);
-		float scalar = pow(fabsf(dot(cross(c0, c1), c2)), 1.0f/3.0f);
+		float scalar = powf(fabsf(dot(cross(c0, c1), c2)), 1.0f/3.0f);
 
 		/* apply transform to curve keys */
 		for(size_t i = 0; i < mesh->curve_keys.size(); i++) {
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 18a32f7..67b68e6 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -825,7 +825,7 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name)
 			// OSL does not support booleans, so convert to int
 			const array<bool>& value = node->get_bool_array(socket);
 			array<int> intvalue(value.size());
-			for (size_t i = 0; i < value.size(); i++)
+			for(size_t i = 0; i < value.size(); i++)
 				intvalue[i] = value[i];
 			ss->Parameter(uname, array_typedesc(TypeDesc::TypeInt, value.size()), intvalue.data());
 			break;
@@ -861,8 +861,7 @@ void OSLCompiler::parameter(ShaderNode* node, const char *name)
 			// convert to tightly packed array since float3 has padding
 			const array<float3>& value = node->get_float3_array(socket);
 			array<float> fvalue(value.size() * 3);
-			for (size_t i = 0, j = 0; i < value.size(); i++)
-			{
+			for(size_t i = 0, j = 0; i < value.size(); i++) {
 				fvalue[j++] = value[i].x;
 				fvalue[j++] = value[i].y;
 				fvalue[j++] = value[i].z;
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index b341837..68124e7 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -187,7 +187,7 @@ void Scene::device_update(Device *device_, Progress& progress)
 	if(progress.get_cancel() || device->have_error()) return;
 
 	progress.set_status("Updating Images");
-	image_manager->device_update(device, &dscene, progress);
+	image_manager->device_update(device, &dscene, this, progress);
 
 	if(progress.get_cancel() || device->have_error()) return;
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 8fec171..8768682 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -143,8 +143,10 @@ public:
 	} bvh_type;
 	bool use_bvh_spatial_split;
 	bool use_bvh_unaligned_nodes;
+	int num_bvh_time_steps;
 	bool use_qbvh;
 	bool persistent_data;
+	int texture_limit;
 
 	SceneParams()
 	{
@@ -152,8 +154,10 @@ public:
 		bvh_type = BVH_DYNAMIC;
 		use_bvh_spatial_split = false;
 		use_bvh_unaligned_nodes = true;
+		num_bvh_time_steps = 0;
 		use_qbvh = false;
 		persistent_data = false;
+		texture_limit = 0;
 	}
 
 	bool modified(const SceneParams& params)
@@ -161,8 +165,10 @@ public:
 		&& bvh_type == params.bvh_type
 		&& use_bvh_spatial_split == params.use_bvh_spatial_split
 		&& use_bvh_unaligned_nodes == params.use_bvh_unaligned_nodes
+		&& num_bvh_time_steps == params.num_bvh_time_steps
 		&& use_qbvh == params.use_qbvh
-		&& persistent_data == params.persistent_data); }
+		&& persistent_data == params.persistent_data
+		&& texture_limit == params.texture_limit); }
 };
 
 /* Scene */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 9d8c9fe..7c01934 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -67,10 +67,7 @@ Session::Session(const SessionParams& params_)
 	session_thread = NULL;
 	scene = NULL;
 
-	start_time = 0.0;
 	reset_time = 0.0;
-	preview_time = 0.0;
-	paused_time = 0.0;
 	last_update_time = 0.0;
 
 	delayed_reset.do_reset = false;
@@ -201,12 +198,10 @@ void Session::run_gpu()
 {
 	bool tiles_written = false;
 
-	start_time = time_dt();
 	reset_time = time_dt();
-	paused_time = 0.0;
 	last_update_time = time_dt();
 
-	progress.set_render_start_time(start_time + paused_time);
+	progress.set_render_start_time();
 
 	while(!progress.get_cancel()) {
 		/* advance to next tile */
@@ -233,13 +228,9 @@ void Session::run_gpu()
 				update_status_time(pause, no_tiles);
 
 				while(1) {
-					double pause_start = time_dt();
+					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					paused_time += time_dt() - pause_start;
-
-					if(!params.background)
-						progress.set_start_time(start_time + paused_time);
-					progress.set_render_start_time(start_time + paused_time);
+					progress.add_skip_time(pause_timer, params.background);
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -255,7 +246,9 @@ void Session::run_gpu()
 
 		if(!no_tiles) {
 			/* update scene */
+			scoped_timer update_timer;
 			update_scene();
+			progress.add_skip_time(update_timer, params.background);
 
 			if(!device->error_message().empty())
 				progress.set_error(device->error_message());
@@ -465,6 +458,8 @@ void Session::release_tile(RenderTile& rtile)
 {
 	thread_scoped_lock tile_lock(tile_mutex);
 
+	progress.add_finished_tile();
+
 	if(write_render_tile_cb) {
 		if(params.progressive_refine == false) {
 			/* todo: optimize this by making it thread safe and removing lock */
@@ -523,13 +518,9 @@ void Session::run_cpu()
 				update_status_time(pause, no_tiles);
 
 				while(1) {
-					double pause_start = time_dt();
+					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					paused_time += time_dt() - pause_start;
-
-					if(!params.background)
-						progress.set_start_time(start_time + paused_time);
-					progress.set_render_start_time(start_time + paused_time);
+					progress.add_skip_time(pause_timer, params.background);
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -550,7 +541,9 @@ void Session::run_cpu()
 			thread_scoped_lock buffers_lock(buffers_mutex);
 
 			/* update scene */
+			scoped_timer update_timer;
 			update_scene();
+			progress.add_skip_time(update_timer, params.background);
 
 			if(!device->error_message().empty())
 				progress.set_error(device->error_message());
@@ -645,6 +638,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	BakeManager *bake_manager = scene->bake_manager;
 	requested_features.use_baking = bake_manager->get_baking();
 	requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
+	requested_features.use_transparent &= scene->integrator->transparent_shadows;
 
 	return requested_features;
 }
@@ -718,14 +712,14 @@ void Session::reset_(BufferParams& buffer_params, int samples)
 	}
 
 	tile_manager.reset(buffer_params, samples);
+	progress.reset_sample();
 
-	start_time = time_dt();
-	preview_time = 0.0;
-	paused_time = 0.0;
+	bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX;
+	progress.set_total_pixel_samples(show_progress? tile_manager.state.total_pixel_samples : 0);
 
 	if(!params.background)
-		progress.set_start_time(start_time);
-	progress.set_render_start_time(start_time);
+		progress.set_start_time();
+	progress.set_render_start_time();
 }
 
 void Session::reset(BufferParams& buffer_params, int samples)
@@ -827,61 +821,40 @@ void Session::update_scene()
 
 void Session::update_status_time(bool show_pause, bool show_done)
 {
-	int sample = tile_manager.state.sample;
-	int resolution = tile_manager.state.resolution_divider;
-	int num_tiles = tile_manager.state.num_tiles;
+	int progressive_sample = tile_manager.state.sample;
+	int num_samples = tile_manager.get_num_effective_samples();
+
 	int tile = tile_manager.state.num_rendered_tiles;
+	int num_tiles = tile_manager.state.num_tiles;
 
 	/* update status */
 	string status, substatus;
 
 	if(!params.progressive) {
-		const int progress_sample = progress.get_sample(),
-		          num_samples = tile_manager.get_num_effective_samples();
-		const bool is_gpu = params.device.type == DEVICE_CUDA || params.device.type == DEVICE_OPENCL;
-		const bool is_multidevice = params.device.multi_devices.size() > 1;
 		const bool is_cpu = params.device.type == DEVICE_CPU;
-		const bool is_last_tile = (num_samples * num_tiles - progress_sample) < num_samples;
+		const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
-		if((is_gpu && !is_multidevice && !device->info.use_split_kernel) ||
-		   (is_cpu && (num_tiles == 1 || is_last_tile)))
-		{
-			/* When using split-kernel (OpenCL) each thread in a tile will be working on a different
-			 * sample. Can't display sample number when device uses split-kernel
-			 */
-
-			/* when rendering on GPU multithreading happens within single tile, as in
-			 * tiles are handling sequentially and in this case we could display
-			 * currently rendering sample number
-			 * this helps a lot from feedback point of view.
-			 * also display the info on CPU, when using 1 tile only
+		if(device->show_samples() || (is_cpu && is_last_tile)) {
+			/* Some devices automatically support showing the sample number:
+			 * - CUDADevice
+			 * - OpenCLDevice when using the megakernel (the split kernel renders multiple
+			 *   samples at the same time, so the current sample isn't really defined)
+			 * - CPUDevice when using one thread
+			 * For these devices, the current sample is always shown.
+			 *
+			 * The other option is when the last tile is currently being rendered by the CPU.
 			 */
-
-			int status_sample = progress_sample;
-			if(tile > 1) {
-				/* sample counter is global for all tiles, subtract samples
-				 * from already finished tiles to get sample counter for
-				 * current tile only
-				 */
-				if(is_cpu && is_last_tile && num_tiles > 1) {
-					status_sample = num_samples - (num_samples * num_tiles - progress_sample);
-				}
-				else {
-					status_sample -= (tile - 1) * num_samples;
-				}
-			}
-
-			substatus += string_printf(", Sample %d/%d", status_sample, num_samples);
+			substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
 		}
 	}
 	else if(tile_manager.num_samples == INT_MAX)
-		substatus = string_printf("Path Tracing Sample %d", sample+1);
+		substatus = string_printf("Path Tracing Sample %d", progressive_sample+1);
 	else
 		substatus = string_printf("Path Tracing Sample %d/%d",
-		                          sample+1,
-		                          tile_manager.get_num_effective_samples());
+		                          progressive_sample+1,
+		                          num_samples);
 	
 	if(show_pause) {
 		status = "Paused";
@@ -895,22 +868,6 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	}
 
 	progress.set_status(status, substatus);
-
-	/* update timing */
-	if(preview_time == 0.0 && resolution == 1)
-		preview_time = time_dt();
-	
-	double tile_time = (tile == 0 || sample == 0)? 0.0: (time_dt() - preview_time - paused_time) / sample;
-
-	/* negative can happen when we pause a bit before rendering, can discard that */
-	if(preview_time < 0.0) preview_time = 0.0;
-
-	progress.set_tile(tile, tile_time);
-}
-
-void Session::update_progress_sample()
-{
-	progress.increment_sample();
 }
 
 void Session::path_trace()
@@ -922,7 +879,7 @@ void Session::path_trace()
 	task.release_tile = function_bind(&Session::release_tile, this, _1);
 	task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
 	task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
-	task.update_progress_sample = function_bind(&Session::update_progress_sample, this);
+	task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 	task.requested_tile_size = params.tile_size;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 8bff0f9..c7ff144 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -89,8 +89,7 @@ public:
 	}
 
 	bool modified(const SessionParams& params)
-	{ return !(device.type == params.device.type
-		&& device.id == params.device.id
+	{ return !(device == params.device
 		&& background == params.background
 		&& progressive_refine == params.progressive_refine
 		&& output_path == params.output_path
@@ -146,6 +145,10 @@ public:
 
 	void device_free();
 
+	/* Returns the rendering progress or 0 if no progress can be determined
+	 * (for example, when rendering with unlimited samples). */
+	float get_progress();
+
 protected:
 	struct DelayedReset {
 		thread_mutex mutex;
@@ -174,8 +177,6 @@ protected:
 	void update_tile_sample(RenderTile& tile);
 	void release_tile(RenderTile& tile);
 
-	void update_progress_sample();
-
 	bool device_use_gl;
 
 	thread *session_thread;
@@ -195,10 +196,7 @@ protected:
 
 	bool kernels_loaded;
 
-	double start_time;
 	double reset_time;
-	double preview_time;
-	double paused_time;
 
 	/* progressive refine */
 	double last_update_time;
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 70e1443..335edcb 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -194,6 +194,28 @@ Shader::~Shader()
 	delete graph_bump;
 }
 
+bool Shader::is_constant_emission(float3 *emission)
+{
+	ShaderInput *surf = graph->output()->input("Surface");
+
+	if(!surf->link || surf->link->parent->type != EmissionNode::node_type) {
+		return false;
+	}
+
+	EmissionNode *node = (EmissionNode*) surf->link->parent;
+
+	assert(node->input("Color"));
+	assert(node->input("Strength"));
+
+	if(node->input("Color")->link || node->input("Strength")->link) {
+		return false;
+	}
+
+	*emission = node->color*node->strength;
+
+	return true;
+}
+
 void Shader::set_graph(ShaderGraph *graph_)
 {
 	/* do this here already so that we can detect if mesh or object attributes
@@ -379,7 +401,7 @@ void ShaderManager::device_update_common(Device *device,
 	if(scene->shaders.size() == 0)
 		return;
 
-	uint shader_flag_size = scene->shaders.size()*2;
+	uint shader_flag_size = scene->shaders.size()*SHADER_SIZE;
 	uint *shader_flag = dscene->shader_flag.resize(shader_flag_size);
 	uint i = 0;
 	bool has_volumes = false;
@@ -424,9 +446,17 @@ void ShaderManager::device_update_common(Device *device,
 		if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump)
 			flag |= SD_HAS_BSSRDF_BUMP;
 
+		/* constant emission check */
+		float3 constant_emission = make_float3(0.0f, 0.0f, 0.0f);
+		if(shader->is_constant_emission(&constant_emission))
+			flag |= SD_HAS_CONSTANT_EMISSION;
+
 		/* regular shader */
 		shader_flag[i++] = flag;
 		shader_flag[i++] = shader->pass_id;
+		shader_flag[i++] = __float_as_int(constant_emission.x);
+		shader_flag[i++] = __float_as_int(constant_emission.y);
+		shader_flag[i++] = __float_as_int(constant_emission.z);
 
 		has_transparent_shadow |= (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
 	}
@@ -541,6 +571,9 @@ void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
 		if(node->has_surface_bssrdf()) {
 			requested_features->use_subsurface = true;
 		}
+		if(node->has_surface_transparent()) {
+			requested_features->use_transparent = true;
+		}
 	}
 }
 
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 696e22b..7d89665 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -139,6 +139,10 @@ public:
 	Shader();
 	~Shader();
 
+	/* Checks whether the shader consists of just an emission node with fixed inputs that's connected directly to the output.
+	 * If yes, it sets the content of emission to the constant value (color * strength), which is then used for speeding up light evaluation. */
+	bool is_constant_emission(float3* emission);
+
 	void set_graph(ShaderGraph *graph);
 	void tag_update(Scene *scene);
 	void tag_used(Scene *scene);
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 0332106..955b892 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -27,6 +27,7 @@
 #include "util_logging.h"
 #include "util_foreach.h"
 #include "util_progress.h"
+#include "util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -44,6 +45,51 @@ void SVMShaderManager::reset(Scene * /*scene*/)
 {
 }
 
+void SVMShaderManager::device_update_shader(Scene *scene,
+                                            Shader *shader,
+                                            Progress *progress,
+                                            vector<int4> *global_svm_nodes)
+{
+	if(progress->get_cancel()) {
+		return;
+	}
+	assert(shader->graph);
+
+	vector<int4> svm_nodes;
+	svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
+
+	SVMCompiler::Summary summary;
+	SVMCompiler compiler(scene->shader_manager, scene->image_manager);
+	compiler.background = (shader == scene->default_background);
+	compiler.compile(scene, shader, svm_nodes, 0, &summary);
+
+	VLOG(2) << "Compilation summary:\n"
+	        << "Shader name: " << shader->name << "\n"
+	        << summary.full_report();
+
+	if(shader->use_mis && shader->has_surface_emission) {
+		scene->light_manager->need_update = true;
+	}
+
+	/* The copy needs to be done inside the lock: if another thread resizes the array
+	 * while memcpy is running, it'll be copying into possibly invalid/freed RAM.
+	 */
+	nodes_lock_.lock();
+	size_t global_nodes_size = global_svm_nodes->size();
+	global_svm_nodes->resize(global_nodes_size + svm_nodes.size());
+	
+	/* Offset local SVM nodes to a global address space. */
+	int4& jump_node = global_svm_nodes->at(shader->id);
+	jump_node.y = svm_nodes[0].y + global_nodes_size - 1;
+	jump_node.z = svm_nodes[0].z + global_nodes_size - 1;
+	jump_node.w = svm_nodes[0].w + global_nodes_size - 1;
+	/* Copy new nodes to global storage. */
+	memcpy(&global_svm_nodes->at(global_nodes_size),
+	       &svm_nodes[1],
+	       sizeof(int4) * (svm_nodes.size() - 1));
+	nodes_lock_.unlock();
+}
+
 void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress)
 {
 	if(!need_update)
@@ -51,6 +97,8 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 
 	VLOG(1) << "Total " << scene->shaders.size() << " shaders.";
 
+	double start_time = time_dt();
+
 	/* test if we need to update */
 	device_free(device, dscene, scene);
 
@@ -65,23 +113,20 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 		svm_nodes.push_back(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
 	}
 
+	TaskPool task_pool;
 	foreach(Shader *shader, scene->shaders) {
-		if(progress.get_cancel()) return;
-
-		assert(shader->graph);
-
-		SVMCompiler::Summary summary;
-		SVMCompiler compiler(scene->shader_manager, scene->image_manager);
-		compiler.background = (shader == scene->default_background);
-		compiler.compile(scene, shader, svm_nodes, shader->id, &summary);
-
-		if(shader->use_mis && shader->has_surface_emission) {
-			scene->light_manager->need_update = true;
-		}
+		task_pool.push(function_bind(&SVMShaderManager::device_update_shader,
+		                             this,
+		                             scene,
+		                             shader,
+		                             &progress,
+		                             &svm_nodes),
+		               false);
+	}
+	task_pool.wait_work();
 
-		VLOG(2) << "Compilation summary:\n"
-		        << "Shader name: " << shader->name << "\n"
-		        << summary.full_report();
+	if(progress.get_cancel()) {
+		return;
 	}
 
 	dscene->svm_nodes.copy((uint4*)&svm_nodes[0], svm_nodes.size());
@@ -95,6 +140,10 @@ void SVMShaderManager::device_update(Device *device, DeviceScene *dscene, Scene
 	device_update_common(device, dscene, scene, progress);
 
 	need_update = false;
+
+	VLOG(1) << "Shader manager updated "
+	        << scene->shaders.size() << " shaders in "
+	        << time_dt() - start_time << " seconds.";
 }
 
 void SVMShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
@@ -317,17 +366,17 @@ uint SVMCompiler::encode_uchar4(uint x, uint y, uint z, uint w)
 
 void SVMCompiler::add_node(int a, int b, int c, int d)
 {
-	svm_nodes.push_back(make_int4(a, b, c, d));
+	current_svm_nodes.push_back(make_int4(a, b, c, d));
 }
 
 void SVMCompiler::add_node(ShaderNodeType type, int a, int b, int c)
 {
-	svm_nodes.push_back(make_int4(type, a, b, c));
+	current_svm_nodes.push_back(make_int4(type, a, b, c));
 }
 
 void SVMCompiler::add_node(ShaderNodeType type, const float3& f)
 {
-	svm_nodes.push_back(make_int4(type,
+	current_svm_nodes.push_back(make_int4(type,
 		__float_as_int(f.x),
 		__float_as_int(f.y),
 		__float_as_int(f.z)));
@@ -335,7 +384,7 @@ void SVMCompiler::add_node(ShaderNodeType type, const float3& f)
 
 void SVMCompiler::add_node(const float4& f)
 {
-	svm_nodes.push_back(make_int4(
+	current_svm_nodes.push_back(make_int4(
 		__float_as_int(f.x),
 		__float_as_int(f.y),
 		__float_as_int(f.z),
@@ -572,26 +621,38 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
 
 			/* generate instructions for input closure 1 */
 			if(cl1in->link) {
-				/* add instruction to skip closure and its dependencies if mix weight is zero */
-				svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE, 0, stack_assign(facin), 0));
-				int node_jump_skip_index = svm_nodes.size() - 1;
+				/* Add instruction to skip closure and its dependencies if mix
+				 * weight is zero.
+				 */
+				current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ONE,
+				                                      0,
+				                                      stack_assign(facin),
+				                                      0));
+				int node_jump_skip_index = current_svm_nodes.size() - 1;
 
 				generate_multi_closure(root_node, cl1in->link->parent, state);
 
-				/* fill in jump instruction location to be after closure */
-				svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
+				/* Fill in jump instruction location to be after closure. */
+				current_svm_nodes[node_jump_skip_index].y =
+				        current_svm_nodes.size() - node_jump_skip_index - 1;
 			}
 
 			/* generate instructions for input closure 2 */
 			if(cl2in->link) {
-				/* add instruction to skip closure and its dependencies if mix weight is zero */
-				svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO, 0, stack_assign(facin), 0));
-				int node_jump_skip_index = svm_nodes.size() - 1;
+				/* Add instruction to skip closure and its dependencies if mix
+				 * weight is zero.
+				 */
+				current_svm_nodes.push_back(make_int4(NODE_JUMP_IF_ZERO,
+				                                      0,
+				                                      stack_assign(facin),
+				                                      0));
+				int node_jump_skip_index = current_svm_nodes.size() - 1;
 
 				generate_multi_closure(root_node, cl2in->link->parent, state);
 
-				/* fill in jump instruction location to be after closure */
-				svm_nodes[node_jump_skip_index].y = svm_nodes.size() - node_jump_skip_index - 1;
+				/* Fill in jump instruction location to be after closure. */
+				current_svm_nodes[node_jump_skip_index].y =
+				        current_svm_nodes.size() - node_jump_skip_index - 1;
 			}
 
 			/* unassign */
@@ -661,7 +722,7 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
 
 	/* clear all compiler state */
 	memset(&active_stack, 0, sizeof(active_stack));
-	svm_nodes.clear();
+	current_svm_nodes.clear();
 
 	foreach(ShaderNode *node_iter, graph->nodes) {
 		foreach(ShaderInput *input, node_iter->inputs)
@@ -721,7 +782,7 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
 
 	/* if compile failed, generate empty shader */
 	if(compile_failed) {
-		svm_nodes.clear();
+		current_svm_nodes.clear();
 		compile_failed = false;
 	}
 
@@ -733,13 +794,13 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
 
 void SVMCompiler::compile(Scene *scene,
                           Shader *shader,
-                          vector<int4>& global_svm_nodes,
+                          vector<int4>& svm_nodes,
                           int index,
                           Summary *summary)
 {
 	/* copy graph for shader with bump mapping */
 	ShaderNode *node = shader->graph->output();
-	int start_num_svm_nodes = global_svm_nodes.size();
+	int start_num_svm_nodes = svm_nodes.size();
 
 	const double time_start = time_dt();
 
@@ -783,8 +844,10 @@ void SVMCompiler::compile(Scene *scene,
 	if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) {
 		scoped_timer timer((summary != NULL)? &summary->time_generate_bump: NULL);
 		compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP);
-		global_svm_nodes[index].y = global_svm_nodes.size();
-		global_svm_nodes.insert(global_svm_nodes.end(), svm_nodes.begin(), svm_nodes.end());
+		svm_nodes[index].y = svm_nodes.size();
+		svm_nodes.insert(svm_nodes.end(),
+		                 current_svm_nodes.begin(),
+		                 current_svm_nodes.end());
 	}
 
 	/* generate surface shader */
@@ -793,32 +856,38 @@ void SVMCompiler::compile(Scene *scene,
 		compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
 		/* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */
 		if(shader->displacement_method == DISPLACE_TRUE || !shader->graph_bump) {
-			global_svm_nodes[index].y = global_svm_nodes.size();
+			svm_nodes[index].y = svm_nodes.size();
 		}
-		global_svm_nodes.insert(global_svm_nodes.end(), svm_nodes.begin(), svm_nodes.end());
+		svm_nodes.insert(svm_nodes.end(),
+		                 current_svm_nodes.begin(),
+		                 current_svm_nodes.end());
 	}
 
 	/* generate volume shader */
 	{
 		scoped_timer timer((summary != NULL)? &summary->time_generate_volume: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_VOLUME);
-		global_svm_nodes[index].z = global_svm_nodes.size();
-		global_svm_nodes.insert(global_svm_nodes.end(), svm_nodes.begin(), svm_nodes.end());
+		svm_nodes[index].z = svm_nodes.size();
+		svm_nodes.insert(svm_nodes.end(),
+		                 current_svm_nodes.begin(),
+		                 current_svm_nodes.end());
 	}
 
 	/* generate displacement shader */
 	{
 		scoped_timer timer((summary != NULL)? &summary->time_generate_displacement: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_DISPLACEMENT);
-		global_svm_nodes[index].w = global_svm_nodes.size();
-		global_svm_nodes.insert(global_svm_nodes.end(), svm_nodes.begin(), svm_nodes.end());
+		svm_nodes[index].w = svm_nodes.size();
+		svm_nodes.insert(svm_nodes.end(),
+		                 current_svm_nodes.begin(),
+		                 current_svm_nodes.end());
 	}
 
 	/* Fill in summary information. */
 	if(summary != NULL) {
 		summary->time_total = time_dt() - time_start;
 		summary->peak_stack_usage = max_stack_use;
-		summary->num_svm_nodes = global_svm_nodes.size() - start_num_svm_nodes;
+		summary->num_svm_nodes = svm_nodes.size() - start_num_svm_nodes;
 	}
 }
 
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index 99e91ca..a501b6b 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -23,6 +23,7 @@
 
 #include "util_set.h"
 #include "util_string.h"
+#include "util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,6 +47,15 @@ public:
 
 	void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress& progress);
 	void device_free(Device *device, DeviceScene *dscene, Scene *scene);
+
+protected:
+	/* Lock used to synchronize threaded nodes compilation. */
+	thread_spin_lock nodes_lock_;
+
+	void device_update_shader(Scene *scene,
+	                          Shader *shader,
+	                          Progress *progress,
+	                          vector<int4> *global_svm_nodes);
 };
 
 /* Graph Compiler */
@@ -200,7 +210,7 @@ protected:
 	/* compile */
 	void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type);
 
-	vector<int4> svm_nodes;
+	vector<int4> current_svm_nodes;
 	ShaderType current_type;
 	Shader *current_shader;
 	ShaderGraph *current_graph;
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 3a6dfea..a493c3f 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -108,36 +108,57 @@ TileManager::~TileManager()
 {
 }
 
-void TileManager::reset(BufferParams& params_, int num_samples_)
+static int get_divider(int w, int h, int start_resolution)
 {
-	params = params_;
-
 	int divider = 1;
-	int w = params.width, h = params.height;
-
 	if(start_resolution != INT_MAX) {
 		while(w*h > start_resolution*start_resolution) {
 			w = max(1, w/2);
 			h = max(1, h/2);
 
-			divider *= 2;
+			divider <<= 1;
 		}
 	}
+	return divider;
+}
 
-	num_samples = num_samples_;
+void TileManager::reset(BufferParams& params_, int num_samples_)
+{
+	params = params_;
+
+	set_samples(num_samples_);
 
 	state.buffer = BufferParams();
 	state.sample = range_start_sample - 1;
 	state.num_tiles = 0;
 	state.num_rendered_tiles = 0;
 	state.num_samples = 0;
-	state.resolution_divider = divider;
+	state.resolution_divider = get_divider(params.width, params.height, start_resolution);
 	state.tiles.clear();
 }
 
 void TileManager::set_samples(int num_samples_)
 {
 	num_samples = num_samples_;
+
+	/* No real progress indication is possible when using unlimited samples. */
+	if(num_samples == INT_MAX) {
+		state.total_pixel_samples = 0;
+	}
+	else {
+		uint64_t pixel_samples = 0;
+		/* While rendering in the viewport, the initial preview resolution is increased to the native resolution
+		 * before the actual rendering begins. Therefore, additional pixel samples will be rendered. */
+		int divider = get_divider(params.width, params.height, start_resolution) / 2;
+		while(divider > 1) {
+			int image_w = max(1, params.width/divider);
+			int image_h = max(1, params.height/divider);
+			pixel_samples += image_w * image_h;
+			divider >>= 1;
+		}
+
+		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+	}
 }
 
 /* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render device.
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index af1b1ed..5d92eba 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -64,6 +64,10 @@ public:
 		int resolution_divider;
 		int num_tiles;
 		int num_rendered_tiles;
+
+		/* Total samples over all pixels: Generally num_samples*num_pixels,
+		 * but can be higher due to the initial resolution division for previews. */
+		uint64_t total_pixel_samples;
 		/* This vector contains a list of tiles for every logical device in the session.
 		 * In each list, the tiles are sorted according to the tile order setting. */
 		vector<list<Tile> > tiles;
@@ -91,7 +95,7 @@ public:
 	/* Number to samples in the rendering range. */
 	int range_num_samples;
 
-	/* get number of actual samples to render. */
+	/* Get number of actual samples to render. */
 	int get_num_effective_samples();
 protected:
 
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
index 62572ef..d437b04 100644
--- a/intern/cycles/subd/subd_patch_table.cpp
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -46,7 +46,7 @@ struct PatchMapQuadNode {
 	/* sets all the children to point to the patch of index */
 	void set_child(int index)
 	{
-		for (int i = 0; i < 4; i++) {
+		for(int i = 0; i < 4; i++) {
 			children[i] = index | PATCH_MAP_NODE_IS_SET | PATCH_MAP_NODE_IS_LEAF;
 		}
 	}
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index 60e41be..32b4c72 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -931,6 +931,72 @@ TEST(render_graph, constant_fold_gamma)
 }
 
 /*
+ * Tests: Gamma with one constant 0 input.
+ */
+TEST(render_graph, constant_fold_gamma_part_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding Gamma_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to constant (1, 1, 1).");
+
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_Cx")
+		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
+		.add_connection("Attribute::Fac", "Gamma_Cx::Gamma")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_xC")
+		          .set("Gamma", 0.0f))
+		.add_connection("Attribute::Color", "Gamma_xC::Color")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<MixNode>("Out")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Gamma_Cx::Color", "Out::Color1")
+		.add_connection("Gamma_xC::Color", "Out::Color2")
+		.output_color("Out::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Gamma with one constant 1 input.
+ */
+TEST(render_graph, constant_fold_gamma_part_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_Cx::Color to constant (1, 1, 1).");
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to socket Attribute::Color.");
+
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_Cx")
+		          .set("Color", make_float3(1.0f, 1.0f, 1.0f)))
+		.add_connection("Attribute::Fac", "Gamma_Cx::Gamma")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_xC")
+		          .set("Gamma", 1.0f))
+		.add_connection("Attribute::Color", "Gamma_xC::Color")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<MixNode>("Out")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Gamma_Cx::Color", "Out::Color1")
+		.add_connection("Gamma_xC::Color", "Out::Color2")
+		.output_color("Out::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
  * Tests: BrightnessContrast with all constant inputs.
  */
 TEST(render_graph, constant_fold_bright_contrast)
@@ -1143,6 +1209,40 @@ TEST(render_graph, constant_fold_part_math_div_0)
 }
 
 /*
+ * Tests: partial folding for Math Power with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_pow_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X ^ 0 == 1; presumably Math_Cx (0 ^ X) and Out must not be reported as folded. */
+	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to constant (1).");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_POWER, 0.0f);  /* known constant is 0 */
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Power with known 1.
+ */
+TEST(render_graph, constant_fold_part_math_pow_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 1 ^ X == 1; X ^ 1 == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (1)");  /* NOTE(review): no trailing '.' unlike the sibling expectations -- confirm the match is by substring. */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_POWER, 1.0f);  /* known constant is 1 */
+	graph.finalize(&scene);
+}
+
+/*
  * Tests: Vector Math with all constant inputs.
  */
 TEST(render_graph, constant_fold_vector_math)
@@ -1307,8 +1407,9 @@ void init_test_curve(array<T> &buffer, T start, T end, int steps)
 {
 	buffer.resize(steps);
 
-	for (int i = 0; i < steps; i++)
+	for(int i = 0; i < steps; i++) {
 		buffer[i] = lerp(start, end, float(i)/(steps-1));
+	}
 }
 
 /*
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index f5674bd..d8abf67 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -45,6 +45,7 @@ set(SRC_HEADERS
 	util_half.h
 	util_hash.h
 	util_image.h
+	util_image_impl.h
 	util_list.h
 	util_logging.h
 	util_map.h
@@ -63,6 +64,7 @@ set(SRC_HEADERS
 	util_sky_model.cpp
 	util_sky_model.h
 	util_sky_model_data.h
+	util_avxf.h
 	util_sseb.h
 	util_ssef.h
 	util_ssei.h
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 1d1e296..433e41f 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -39,7 +39,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 /* Float atomics implementation credits:
  *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
  */
-ccl_device_inline void atomic_add_float(volatile ccl_global float *source,
+ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
                                         const float operand)
 {
 	union {
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
new file mode 100644
index 0000000..2451213
--- /dev/null
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2016 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+struct avxf  /* Wrapper around one 256-bit AVX register treated as 8 floats. */
+{
+	typedef avxf Float;
+
+	enum { size = 8 };  /* Number of SIMD elements. */
+
+	union {  /* The same storage viewed as a register, 8 floats or 8 ints. */
+		__m256 m256;
+		float f[8];
+		int i[8];
+	};
+
+	__forceinline avxf           () {}  /* Leaves the lanes uninitialized. */
+	__forceinline avxf           (const avxf& other) { m256 = other.m256; }
+	__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
+
+	__forceinline avxf(const __m256 a) : m256(a) {}
+	__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}  /* Bit reinterpretation, no int-to-float conversion. */
+
+	__forceinline operator const __m256&(void) const { return m256; }  /* Lets an avxf be passed straight to intrinsics. */
+	__forceinline operator       __m256&(void)       { return m256; }
+
+	__forceinline avxf          (float a) : m256(_mm256_set1_ps(a)) {}  /* Broadcast a to all 8 lanes. */
+
+	__forceinline avxf(float high32x4, float low32x4) :  /* high32x4 fills the upper 4 lanes, low32x4 the lower 4. */
+	   m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
+
+	__forceinline avxf(float a3, float a2, float a1, float a0) :  /* Same 4 values in both 128-bit halves; a0 is the lowest lane. */
+	   m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
+
+	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :  /* a0 is lane 0, a7 is lane 7. */
+		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+
+	/* Integer constructors: the ints are stored as raw bit patterns (cast, not converted). */
+	__forceinline avxf(int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	/* As above, with all eight lanes given explicitly (a0 is lane 0). */
+	__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	__forceinline avxf(__m128 a, __m128 b)  /* a becomes the lower 128 bits, b the upper 128 bits. */
+	{
+		const __m256 foo = _mm256_castps128_ps256(a);
+		m256 = _mm256_insertf128_ps(foo, b, 1);
+	}
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }  /* Per-lane square root. */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }  /* Per-lane sum. */
+__forceinline const avxf operator +(const avxf& a, const float& b) { return a + avxf(b); }  /* Scalar is broadcast to all lanes first. */
+__forceinline const avxf operator +(const float& a, const avxf& b) { return avxf(a) + b; }
+
+__forceinline const avxf operator -(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }  /* Per-lane difference. */
+__forceinline const avxf operator -(const avxf& a, const float& b) { return a - avxf(b); }  /* Scalar is broadcast to all lanes first. */
+__forceinline const avxf operator -(const float& a, const avxf& b) { return avxf(a) - b; }
+
+__forceinline const avxf operator *(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }  /* Per-lane product. */
+__forceinline const avxf operator *(const avxf& a, const float& b) { return a * avxf(b); }  /* Scalar is broadcast to all lanes first. */
+__forceinline const avxf operator *(const float& a, const avxf& b) { return avxf(a) * b; }
+
+__forceinline const avxf operator /(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256,b.m256); }  /* Per-lane quotient (true division, not a reciprocal estimate). */
+__forceinline const avxf operator /(const avxf& a, const float& b) { return a/avxf(b); }  /* Scalar is broadcast to all lanes first. */
+__forceinline const avxf operator /(const float& a, const avxf& b) { return avxf(a)/b; }
+
+__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256,b.m256); }  /* Bitwise OR of the raw lane bits. */
+
+__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256,b.m256); }  /* Bitwise XOR of the raw lane bits. */
+
+__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }  /* Bitwise AND of the raw lane bits. */
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf shuffle(const avxf& a, const __m256i &shuf) {
+	return _mm256_permutevar_ps(a, shuf);  /* In-lane permute: each index selects within its own 128-bit half (low 2 bits used). */
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf shuffle(const avxf& a) {
+	return _mm256_permutevar_ps(a, _mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));  /* i0 controls lane 0; still limited to the lane's own 128-bit half. */
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));  /* Per 128-bit half: i0, i1 pick from a; i2, i3 pick from b. */
+}
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0,i1,i2,i3>(a,a);  /* Single-source form of the two-operand shuffle. */
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return shuffle<i0,i0,i0,i0>(a, b);  /* Same index for all four positions of each half. */
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0>(a,a);  /* Replicates element i0 of each 128-bit half across that half. */
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf& a) {
+#ifdef __KERNEL_AVX2__
+	return  _mm256_permutevar8x32_ps(a,_mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));  /* Cross-lane permute: result lane n is a[i_n]. */
+#else
+	float temp[8];  /* Without AVX2: spill to memory and rebuild with the same lane mapping. */
+	_mm256_storeu_ps((float*)&temp, a);
+	return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
+#endif
+}
+
+template<int S0, int S1, int S2, int S3,int S4,int S5,int S6, int S7>
+ccl_device_inline const avxf set_sign_bit(const avxf &a)
+{
+	return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31,S2 << 31,S1 << 31,S0 << 31);  /* XOR with a sign-bit mask: lanes with S == 1 get their sign flipped (not forced); S0 is lane 0. */
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3,size_t S4,size_t S5,size_t S6, size_t S7>
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return _mm256_blend_ps(a,b,S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);  /* Mask bit n set takes lane n from b: S7 drives lane 0, S0 drives lane 7. NOTE(review): reverse of the a7..a0 constructor order -- confirm intended. */
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3 >
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);  /* Same four selectors applied to both 128-bit halves. */
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf madd (const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fmadd_ps(a,b,c);  /* a*b + c as one fused operation. */
+#else
+	return c+(a*b);  /* Separate multiply and add; rounding may differ from the fused path. */
+#endif
+}
+
+__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fnmadd_ps(a, b, c);  /* -(a*b) + c as one fused operation. */
+#else
+	return c-(a*b);  /* Separate multiply and subtract; rounding may differ from the fused path. */
+#endif
+}
+#endif
+
+#ifndef _mm256_set_m128  /* Only detects a macro definition; presumably a fallback for headers that lack it. */
+#  define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
+    _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
+#endif
+/* Two unaligned 4-float loads combined into one __m256. NOTE(review): defined without an #ifndef guard -- confirm no toolchain header provides it as a macro. */
+#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
+    _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index 599222d..dfe4977 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -25,8 +25,6 @@
 #include "util_transform.h"
 #include "util_types.h"
 
-using namespace std;
-
 CCL_NAMESPACE_BEGIN
 
 /* 3D BoundBox */
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index 3ff2802..98c3a68 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -21,7 +21,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-static inline uint hash_int_2d(uint kx, uint ky)
+ccl_device_inline uint hash_int_2d(uint kx, uint ky)
 {
 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
 
@@ -44,11 +44,12 @@ static inline uint hash_int_2d(uint kx, uint ky)
 #undef rot
 }
 
-static inline uint hash_int(uint k)
+ccl_device_inline uint hash_int(uint k)
 {
 	return hash_int_2d(k, 0);
 }
 
+#ifndef __KERNEL_GPU__
 static inline uint hash_string(const char *str)
 {
 	uint i = 0, c;
@@ -58,6 +59,7 @@ static inline uint hash_string(const char *str)
 
 	return i;
 }
+#endif
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index bb8a31c..c8efc55 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -21,11 +21,29 @@
 
 #include <OpenImageIO/imageio.h>
 
+#include "util_vector.h"
+
 CCL_NAMESPACE_BEGIN
 
 OIIO_NAMESPACE_USING
 
+/* Resize image pixels by scale_factor, writing the result and its
+ * dimensions to the output arguments. The definition lives in
+ * util_image_impl.h, which is included at the bottom of this header. */
+template<typename T>
+void util_image_resize_pixels(const vector<T>& input_pixels,
+                              const size_t input_width,
+                              const size_t input_height,
+                              const size_t input_depth,
+                              const size_t components,
+                              const float scale_factor,
+                              vector<T> *output_pixels,
+                              size_t *output_width,
+                              size_t *output_height,
+                              size_t *output_depth);
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_IMAGE_H__ */
 
+#include "util_image_impl.h"
diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h
new file mode 100644
index 0000000..73ecfda
--- /dev/null
+++ b/intern/cycles/util/util_image_impl.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_IMAGE_IMPL_H__
+#define __UTIL_IMAGE_IMPL_H__
+
+#include "util_algorithm.h"
+#include "util_debug.h"
+#include "util_image.h"
+
+CCL_NAMESPACE_BEGIN
+
+namespace {
+
+template<typename T>  /* Pointer to the first component of pixel (x, y, z); this function does no range checks of its own. */
+const T *util_image_read(const vector<T>& pixels,
+                         const size_t width,
+                         const size_t height,
+                         const size_t /*depth*/,
+                         const size_t components,
+                         const size_t x, const size_t y, const size_t z) {
+	const size_t index = ((size_t)z * (width * height) +
+	                      (size_t)y * width +
+	                      (size_t)x) * components;  /* x varies fastest, then y, then z; components are interleaved per pixel. */
+	return &pixels[index];
+}
+
+template<typename T>  /* Box-filter average of the kernel_size^3 input block starting at (x, y, z), written to result[0..components). */
+void util_image_downscale_sample(const vector<T>& pixels,
+                                 const size_t width,
+                                 const size_t height,
+                                 const size_t depth,
+                                 const size_t components,
+                                 const size_t kernel_size,
+                                 const float x,
+                                 const float y,
+                                 const float z,
+                                 T *result)
+{
+	assert(components <= 4);  /* accum[] below holds at most 4 channels. */
+	const size_t ix = (size_t)x,  /* Truncate to the first texel of the block. */
+	             iy = (size_t)y,
+	             iz = (size_t)z;
+	/* TODO(sergey): Support something smarter than box filter. */
+	float accum[4] = {0};
+	size_t count = 0;
+	for(size_t dz = 0; dz < kernel_size; ++dz) {
+		for(size_t dy = 0; dy < kernel_size; ++dy) {
+			for(size_t dx = 0; dx < kernel_size; ++dx) {
+				const size_t nx = ix + dx,
+				             ny = iy + dy,
+				             nz = iz + dz;
+				if(nx >= width || ny >= height || nz >= depth) {
+					continue;  /* Taps outside the image are left out of the average. */
+				}
+				const T *pixel = util_image_read(pixels,
+				                                 width, height, depth,
+				                                 components,
+				                                 nx, ny, nz);
+				for(size_t k = 0; k < components; ++k) {
+					accum[k] += pixel[k];
+				}
+				++count;
+			}
+		}
+	}
+	const float inv_count = (count != 0)? 1.0f / (float)count: 0.0f;  /* No tap inside the image: emit zeros instead of 0 * inf. */
+	for(size_t k = 0; k < components; ++k) {
+		result[k] = T(accum[k] * inv_count);  /* Converted back to the pixel type. */
+	}
+}
+
+template<typename T>  /* Fills every output pixel with a box-filtered sample of the input; output_pixels must already hold the full output size. */
+void util_image_downscale_pixels(const vector<T>& input_pixels,
+                                 const size_t input_width,
+                                 const size_t input_height,
+                                 const size_t input_depth,
+                                 const size_t components,
+                                 const float inv_scale_factor,
+                                 const size_t output_width,
+                                 const size_t output_height,
+                                 const size_t output_depth,
+                                 vector<T> *output_pixels)
+{
+	const size_t kernel_size = (size_t)(inv_scale_factor + 0.5f);  /* Inverse scale rounded to the nearest integer. */
+	for(size_t z = 0; z < output_depth; ++z) {
+		for(size_t y = 0; y < output_height; ++y) {
+			for(size_t x = 0; x < output_width; ++x) {
+				const float input_x = (float)x * inv_scale_factor,  /* Output coordinate mapped back into input space. */
+				            input_y = (float)y * inv_scale_factor,
+				            input_z = (float)z * inv_scale_factor;
+				const size_t output_index =
+				        (z * output_width * output_height +
+				         y * output_width + x) * components;  /* Same x-fastest, interleaved layout as the input. */
+				util_image_downscale_sample(input_pixels,
+				                            input_width, input_height, input_depth,
+				                            components,
+				                            kernel_size,
+				                            input_x, input_y, input_z,
+				                            &output_pixels->at(output_index));
+			}
+		}
+	}
+}
+
+}  /* namespace */
+
+template<typename T>  /* Resizes input_pixels by scale_factor into output_pixels and reports the resulting dimensions. */
+void util_image_resize_pixels(const vector<T>& input_pixels,
+                              const size_t input_width,
+                              const size_t input_height,
+                              const size_t input_depth,
+                              const size_t components,
+                              const float scale_factor,
+                              vector<T> *output_pixels,
+                              size_t *output_width,
+                              size_t *output_height,
+                              size_t *output_depth)
+{
+	/* Early output for case when no scaling is applied: plain copy of input. */
+	if(scale_factor == 1.0f) {
+		*output_width = input_width;
+		*output_height = input_height;
+		*output_depth = input_depth;
+		*output_pixels = input_pixels;
+		return;
+	}
+	/* First of all, we calculate output image dimensions (truncating), clamped
+	 * to at least 1 pixel so we do not generate a degenerate image.
+	 * NOTE(review): assumes scale_factor > 0 -- confirm against callers.
+	 */
+	*output_width = max((size_t)((float)input_width * scale_factor), (size_t)1);
+	*output_height = max((size_t)((float)input_height * scale_factor), (size_t)1);
+	*output_depth = max((size_t)((float)input_depth * scale_factor), (size_t)1);
+	/* Prepare pixel storage for the result (element count includes components). */
+	const size_t num_output_pixels = ((*output_width) *
+	                                  (*output_height) *
+	                                  (*output_depth)) * components;
+	output_pixels->resize(num_output_pixels);
+	if(scale_factor < 1.0f) {
+		const float inv_scale_factor = 1.0f / scale_factor;
+		util_image_downscale_pixels(input_pixels,
+		                            input_width, input_height, input_depth,
+		                            components,
+		                            inv_scale_factor,
+		                            *output_width, *output_height, *output_depth,
+		                            output_pixels);
+	} else {
+		/* TODO(sergey): Needs implementation. Until then upscaling only resizes the buffer; no input pixel is copied. */
+	}
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __UTIL_IMAGE_IMPL_H__ */
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 89a882d..2b81c8c 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -22,6 +22,11 @@
  * Basic math functions on scalar and vector types. This header is used by
  * both the kernel code when compiled as C++, and other C++ non-kernel code. */
 
+#ifndef __KERNEL_GPU__
+#  include <cmath>
+#endif
+
+
 #ifndef __KERNEL_OPENCL__
 
 #include <float.h>
@@ -97,6 +102,9 @@ ccl_device_inline float fminf(float a, float b)
 
 #ifndef __KERNEL_GPU__
 
+using std::isfinite;
+using std::isnan;
+
 ccl_device_inline int abs(int x)
 {
 	return (x > 0)? x: -x;
@@ -162,6 +170,11 @@ ccl_device_inline float max4(float a, float b, float c, float d)
 	return max(max(a, b), max(c, d));
 }
 
+ccl_device_inline float max3(float3 a)  /* Largest of the x, y and z components. */
+{
+	return max(max(a.x, a.y), a.z);
+}
+
 #ifndef __KERNEL_OPENCL__
 
 ccl_device_inline int clamp(int a, int mn, int mx)
@@ -233,7 +246,7 @@ ccl_device_inline int mod(int x, int m)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline bool is_zero(const float2 a)
+ccl_device_inline bool is_zero(const float2& a)
 {
 	return (a.x == 0.0f && a.y == 0.0f);
 }
@@ -242,7 +255,7 @@ ccl_device_inline bool is_zero(const float2 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float average(const float2 a)
+ccl_device_inline float average(const float2& a)
 {
 	return (a.x + a.y)*(1.0f/2.0f);
 }
@@ -251,58 +264,58 @@ ccl_device_inline float average(const float2 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float2 operator-(const float2 a)
+ccl_device_inline float2 operator-(const float2& a)
 {
 	return make_float2(-a.x, -a.y);
 }
 
-ccl_device_inline float2 operator*(const float2 a, const float2 b)
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
 {
 	return make_float2(a.x*b.x, a.y*b.y);
 }
 
-ccl_device_inline float2 operator*(const float2 a, float f)
+ccl_device_inline float2 operator*(const float2& a, float f)
 {
 	return make_float2(a.x*f, a.y*f);
 }
 
-ccl_device_inline float2 operator*(float f, const float2 a)
+ccl_device_inline float2 operator*(float f, const float2& a)
 {
 	return make_float2(a.x*f, a.y*f);
 }
 
-ccl_device_inline float2 operator/(float f, const float2 a)
+ccl_device_inline float2 operator/(float f, const float2& a)
 {
 	return make_float2(f/a.x, f/a.y);
 }
 
-ccl_device_inline float2 operator/(const float2 a, float f)
+ccl_device_inline float2 operator/(const float2& a, float f)
 {
 	float invf = 1.0f/f;
 	return make_float2(a.x*invf, a.y*invf);
 }
 
-ccl_device_inline float2 operator/(const float2 a, const float2 b)
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
 {
 	return make_float2(a.x/b.x, a.y/b.y);
 }
 
-ccl_device_inline float2 operator+(const float2 a, const float2 b)
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
 {
 	return make_float2(a.x+b.x, a.y+b.y);
 }
 
-ccl_device_inline float2 operator-(const float2 a, const float2 b)
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
 {
 	return make_float2(a.x-b.x, a.y-b.y);
 }
 
-ccl_device_inline float2 operator+=(float2& a, const float2 b)
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
 {
 	return a = a + b;
 }
 
-ccl_device_inline float2 operator*=(float2& a, const float2 b)
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
 {
 	return a = a * b;
 }
@@ -312,7 +325,7 @@ ccl_device_inline float2 operator*=(float2& a, float f)
 	return a = a * f;
 }
 
-ccl_device_inline float2 operator/=(float2& a, const float2 b)
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
 {
 	return a = a / b;
 }
@@ -324,12 +337,12 @@ ccl_device_inline float2 operator/=(float2& a, float f)
 }
 
 
-ccl_device_inline float dot(const float2 a, const float2 b)
+ccl_device_inline float dot(const float2& a, const float2& b)
 {
 	return a.x*b.x + a.y*b.y;
 }
 
-ccl_device_inline float cross(const float2 a, const float2 b)
+ccl_device_inline float cross(const float2& a, const float2& b)
 {
 	return (a.x*b.y - a.y*b.x);
 }
@@ -343,59 +356,59 @@ ccl_device_inline bool operator==(const int2 a, const int2 b)
 	return (a.x == b.x && a.y == b.y);
 }
 
-ccl_device_inline float len(const float2 a)
+ccl_device_inline float len(const float2& a)
 {
 	return sqrtf(dot(a, a));
 }
 
-ccl_device_inline float2 normalize(const float2 a)
+ccl_device_inline float2 normalize(const float2& a)
 {
 	return a/len(a);
 }
 
-ccl_device_inline float2 normalize_len(const float2 a, float *t)
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
 {
 	*t = len(a);
 	return a/(*t);
 }
 
-ccl_device_inline float2 safe_normalize(const float2 a)
+ccl_device_inline float2 safe_normalize(const float2& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }
 
-ccl_device_inline bool operator==(const float2 a, const float2 b)
+ccl_device_inline bool operator==(const float2& a, const float2& b)
 {
 	return (a.x == b.x && a.y == b.y);
 }
 
-ccl_device_inline bool operator!=(const float2 a, const float2 b)
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
 {
 	return !(a == b);
 }
 
-ccl_device_inline float2 min(float2 a, float2 b)
+ccl_device_inline float2 min(const float2& a, const float2& b)
 {
 	return make_float2(min(a.x, b.x), min(a.y, b.y));
 }
 
-ccl_device_inline float2 max(float2 a, float2 b)
+ccl_device_inline float2 max(const float2& a, const float2& b)
 {
 	return make_float2(max(a.x, b.x), max(a.y, b.y));
 }
 
-ccl_device_inline float2 clamp(float2 a, float2 mn, float2 mx)
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
 {
 	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float2 fabs(float2 a)
+ccl_device_inline float2 fabs(const float2& a)
 {
 	return make_float2(fabsf(a.x), fabsf(a.y));
 }
 
-ccl_device_inline float2 as_float2(const float4 a)
+ccl_device_inline float2 as_float2(const float4& a)
 {
 	return make_float2(a.x, a.y);
 }
@@ -413,7 +426,7 @@ ccl_device_inline void print_float2(const char *label, const float2& a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float2 interp(float2 a, float2 b, float t)
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -424,58 +437,95 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
 	return make_float3(-a.x, -a.y, -a.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
 	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
 {
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	return make_float3(f/a.x, f/a.y, f/a.z);
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	__m128 rc = _mm_rcp_ps(a.m128);
+	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
 }
 
-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
 {
 	float invf = 1.0f/f;
-	return make_float3(a.x*invf, a.y*invf, a.z*invf);
+	return a * invf;
 }
 
-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	__m128 rc = _mm_rcp_ps(b.m128);
+	return float3(_mm_mul_ps(a, rc));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
 }
 
-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
 {
 	return a = a + b;
 }
 
-ccl_device_inline float3 operator*=(float3& a, const float3 b)
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
 {
 	return a = a * b;
 }
@@ -485,7 +535,7 @@ ccl_device_inline float3 operator*=(float3& a, float f)
 	return a = a * f;
 }
 
-ccl_device_inline float3 operator/=(float3& a, const float3 b)
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
 {
 	return a = a / b;
 }
@@ -496,7 +546,7 @@ ccl_device_inline float3 operator/=(float3& a, float f)
 	return a = a * invf;
 }
 
-ccl_device_inline float dot(const float3 a, const float3 b)
+ccl_device_inline float dot(const float3& a, const float3& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -505,7 +555,16 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }
 
-ccl_device_inline float dot(const float4 a, const float4 b)
+ccl_device_inline float dot_xy(const float3& a, const float3& b)  /* Dot product of the x and y components only. */
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));  /* Lane 0 of the horizontal add is a.x*b.x + a.y*b.y; the second operand only fills the upper lanes. */
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
+ccl_device_inline float dot(const float4& a, const float4& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
@@ -514,7 +573,7 @@ ccl_device_inline float dot(const float4 a, const float4 b)
 #endif
 }
 
-ccl_device_inline float3 cross(const float3 a, const float3 b)
+ccl_device_inline float3 cross(const float3& a, const float3& b)
 {
 	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
 	return r;
@@ -538,12 +597,12 @@ ccl_device_inline float len_squared(const float3 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float len_squared(const float4 a)
+ccl_device_inline float len_squared(const float4& a)
 {
 	return dot(a, a);
 }
 
-ccl_device_inline float3 normalize(const float3 a)
+ccl_device_inline float3 normalize(const float3& a)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -563,13 +622,14 @@ ccl_device_inline float3 saturate3(float3 a)
 ccl_device_inline float3 normalize_len(const float3 a, float *t)
 {
 	*t = len(a);
-	return a/(*t);
+	float x = 1.0f / *t;
+	return a*x;
 }
 
 ccl_device_inline float3 safe_normalize(const float3 a)
 {
 	float t = len(a);
-	return (t != 0.0f)? a/t: a;
+	return (t != 0.0f)? a * (1.0f/t) : a;
 }
 
 ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
@@ -580,7 +640,7 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline bool operator==(const float3 a, const float3 b)
+ccl_device_inline bool operator==(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -589,12 +649,12 @@ ccl_device_inline bool operator==(const float3 a, const float3 b)
 #endif
 }
 
-ccl_device_inline bool operator!=(const float3 a, const float3 b)
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
 {
 	return !(a == b);
 }
 
-ccl_device_inline float3 min(float3 a, float3 b)
+ccl_device_inline float3 min(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -603,7 +663,7 @@ ccl_device_inline float3 min(float3 a, float3 b)
 #endif
 }
 
-ccl_device_inline float3 max(float3 a, float3 b)
+ccl_device_inline float3 max(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -612,12 +672,12 @@ ccl_device_inline float3 max(float3 a, float3 b)
 #endif
 }
 
-ccl_device_inline float3 clamp(float3 a, float3 mn, float3 mx)
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
 {
 	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float3 fabs(float3 a)
+ccl_device_inline float3 fabs(const float3& a)
 {
 #ifdef __KERNEL_SSE__
 	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
@@ -670,7 +730,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 mix(float3 a, float3 b, float t)
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -754,7 +814,7 @@ ccl_device_inline float4 operator*(const float4& a, const float4& b)
 
 ccl_device_inline float4 operator*(const float4& a, float f)
 {
-#ifdef __KERNEL_SSE__
+#if defined(__KERNEL_SSE__)
 	return a * make_float4(f);
 #else
 	return make_float4(a.x*f, a.y*f, a.z*f, a.w*f);
@@ -833,7 +893,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
 #endif
 }
 
-ccl_device_inline int4 operator>=(float4 a, float4 b)
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
@@ -851,7 +911,7 @@ ccl_device_inline int4 operator<=(const float4& a, const float4& b)
 #endif
 }
 
-ccl_device_inline bool operator==(const float4 a, const float4 b)
+ccl_device_inline bool operator==(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -893,23 +953,23 @@ ccl_device_inline float average(const float4& a)
 	return reduce_add(a) * 0.25f;
 }
 
-ccl_device_inline float len(const float4 a)
+ccl_device_inline float len(const float4& a)
 {
 	return sqrtf(dot(a, a));
 }
 
-ccl_device_inline float4 normalize(const float4 a)
+ccl_device_inline float4 normalize(const float4& a)
 {
 	return a/len(a);
 }
 
-ccl_device_inline float4 safe_normalize(const float4 a)
+ccl_device_inline float4 safe_normalize(const float4& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }
 
-ccl_device_inline float4 min(float4 a, float4 b)
+ccl_device_inline float4 min(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -918,7 +978,7 @@ ccl_device_inline float4 min(float4 a, float4 b)
 #endif
 }
 
-ccl_device_inline float4 max(float4 a, float4 b)
+ccl_device_inline float4 max(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -1181,6 +1241,20 @@ ccl_device_inline float __uint_as_float(uint i)
 	return u.f;
 }
 
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+	unsigned int x = __float_as_uint(f);
+	return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+	/* 2*f == f only for Inf and +/-0 (IEEE 754); zeros are accepted by bit pattern, NaN is rejected by exponent bits. */
+	unsigned int x = __float_as_uint(f);
+	return (f == f) && ((x << 1) == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+}
+
 /* Interpolation */
 
 template<class A, class B> A lerp(const A& a, const A& b, const B& t)
@@ -1190,7 +1264,7 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
 
 /* Triangle */
 
-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
+ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
 {
 	return len(cross(v3 - v2, v1 - v2))*0.5f;
 }
@@ -1529,7 +1603,7 @@ ccl_device_inline bool ray_triangle_intersect_uv(
 
 ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt,
                                    float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n,
-                                   float3 *isect_P, float *isect_t)
+                                   float3 *isect_P, float *isect_t, float *isect_u, float *isect_v)
 {
 	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
 	if(t < ray_mint || t > ray_maxt)
@@ -1537,13 +1611,19 @@ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, f
 
 	float3 hit = ray_P + t*ray_D;
 	float3 inplane = hit - quad_P;
-	if(fabsf(dot(inplane, quad_u) / dot(quad_u, quad_u)) > 0.5f)
+
+	float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
+	if(u < 0.0f || u > 1.0f)
 		return false;
-	if(fabsf(dot(inplane, quad_v) / dot(quad_v, quad_v)) > 0.5f)
+
+	float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
+	if(v < 0.0f || v > 1.0f)
 		return false;
 
 	if(isect_P) *isect_P = hit;
 	if(isect_t) *isect_t = t;
+	if(isect_u) *isect_u = u;
+	if(isect_v) *isect_v = v;
 
 	return true;
 }
@@ -1584,6 +1664,14 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
 
 ccl_device_inline int util_max_axis(float3 vec)
 {
+#ifdef __KERNEL_SSE__
+	__m128 a = shuffle<0,0,1,1>(vec.m128);
+	__m128 b = shuffle<1,2,2,1>(vec.m128);
+	__m128 c = _mm_cmpgt_ps(a, b);
+	int mask = _mm_movemask_ps(c) & 0x7;
+	static const char tab[8] = {2, 2, 2, 0, 1, 2, 1, 0};
+	return tab[mask];
+#else
 	if(vec.x > vec.y) {
 		if(vec.x > vec.z)
 			return 0;
@@ -1596,6 +1684,7 @@ ccl_device_inline int util_max_axis(float3 vec)
 		else
 			return 2;
 	}
+#endif
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index f6fba03..5df262f 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -36,6 +36,9 @@ OIIO_NAMESPACE_USING
 #else
 #  define DIR_SEP '/'
 #  include <dirent.h>
+#  include <pwd.h>
+#  include <unistd.h>
+#  include <sys/types.h>
 #endif
 
 #ifdef HAVE_SHLWAPI_H
@@ -63,6 +66,7 @@ typedef struct stat path_stat_t;
 
 static string cached_path = "";
 static string cached_user_path = "";
+static string cached_xdg_cache_path = "";
 
 namespace {
 
@@ -331,6 +335,23 @@ static char *path_specials(const string& sub)
 	return NULL;
 }
 
+#if defined(__linux__) || defined(__APPLE__)
+static string path_xdg_cache_get()
+{
+	const char *home = getenv("XDG_CACHE_HOME");
+	if(home) {
+		return string(home);
+	}
+	home = getenv("HOME");
+	if(home == NULL) {
+		/* getpwuid() may return NULL (no passwd entry); use an empty home instead of dereferencing it. */
+		const struct passwd *pw = getpwuid(getuid());
+		home = (pw != NULL && pw->pw_dir != NULL)? pw->pw_dir: "";
+	}
+	return path_join(string(home), ".cache");
+}
+#endif
+
 void path_init(const string& path, const string& user_path)
 {
 	cached_path = path;
@@ -364,6 +385,24 @@ string path_user_get(const string& sub)
 	return path_join(cached_user_path, sub);
 }
 
+string path_cache_get(const string& sub)
+{
+#if defined(__linux__) || defined(__APPLE__)
+	if(cached_xdg_cache_path == "") {
+		cached_xdg_cache_path = path_xdg_cache_get();
+	}
+	string result = path_join(cached_xdg_cache_path, "cycles");
+	return path_join(result, sub);
+#else
+	/* TODO(sergey): What should the cache location be on Windows? For now use "cache" under the user path. */
+	return path_user_get(path_join("cache", sub));
+#endif
+}
+
+#if defined(__linux__) || defined(__APPLE__)
+string path_xdg_home_get(const string& sub = "");
+#endif
+
 string path_filename(const string& path)
 {
 	size_t index = find_last_slash(path);
@@ -718,9 +757,9 @@ uint64_t path_modified_time(const string& path)
 {
 	path_stat_t st;
 	if(path_stat(path, &st) != 0) {
-		return st.st_mtime;
+		return 0;
 	}
-	return 0;
+	return st.st_mtime;
 }
 
 bool path_remove(const string& path)
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 1ff76b6..70dbb5a 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -35,6 +35,7 @@ CCL_NAMESPACE_BEGIN
 void path_init(const string& path = "", const string& user_path = "");
 string path_get(const string& sub = "");
 string path_user_get(const string& sub = "");
+string path_cache_get(const string& sub = "");
 
 /* path string manipulation */
 string path_filename(const string& path);
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 4ae1d61..1421505 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -34,12 +34,12 @@ class Progress {
 public:
 	Progress()
 	{
-		tile = 0;
-		sample = 0;
+		pixel_samples = 0;
+		total_pixel_samples = 0;
+		current_tile_sample = 0;
+		finished_tiles = 0;
 		start_time = time_dt();
-		total_time = 0.0;
-		render_time = 0.0;
-		tile_time = 0.0;
+		render_start_time = time_dt();
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -62,22 +62,22 @@ public:
 		thread_scoped_lock lock(progress.progress_mutex);
 
 		progress.get_status(status, substatus);
-		progress.get_tile(tile, total_time, render_time, tile_time);
 
-		sample = progress.get_sample();
+		pixel_samples = progress.pixel_samples;
+		total_pixel_samples = progress.total_pixel_samples;
+		current_tile_sample = progress.get_current_sample();
 
 		return *this;
 	}
 
 	void reset()
 	{
-		tile = 0;
-		sample = 0;
+		pixel_samples = 0;
+		total_pixel_samples = 0;
+		current_tile_sample = 0;
+		finished_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
-		total_time = 0.0;
-		render_time = 0.0;
-		tile_time = 0.0;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -139,69 +139,93 @@ public:
 
 	/* tile and timing information */
 
-	void set_start_time(double start_time_)
+	void set_start_time()
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		start_time = start_time_;
+		start_time = time_dt();
 	}
 
-	void set_render_start_time(double render_start_time_)
+	void set_render_start_time()
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		render_start_time = render_start_time_;
+		render_start_time = time_dt();
 	}
 
-	void set_tile(int tile_, double tile_time_)
+	void add_skip_time(const scoped_timer &start_timer, bool only_render)
 	{
-		thread_scoped_lock lock(progress_mutex);
+		double skip_time = time_dt() - start_timer.get_start();
 
-		tile = tile_;
-		total_time = time_dt() - start_time;
-		render_time = time_dt() - render_start_time;
-		tile_time = tile_time_;
+		render_start_time += skip_time;
+		if(!only_render) {
+			start_time += skip_time;
+		}
 	}
 
-	void get_tile(int& tile_, double& total_time_, double& render_time_, double& tile_time_)
+	void get_time(double& total_time_, double& render_time_)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		tile_ = tile;
-		total_time_ = (total_time > 0.0)? total_time: 0.0;
-		render_time_ = (render_time > 0.0)? render_time: 0.0;
-		tile_time_ = tile_time;
+		total_time_ = time_dt() - start_time;
+		render_time_ = time_dt() - render_start_time;
 	}
 
-	void get_time(double& total_time_, double& render_time_)
+	void reset_sample()
 	{
-		total_time_ = (total_time > 0.0)? total_time: 0.0;
-		render_time_ = (render_time > 0.0)? render_time: 0.0;
+		thread_scoped_lock lock(progress_mutex);
+
+		pixel_samples = 0;
+		current_tile_sample = 0;
+		finished_tiles = 0;
 	}
 
-	void reset_sample()
+	void set_total_pixel_samples(uint64_t total_pixel_samples_)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		sample = 0;
+		total_pixel_samples = total_pixel_samples_;
 	}
 
-	void increment_sample()
+	float get_progress()
+	{
+		if(total_pixel_samples > 0) {
+			return ((float) pixel_samples) / total_pixel_samples;
+		}
+		return 0.0f;
+	}
+
+	void add_samples(uint64_t pixel_samples_, int tile_sample)
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		sample++;
+		pixel_samples += pixel_samples_;
+		current_tile_sample = tile_sample;
 	}
 
-	void increment_sample_update()
+	void add_samples_update(uint64_t pixel_samples_, int tile_sample)
 	{
-		increment_sample();
+		add_samples(pixel_samples_, tile_sample);
 		set_update();
 	}
 
-	int get_sample()
+	void add_finished_tile()
+	{
+		thread_scoped_lock lock(progress_mutex);
+
+		finished_tiles++;
+	}
+
+	int get_current_sample()
+	{
+		/* Note that the value here always belongs to the last tile that updated,
+		 * so it's only useful if there is only one active tile. */
+		return current_tile_sample;
+	}
+
+	int get_finished_tiles()
 	{
-		return sample;
+		return finished_tiles;
 	}
 
 	/* status messages */
@@ -212,8 +236,6 @@ public:
 			thread_scoped_lock lock(progress_mutex);
 			status = status_;
 			substatus = substatus_;
-			total_time = time_dt() - start_time;
-			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -224,8 +246,6 @@ public:
 		{
 			thread_scoped_lock lock(progress_mutex);
 			substatus = substatus_;
-			total_time = time_dt() - start_time;
-			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -237,8 +257,6 @@ public:
 			thread_scoped_lock lock(progress_mutex);
 			sync_status = status_;
 			sync_substatus = substatus_;
-			total_time = time_dt() - start_time;
-			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -250,8 +268,6 @@ public:
 		{
 			thread_scoped_lock lock(progress_mutex);
 			sync_substatus = substatus_;
-			total_time = time_dt() - start_time;
-			render_time = time_dt() - render_start_time;
 		}
 
 		set_update();
@@ -292,12 +308,19 @@ protected:
 	function<void(void)> update_cb;
 	function<void(void)> cancel_cb;
 
-	int tile;    /* counter for rendered tiles */
-	int sample;  /* counter of rendered samples, global for all tiles */
+	/* pixel_samples counts how many samples have been rendered summed over all pixels, not just per pixel.
+	 * This makes the progress estimate more accurate when tiles with different sizes are used.
+	 *
+	 * total_pixel_samples is the total number of pixel samples that will be rendered. */
+	uint64_t pixel_samples, total_pixel_samples;
+	/* Stores the current sample count of the last tile that called the update function.
+	 * It's used to display the sample count if only one tile is active. */
+	int current_tile_sample;
+	/* Stores the number of tiles that have already finished.
+	 * Used to determine whether all but the last tile are finished rendering, in which case current_tile_sample is displayed. */
+	int finished_tiles;
 
 	double start_time, render_start_time;
-	double total_time, render_time;
-	double tile_time;
 
 	string status;
 	string substatus;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 36da155..756bd15 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -71,7 +71,7 @@ __forceinline operator          int      ( ) const { return std::numeric_limits<
 #define _lzcnt_u64 __lzcnt64
 #endif
 
-#if defined(_WIN32) && !defined(__MINGW32__)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
 
 __forceinline int __popcnt(int in) {
   return _mm_popcnt_u32(in);
@@ -229,7 +229,7 @@ __forceinline int __btr(int v, int i) {
   int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
 }
 
-#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
 __forceinline size_t __bsf(size_t v) {
   size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
 }
@@ -271,7 +271,7 @@ __forceinline unsigned int bitscan(unsigned int v) {
 #endif
 }
 
-#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
 __forceinline size_t bitscan(size_t v) {
 #if defined(__KERNEL_AVX2__)
 #if defined(__KERNEL_64_BIT__)
@@ -313,7 +313,7 @@ __forceinline unsigned int __bscf(unsigned int& v)
   return i;
 }
 
-#if defined(__KERNEL_64_BIT__) || defined(__APPLE__)
+#if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && !(defined(__ILP32__) && defined(__x86_64__))
 __forceinline size_t __bscf(size_t& v) 
 {
   size_t i = bitscan(v);
@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
 #include "util_sseb.h"
 #include "util_ssei.h"
 #include "util_ssef.h"
+#include "util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index b970b01..c21a848 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -29,13 +29,13 @@ public:
 	explicit Stats(static_init_t) {}
 
 	void mem_alloc(size_t size) {
-		atomic_add_z(&mem_used, size);
+		atomic_add_and_fetch_z(&mem_used, size);
 		atomic_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
 		assert(mem_used >= size);
-		atomic_sub_z(&mem_used, size);
+		atomic_sub_and_fetch_z(&mem_used, size);
 	}
 
 	size_t mem_used;
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index d5fac9a..87d885c 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -89,6 +89,22 @@ int system_cpu_thread_count()
 	return count;
 }
 
+unsigned short system_cpu_process_groups(unsigned short max_groups,
+                                         unsigned short *groups)
+{
+#ifdef _WIN32
+	unsigned short group_count = max_groups;
+	if(!GetProcessGroupAffinity(GetCurrentProcess(), &group_count, groups)) {
+		return 0;
+	}
+	return group_count;
+#else
+	(void) max_groups;
+	(void) groups;
+	return 0;
+#endif
+}
+
 #if !defined(_WIN32) || defined(FREE_WINDOWS)
 static void __cpuid(int data[4], int selector)
 {
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index 557aab6..ff61b26 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -30,6 +30,10 @@ int system_cpu_group_thread_count(int group);
 /* Get total number of threads in all groups. */
 int system_cpu_thread_count();
 
+/* Get current process groups: fills `groups` (capacity `max_groups`), returns the count or 0 on failure. */
+unsigned short system_cpu_process_groups(unsigned short max_groups,
+                                         unsigned short *groups);
+
 string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 352ba81..0d1fed3 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -195,7 +195,8 @@ void TaskScheduler::init(int num_threads)
 	if(users == 0) {
 		do_exit = false;
 
-		if(num_threads == 0) {
+		const bool use_auto_threads = (num_threads == 0);
+		if(use_auto_threads) {
 			/* automatic number of threads */
 			num_threads = system_cpu_thread_count();
 		}
@@ -204,7 +205,18 @@ void TaskScheduler::init(int num_threads)
 		/* launch threads that will be waiting for work */
 		threads.resize(num_threads);
 
-		int num_groups = system_cpu_group_count();
+		const int num_groups = system_cpu_group_count();
+		unsigned short num_process_groups = 0;  /* Only queried when there are multiple CPU groups. */
+		vector<unsigned short> process_groups;
+		int current_group_threads = 0;  /* Only meaningful when the process lives in exactly one group. */
+		if(num_groups > 1) {
+			process_groups.resize(num_groups);
+			num_process_groups = system_cpu_process_groups(num_groups,
+			                                               &process_groups[0]);
+			if(num_process_groups == 1) {
+				current_group_threads = system_cpu_group_thread_count(process_groups[0]);
+			}
+		}
 		int thread_index = 0;
 		for(int group = 0; group < num_groups; ++group) {
 			/* NOTE: That's not really efficient from threading point of view,
@@ -218,9 +230,25 @@ void TaskScheduler::init(int num_threads)
 				group_thread < num_group_threads && thread_index < threads.size();
 				++group_thread, ++thread_index)
 			{
+				/* NOTE: Thread group of -1 means we would not force thread affinity. */
+				int thread_group;
+				if(num_groups == 1) {
+					/* Use default affinity if there's only one CPU group in the system. */
+					thread_group = -1;
+				}
+				else if(use_auto_threads &&
+				        num_process_groups == 1 &&
+						num_threads <= current_group_threads)
+				{
+					/* If we fit into the current CPU group we also don't force any affinity. */
+					thread_group = -1;
+				}
+				else {
+					thread_group = group;
+				}
 				threads[thread_index] = new thread(function_bind(&TaskScheduler::thread_run,
 				                                                 thread_index + 1),
-				                                   group);
+				                                   thread_group);
 			}
 		}
 	}
diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h
index a5b074b..6579824 100644
--- a/intern/cycles/util/util_time.h
+++ b/intern/cycles/util/util_time.h
@@ -29,7 +29,7 @@ void time_sleep(double t);
 
 class scoped_timer {
 public:
-	explicit scoped_timer(double *value) : value_(value)
+	explicit scoped_timer(double *value = NULL) : value_(value)
 	{
 		time_start_ = time_dt();
 	}
@@ -40,6 +40,12 @@ public:
 			*value_ = time_dt() - time_start_;
 		}
 	}
+
+	double get_start() const
+	{
+		return time_start_;
+	}
+
 protected:
 	double *value_;
 	double time_start_;
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index bfc8f55..a0695f2 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -73,22 +73,59 @@ ccl_device_inline float3 transform_perspective(const Transform *t, const float3
 
 ccl_device_inline float3 transform_point(const Transform *t, const float3 a)
 {
+	/* TODO(sergey): Noted as disabled due to crashes, but the #if below compiles it in when both macros are set -- NOTE(review): confirm. */
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	ssef x, y, z, w, aa;
+	aa = a.m128;
+
+	x = _mm_loadu_ps(&t->x.x);
+	y = _mm_loadu_ps(&t->y.x);
+	z = _mm_loadu_ps(&t->z.x);
+	w = _mm_loadu_ps(&t->w.x);
+
+	_MM_TRANSPOSE4_PS(x, y, z, w);
+
+	ssef tmp = shuffle<0>(aa) * x;
+	tmp = madd(shuffle<1>(aa), y, tmp);
+	tmp = madd(shuffle<2>(aa), z, tmp);
+	tmp += w;
+
+	return float3(tmp.m128);
+#else
 	float3 c = make_float3(
 		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z + t->x.w,
 		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z + t->y.w,
 		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z + t->z.w);
 
 	return c;
+#endif
 }
 
 ccl_device_inline float3 transform_direction(const Transform *t, const float3 a)
 {
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	ssef x, y, z, w, aa;
+	aa = a.m128;
+	x = _mm_loadu_ps(&t->x.x);
+	y = _mm_loadu_ps(&t->y.x);
+	z = _mm_loadu_ps(&t->z.x);
+	w = _mm_setzero_ps();
+
+	_MM_TRANSPOSE4_PS(x, y, z, w);
+
+	ssef tmp = shuffle<0>(aa) * x;
+	tmp = madd(shuffle<1>(aa), y, tmp);
+	tmp = madd(shuffle<2>(aa), z, tmp);
+
+	return float3(tmp.m128);
+#else
 	float3 c = make_float3(
 		a.x*t->x.x + a.y*t->x.y + a.z*t->x.z,
 		a.x*t->y.x + a.y*t->y.y + a.z*t->y.z,
 		a.x*t->z.x + a.y*t->z.y + a.z*t->z.z);
 
 	return c;
+#endif
 }
 
 ccl_device_inline float3 transform_direction_transposed(const Transform *t, const float3 a)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 6af65f8..a000fae 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int3(const int3& a) { m128 = a.m128; }
+	int3& operator =(const int3& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int4(const int4& a) : m128(a.m128) {}
+	int4& operator=(const int4& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
 	};
 
 	__forceinline float3() {}
-	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline float3(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float3(const float3& a) : m128(a.m128) {}
+	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
 #else
 	float x, y, z, w;
 #endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4(const __m128 a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float4(const float4& a) : m128(a.m128) {}
+	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
 #else
 	float x, y, z, w;
 #endif
diff --git a/intern/cycles/util/util_version.h b/intern/cycles/util/util_version.h
index 186a177..d609c73 100644
--- a/intern/cycles/util/util_version.h
+++ b/intern/cycles/util/util_version.h
@@ -21,9 +21,9 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define CYCLES_VERSION_MAJOR	1
-#define CYCLES_VERSION_MINOR	7
-#define CYCLES_VERSION_PATCH	0
+#define CYCLES_VERSION_MAJOR    1
+#define CYCLES_VERSION_MINOR    8
+#define CYCLES_VERSION_PATCH    1
 
 #define CYCLES_MAKE_VERSION_STRING2(a,b,c) #a "." #b "." #c
 #define CYCLES_MAKE_VERSION_STRING(a,b,c) CYCLES_MAKE_VERSION_STRING2(a,b,c)
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
index ee5b3fd..4de8483 100644
--- a/intern/cycles/util/util_windows.cpp
+++ b/intern/cycles/util/util_windows.cpp
@@ -28,6 +28,7 @@ CCL_NAMESPACE_BEGIN
 tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
 tGetActiveProcessorCount *GetActiveProcessorCount;
 tSetThreadGroupAffinity *SetThreadGroupAffinity;
+tGetProcessGroupAffinity *GetProcessGroupAffinity;
 #endif
 
 static WORD GetActiveProcessorGroupCount_stub()
@@ -50,6 +51,18 @@ static BOOL SetThreadGroupAffinity_stub(
 	return TRUE;
 }
 
+static BOOL GetProcessGroupAffinity_stub(HANDLE hProcess,
+                                         PUSHORT GroupCount,
+                                         PUSHORT GroupArray)
+{
+	if(*GroupCount < 1) {
+		return FALSE;
+	}
+	*GroupCount = 1;
+	GroupArray[0] = 0;
+	return TRUE;
+}
+
 static bool supports_numa()
 {
 #ifndef _M_X64
@@ -72,6 +85,7 @@ void util_windows_init_numa_groups()
 		GetActiveProcessorGroupCount = GetActiveProcessorGroupCount_stub;
 		GetActiveProcessorCount = GetActiveProcessorCount_stub;
 		SetThreadGroupAffinity = SetThreadGroupAffinity_stub;
+		GetProcessGroupAffinity = GetProcessGroupAffinity_stub;
 		return;
 	}
 	HMODULE kernel = GetModuleHandleA("kernel32.dll");
@@ -79,6 +93,7 @@ void util_windows_init_numa_groups()
 	READ_SYMBOL(GetActiveProcessorGroupCount);
 	READ_SYMBOL(GetActiveProcessorCount);
 	READ_SYMBOL(SetThreadGroupAffinity);
+	READ_SYMBOL(GetProcessGroupAffinity);
 #  undef READ_SUMBOL
 #endif
 }
diff --git a/intern/cycles/util/util_windows.h b/intern/cycles/util/util_windows.h
index ac61d53..7ea3e65 100644
--- a/intern/cycles/util/util_windows.h
+++ b/intern/cycles/util/util_windows.h
@@ -39,10 +39,14 @@ typedef DWORD tGetActiveProcessorCount(WORD GroupNumber);
 typedef BOOL tSetThreadGroupAffinity(HANDLE hThread,
                                      const GROUP_AFFINITY  *GroupAffinity,
                                      PGROUP_AFFINITY PreviousGroupAffinity);
+typedef BOOL tGetProcessGroupAffinity(HANDLE  hProcess,
+                                     PUSHORT GroupCount,
+                                     PUSHORT GroupArray);
 
 extern tGetActiveProcessorGroupCount *GetActiveProcessorGroupCount;
 extern tGetActiveProcessorCount *GetActiveProcessorCount;
 extern tSetThreadGroupAffinity *SetThreadGroupAffinity;
+extern tGetProcessGroupAffinity *GetProcessGroupAffinity;
 #endif
 
 /* Make sure NUMA and processor groups API is initialized. */
diff --git a/intern/elbeem/intern/solver_class.h b/intern/elbeem/intern/solver_class.h
index 593fea1..2b2e214 100644
--- a/intern/elbeem/intern/solver_class.h
+++ b/intern/elbeem/intern/solver_class.h
@@ -332,7 +332,7 @@ class LbmFsgrSolver :
 		void debugMarkCellCall(int level, int vi,int vj,int vk);
 		
 		// loop over grid, stream&collide update
-		void mainLoop(int lev);
+		void mainLoop(const int lev);
 		// change time step size
 		void adaptTimestep();
 		//! init mObjectSpeeds for current parametrization
diff --git a/intern/elbeem/intern/solver_main.cpp b/intern/elbeem/intern/solver_main.cpp
index 55a8d3e..68f7c04 100644
--- a/intern/elbeem/intern/solver_main.cpp
+++ b/intern/elbeem/intern/solver_main.cpp
@@ -355,7 +355,7 @@ void LbmFsgrSolver::fineAdvance()
 //! fine step function
 /*****************************************************************************/
 void 
-LbmFsgrSolver::mainLoop(int lev)
+LbmFsgrSolver::mainLoop(const int lev)
 {
 	// loops over _only inner_ cells  -----------------------------------------------------------------------------------
 	
@@ -376,13 +376,16 @@ LbmFsgrSolver::mainLoop(int lev)
   // main loop region
 	const bool doReduce = true;
 	const int gridLoopBound=1;
+	int calcNumInvIfCells = 0;
+	LbmFloat calcInitialMass = 0;
 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
-		calcNumUsedCells )
+		calcNumUsedCells,calcNumInvIfCells,calcInitialMass)
 	GRID_REGION_START();
 #else // PARALLEL==1
 	GRID_REGION_START();
@@ -468,7 +471,7 @@ LbmFsgrSolver::mainLoop(int lev)
 				calcCurrentMass += iniRho; 
 				calcCurrentVolume += 1.0; 
 				calcNumUsedCells++;
-				mInitialMass += iniRho;
+				calcInitialMass += iniRho;
 				// dont treat cell until next step
 				continue;
 			} 
@@ -479,7 +482,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			if(isnotValid) {
 				// remove fluid cells, shouldnt be here anyway
 				LbmFloat fluidRho = m[0]; FORDF1 { fluidRho += m[l]; }
-				mInitialMass -= fluidRho;
+				calcInitialMass -= fluidRho;
 				const LbmFloat iniRho = 0.0;
 				RAC(tcel, dMass) = RAC(tcel, dFfrac) = iniRho;
 				RAC(tcel, dFlux) = FLUX_INIT;
@@ -608,8 +611,8 @@ LbmFsgrSolver::mainLoop(int lev)
 		// read distribution funtions of adjacent cells = stream step
 		DEFAULT_STREAM;
 
-		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; mNumInvIfCells++; }
-		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; mNumInvIfCells++; }
+		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; calcNumInvIfCells++; }
+		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; calcNumInvIfCells++; }
 
 		// calculate mass exchange for interface cells 
 		LbmFloat myfrac = RAC(ccel,dFfrac);
@@ -809,7 +812,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			// fill if cells in inflow region
 			if(myfrac<0.5) { 
 				mass += 0.25; 
-				mInitialMass += 0.25;
+				calcInitialMass += 0.25;
 			}
 			const int OId = oldFlag>>24;
 			const LbmVec vel(mObjectSpeeds[OId]);
@@ -865,10 +868,8 @@ LbmFsgrSolver::mainLoop(int lev)
 			// physical drop model
 			if(mPartUsePhysModel) {
 				LbmFloat realWorldFac = (mLevel[lev].simCellSize / mLevel[lev].timestep);
-				LbmFloat rux = (ux * realWorldFac);
-				LbmFloat ruy = (uy * realWorldFac);
-				LbmFloat ruz = (uz * realWorldFac);
-				LbmFloat rl = norm(ntlVec3Gfx(rux,ruy,ruz));
+				LbmVec ru(ux * realWorldFac, uy * realWorldFac, uz * realWorldFac);
+				LbmFloat rl = norm(ru);
 				basethresh *= rl;
 
 				// reduce probability in outer region?
@@ -960,14 +961,15 @@ LbmFsgrSolver::mainLoop(int lev)
 				// average normal & velocity 
 				// -> mostly along velocity dir, many into surface
 				// fluid velocity (not normalized!)
-				LbmVec flvelVel = LbmVec(ux,uy,uz);
+				LbmVec flvelVel(ux,uy,uz);
 				LbmFloat flvelLen = norm(flvelVel);
 				// surface normal
-				LbmVec normVel = LbmVec(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
+				LbmVec normVel(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
 				normalize(normVel);
 				LbmFloat normScale = (0.01+flvelLen);
 				// jitter vector, 0.2 * flvel
-				LbmVec jittVel = LbmVec(jx,jy,jz)*(0.05+flvelLen)*0.1;
+				LbmVec jittVel(jx,jy,jz);
+				jittVel *= (0.05+flvelLen)*0.1;
 				// weighten velocities
 				const LbmFloat flvelWeight = 0.9;
 				LbmVec newpartVel = normVel*normScale*(1.-flvelWeight) + flvelVel*(flvelWeight) + jittVel; 
@@ -1013,7 +1015,7 @@ LbmFsgrSolver::mainLoop(int lev)
 		if( (mass) <= (rho * (   -FSGR_MAGICNR)) ) { ifemptied = 1; }
 
 		if(oldFlag & (CFMbndOutflow)) {
-			mInitialMass -= mass;
+			calcInitialMass -= mass;
 			mass = myfrac = 0.0;
 			iffilled = 0; ifemptied = 1;
 		}
@@ -1105,6 +1107,8 @@ LbmFsgrSolver::mainLoop(int lev)
 	mNumFilledCells  = calcCellsFilled;
 	mNumEmptiedCells = calcCellsEmptied;
 	mNumUsedCells = calcNumUsedCells;
+	mNumInvIfCells += calcNumInvIfCells;
+	mInitialMass += calcInitialMass;
 }
 
 
@@ -1121,7 +1125,8 @@ LbmFsgrSolver::preinitGrids()
 	
 		GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
@@ -1158,7 +1163,8 @@ LbmFsgrSolver::standingFluidPreinit()
 
 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
   reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 1933e9d..76b7e07 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -505,8 +505,8 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str)
 	memt = (MemTail *)(((char *) memh) + sizeof(MemHead) + len);
 	memt->tag3 = MEMTAG3;
 
-	atomic_add_u(&totblock, 1);
-	atomic_add_z(&mem_in_use, len);
+	atomic_add_and_fetch_u(&totblock, 1);
+	atomic_add_and_fetch_z(&mem_in_use, len);
 
 	mem_lock_thread();
 	addtail(membase, &memh->next);
@@ -638,7 +638,7 @@ void *MEM_guarded_mapallocN(size_t len, const char *str)
 	if (memh != (MemHead *)-1) {
 		make_memhead_header(memh, len, str);
 		memh->mmap = 1;
-		atomic_add_z(&mmap_in_use, len);
+		atomic_add_and_fetch_z(&mmap_in_use, len);
 		mem_lock_thread();
 		peak_mem = mmap_in_use > peak_mem ? mmap_in_use : peak_mem;
 		mem_unlock_thread();
@@ -1007,8 +1007,8 @@ static void rem_memblock(MemHead *memh)
 	}
 	mem_unlock_thread();
 
-	atomic_sub_u(&totblock, 1);
-	atomic_sub_z(&mem_in_use, memh->len);
+	atomic_sub_and_fetch_u(&totblock, 1);
+	atomic_sub_and_fetch_z(&mem_in_use, memh->len);
 
 #ifdef DEBUG_MEMDUPLINAME
 	if (memh->need_free_name)
@@ -1016,7 +1016,7 @@ static void rem_memblock(MemHead *memh)
 #endif
 
 	if (memh->mmap) {
-		atomic_sub_z(&mmap_in_use, memh->len);
+		atomic_sub_and_fetch_z(&mmap_in_use, memh->len);
 #if defined(WIN32)
 		/* our windows mmap implementation is not thread safe */
 		mem_lock_thread();
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index a80d67c..ce8a5b2 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -142,11 +142,11 @@ void MEM_lockfree_freeN(void *vmemh)
 		return;
 	}
 
-	atomic_sub_u(&totblock, 1);
-	atomic_sub_z(&mem_in_use, len);
+	atomic_sub_and_fetch_u(&totblock, 1);
+	atomic_sub_and_fetch_z(&mem_in_use, len);
 
 	if (MEMHEAD_IS_MMAP(memh)) {
-		atomic_sub_z(&mmap_in_use, len);
+		atomic_sub_and_fetch_z(&mmap_in_use, len);
 #if defined(WIN32)
 		/* our windows mmap implementation is not thread safe */
 		mem_lock_thread();
@@ -287,8 +287,8 @@ void *MEM_lockfree_callocN(size_t len, const char *str)
 
 	if (LIKELY(memh)) {
 		memh->len = len;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -312,8 +312,8 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
 		}
 
 		memh->len = len;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -361,8 +361,8 @@ void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str
 
 		memh->len = len | (size_t) MEMHEAD_ALIGN_FLAG;
 		memh->alignment = (short) alignment;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
 		update_maximum(&peak_mem, mem_in_use);
 
 		return PTR_FROM_MEMHEAD(memh);
@@ -396,9 +396,9 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str)
 
 	if (memh != (MemHead *)-1) {
 		memh->len = len | (size_t) MEMHEAD_MMAP_FLAG;
-		atomic_add_u(&totblock, 1);
-		atomic_add_z(&mem_in_use, len);
-		atomic_add_z(&mmap_in_use, len);
+		atomic_add_and_fetch_u(&totblock, 1);
+		atomic_add_and_fetch_z(&mem_in_use, len);
+		atomic_add_and_fetch_z(&mmap_in_use, len);
 
 		update_maximum(&peak_mem, mem_in_use);
 		update_maximum(&peak_mem, mmap_in_use);
diff --git a/intern/iksolver/intern/IK_QSegment.h b/intern/iksolver/intern/IK_QSegment.h
index 74f157a..247807d 100644
--- a/intern/iksolver/intern/IK_QSegment.h
+++ b/intern/iksolver/intern/IK_QSegment.h
@@ -60,6 +60,7 @@
 class IK_QSegment
 {
 public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
 	virtual ~IK_QSegment();
 
 	// start: a user defined translation
diff --git a/intern/iksolver/intern/IK_Solver.cpp b/intern/iksolver/intern/IK_Solver.cpp
index cefb8c7..a00db4f 100644
--- a/intern/iksolver/intern/IK_Solver.cpp
+++ b/intern/iksolver/intern/IK_Solver.cpp
@@ -42,6 +42,7 @@ using namespace std;
 
 class IK_QSolver {
 public:
+	EIGEN_MAKE_ALIGNED_OPERATOR_NEW
 	IK_QSolver() : root(NULL) {
 	}
 
diff --git a/release/scripts/addons/render_auto_tile_size.py b/release/scripts/addons/render_auto_tile_size.py
index 8aef71d..3625c0e 100644
--- a/release/scripts/addons/render_auto_tile_size.py
+++ b/release/scripts/addons/render_auto_tile_size.py
@@ -146,7 +146,9 @@ def ats_poll(context):
 
 
 def engine_is_gpu(engine, device, userpref):
-    return engine == 'CYCLES' and device == 'GPU' and userpref.system.compute_device_type != 'NONE'
+    if engine == 'CYCLES' and device == 'GPU':
+        return userpref.addons['cycles'].preferences.has_active_device()
+    return False
 
 
 def get_tilesize_prop(engine, device, userpref):
@@ -206,11 +208,7 @@ def get_threads(context, device):
     userpref = context.user_preferences
 
     if engine_is_gpu(engine, device, userpref):
-        gpu_device_str = userpref.system.compute_device
-        if 'MULTI' in gpu_device_str:
-            threads = int(gpu_device_str.split('_')[-1])
-        else:
-            threads = 1
+        threads = userpref.addons['cycles'].preferences.get_num_gpu_devices()
     else:
         threads = render.threads
 
diff --git a/release/scripts/startup/bl_operators/wm.py b/release/scripts/startup/bl_operators/wm.py
index 1c97d21..343fcdb 100644
--- a/release/scripts/startup/bl_operators/wm.py
+++ b/release/scripts/startup/bl_operators/wm.py
@@ -2163,3 +2163,32 @@ class WM_OT_addon_expand(Operator):
             info["show_expanded"] = not info["show_expanded"]
 
         return {'FINISHED'}
+
+class WM_OT_addon_userpref_show(Operator):
+    "Show add-on user preferences"
+    bl_idname = "wm.addon_userpref_show"
+    bl_label = ""
+    bl_options = {'INTERNAL'}
+
+    module = StringProperty(
+            name="Module",
+            description="Module name of the add-on to expand",
+            )
+
+    def execute(self, context):
+        import addon_utils
+
+        module_name = self.module
+
+        modules = addon_utils.modules(refresh=False)
+        mod = addon_utils.addons_fake_modules.get(module_name)
+        if mod is not None:
+            info = addon_utils.module_bl_info(mod)
+            info["show_expanded"] = True
+
+            bpy.context.user_preferences.active_section = 'ADDONS'
+            context.window_manager.addon_filter = 'All'
+            context.window_manager.addon_search = info["name"]
+            bpy.ops.screen.userpref_show('INVOKE_DEFAULT')
+
+        return {'FINISHED'}
diff --git a/release/scripts/startup/bl_ui/space_userpref.py b/release/scripts/startup/bl_ui/space_userpref.py
index dcafac6..bdbb633 100644
--- a/release/scripts/startup/bl_ui/space_userpref.py
+++ b/release/scripts/startup/bl_ui/space_userpref.py
@@ -429,12 +429,8 @@ class USERPREF_PT_system(Panel):
 
         col.separator()
 
-        if hasattr(system, "compute_device_type"):
-            col.label(text="Compute Device:")
-            col.row().prop(system, "compute_device_type", expand=True)
-            sub = col.row()
-            sub.active = system.compute_device_type != 'CPU'
-            sub.prop(system, "compute_device", text="")
+        if userpref.addons.find('cycles') != -1:
+            userpref.addons['cycles'].preferences.draw_impl(col, context)
 
         if hasattr(system, "opensubdiv_compute_type"):
             col.label(text="OpenSubdiv compute:")
diff --git a/source/blender/blenkernel/BKE_armature.h b/source/blender/blenkernel/BKE_armature.h
index c232310..78d6f6c 100644
--- a/source/blender/blenkernel/BKE_armature.h
+++ b/source/blender/blenkernel/BKE_armature.h
@@ -97,6 +97,7 @@ void BKE_armature_where_is(struct bArmature *arm);
 void BKE_armature_where_is_bone(struct Bone *bone, struct Bone *prevbone, const bool use_recursion);
 void BKE_pose_clear_pointers(struct bPose *pose);
 void BKE_pose_rebuild(struct Object *ob, struct bArmature *arm);
+void BKE_pose_rebuild_ex(struct Object *ob, struct bArmature *arm, const bool sort_bones);
 void BKE_pose_where_is(struct Scene *scene, struct Object *ob);
 void BKE_pose_where_is_bone(struct Scene *scene, struct Object *ob, struct bPoseChannel *pchan, float ctime, bool do_extra);
 void BKE_pose_where_is_bone_tail(struct bPoseChannel *pchan);
diff --git a/source/blender/blenkernel/BKE_blender_version.h b/source/blender/blenkernel/BKE_blender_version.h
index aba5f9e..6dad0d7 100644
--- a/source/blender/blenkernel/BKE_blender_version.h
+++ b/source/blender/blenkernel/BKE_blender_version.h
@@ -35,7 +35,7 @@
 
 /* used by packaging tools */
 /* can be left blank, otherwise a,b,c... etc with no quotes */
-#define BLENDER_VERSION_CHAR    a
+#define BLENDER_VERSION_CHAR    b
 /* alpha/beta/rc/release, docs use this */
 #define BLENDER_VERSION_CYCLE   release
 
diff --git a/source/blender/blenkernel/intern/armature.c b/source/blender/blenkernel/intern/armature.c
index c644fe0..aaec3a9 100644
--- a/source/blender/blenkernel/intern/armature.c
+++ b/source/blender/blenkernel/intern/armature.c
@@ -1916,7 +1916,7 @@ void BKE_pose_clear_pointers(bPose *pose)
 
 /* only after leave editmode, duplicating, validating older files, library syncing */
 /* NOTE: pose->flag is set for it */
-void BKE_pose_rebuild(Object *ob, bArmature *arm)
+void BKE_pose_rebuild_ex(Object *ob, bArmature *arm, const bool sort_bones)
 {
 	Bone *bone;
 	bPose *pose;
@@ -1963,8 +1963,9 @@ void BKE_pose_rebuild(Object *ob, bArmature *arm)
 #ifdef WITH_LEGACY_DEPSGRAPH
 	/* the sorting */
 	/* Sorting for new dependnecy graph is done on the scene graph level. */
-	if (counter > 1)
+	if (counter > 1 && sort_bones) {
 		DAG_pose_sort(ob);
+	}
 #endif
 
 	ob->pose->flag &= ~POSE_RECALC;
@@ -1973,6 +1974,11 @@ void BKE_pose_rebuild(Object *ob, bArmature *arm)
 	BKE_pose_channels_hash_make(ob->pose);
 }
 
+void BKE_pose_rebuild(Object *ob, bArmature *arm)
+{
+	BKE_pose_rebuild_ex(ob, arm, true);  /* sort_bones = true */
+}
+
 /* ********************** THE POSE SOLVER ******************* */
 
 /* loc/rot/size to given mat4 */
diff --git a/source/blender/blenkernel/intern/depsgraph.c b/source/blender/blenkernel/intern/depsgraph.c
index 5f8332d..8a16b3e 100644
--- a/source/blender/blenkernel/intern/depsgraph.c
+++ b/source/blender/blenkernel/intern/depsgraph.c
@@ -800,6 +800,10 @@ static void build_dag_object(DagForest *dag, DagNode *scenenode, Main *bmain, Sc
 				/* Actual code uses get_collider_cache */
 				dag_add_collision_relations(dag, scene, ob, node, part->collision_group, ob->lay, eModifierType_Collision, NULL, true, "Particle Collision");
 			}
+			else if ((psys->flag & PSYS_HAIR_DYNAMICS) && psys->clmd && psys->clmd->coll_parms) {
+				/* Hair uses cloth simulation, i.e. get_collision_objects */
+				dag_add_collision_relations(dag, scene, ob, node, psys->clmd->coll_parms->group, ob->lay | scene->lay, eModifierType_Collision, NULL, true, "Hair Collision");
+			}
 
 			dag_add_forcefield_relations(dag, scene, ob, node, part->effector_weights, part->type == PART_HAIR, 0, "Particle Force Field");
 
@@ -3284,7 +3288,7 @@ void DAG_threaded_update_handle_node_updated(void *node_v,
 	for (itA = node->child; itA; itA = itA->next) {
 		DagNode *child_node = itA->node;
 		if (child_node != node) {
-			atomic_sub_uint32(&child_node->num_pending_parents, 1);
+			atomic_sub_and_fetch_uint32(&child_node->num_pending_parents, 1);
 
 			if (child_node->num_pending_parents == 0) {
 				bool need_schedule;
diff --git a/source/blender/blenkernel/intern/dynamicpaint.c b/source/blender/blenkernel/intern/dynamicpaint.c
index 51bc5fc..5d660be 100644
--- a/source/blender/blenkernel/intern/dynamicpaint.c
+++ b/source/blender/blenkernel/intern/dynamicpaint.c
@@ -2257,7 +2257,7 @@ static void dynamic_paint_create_uv_surface_neighbor_cb(void *userdata, const in
 							 * to non--1 *before* its tri_index is set (i.e. that it cannot be used a neighbour).
 							 */
 							tPoint->neighbour_pixel = ind - 1;
-							atomic_add_uint32(&tPoint->neighbour_pixel, 1);
+							atomic_add_and_fetch_uint32(&tPoint->neighbour_pixel, 1);
 							tPoint->tri_index = i;
 
 							/* Now calculate pixel data for this pixel as it was on polygon surface */
@@ -2283,7 +2283,7 @@ static void dynamic_paint_create_uv_surface_neighbor_cb(void *userdata, const in
 
 		/* Increase the final number of active surface points if relevant. */
 		if (tPoint->tri_index != -1)
-			atomic_add_uint32(active_points, 1);
+			atomic_add_and_fetch_uint32(active_points, 1);
 	}
 }
 
diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index fa113ef..a3fe73e 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -58,6 +58,7 @@
 
 #include "BLI_strict_flags.h"
 
+#include "atomic_ops.h"
 #include "mikktspace.h"
 
 // #define DEBUG_TIME
@@ -236,7 +237,9 @@ static void mesh_calc_normals_poly_accum_task_cb(void *userdata, const int pidx)
 			const float fac = saacos(-dot_v3v3(cur_edge, prev_edge));
 
 			/* accumulate */
-			madd_v3_v3fl(vnors[ml[i].v], pnor, fac);
+			for (int k = 3; k--; ) {
+				atomic_add_and_fetch_fl(&vnors[ml[i].v][k], pnor[k] * fac);
+			}
 			prev_edge = cur_edge;
 		}
 	}
diff --git a/source/blender/blenkernel/intern/pbvh.c b/source/blender/blenkernel/intern/pbvh.c
index ff69f38..4fe4d6e 100644
--- a/source/blender/blenkernel/intern/pbvh.c
+++ b/source/blender/blenkernel/intern/pbvh.c
@@ -977,7 +977,7 @@ static void pbvh_update_normals_accum_task_cb(void *userdata, const int n)
 					 *       Not exact equivalent though, since atomicity is only ensured for one component
 					 *       of the vector at a time, but here it shall not make any sensible difference. */
 					for (int k = 3; k--; ) {
-						atomic_add_fl(&vnors[v][k], fn[k]);
+						atomic_add_and_fetch_fl(&vnors[v][k], fn[k]);
 					}
 				}
 			}
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index 436cd2b..fc2d967 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -237,7 +237,7 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done)
 	BLI_assert(pool->num >= done);
 
 	pool->num -= done;
-	atomic_sub_z(&pool->currently_running_tasks, done);
+	atomic_sub_and_fetch_z(&pool->currently_running_tasks, done);
 	pool->done += done;
 
 	if (pool->num == 0)
@@ -292,7 +292,7 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
 				continue;
 			}
 
-			if (atomic_add_z(&pool->currently_running_tasks, 1) <= pool->num_threads ||
+			if (atomic_add_and_fetch_z(&pool->currently_running_tasks, 1) <= pool->num_threads ||
 			    pool->num_threads == 0)
 			{
 				*task = current_task;
@@ -301,7 +301,7 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
 				break;
 			}
 			else {
-				atomic_sub_z(&pool->currently_running_tasks, 1);
+				atomic_sub_and_fetch_z(&pool->currently_running_tasks, 1);
 			}
 		}
 		if (!found_task)
@@ -669,7 +669,7 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
 		/* if found task, do it, otherwise wait until other tasks are done */
 		if (found_task) {
 			/* run task */
-			atomic_add_z(&pool->currently_running_tasks, 1);
+			atomic_add_and_fetch_z(&pool->currently_running_tasks, 1);
 			work_task->run(pool, work_task->taskdata, 0);
 
 			/* delete task */
diff --git a/source/blender/compositor/intern/COM_ExecutionGroup.cpp b/source/blender/compositor/intern/COM_ExecutionGroup.cpp
index e5c2b8a..9a47c6b 100644
--- a/source/blender/compositor/intern/COM_ExecutionGroup.cpp
+++ b/source/blender/compositor/intern/COM_ExecutionGroup.cpp
@@ -383,7 +383,7 @@ void ExecutionGroup::finalizeChunkExecution(int chunkNumber, MemoryBuffer **memo
 	if (this->m_chunkExecutionStates[chunkNumber] == COM_ES_SCHEDULED)
 		this->m_chunkExecutionStates[chunkNumber] = COM_ES_EXECUTED;
 	
-	atomic_add_u(&this->m_chunksFinished, 1);
+	atomic_add_and_fetch_u(&this->m_chunksFinished, 1);
 	if (memoryBuffers) {
 		for (unsigned int index = 0; index < this->m_cachedMaxReadBufferOffset; index++) {
 			MemoryBuffer *buffer = memoryBuffers[index];
diff --git a/source/blender/compositor/operations/COM_TextureOperation.cpp b/source/blender/compositor/operations/COM_TextureOperation.cpp
index bba5c87..6bfd8ae 100644
--- a/source/blender/compositor/operations/COM_TextureOperation.cpp
+++ b/source/blender/compositor/operations/COM_TextureOperation.cpp
@@ -118,7 +118,7 @@ void TextureBaseOperation::executePixelSampled(float output[4], float x, float y
 	 * interpolaiton and (b) in such configuration multitex() sinply floor's the value
 	 * which often produces artifacts.
 	 */
-	if ((m_texture->imaflag & TEX_INTERPOL) == 0) {
+	if (m_texture != NULL && (m_texture->imaflag & TEX_INTERPOL) == 0) {
 		u += 0.5f / cx;
 		v += 0.5f / cy;
 	}
diff --git a/source/blender/depsgraph/CMakeLists.txt b/source/blender/depsgraph/CMakeLists.txt
index fd2a521..e635256 100644
--- a/source/blender/depsgraph/CMakeLists.txt
+++ b/source/blender/depsgraph/CMakeLists.txt
@@ -43,8 +43,13 @@ set(SRC
 	intern/builder/deg_builder.cc
 	intern/builder/deg_builder_cycle.cc
 	intern/builder/deg_builder_nodes.cc
+	intern/builder/deg_builder_nodes_rig.cc
+	intern/builder/deg_builder_nodes_scene.cc
 	intern/builder/deg_builder_pchanmap.cc
 	intern/builder/deg_builder_relations.cc
+	intern/builder/deg_builder_relations_keys.cc
+	intern/builder/deg_builder_relations_rig.cc
+	intern/builder/deg_builder_relations_scene.cc
 	intern/builder/deg_builder_transitive.cc
 	intern/debug/deg_debug_graphviz.cc
 	intern/eval/deg_eval.cc
diff --git a/source/blender/depsgraph/DEG_depsgraph_build.h b/source/blender/depsgraph/DEG_depsgraph_build.h
index 0945da4..fdc8654 100644
--- a/source/blender/depsgraph/DEG_depsgraph_build.h
+++ b/source/blender/depsgraph/DEG_depsgraph_build.h
@@ -51,8 +51,12 @@ extern "C" {
 
 /* Graph Building -------------------------------- */
 
-/* Build depsgraph for the given scene, and dump results in given graph container */
-void DEG_graph_build_from_scene(struct Depsgraph *graph, struct Main *bmain, struct Scene *scene);
+/* Build depsgraph for the given scene, and dump results in given
+ * graph container.
+ */
+void DEG_graph_build_from_scene(struct Depsgraph *graph,
+                                struct Main *bmain,
+                                struct Scene *scene);
 
 /* Tag relations from the given graph for update. */
 void DEG_graph_tag_relations_update(struct Depsgraph *graph);
@@ -85,31 +89,69 @@ struct CacheFile;
 struct Object;
 
 typedef enum eDepsSceneComponentType {
-	DEG_SCENE_COMP_PARAMETERS,     /* Parameters Component - Default when nothing else fits (i.e. just SDNA property setting) */
-	DEG_SCENE_COMP_ANIMATION,      /* Animation Component */                 // XXX: merge in with parameters?
-	DEG_SCENE_COMP_SEQUENCER,      /* Sequencer Component (Scene Only) */
+	/* Parameters Component - Default when nothing else fits
+	 * (i.e. just SDNA property setting).
+	 */
+	DEG_SCENE_COMP_PARAMETERS,
+	/* Animation Component
+	 * TODO(sergey): merge in with parameters?
+	 */
+	DEG_SCENE_COMP_ANIMATION,
+	/* Sequencer Component (Scene Only). */
+	DEG_SCENE_COMP_SEQUENCER,
 } eDepsSceneComponentType;
 
 typedef enum eDepsObjectComponentType {
-	DEG_OB_COMP_PARAMETERS,        /* Parameters Component - Default when nothing else fits (i.e. just SDNA property setting) */
-	DEG_OB_COMP_PROXY,             /* Generic "Proxy-Inherit" Component */   // XXX: Also for instancing of subgraphs?
-	DEG_OB_COMP_ANIMATION,         /* Animation Component */                 // XXX: merge in with parameters?
-	DEG_OB_COMP_TRANSFORM,         /* Transform Component (Parenting/Constraints) */
-	DEG_OB_COMP_GEOMETRY,          /* Geometry Component (DerivedMesh/Displist) */
-	
+	/* Parameters Component - Default when nothing else fits
+	 * (i.e. just SDNA property setting).
+	 */
+	DEG_OB_COMP_PARAMETERS,
+	/* Generic "Proxy-Inherit" Component.
+	 * TODO(sergey): Also for instancing of subgraphs?
+	 */
+	DEG_OB_COMP_PROXY,
+	/* Animation Component.
+	 *
+	 * TODO(sergey): merge in with parameters?
+	 */
+	DEG_OB_COMP_ANIMATION,
+	/* Transform Component (Parenting/Constraints) */
+	DEG_OB_COMP_TRANSFORM,
+	/* Geometry Component (DerivedMesh/Displist) */
+	DEG_OB_COMP_GEOMETRY,
+
 	/* Evaluation-Related Outer Types (with Subdata) */
-	DEG_OB_COMP_EVAL_POSE,         /* Pose Component - Owner/Container of Bones Eval */
-	DEG_OB_COMP_BONE,              /* Bone Component - Child/Subcomponent of Pose */
-	
-	DEG_OB_COMP_EVAL_PARTICLES,    /* Particle Systems Component */
-	DEG_OB_COMP_SHADING,           /* Material Shading Component */
-	DEG_OB_COMP_CACHE,             /* Cache Component */
+
+	/* Pose Component - Owner/Container of Bones Eval */
+	DEG_OB_COMP_EVAL_POSE,
+	/* Bone Component - Child/Subcomponent of Pose */
+	DEG_OB_COMP_BONE,
+
+	/* Particle Systems Component */
+	DEG_OB_COMP_EVAL_PARTICLES,
+	/* Material Shading Component */
+	DEG_OB_COMP_SHADING,
+	/* Cache Component */
+	DEG_OB_COMP_CACHE,
 } eDepsObjectComponentType;
 
-void DEG_add_scene_relation(struct DepsNodeHandle *node, struct Scene *scene, eDepsSceneComponentType component, const char *description);
-void DEG_add_object_relation(struct DepsNodeHandle *node, struct Object *ob, eDepsObjectComponentType component, const char *description);
-void DEG_add_bone_relation(struct DepsNodeHandle *handle, struct Object *ob, const char *bone_name, eDepsObjectComponentType component, const char *description);
-void DEG_add_object_cache_relation(struct DepsNodeHandle *handle, struct CacheFile *cache_file, eDepsObjectComponentType component, const char *description);
+void DEG_add_scene_relation(struct DepsNodeHandle *node,
+                            struct Scene *scene,
+                            eDepsSceneComponentType component,
+                            const char *description);
+void DEG_add_object_relation(struct DepsNodeHandle *node,
+                             struct Object *ob,
+                             eDepsObjectComponentType component,
+                             const char *description);
+void DEG_add_bone_relation(struct DepsNodeHandle *handle,
+                           struct Object *ob,
+                           const char *bone_name,
+                           eDepsObjectComponentType component,
+                           const char *description);
+void DEG_add_object_cache_relation(struct DepsNodeHandle *handle,
+                                   struct CacheFile *cache_file,
+                                   eDepsObjectComponentType component,
+                                   const char *description);
 
 /* TODO(sergey): Remove once all geometry update is granular. */
 void DEG_add_special_eval_flag(struct Depsgraph *graph, struct ID *id, short flag);
@@ -117,8 +159,22 @@ void DEG_add_special_eval_flag(struct Depsgraph *graph, struct ID *id, short fla
 /* Utility functions for physics modifiers */
 typedef bool (*DEG_CollobjFilterFunction)(struct Object *obj, struct ModifierData *md);
 
-void DEG_add_collision_relations(struct DepsNodeHandle *handle, struct Scene *scene, Object *ob, struct Group *group, int layer, unsigned int modifier_type, DEG_CollobjFilterFunction fn, bool dupli, const char *name);
-void DEG_add_forcefield_relations(struct DepsNodeHandle *handle, struct Scene *scene, Object *ob, struct EffectorWeights *eff, bool add_absorption, int skip_forcefield, const char *name);
+void DEG_add_collision_relations(struct DepsNodeHandle *handle,
+                                 struct Scene *scene,
+                                 Object *ob,
+                                 struct Group *group,
+                                 int layer,
+                                 unsigned int modifier_type,
+                                 DEG_CollobjFilterFunction fn,
+                                 bool dupli,
+                                 const char *name);
+void DEG_add_forcefield_relations(struct DepsNodeHandle *handle,
+                                  struct Scene *scene,
+                                  Object *ob,
+                                  struct EffectorWeights *eff,
+                                  bool add_absorption,
+                                  int skip_forcefield,
+                                  const char *name);
 
 /* ************************************************ */
 
diff --git a/source/blender/depsgraph/intern/builder/deg_builder.cc b/source/blender/depsgraph/intern/builder/deg_builder.cc
index 6169100..cb2f057 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder.cc
@@ -34,6 +34,8 @@
 #include <stack>
 
 #include "DNA_anim_types.h"
+#include "DNA_object_types.h"
+#include "DNA_ID.h"
 
 #include "BLI_utildefines.h"
 #include "BLI_ghash.h"
@@ -46,6 +48,8 @@
 
 #include "util/deg_util_foreach.h"
 
+#include <cstdio>
+
 namespace DEG {
 
 string deg_fcurve_id_name(const FCurve *fcu)
@@ -56,10 +60,46 @@ string deg_fcurve_id_name(const FCurve *fcu)
 	return string(fcu->rna_path) + index_buf;
 }
 
+/* True for an untagged object with no derivedFinal / curve_cache yet. */
+static bool check_object_needs_evaluation(Object *object)
+{
+	const int type = object->type;
+	if ((object->recalc & OB_RECALC_ALL) != 0) {
+		/* Already tagged for update, re-tagging would be redundant. */
+		return false;
+	}
+	if (type == OB_MESH) {
+		return (object->derivedFinal == NULL);
+	}
+	if (ELEM(type, OB_CURVE, OB_SURF, OB_FONT, OB_MBALL, OB_LATTICE)) {
+		return (object->curve_cache == NULL);
+	}
+	return false;
+}
+
 void deg_graph_build_finalize(Depsgraph *graph)
 {
+	/* STEP 1: Make sure new invisible dependencies are ready for use.
+	 *
+	 * TODO(sergey): This might do a bit of extra tagging, but it's kinda nice
+	 * to do it ahead of time and not spend time on flushing updates on
+	 * every frame change.
+	 */
+	GHASH_FOREACH_BEGIN(IDDepsNode *, id_node, graph->id_hash)
+	{
+		if (id_node->layers == 0) {
+			ID *id = id_node->id;
+			if (GS(id->name) == ID_OB) {
+				Object *object = (Object *)id;
+				if (check_object_needs_evaluation(object)) {
+					id_node->tag_update(graph);
+				}
+			}
+		}
+	}
+	GHASH_FOREACH_END();
+	/* STEP 2: Flush visibility layers from children to parent. */
 	std::stack<OperationDepsNode *> stack;
-
 	foreach (OperationDepsNode *node, graph->operations) {
 		IDDepsNode *id_node = node->owner->owner;
 		node->done = 0;
@@ -78,7 +118,6 @@ void deg_graph_build_finalize(Depsgraph *graph)
 		node->owner->layers = id_node->layers;
 		id_node->id->tag |= LIB_TAG_DOIT;
 	}
-
 	while (!stack.empty()) {
 		OperationDepsNode *node = stack.top();
 		stack.pop();
@@ -104,8 +143,9 @@ void deg_graph_build_finalize(Depsgraph *graph)
 			}
 		}
 	}
-
-	/* Re-tag IDs for update if it was tagged before the relations update tag. */
+	/* STEP 3: Re-tag IDs for update if it was tagged before the relations
+	 * update tag.
+	 */
 	GHASH_FOREACH_BEGIN(IDDepsNode *, id_node, graph->id_hash)
 	{
 		GHASH_FOREACH_BEGIN(ComponentDepsNode *, comp, id_node->components)
@@ -114,12 +154,21 @@ void deg_graph_build_finalize(Depsgraph *graph)
 		}
 		GHASH_FOREACH_END();
 
-		ID *id = id_node->id;
-		if (id->tag & LIB_TAG_ID_RECALC_ALL &&
-		    id->tag & LIB_TAG_DOIT)
-		{
-			id_node->tag_update(graph);
-			id->tag &= ~LIB_TAG_DOIT;
+		if ((id_node->layers & graph->layers) != 0) {
+			ID *id = id_node->id;
+			if ((id->tag & LIB_TAG_ID_RECALC_ALL) &&
+			    (id->tag & LIB_TAG_DOIT))
+			{
+				id_node->tag_update(graph);
+				id->tag &= ~LIB_TAG_DOIT;
+			}
+			else if (GS(id->name) == ID_OB) {
+				Object *object = (Object *)id;
+				if (object->recalc & OB_RECALC_ALL) {
+					id_node->tag_update(graph);
+					id->tag &= ~LIB_TAG_DOIT;
+				}
+			}
 		}
 		id_node->finalize_build();
 	}
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_cycle.cc b/source/blender/depsgraph/intern/builder/deg_builder_cycle.cc
index 225cc64..9b37aaa 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_cycle.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_cycle.cc
@@ -56,12 +56,14 @@ struct StackEntry {
 
 void deg_graph_detect_cycles(Depsgraph *graph)
 {
-	/* Not is not visited at all during traversal. */
-	const int NODE_NOT_VISITED = 0;
-	/* Node has been visited during traversal and not in current stack. */
-	const int NODE_VISITED = 1;
-	/* Node has been visited during traversal and is in current stack. */
-	const int NODE_IN_STACK = 2;
+	enum {
+		/* Node is not visited at all during traversal. */
+		NODE_NOT_VISITED = 0,
+		/* Node has been visited during traversal and not in current stack. */
+		NODE_VISITED = 1,
+		/* Node has been visited during traversal and is in current stack. */
+		NODE_IN_STACK = 2,
+	};
 
 	std::stack<StackEntry> traversal_stack;
 	foreach (OperationDepsNode *node, graph->operations) {
@@ -77,21 +79,23 @@ void deg_graph_detect_cycles(Depsgraph *graph)
 			entry.from = NULL;
 			entry.via_relation = NULL;
 			traversal_stack.push(entry);
-			node->done = NODE_IN_STACK;
+			node->tag = NODE_IN_STACK;
 		}
 		else {
-			node->done = NODE_NOT_VISITED;
+			node->tag = NODE_NOT_VISITED;
 		}
+		node->done = 0;
 	}
 
 	while (!traversal_stack.empty()) {
-		StackEntry &entry = traversal_stack.top();
+		StackEntry& entry = traversal_stack.top();
 		OperationDepsNode *node = entry.node;
 		bool all_child_traversed = true;
-		foreach (DepsRelation *rel, node->outlinks) {
+		for (int i = node->done; i < node->outlinks.size(); ++i) {
+			DepsRelation *rel = node->outlinks[i];
 			if (rel->to->type == DEPSNODE_TYPE_OPERATION) {
 				OperationDepsNode *to = (OperationDepsNode *)rel->to;
-				if (to->done == NODE_IN_STACK) {
+				if (to->tag == NODE_IN_STACK) {
 					printf("Dependency cycle detected:\n");
 					printf("  '%s' depends on '%s' through '%s'\n",
 					       to->full_identifier().c_str(),
@@ -107,23 +111,24 @@ void deg_graph_detect_cycles(Depsgraph *graph)
 						       current->via_relation->name);
 						current = current->from;
 					}
-					/* TODO(sergey): So called roussian rlette cycle solver. */
+					/* TODO(sergey): So called russian roulette cycle solver. */
 					rel->flag |= DEPSREL_FLAG_CYCLIC;
 				}
-				else if (to->done == NODE_NOT_VISITED) {
+				else if (to->tag == NODE_NOT_VISITED) {
 					StackEntry new_entry;
 					new_entry.node = to;
 					new_entry.from = &entry;
 					new_entry.via_relation = rel;
 					traversal_stack.push(new_entry);
-					to->done = NODE_IN_STACK;
+					to->tag = NODE_IN_STACK;
 					all_child_traversed = false;
+					node->done = i;
 					break;
 				}
 			}
 		}
 		if (all_child_traversed) {
-			node->done = NODE_VISITED;
+			node->tag = NODE_VISITED;
 			traversal_stack.pop();
 		}
 	}
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc b/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
index 1812384..e312c4e 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
@@ -34,7 +34,6 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
 
 #include "MEM_guardedalloc.h"
 
@@ -56,8 +55,10 @@ extern "C" {
 #include "DNA_key_types.h"
 #include "DNA_lamp_types.h"
 #include "DNA_material_types.h"
+#include "DNA_mask_types.h"
 #include "DNA_mesh_types.h"
 #include "DNA_meta_types.h"
+#include "DNA_movieclip_types.h"
 #include "DNA_node_types.h"
 #include "DNA_particle_types.h"
 #include "DNA_object_types.h"
@@ -106,9 +107,44 @@ extern "C" {
 #include "intern/nodes/deg_node_operation.h"
 #include "intern/depsgraph_types.h"
 #include "intern/depsgraph_intern.h"
+#include "util/deg_util_foreach.h"
 
 namespace DEG {
 
+namespace {
+
+struct BuilderWalkUserData {
+	DepsgraphNodeBuilder *builder;
+	Scene *scene;
+};
+
+static void modifier_walk(void *user_data,
+                          struct Object * /*ob*/,
+                          struct Object **obpoin,
+                          int /*cd_flag*/)
+{
+	BuilderWalkUserData *data = (BuilderWalkUserData *)user_data;
+	if (*obpoin) {
+		data->builder->build_object(data->scene, NULL, *obpoin);
+	}
+}
+
+void constraint_walk(bConstraint * /*con*/,
+                     ID **idpoin,
+                     bool /*is_reference*/,
+                     void *user_data)
+{
+	BuilderWalkUserData *data = (BuilderWalkUserData *)user_data;
+	if (*idpoin) {
+		ID *id = *idpoin;
+		if (GS(id->name) == ID_OB) {
+			data->builder->build_object(data->scene, NULL, (Object *)id);
+		}
+	}
+}
+
+}  /* namespace */
+
 /* ************ */
 /* Node Builder */
 
@@ -131,8 +167,7 @@ RootDepsNode *DepsgraphNodeBuilder::add_root_node()
 
 IDDepsNode *DepsgraphNodeBuilder::add_id_node(ID *id)
 {
-	const char *idtype_name = BKE_idcode_to_name(GS(id->name));
-	return m_graph->add_id_node(id, string(id->name + 2) + "[" + idtype_name + "]");
+	return m_graph->add_id_node(id, id->name);
 }
 
 TimeSourceDepsNode *DepsgraphNodeBuilder::add_time_source(ID *id)
@@ -179,7 +214,7 @@ TimeSourceDepsNode *DepsgraphNodeBuilder::add_time_source(ID *id)
 ComponentDepsNode *DepsgraphNodeBuilder::add_component_node(
         ID *id,
         eDepsNode_Type comp_type,
-        const string &comp_name)
+        const char *comp_name)
 {
 	IDDepsNode *id_node = add_id_node(id);
 	ComponentDepsNode *comp_node = id_node->add_component(comp_type, comp_name);
@@ -192,15 +227,19 @@ OperationDepsNode *DepsgraphNodeBuilder::add_operation_node(
         eDepsOperation_Type optype,
         DepsEvalOperationCb op,
         eDepsOperation_Code opcode,
-        const string &description)
+        const char *name,
+        int name_tag)
 {
-	OperationDepsNode *op_node = comp_node->has_operation(opcode, description);
+	OperationDepsNode *op_node = comp_node->has_operation(opcode,
+	                                                      name,
+	                                                      name_tag);
 	if (op_node == NULL) {
-		op_node = comp_node->add_operation(optype, op, opcode, description);
+		op_node = comp_node->add_operation(optype, op, opcode, name, name_tag);
 		m_graph->operations.push_back(op_node);
 	}
 	else {
-		fprintf(stderr, "add_operation: Operation already exists - %s has %s at %p\n",
+		fprintf(stderr,
+		        "add_operation: Operation already exists - %s has %s at %p\n",
 		        comp_node->identifier().c_str(),
 		        op_node->identifier().c_str(),
 		        op_node);
@@ -212,14 +251,15 @@ OperationDepsNode *DepsgraphNodeBuilder::add_operation_node(
 OperationDepsNode *DepsgraphNodeBuilder::add_operation_node(
         ID *id,
         eDepsNode_Type comp_type,
-        const string &comp_name,
+        const char *comp_name,
         eDepsOperation_Type optype,
         DepsEvalOperationCb op,
         eDepsOperation_Code opcode,
-        const string &description)
+        const char *name,
+        int name_tag)
 {
 	ComponentDepsNode *comp_node = add_component_node(id, comp_type, comp_name);
-	return add_operation_node(comp_node, optype, op, opcode, description);
+	return add_operation_node(comp_node, optype, op, opcode, name, name_tag);
 }
 
 OperationDepsNode *DepsgraphNodeBuilder::add_operation_node(
@@ -228,128 +268,58 @@ OperationDepsNode *DepsgraphNodeBuilder::add_operation_node(
         eDepsOperation_Type optype,
         DepsEvalOperationCb op,
         eDepsOperation_Code opcode,
-        const string& description)
+        const char *name,
+        int name_tag)
 {
-	return add_operation_node(id, comp_type, "", optype, op, opcode, description);
+	return add_operation_node(id,
+	                          comp_type,
+	                          "",
+	                          optype,
+	                          op,
+	                          opcode,
+	                          name,
+	                          name_tag);
 }
 
 bool DepsgraphNodeBuilder::has_operation_node(ID *id,
                                               eDepsNode_Type comp_type,
-                                              const string &comp_name,
+                                              const char *comp_name,
                                               eDepsOperation_Code opcode,
-                                              const string &description)
+                                              const char *name,
+                                              int name_tag)
 {
-	return find_operation_node(id, comp_type, comp_name, opcode, description) != NULL;
+	return find_operation_node(id,
+	                           comp_type,
+	                           comp_name,
+	                           opcode,
+	                           name,
+	                           name_tag) != NULL;
 }
 
 OperationDepsNode *DepsgraphNodeBuilder::find_operation_node(
         ID *id,
         eDepsNode_Type comp_type,
-        const string &comp_name,
+        const char *comp_name,
         eDepsOperation_Code opcode,
-        const string &description)
+        const char *name,
+        int name_tag)
 {
 	ComponentDepsNode *comp_node = add_component_node(id, comp_type, comp_name);
-	return comp_node->has_operation(opcode, description);
+	return comp_node->has_operation(opcode, name, name_tag);
 }
 
 OperationDepsNode *DepsgraphNodeBuilder::find_operation_node(
         ID *id,
         eDepsNode_Type comp_type,
         eDepsOperation_Code opcode,
-        const string& description)
+        const char *name,
+        int name_tag)
 {
-	return find_operation_node(id, comp_type, "", opcode, description);
+	return find_operation_node(id, comp_type, "", opcode, name, name_tag);
 }
 
 /* **** Build functions for entity nodes **** */
 
-void DepsgraphNodeBuilder::build_scene(Main *bmain, Scene *scene)
-{
-	/* LIB_TAG_DOIT is used to indicate whether node for given ID was already
-	 * created or not. This flag is being set in add_id_node(), so functions
-	 * shouldn't bother with setting it, they only might query this flag when
-	 * needed.
-	 */
-	BKE_main_id_tag_all(bmain, LIB_TAG_DOIT, false);
-	/* XXX nested node trees are not included in tag-clearing above,
-	 * so we need to do this manually.
-	 */
-	FOREACH_NODETREE(bmain, nodetree, id) {
-		if (id != (ID *)nodetree)
-			nodetree->id.tag &= ~LIB_TAG_DOIT;
-	} FOREACH_NODETREE_END
-
-	/* scene ID block */
-	add_id_node(&scene->id);
-
-	/* timesource */
-	add_time_source(NULL);
-
-	/* build subgraph for set, and link this in... */
-	// XXX: depending on how this goes, that scene itself could probably store its
-	//      own little partial depsgraph?
-	if (scene->set) {
-		build_scene(bmain, scene->set);
-	}
-
-	/* scene objects */
-	for (Base *base = (Base *)scene->base.first; base; base = base->next) {
-		Object *ob = base->object;
-
-		/* object itself */
-		build_object(scene, base, ob);
-
-		/* object that this is a proxy for */
-		// XXX: the way that proxies work needs to be completely reviewed!
-		if (ob->proxy) {
-			ob->proxy->proxy_from = ob;
-			build_object(scene, base, ob->proxy);
-		}
-
-		/* Object dupligroup. */
-		if (ob->dup_group) {
-			build_group(scene, base, ob->dup_group);
-		}
-	}
-
-	/* rigidbody */
-	if (scene->rigidbody_world) {
-		build_rigidbody(scene);
-	}
-
-	/* scene's animation and drivers */
-	if (scene->adt) {
-		build_animdata(&scene->id);
-	}
-
-	/* world */
-	if (scene->world) {
-		build_world(scene->world);
-	}
-
-	/* compo nodes */
-	if (scene->nodetree) {
-		build_compositor(scene);
-	}
-
-	/* sequencer */
-	// XXX...
-
-	/* grease pencil */
-	if (scene->gpd) {
-		build_gpencil(scene->gpd);
-	}
-
-	/* cache files */
-	for (CacheFile *cachefile = static_cast<CacheFile *>(bmain->cachefiles.first);
-	     cachefile;
-	     cachefile = static_cast<CacheFile *>(cachefile->id.next))
-	{
-		build_cachefile(cachefile);
-	}
-}
-
 void DepsgraphNodeBuilder::build_group(Scene *scene,
                                        Base *base,
                                        Group *group)
@@ -360,10 +330,7 @@ void DepsgraphNodeBuilder::build_group(Scene *scene,
 	}
 	group_id->tag |= LIB_TAG_DOIT;
 
-	for (GroupObject *go = (GroupObject *)group->gobject.first;
-	     go != NULL;
-	     go = go->next)
-	{
+	LINKLIST_FOREACH (GroupObject *, go, &group->gobject) {
 		build_object(scene, base, go->ob);
 	}
 }
@@ -380,45 +347,74 @@ SubgraphDepsNode *DepsgraphNodeBuilder::build_subgraph(Group *group)
 	DepsgraphNodeBuilder subgraph_builder(m_bmain, subgraph);
 
 	/* add group objects */
-	for (GroupObject *go = (GroupObject *)group->gobject.first;
-	     go != NULL;
-	     go = go->next)
-	{
+	LINKLIST_FOREACH (GroupObject *, go, &group->gobject) {
 		/*Object *ob = go->ob;*/
 
-		/* Each "group object" is effectively a separate instance of the underlying
-		 * object data. When the group is evaluated, the transform results and/or
-		 * some other attributes end up getting overridden by the group
+		/* Each "group object" is effectively a separate instance of the
+		 * underlying object data. When the group is evaluated, the transform
+		 * results and/or some other attributes end up getting overridden by
+		 * the group.
 		 */
 	}
 
-	/* create a node for representing subgraph */
+	/* Create a node for representing subgraph. */
 	SubgraphDepsNode *subgraph_node = m_graph->add_subgraph_node(&group->id);
 	subgraph_node->graph = subgraph;
 
-	/* make a copy of the data this node will need? */
-	// XXX: do we do this now, or later?
-	// TODO: need API function which queries graph's ID's hash, and duplicates those blocks thoroughly with all outside links removed...
+	/* Make a copy of the data this node will need? */
+	/* XXX: do we do this now, or later? */
+	/* TODO: need API function which queries graph's ID's hash, and duplicates
+	 * those blocks thoroughly with all outside links removed.
+	 */
 
 	return subgraph_node;
 }
 
 void DepsgraphNodeBuilder::build_object(Scene *scene, Base *base, Object *ob)
 {
-	if (ob->id.tag & LIB_TAG_DOIT) {
-		IDDepsNode *id_node = m_graph->find_id_node(&ob->id);
+	const bool has_object = (ob->id.tag & LIB_TAG_DOIT);
+	IDDepsNode *id_node = (has_object)
+	        ? m_graph->find_id_node(&ob->id)
+	        : add_id_node(&ob->id);
+	/* Update node layers.
+	 * Do it for both new and existing ID nodes. This is so because several
+	 * bases might be sharing same object.
+	 */
+	if (base != NULL) {
 		id_node->layers |= base->lay;
+	}
+	if (ob == scene->camera) {
+		/* Camera should always be updated, it is used directly by the viewport. */
+		id_node->layers |= (unsigned int)(-1);
+	}
+	/* Skip rest of components if the ID node was already there. */
+	if (has_object) {
 		return;
 	}
-
-	IDDepsNode *id_node = add_id_node(&ob->id);
-	id_node->layers |= base->lay;
+	ob->id.tag |= LIB_TAG_DOIT;
 	ob->customdata_mask = 0;
 
-	/* standard components */
+	/* Standard components. */
 	build_object_transform(scene, ob);
 
-	/* object data */
+	if (ob->parent != NULL) {
+		build_object(scene, NULL, ob->parent);
+	}
+	if (ob->modifiers.first != NULL) {
+		BuilderWalkUserData data;
+		data.builder = this;
+		data.scene = scene;
+		modifiers_foreachObjectLink(ob, modifier_walk, &data);
+	}
+	if (ob->constraints.first != NULL) {
+		BuilderWalkUserData data;
+		data.builder = this;
+		data.scene = scene;
+		/* Modifier links are walked by the branch above; only constraint IDs here. */
+		BKE_constraints_id_loop(&ob->constraints, constraint_walk, &data);
+	}
+
+	/* Object data. */
 	if (ob->data) {
 		/* type-specific data... */
 		switch (ob->type) {
@@ -428,15 +424,6 @@ void DepsgraphNodeBuilder::build_object(Scene *scene, Base *base, Object *ob)
 			case OB_SURF:
 			case OB_MBALL:
 			case OB_LATTICE:
-			{
-				/* TODO(sergey): This way using this object's
-				 * properties as driver target works fine.
-				 *
-				 * Does this depend on other nodes?
-				 */
-				add_operation_node(&ob->id, DEPSNODE_TYPE_PARAMETERS, DEPSOP_TYPE_POST, NULL,
-				                   DEG_OPCODE_PLACEHOLDER, "Parameters Eval");
-
 				build_obdata_geom(scene, ob);
 				/* TODO(sergey): Only for until we support granular
 				 * update of curves.
@@ -448,7 +435,6 @@ void DepsgraphNodeBuilder::build_object(Scene *scene, Base *base, Object *ob)
 					}
 				}
 				break;
-			}
 
 			case OB_ARMATURE: /* Pose */
 				if (ID_IS_LINKED_DATABLOCK(ob) && ob->proxy_from != NULL) {
@@ -558,14 +544,6 @@ void DepsgraphNodeBuilder::build_object_constraints(Scene *scene, Object *ob)
 	                   DEG_OPCODE_TRANSFORM_CONSTRAINTS);
 }
 
-void DepsgraphNodeBuilder::build_pose_constraints(Object *ob, bPoseChannel *pchan)
-{
-	/* create node for constraint stack */
-	add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-	                   DEPSOP_TYPE_EXEC, function_bind(BKE_pose_constraints_evaluate, _1, ob, pchan),
-	                   DEG_OPCODE_BONE_CONSTRAINTS);
-}
-
 /**
  * Build graph nodes for AnimData block
  * \param id: ID-Block which hosts the AnimData
@@ -593,7 +571,7 @@ void DepsgraphNodeBuilder::build_animdata(ID *id)
 		}
 
 		/* drivers */
-		for (FCurve *fcu = (FCurve *)adt->drivers.first; fcu; fcu = fcu->next) {
+		LINKLIST_FOREACH (FCurve *, fcu, &adt->drivers) {
 			/* create driver */
 			build_driver(id, fcu);
 		}
@@ -617,12 +595,17 @@ OperationDepsNode *DepsgraphNodeBuilder::build_driver(ID *id, FCurve *fcu)
 	OperationDepsNode *driver_op = find_operation_node(id,
 	                                                   DEPSNODE_TYPE_PARAMETERS,
 	                                                   DEG_OPCODE_DRIVER,
-	                                                   deg_fcurve_id_name(fcu));
+	                                                   fcu->rna_path,
+	                                                   fcu->array_index);
 
 	if (driver_op == NULL) {
-		driver_op = add_operation_node(id, DEPSNODE_TYPE_PARAMETERS,
-		                               DEPSOP_TYPE_EXEC, function_bind(BKE_animsys_eval_driver, _1, id, fcu),
-		                               DEG_OPCODE_DRIVER, deg_fcurve_id_name(fcu));
+		driver_op = add_operation_node(id,
+		                               DEPSNODE_TYPE_PARAMETERS,
+		                               DEPSOP_TYPE_EXEC,
+		                               function_bind(BKE_animsys_eval_driver, _1, id, fcu),
+		                               DEG_OPCODE_DRIVER,
+		                               fcu->rna_path,
+		                               fcu->array_index);
 	}
 
 	/* tag "scripted expression" drivers as needing Python (due to GIL issues, etc.) */
@@ -701,7 +684,7 @@ void DepsgraphNodeBuilder::build_rigidbody(Scene *scene)
 
 	/* objects - simulation participants */
 	if (rbw->group) {
-		for (GroupObject *go = (GroupObject *)rbw->group->gobject.first; go; go = go->next) {
+		LINKLIST_FOREACH (GroupObject *, go, &rbw->group->gobject) {
 			Object *ob = go->ob;
 
 			if (!ob || (ob->type != OB_MESH))
@@ -737,7 +720,7 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 	ComponentDepsNode *psys_comp = add_component_node(&ob->id, DEPSNODE_TYPE_EVAL_PARTICLES);
 
 	/* particle systems */
-	for (ParticleSystem *psys = (ParticleSystem *)ob->particlesystem.first; psys; psys = psys->next) {
+	LINKLIST_FOREACH (ParticleSystem *, psys, &ob->particlesystem) {
 		ParticleSettings *part = psys->part;
 
 		/* particle settings */
@@ -747,7 +730,11 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 		/* this particle system */
 		// TODO: for now, this will just be a placeholder "ubereval" node
 		add_operation_node(psys_comp,
-		                   DEPSOP_TYPE_EXEC, function_bind(BKE_particle_system_eval, _1, scene, ob, psys),
+		                   DEPSOP_TYPE_EXEC, function_bind(BKE_particle_system_eval,
+		                                                   _1,
+		                                                   scene,
+		                                                   ob,
+		                                                   psys),
 		                   DEG_OPCODE_PSYS_EVAL,
 		                   psys->name);
 	}
@@ -756,207 +743,6 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 	// TODO...
 }
 
-/* IK Solver Eval Steps */
-void DepsgraphNodeBuilder::build_ik_pose(Scene *scene, Object *ob, bPoseChannel *pchan, bConstraint *con)
-{
-	bKinematicConstraint *data = (bKinematicConstraint *)con->data;
-
-	/* Find the chain's root. */
-	bPoseChannel *rootchan = BKE_armature_ik_solver_find_root(pchan, data);
-
-	if (has_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
-	                       DEG_OPCODE_POSE_IK_SOLVER))
-	{
-		return;
-	}
-
-	/* Operation node for evaluating/running IK Solver. */
-	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
-	                   DEPSOP_TYPE_SIM, function_bind(BKE_pose_iktree_evaluate, _1, scene, ob, rootchan),
-	                   DEG_OPCODE_POSE_IK_SOLVER);
-}
-
-/* Spline IK Eval Steps */
-void DepsgraphNodeBuilder::build_splineik_pose(Scene *scene, Object *ob, bPoseChannel *pchan, bConstraint *con)
-{
-	bSplineIKConstraint *data = (bSplineIKConstraint *)con->data;
-
-	/* Find the chain's root. */
-	bPoseChannel *rootchan = BKE_armature_splineik_solver_find_root(pchan, data);
-
-	/* Operation node for evaluating/running Spline IK Solver.
-	 * Store the "root bone" of this chain in the solver, so it knows where to start.
-	 */
-	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
-	                   DEPSOP_TYPE_SIM, function_bind(BKE_pose_splineik_evaluate, _1, scene, ob, rootchan),
-	                   DEG_OPCODE_POSE_SPLINE_IK_SOLVER);
-}
-
-/* Pose/Armature Bones Graph */
-void DepsgraphNodeBuilder::build_rig(Scene *scene, Object *ob)
-{
-	bArmature *arm = (bArmature *)ob->data;
-
-	/* animation and/or drivers linking posebones to base-armature used to define them
-	 * NOTE: AnimData here is really used to control animated deform properties,
-	 *       which ideally should be able to be unique across different instances.
-	 *       Eventually, we need some type of proxy/isolation mechanism in-between here
-	 *       to ensure that we can use same rig multiple times in same scene...
-	 */
-	build_animdata(&arm->id);
-
-	/* Rebuild pose if not up to date. */
-	if (ob->pose == NULL || (ob->pose->flag & POSE_RECALC)) {
-		BKE_pose_rebuild(ob, arm);
-		/* XXX: Without this animation gets lost in certain circumstances
-		 * after loading file. Need to investigate further since it does
-		 * not happen with simple scenes..
-		 */
-		if (ob->adt) {
-			ob->adt->recalc |= ADT_RECALC_ANIM;
-		}
-	}
-
-	/* speed optimization for animation lookups */
-	if (ob->pose) {
-		BKE_pose_channels_hash_make(ob->pose);
-		if (ob->pose->flag & POSE_CONSTRAINTS_NEED_UPDATE_FLAGS) {
-			BKE_pose_update_constraint_flags(ob->pose);
-		}
-	}
-
-	/* Make sure pose is up-to-date with armature updates. */
-	add_operation_node(&arm->id,
-	                   DEPSNODE_TYPE_PARAMETERS,
-	                   DEPSOP_TYPE_EXEC,
-	                   NULL,
-	                   DEG_OPCODE_PLACEHOLDER,
-	                   "Armature Eval");
-
-	/**
-	 * Pose Rig Graph
-	 * ==============
-	 *
-	 * Pose Component:
-	 * - Mainly used for referencing Bone components.
-	 * - This is where the evaluation operations for init/exec/cleanup
-	 *   (ik) solvers live, and are later hooked up (so that they can be
-	 *   interleaved during runtime) with bone-operations they depend on/affect.
-	 * - init_pose_eval() and cleanup_pose_eval() are absolute first and last
-	 *   steps of pose eval process. ALL bone operations must be performed
-	 *   between these two...
-	 *
-	 * Bone Component:
-	 * - Used for representing each bone within the rig
-	 * - Acts to encapsulate the evaluation operations (base matrix + parenting,
-	 *   and constraint stack) so that they can be easily found.
-	 * - Everything else which depends on bone-results hook up to the component only
-	 *   so that we can redirect those to point at either the the post-IK/
-	 *   post-constraint/post-matrix steps, as needed.
-	 */
-
-	/* pose eval context */
-	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE,
-	                   DEPSOP_TYPE_INIT, function_bind(BKE_pose_eval_init, _1, scene, ob, ob->pose), DEG_OPCODE_POSE_INIT);
-
-	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE,
-	                   DEPSOP_TYPE_POST, function_bind(BKE_pose_eval_flush, _1, scene, ob, ob->pose), DEG_OPCODE_POSE_DONE);
-
-	/* bones */
-	for (bPoseChannel *pchan = (bPoseChannel *)ob->pose->chanbase.first; pchan; pchan = pchan->next) {
-		/* node for bone eval */
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_INIT, NULL, // XXX: BKE_pose_eval_bone_local
-		                   DEG_OPCODE_BONE_LOCAL);
-
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_EXEC, function_bind(BKE_pose_eval_bone, _1, scene, ob, pchan), // XXX: BKE_pose_eval_bone_pose
-		                   DEG_OPCODE_BONE_POSE_PARENT);
-
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_OUT, NULL, /* NOTE: dedicated noop for easier relationship construction */
-		                   DEG_OPCODE_BONE_READY);
-
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_POST, function_bind(BKE_pose_bone_done, _1, pchan),
-		                   DEG_OPCODE_BONE_DONE);
-
-		/* constraints */
-		if (pchan->constraints.first != NULL) {
-			build_pose_constraints(ob, pchan);
-		}
-
-		/**
-		 * IK Solvers...
-		 *
-		 * - These require separate processing steps are pose-level
-		 *   to be executed between chains of bones (i.e. once the
-		 *   base transforms of a bunch of bones is done)
-		 *
-		 * Unsolved Issues:
-		 * - Care is needed to ensure that multi-headed trees work out the same as in ik-tree building
-		 * - Animated chain-lengths are a problem...
-		 */
-		for (bConstraint *con = (bConstraint *)pchan->constraints.first; con; con = con->next) {
-			switch (con->type) {
-				case CONSTRAINT_TYPE_KINEMATIC:
-					build_ik_pose(scene, ob, pchan, con);
-					break;
-
-				case CONSTRAINT_TYPE_SPLINEIK:
-					build_splineik_pose(scene, ob, pchan, con);
-					break;
-
-				default:
-					break;
-			}
-		}
-	}
-}
-
-void DepsgraphNodeBuilder::build_proxy_rig(Object *ob)
-{
-	ID *obdata = (ID *)ob->data;
-	build_animdata(obdata);
-
-	BLI_assert(ob->pose != NULL);
-
-	/* speed optimization for animation lookups */
-	BKE_pose_channels_hash_make(ob->pose);
-	if (ob->pose->flag & POSE_CONSTRAINTS_NEED_UPDATE_FLAGS) {
-		BKE_pose_update_constraint_flags(ob->pose);
-	}
-
-	add_operation_node(&ob->id,
-	                   DEPSNODE_TYPE_EVAL_POSE,
-	                   DEPSOP_TYPE_INIT,
-	                   function_bind(BKE_pose_eval_proxy_copy, _1, ob),
-	                   DEG_OPCODE_POSE_INIT);
-
-	for (bPoseChannel *pchan = (bPoseChannel *)ob->pose->chanbase.first;
-	     pchan != NULL;
-	     pchan = pchan->next)
-	{
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_INIT, NULL,
-		                   DEG_OPCODE_BONE_LOCAL);
-
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_EXEC, NULL,
-		                   DEG_OPCODE_BONE_READY);
-
-		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
-		                   DEPSOP_TYPE_POST, NULL,
-		                   DEG_OPCODE_BONE_DONE);
-	}
-
-	add_operation_node(&ob->id,
-	                   DEPSNODE_TYPE_EVAL_POSE,
-	                   DEPSOP_TYPE_POST,
-	                   NULL,
-	                   DEG_OPCODE_POSE_DONE);
-}
-
 /* Shapekeys */
 void DepsgraphNodeBuilder::build_shapekeys(Key *key)
 {
@@ -972,6 +758,18 @@ void DepsgraphNodeBuilder::build_obdata_geom(Scene *scene, Object *ob)
 {
 	ID *obdata = (ID *)ob->data;
 
+	/* TODO(sergey): This way using this object's properties as driver target
+	 * works fine.
+	 *
+	 * Does this depend on other nodes?
+	 */
+	add_operation_node(&ob->id,
+	                   DEPSNODE_TYPE_PARAMETERS,
+	                   DEPSOP_TYPE_POST,
+	                   NULL,
+	                   DEG_OPCODE_PLACEHOLDER,
+	                   "Parameters Eval");
+
 	/* Temporary uber-update node, which does everything.
 	 * It is for the being we're porting old dependencies into the new system.
 	 * We'll get rid of this node as soon as all the granular update functions
@@ -979,39 +777,42 @@ void DepsgraphNodeBuilder::build_obdata_geom(Scene *scene, Object *ob)
 	 *
 	 * TODO(sergey): Get rid of this node.
 	 */
-	add_operation_node(&ob->id, DEPSNODE_TYPE_GEOMETRY,
-	                   DEPSOP_TYPE_POST, function_bind(BKE_object_eval_uber_data, _1, scene, ob),
+	add_operation_node(&ob->id,
+	                   DEPSNODE_TYPE_GEOMETRY,
+	                   DEPSOP_TYPE_POST,
+	                   function_bind(BKE_object_eval_uber_data, _1, scene, ob),
 	                   DEG_OPCODE_GEOMETRY_UBEREVAL);
 
-	add_operation_node(&ob->id, DEPSNODE_TYPE_GEOMETRY,
-	                   DEPSOP_TYPE_INIT, NULL,
-	                   DEG_OPCODE_PLACEHOLDER, "Eval Init");
+	add_operation_node(&ob->id,
+	                   DEPSNODE_TYPE_GEOMETRY,
+	                   DEPSOP_TYPE_INIT,
+	                   NULL,
+	                   DEG_OPCODE_PLACEHOLDER,
+	                   "Eval Init");
 
 	// TODO: "Done" operation
 
 	/* Modifiers */
-	if (ob->modifiers.first) {
-		ModifierData *md;
-
-		for (md = (ModifierData *)ob->modifiers.first; md; md = md->next) {
-			add_operation_node(&ob->id, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_EXEC, function_bind(BKE_object_eval_modifier, _1, scene, ob, md),
-			                   DEG_OPCODE_GEOMETRY_MODIFIER, md->name);
-		}
+	LINKLIST_FOREACH (ModifierData *, md, &ob->modifiers) {
+		add_operation_node(&ob->id,
+		                   DEPSNODE_TYPE_GEOMETRY,
+		                   DEPSOP_TYPE_EXEC,
+		                   function_bind(BKE_object_eval_modifier,
+		                                 _1,
+		                                 scene,
+		                                 ob,
+		                                 md),
+		                   DEG_OPCODE_GEOMETRY_MODIFIER,
+		                   md->name);
 	}
 
 	/* materials */
-	if (ob->totcol) {
-		int a;
-
-		for (a = 1; a <= ob->totcol; a++) {
-			Material *ma = give_current_material(ob, a);
-
-			if (ma) {
-				// XXX?!
-				ComponentDepsNode *geom_node = add_component_node(&ob->id, DEPSNODE_TYPE_GEOMETRY);
-				build_material(geom_node, ma);
-			}
+	for (int a = 1; a <= ob->totcol; a++) {
+		Material *ma = give_current_material(ob, a);
+		if (ma != NULL) {
+			// XXX?!
+			ComponentDepsNode *geom_node = add_component_node(&ob->id, DEPSNODE_TYPE_GEOMETRY);
+			build_material(geom_node, ma);
 		}
 	}
 
@@ -1032,16 +833,23 @@ void DepsgraphNodeBuilder::build_obdata_geom(Scene *scene, Object *ob)
 
 	build_animdata(obdata);
 
-	/* nodes for result of obdata's evaluation, and geometry evaluation on object */
+	/* Nodes for result of obdata's evaluation, and geometry
+	 * evaluation on object.
+	 */
 	switch (ob->type) {
 		case OB_MESH:
 		{
 			//Mesh *me = (Mesh *)ob->data;
 
 			/* evaluation operations */
-			add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_INIT, function_bind(BKE_mesh_eval_geometry, _1, (Mesh *)obdata),
-			                   DEG_OPCODE_PLACEHOLDER, "Geometry Eval");
+			add_operation_node(obdata,
+			                   DEPSNODE_TYPE_GEOMETRY,
+			                   DEPSOP_TYPE_INIT,
+			                   function_bind(BKE_mesh_eval_geometry,
+			                                 _1,
+			                                 (Mesh *)obdata),
+			                   DEG_OPCODE_PLACEHOLDER,
+			                   "Geometry Eval");
 			break;
 		}
 
@@ -1049,48 +857,76 @@ void DepsgraphNodeBuilder::build_obdata_geom(Scene *scene, Object *ob)
 		{
 			Object *mom = BKE_mball_basis_find(scene, ob);
 
-			/* motherball - mom depends on children! */
+			/* Motherball - mom depends on children! */
 			if (mom == ob) {
 				/* metaball evaluation operations */
 				/* NOTE: only the motherball gets evaluated! */
-				add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-				                   DEPSOP_TYPE_INIT, function_bind(BKE_mball_eval_geometry, _1, (MetaBall *)obdata),
-				                   DEG_OPCODE_PLACEHOLDER, "Geometry Eval");
+				add_operation_node(obdata,
+				                   DEPSNODE_TYPE_GEOMETRY,
+				                   DEPSOP_TYPE_INIT,
+				                   function_bind(BKE_mball_eval_geometry,
+				                                 _1,
+				                                 (MetaBall *)obdata),
+				                   DEG_OPCODE_PLACEHOLDER,
+				                   "Geometry Eval");
 			}
 			break;
 		}
 
 		case OB_CURVE:
+		case OB_SURF:
 		case OB_FONT:
 		{
-			/* curve evaluation operations */
+			/* Curve/nurbs evaluation operations. */
 			/* - calculate curve geometry (including path) */
-			add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_INIT, function_bind(BKE_curve_eval_geometry, _1, (Curve *)obdata),
-			                   DEG_OPCODE_PLACEHOLDER, "Geometry Eval");
-
-			/* - calculate curve path - this is used by constraints, etc. */
-			add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_EXEC, function_bind(BKE_curve_eval_path, _1, (Curve *)obdata),
-			                   DEG_OPCODE_GEOMETRY_PATH, "Path");
-			break;
-		}
+			add_operation_node(obdata,
+			                   DEPSNODE_TYPE_GEOMETRY,
+			                   DEPSOP_TYPE_INIT,
+			                   function_bind(BKE_curve_eval_geometry,
+			                                 _1,
+			                                 (Curve *)obdata),
+			                   DEG_OPCODE_PLACEHOLDER,
+			                   "Geometry Eval");
+
+			/* Calculate curve path - this is used by constraints, etc. */
+			if (ELEM(ob->type, OB_CURVE, OB_FONT)) {
+				add_operation_node(obdata,
+				                   DEPSNODE_TYPE_GEOMETRY,
+				                   DEPSOP_TYPE_EXEC,
+				                   function_bind(BKE_curve_eval_path,
+				                                 _1,
+				                                 (Curve *)obdata),
+				                   DEG_OPCODE_GEOMETRY_PATH,
+				                   "Path");
+			}
 
-		case OB_SURF: /* Nurbs Surface */
-		{
-			/* nurbs evaluation operations */
-			add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_INIT, function_bind(BKE_curve_eval_geometry, _1, (Curve *)obdata),
-			                   DEG_OPCODE_PLACEHOLDER, "Geometry Eval");
+			/* Make sure objects used for bevel/taper are in the graph.
+			 * NOTE: These objects might not be linked to the scene.
+			 */
+			Curve *cu = (Curve *)obdata;
+			if (cu->bevobj != NULL) {
+				build_object(scene, NULL, cu->bevobj);
+			}
+			if (cu->taperobj != NULL) {
+				build_object(scene, NULL, cu->taperobj);
+			}
+			if (ob->type == OB_FONT && cu->textoncurve != NULL) {
+				build_object(scene, NULL, cu->textoncurve);
+			}
 			break;
 		}
 
-		case OB_LATTICE: /* Lattice */
+		case OB_LATTICE:
 		{
-			/* lattice evaluation operations */
-			add_operation_node(obdata, DEPSNODE_TYPE_GEOMETRY,
-			                   DEPSOP_TYPE_INIT, function_bind(BKE_lattice_eval_geometry, _1, (Lattice *)obdata),
-			                   DEG_OPCODE_PLACEHOLDER, "Geometry Eval");
+			/* Lattice evaluation operations. */
+			add_operation_node(obdata,
+			                   DEPSNODE_TYPE_GEOMETRY,
+			                   DEPSOP_TYPE_INIT,
+			                   function_bind(BKE_lattice_eval_geometry,
+			                                 _1,
+			                                 (Lattice *)obdata),
+			                   DEG_OPCODE_PLACEHOLDER,
+			                   "Geometry Eval");
 			break;
 		}
 	}
@@ -1170,16 +1006,21 @@ void DepsgraphNodeBuilder::build_nodetree(DepsNode *owner_node, bNodeTree *ntree
 	                   DEG_OPCODE_PLACEHOLDER, "Parameters Eval");
 
 	/* nodetree's nodes... */
-	for (bNode *bnode = (bNode *)ntree->nodes.first; bnode; bnode = bnode->next) {
-		if (bnode->id) {
-			if (GS(bnode->id->name) == ID_MA) {
-				build_material(owner_node, (Material *)bnode->id);
+	LINKLIST_FOREACH (bNode *, bnode, &ntree->nodes) {
+		ID *id = bnode->id;
+		if (id != NULL) {
+			short id_type = GS(id->name);
+			if (id_type == ID_MA) {
+				build_material(owner_node, (Material *)id);
 			}
-			else if (bnode->type == ID_TE) {
-				build_texture(owner_node, (Tex *)bnode->id);
+			else if (id_type == ID_TE) {
+				build_texture(owner_node, (Tex *)id);
+			}
+			else if (id_type == ID_IM) {
+				build_image((Image *)id);
 			}
 			else if (bnode->type == NODE_GROUP) {
-				bNodeTree *group_ntree = (bNodeTree *)bnode->id;
+				bNodeTree *group_ntree = (bNodeTree *)id;
 				if ((group_ntree->id.tag & LIB_TAG_DOIT) == 0) {
 					build_nodetree(owner_node, group_ntree);
 				}
@@ -1236,10 +1077,33 @@ void DepsgraphNodeBuilder::build_texture(DepsNode *owner_node, Tex *tex)
 		return;
 	}
 	tex_id->tag |= LIB_TAG_DOIT;
-	/* texture itself */
+	/* Texture itself. */
 	build_animdata(tex_id);
-	/* texture's nodetree */
+	/* Texture's nodetree. */
 	build_nodetree(owner_node, tex->nodetree);
+	/* Special cases for different IDs which texture uses. */
+	if (tex->type == TEX_IMAGE) {
+		if (tex->ima != NULL) {
+			build_image(tex->ima);
+		}
+	}
+}
+
+void DepsgraphNodeBuilder::build_image(Image *image) {
+	ID *image_id = &image->id;
+	if (image_id->tag & LIB_TAG_DOIT) {
+		return;
+	}
+	image_id->tag |= LIB_TAG_DOIT;
+	/* Image ID node itself. */
+	add_id_node(image_id);
+	/* Placeholder so we can add relations and tag ID node for update. */
+	add_operation_node(image_id,
+	                   DEPSNODE_TYPE_PARAMETERS,
+	                   DEPSOP_TYPE_EXEC,
+	                   NULL,
+	                   DEG_OPCODE_PLACEHOLDER,
+	                   "Image Eval");
 }
 
 void DepsgraphNodeBuilder::build_compositor(Scene *scene)
@@ -1273,7 +1137,6 @@ void DepsgraphNodeBuilder::build_cachefile(CacheFile *cache_file)
 	ID *cache_file_id = &cache_file->id;
 
 	add_component_node(cache_file_id, DEPSNODE_TYPE_CACHE);
-
 	add_operation_node(cache_file_id, DEPSNODE_TYPE_CACHE,
 	                   DEPSOP_TYPE_EXEC, NULL,
 	                   DEG_OPCODE_PLACEHOLDER, "Cache File Update");
@@ -1282,4 +1145,17 @@ void DepsgraphNodeBuilder::build_cachefile(CacheFile *cache_file)
 	build_animdata(cache_file_id);
 }
 
+void DepsgraphNodeBuilder::build_mask(Mask *mask)
+{
+	ID *mask_id = &mask->id;
+	add_id_node(mask_id);
+	build_animdata(mask_id);
+}
+
+void DepsgraphNodeBuilder::build_movieclip(MovieClip *clip) {
+	ID *clip_id = &clip->id;
+	add_id_node(clip_id);
+	build_animdata(clip_id);
+}
+
 }  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes.h b/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
index f378f07..9cb8bc5 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
@@ -38,12 +38,15 @@ struct bGPdata;
 struct ListBase;
 struct GHash;
 struct ID;
+struct Image;
 struct FCurve;
 struct Group;
 struct Key;
 struct Main;
 struct Material;
+struct Mask;
 struct MTex;
+struct MovieClip;
 struct bNodeTree;
 struct Object;
 struct bPoseChannel;
@@ -75,43 +78,49 @@ struct DepsgraphNodeBuilder {
 
 	ComponentDepsNode *add_component_node(ID *id,
 	                                      eDepsNode_Type comp_type,
-	                                      const string& comp_name = "");
+	                                      const char *comp_name = "");
 
 	OperationDepsNode *add_operation_node(ComponentDepsNode *comp_node,
 	                                      eDepsOperation_Type optype,
 	                                      DepsEvalOperationCb op,
 	                                      eDepsOperation_Code opcode,
-	                                      const string& description = "");
+	                                      const char *name = "",
+	                                      int name_tag = -1);
 	OperationDepsNode *add_operation_node(ID *id,
 	                                      eDepsNode_Type comp_type,
-	                                      const string& comp_name,
+	                                      const char *comp_name,
 	                                      eDepsOperation_Type optype,
 	                                      DepsEvalOperationCb op,
 	                                      eDepsOperation_Code opcode,
-	                                      const string& description = "");
+	                                      const char *name = "",
+	                                      int name_tag = -1);
 	OperationDepsNode *add_operation_node(ID *id,
 	                                      eDepsNode_Type comp_type,
 	                                      eDepsOperation_Type optype,
 	                                      DepsEvalOperationCb op,
 	                                      eDepsOperation_Code opcode,
-	                                      const string& description = "");
+	                                      const char *name = "",
+	                                      int name_tag = -1);
 
 	bool has_operation_node(ID *id,
 	                        eDepsNode_Type comp_type,
-	                        const string& comp_name,
+	                        const char *comp_name,
 	                        eDepsOperation_Code opcode,
-	                        const string& description = "");
+	                        const char *name = "",
+	                        int name_tag = -1);
 
 	OperationDepsNode *find_operation_node(ID *id,
 	                                       eDepsNode_Type comp_type,
-	                                       const string &comp_name,
+	                                       const char *comp_name,
 	                                       eDepsOperation_Code opcode,
-	                                       const string &description = "");
+	                                       const char *name = "",
+	                                       int name_tag = -1);
 
 	OperationDepsNode *find_operation_node(ID *id,
 	                                       eDepsNode_Type comp_type,
 	                                       eDepsOperation_Code opcode,
-	                                       const string &description = "");
+	                                       const char *name = "",
+	                                       int name_tag = -1);
 
 	void build_scene(Main *bmain, Scene *scene);
 	SubgraphDepsNode *build_subgraph(Group *group);
@@ -142,10 +151,13 @@ struct DepsgraphNodeBuilder {
 	void build_material(DepsNode *owner_node, Material *ma);
 	void build_texture(DepsNode *owner_node, Tex *tex);
 	void build_texture_stack(DepsNode *owner_node, MTex **texture_stack);
+	void build_image(Image *image);
 	void build_world(World *world);
 	void build_compositor(Scene *scene);
 	void build_gpencil(bGPdata *gpd);
 	void build_cachefile(CacheFile *cache_file);
+	void build_mask(Mask *mask);
+	void build_movieclip(MovieClip *clip);
 
 protected:
 	Main *m_bmain;
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes_rig.cc b/source/blender/depsgraph/intern/builder/deg_builder_nodes_rig.cc
new file mode 100644
index 0000000..4a5f3dc
--- /dev/null
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes_rig.cc
@@ -0,0 +1,273 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2013 Blender Foundation.
+ * All rights reserved.
+ *
+ * Original Author: Joshua Leung
+ * Contributor(s): Based on original depsgraph.c code - Blender Foundation (2005-2013)
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/depsgraph/intern/builder/deg_builder_nodes_rig.cc
+ *  \ingroup depsgraph
+ *
+ * Methods for constructing depsgraph's nodes
+ */
+
+#include "intern/builder/deg_builder_nodes.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "MEM_guardedalloc.h"
+
+extern "C" {
+#include "BLI_blenlib.h"
+#include "BLI_string.h"
+#include "BLI_utildefines.h"
+
+#include "DNA_anim_types.h"
+#include "DNA_armature_types.h"
+#include "DNA_constraint_types.h"
+#include "DNA_object_types.h"
+
+#include "BKE_action.h"
+#include "BKE_armature.h"
+
+#include "DEG_depsgraph.h"
+#include "DEG_depsgraph_build.h"
+} /* extern "C" */
+
+#include "intern/builder/deg_builder.h"
+#include "intern/nodes/deg_node.h"
+#include "intern/nodes/deg_node_component.h"
+#include "intern/nodes/deg_node_operation.h"
+#include "intern/depsgraph_types.h"
+#include "intern/depsgraph_intern.h"
+#include "util/deg_util_foreach.h"
+
+namespace DEG {
+
+void DepsgraphNodeBuilder::build_pose_constraints(Object *ob, bPoseChannel *pchan)
+{
+	/* create node for constraint stack */
+	add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+	                   DEPSOP_TYPE_EXEC, function_bind(BKE_pose_constraints_evaluate, _1, ob, pchan),
+	                   DEG_OPCODE_BONE_CONSTRAINTS);
+}
+
+/* IK Solver Eval Steps */
+void DepsgraphNodeBuilder::build_ik_pose(Scene *scene, Object *ob, bPoseChannel *pchan, bConstraint *con)
+{
+	bKinematicConstraint *data = (bKinematicConstraint *)con->data;
+
+	/* Find the chain's root. */
+	bPoseChannel *rootchan = BKE_armature_ik_solver_find_root(pchan, data);
+
+	if (has_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
+	                       DEG_OPCODE_POSE_IK_SOLVER))
+	{
+		return;
+	}
+
+	/* Operation node for evaluating/running IK Solver. */
+	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
+	                   DEPSOP_TYPE_SIM, function_bind(BKE_pose_iktree_evaluate, _1, scene, ob, rootchan),
+	                   DEG_OPCODE_POSE_IK_SOLVER);
+}
+
+/* Spline IK Eval Steps */
+void DepsgraphNodeBuilder::build_splineik_pose(Scene *scene, Object *ob, bPoseChannel *pchan, bConstraint *con)
+{
+	bSplineIKConstraint *data = (bSplineIKConstraint *)con->data;
+
+	/* Find the chain's root. */
+	bPoseChannel *rootchan = BKE_armature_splineik_solver_find_root(pchan, data);
+
+	/* Operation node for evaluating/running Spline IK Solver.
+	 * Store the "root bone" of this chain in the solver, so it knows where to start.
+	 */
+	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name,
+	                   DEPSOP_TYPE_SIM, function_bind(BKE_pose_splineik_evaluate, _1, scene, ob, rootchan),
+	                   DEG_OPCODE_POSE_SPLINE_IK_SOLVER);
+}
+
+/* Pose/Armature Bones Graph */
+void DepsgraphNodeBuilder::build_rig(Scene *scene, Object *ob)
+{
+	bArmature *arm = (bArmature *)ob->data;
+
+	/* animation and/or drivers linking posebones to base-armature used to define them
+	 * NOTE: AnimData here is really used to control animated deform properties,
+	 *       which ideally should be able to be unique across different instances.
+	 *       Eventually, we need some type of proxy/isolation mechanism in-between here
+	 *       to ensure that we can use same rig multiple times in same scene...
+	 */
+	build_animdata(&arm->id);
+
+	/* Rebuild pose if not up to date. */
+	if (ob->pose == NULL || (ob->pose->flag & POSE_RECALC)) {
+		BKE_pose_rebuild_ex(ob, arm, false);
+		/* XXX: Without this animation gets lost in certain circumstances
+		 * after loading file. Need to investigate further since it does
+		 * not happen with simple scenes..
+		 */
+		if (ob->adt) {
+			ob->adt->recalc |= ADT_RECALC_ANIM;
+		}
+	}
+
+	/* speed optimization for animation lookups */
+	if (ob->pose) {
+		BKE_pose_channels_hash_make(ob->pose);
+		if (ob->pose->flag & POSE_CONSTRAINTS_NEED_UPDATE_FLAGS) {
+			BKE_pose_update_constraint_flags(ob->pose);
+		}
+	}
+
+	/* Make sure pose is up-to-date with armature updates. */
+	add_operation_node(&arm->id,
+	                   DEPSNODE_TYPE_PARAMETERS,
+	                   DEPSOP_TYPE_EXEC,
+	                   NULL,
+	                   DEG_OPCODE_PLACEHOLDER,
+	                   "Armature Eval");
+
+	/**
+	 * Pose Rig Graph
+	 * ==============
+	 *
+	 * Pose Component:
+	 * - Mainly used for referencing Bone components.
+	 * - This is where the evaluation operations for init/exec/cleanup
+	 *   (ik) solvers live, and are later hooked up (so that they can be
+	 *   interleaved during runtime) with bone-operations they depend on/affect.
+	 * - init_pose_eval() and cleanup_pose_eval() are absolute first and last
+	 *   steps of pose eval process. ALL bone operations must be performed
+	 *   between these two...
+	 *
+	 * Bone Component:
+	 * - Used for representing each bone within the rig
+	 * - Acts to encapsulate the evaluation operations (base matrix + parenting,
+	 *   and constraint stack) so that they can be easily found.
+	 * - Everything else which depends on bone-results hook up to the component only
+	 *   so that we can redirect those to point at either the post-IK/
+	 *   post-constraint/post-matrix steps, as needed.
+	 */
+
+	/* pose eval context */
+	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE,
+	                   DEPSOP_TYPE_INIT, function_bind(BKE_pose_eval_init, _1, scene, ob, ob->pose), DEG_OPCODE_POSE_INIT);
+
+	add_operation_node(&ob->id, DEPSNODE_TYPE_EVAL_POSE,
+	                   DEPSOP_TYPE_POST, function_bind(BKE_pose_eval_flush, _1, scene, ob, ob->pose), DEG_OPCODE_POSE_DONE);
+
+	/* bones */
+	LINKLIST_FOREACH (bPoseChannel *, pchan, &ob->pose->chanbase) {
+		/* node for bone eval */
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_INIT, NULL, // XXX: BKE_pose_eval_bone_local
+		                   DEG_OPCODE_BONE_LOCAL);
+
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_EXEC, function_bind(BKE_pose_eval_bone, _1, scene, ob, pchan), // XXX: BKE_pose_eval_bone_pose
+		                   DEG_OPCODE_BONE_POSE_PARENT);
+
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_OUT, NULL, /* NOTE: dedicated noop for easier relationship construction */
+		                   DEG_OPCODE_BONE_READY);
+
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_POST, function_bind(BKE_pose_bone_done, _1, pchan),
+		                   DEG_OPCODE_BONE_DONE);
+
+		/* constraints */
+		if (pchan->constraints.first != NULL) {
+			build_pose_constraints(ob, pchan);
+		}
+
+		/**
+		 * IK Solvers...
+		 *
+		 * - These require separate processing steps at pose-level,
+		 *   to be executed between chains of bones (i.e. once the
+		 *   base transforms of a bunch of bones are done)
+		 *
+		 * Unsolved Issues:
+		 * - Care is needed to ensure that multi-headed trees work out the same as in ik-tree building
+		 * - Animated chain-lengths are a problem...
+		 */
+		LINKLIST_FOREACH (bConstraint *, con, &pchan->constraints) {
+			switch (con->type) {
+				case CONSTRAINT_TYPE_KINEMATIC:
+					build_ik_pose(scene, ob, pchan, con);
+					break;
+
+				case CONSTRAINT_TYPE_SPLINEIK:
+					build_splineik_pose(scene, ob, pchan, con);
+					break;
+
+				default:
+					break;
+			}
+		}
+	}
+}
+
+void DepsgraphNodeBuilder::build_proxy_rig(Object *ob)
+{
+	ID *obdata = (ID *)ob->data;
+	build_animdata(obdata);
+
+	BLI_assert(ob->pose != NULL);
+
+	/* speed optimization for animation lookups */
+	BKE_pose_channels_hash_make(ob->pose);
+	if (ob->pose->flag & POSE_CONSTRAINTS_NEED_UPDATE_FLAGS) {
+		BKE_pose_update_constraint_flags(ob->pose);
+	}
+
+	add_operation_node(&ob->id,
+	                   DEPSNODE_TYPE_EVAL_POSE,
+	                   DEPSOP_TYPE_INIT,
+	                   function_bind(BKE_pose_eval_proxy_copy, _1, ob),
+	                   DEG_OPCODE_POSE_INIT);
+
+	LINKLIST_FOREACH (bPoseChannel *, pchan, &ob->pose->chanbase) {
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_INIT, NULL,
+		                   DEG_OPCODE_BONE_LOCAL);
+
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_EXEC, NULL,
+		                   DEG_OPCODE_BONE_READY);
+
+		add_operation_node(&ob->id, DEPSNODE_TYPE_BONE, pchan->name,
+		                   DEPSOP_TYPE_POST, NULL,
+		                   DEG_OPCODE_BONE_DONE);
+	}
+
+	add_operation_node(&ob->id,
+	                   DEPSNODE_TYPE_EVAL_POSE,
+	                   DEPSOP_TYPE_POST,
+	                   NULL,
+	                   DEG_OPCODE_POSE_DONE);
+}
+
+}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc b/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
new file mode 100644
index 0000000..bcd4bc5
--- /dev/null
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
@@ -0,0 +1,159 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2013 Blender Foundation.
+ * All rights reserved.
+ *
+ * Original Author: Joshua Leung
+ * Contributor(s): Based on original depsgraph.c code - Blender Foundation (2005-2013)
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
+ *  \ingroup depsgraph
+ *
+ * Methods for constructing depsgraph's nodes
+ */
+
+#include "intern/builder/deg_builder_nodes.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "MEM_guardedalloc.h"
+
+extern "C" {
+#include "BLI_blenlib.h"
+#include "BLI_string.h"
+#include "BLI_utildefines.h"
+
+#include "DNA_node_types.h"
+#include "DNA_object_types.h"
+#include "DNA_scene_types.h"
+
+#include "BKE_main.h"
+#include "BKE_node.h"
+
+#include "DEG_depsgraph.h"
+#include "DEG_depsgraph_build.h"
+} /* extern "C" */
+
+#include "intern/builder/deg_builder.h"
+#include "intern/nodes/deg_node.h"
+#include "intern/nodes/deg_node_component.h"
+#include "intern/nodes/deg_node_operation.h"
+#include "intern/depsgraph_types.h"
+#include "intern/depsgraph_intern.h"
+#include "util/deg_util_foreach.h"
+
+namespace DEG {
+
+void DepsgraphNodeBuilder::build_scene(Main *bmain, Scene *scene)
+{
+	/* LIB_TAG_DOIT is used to indicate whether node for given ID was already
+	 * created or not. This flag is being set in add_id_node(), so functions
+	 * shouldn't bother with setting it, they only might query this flag when
+	 * needed.
+	 */
+	BKE_main_id_tag_all(bmain, LIB_TAG_DOIT, false);
+	/* XXX nested node trees are not included in tag-clearing above,
+	 * so we need to do this manually.
+	 */
+	FOREACH_NODETREE(bmain, nodetree, id) {
+		if (id != (ID *)nodetree)
+			nodetree->id.tag &= ~LIB_TAG_DOIT;
+	} FOREACH_NODETREE_END
+
+	/* scene ID block */
+	add_id_node(&scene->id);
+
+	/* timesource */
+	add_time_source(NULL);
+
+	/* build subgraph for set, and link this in... */
+	// XXX: depending on how this goes, that scene itself could probably store its
+	//      own little partial depsgraph?
+	if (scene->set) {
+		build_scene(bmain, scene->set);
+	}
+
+	/* scene objects */
+	LINKLIST_FOREACH (Base *, base, &scene->base) {
+		Object *ob = base->object;
+
+		/* object itself */
+		build_object(scene, base, ob);
+
+		/* object that this is a proxy for */
+		// XXX: the way that proxies work needs to be completely reviewed!
+		if (ob->proxy) {
+			ob->proxy->proxy_from = ob;
+			build_object(scene, base, ob->proxy);
+		}
+
+		/* Object dupligroup. */
+		if (ob->dup_group) {
+			build_group(scene, base, ob->dup_group);
+		}
+	}
+
+	/* rigidbody */
+	if (scene->rigidbody_world) {
+		build_rigidbody(scene);
+	}
+
+	/* scene's animation and drivers */
+	if (scene->adt) {
+		build_animdata(&scene->id);
+	}
+
+	/* world */
+	if (scene->world) {
+		build_world(scene->world);
+	}
+
+	/* compo nodes */
+	if (scene->nodetree) {
+		build_compositor(scene);
+	}
+
+	/* sequencer */
+	// XXX...
+
+	/* grease pencil */
+	if (scene->gpd) {
+		build_gpencil(scene->gpd);
+	}
+
+	/* Cache file. */
+	LINKLIST_FOREACH (CacheFile *, cachefile, &bmain->cachefiles) {
+		build_cachefile(cachefile);
+	}
+
+	/* Masks. */
+	LINKLIST_FOREACH (Mask *, mask, &bmain->mask) {
+		build_mask(mask);
+	}
+
+	/* Movie clips. */
+	LINKLIST_FOREACH (MovieClip *, clip, &bmain->movieclip) {
+		build_movieclip(clip);
+	}
+}
+
+}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
index 2148a35..b5272d3 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
@@ -34,13 +34,12 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
+#include <cstring>  /* required for STREQ later on. */
 
 #include "MEM_guardedalloc.h"
 
 extern "C" {
 #include "BLI_blenlib.h"
-#include "BLI_string.h"
 #include "BLI_utildefines.h"
 
 #include "DNA_action_types.h"
@@ -56,8 +55,10 @@ extern "C" {
 #include "DNA_key_types.h"
 #include "DNA_lamp_types.h"
 #include "DNA_material_types.h"
+#include "DNA_mask_types.h"
 #include "DNA_mesh_types.h"
 #include "DNA_meta_types.h"
+#include "DNA_movieclip_types.h"
 #include "DNA_node_types.h"
 #include "DNA_particle_types.h"
 #include "DNA_object_types.h"
@@ -115,6 +116,32 @@ namespace DEG {
 /* ***************** */
 /* Relations Builder */
 
+/* TODO(sergey): This is somewhat weak, but we want neither false-positive
+ * time dependencies nor special exceptions in the depsgraph evaluation.
+ */
+static bool python_driver_depends_on_time(ChannelDriver *driver)
+{
+	if (driver->expression[0] == '\0') {
+		/* Empty expression depends on nothing. */
+		return false;
+	}
+	if (strchr(driver->expression, '(') != NULL) {
+		/* Function calls are considered dependent on time. */
+		return true;
+	}
+	if (strstr(driver->expression, "time") != NULL) {
+		/* Variable `time` depends on time. */
+		/* TODO(sergey): This is a bit weak, but not sure about better way of
+		 * handling this.
+		 */
+		return true;
+	}
+	/* Possible indirect time relations should be handled via variable
+	 * targets.
+	 */
+	return false;
+}
+
 /* **** General purpose functions ****  */
 
 RNAPathKey::RNAPathKey(ID *id, const char *path) :
@@ -185,10 +212,12 @@ OperationDepsNode *DepsgraphRelationBuilder::find_node(
 		return NULL;
 	}
 
-	OperationDepsNode *op_node = comp_node->find_operation(key.opcode, key.name);
+	OperationDepsNode *op_node = comp_node->find_operation(key.opcode,
+	                                                       key.name,
+	                                                       key.name_tag);
 	if (!op_node) {
 		fprintf(stderr, "find_node_operation: Failed for (%s, '%s')\n",
-		        DEG_OPNAMES[key.opcode], key.name.c_str());
+		        DEG_OPNAMES[key.opcode], key.name);
 	}
 	return op_node;
 }
@@ -210,7 +239,7 @@ OperationDepsNode *DepsgraphRelationBuilder::has_node(
 	if (!comp_node) {
 		return NULL;
 	}
-	return comp_node->has_operation(key.opcode, key.name);
+	return comp_node->has_operation(key.opcode, key.name, key.name_tag);
 }
 
 void DepsgraphRelationBuilder::add_time_relation(TimeSourceDepsNode *timesrc,
@@ -310,88 +339,6 @@ void DepsgraphRelationBuilder::add_forcefield_relations(const OperationKey &key,
 
 /* **** Functions to build relations between entities  **** */
 
-void DepsgraphRelationBuilder::build_scene(Main *bmain, Scene *scene)
-{
-	/* LIB_TAG_DOIT is used to indicate whether node for given ID was already
-	 * created or not.
-	 */
-	BKE_main_id_tag_all(bmain, LIB_TAG_DOIT, false);
-	/* XXX nested node trees are not included in tag-clearing above,
-	 * so we need to do this manually.
-	 */
-	FOREACH_NODETREE(bmain, nodetree, id) {
-		if (id != (ID *)nodetree)
-			nodetree->id.tag &= ~LIB_TAG_DOIT;
-	} FOREACH_NODETREE_END
-
-	if (scene->set) {
-		// TODO: link set to scene, especially our timesource...
-	}
-
-	/* scene objects */
-	for (Base *base = (Base *)scene->base.first; base; base = base->next) {
-		Object *ob = base->object;
-
-		/* object itself */
-		build_object(bmain, scene, ob);
-
-		/* object that this is a proxy for */
-		if (ob->proxy) {
-			ob->proxy->proxy_from = ob;
-			build_object(bmain, scene, ob->proxy);
-			/* TODO(sergey): This is an inverted relation, matches old depsgraph
-			 * behavior and need to be investigated if it still need to be inverted.
-			 */
-			ComponentKey ob_pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
-			ComponentKey proxy_pose_key(&ob->proxy->id, DEPSNODE_TYPE_EVAL_POSE);
-			add_relation(ob_pose_key, proxy_pose_key, DEPSREL_TYPE_TRANSFORM, "Proxy");
-		}
-
-		/* Object dupligroup. */
-		if (ob->dup_group) {
-			build_group(bmain, scene, ob, ob->dup_group);
-		}
-	}
-
-	/* rigidbody */
-	if (scene->rigidbody_world) {
-		build_rigidbody(scene);
-	}
-
-	/* scene's animation and drivers */
-	if (scene->adt) {
-		build_animdata(&scene->id);
-	}
-
-	/* world */
-	if (scene->world) {
-		build_world(scene->world);
-	}
-
-	/* compo nodes */
-	if (scene->nodetree) {
-		build_compositor(scene);
-	}
-
-	/* grease pencil */
-	if (scene->gpd) {
-		build_gpencil(&scene->id, scene->gpd);
-	}
-
-	for (Depsgraph::OperationNodes::const_iterator it_op = m_graph->operations.begin();
-	     it_op != m_graph->operations.end();
-	     ++it_op)
-	{
-		OperationDepsNode *node = *it_op;
-		IDDepsNode *id_node = node->owner->owner;
-		ID *id = id_node->id;
-		if (GS(id->name) == ID_OB) {
-			Object *object = (Object *)id;
-			object->customdata_mask |= node->customdata_mask;
-		}
-	}
-}
-
 void DepsgraphRelationBuilder::build_group(Main *bmain,
                                            Scene *scene,
                                            Object *object,
@@ -402,10 +349,7 @@ void DepsgraphRelationBuilder::build_group(Main *bmain,
 	OperationKey object_local_transform_key(&object->id,
 	                                        DEPSNODE_TYPE_TRANSFORM,
 	                                        DEG_OPCODE_TRANSFORM_LOCAL);
-	for (GroupObject *go = (GroupObject *)group->gobject.first;
-	     go != NULL;
-	     go = go->next)
-	{
+	LINKLIST_FOREACH (GroupObject *, go, &group->gobject) {
 		if (!group_done) {
 			build_object(bmain, scene, go->ob);
 		}
@@ -423,6 +367,7 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 	if (ob->id.tag & LIB_TAG_DOIT) {
 		return;
 	}
+	ob->id.tag |= LIB_TAG_DOIT;
 
 	/* Object Transforms */
 	eDepsOperation_Code base_op = (ob->parent) ? DEG_OPCODE_TRANSFORM_PARENT : DEG_OPCODE_TRANSFORM_LOCAL;
@@ -461,12 +406,23 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 		add_relation(ob_ubereval_key, final_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "Temp Ubereval");
 	}
 	else {
-		/* operation order */
-		add_relation(base_op_key, final_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "Object Transform");
-
-		// XXX
-		add_relation(base_op_key, ob_ubereval_key, DEPSREL_TYPE_COMPONENT_ORDER, "Temp Ubereval");
-		add_relation(ob_ubereval_key, final_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "Temp Ubereval");
+		/* NOTE: Keep an eye here, we skip some relations here to "streamline"
+		 * dependencies and avoid transitive relations which cause overhead.
+		 * But once we get rid of uber eval node this will need reconsideration.
+		 */
+		if (ob->rigidbody_object == NULL) {
+			/* Rigid body will hook up another node in between, so skip
+			 * relation here to avoid transitive relation.
+			 */
+			add_relation(base_op_key,
+			             ob_ubereval_key,
+			             DEPSREL_TYPE_COMPONENT_ORDER,
+			             "Temp Ubereval");
+		}
+		add_relation(ob_ubereval_key,
+		             final_transform_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "Temp Ubereval");
 	}
 
 
@@ -474,7 +430,7 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 	build_animdata(&ob->id);
 
 	// XXX: This should be hooked up by the build_animdata code
-	if (ob->adt && (ob->adt->action || ob->adt->nla_tracks.first)) {
+	if (needs_animdata_node(&ob->id)) {
 		ComponentKey adt_key(&ob->id, DEPSNODE_TYPE_ANIMATION);
 		add_relation(adt_key, local_transform_key, DEPSREL_TYPE_OPERATION, "Object Animation");
 	}
@@ -572,8 +528,20 @@ void DepsgraphRelationBuilder::build_object_parent(Object *ob)
 
 		case PARBONE: /* Bone Parent */
 		{
-			ComponentKey parent_key(&ob->parent->id, DEPSNODE_TYPE_BONE, ob->parsubstr);
-			add_relation(parent_key, ob_key, DEPSREL_TYPE_TRANSFORM, "Bone Parent");
+			ComponentKey parent_bone_key(&ob->parent->id,
+			                             DEPSNODE_TYPE_BONE,
+			                             ob->parsubstr);
+			OperationKey parent_transform_key(&ob->parent->id,
+			                                  DEPSNODE_TYPE_TRANSFORM,
+			                                  DEG_OPCODE_TRANSFORM_FINAL);
+			add_relation(parent_bone_key,
+			             ob_key,
+			             DEPSREL_TYPE_TRANSFORM,
+			             "Bone Parent");
+			add_relation(parent_transform_key,
+			             ob_key,
+			             DEPSREL_TYPE_TRANSFORM,
+			             "Armature Parent");
 			break;
 		}
 
@@ -681,9 +649,10 @@ void DepsgraphRelationBuilder::build_constraints(Scene *scene, ID *id, eDepsNode
 			ListBase targets = {NULL, NULL};
 			cti->get_constraint_targets(con, &targets);
 
-			for (bConstraintTarget *ct = (bConstraintTarget *)targets.first; ct; ct = ct->next) {
-				if (!ct->tar)
+			LINKLIST_FOREACH (bConstraintTarget *, ct, &targets) {
+				if (ct->tar == NULL) {
 					continue;
+				}
 
 				if (ELEM(con->type, CONSTRAINT_TYPE_KINEMATIC, CONSTRAINT_TYPE_SPLINEIK)) {
 					/* ignore IK constraints - these are handled separately (on pose level) */
@@ -810,12 +779,64 @@ void DepsgraphRelationBuilder::build_animdata(ID *id)
 	}
 
 	/* drivers */
-	for (FCurve *fcu = (FCurve *)adt->drivers.first; fcu; fcu = fcu->next) {
-		OperationKey driver_key(id, DEPSNODE_TYPE_PARAMETERS, DEG_OPCODE_DRIVER, deg_fcurve_id_name(fcu));
+	LINKLIST_FOREACH (FCurve *, fcu, &adt->drivers) {
+		OperationKey driver_key(id,
+		                        DEPSNODE_TYPE_PARAMETERS,
+		                        DEG_OPCODE_DRIVER,
+		                        fcu->rna_path,
+		                        fcu->array_index);
 
 		/* create the driver's relations to targets */
 		build_driver(id, fcu);
 
+		/* Special case for array drivers: we can not multithread them because
+		 * of the way how they work internally: animation system will write the
+		 * whole array back to RNA even when changing individual array value.
+		 *
+		 * Some tricky things here:
+		 * - array_index is -1 for single channel drivers, meaning we only have
+		 *   to do some magic when array_index is not -1.
+		 * - We do relation from next array index to a previous one, so we don't
+		 *   have to deal with array index 0.
+		 *
+		 * TODO(sergey): Avoid linear lookup somehow.
+		 */
+		if (fcu->array_index > 0) {
+			FCurve *fcu_prev = NULL;
+			LINKLIST_FOREACH (FCurve *, fcu_candidate, &adt->drivers) {
+				/* Drivers on a different RNA path need no ordering against this one. */
+				if (!STREQ(fcu_candidate->rna_path, fcu->rna_path)) {
+					continue;
+				}
+				/* We only do relation from a lower array index fcurve to the current one. */
+				if (fcu_candidate->array_index >= fcu->array_index) {
+					continue;
+				}
+				/* Choose fcurve with highest possible array index. */
+				if (fcu_prev == NULL ||
+				    fcu_candidate->array_index > fcu_prev->array_index)
+				{
+					fcu_prev = fcu_candidate;
+				}
+			}
+			if (fcu_prev != NULL) {
+				OperationKey prev_driver_key(id,
+				                             DEPSNODE_TYPE_PARAMETERS,
+				                             DEG_OPCODE_DRIVER,
+				                             fcu_prev->rna_path,
+				                             fcu_prev->array_index);
+				OperationKey driver_key(id,
+				                        DEPSNODE_TYPE_PARAMETERS,
+				                        DEG_OPCODE_DRIVER,
+				                        fcu->rna_path,
+				                        fcu->array_index);
+				add_relation(prev_driver_key,
+				             driver_key,
+				             DEPSREL_TYPE_OPERATION,
+				             "[Driver Order]");
+			}
+		}
+
 		/* prevent driver from occurring before own animation... */
 		if (adt->action || adt->nla_tracks.first) {
 			add_relation(adt_key, driver_key, DEPSREL_TYPE_OPERATION,
@@ -827,7 +848,11 @@ void DepsgraphRelationBuilder::build_animdata(ID *id)
 void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 {
 	ChannelDriver *driver = fcu->driver;
-	OperationKey driver_key(id, DEPSNODE_TYPE_PARAMETERS, DEG_OPCODE_DRIVER, deg_fcurve_id_name(fcu));
+	OperationKey driver_key(id,
+	                        DEPSNODE_TYPE_PARAMETERS,
+	                        DEG_OPCODE_DRIVER,
+	                        fcu->rna_path,
+	                        fcu->array_index);
 	bPoseChannel *pchan = NULL;
 
 	/* create dependency between driver and data affected by it */
@@ -942,7 +967,7 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 	// XXX: the data itself could also set this, if it were to be truly initialised later?
 
 	/* loop over variables to get the target relationships */
-	for (DriverVar *dvar = (DriverVar *)driver->variables.first; dvar; dvar = dvar->next) {
+	LINKLIST_FOREACH (DriverVar *, dvar, &driver->variables) {
 		/* only used targets */
 		DRIVER_TARGETS_USED_LOOPER(dvar)
 		{
@@ -1016,7 +1041,9 @@ void DepsgraphRelationBuilder::build_driver(ID *id, FCurve *fcu)
 	 * so for now we'll be quite conservative here about optimization and consider
 	 * all python drivers to be depending on time.
 	 */
-	if (driver->type == DRIVER_TYPE_PYTHON) {
+	if ((driver->type == DRIVER_TYPE_PYTHON) &&
+	    python_driver_depends_on_time(driver))
+	{
 		TimeSourceKey time_src_key;
 		add_relation(time_src_key, driver_key, DEPSREL_TYPE_TIME, "[TimeSrc -> Driver]");
 	}
@@ -1055,15 +1082,18 @@ void DepsgraphRelationBuilder::build_rigidbody(Scene *scene)
 
 	/* time dependency */
 	TimeSourceKey time_src_key;
-	add_relation(time_src_key, init_key, DEPSREL_TYPE_TIME, "TimeSrc -> Rigidbody Reset/Rebuild (Optional)");
-	add_relation(time_src_key, sim_key, DEPSREL_TYPE_TIME, "TimeSrc -> Rigidbody Sim Step");
+	add_relation(time_src_key,
+	             init_key,
+	             DEPSREL_TYPE_TIME,
+	             "TimeSrc -> Rigidbody Reset/Rebuild (Optional)");
 
 	/* objects - simulation participants */
 	if (rbw->group) {
-		for (GroupObject *go = (GroupObject *)rbw->group->gobject.first; go; go = go->next) {
+		LINKLIST_FOREACH (GroupObject *, go, &rbw->group->gobject) {
 			Object *ob = go->ob;
-			if (!ob || ob->type != OB_MESH)
+			if (ob == NULL || ob->type != OB_MESH) {
 				continue;
+			}
 
 			/* hook up evaluation order...
 			 * 1) flushing rigidbody results follows base transforms being applied
@@ -1078,7 +1108,6 @@ void DepsgraphRelationBuilder::build_rigidbody(Scene *scene)
 			eDepsOperation_Code trans_opcode = ob->parent ? DEG_OPCODE_TRANSFORM_PARENT : DEG_OPCODE_TRANSFORM_LOCAL;
 			OperationKey trans_op(&ob->id, DEPSNODE_TYPE_TRANSFORM, trans_opcode);
 
-			add_relation(trans_op, rbo_key, DEPSREL_TYPE_OPERATION, "Base Ob Transform -> RBO Sync");
 			add_relation(sim_key, rbo_key, DEPSREL_TYPE_COMPONENT_ORDER, "Rigidbody Sim Eval -> RBO Sync");
 
 			/* if constraints exist, those depend on the result of the rigidbody sim
@@ -1090,31 +1119,44 @@ void DepsgraphRelationBuilder::build_rigidbody(Scene *scene)
 			 *   to control whether rigidbody eval gets interleaved into the constraint stack
 			 */
 			if (ob->constraints.first) {
-				OperationKey constraint_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_CONSTRAINTS);
-				add_relation(rbo_key, constraint_key, DEPSREL_TYPE_COMPONENT_ORDER, "RBO Sync -> Ob Constraints");
+				OperationKey constraint_key(&ob->id,
+				                            DEPSNODE_TYPE_TRANSFORM,
+				                            DEG_OPCODE_TRANSFORM_CONSTRAINTS);
+				add_relation(rbo_key,
+				             constraint_key,
+				             DEPSREL_TYPE_COMPONENT_ORDER,
+				             "RBO Sync -> Ob Constraints");
 			}
 			else {
-				/* final object transform depends on rigidbody */
-				OperationKey done_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_FINAL);
-				add_relation(rbo_key, done_key, DEPSREL_TYPE_COMPONENT_ORDER, "RBO Sync -> Done");
-
-				// XXX: ubereval will be removed eventually, but we still need it in the meantime
-				OperationKey uber_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_OBJECT_UBEREVAL);
-				add_relation(rbo_key, uber_key, DEPSREL_TYPE_COMPONENT_ORDER, "RBO Sync -> Uber (Temp)");
+				/* Final object transform depends on rigidbody.
+				 *
+				 * NOTE: Currently we consider final here an ubereval node.
+				 * If it is gone we'll need to reconsider relation here.
+				 */
+				OperationKey uber_key(&ob->id,
+				                      DEPSNODE_TYPE_TRANSFORM,
+				                      DEG_OPCODE_OBJECT_UBEREVAL);
+				add_relation(rbo_key,
+				             uber_key,
+				             DEPSREL_TYPE_COMPONENT_ORDER,
+				             "RBO Sync -> Uber (Temp)");
 			}
 
-
-			/* needed to get correct base values */
-			add_relation(trans_op, sim_key, DEPSREL_TYPE_OPERATION, "Base Ob Transform -> Rigidbody Sim Eval");
+			/* Needed to get correct base values. */
+			add_relation(trans_op,
+			             sim_key,
+			             DEPSREL_TYPE_OPERATION,
+			             "Base Ob Transform -> Rigidbody Sim Eval");
 		}
 	}
 
 	/* constraints */
 	if (rbw->constraints) {
-		for (GroupObject *go = (GroupObject *)rbw->constraints->gobject.first; go; go = go->next) {
+		LINKLIST_FOREACH (GroupObject *, go, &rbw->constraints->gobject) {
 			Object *ob = go->ob;
-			if (!ob || !ob->rigidbody_constraint)
+			if (ob == NULL || !ob->rigidbody_constraint) {
 				continue;
+			}
 
 			RigidBodyCon *rbc = ob->rigidbody_constraint;
 
@@ -1143,7 +1185,7 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 	                                 DEG_OPCODE_GEOMETRY_UBEREVAL);
 
 	/* particle systems */
-	for (ParticleSystem *psys = (ParticleSystem *)ob->particlesystem.first; psys; psys = psys->next) {
+	LINKLIST_FOREACH (ParticleSystem *, psys, &ob->particlesystem) {
 		ParticleSettings *part = psys->part;
 
 		/* particle settings */
@@ -1174,9 +1216,7 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 
 #if 0
 		if (ELEM(part->phystype, PART_PHYS_KEYED, PART_PHYS_BOIDS)) {
-			ParticleTarget *pt;
-
-			for (pt = psys->targets.first; pt; pt = pt->next) {
+			LINKLIST_FOREACH (ParticleTarget *, pt, &psys->targets) {
 				if (pt->ob && BLI_findlink(&pt->ob->particlesystem, pt->psys - 1)) {
 					node2 = dag_get_node(dag, pt->ob);
 					dag_add_relation(dag, node2, node, DAG_RL_DATA_DATA | DAG_RL_OB_DATA, "Particle Targets");
@@ -1195,7 +1235,7 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 		}
 
 		if (part->ren_as == PART_DRAW_GR && part->dup_group) {
-			for (go = part->dup_group->gobject.first; go; go = go->next) {
+			LINKLIST_FOREACH (GroupObject *, go, &part->dup_group->gobject) {
 				node2 = dag_get_node(dag, go->ob);
 				dag_add_relation(dag, node2, node, DAG_RL_OB_OB, "Particle Group Visualization");
 			}
@@ -1206,17 +1246,17 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 		if (part->type != PART_HAIR) {
 			add_collision_relations(psys_key, scene, ob, part->collision_group, ob->lay, true, "Particle Collision");
 		}
+		else if ((psys->flag & PSYS_HAIR_DYNAMICS) && psys->clmd && psys->clmd->coll_parms) {
+			add_collision_relations(psys_key, scene, ob, psys->clmd->coll_parms->group, ob->lay | scene->lay, true, "Hair Collision");
+		}
 
 		/* effectors */
 		add_forcefield_relations(psys_key, scene, ob, psys, part->effector_weights, part->type == PART_HAIR, "Particle Field");
 
 		/* boids */
 		if (part->boids) {
-			BoidRule *rule = NULL;
-			BoidState *state = NULL;
-
-			for (state = (BoidState *)part->boids->states.first; state; state = state->next) {
-				for (rule = (BoidRule *)state->rules.first; rule; rule = rule->next) {
+			LINKLIST_FOREACH (BoidState *, state, &part->boids->states) {
+				LINKLIST_FOREACH (BoidRule *, rule, &state->rules) {
 					Object *ruleob = NULL;
 					if (rule->type == eBoidRuleType_Avoid)
 						ruleob = ((BoidRuleGoalAvoid *)rule)->ob;
@@ -1256,391 +1296,6 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 	// TODO...
 }
 
-/* IK Solver Eval Steps */
-void DepsgraphRelationBuilder::build_ik_pose(Object *ob,
-                                             bPoseChannel *pchan,
-                                             bConstraint *con,
-                                             RootPChanMap *root_map)
-{
-	bKinematicConstraint *data = (bKinematicConstraint *)con->data;
-
-	/* attach owner to IK Solver too
-	 * - assume that owner is always part of chain
-	 * - see notes on direction of rel below...
-	 */
-	bPoseChannel *rootchan = BKE_armature_ik_solver_find_root(pchan, data);
-	OperationKey solver_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name, DEG_OPCODE_POSE_IK_SOLVER);
-
-	/* IK target */
-	// XXX: this should get handled as part of the constraint code
-	if (data->tar != NULL) {
-		/* TODO(sergey): For until we'll store partial matricies in the depsgraph,
-		 * we create dependency between target object and pose eval component.
-		 *
-		 * This way we ensuring the whole subtree is updated from scratch without
-		 * need of intermediate matricies. This is an overkill, but good enough for
-		 * testing IK solver.
-		 */
-		// FIXME: geometry targets...
-		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
-		if ((data->tar->type == OB_ARMATURE) && (data->subtarget[0])) {
-			/* TODO(sergey): This is only for until granular update stores intermediate result. */
-			if (data->tar != ob) {
-				/* different armature - can just read the results */
-				ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_BONE, data->subtarget);
-				add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, con->name);
-			}
-			else {
-				/* same armature - we'll use the ready state only, just in case this bone is in the chain we're solving */
-				OperationKey target_key(&data->tar->id, DEPSNODE_TYPE_BONE, data->subtarget, DEG_OPCODE_BONE_DONE);
-				add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
-			}
-		}
-		else if (ELEM(data->tar->type, OB_MESH, OB_LATTICE) && (data->subtarget[0])) {
-			/* vertex group target */
-			/* NOTE: for now, we don't need to represent vertex groups separately... */
-			ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_GEOMETRY);
-			add_relation(target_key, solver_key, DEPSREL_TYPE_GEOMETRY_EVAL, con->name);
-
-			if (data->tar->type == OB_MESH) {
-				OperationDepsNode *node2 = find_operation_node(target_key);
-				if (node2 != NULL) {
-					node2->customdata_mask |= CD_MASK_MDEFORMVERT;
-				}
-			}
-		}
-		else {
-			/* Standard Object Target */
-			ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_TRANSFORM);
-			add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, con->name);
-		}
-
-		if ((data->tar == ob) && (data->subtarget[0])) {
-			/* Prevent target's constraints from linking to anything from same
-			 * chain that it controls.
-			 */
-			root_map->add_bone(data->subtarget, rootchan->name);
-		}
-	}
-
-	/* Pole Target */
-	// XXX: this should get handled as part of the constraint code
-	if (data->poletar != NULL) {
-		if ((data->poletar->type == OB_ARMATURE) && (data->polesubtarget[0])) {
-			// XXX: same armature issues - ready vs done?
-			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_BONE, data->subtarget);
-			add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
-		}
-		else if (ELEM(data->poletar->type, OB_MESH, OB_LATTICE) && (data->subtarget[0])) {
-			/* vertex group target */
-			/* NOTE: for now, we don't need to represent vertex groups separately... */
-			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_GEOMETRY);
-			add_relation(target_key, solver_key, DEPSREL_TYPE_GEOMETRY_EVAL, con->name);
-
-			if (data->poletar->type == OB_MESH) {
-				OperationDepsNode *node2 = find_operation_node(target_key);
-				if (node2 != NULL) {
-					node2->customdata_mask |= CD_MASK_MDEFORMVERT;
-				}
-			}
-		}
-		else {
-			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_TRANSFORM);
-			add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
-		}
-	}
-
-	DEG_DEBUG_PRINTF("\nStarting IK Build: pchan = %s, target = (%s, %s), segcount = %d\n",
-	                 pchan->name, data->tar->id.name, data->subtarget, data->rootbone);
-
-	bPoseChannel *parchan = pchan;
-	/* exclude tip from chain? */
-	if (!(data->flag & CONSTRAINT_IK_TIP)) {
-		OperationKey tip_transforms_key(&ob->id, DEPSNODE_TYPE_BONE,
-		                                parchan->name, DEG_OPCODE_BONE_LOCAL);
-		add_relation(solver_key, tip_transforms_key,
-		             DEPSREL_TYPE_TRANSFORM, "IK Solver Result");
-		parchan = pchan->parent;
-	}
-
-	root_map->add_bone(parchan->name, rootchan->name);
-
-	OperationKey parchan_transforms_key(&ob->id, DEPSNODE_TYPE_BONE,
-	                                    parchan->name, DEG_OPCODE_BONE_READY);
-	add_relation(parchan_transforms_key, solver_key,
-	             DEPSREL_TYPE_TRANSFORM, "IK Solver Owner");
-
-	/* Walk to the chain's root */
-	//size_t segcount = 0;
-	int segcount = 0;
-
-	while (parchan) {
-		/* Make IK-solver dependent on this bone's result,
-		 * since it can only run after the standard results
-		 * of the bone are know. Validate links step on the
-		 * bone will ensure that users of this bone only
-		 * grab the result with IK solver results...
-		 */
-		if (parchan != pchan) {
-			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_READY);
-			add_relation(parent_key, solver_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Parent");
-
-			OperationKey done_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
-			add_relation(solver_key, done_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Result");
-		}
-		else {
-			OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
-			add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "IK Solver Result");
-		}
-		parchan->flag |= POSE_DONE;
-
-
-		root_map->add_bone(parchan->name, rootchan->name);
-
-		/* continue up chain, until we reach target number of items... */
-		DEG_DEBUG_PRINTF("  %d = %s\n", segcount, parchan->name);
-		segcount++;
-		if ((segcount == data->rootbone) || (segcount > 255)) break;  /* 255 is weak */
-
-		parchan  = parchan->parent;
-	}
-
-	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
-	add_relation(solver_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
-}
-
-/* Spline IK Eval Steps */
-void DepsgraphRelationBuilder::build_splineik_pose(Object *ob,
-                                                   bPoseChannel *pchan,
-                                                   bConstraint *con,
-                                                   RootPChanMap *root_map)
-{
-	bSplineIKConstraint *data = (bSplineIKConstraint *)con->data;
-	bPoseChannel *rootchan = BKE_armature_splineik_solver_find_root(pchan, data);
-	OperationKey transforms_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
-	OperationKey solver_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name, DEG_OPCODE_POSE_SPLINE_IK_SOLVER);
-
-	/* attach owner to IK Solver too
-	 * - assume that owner is always part of chain
-	 * - see notes on direction of rel below...
-	 */
-	add_relation(transforms_key, solver_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Owner");
-
-	/* attach path dependency to solver */
-	if (data->tar) {
-		/* TODO(sergey): For until we'll store partial matricies in the depsgraph,
-		 * we create dependency between target object and pose eval component.
-		 * See IK pose for a bit more information.
-		 */
-		// TODO: the bigggest point here is that we need the curve PATH and not just the general geometry...
-		ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_GEOMETRY);
-		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
-		add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, "[Curve.Path -> Spline IK] DepsRel");
-	}
-
-	pchan->flag |= POSE_DONE;
-	OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
-	add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Result");
-
-	root_map->add_bone(pchan->name, rootchan->name);
-
-	/* Walk to the chain's root */
-	//size_t segcount = 0;
-	int segcount = 0;
-
-	for (bPoseChannel *parchan = pchan->parent; parchan; parchan = parchan->parent) {
-		/* Make Spline IK solver dependent on this bone's result,
-		 * since it can only run after the standard results
-		 * of the bone are know. Validate links step on the
-		 * bone will ensure that users of this bone only
-		 * grab the result with IK solver results...
-		 */
-		if (parchan != pchan) {
-			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_READY);
-			add_relation(parent_key, solver_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Update");
-
-			OperationKey done_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
-			add_relation(solver_key, done_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Result");
-		}
-		parchan->flag |= POSE_DONE;
-
-		OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
-		add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Result");
-
-		root_map->add_bone(parchan->name, rootchan->name);
-
-		/* continue up chain, until we reach target number of items... */
-		segcount++;
-		if ((segcount == data->chainlen) || (segcount > 255)) break;  /* 255 is weak */
-	}
-
-	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
-	add_relation(solver_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
-}
-
-/* Pose/Armature Bones Graph */
-void DepsgraphRelationBuilder::build_rig(Scene *scene, Object *ob)
-{
-	/* Armature-Data */
-	bArmature *arm = (bArmature *)ob->data;
-
-	// TODO: selection status?
-
-	/* attach links between pose operations */
-	OperationKey init_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_INIT);
-	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
-
-	add_relation(init_key, flush_key, DEPSREL_TYPE_COMPONENT_ORDER, "[Pose Init -> Pose Cleanup]");
-
-	/* Make sure pose is up-to-date with armature updates. */
-	OperationKey armature_key(&arm->id,
-	                          DEPSNODE_TYPE_PARAMETERS,
-	                          DEG_OPCODE_PLACEHOLDER,
-	                          "Armature Eval");
-	add_relation(armature_key, init_key, DEPSREL_TYPE_COMPONENT_ORDER, "Data dependency");
-
-	if (ob->adt && (ob->adt->action || ob->adt->nla_tracks.first)) {
-		ComponentKey animation_key(&ob->id, DEPSNODE_TYPE_ANIMATION);
-		add_relation(animation_key, init_key, DEPSREL_TYPE_OPERATION, "Rig Animation");
-	}
-
-	/* IK Solvers...
-	 * - These require separate processing steps are pose-level
-	 *   to be executed between chains of bones (i.e. once the
-	 *   base transforms of a bunch of bones is done)
-	 *
-	 * - We build relations for these before the dependencies
-	 *   between ops in the same component as it is necessary
-	 *   to check whether such bones are in the same IK chain
-	 *   (or else we get weird issues with either in-chain
-	 *   references, or with bones being parented to IK'd bones)
-	 *
-	 * Unsolved Issues:
-	 * - Care is needed to ensure that multi-headed trees work out the same as in ik-tree building
-	 * - Animated chain-lengths are a problem...
-	 */
-	RootPChanMap root_map;
-	bool pose_depends_on_local_transform = false;
-	for (bPoseChannel *pchan = (bPoseChannel *)ob->pose->chanbase.first; pchan; pchan = pchan->next) {
-		for (bConstraint *con = (bConstraint *)pchan->constraints.first; con; con = con->next) {
-			switch (con->type) {
-				case CONSTRAINT_TYPE_KINEMATIC:
-					build_ik_pose(ob, pchan, con, &root_map);
-					pose_depends_on_local_transform = true;
-					break;
-
-				case CONSTRAINT_TYPE_SPLINEIK:
-					build_splineik_pose(ob, pchan, con, &root_map);
-					pose_depends_on_local_transform = true;
-					break;
-
-				/* Constraints which needs world's matrix for transform.
-				 * TODO(sergey): More constraints here?
-				 */
-				case CONSTRAINT_TYPE_ROTLIKE:
-				case CONSTRAINT_TYPE_SIZELIKE:
-				case CONSTRAINT_TYPE_LOCLIKE:
-				case CONSTRAINT_TYPE_TRANSLIKE:
-					/* TODO(sergey): Add used space check. */
-					pose_depends_on_local_transform = true;
-					break;
-
-				default:
-					break;
-			}
-		}
-	}
-	//root_map.print_debug();
-
-	if (pose_depends_on_local_transform) {
-		/* TODO(sergey): Once partial updates are possible use relation between
-		 * object transform and solver itself in it's build function.
-		 */
-		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
-		ComponentKey local_transform_key(&ob->id, DEPSNODE_TYPE_TRANSFORM);
-		add_relation(local_transform_key, pose_key, DEPSREL_TYPE_TRANSFORM, "Local Transforms");
-	}
-
-
-	/* links between operations for each bone */
-	for (bPoseChannel *pchan = (bPoseChannel *)ob->pose->chanbase.first; pchan; pchan = pchan->next) {
-		OperationKey bone_local_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
-		OperationKey bone_pose_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_POSE_PARENT);
-		OperationKey bone_ready_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
-		OperationKey bone_done_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
-
-		pchan->flag &= ~POSE_DONE;
-
-		/* pose init to bone local */
-		add_relation(init_key, bone_local_key, DEPSREL_TYPE_OPERATION, "PoseEval Source-Bone Link");
-
-		/* local to pose parenting operation */
-		add_relation(bone_local_key, bone_pose_key, DEPSREL_TYPE_OPERATION, "Bone Local - PoseSpace Link");
-
-		/* parent relation */
-		if (pchan->parent != NULL) {
-			eDepsOperation_Code parent_key_opcode;
-
-			/* NOTE: this difference in handling allows us to prevent lockups while ensuring correct poses for separate chains */
-			if (root_map.has_common_root(pchan->name, pchan->parent->name)) {
-				parent_key_opcode = DEG_OPCODE_BONE_READY;
-			}
-			else {
-				parent_key_opcode = DEG_OPCODE_BONE_DONE;
-			}
-
-			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->parent->name, parent_key_opcode);
-			add_relation(parent_key, bone_pose_key, DEPSREL_TYPE_TRANSFORM, "[Parent Bone -> Child Bone]");
-		}
-
-		/* constraints */
-		if (pchan->constraints.first != NULL) {
-			/* constraints stack and constraint dependencies */
-			build_constraints(scene, &ob->id, DEPSNODE_TYPE_BONE, pchan->name, &pchan->constraints, &root_map);
-
-			/* pose -> constraints */
-			OperationKey constraints_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_CONSTRAINTS);
-			add_relation(bone_pose_key, constraints_key, DEPSREL_TYPE_OPERATION, "Constraints Stack");
-
-			/* constraints -> ready */
-			// TODO: when constraint stack is exploded, this step should occur before the first IK solver
-			add_relation(constraints_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Constraints -> Ready");
-		}
-		else {
-			/* pose -> ready */
-			add_relation(bone_pose_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Pose -> Ready");
-		}
-
-		/* bone ready -> done
-		 * NOTE: For bones without IK, this is all that's needed.
-		 *       For IK chains however, an additional rel is created from IK to done,
-		 *       with transitive reduction removing this one...
-		 */
-		add_relation(bone_ready_key, bone_done_key, DEPSREL_TYPE_OPERATION, "Ready -> Done");
-
-		/* assume that all bones must be done for the pose to be ready (for deformers) */
-		add_relation(bone_done_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
-	}
-}
-
-void DepsgraphRelationBuilder::build_proxy_rig(Object *ob)
-{
-	OperationKey pose_init_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_INIT);
-	OperationKey pose_done_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
-	for (bPoseChannel *pchan = (bPoseChannel *)ob->pose->chanbase.first;
-	     pchan != NULL;
-	     pchan = pchan->next)
-	{
-		OperationKey bone_local_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
-		OperationKey bone_ready_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
-		OperationKey bone_done_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
-		add_relation(pose_init_key, bone_local_key, DEPSREL_TYPE_OPERATION, "Pose Init -> Bone Local");
-		add_relation(bone_local_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Local -> Ready");
-		add_relation(bone_ready_key, bone_done_key, DEPSREL_TYPE_OPERATION, "Ready -> Done");
-		add_relation(bone_done_key, pose_done_key, DEPSREL_TYPE_OPERATION, "Bone Done -> Pose Done");
-	}
-}
-
 /* Shapekeys */
 void DepsgraphRelationBuilder::build_shapekeys(ID *obdata, Key *key)
 {
@@ -1701,10 +1356,9 @@ void DepsgraphRelationBuilder::build_obdata_geom(Main *bmain, Scene *scene, Obje
 
 	/* Modifiers */
 	if (ob->modifiers.first) {
-		ModifierData *md;
 		OperationKey prev_mod_key;
 
-		for (md = (ModifierData *)ob->modifiers.first; md; md = md->next) {
+		LINKLIST_FOREACH (ModifierData *, md, &ob->modifiers) {
 			const ModifierTypeInfo *mti = modifierType_getInfo((ModifierType)md->type);
 			OperationKey mod_key(&ob->id, DEPSNODE_TYPE_GEOMETRY, DEG_OPCODE_GEOMETRY_MODIFIER, md->name);
 
@@ -1737,7 +1391,7 @@ void DepsgraphRelationBuilder::build_obdata_geom(Main *bmain, Scene *scene, Obje
 				 * for either the modifier needing time, or that it is animated.
 				 */
 				/* XXX: Remove this hack when these links are added as part of build_animdata() instead */
-				if (modifier_dependsOnTime(md) == false) {
+				if (modifier_dependsOnTime(md) == false && needs_animdata_node(&ob->id)) {
 					ComponentKey animation_key(&ob->id, DEPSNODE_TYPE_ANIMATION);
 					add_relation(animation_key, mod_key, DEPSREL_TYPE_OPERATION, "Modifier Animation");
 				}
@@ -1820,15 +1474,18 @@ void DepsgraphRelationBuilder::build_obdata_geom(Main *bmain, Scene *scene, Obje
 			// XXX: these needs geom data, but where is geom stored?
 			if (cu->bevobj) {
 				ComponentKey bevob_key(&cu->bevobj->id, DEPSNODE_TYPE_GEOMETRY);
+				build_object(bmain, scene, cu->bevobj);
 				add_relation(bevob_key, geom_key, DEPSREL_TYPE_GEOMETRY_EVAL, "Curve Bevel");
 			}
 			if (cu->taperobj) {
 				ComponentKey taperob_key(&cu->taperobj->id, DEPSNODE_TYPE_GEOMETRY);
+				build_object(bmain, scene, cu->taperobj);
 				add_relation(taperob_key, geom_key, DEPSREL_TYPE_GEOMETRY_EVAL, "Curve Taper");
 			}
 			if (ob->type == OB_FONT) {
 				if (cu->textoncurve) {
-					ComponentKey textoncurve_key(&cu->taperobj->id, DEPSNODE_TYPE_GEOMETRY);
+					ComponentKey textoncurve_key(&cu->textoncurve->id, DEPSNODE_TYPE_GEOMETRY);
+					build_object(bmain, scene, cu->textoncurve);
 					add_relation(textoncurve_key, geom_key, DEPSREL_TYPE_GEOMETRY_EVAL, "Text on Curve");
 				}
 			}
@@ -1936,7 +1593,7 @@ void DepsgraphRelationBuilder::build_nodetree(ID *owner, bNodeTree *ntree)
 	                            "Parameters Eval");
 
 	/* nodetree's nodes... */
-	for (bNode *bnode = (bNode *)ntree->nodes.first; bnode; bnode = bnode->next) {
+	LINKLIST_FOREACH (bNode *, bnode, &ntree->nodes) {
 		if (bnode->id) {
 			if (GS(bnode->id->name) == ID_MA) {
 				build_material(owner, (Material *)bnode->id);
@@ -2035,9 +1692,26 @@ bool DepsgraphRelationBuilder::needs_animdata_node(ID *id)
 {
 	AnimData *adt = BKE_animdata_from_id(id);
 	if (adt != NULL) {
-		return adt->action != NULL;
+		return (adt->action != NULL) || (adt->nla_tracks.first != NULL);
 	}
 	return false;
 }
 
+void DepsgraphRelationBuilder::build_cachefile(CacheFile *cache_file) {
+	/* The only relations built for a cache file are those build_animdata() adds for its ID. */
+	build_animdata(&cache_file->id);
+}
+
+void DepsgraphRelationBuilder::build_mask(Mask *mask)
+{
+	/* The only relations built for a mask are those build_animdata() adds for its ID. */
+	build_animdata(&mask->id);
+}
+
+void DepsgraphRelationBuilder::build_movieclip(MovieClip *clip)
+{
+	/* The only relations built for a movie clip are those build_animdata() adds for its ID. */
+	build_animdata(&clip->id);
+}
+
 }  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.h b/source/blender/depsgraph/intern/builder/deg_builder_relations.h
index 46e65d4..6e8485b 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.h
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.h
@@ -47,6 +47,7 @@
 
 struct Base;
 struct bGPdata;
+struct CacheFile;
 struct ListBase;
 struct GHash;
 struct ID;
@@ -54,8 +55,10 @@ struct FCurve;
 struct Group;
 struct Key;
 struct Main;
+struct Mask;
 struct Material;
 struct MTex;
+struct MovieClip;
 struct bNodeTree;
 struct Object;
 struct bPoseChannel;
@@ -81,108 +84,83 @@ struct ComponentDepsNode;
 struct OperationDepsNode;
 struct RootPChanMap;
 
-struct RootKey
-{
-	RootKey() {}
+struct RootKey {
+	RootKey();
 };
 
 struct TimeSourceKey
 {
-	TimeSourceKey() : id(NULL) {}
-	TimeSourceKey(ID *id) : id(id) {}
+	TimeSourceKey();
+	TimeSourceKey(ID *id);
 
-	string identifier() const
-	{
-		return string("TimeSourceKey");
-	}
+	string identifier() const;
 
 	ID *id;
 };
 
 struct ComponentKey
 {
-	ComponentKey() :
-	    id(NULL), type(DEPSNODE_TYPE_UNDEFINED), name("")
-	{}
-	ComponentKey(ID *id, eDepsNode_Type type, const string &name = "") :
-	    id(id), type(type), name(name)
-	{}
-
-	string identifier() const
-	{
-		const char *idname = (id) ? id->name : "<None>";
+	ComponentKey();
+	ComponentKey(ID *id, eDepsNode_Type type, const char *name = "");
 
-		char typebuf[5];
-		BLI_snprintf(typebuf, sizeof(typebuf), "%d", type);
-
-		return string("ComponentKey(") + idname + ", " + typebuf + ", '" + name + "')";
-	}
+	string identifier() const;
 
 	ID *id;
 	eDepsNode_Type type;
-	string name;
+	const char *name;
 };
 
 struct OperationKey
 {
-	OperationKey() :
-	    id(NULL), component_type(DEPSNODE_TYPE_UNDEFINED), component_name(""), opcode(DEG_OPCODE_OPERATION), name("")
-	{}
-
-	OperationKey(ID *id, eDepsNode_Type component_type, const string &name) :
-	    id(id), component_type(component_type), component_name(""), opcode(DEG_OPCODE_OPERATION), name(name)
-	{}
-	OperationKey(ID *id, eDepsNode_Type component_type, const string &component_name, const string &name) :
-	    id(id), component_type(component_type), component_name(component_name), opcode(DEG_OPCODE_OPERATION), name(name)
-	{}
-
-	OperationKey(ID *id, eDepsNode_Type component_type, eDepsOperation_Code opcode) :
-	    id(id), component_type(component_type), component_name(""), opcode(opcode), name("")
-	{}
-	OperationKey(ID *id, eDepsNode_Type component_type, const string &component_name, eDepsOperation_Code opcode) :
-	    id(id), component_type(component_type), component_name(component_name), opcode(opcode), name("")
-	{}
-
-	OperationKey(ID *id, eDepsNode_Type component_type, eDepsOperation_Code opcode, const string &name) :
-	    id(id), component_type(component_type), component_name(""), opcode(opcode), name(name)
-	{}
-	OperationKey(ID *id, eDepsNode_Type component_type, const string &component_name, eDepsOperation_Code opcode, const string &name) :
-	    id(id), component_type(component_type), component_name(component_name), opcode(opcode), name(name)
-	{}
-
-	string identifier() const
-	{
-		char typebuf[5];
-		BLI_snprintf(typebuf, sizeof(typebuf), "%d", component_type);
-
-		return string("OperationKey(") + "t: " + typebuf + ", cn: '" + component_name + "', c: " + DEG_OPNAMES[opcode] + ", n: '" + name + "')";
-	}
-
+	OperationKey();
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             const char *name,
+	             int name_tag = -1);
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             const char *component_name,
+	             const char *name,
+	             int name_tag);
+
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             eDepsOperation_Code opcode);
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             const char *component_name,
+	             eDepsOperation_Code opcode);
+
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             eDepsOperation_Code opcode,
+	             const char *name,
+	             int name_tag = -1);
+	OperationKey(ID *id,
+	             eDepsNode_Type component_type,
+	             const char *component_name,
+	             eDepsOperation_Code opcode,
+	             const char *name,
+	             int name_tag = -1);
+
+	string identifier() const;
 
 	ID *id;
 	eDepsNode_Type component_type;
-	string component_name;
+	const char *component_name;
 	eDepsOperation_Code opcode;
-	string name;
+	const char *name;
+	int name_tag;
 };
 
 struct RNAPathKey
 {
-	// Note: see depsgraph_build.cpp for implementation
+	/* NOTE: see depsgraph_build.cpp for implementation */
 	RNAPathKey(ID *id, const char *path);
 
-	RNAPathKey(ID *id, const PointerRNA &ptr, PropertyRNA *prop) :
-	    id(id), ptr(ptr), prop(prop)
-	{}
-
-	string identifier() const
-	{
-		const char *id_name   = (id) ?  id->name : "<No ID>";
-		const char *prop_name = (prop) ? RNA_property_identifier(prop) : "<No Prop>";
-
-		return string("RnaPathKey(") + "id: " + id_name + ", prop: " + prop_name +  "')";
-	}
+	RNAPathKey(ID *id, const PointerRNA &ptr, PropertyRNA *prop);
 
+	string identifier() const;
 
 	ID *id;
 	PointerRNA ptr;
@@ -245,6 +223,9 @@ struct DepsgraphRelationBuilder
 	void build_texture_stack(ID *owner, MTex **texture_stack);
 	void build_compositor(Scene *scene);
 	void build_gpencil(ID *owner, bGPdata *gpd);
+	void build_cachefile(CacheFile *cache_file);
+	void build_mask(Mask *mask);
+	void build_movieclip(MovieClip *clip);
 
 	void add_collision_relations(const OperationKey &key, Scene *scene, Object *ob, Group *group, int layer, bool dupli, const char *name);
 	void add_forcefield_relations(const OperationKey &key, Scene *scene, Object *ob, ParticleSystem *psys, EffectorWeights *eff, bool add_absorption, const char *name);
@@ -270,7 +251,7 @@ protected:
 
 	template <typename KeyType>
 	DepsNodeHandle create_node_handle(const KeyType& key,
-	                                  const string& default_name = "");
+	                                  const char *default_name = "");
 
 	bool needs_animdata_node(ID *id);
 
@@ -280,7 +261,7 @@ private:
 
 struct DepsNodeHandle
 {
-	DepsNodeHandle(DepsgraphRelationBuilder *builder, OperationDepsNode *node, const string &default_name = "") :
+	DepsNodeHandle(DepsgraphRelationBuilder *builder, OperationDepsNode *node, const char *default_name = "") :
 	    builder(builder),
 	    node(node),
 	    default_name(default_name)
@@ -290,7 +271,7 @@ struct DepsNodeHandle
 
 	DepsgraphRelationBuilder *builder;
 	OperationDepsNode *node;
-	const string &default_name;
+	const char *default_name;
 };
 
 /* Utilities for Builders ----------------------------------------------------- */
@@ -318,6 +299,7 @@ void DepsgraphRelationBuilder::add_relation(const KeyFrom &key_from,
 	else {
 		if (!op_from) {
 			/* XXX TODO handle as error or report if needed */
+			node_from = find_node(key_from);
 			fprintf(stderr, "add_relation(%d, %s) - Could not find op_from (%s)\n",
 			        type, description, key_from.identifier().c_str());
 		}
@@ -370,10 +352,12 @@ void DepsgraphRelationBuilder::add_node_handle_relation(
 	}
 	else {
 		if (!op_from) {
-			/* XXX TODO handle as error or report if needed */
+			fprintf(stderr, "add_node_handle_relation(%d, %s) - Could not find op_from (%s)\n",
+			        type, description, key_from.identifier().c_str());
 		}
 		if (!op_to) {
-			/* XXX TODO handle as error or report if needed */
+			fprintf(stderr, "add_node_handle_relation(%d, %s) - Could not find op_to (%s)\n",
+			        type, description, key_from.identifier().c_str());
 		}
 	}
 }
@@ -381,7 +365,7 @@ void DepsgraphRelationBuilder::add_node_handle_relation(
 template <typename KeyType>
 DepsNodeHandle DepsgraphRelationBuilder::create_node_handle(
         const KeyType &key,
-        const string &default_name)
+        const char *default_name)
 {
 	return DepsNodeHandle(this, find_node(key), default_name);
 }
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations_keys.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations_keys.cc
new file mode 100644
index 0000000..feae8bc
--- /dev/null
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations_keys.cc
@@ -0,0 +1,211 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2013 Blender Foundation.
+ * All rights reserved.
+ *
+ * Original Author: Joshua Leung
+ * Contributor(s): Based on original depsgraph.c code - Blender Foundation (2005-2013)
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/depsgraph/intern/builder/deg_builder_relations_keys.cc
+ *  \ingroup depsgraph
+ *
+ * Methods for constructing depsgraph
+ */
+
+#include "intern/builder/deg_builder_relations.h"
+
+namespace DEG {
+
+/////////////////////////////////////////
+// Root: nothing is initialized here; the constructor body is empty.
+
+RootKey::RootKey()
+{
+}
+
+/////////////////////////////////////////
+// Time source: holds an optional ID pointer (NULL by default).
+// identifier() returns the fixed label "TimeSourceKey" and does not include the ID.
+TimeSourceKey::TimeSourceKey()
+        : id(NULL)
+{
+}
+
+TimeSourceKey::TimeSourceKey(ID *id)
+        : id(id)
+{
+}
+
+string TimeSourceKey::identifier() const
+{
+	return string("TimeSourceKey");
+}
+
+/////////////////////////////////////////
+// Component: (id, type, name).
+// The name pointer is stored as given (not copied); identifier() renders all three fields.
+ComponentKey::ComponentKey()
+        : id(NULL),
+          type(DEPSNODE_TYPE_UNDEFINED),
+          name("")
+{
+}
+
+ComponentKey::ComponentKey(ID *id, eDepsNode_Type type, const char *name)
+        : id(id),
+          type(type),
+          name(name)
+{
+}
+
+string ComponentKey::identifier() const
+{
+	const char *idname = (id) ? id->name : "<None>";
+	char typebuf[5];
+	BLI_snprintf(typebuf, sizeof(typebuf), "%d", type);
+	return string("ComponentKey(") +
+	       idname + ", " + typebuf + ", '" + name + "')";
+}
+
+/////////////////////////////////////////
+// Operation: (id, component type, component name, opcode, name, name tag).
+// Name pointers are stored as given (not copied); identifier() omits id and name_tag.
+OperationKey::OperationKey()
+        : id(NULL),
+          component_type(DEPSNODE_TYPE_UNDEFINED),
+          component_name(""),
+          opcode(DEG_OPCODE_OPERATION),
+          name(""),
+          name_tag(-1)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           const char *name,
+                           int name_tag)
+        : id(id),
+          component_type(component_type),
+          component_name(""),
+          opcode(DEG_OPCODE_OPERATION),
+          name(name),
+          name_tag(name_tag)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           const char *component_name,
+                           const char *name,
+                           int name_tag)
+        : id(id),
+          component_type(component_type),
+          component_name(component_name),
+          opcode(DEG_OPCODE_OPERATION),
+          name(name),
+          name_tag(name_tag)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           eDepsOperation_Code opcode)
+        : id(id),
+          component_type(component_type),
+          component_name(""),
+          opcode(opcode),
+          name(""),
+          name_tag(-1)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           const char *component_name,
+                           eDepsOperation_Code opcode)
+        : id(id),
+          component_type(component_type),
+          component_name(component_name),
+          opcode(opcode),
+          name(""),
+          name_tag(-1)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           eDepsOperation_Code opcode,
+                           const char *name,
+                           int name_tag)
+        : id(id),
+          component_type(component_type),
+          component_name(""),
+          opcode(opcode),
+          name(name),
+          name_tag(name_tag)
+{
+}
+
+OperationKey::OperationKey(ID *id,
+                           eDepsNode_Type component_type,
+                           const char *component_name,
+                           eDepsOperation_Code opcode,
+                           const char *name,
+                           int name_tag)
+        : id(id),
+          component_type(component_type),
+          component_name(component_name),
+          opcode(opcode),
+          name(name),
+          name_tag(name_tag)
+{
+}
+
+string OperationKey::identifier() const
+{
+	char typebuf[5];
+	BLI_snprintf(typebuf, sizeof(typebuf), "%d", component_type);
+	return string("OperationKey(") +
+	       "t: " + typebuf +
+	       ", cn: '" + component_name +
+	       "', c: " + DEG_OPNAMES[opcode] +
+	       ", n: '" + name + "')";
+}
+
+/////////////////////////////////////////
+// RNA path: (id, RNA pointer, property); the (id, path) constructor is not defined in this file.
+
+RNAPathKey::RNAPathKey(ID *id, const PointerRNA &ptr, PropertyRNA *prop)
+        : id(id),
+          ptr(ptr),
+          prop(prop)
+{
+}
+
+string RNAPathKey::identifier() const
+{
+	const char *id_name   = (id) ?  id->name : "<No ID>";
+	const char *prop_name = (prop) ? RNA_property_identifier(prop) : "<No Prop>";
+	return string("RnaPathKey(") + "id: " + id_name +
+	                               ", prop: '" + prop_name +  "')";
+}
+
+}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations_rig.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations_rig.cc
new file mode 100644
index 0000000..2b4c000
--- /dev/null
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations_rig.cc
@@ -0,0 +1,455 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2013 Blender Foundation.
+ * All rights reserved.
+ *
+ * Original Author: Joshua Leung
+ * Contributor(s): Based on original depsgraph.c code - Blender Foundation (2005-2013)
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/depsgraph/intern/builder/deg_builder_relations_rig.cc
+ *  \ingroup depsgraph
+ *
+ * Methods for constructing depsgraph
+ */
+
+#include "intern/builder/deg_builder_relations.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cstring>  /* required for STREQ later on. */
+
+#include "MEM_guardedalloc.h"
+
+extern "C" {
+#include "BLI_blenlib.h"
+#include "BLI_utildefines.h"
+
+#include "DNA_action_types.h"
+#include "DNA_anim_types.h"
+#include "DNA_armature_types.h"
+#include "DNA_constraint_types.h"
+#include "DNA_customdata_types.h"
+#include "DNA_object_types.h"
+
+#include "BKE_action.h"
+#include "BKE_armature.h"
+
+#include "DEG_depsgraph.h"
+#include "DEG_depsgraph_build.h"
+} /* extern "C" */
+
+#include "intern/builder/deg_builder.h"
+#include "intern/builder/deg_builder_pchanmap.h"
+
+#include "intern/nodes/deg_node.h"
+#include "intern/nodes/deg_node_component.h"
+#include "intern/nodes/deg_node_operation.h"
+
+#include "intern/depsgraph_intern.h"
+#include "intern/depsgraph_types.h"
+
+#include "util/deg_util_foreach.h"
+
+namespace DEG {
+
+/* IK Solver Eval Steps: relate the IK solver operation to its targets, its chain bones and Pose Done. */
+void DepsgraphRelationBuilder::build_ik_pose(Object *ob,
+                                             bPoseChannel *pchan,
+                                             bConstraint *con,
+                                             RootPChanMap *root_map)
+{
+	bKinematicConstraint *data = (bKinematicConstraint *)con->data;
+
+	/* attach owner to IK Solver too
+	 * - assume that owner is always part of chain
+	 * - see notes on direction of rel below...
+	 */
+	bPoseChannel *rootchan = BKE_armature_ik_solver_find_root(pchan, data);
+	OperationKey solver_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name, DEG_OPCODE_POSE_IK_SOLVER);
+
+	/* IK target */
+	// XXX: this should get handled as part of the constraint code
+	if (data->tar != NULL) {
+		/* TODO(sergey): For until we'll store partial matrices in the depsgraph,
+		 * we create dependency between target object and pose eval component.
+		 *
+		 * This way we ensure the whole subtree is updated from scratch without
+		 * need of intermediate matrices. This is an overkill, but good enough for
+		 * testing IK solver.
+		 */
+		// FIXME: geometry targets...
+		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
+		if ((data->tar->type == OB_ARMATURE) && (data->subtarget[0])) {
+			/* TODO(sergey): This is only for until granular update stores intermediate result. */
+			if (data->tar != ob) {
+				/* different armature - can just read the results */
+				ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_BONE, data->subtarget);
+				add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, con->name);
+			}
+			else {
+				/* same armature - NOTE(review): this comment used to read "ready state only", yet the key below is BONE_DONE - confirm */
+				OperationKey target_key(&data->tar->id, DEPSNODE_TYPE_BONE, data->subtarget, DEG_OPCODE_BONE_DONE);
+				add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
+			}
+		}
+		else if (ELEM(data->tar->type, OB_MESH, OB_LATTICE) && (data->subtarget[0])) {
+			/* vertex group target */
+			/* NOTE: for now, we don't need to represent vertex groups separately... */
+			ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_GEOMETRY);
+			add_relation(target_key, solver_key, DEPSREL_TYPE_GEOMETRY_EVAL, con->name);
+
+			if (data->tar->type == OB_MESH) {
+				OperationDepsNode *node2 = find_operation_node(target_key);
+				if (node2 != NULL) {
+					node2->customdata_mask |= CD_MASK_MDEFORMVERT;
+				}
+			}
+		}
+		else {
+			/* Standard Object Target */
+			ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_TRANSFORM);
+			add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, con->name);
+		}
+
+		if ((data->tar == ob) && (data->subtarget[0])) {
+			/* Prevent target's constraints from linking to anything from same
+			 * chain that it controls.
+			 */
+			root_map->add_bone(data->subtarget, rootchan->name);
+		}
+	}
+
+	/* Pole Target */
+	// XXX: this should get handled as part of the constraint code
+	if (data->poletar != NULL) {
+		if ((data->poletar->type == OB_ARMATURE) && (data->polesubtarget[0])) {
+			// XXX: same armature issues - ready vs done?
+			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_BONE, data->polesubtarget);
+			add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
+		}
+		else if (ELEM(data->poletar->type, OB_MESH, OB_LATTICE) && (data->polesubtarget[0])) {
+			/* vertex group target */
+			/* NOTE: for now, we don't need to represent vertex groups separately... */
+			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_GEOMETRY);
+			add_relation(target_key, solver_key, DEPSREL_TYPE_GEOMETRY_EVAL, con->name);
+
+			if (data->poletar->type == OB_MESH) {
+				OperationDepsNode *node2 = find_operation_node(target_key);
+				if (node2 != NULL) {
+					node2->customdata_mask |= CD_MASK_MDEFORMVERT;
+				}
+			}
+		}
+		else {
+			ComponentKey target_key(&data->poletar->id, DEPSNODE_TYPE_TRANSFORM);
+			add_relation(target_key, solver_key, DEPSREL_TYPE_TRANSFORM, con->name);
+		}
+	}
+	/* The IK target is optional (see the NULL check above), so guard it when printing. */
+	DEG_DEBUG_PRINTF("\nStarting IK Build: pchan = %s, target = (%s, %s), segcount = %d\n",
+	                 pchan->name, data->tar ? data->tar->id.name : "NULL", data->subtarget, data->rootbone);
+
+	bPoseChannel *parchan = pchan;
+	/* exclude tip from chain? */
+	if (!(data->flag & CONSTRAINT_IK_TIP)) {
+		OperationKey tip_transforms_key(&ob->id, DEPSNODE_TYPE_BONE,
+		                                parchan->name, DEG_OPCODE_BONE_LOCAL);
+		add_relation(solver_key, tip_transforms_key,
+		             DEPSREL_TYPE_TRANSFORM, "IK Solver Result");
+		parchan = pchan->parent;
+	}
+	/* NOTE(review): parchan is pchan->parent here when the tip is excluded - presumably never NULL; confirm. */
+	root_map->add_bone(parchan->name, rootchan->name);
+
+	OperationKey parchan_transforms_key(&ob->id, DEPSNODE_TYPE_BONE,
+	                                    parchan->name, DEG_OPCODE_BONE_READY);
+	add_relation(parchan_transforms_key, solver_key,
+	             DEPSREL_TYPE_TRANSFORM, "IK Solver Owner");
+
+	/* Walk to the chain's root */
+	/* Number of chain segments processed so far. */
+	int segcount = 0;
+
+	while (parchan) {
+		/* Make IK-solver dependent on this bone's result,
+		 * since it can only run after the standard results
+		 * of the bone are known. Validate links step on the
+		 * bone will ensure that users of this bone only
+		 * grab the result with IK solver results...
+		 */
+		if (parchan != pchan) {
+			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_READY);
+			add_relation(parent_key, solver_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Parent");
+
+			OperationKey done_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
+			add_relation(solver_key, done_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Result");
+		}
+		else {
+			OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
+			add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "IK Solver Result");
+		}
+		parchan->flag |= POSE_DONE;
+
+
+		root_map->add_bone(parchan->name, rootchan->name);
+
+		/* continue up chain, until we reach target number of items... */
+		DEG_DEBUG_PRINTF("  %d = %s\n", segcount, parchan->name);
+		segcount++;
+		if ((segcount == data->rootbone) || (segcount > 255)) break;  /* 255 is weak */
+
+		parchan  = parchan->parent;
+	}
+
+	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
+	add_relation(solver_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
+}
+
+/* Spline IK Eval Steps: relate the Spline IK solver to its chain bones and Pose Done, and the pose to the curve target. */
+void DepsgraphRelationBuilder::build_splineik_pose(Object *ob,
+                                                   bPoseChannel *pchan,
+                                                   bConstraint *con,
+                                                   RootPChanMap *root_map)
+{
+	bSplineIKConstraint *data = (bSplineIKConstraint *)con->data;
+	bPoseChannel *rootchan = BKE_armature_splineik_solver_find_root(pchan, data);
+	OperationKey transforms_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
+	OperationKey solver_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, rootchan->name, DEG_OPCODE_POSE_SPLINE_IK_SOLVER);
+
+	/* attach owner to IK Solver too
+	 * - assume that owner is always part of chain
+	 * - see notes on direction of rel below...
+	 */
+	add_relation(transforms_key, solver_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Owner");
+
+	/* attach path dependency to solver */
+	if (data->tar) {
+		/* TODO(sergey): For until we'll store partial matrices in the depsgraph,
+		 * we create dependency between target object and pose eval component.
+		 * See IK pose for a bit more information.
+		 */
+		// TODO: the biggest point here is that we need the curve PATH and not just the general geometry...
+		ComponentKey target_key(&data->tar->id, DEPSNODE_TYPE_GEOMETRY);
+		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
+		add_relation(target_key, pose_key, DEPSREL_TYPE_TRANSFORM, "[Curve.Path -> Spline IK] DepsRel");
+	}
+
+	pchan->flag |= POSE_DONE;
+	OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
+	add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Result");
+
+	root_map->add_bone(pchan->name, rootchan->name);
+
+	/* Walk to the chain's root */
+	/* Number of parent bones processed so far. */
+	int segcount = 0;
+
+	for (bPoseChannel *parchan = pchan->parent; parchan; parchan = parchan->parent) {
+		/* Make Spline IK solver dependent on this bone's result, since it
+		 * can only run after the standard results of the bone are known.
+		 * Validate links step on the bone will ensure that users of this
+		 * bone only grab the result with IK solver results...
+		 * NOTE(review): the loop starts at pchan->parent, so the check below
+		 * looks always true - confirm before simplifying. */
+		if (parchan != pchan) {
+			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_READY);
+			add_relation(parent_key, solver_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Update");
+
+			OperationKey done_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
+			add_relation(solver_key, done_key, DEPSREL_TYPE_TRANSFORM, "IK Chain Result");
+		}
+		parchan->flag |= POSE_DONE;
+		/* NOTE(review): same solver -> BONE_DONE link as "IK Chain Result" above, under another label. */
+		OperationKey final_transforms_key(&ob->id, DEPSNODE_TYPE_BONE, parchan->name, DEG_OPCODE_BONE_DONE);
+		add_relation(solver_key, final_transforms_key, DEPSREL_TYPE_TRANSFORM, "Spline IK Solver Result");
+
+		root_map->add_bone(parchan->name, rootchan->name);
+
+		/* continue up chain, until we reach target number of items... */
+		segcount++;
+		if ((segcount == data->chainlen) || (segcount > 255)) break;  /* 255 is weak */
+	}
+
+	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
+	add_relation(solver_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
+}
+
+/* Pose/Armature Bones Graph: pose init/done links, IK solver relations, then the per-bone operation chain. */
+void DepsgraphRelationBuilder::build_rig(Scene *scene, Object *ob)
+{
+	/* Armature-Data */
+	bArmature *arm = (bArmature *)ob->data;
+
+	// TODO: selection status?
+
+	/* attach links between pose operations */
+	OperationKey init_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_INIT);
+	OperationKey flush_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
+
+	add_relation(init_key, flush_key, DEPSREL_TYPE_COMPONENT_ORDER, "[Pose Init -> Pose Cleanup]");
+
+	/* Make sure pose is up-to-date with armature updates. */
+	OperationKey armature_key(&arm->id,
+	                          DEPSNODE_TYPE_PARAMETERS,
+	                          DEG_OPCODE_PLACEHOLDER,
+	                          "Armature Eval");
+	add_relation(armature_key, init_key, DEPSREL_TYPE_COMPONENT_ORDER, "Data dependency");
+
+	if (needs_animdata_node(&ob->id)) {
+		ComponentKey animation_key(&ob->id, DEPSNODE_TYPE_ANIMATION);
+		add_relation(animation_key, init_key, DEPSREL_TYPE_OPERATION, "Rig Animation");
+	}
+
+	/* IK Solvers...
+	 * - These require separate pose-level processing steps
+	 *   to be executed between chains of bones (i.e. once the
+	 *   base transforms of a bunch of bones are done)
+	 *
+	 * - We build relations for these before the dependencies
+	 *   between ops in the same component as it is necessary
+	 *   to check whether such bones are in the same IK chain
+	 *   (or else we get weird issues with either in-chain
+	 *   references, or with bones being parented to IK'd bones)
+	 *
+	 * Unsolved Issues:
+	 * - Care is needed to ensure that multi-headed trees work out the same as in ik-tree building
+	 * - Animated chain-lengths are a problem...
+	 */
+	RootPChanMap root_map;
+	bool pose_depends_on_local_transform = false;
+	LINKLIST_FOREACH (bPoseChannel *, pchan, &ob->pose->chanbase) {
+		LINKLIST_FOREACH (bConstraint *, con, &pchan->constraints) {
+			switch (con->type) {
+				case CONSTRAINT_TYPE_KINEMATIC:
+					build_ik_pose(ob, pchan, con, &root_map);
+					pose_depends_on_local_transform = true;
+					break;
+
+				case CONSTRAINT_TYPE_SPLINEIK:
+					build_splineik_pose(ob, pchan, con, &root_map);
+					pose_depends_on_local_transform = true;
+					break;
+
+				/* Constraints which needs world's matrix for transform.
+				 * TODO(sergey): More constraints here?
+				 */
+				case CONSTRAINT_TYPE_ROTLIKE:
+				case CONSTRAINT_TYPE_SIZELIKE:
+				case CONSTRAINT_TYPE_LOCLIKE:
+				case CONSTRAINT_TYPE_TRANSLIKE:
+					/* TODO(sergey): Add used space check. */
+					pose_depends_on_local_transform = true;
+					break;
+
+				default:
+					break;
+			}
+		}
+	}
+	//root_map.print_debug();
+
+	if (pose_depends_on_local_transform) {
+		/* TODO(sergey): Once partial updates are possible use relation between
+		 * object transform and solver itself in its build function.
+		 */
+		ComponentKey pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
+		ComponentKey local_transform_key(&ob->id, DEPSNODE_TYPE_TRANSFORM);
+		add_relation(local_transform_key, pose_key, DEPSREL_TYPE_TRANSFORM, "Local Transforms");
+	}
+
+
+	/* links between operations for each bone */
+	LINKLIST_FOREACH (bPoseChannel *, pchan, &ob->pose->chanbase) {
+		OperationKey bone_local_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
+		OperationKey bone_pose_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_POSE_PARENT);
+		OperationKey bone_ready_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
+		OperationKey bone_done_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
+
+		pchan->flag &= ~POSE_DONE;  /* clear the flag the IK builders above set on chain bones */
+
+		/* pose init to bone local */
+		add_relation(init_key, bone_local_key, DEPSREL_TYPE_OPERATION, "PoseEval Source-Bone Link");
+
+		/* local to pose parenting operation */
+		add_relation(bone_local_key, bone_pose_key, DEPSREL_TYPE_OPERATION, "Bone Local - PoseSpace Link");
+
+		/* parent relation */
+		if (pchan->parent != NULL) {
+			eDepsOperation_Code parent_key_opcode;
+
+			/* NOTE: this difference in handling allows us to prevent lockups while ensuring correct poses for separate chains */
+			if (root_map.has_common_root(pchan->name, pchan->parent->name)) {
+				parent_key_opcode = DEG_OPCODE_BONE_READY;
+			}
+			else {
+				parent_key_opcode = DEG_OPCODE_BONE_DONE;
+			}
+
+			OperationKey parent_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->parent->name, parent_key_opcode);
+			add_relation(parent_key, bone_pose_key, DEPSREL_TYPE_TRANSFORM, "[Parent Bone -> Child Bone]");
+		}
+
+		/* constraints */
+		if (pchan->constraints.first != NULL) {
+			/* constraints stack and constraint dependencies */
+			build_constraints(scene, &ob->id, DEPSNODE_TYPE_BONE, pchan->name, &pchan->constraints, &root_map);
+
+			/* pose -> constraints */
+			OperationKey constraints_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_CONSTRAINTS);
+			add_relation(bone_pose_key, constraints_key, DEPSREL_TYPE_OPERATION, "Constraints Stack");
+
+			/* constraints -> ready */
+			// TODO: when constraint stack is exploded, this step should occur before the first IK solver
+			add_relation(constraints_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Constraints -> Ready");
+		}
+		else {
+			/* pose -> ready */
+			add_relation(bone_pose_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Pose -> Ready");
+		}
+
+		/* bone ready -> done
+		 * NOTE: For bones without IK, this is all that's needed.
+		 *       For IK chains however, an additional rel is created from IK to done,
+		 *       with transitive reduction removing this one...
+		 */
+		add_relation(bone_ready_key, bone_done_key, DEPSREL_TYPE_OPERATION, "Ready -> Done");
+
+		/* assume that all bones must be done for the pose to be ready (for deformers) */
+		add_relation(bone_done_key, flush_key, DEPSREL_TYPE_OPERATION, "PoseEval Result-Bone Link");
+	}
+}
+
+void DepsgraphRelationBuilder::build_proxy_rig(Object *ob)
+{  /* Per bone: Pose Init -> Bone Local -> Ready -> Done -> Pose Done; no parent, constraint or IK links. */
+	OperationKey pose_init_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_INIT);
+	OperationKey pose_done_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE, DEG_OPCODE_POSE_DONE);
+	LINKLIST_FOREACH (bPoseChannel *, pchan, &ob->pose->chanbase) {
+		OperationKey bone_local_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_LOCAL);
+		OperationKey bone_ready_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_READY);
+		OperationKey bone_done_key(&ob->id, DEPSNODE_TYPE_BONE, pchan->name, DEG_OPCODE_BONE_DONE);
+		add_relation(pose_init_key, bone_local_key, DEPSREL_TYPE_OPERATION, "Pose Init -> Bone Local");
+		add_relation(bone_local_key, bone_ready_key, DEPSREL_TYPE_OPERATION, "Local -> Ready");
+		add_relation(bone_ready_key, bone_done_key, DEPSREL_TYPE_OPERATION, "Ready -> Done");
+		add_relation(bone_done_key, pose_done_key, DEPSREL_TYPE_OPERATION, "Bone Done -> Pose Done");
+	}
+}
+
+}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
new file mode 100644
index 0000000..6b51a95
--- /dev/null
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
@@ -0,0 +1,162 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2013 Blender Foundation.
+ * All rights reserved.
+ *
+ * Original Author: Joshua Leung
+ * Contributor(s): Based on original depsgraph.c code - Blender Foundation (2005-2013)
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
+ *  \ingroup depsgraph
+ *
+ * Methods for constructing depsgraph
+ */
+
+#include "intern/builder/deg_builder_relations.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <cstring>  /* required for STREQ later on. */
+
+#include "MEM_guardedalloc.h"
+
+extern "C" {
+#include "BLI_blenlib.h"
+#include "BLI_utildefines.h"
+
+#include "DNA_node_types.h"
+#include "DNA_object_types.h"
+#include "DNA_scene_types.h"
+
+#include "BKE_main.h"
+#include "BKE_node.h"
+
+#include "DEG_depsgraph.h"
+#include "DEG_depsgraph_build.h"
+} /* extern "C" */
+
+#include "intern/builder/deg_builder.h"
+#include "intern/builder/deg_builder_pchanmap.h"
+
+#include "intern/nodes/deg_node.h"
+#include "intern/nodes/deg_node_component.h"
+#include "intern/nodes/deg_node_operation.h"
+
+#include "intern/depsgraph_intern.h"
+#include "intern/depsgraph_types.h"
+
+#include "util/deg_util_foreach.h"
+
+namespace DEG {
+
+/* Build relations for the objects of `scene` (plus their proxies and dupli-groups), its rigid body
+ * world, animation, world, compositor and grease pencil, and for all masks and clips in `bmain`. */
+void DepsgraphRelationBuilder::build_scene(Main *bmain, Scene *scene)
+{
+	/* LIB_TAG_DOIT is used to indicate whether node for given ID was already
+	 * created or not, so start by clearing that tag on the IDs in bmain. */
+	BKE_main_id_tag_all(bmain, LIB_TAG_DOIT, false);
+	/* XXX nested node trees are not included in tag-clearing above,
+	 * so we need to do this manually. */
+	FOREACH_NODETREE(bmain, nodetree, id) {
+		if (id != (ID *)nodetree)
+			nodetree->id.tag &= ~LIB_TAG_DOIT;
+	} FOREACH_NODETREE_END
+
+	if (scene->set) {
+		// TODO: link set to scene, especially our timesource...
+	}
+
+	/* Scene objects. */
+	LINKLIST_FOREACH (Base *, base, &scene->base) {
+		Object *ob = base->object;
+
+		/* The object itself. */
+		build_object(bmain, scene, ob);
+
+		/* The object that this one is a proxy for. */
+		if (ob->proxy) {
+			ob->proxy->proxy_from = ob;  /* NOTE: writes a back-pointer into the proxied object. */
+			build_object(bmain, scene, ob->proxy);
+			/* TODO(sergey): This is an inverted relation; it matches the old depsgraph
+			 * behavior and it needs to be investigated whether it still has to be inverted. */
+			ComponentKey ob_pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
+			ComponentKey proxy_pose_key(&ob->proxy->id, DEPSNODE_TYPE_EVAL_POSE);
+			add_relation(ob_pose_key, proxy_pose_key, DEPSREL_TYPE_TRANSFORM, "Proxy");
+		}
+
+		/* Object dupligroup. */
+		if (ob->dup_group) {
+			build_group(bmain, scene, ob, ob->dup_group);
+		}
+	}
+
+	/* Rigid body world. */
+	if (scene->rigidbody_world) {
+		build_rigidbody(scene);
+	}
+
+	/* Scene's animation and drivers. */
+	if (scene->adt) {
+		build_animdata(&scene->id);
+	}
+
+	/* World. */
+	if (scene->world) {
+		build_world(scene->world);
+	}
+
+	/* Compositing nodes. */
+	if (scene->nodetree) {
+		build_compositor(scene);
+	}
+
+	/* Grease pencil. */
+	if (scene->gpd) {
+		build_gpencil(&scene->id, scene->gpd);
+	}
+
+	/* Masks (every mask in bmain, not only those used by this scene). */
+	LINKLIST_FOREACH (Mask *, mask, &bmain->mask) {
+		build_mask(mask);
+	}
+
+	/* Movie clips (every clip in bmain, not only those used by this scene). */
+	LINKLIST_FOREACH (MovieClip *, clip, &bmain->movieclip) {
+		build_movieclip(clip);
+	}
+
+	/* OR each operation's customdata_mask into the Object owning it; other ID types are skipped. */
+	for (Depsgraph::OperationNodes::const_iterator it_op = m_graph->operations.begin();
+	     it_op != m_graph->operations.end();
+	     ++it_op)
+	{
+		OperationDepsNode *node = *it_op;
+		IDDepsNode *id_node = node->owner->owner;
+		ID *id = id_node->id;
+		if (GS(id->name) == ID_OB) {
+			Object *object = (Object *)id;
+			object->customdata_mask |= node->customdata_mask;
+		}
+	}
+}
+
+}  // namespace DEG
diff --git a/source/blender/depsgraph/intern/debug/deg_debug_graphviz.cc b/source/blender/depsgraph/intern/debug/deg_debug_graphviz.cc
index 70cd5f1..0d56ce7 100644
--- a/source/blender/depsgraph/intern/debug/deg_debug_graphviz.cc
+++ b/source/blender/depsgraph/intern/debug/deg_debug_graphviz.cc
@@ -321,7 +321,7 @@ static void deg_debug_graphviz_node_single(const DebugContext &ctx,
 static void deg_debug_graphviz_node_cluster_begin(const DebugContext &ctx,
                                                   const DepsNode *node)
 {
-	string name = node->identifier().c_str();
+	string name = node->identifier();
 	if (node->type == DEPSNODE_TYPE_ID_REF) {
 		IDDepsNode *id_node = (IDDepsNode *)node;
 		char buf[256];
diff --git a/source/blender/depsgraph/intern/depsgraph.cc b/source/blender/depsgraph/intern/depsgraph.cc
index 2b7c637..5604044 100644
--- a/source/blender/depsgraph/intern/depsgraph.cc
+++ b/source/blender/depsgraph/intern/depsgraph.cc
@@ -32,8 +32,6 @@
 
 #include "intern/depsgraph.h" /* own include */
 
-#include <string.h>
-
 #include "MEM_guardedalloc.h"
 
 #include "BLI_utildefines.h"
@@ -53,6 +51,8 @@ extern "C" {
 #include "RNA_access.h"
 }
 
+#include <cstring>
+
 #include "DEG_depsgraph.h"
 
 #include "intern/nodes/deg_node.h"
@@ -116,7 +116,7 @@ static bool pointer_to_component_node_criteria(const PointerRNA *ptr,
                                                const PropertyRNA *prop,
                                                ID **id,
                                                eDepsNode_Type *type,
-                                               string *subdata)
+                                               const char **subdata)
 {
 	if (!ptr->type)
 		return false;
@@ -189,16 +189,23 @@ static bool pointer_to_component_node_criteria(const PointerRNA *ptr,
 		/* Transforms props? */
 		if (prop) {
 			const char *prop_identifier = RNA_property_identifier((PropertyRNA *)prop);
-
+			/* TODO(sergey): How to optimize this? */
 			if (strstr(prop_identifier, "location") ||
 			    strstr(prop_identifier, "rotation") ||
-			    strstr(prop_identifier, "scale"))
+			    strstr(prop_identifier, "scale") ||
+			    strstr(prop_identifier, "matrix_"))
 			{
 				*type = DEPSNODE_TYPE_TRANSFORM;
 				return true;
 			}
+			else if (strstr(prop_identifier, "data")) {
+				/* We access object.data, most likely a geometry.
+				 * Might be a bone tho..
+				 */
+				*type = DEPSNODE_TYPE_GEOMETRY;
+				return true;
+			}
 		}
-		// ...
 	}
 	else if (ptr->type == &RNA_ShapeKey) {
 		Key *key = (Key *)ptr->id.data;
@@ -232,7 +239,7 @@ DepsNode *Depsgraph::find_node_from_pointer(const PointerRNA *ptr,
 {
 	ID *id;
 	eDepsNode_Type type;
-	string name;
+	const char *name;
 
 	/* Get querying conditions. */
 	if (pointer_to_id_node_criteria(ptr, prop, &id)) {
@@ -240,8 +247,9 @@ DepsNode *Depsgraph::find_node_from_pointer(const PointerRNA *ptr,
 	}
 	else if (pointer_to_component_node_criteria(ptr, prop, &id, &type, &name)) {
 		IDDepsNode *id_node = find_id_node(id);
-		if (id_node)
+		if (id_node != NULL) {
 			return id_node->find_component(type, name);
+		}
 	}
 
 	return NULL;
@@ -328,7 +336,7 @@ IDDepsNode *Depsgraph::find_id_node(const ID *id) const
 	return reinterpret_cast<IDDepsNode *>(BLI_ghash_lookup(id_hash, id));
 }
 
-IDDepsNode *Depsgraph::add_id_node(ID *id, const string &name)
+IDDepsNode *Depsgraph::add_id_node(ID *id, const char *name)
 {
 	IDDepsNode *id_node = find_id_node(id);
 	if (!id_node) {
@@ -370,8 +378,7 @@ DepsRelation *Depsgraph::add_new_relation(OperationDepsNode *from,
 	if (comp_node->type == DEPSNODE_TYPE_GEOMETRY) {
 		IDDepsNode *id_to = to->owner->owner;
 		IDDepsNode *id_from = from->owner->owner;
-		Object *object_to = (Object *)id_to->id;
-		if (id_to != id_from && (object_to->recalc & OB_RECALC_ALL)) {
+		if (id_to != id_from && (id_to->id->tag & LIB_TAG_ID_RECALC_ALL)) {
 			if ((id_from->eval_flags & DAG_EVAL_NEED_CPU) == 0) {
 				id_from->tag_update(this);
 				id_from->eval_flags |= DAG_EVAL_NEED_CPU;
diff --git a/source/blender/depsgraph/intern/depsgraph.h b/source/blender/depsgraph/intern/depsgraph.h
index 08b264f..e668fac 100644
--- a/source/blender/depsgraph/intern/depsgraph.h
+++ b/source/blender/depsgraph/intern/depsgraph.h
@@ -101,22 +101,6 @@ struct Depsgraph {
 	~Depsgraph();
 
 	/**
-	 * Find node which matches the specified description.
-	 *
-	 * \param id: ID block that is associated with this
-	 * \param subdata: identifier used for sub-ID data (e.g. bone)
-	 * \param type: type of node we're dealing with
-	 * \param name: custom identifier assigned to node
-	 *
-	 * \return A node matching the required characteristics if it exists
-	 * or NULL if no such node exists in the graph.
-	 */
-	DepsNode *find_node(const ID *id,
-	                    eDepsNode_Type type,
-	                    const string &subdata,
-	                    const string &name);
-
-	/**
 	 * Convenience wrapper to find node given just pointer + property.
 	 *
 	 * \param ptr: pointer to the data that node will represent
@@ -136,7 +120,7 @@ struct Depsgraph {
 	void clear_subgraph_nodes();
 
 	IDDepsNode *find_id_node(const ID *id) const;
-	IDDepsNode *add_id_node(ID *id, const string &name = "");
+	IDDepsNode *add_id_node(ID *id, const char *name = "");
 	void remove_id_node(const ID *id);
 	void clear_id_nodes();
 
diff --git a/source/blender/depsgraph/intern/depsgraph_build.cc b/source/blender/depsgraph/intern/depsgraph_build.cc
index 7a3b19e..9952f71 100644
--- a/source/blender/depsgraph/intern/depsgraph_build.cc
+++ b/source/blender/depsgraph/intern/depsgraph_build.cc
@@ -32,6 +32,8 @@
 
 #include "MEM_guardedalloc.h"
 
+// #define DEBUG_TIME
+
 extern "C" {
 #include "DNA_cachefile_types.h"
 #include "DNA_object_types.h"
@@ -41,6 +43,11 @@ extern "C" {
 #include "BLI_utildefines.h"
 #include "BLI_ghash.h"
 
+#ifdef DEBUG_TIME
+#  include "PIL_time.h"
+#  include "PIL_time_utildefines.h"
+#endif
+
 #include "BKE_main.h"
 #include "BKE_collision.h"
 #include "BKE_effect.h"
@@ -190,6 +197,10 @@ void DEG_add_special_eval_flag(Depsgraph *graph, ID *id, short flag)
  */
 void DEG_graph_build_from_scene(Depsgraph *graph, Main *bmain, Scene *scene)
 {
+#ifdef DEBUG_TIME
+	TIMEIT_START(DEG_graph_build_from_scene);
+#endif
+
 	DEG::Depsgraph *deg_graph = reinterpret_cast<DEG::Depsgraph *>(graph);
 
 	/* 1) Generate all the nodes in the graph first */
@@ -239,6 +250,10 @@ void DEG_graph_build_from_scene(Depsgraph *graph, Main *bmain, Scene *scene)
 		abort();
 	}
 #endif
+
+#ifdef DEBUG_TIME
+	TIMEIT_END(DEG_graph_build_from_scene);
+#endif
 }
 
 /* Tag graph relations for update. */
@@ -309,7 +324,15 @@ void DEG_scene_graph_free(Scene *scene)
 	}
 }
 
-void DEG_add_collision_relations(DepsNodeHandle *handle, Scene *scene, Object *ob, Group *group, int layer, unsigned int modifier_type, DEG_CollobjFilterFunction fn, bool dupli, const char *name)
+void DEG_add_collision_relations(DepsNodeHandle *handle,
+                                 Scene *scene,
+                                 Object *ob,
+                                 Group *group,
+                                 int layer,
+                                 unsigned int modifier_type,
+                                 DEG_CollobjFilterFunction fn,
+                                 bool dupli,
+                                 const char *name)
 {
 	unsigned int numcollobj;
 	Object **collobjs = get_collisionobjects_ext(scene, ob, group, layer, &numcollobj, modifier_type, dupli);
@@ -327,7 +350,13 @@ void DEG_add_collision_relations(DepsNodeHandle *handle, Scene *scene, Object *o
 		MEM_freeN(collobjs);
 }
 
-void DEG_add_forcefield_relations(DepsNodeHandle *handle, Scene *scene, Object *ob, EffectorWeights *effector_weights, bool add_absorption, int skip_forcefield, const char *name)
+void DEG_add_forcefield_relations(DepsNodeHandle *handle,
+                                  Scene *scene,
+                                  Object *ob,
+                                  EffectorWeights *effector_weights,
+                                  bool add_absorption,
+                                  int skip_forcefield,
+                                  const char *name)
 {
 	ListBase *effectors = pdInitEffectors(scene, ob, NULL, effector_weights, false);
 
@@ -339,17 +368,33 @@ void DEG_add_forcefield_relations(DepsNodeHandle *handle, Scene *scene, Object *
 				if (eff->psys) {
 					DEG_add_object_relation(handle, eff->ob, DEG_OB_COMP_EVAL_PARTICLES, name);
 
-					/* TODO: remove this when/if EVAL_PARTICLES is sufficient for up to date particles */
+					/* TODO: remove this when/if EVAL_PARTICLES is sufficient
+					 * for up to date particles.
+					 */
 					DEG_add_object_relation(handle, eff->ob, DEG_OB_COMP_GEOMETRY, name);
 				}
 
 				if (eff->pd->forcefield == PFIELD_SMOKEFLOW && eff->pd->f_source) {
-					DEG_add_object_relation(handle, eff->pd->f_source, DEG_OB_COMP_TRANSFORM, "Smoke Force Domain");
-					DEG_add_object_relation(handle, eff->pd->f_source, DEG_OB_COMP_GEOMETRY, "Smoke Force Domain");
+					DEG_add_object_relation(handle,
+					                        eff->pd->f_source,
+					                        DEG_OB_COMP_TRANSFORM,
+					                        "Smoke Force Domain");
+					DEG_add_object_relation(handle,
+					                        eff->pd->f_source,
+					                        DEG_OB_COMP_GEOMETRY,
+					                        "Smoke Force Domain");
 				}
 
 				if (add_absorption && (eff->pd->flag & PFIELD_VISIBILITY)) {
-					DEG_add_collision_relations(handle, scene, ob, NULL, eff->ob->lay, eModifierType_Collision, NULL, true, "Force Absorption");
+					DEG_add_collision_relations(handle,
+					                            scene,
+					                            ob,
+					                            NULL,
+					                            eff->ob->lay,
+					                            eModifierType_Collision,
+					                            NULL,
+					                            true,
+					                            "Force Absorption");
 				}
 			}
 		}
diff --git a/source/blender/depsgraph/intern/depsgraph_intern.h b/source/blender/depsgraph/intern/depsgraph_intern.h
index e5d3d1f..2d8e7dc 100644
--- a/source/blender/depsgraph/intern/depsgraph_intern.h
+++ b/source/blender/depsgraph/intern/depsgraph_intern.h
@@ -63,8 +63,8 @@ struct DepsNodeFactory {
 	virtual const char *tname() const = 0;
 
 	virtual DepsNode *create_node(const ID *id,
-	                              const string &subdata,
-	                              const string &name) const = 0;
+	                              const char *subdata,
+	                              const char *name) const = 0;
 };
 
 template <class NodeType>
@@ -73,7 +73,7 @@ struct DepsNodeFactoryImpl : public DepsNodeFactory {
 	eDepsNode_Class tclass() const { return NodeType::typeinfo.tclass; }
 	const char *tname() const { return NodeType::typeinfo.tname; }
 
-	DepsNode *create_node(const ID *id, const string &subdata, const string &name) const
+	DepsNode *create_node(const ID *id, const char *subdata, const char *name) const
 	{
 		DepsNode *node = OBJECT_GUARDED_NEW(NodeType);
 
@@ -81,12 +81,14 @@ struct DepsNodeFactoryImpl : public DepsNodeFactory {
 		node->type = type();
 		node->tclass = tclass();
 
-		if (!name.empty())
+		if (name[0] != '\0') {
 			/* set name if provided ... */
 			node->name = name;
-		else
+		}
+		else {
 			/* ... otherwise use default type name */
 			node->name = tname();
+		}
 
 		node->init(id, subdata);
 
diff --git a/source/blender/depsgraph/intern/depsgraph_tag.cc b/source/blender/depsgraph/intern/depsgraph_tag.cc
index b7b62bd..e8ed036 100644
--- a/source/blender/depsgraph/intern/depsgraph_tag.cc
+++ b/source/blender/depsgraph/intern/depsgraph_tag.cc
@@ -31,7 +31,7 @@
  */
 
 #include <stdio.h>
-#include <cstring>
+#include <cstring>  /* required for memset */
 #include <queue>
 
 extern "C" {
@@ -235,6 +235,9 @@ void DEG_id_tag_update_ex(Main *bmain, ID *id, short flag)
 			if (flag & (OB_RECALC_OB | OB_RECALC_DATA)) {
 				DEG_graph_id_tag_update(bmain, graph, id);
 			}
+			else if (flag & OB_RECALC_TIME) {
+				DEG_graph_id_tag_update(bmain, graph, id);
+			}
 		}
 	}
 
diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc
index c3fd202..065f656 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval.cc
@@ -152,7 +152,7 @@ static void deg_task_run_func(TaskPool *pool,
 				}
 				if ((rel->flag & DEPSREL_FLAG_CYCLIC) == 0) {
 					BLI_assert(child->num_links_pending > 0);
-					atomic_sub_uint32(&child->num_links_pending, 1);
+					atomic_sub_and_fetch_uint32(&child->num_links_pending, 1);
 				}
 				if (child->num_links_pending == 0) {
 					bool is_scheduled = atomic_fetch_and_or_uint8(
@@ -287,7 +287,7 @@ static void schedule_node(TaskPool *pool, Depsgraph *graph, unsigned int layers,
 	{
 		if (dec_parents) {
 			BLI_assert(node->num_links_pending > 0);
-			atomic_sub_uint32(&node->num_links_pending, 1);
+			atomic_sub_and_fetch_uint32(&node->num_links_pending, 1);
 		}
 
 		if (node->num_links_pending == 0) {
@@ -304,7 +304,7 @@ static void schedule_node(TaskPool *pool, Depsgraph *graph, unsigned int layers,
 					                               deg_task_run_func,
 					                               node,
 					                               false,
-					                               TASK_PRIORITY_LOW,
+					                               TASK_PRIORITY_HIGH,
 					                               thread_id);
 				}
 			}
diff --git a/source/blender/depsgraph/intern/eval/deg_eval_debug.cc b/source/blender/depsgraph/intern/eval/deg_eval_debug.cc
index 67d64aa..060544a 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval_debug.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval_debug.cc
@@ -30,10 +30,10 @@
  * Implementation of tools for debugging the depsgraph
  */
 
-#include <cstring>
-
 #include "intern/eval/deg_eval_debug.h"
 
+#include <cstring>  /* required for STREQ later on. */
+
 extern "C" {
 #include "BLI_listbase.h"
 #include "BLI_ghash.h"
@@ -53,10 +53,10 @@ namespace DEG {
 
 DepsgraphStats *DepsgraphDebug::stats = NULL;
 
-static string get_component_name(eDepsNode_Type type, const string &name = "")
+static string get_component_name(eDepsNode_Type type, const char *name = "")
 {
 	DepsNodeFactory *factory = deg_get_node_factory(type);
-	if (name.empty()) {
+	if (name[0] == '\0') {  /* Empty name: type name alone (same test as the old name.empty()). */
 		return string(factory->tname());
 	}
 	else {
@@ -116,7 +116,7 @@ void DepsgraphDebug::task_started(Depsgraph *graph,
 			 */
 			DepsgraphStatsComponent *comp_stats =
 			        get_component_stats(id, get_component_name(comp->type,
-			                                                   comp->name),
+			                                                   comp->name).c_str(),
 			                            true);
 			times_clear(comp_stats->times);
 		}
@@ -146,7 +146,7 @@ void DepsgraphDebug::task_completed(Depsgraph *graph,
 			DepsgraphStatsComponent *comp_stats =
 			        get_component_stats(id,
 			                            get_component_name(comp->type,
-			                                               comp->name),
+			                                               comp->name).c_str(),
 			                            true);
 			times_add(comp_stats->times, time);
 		}
@@ -226,7 +226,7 @@ DepsgraphStatsID *DepsgraphDebug::get_id_stats(ID *id, bool create)
 
 DepsgraphStatsComponent *DepsgraphDebug::get_component_stats(
         DepsgraphStatsID *id_stats,
-        const string &name,
+        const char *name,
         bool create)
 {
 	DepsgraphStatsComponent *comp_stats;
@@ -234,13 +234,14 @@ DepsgraphStatsComponent *DepsgraphDebug::get_component_stats(
 	     comp_stats != NULL;
 	     comp_stats = comp_stats->next)
 	{
-		if (STREQ(comp_stats->name, name.c_str()))
+		if (STREQ(comp_stats->name, name)) {
 			break;
+		}
 	}
 	if (!comp_stats && create) {
 		comp_stats = (DepsgraphStatsComponent *)MEM_callocN(sizeof(DepsgraphStatsComponent),
 		                                                    "Depsgraph Component Stats");
-		BLI_strncpy(comp_stats->name, name.c_str(), sizeof(comp_stats->name));
+		BLI_strncpy(comp_stats->name, name, sizeof(comp_stats->name));
 		BLI_addtail(&id_stats->components, comp_stats);
 	}
 	return comp_stats;
diff --git a/source/blender/depsgraph/intern/eval/deg_eval_debug.h b/source/blender/depsgraph/intern/eval/deg_eval_debug.h
index 9109019..0bbe88c 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval_debug.h
+++ b/source/blender/depsgraph/intern/eval/deg_eval_debug.h
@@ -66,10 +66,10 @@ struct DepsgraphDebug {
 
 	static DepsgraphStatsID *get_id_stats(ID *id, bool create);
 	static DepsgraphStatsComponent *get_component_stats(DepsgraphStatsID *id_stats,
-	                                                    const string &name,
+	                                                    const char *name,
 	                                                    bool create);
 	static DepsgraphStatsComponent *get_component_stats(ID *id,
-	                                                    const string &name,
+	                                                    const char *name,
 	                                                    bool create)
 	{
 		return get_component_stats(get_id_stats(id, create), name, create);
diff --git a/source/blender/depsgraph/intern/nodes/deg_node.cc b/source/blender/depsgraph/intern/nodes/deg_node.cc
index eb408f2..57b25c1 100644
--- a/source/blender/depsgraph/intern/nodes/deg_node.cc
+++ b/source/blender/depsgraph/intern/nodes/deg_node.cc
@@ -31,7 +31,7 @@
 #include "intern/nodes/deg_node.h"
 
 #include <stdio.h>
-#include <string.h>
+#include <cstring>  /* required for STREQ later on. */
 
 #include "BLI_utildefines.h"
 #include "BLI_ghash.h"
@@ -72,7 +72,7 @@ DepsNode::TypeInfo::TypeInfo(eDepsNode_Type type, const char *tname)
 
 DepsNode::DepsNode()
 {
-	name[0] = '\0';
+	name = "";
 }
 
 DepsNode::~DepsNode()
@@ -122,7 +122,7 @@ RootDepsNode::~RootDepsNode()
 	OBJECT_GUARDED_DELETE(time_source, TimeSourceDepsNode);
 }
 
-TimeSourceDepsNode *RootDepsNode::add_time_source(const string &name)
+TimeSourceDepsNode *RootDepsNode::add_time_source(const char *name)
 {
 	if (!time_source) {
 		DepsNodeFactory *factory = deg_get_node_factory(DEPSNODE_TYPE_TIMESOURCE);
@@ -142,12 +142,24 @@ static DepsNodeFactoryImpl<TimeSourceDepsNode> DNTI_TIMESOURCE;
 
 /* ID Node ================================================ */
 
+IDDepsNode::ComponentIDKey::ComponentIDKey(eDepsNode_Type type,
+                                           const char *name)
+        : type(type), name(name)  /* Only the pointer is stored; the string is not copied. */
+{
+}
+
+bool IDDepsNode::ComponentIDKey::operator==(const ComponentIDKey &other) const
+{
+	return type == other.type &&
+	       STREQ(name, other.name);  /* STREQ: names are compared as strings, not as pointers. */
+}
+
 static unsigned int id_deps_node_hash_key(const void *key_v)
 {
 	const IDDepsNode::ComponentIDKey *key =
 	        reinterpret_cast<const IDDepsNode::ComponentIDKey *>(key_v);
 	return hash_combine(BLI_ghashutil_uinthash(key->type),
-	                    BLI_ghashutil_strhash_p(key->name.c_str()));
+	                    BLI_ghashutil_strhash_p(key->name));
 }
 
 static bool id_deps_node_hash_key_cmp(const void *a, const void *b)
@@ -173,7 +185,7 @@ static void id_deps_node_hash_value_free(void *value_v)
 }
 
 /* Initialize 'id' node - from pointer data given. */
-void IDDepsNode::init(const ID *id, const string &UNUSED(subdata))
+void IDDepsNode::init(const ID *id, const char *UNUSED(subdata))
 {
 	/* Store ID-pointer. */
 	BLI_assert(id != NULL);
@@ -204,14 +216,14 @@ IDDepsNode::~IDDepsNode()
 }
 
 ComponentDepsNode *IDDepsNode::find_component(eDepsNode_Type type,
-                                              const string &name) const
+                                              const char *name) const
 {
 	ComponentIDKey key(type, name);
 	return reinterpret_cast<ComponentDepsNode *>(BLI_ghash_lookup(components, &key));
 }
 
 ComponentDepsNode *IDDepsNode::add_component(eDepsNode_Type type,
-                                             const string &name)
+                                             const char *name)
 {
 	ComponentDepsNode *comp_node = find_component(type, name);
 	if (!comp_node) {
@@ -226,7 +238,7 @@ ComponentDepsNode *IDDepsNode::add_component(eDepsNode_Type type,
 	return comp_node;
 }
 
-void IDDepsNode::remove_component(eDepsNode_Type type, const string &name)
+void IDDepsNode::remove_component(eDepsNode_Type type, const char *name)
 {
 	ComponentDepsNode *comp_node = find_component(type, name);
 	if (comp_node) {
@@ -281,7 +293,7 @@ static DepsNodeFactoryImpl<IDDepsNode> DNTI_ID_REF;
 /* Subgraph Node ========================================== */
 
 /* Initialize 'subgraph' node - from pointer data given. */
-void SubgraphDepsNode::init(const ID *id, const string &UNUSED(subdata))
+void SubgraphDepsNode::init(const ID *id, const char *UNUSED(subdata))
 {
 	/* Store ID-ref if provided. */
 	this->root_id = (ID *)id;
diff --git a/source/blender/depsgraph/intern/nodes/deg_node.h b/source/blender/depsgraph/intern/nodes/deg_node.h
index b2262c4..7c2f538 100644
--- a/source/blender/depsgraph/intern/nodes/deg_node.h
+++ b/source/blender/depsgraph/intern/nodes/deg_node.h
@@ -32,6 +32,8 @@
 
 #include "intern/depsgraph_types.h"
 
+#include "BLI_utildefines.h"
+
 struct ID;
 struct GHash;
 struct Scene;
@@ -57,7 +59,7 @@ struct DepsNode {
 	};
 
 	/* Identifier - mainly for debugging purposes. */
-	string name;
+	const char *name;
 
 	/* Structural type of node. */
 	eDepsNode_Type type;
@@ -78,8 +80,9 @@ struct DepsNode {
 	/* Nodes which depend on this one. */
 	Relations outlinks;
 
-	/* Generic tag for traversal algorithms */
+	/* Generic tags for traversal algorithms. */
 	int done;
+	int tag;
 
 	/* Methods. */
 
@@ -90,7 +93,7 @@ struct DepsNode {
 	string full_identifier() const;
 
 	virtual void init(const ID * /*id*/,
-	                  const string &/*subdata*/) {}
+	                  const char * /*subdata*/) {}
 
 	virtual void tag_update(Depsgraph * /*graph*/) {}
 
@@ -129,7 +132,7 @@ struct RootDepsNode : public DepsNode {
 	RootDepsNode();
 	~RootDepsNode();
 
-	TimeSourceDepsNode *add_time_source(const string &name = "");
+	TimeSourceDepsNode *add_time_source(const char *name = "");
 
 	/* scene that this corresponds to */
 	Scene *scene;
@@ -143,26 +146,21 @@ struct RootDepsNode : public DepsNode {
 /* ID-Block Reference */
 struct IDDepsNode : public DepsNode {
 	struct ComponentIDKey {
-		ComponentIDKey(eDepsNode_Type type, const string &name = "")
-		    : type(type), name(name) {}
-
-		bool operator== (const ComponentIDKey &other) const
-		{
-			return type == other.type && name == other.name;
-		}
+		ComponentIDKey(eDepsNode_Type type, const char *name = "");
+		bool operator==(const ComponentIDKey &other) const;
 
 		eDepsNode_Type type;
-		string name;
+		const char *name;
 	};
 
-	void init(const ID *id, const string &subdata);
+	void init(const ID *id, const char *subdata);
 	~IDDepsNode();
 
 	ComponentDepsNode *find_component(eDepsNode_Type type,
-	                                  const string &name = "") const;
+	                                  const char *name = "") const;
 	ComponentDepsNode *add_component(eDepsNode_Type type,
-	                                 const string &name = "");
-	void remove_component(eDepsNode_Type type, const string &name = "");
+	                                 const char *name = "");
+	void remove_component(eDepsNode_Type type, const char *name = "");
 	void clear_components();
 
 	void tag_update(Depsgraph *graph);
@@ -189,7 +187,7 @@ struct IDDepsNode : public DepsNode {
 
 /* Subgraph Reference. */
 struct SubgraphDepsNode : public DepsNode {
-	void init(const ID *id, const string &subdata);
+	void init(const ID *id, const char *subdata);
 	~SubgraphDepsNode();
 
 	/* Instanced graph. */
diff --git a/source/blender/depsgraph/intern/nodes/deg_node_component.cc b/source/blender/depsgraph/intern/nodes/deg_node_component.cc
index 01f33b6..06f91ac 100644
--- a/source/blender/depsgraph/intern/nodes/deg_node_component.cc
+++ b/source/blender/depsgraph/intern/nodes/deg_node_component.cc
@@ -31,7 +31,7 @@
 #include "intern/nodes/deg_node_component.h"
 
 #include <stdio.h>
-#include <string.h>
+#include <cstring>  /* required for STREQ later on. */
 
 extern "C" {
 #include "BLI_utildefines.h"
@@ -53,12 +53,50 @@ namespace DEG {
 
 /* Standard Component Methods ============================= */
 
+ComponentDepsNode::OperationIDKey::OperationIDKey()
+        : opcode(DEG_OPCODE_OPERATION),
+          name(""),
+          name_tag(-1)  /* NOTE(review): -1 presumably means "no name tag" -- confirm. */
+{
+}
+
+ComponentDepsNode::OperationIDKey::OperationIDKey(eDepsOperation_Code opcode)
+        : opcode(opcode),
+          name(""),
+          name_tag(-1)  /* Same defaults as the default constructor, except for opcode. */
+{
+}
+
+ComponentDepsNode::OperationIDKey::OperationIDKey(eDepsOperation_Code opcode,
+                                                 const char *name,
+                                                 int name_tag)
+        : opcode(opcode),
+          name(name),  /* NOTE(review): presumably kept as a bare pointer (no copy) -- confirm in header. */
+          name_tag(name_tag)
+{
+}
+
+string ComponentDepsNode::OperationIDKey::identifier() const
+{
+	char codebuf[5];  /* Room for 4 characters plus the terminator; longer "%d" output does not fit. */
+	BLI_snprintf(codebuf, sizeof(codebuf), "%d", opcode);
+	return string("OperationIDKey(") + codebuf + ", " + name + ")";  /* name_tag is not included. */
+}
+
+bool ComponentDepsNode::OperationIDKey::operator==(
+        const OperationIDKey &other) const
+{
+	return (opcode == other.opcode) &&
+		(STREQ(name, other.name)) &&
+		(name_tag == other.name_tag);  /* name_tag counts here, but comp_node_hash_key() ignores it. */
+}
+
 static unsigned int comp_node_hash_key(const void *key_v)
 {
 	const ComponentDepsNode::OperationIDKey *key =
 	        reinterpret_cast<const ComponentDepsNode::OperationIDKey *>(key_v);
 	return hash_combine(BLI_ghashutil_uinthash(key->opcode),
-	                    BLI_ghashutil_strhash_p(key->name.c_str()));
+	                    BLI_ghashutil_strhash_p(key->name));
 }
 
 static bool comp_node_hash_key_cmp(const void *a, const void *b)
@@ -95,7 +133,7 @@ ComponentDepsNode::ComponentDepsNode() :
 
 /* Initialize 'component' node - from pointer data given */
 void ComponentDepsNode::init(const ID * /*id*/,
-                             const string & /*subdata*/)
+                             const char * /*subdata*/)
 {
 	/* hook up eval context? */
 	// XXX: maybe this needs a special API?
@@ -114,7 +152,7 @@ ComponentDepsNode::~ComponentDepsNode()
 
 string ComponentDepsNode::identifier() const
 {
-	string &idname = this->owner->name;
+	string idname = this->owner->name;
 
 	char typebuf[16];
 	sprintf(typebuf, "(%d)", type);
@@ -139,9 +177,11 @@ OperationDepsNode *ComponentDepsNode::find_operation(OperationIDKey key) const
 	}
 }
 
-OperationDepsNode *ComponentDepsNode::find_operation(eDepsOperation_Code opcode, const string &name) const
+OperationDepsNode *ComponentDepsNode::find_operation(eDepsOperation_Code opcode,
+                                                     const char *name,
+                                                     int name_tag) const
 {
-	OperationIDKey key(opcode, name);
+	OperationIDKey key(opcode, name, name_tag);
 	return find_operation(key);
 }
 
@@ -151,21 +191,26 @@ OperationDepsNode *ComponentDepsNode::has_operation(OperationIDKey key) const
 }
 
 OperationDepsNode *ComponentDepsNode::has_operation(eDepsOperation_Code opcode,
-                                                    const string &name) const
+                                                    const char *name,
+                                                    int name_tag) const
 {
-	OperationIDKey key(opcode, name);
+	OperationIDKey key(opcode, name, name_tag);
 	return has_operation(key);
 }
 
-OperationDepsNode *ComponentDepsNode::add_operation(eDepsOperation_Type optype, DepsEvalOperationCb op, eDepsOperation_Code opcode, const string &name)
+OperationDepsNode *ComponentDepsNode::add_operation(eDepsOperation_Type optype,
+                                                    DepsEvalOperationCb op,
+                                                    eDepsOperation_Code opcode,
+                                                    const char *name,
+                                                    int name_tag)
 {
-	OperationDepsNode *op_node = has_operation(opcode, name);
+	OperationDepsNode *op_node = has_operation(opcode, name, name_tag);
 	if (!op_node) {
 		DepsNodeFactory *factory = deg_get_node_factory(DEPSNODE_TYPE_OPERATION);
 		op_node = (OperationDepsNode *)factory->create_node(this->owner->id, "", name);
 
 		/* register opnode in this component's operation set */
-		OperationIDKey *key = OBJECT_GUARDED_NEW(OperationIDKey, opcode, name);
+		OperationIDKey *key = OBJECT_GUARDED_NEW(OperationIDKey, opcode, name, name_tag);
 		BLI_ghash_insert(operations_map, key, op_node);
 
 		/* set as entry/exit node of component (if appropriate) */
@@ -197,16 +242,6 @@ OperationDepsNode *ComponentDepsNode::add_operation(eDepsOperation_Type optype,
 	return op_node;
 }
 
-void ComponentDepsNode::remove_operation(eDepsOperation_Code opcode, const string &name)
-{
-	/* unregister */
-	OperationIDKey key(opcode, name);
-	BLI_ghash_remove(operations_map,
-	                 &key,
-	                 comp_node_hash_key_free,
-	                 comp_node_hash_key_free);
-}
-
 void ComponentDepsNode::clear_operations()
 {
 	if (operations_map != NULL) {
@@ -337,7 +372,7 @@ static DepsNodeFactoryImpl<PoseComponentDepsNode> DNTI_EVAL_POSE;
 /* Bone Component ========================================= */
 
 /* Initialize 'bone component' node - from pointer data given */
-void BoneComponentDepsNode::init(const ID *id, const string &subdata)
+void BoneComponentDepsNode::init(const ID *id, const char *subdata)
 {
 	/* generic component-node... */
 	ComponentDepsNode::init(id, subdata);
@@ -350,7 +385,7 @@ void BoneComponentDepsNode::init(const ID *id, const string &subdata)
 
 	/* bone-specific node data */
 	Object *ob = (Object *)id;
-	this->pchan = BKE_pose_channel_find_name(ob->pose, subdata.c_str());
+	this->pchan = BKE_pose_channel_find_name(ob->pose, subdata);
 }
 
 DEG_DEPSNODE_DEFINE(BoneComponentDepsNode, DEPSNODE_TYPE_BONE, "Bone Component");
diff --git a/source/blender/depsgraph/intern/nodes/deg_node_component.h b/source/blender/depsgraph/intern/nodes/deg_node_component.h
index 7dec8ea..969771a 100644
--- a/source/blender/depsgraph/intern/nodes/deg_node_component.h
+++ b/source/blender/depsgraph/intern/nodes/deg_node_component.h
@@ -53,50 +53,38 @@ struct ComponentDepsNode : public DepsNode {
 	struct OperationIDKey
 	{
 		eDepsOperation_Code opcode;
-		string name;
-
-
-		OperationIDKey() :
-			opcode(DEG_OPCODE_OPERATION), name("")
-		{}
-		OperationIDKey(eDepsOperation_Code opcode) :
-			opcode(opcode), name("")
-		{}
-		OperationIDKey(eDepsOperation_Code opcode, const string &name) :
-		   opcode(opcode), name(name)
-		{}
-
-		string identifier() const
-		{
-			char codebuf[5];
-			BLI_snprintf(codebuf, sizeof(codebuf), "%d", opcode);
-
-			return string("OperationIDKey(") + codebuf + ", " + name + ")";
-		}
-
-		bool operator==(const OperationIDKey &other) const
-		{
-			return (opcode == other.opcode) && (name == other.name);
-		}
+		const char *name;
+		int name_tag;
+
+		OperationIDKey();
+		OperationIDKey(eDepsOperation_Code opcode);
+		OperationIDKey(eDepsOperation_Code opcode,
+		               const char *name,
+		               int name_tag);
+
+		string identifier() const;
+		bool operator==(const OperationIDKey &other) const;
 	};
 
 	/* Typedef for container of operations */
 	ComponentDepsNode();
 	~ComponentDepsNode();
 
-	void init(const ID *id, const string &subdata);
+	void init(const ID *id, const char *subdata);
 
 	string identifier() const;
 
 	/* Find an existing operation, will throw an assert() if it does not exist. */
 	OperationDepsNode *find_operation(OperationIDKey key) const;
 	OperationDepsNode *find_operation(eDepsOperation_Code opcode,
-	                                  const string &name) const;
+	                                  const char *name,
+	                                  int name_tag) const;
 
 	/* Check operation exists and return it. */
 	OperationDepsNode *has_operation(OperationIDKey key) const;
 	OperationDepsNode *has_operation(eDepsOperation_Code opcode,
-	                                 const string &name) const;
+	                                 const char *name,
+	                                 int name_tag) const;
 
 	/**
 	 * Create a new node for representing an operation and add this to graph
@@ -114,9 +102,9 @@ struct ComponentDepsNode : public DepsNode {
 	OperationDepsNode *add_operation(eDepsOperation_Type optype,
 	                                 DepsEvalOperationCb op,
 	                                 eDepsOperation_Code opcode,
-	                                 const string &name);
+	                                 const char *name,
+	                                 int name_tag);
 
-	void remove_operation(eDepsOperation_Code opcode, const string &name);
 	void clear_operations();
 
 	void tag_update(Depsgraph *graph);
@@ -194,7 +182,7 @@ struct PoseComponentDepsNode : public ComponentDepsNode {
 
 /* Bone Component */
 struct BoneComponentDepsNode : public ComponentDepsNode {
-	void init(const ID *id, const string &subdata);
+	void init(const ID *id, const char *subdata);
 
 	struct bPoseChannel *pchan;     /* the bone that this component represents */
 
diff --git a/source/blender/depsgraph/intern/nodes/deg_node_operation.cc b/source/blender/depsgraph/intern/nodes/deg_node_operation.cc
index 5847af2..9eed4df 100644
--- a/source/blender/depsgraph/intern/nodes/deg_node_operation.cc
+++ b/source/blender/depsgraph/intern/nodes/deg_node_operation.cc
@@ -68,7 +68,7 @@ string OperationDepsNode::full_identifier() const
 {
 	string owner_str = "";
 	if (owner->type == DEPSNODE_TYPE_BONE) {
-		owner_str = owner->owner->name + "." + owner->name;
+		owner_str = string(owner->owner->name) + "." + owner->name;
 	}
 	else {
 		owner_str = owner->owner->name;
diff --git a/source/blender/depsgraph/util/deg_util_foreach.h b/source/blender/depsgraph/util/deg_util_foreach.h
index 14cf4fc..87d3716 100644
--- a/source/blender/depsgraph/util/deg_util_foreach.h
+++ b/source/blender/depsgraph/util/deg_util_foreach.h
@@ -66,3 +66,8 @@
 #define GSET_FOREACH_END() \
 		} \
 	} while(0)
+
+#define LINKLIST_FOREACH(type, var, list)          \
+	for (type var = (type)((list)->first);     \
+	     var != NULL;                          \
+	     var = (type)(((Link*)(var))->next))
diff --git a/source/blender/editors/space_file/filelist.c b/source/blender/editors/space_file/filelist.c
index ab95a77..f8ba619 100644
--- a/source/blender/editors/space_file/filelist.c
+++ b/source/blender/editors/space_file/filelist.c
@@ -2509,7 +2509,7 @@ static void filelist_readjob_do(
 			 * Using an atomic operation to avoid having to lock thread...
 			 * Note that we do not really need this here currently, since there is a single listing thread, but better
 			 * remain consistent about threading! */
-			*((uint32_t *)entry->uuid) = atomic_add_uint32((uint32_t *)filelist->filelist_intern.curr_uuid, 1);
+			*((uint32_t *)entry->uuid) = atomic_add_and_fetch_uint32((uint32_t *)filelist->filelist_intern.curr_uuid, 1);
 
 			/* Only thing we change in direntry here, so we need to free it first. */
 			MEM_freeN(entry->relpath);
diff --git a/source/blender/editors/space_outliner/outliner_tree.c b/source/blender/editors/space_outliner/outliner_tree.c
index 96bab3d..17aefac 100644
--- a/source/blender/editors/space_outliner/outliner_tree.c
+++ b/source/blender/editors/space_outliner/outliner_tree.c
@@ -1108,8 +1108,12 @@ static TreeElement *outliner_add_element(SpaceOops *soops, ListBase *lb, void *i
 					tselem->flag &= ~TSE_CLOSED;
 
 			if (TSELEM_OPEN(tselem, soops)) {
-				for (a = 0; a < tot; a++)
-					outliner_add_element(soops, &te->subtree, (void *)ptr, te, TSE_RNA_PROPERTY, a);
+				for (a = 0; a < tot; a++) {
+					RNA_property_collection_lookup_int(ptr, iterprop, a, &propptr);
+					if (!(RNA_property_flag(propptr.data) & PROP_HIDDEN)) {
+						outliner_add_element(soops, &te->subtree, (void *)ptr, te, TSE_RNA_PROPERTY, a);
+					}
+				}
 			}
 			else if (tot)
 				te->flag |= TE_LAZY_CLOSED;
diff --git a/source/blender/gpu/shaders/gpu_shader_material.glsl b/source/blender/gpu/shaders/gpu_shader_material.glsl
index 67da820..549c979 100644
--- a/source/blender/gpu/shaders/gpu_shader_material.glsl
+++ b/source/blender/gpu/shaders/gpu_shader_material.glsl
@@ -1358,7 +1358,7 @@ void mtex_cube_map_refl_from_refldir(
         samplerCube ima, vec3 reflecteddirection, out float value, out vec4 color)
 {
         color = textureCube(ima, reflecteddirection);
-        value = 1.0;
+        value = color.a;
 }
 
 void mtex_cube_map_refl(
diff --git a/source/blender/imbuf/intern/oiio/CMakeLists.txt b/source/blender/imbuf/intern/oiio/CMakeLists.txt
index c873fa3..a4fb9c5 100644
--- a/source/blender/imbuf/intern/oiio/CMakeLists.txt
+++ b/source/blender/imbuf/intern/oiio/CMakeLists.txt
@@ -49,6 +49,11 @@ if(WITH_OPENIMAGEIO)
 		${OPENIMAGEIO_INCLUDE_DIRS}
 		${BOOST_INCLUDE_DIR}
 	)
+	if(WITH_IMAGE_OPENEXR)
+		list(APPEND INC_SYS
+			${OPENEXR_INCLUDE_DIRS}
+		)
+	endif()
 	add_definitions(-DWITH_OPENIMAGEIO)
 endif()
 
diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h
index 02a0b41..0f71dff 100644
--- a/source/blender/makesdna/DNA_userdef_types.h
+++ b/source/blender/makesdna/DNA_userdef_types.h
@@ -867,14 +867,6 @@ typedef enum eNdof_Flag {
 
 #define NDOF_PIXELS_PER_SECOND 600.0f
 
-/* compute_device_type */
-typedef enum eCompute_Device_Type {
-	USER_COMPUTE_DEVICE_NONE	= 0,
-	USER_COMPUTE_DEVICE_OPENCL	= 1,
-	USER_COMPUTE_DEVICE_CUDA	= 2,
-} eCompute_Device_Type;
-
-	
 typedef enum eMultiSample_Type {
 	USER_MULTISAMPLE_NONE	= 0,
 	USER_MULTISAMPLE_2	= 2,
diff --git a/source/blender/makesrna/intern/rna_main.c b/source/blender/makesrna/intern/rna_main.c
index bf01619..33246c0 100644
--- a/source/blender/makesrna/intern/rna_main.c
+++ b/source/blender/makesrna/intern/rna_main.c
@@ -399,7 +399,7 @@ void RNA_def_main(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "Use Autopack", "Automatically pack all external data into .blend file");
 
 	prop = RNA_def_int_vector(srna, "version", 3, NULL, 0, INT_MAX,
-	                   "Version", "Version of the blender the .blend was saved with", 0, INT_MAX);
+	                   "Version", "Version of Blender the .blend was saved with", 0, INT_MAX);
 	RNA_def_property_int_funcs(prop, "rna_Main_version_get", NULL, NULL);
 	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
 	RNA_def_property_flag(prop, PROP_THICK_WRAP);
diff --git a/source/blender/makesrna/intern/rna_mesh_api.c b/source/blender/makesrna/intern/rna_mesh_api.c
index a3bc21b..97a618b 100644
--- a/source/blender/makesrna/intern/rna_mesh_api.c
+++ b/source/blender/makesrna/intern/rna_mesh_api.c
@@ -240,6 +240,9 @@ void RNA_api_mesh(StructRNA *srna)
 	func = RNA_def_function(srna, "free_normals_split", "rna_Mesh_free_normals_split");
 	RNA_def_function_ui_description(func, "Free split vertex normals");
 
+	func = RNA_def_function(srna, "split_faces", "BKE_mesh_split_faces");
+	RNA_def_function_ui_description(func, "Split faces based on the edge angle");
+
 	func = RNA_def_function(srna, "calc_tangents", "rna_Mesh_calc_tangents");
 	RNA_def_function_flag(func, FUNC_USE_REPORTS);
 	RNA_def_function_ui_description(func,
diff --git a/source/blender/makesrna/intern/rna_render.c b/source/blender/makesrna/intern/rna_render.c
index 8438270..12f20f5 100644
--- a/source/blender/makesrna/intern/rna_render.c
+++ b/source/blender/makesrna/intern/rna_render.c
@@ -78,9 +78,14 @@ EnumPropertyItem rna_enum_render_pass_type_items[] = {
 };
 
 EnumPropertyItem rna_enum_render_pass_debug_type_items[] = {
-	{RENDER_PASS_DEBUG_BVH_TRAVERSAL_STEPS, "BVH_TRAVERSAL_STEPS", 0, "BVH Traversal Steps", ""},
-	{RENDER_PASS_DEBUG_BVH_TRAVERSED_INSTANCES, "BVH_TRAVERSED_INSTANCES", 0, "BVH Traversed Instances", ""},
-	{RENDER_PASS_DEBUG_RAY_BOUNCES, "RAY_BOUNCES", 0, "Ray Steps", ""},
+	{RENDER_PASS_DEBUG_BVH_TRAVERSED_NODES, "BVH_TRAVERSED_NODES", 0, "BVH Traversed Nodes",
+	 "Number of nodes traversed in BVH for the camera rays"},
+	{RENDER_PASS_DEBUG_BVH_TRAVERSED_INSTANCES, "BVH_TRAVERSED_INSTANCES", 0, "BVH Traversed Instances",
+	 "Number of BVH instances traversed by camera rays"},
+	{RENDER_PASS_DEBUG_BVH_INTERSECTIONS, "BVH_INTERSECTIONS", 0, "BVH Intersections",
+	 "Number of primitive intersections performed by the camera rays"},
+	{RENDER_PASS_DEBUG_RAY_BOUNCES, "RAY_BOUNCES", 0, "Ray Steps",
+	 "Number of bounces done by the main integration loop"},
 	{0, NULL, 0, NULL, NULL}
 };
 
diff --git a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c
index 73a5ae5..b16e886 100644
--- a/source/blender/makesrna/intern/rna_userdef.c
+++ b/source/blender/makesrna/intern/rna_userdef.c
@@ -52,15 +52,6 @@
 #include "BLT_lang.h"
 #include "GPU_buffers.h"
 
-#ifdef WITH_CYCLES
-static EnumPropertyItem compute_device_type_items[] = {
-	{USER_COMPUTE_DEVICE_NONE, "NONE", 0, "None", "Don't use compute device"},
-	{USER_COMPUTE_DEVICE_CUDA, "CUDA", 0, "CUDA", "Use CUDA for GPU acceleration"},
-	{USER_COMPUTE_DEVICE_OPENCL, "OPENCL", 0, "OpenCL", "Use OpenCL for GPU acceleration"},
-	{ 0, NULL, 0, NULL, NULL}
-};
-#endif
-
 #ifdef WITH_OPENSUBDIV
 static EnumPropertyItem opensubdiv_compute_type_items[] = {
 	{USER_OPENSUBDIV_COMPUTE_NONE, "NONE", 0, "None", ""},
@@ -124,8 +115,6 @@ static EnumPropertyItem rna_enum_language_default_items[] = {
 
 #include "UI_interface.h"
 
-#include "CCL_api.h"
-
 #ifdef WITH_OPENSUBDIV
 #  include "opensubdiv_capi.h"
 #endif
@@ -135,6 +124,14 @@ static EnumPropertyItem rna_enum_language_default_items[] = {
 #endif
 
 
+static void rna_userdef_version_get(PointerRNA *ptr, int *value)
+{
+	UserDef *userdef = (UserDef *)ptr->data;  /* Getter for the 3-element "version" property; fills value[0..2]. */
+	value[0] = userdef->versionfile / 100;  /* versionfile with its last two decimal digits dropped. */
+	value[1] = userdef->versionfile % 100;  /* Last two decimal digits of versionfile. */
+	value[2] = userdef->subversionfile;  /* subversionfile is passed through unchanged. */
+}
+
 static void rna_userdef_update(Main *UNUSED(bmain), Scene *UNUSED(scene), PointerRNA *UNUSED(ptr))
 {
 	WM_main_add_notifier(NC_WINDOW, NULL);
@@ -476,78 +473,6 @@ static PointerRNA rna_Theme_space_list_generic_get(PointerRNA *ptr)
 }
 
 
-#ifdef WITH_CYCLES
-static EnumPropertyItem *rna_userdef_compute_device_type_itemf(bContext *UNUSED(C), PointerRNA *UNUSED(ptr),
-                                                               PropertyRNA *UNUSED(prop), bool *r_free)
-{
-	EnumPropertyItem *item = NULL;
-	int totitem = 0;
-
-	/* add supported device types */
-	RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_NONE);
-	if (CCL_compute_device_list(0))
-		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_CUDA);
-	if (CCL_compute_device_list(1))
-		RNA_enum_items_add_value(&item, &totitem, compute_device_type_items, USER_COMPUTE_DEVICE_OPENCL);
-
-	RNA_enum_item_end(&item, &totitem);
-	*r_free = true;
-
-	return item;
-}
-
-static int rna_userdef_compute_device_get(PointerRNA *UNUSED(ptr))
-{
-	if (U.compute_device_type == USER_COMPUTE_DEVICE_NONE)
-		return 0;
-
-	return U.compute_device_id;
-}
-
-static EnumPropertyItem *rna_userdef_compute_device_itemf(bContext *UNUSED(C), PointerRNA *UNUSED(ptr),
-                                                          PropertyRNA *UNUSED(prop), bool *r_free)
-{
-	EnumPropertyItem tmp = {0, "", 0, "", ""};
-	EnumPropertyItem *item = NULL;
-	int totitem = 0;
-	
-	if (U.compute_device_type == USER_COMPUTE_DEVICE_NONE) {
-		/* only add a single CPU device */
-		tmp.value = 0;
-		tmp.name = "CPU";
-		tmp.identifier = "CPU";
-		RNA_enum_item_add(&item, &totitem, &tmp);
-	}
-	else {
-		/* get device list from cycles. it would be good to make this generic
-		 * once we have more subsystems using opencl, for now this is easiest */
-		int opencl = (U.compute_device_type == USER_COMPUTE_DEVICE_OPENCL);
-		CCLDeviceInfo *devices = CCL_compute_device_list(opencl);
-		int a;
-
-		if (devices) {
-			for (a = 0; devices[a].identifier[0]; a++) {
-				tmp.value = devices[a].value;
-				tmp.identifier = devices[a].identifier;
-				tmp.name = devices[a].name;
-				RNA_enum_item_add(&item, &totitem, &tmp);
-			}
-		}
-		else {
-			tmp.value = 0;
-			tmp.name = "CPU";
-			tmp.identifier = "CPU";
-			RNA_enum_item_add(&item, &totitem, &tmp);
-		}
-	}
-
-	RNA_enum_item_end(&item, &totitem);
-	*r_free = true;
-
-	return item;
-}
-#endif
-
 #ifdef WITH_OPENSUBDIV
 static EnumPropertyItem *rna_userdef_opensubdiv_compute_type_itemf(bContext *UNUSED(C), PointerRNA *UNUSED(ptr),
                                                                    PropertyRNA *UNUSED(prop), bool *r_free)
@@ -3967,13 +3892,6 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 		{0, NULL, 0, NULL, NULL}
 	};
 
-#ifdef WITH_CYCLES
-	static EnumPropertyItem compute_device_items[] = {
-		{0, "CPU", 0, "CPU", ""},
-		{ 0, NULL, 0, NULL, NULL}
-	};
-#endif
-
 	static EnumPropertyItem image_draw_methods[] = {
 		{IMAGE_DRAW_METHOD_2DTEXTURE, "2DTEXTURE", 0, "2D Texture", "Use CPU for display transform and draw image with 2D texture"},
 		{IMAGE_DRAW_METHOD_GLSL, "GLSL", 0, "GLSL", "Use GLSL shaders for display transform and draw image with 2D texture"},
@@ -4265,23 +4183,6 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 	                         "Draw tool/property regions over the main region, when using Triple Buffer");
 	RNA_def_property_update(prop, 0, "rna_userdef_dpi_update");	
 
-#ifdef WITH_CYCLES
-	prop = RNA_def_property(srna, "compute_device_type", PROP_ENUM, PROP_NONE);
-	RNA_def_property_flag(prop, PROP_ENUM_NO_CONTEXT);
-	RNA_def_property_enum_sdna(prop, NULL, "compute_device_type");
-	RNA_def_property_enum_items(prop, compute_device_type_items);
-	RNA_def_property_enum_funcs(prop, NULL, NULL, "rna_userdef_compute_device_type_itemf");
-	RNA_def_property_ui_text(prop, "Compute Device Type", "Device to use for computation (rendering with Cycles)");
-	RNA_def_property_update(prop, NC_SPACE | ND_SPACE_PROPERTIES, NULL);
-
-	prop = RNA_def_property(srna, "compute_device", PROP_ENUM, PROP_NONE);
-	RNA_def_property_flag(prop, PROP_ENUM_NO_CONTEXT);
-	RNA_def_property_enum_sdna(prop, NULL, "compute_device_id");
-	RNA_def_property_enum_items(prop, compute_device_items);
-	RNA_def_property_enum_funcs(prop, "rna_userdef_compute_device_get", NULL, "rna_userdef_compute_device_itemf");
-	RNA_def_property_ui_text(prop, "Compute Device", "Device to use for computation");
-#endif
-
 #ifdef WITH_OPENSUBDIV
 	prop = RNA_def_property(srna, "opensubdiv_compute_type", PROP_ENUM, PROP_NONE);
 	RNA_def_property_flag(prop, PROP_ENUM_NO_CONTEXT);
@@ -4291,6 +4192,14 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "OpenSubdiv Compute Type", "Type of computer back-end used with OpenSubdiv");
 	RNA_def_property_update(prop, NC_SPACE | ND_SPACE_PROPERTIES, "rna_userdef_opensubdiv_update");
 #endif
+
+#ifdef WITH_CYCLES
+	prop = RNA_def_property(srna, "legacy_compute_device_type", PROP_INT, PROP_NONE);
+	RNA_def_property_int_sdna(prop, NULL, "compute_device_type");
+	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
+	RNA_def_property_flag(prop, PROP_HIDDEN);
+	RNA_def_property_ui_text(prop, "Legacy Compute Device Type", "For backwards compatibility only");
+#endif
 }
 
 static void rna_def_userdef_input(BlenderRNA *brna)
@@ -4808,6 +4717,12 @@ void RNA_def_userdef(BlenderRNA *brna)
 	RNA_def_property_pointer_funcs(prop, "rna_UserDef_system_get", NULL, NULL, NULL);
 	RNA_def_property_ui_text(prop, "System & OpenGL", "Graphics driver and operating system settings");
 	
+	prop = RNA_def_int_vector(srna, "version", 3, NULL, 0, INT_MAX,
+	                   "Version", "Version of Blender the userpref.blend was saved with", 0, INT_MAX);
+	RNA_def_property_int_funcs(prop, "rna_userdef_version_get", NULL, NULL);
+	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
+	RNA_def_property_flag(prop, PROP_THICK_WRAP);
+
 	rna_def_userdef_view(brna);
 	rna_def_userdef_edit(brna);
 	rna_def_userdef_input(brna);
diff --git a/source/blender/modifiers/intern/MOD_hook.c b/source/blender/modifiers/intern/MOD_hook.c
index 83c4ca7..9186b10 100644
--- a/source/blender/modifiers/intern/MOD_hook.c
+++ b/source/blender/modifiers/intern/MOD_hook.c
@@ -145,12 +145,9 @@ static void updateDepsgraph(ModifierData *md,
 	HookModifierData *hmd = (HookModifierData *)md;
 	if (hmd->object != NULL) {
 		if (hmd->subtarget[0]) {
-			DEG_add_bone_relation(node, hmd->object, hmd->subtarget, DEG_OB_COMP_TRANSFORM, "Hook Modifier");
 			DEG_add_bone_relation(node, hmd->object, hmd->subtarget, DEG_OB_COMP_BONE, "Hook Modifier");
 		}
-		else {
-			DEG_add_object_relation(node, hmd->object, DEG_OB_COMP_TRANSFORM, "Hook Modifier");
-		}
+		DEG_add_object_relation(node, hmd->object, DEG_OB_COMP_TRANSFORM, "Hook Modifier");
 	}
 	/* We need own transformation as well. */
 	DEG_add_object_relation(node, ob, DEG_OB_COMP_TRANSFORM, "Hook Modifier");
diff --git a/source/blender/nodes/shader/nodes/node_shader_light_path.c b/source/blender/nodes/shader/nodes/node_shader_light_path.c
index b1001cd..052f2a6 100644
--- a/source/blender/nodes/shader/nodes/node_shader_light_path.c
+++ b/source/blender/nodes/shader/nodes/node_shader_light_path.c
@@ -39,6 +39,8 @@ static bNodeSocketTemplate sh_node_light_path_out[] = {
 	{	SOCK_FLOAT, 0, N_("Is Transmission Ray"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
 	{	SOCK_FLOAT, 0, N_("Ray Length"),			0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
 	{	SOCK_FLOAT, 0, N_("Ray Depth"),				0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
+	{	SOCK_FLOAT, 0, N_("Diffuse Depth"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
+	{	SOCK_FLOAT, 0, N_("Glossy Depth"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
 	{	SOCK_FLOAT, 0, N_("Transparent Depth"),		0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
 	{	SOCK_FLOAT, 0, N_("Transmission Depth"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
 	{	-1, 0, ""	}
diff --git a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
index bb7f216..0be47c4 100644
--- a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
+++ b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
@@ -36,6 +36,7 @@ static bNodeSocketTemplate sh_node_tex_brick_in[] = {
 	{ 	SOCK_RGBA, 1, 	N_("Mortar"), 		0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
 	{	SOCK_FLOAT, 1,  N_("Scale"),		5.0f, 0.0f, 0.0f, 0.0f, -1000.0f, 1000.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
 	{	SOCK_FLOAT, 1,  N_("Mortar Size"),	0.02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.125f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
+	{	SOCK_FLOAT, 1,  N_("Mortar Smooth"),	0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
 	{	SOCK_FLOAT, 1,  N_("Bias"),		    0.0f, 0.0f, 0.0f, 0.0f, -1.0f, 1.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
 	{	SOCK_FLOAT, 1,  N_("Brick Width"),	0.5f, 0.0f, 0.0f, 0.0f, 0.01f, 100.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
 	{	SOCK_FLOAT, 1,  N_("Row Height"),   0.25f, 0.0f, 0.0f, 0.0f, 0.01f, 100.0f, PROP_NONE, SOCK_NO_INTERNAL_LINK},
@@ -60,6 +61,12 @@ static void node_shader_init_tex_brick(bNodeTree *UNUSED(ntree), bNode *node)
 	tex->squash_freq = 2;
 
 	node->storage = tex;
+
+	for (bNodeSocket *sock = node->inputs.first; sock; sock = sock->next) {
+		if (STREQ(sock->name, "Mortar Smooth")) {
+			((bNodeSocketValueFloat*)sock->default_value)->value = 0.1f;
+		}
+	}
 }
 
 static int node_shader_gpu_tex_brick(GPUMaterial *mat, bNode *node, bNodeExecData *UNUSED(execdata), GPUNodeStack *in, GPUNodeStack *out)
diff --git a/source/blender/render/extern/include/RE_pipeline.h b/source/blender/render/extern/include/RE_pipeline.h
index 509ad6f..85311bd 100644
--- a/source/blender/render/extern/include/RE_pipeline.h
+++ b/source/blender/render/extern/include/RE_pipeline.h
@@ -98,9 +98,10 @@ typedef struct RenderPass {
 } RenderPass;
 
 enum {
-	RENDER_PASS_DEBUG_BVH_TRAVERSAL_STEPS = 0,
+	RENDER_PASS_DEBUG_BVH_TRAVERSED_NODES = 0,
 	RENDER_PASS_DEBUG_BVH_TRAVERSED_INSTANCES = 1,
 	RENDER_PASS_DEBUG_RAY_BOUNCES = 2,
+	RENDER_PASS_DEBUG_BVH_INTERSECTIONS = 3,
 };
 
 /* a renderlayer is a full image, but with all passes and samples */
diff --git a/source/blender/render/intern/source/render_result.c b/source/blender/render/intern/source/render_result.c
index 6ea46af..bdb3b58 100644
--- a/source/blender/render/intern/source/render_result.c
+++ b/source/blender/render/intern/source/render_result.c
@@ -550,10 +550,12 @@ RenderPass *gp_add_pass(RenderResult *rr, RenderLayer *rl, int channels, int pas
 const char *RE_debug_pass_name_get(int debug_type)
 {
 	switch (debug_type) {
-		case RENDER_PASS_DEBUG_BVH_TRAVERSAL_STEPS:
-			return "BVH Traversal Steps";
+		case RENDER_PASS_DEBUG_BVH_TRAVERSED_NODES:
+			return "BVH Traversed Nodes";
 		case RENDER_PASS_DEBUG_BVH_TRAVERSED_INSTANCES:
 			return "BVH Traversed Instances";
+		case RENDER_PASS_DEBUG_BVH_INTERSECTIONS:
+			return "BVH Primitive Intersections";
 		case RENDER_PASS_DEBUG_RAY_BOUNCES:
 			return "Ray Bounces";
 	}
diff --git a/source/blenderplayer/bad_level_call_stubs/stubs.c b/source/blenderplayer/bad_level_call_stubs/stubs.c
index d8a4ddc..6040dff 100644
--- a/source/blenderplayer/bad_level_call_stubs/stubs.c
+++ b/source/blenderplayer/bad_level_call_stubs/stubs.c
@@ -142,7 +142,6 @@ struct wmWindowManager;
 #  pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
-#include "../../intern/cycles/blender/CCL_api.h"
 #include "../../intern/dualcon/dualcon.h"
 #include "../../intern/elbeem/extern/elbeem.h"
 #include "../blender/blenkernel/BKE_modifier.h"
@@ -770,10 +769,6 @@ void *dualcon(const DualConInput *input_mesh,
               float scale,
               int depth) RET_ZERO
 
-/* intern/cycles */
-struct CCLDeviceInfo;
-struct CCLDeviceInfo *CCL_compute_device_list(int opencl) RET_NULL
-
 /* compositor */
 void COM_execute(RenderData *rd, Scene *scene, bNodeTree *editingtree, int rendering,
                  const ColorManagedViewSettings *viewSettings, const ColorManagedDisplaySettings *displaySettings,
diff --git a/source/creator/CMakeLists.txt b/source/creator/CMakeLists.txt
index f65688e..2df5ddc 100644
--- a/source/creator/CMakeLists.txt
+++ b/source/creator/CMakeLists.txt
@@ -713,10 +713,7 @@ elseif(WIN32)
 			)
 
 			if(WITH_PYTHON_INSTALL_NUMPY)
-				set(PYTHON_NUMPY_VERSION 1.9)
-				if(MSVC_VERSION EQUAL 1900)
-					set(PYTHON_NUMPY_VERSION 1.11)
-				endif()
+				set(PYTHON_NUMPY_VERSION 1.10)
 				add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${BLENDER_VERSION}/python/lib/site-packages
 					COMMAND ${CMAKE_COMMAND} -E
 					        make_directory ${CMAKE_CURRENT_BINARY_DIR}/${BLENDER_VERSION}/python/lib/site-packages)
@@ -830,11 +827,12 @@ elseif(WIN32)
 		else()
 			install(
 				FILES
-					${LIBDIR}/ffmpeg/lib/avcodec-55.dll
-					${LIBDIR}/ffmpeg/lib/avformat-55.dll
-					${LIBDIR}/ffmpeg/lib/avdevice-55.dll
-					${LIBDIR}/ffmpeg/lib/avutil-52.dll
-					${LIBDIR}/ffmpeg/lib/swscale-2.dll
+					${LIBDIR}/ffmpeg/lib/avcodec-57.dll
+					${LIBDIR}/ffmpeg/lib/avformat-57.dll
+					${LIBDIR}/ffmpeg/lib/avdevice-57.dll
+					${LIBDIR}/ffmpeg/lib/avutil-55.dll
+					${LIBDIR}/ffmpeg/lib/swscale-4.dll
+					${LIBDIR}/ffmpeg/lib/swresample-2.dll
 				DESTINATION "."
 			)
 		endif()
diff --git a/source/gameengine/VideoTexture/VideoDeckLink.cpp b/source/gameengine/VideoTexture/VideoDeckLink.cpp
index 4f5e348..c588a4b 100644
--- a/source/gameengine/VideoTexture/VideoDeckLink.cpp
+++ b/source/gameengine/VideoTexture/VideoDeckLink.cpp
@@ -544,12 +544,12 @@ HRESULT STDMETHODCALLTYPE	PinnedMemoryAllocator::QueryInterface(REFIID /*iid*/,
 
 ULONG STDMETHODCALLTYPE		PinnedMemoryAllocator::AddRef(void)
 {
-	return atomic_add_uint32(&mRefCount, 1U);
+	return atomic_add_and_fetch_uint32(&mRefCount, 1U);
 }
 
 ULONG STDMETHODCALLTYPE		PinnedMemoryAllocator::Release(void)
 {
-	uint32_t newCount = atomic_sub_uint32(&mRefCount, 1U);
+	uint32_t newCount = atomic_sub_and_fetch_uint32(&mRefCount, 1U);
 	if (newCount == 0)
 		delete this;
 	return (ULONG)newCount;
diff --git a/tests/python/cycles_render_tests.py b/tests/python/cycles_render_tests.py
index 78b4b34..fa4b3f2 100755
--- a/tests/python/cycles_render_tests.py
+++ b/tests/python/cycles_render_tests.py
@@ -15,6 +15,7 @@ def render_file(filepath):
         "--background",
         "-noaudio",
         "--factory-startup",
+        "--enable-autoexec",
         filepath,
         "-E", "CYCLES",
         # Run with OSL enabled

-- 
blender packaging