Fix typo and change .enabled to .active

Use parentheses for bit shifts
Remove unused weight_components
2017-02-18 04:12:29 -02:00 · 2017-02-14 18:13:08 -02:00 · 2017-02-14 18:13:08 -02:00 · 2017-02-14 18:13:08 -02:00 · 2017-02-06 01:23:18 -02:00 · 2017-01-25 04:05:53 -02:00
940 changed files with 16993 additions and 28389 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,7 +445,6 @@ option(WITH_BOOST					"Enable features depending on boost" ON)

 # Unit testsing
 option(WITH_GTESTS "Enable GTest unit testing" OFF)
-option(WITH_OPENGL_TESTS "Enable OpenGL related unit testing (Experimental)" OFF)


 # Documentation
@@ -519,20 +518,18 @@ endif()
 option(WITH_LEGACY_DEPSGRAPH "Build Blender with legacy dependency graph" ON)
 mark_as_advanced(WITH_LEGACY_DEPSGRAPH)

-if(WIN32)
-	# Use hardcoded paths or find_package to find externals
-	option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
-	mark_as_advanced(WITH_WINDOWS_FIND_MODULES)
+# Use hardcoded paths or find_package to find externals
+option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
+mark_as_advanced(WITH_WINDOWS_FIND_MODULES)

-	option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
-	mark_as_advanced(WITH_WINDOWS_CODESIGN)
+option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
+mark_as_advanced(WITH_WINDOWS_CODESIGN)

-	set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
-	mark_as_advanced(WINDOWS_CODESIGN_PFX)
+set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
+mark_as_advanced(WINDOWS_CODESIGN_PFX)

-	set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
-	mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)
-endif()
+set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
+mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)

 # avoid using again
 option_defaults_clear()
@@ -631,12 +628,6 @@ if(APPLE)
 			# to silence sdk not found warning, just overrides CMAKE_OSX_SYSROOT
 			set(CMAKE_XCODE_ATTRIBUTE_SDKROOT macosx${OSX_SYSTEM})
 		endif()
-
-		# QuickTime framework is no longer available in SDK 10.12+
-		if(WITH_CODEC_QUICKTIME AND ${OSX_SYSTEM} VERSION_GREATER 10.11)
-			set(WITH_CODEC_QUICKTIME OFF)
-			message(STATUS "QuickTime not supported by SDK ${OSX_SYSTEM}, disabling WITH_CODEC_QUICKTIME")
-		endif()
 	endif()

 	if(OSX_SYSTEM MATCHES 10.9)
@@ -726,7 +717,7 @@ if(NOT WITH_BOOST)
 	macro(set_and_warn
 		_setting _val)
 		if(${${_setting}})
-			message(STATUS "'WITH_BOOST' is disabled: forcing 'set(${_setting} ${_val})'")
+			message(STATUS "'WITH_BOOST' is disabled: forceing 'set(${_setting} ${_val})'")
 		endif()
 		set(${_setting} ${_val})
 	endmacro()
@@ -870,7 +861,7 @@ endif()
 # linux only, not cached
 set(WITH_BINRELOC OFF)

-# MACOSX only, set to avoid uninitialized
+# MACOSX only, set to avoid uninitialized
 set(EXETYPE "")

 # C/C++ flags
@@ -927,7 +918,7 @@ if(WITH_X11)
 	if(WITH_X11_ALPHA)
 		find_library(X11_Xrender_LIB Xrender  ${X11_LIB_SEARCH_PATH})
 		mark_as_advanced(X11_Xrender_LIB)
-		if(X11_Xrender_LIB)
+		if (X11_Xrender_LIB)
 			list(APPEND PLATFORM_LINKLIBS ${X11_Xrender_LIB})
 		else()
 			set(WITH_X11_ALPHA OFF)
@@ -1576,7 +1567,7 @@ if(WITH_CXX11)
 	if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
 		# TODO(sergey): Do we want c++11 or gnu-c++11 here?
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-	elseif(MSVC)
+	elseif(MSVC12)
 		# Nothing special is needed, C++11 features are available by default.
 	else()
 		message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER_ID} is not supported for C++11 build yet")
--- a/6
+++ b/6
@@ -1,4 +1,4 @@
-# -*- mode: gnumakefile; tab-width: 4; indent-tabs-mode: t; -*-
+# -*- mode: gnumakefile; tab-width: 8; indent-tabs-mode: t; -*-
 # vim: tabstop=4
 #
 # ##### BEGIN GPL LICENSE BLOCK #####
@@ -113,7 +113,7 @@ CMAKE_CONFIG = cmake $(BUILD_CMAKE_ARGS) \
 # X11 spesific
 ifdef DISPLAY
 	CMAKE_CONFIG_TOOL = cmake-gui
-else
+else 
 	CMAKE_CONFIG_TOOL = ccmake
 endif

@@ -127,7 +127,7 @@ all: .FORCE
 #	# if test ! -f $(BUILD_DIR)/CMakeCache.txt ; then \
 #	# 	$(CMAKE_CONFIG); \
 #	# fi
-
+	
 #	# do this always incase of failed initial build, could be smarter here...
 	@$(CMAKE_CONFIG)

--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -289,7 +289,7 @@ NO_BUILD=false
 NO_CONFIRM=false
 USE_CXX11=false

-PYTHON_VERSION="3.5.2"
+PYTHON_VERSION="3.5.1"
 PYTHON_VERSION_MIN="3.5"
 PYTHON_FORCE_BUILD=false
 PYTHON_FORCE_REBUILD=false
@@ -322,7 +322,7 @@ OPENEXR_FORCE_REBUILD=false
 OPENEXR_SKIP=false
 _with_built_openexr=false

-OIIO_VERSION="1.7.8"
+OIIO_VERSION="1.6.9"
 OIIO_VERSION_MIN="1.6.0"
 OIIO_VERSION_MAX="1.9.0"  # UNKNOWN currently # Not supported by current OSL...
 OIIO_FORCE_BUILD=false
@@ -337,14 +337,14 @@ LLVM_FORCE_REBUILD=false
 LLVM_SKIP=false

 # OSL needs to be compiled for now!
-OSL_VERSION="1.7.5"
+OSL_VERSION="1.7.3"
 OSL_VERSION_MIN=$OSL_VERSION
 OSL_FORCE_BUILD=false
 OSL_FORCE_REBUILD=false
 OSL_SKIP=false

 # OpenSubdiv needs to be compiled for now
-OSD_VERSION="3.1.1"
+OSD_VERSION="3.0.5"
 OSD_VERSION_MIN=$OSD_VERSION
 OSD_FORCE_BUILD=false
 OSD_FORCE_REBUILD=false
@@ -372,7 +372,7 @@ OPENCOLLADA_FORCE_BUILD=false
 OPENCOLLADA_FORCE_REBUILD=false
 OPENCOLLADA_SKIP=false

-FFMPEG_VERSION="3.2.1"
+FFMPEG_VERSION="2.8.4"
 FFMPEG_VERSION_MIN="2.8.4"
 FFMPEG_FORCE_BUILD=false
 FFMPEG_FORCE_REBUILD=false
@@ -795,7 +795,7 @@ CXXFLAGS_BACK=$CXXFLAGS
 if [ "$USE_CXX11" = true ]; then
  WARNING "You are trying to use c++11, this *should* go smoothely with any very recent distribution
 However, if you are experiencing linking errors (also when building Blender itself), please try the following:
-    * Re-run this script with '--build-all --force-all' options.
+    * Re-run this script with `--build-all --force-all` options.
    * Ensure your gcc version is at the very least 4.8, if possible you should really rather use gcc-5.1 or above.

 Please note that until the transition to C++11-built libraries if completed in your distribution, situation will
@@ -2480,7 +2480,7 @@ compile_FFmpeg() {
        --enable-avfilter --disable-vdpau \
        --disable-bzlib --disable-libgsm --disable-libspeex \
        --enable-pthreads --enable-zlib --enable-stripping --enable-runtime-cpudetect \
-        --disable-vaapi --disable-nonfree --enable-gpl \
+        --disable-vaapi --disable-libfaac --disable-nonfree --enable-gpl \
        --disable-postproc --disable-librtmp --disable-libopencore-amrnb \
        --disable-libopencore-amrwb --disable-libdc1394 --disable-version3 --disable-outdev=sdl \
        --disable-libxcb \
--- a/build_files/buildbot/master.cfg
+++ b/build_files/buildbot/master.cfg
@@ -297,8 +297,8 @@ def generic_builder(id, libdir='', branch='', rsync=False):
 # Builders

 add_builder(c, 'mac_x86_64_10_6_cmake', 'darwin-9.x.universal', generic_builder, hour=5)
-# add_builder(c, 'linux_glibc211_i686_cmake', '', generic_builder, hour=1)
-# add_builder(c, 'linux_glibc211_x86_64_cmake', '', generic_builder, hour=2)
+add_builder(c, 'linux_glibc211_i686_cmake', '', generic_builder, hour=1)
+add_builder(c, 'linux_glibc211_x86_64_cmake', '', generic_builder, hour=2)
 add_builder(c, 'linux_glibc219_i686_cmake', '', generic_builder, hour=3)
 add_builder(c, 'linux_glibc219_x86_64_cmake', '', generic_builder, hour=4)
 add_builder(c, 'win32_cmake_vc2013', 'windows_vc12', generic_builder, hour=3)
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -72,8 +72,10 @@ if 'cmake' in builder:
        # Set up OSX architecture
        if builder.endswith('x86_64_10_6_cmake'):
            cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
        cmake_extra_options.append('-DWITH_CODEC_QUICKTIME=OFF')
        cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.6')
+        build_cubins = False


    elif builder.startswith('win'):
--- a/build_files/cmake/Modules/GTestTesting.cmake
+++ b/build_files/cmake/Modules/GTestTesting.cmake
@@ -45,7 +45,7 @@ macro(BLENDER_SRC_GTEST_EX NAME SRC EXTRA_LIBS DO_ADD_TEST)
 		                      RUNTIME_OUTPUT_DIRECTORY_DEBUG   "${TESTS_OUTPUT_DIR}"
 		                      INCLUDE_DIRECTORIES              "${TEST_INC}")
 		if(${DO_ADD_TEST})
-			add_test(NAME ${NAME}_test COMMAND ${TESTS_OUTPUT_DIR}/${NAME}_test WORKING_DIRECTORY $<TARGET_FILE_DIR:blender>)
+			add_test(${NAME}_test ${TESTS_OUTPUT_DIR}/${NAME}_test)
 		endif()
 	endif()
 endmacro()
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -56,7 +56,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
 				string(REGEX REPLACE "[\r\n]+" ";" _git_contains_branches "${_git_contains_branches}")
 				string(REGEX REPLACE ";[ \t]+" ";" _git_contains_branches "${_git_contains_branches}")
 				foreach(_branch ${_git_contains_branches})
-					if(NOT "${_branch}" MATCHES "\\(HEAD.*")
+					if (NOT "${_branch}" MATCHES "\\(HEAD.*")
 						set(MY_WC_BRANCH "${_branch}")
 						break()
 					endif()
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -416,7 +416,14 @@ function(setup_liblinks
 		target_link_libraries(${target} ${OPENCOLORIO_LIBRARIES})
 	endif()
 	if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV)
+		if(WIN32 AND NOT UNIX)
+			file_list_suffix(OPENSUBDIV_LIBRARIES_DEBUG "${OPENSUBDIV_LIBRARIES}" "_d")
+			target_link_libraries_debug(${target} "${OPENSUBDIV_LIBRARIES_DEBUG}")
+			target_link_libraries_optimized(${target} "${OPENSUBDIV_LIBRARIES}")
+			unset(OPENSUBDIV_LIBRARIES_DEBUG)
+		else()
 			target_link_libraries(${target} ${OPENSUBDIV_LIBRARIES})
+		endif()
 	endif()
 	if(WITH_OPENVDB)
 		target_link_libraries(${target} ${OPENVDB_LIBRARIES} ${TBB_LIBRARIES})
@@ -1574,24 +1581,24 @@ macro(openmp_delayload
 endmacro()

 MACRO(WINDOWS_SIGN_TARGET target)
-	if(WITH_WINDOWS_CODESIGN)
-		if(!SIGNTOOL_EXE)
+	if (WITH_WINDOWS_CODESIGN)
+		if (!SIGNTOOL_EXE)
 			error("Codesigning is enabled, but signtool is not found")
 		else()
-			if(WINDOWS_CODESIGN_PFX_PASSWORD)
+			if (WINDOWS_CODESIGN_PFX_PASSWORD)
 				set(CODESIGNPASSWORD /p ${WINDOWS_CODESIGN_PFX_PASSWORD})
 			else()
-				if($ENV{PFXPASSWORD})
+				if ($ENV{PFXPASSWORD})
 					set(CODESIGNPASSWORD /p $ENV{PFXPASSWORD})
 				else()
-					message(FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
+					message( FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
 				endif()
 			endif()
 			add_custom_command(TARGET ${target}
-				POST_BUILD
-				COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
-				VERBATIM
-			)
+						POST_BUILD
+						COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
+						VERBATIM
+				)
 		endif()
 	endif()
 ENDMACRO()
--- a/build_files/cmake/packaging.cmake
+++ b/build_files/cmake/packaging.cmake
@@ -1,7 +1,5 @@
-string(TIMESTAMP CURRENT_YEAR "%Y")
-
-set(PROJECT_DESCRIPTION  "Blender is the free and open source 3D creation suite software.")
-set(PROJECT_COPYRIGHT    "Copyright (C) 2001-${CURRENT_YEAR} Blender Foundation")
+set(PROJECT_DESCRIPTION  "Blender is a very fast and versatile 3D modeller/renderer.")
+set(PROJECT_COPYRIGHT    "Copyright (C) 2001-2012 Blender Foundation")
 set(PROJECT_CONTACT      "foundation@blender.org")
 set(PROJECT_VENDOR       "Blender Foundation")

@@ -40,8 +38,8 @@ unset(MY_WC_HASH)
 # Force Package Name
 execute_process(COMMAND date "+%Y%m%d" OUTPUT_VARIABLE CPACK_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
 string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
-if(MSVC)
-	if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+if (MSVC)
+	if ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 		set(PACKAGE_ARCH windows64)
 	else()
 		set(PACKAGE_ARCH windows32)
@@ -50,7 +48,7 @@ else(MSVC)
 	set(PACKAGE_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 endif()

-if(CPACK_OVERRIDE_PACKAGENAME)
+if (CPACK_OVERRIDE_PACKAGENAME)
 	set(CPACK_PACKAGE_FILE_NAME ${CPACK_OVERRIDE_PACKAGENAME}-${PACKAGE_ARCH})
 else()
 	set(CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME_LOWER}-${MAJOR_VERSION}.${MINOR_VERSION}.${PATCH_VERSION}-git${CPACK_DATE}.${BUILD_REV}-${PACKAGE_ARCH})
@@ -137,3 +135,4 @@ unset(MINOR_VERSION)
 unset(PATCH_VERSION)

 unset(BUILD_REV)
+
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -158,7 +158,7 @@ if(WITH_CODEC_FFMPEG)
 		mp3lame swscale x264 xvidcore theora theoradec theoraenc vorbis vorbisenc vorbisfile ogg
 	)
 	if(WITH_CXX11)
-		set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} schroedinger orc vpx webp swresample)
+		set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} schroedinger orc vpx)
 	endif()
 	set(FFMPEG_LIBPATH ${FFMPEG}/lib)
 endif()
@@ -316,9 +316,6 @@ if(WITH_OPENIMAGEIO)
 		${OPENEXR_LIBRARIES}
 		${ZLIB_LIBRARIES}
 	)
-	if(WITH_CXX11)
-		set(OPENIMAGEIO_LIBRARIES ${OPENIMAGEIO_LIBRARIES} ${LIBDIR}/ffmpeg/lib/libwebp.a)
-	endif()
 	set(OPENIMAGEIO_LIBPATH
 		${OPENIMAGEIO}/lib
 		${JPEG_LIBPATH}
--- a/build_files/cmake/platform/platform_win32_msvc.cmake
+++ b/build_files/cmake/platform/platform_win32_msvc.cmake
@@ -33,7 +33,7 @@ endmacro()
 macro(windows_find_package package_name
 	)
 	if(WITH_WINDOWS_FIND_MODULES)
-		find_package(${package_name})
+		find_package( ${package_name})
 	endif(WITH_WINDOWS_FIND_MODULES)
 endmacro()

@@ -112,7 +112,7 @@ set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} /ignore:4221")

 # MSVC only, Mingw doesnt need
 if(CMAKE_CL_64)
-	set(PLATFORM_LINKFLAGS "/MACHINE:X64 ${PLATFORM_LINKFLAGS}")
+	set(PLATFORM_LINKFLAGS "/MACHINE:X64 /OPT:NOREF ${PLATFORM_LINKFLAGS}")
 else()
 	set(PLATFORM_LINKFLAGS "/MACHINE:IX86 /LARGEADDRESSAWARE ${PLATFORM_LINKFLAGS}")
 endif()
@@ -238,14 +238,14 @@ if(WITH_CODEC_FFMPEG)
 	windows_find_package(FFMPEG)
 	if(NOT FFMPEG_FOUND)
 		warn_hardcoded_paths(ffmpeg)
-		set(FFMPEG_LIBRARY_VERSION 57)
-		set(FFMPEG_LIBRARY_VERSION_AVU 55)
+		set(FFMPEG_LIBRARY_VERSION 55)
+		set(FFMPEG_LIBRARY_VERSION_AVU 52)
 		set(FFMPEG_LIBRARIES
-			${LIBDIR}/ffmpeg/lib/avcodec.lib
-			${LIBDIR}/ffmpeg/lib/avformat.lib
-			${LIBDIR}/ffmpeg/lib/avdevice.lib
-			${LIBDIR}/ffmpeg/lib/avutil.lib
-			${LIBDIR}/ffmpeg/lib/swscale.lib
+			${LIBDIR}/ffmpeg/lib/avcodec-${FFMPEG_LIBRARY_VERSION}.lib
+			${LIBDIR}/ffmpeg/lib/avformat-${FFMPEG_LIBRARY_VERSION}.lib
+			${LIBDIR}/ffmpeg/lib/avdevice-${FFMPEG_LIBRARY_VERSION}.lib
+			${LIBDIR}/ffmpeg/lib/avutil-${FFMPEG_LIBRARY_VERSION_AVU}.lib
+			${LIBDIR}/ffmpeg/lib/swscale-2.lib
 			)
 	endif()
 endif()
@@ -380,7 +380,6 @@ if(WITH_OPENIMAGEIO)
 	set(OPENCOLORIO_DEFINITIONS "-DOCIO_STATIC_BUILD")
 	set(OPENIMAGEIO_IDIFF "${OPENIMAGEIO}/bin/idiff.exe")
 	add_definitions(-DOIIO_STATIC_BUILD)
-	add_definitions(-DOIIO_NO_SSE=1)
 endif()

 if(WITH_LLVM)
@@ -446,20 +445,10 @@ if(WITH_MOD_CLOTH_ELTOPO)
 endif()

 if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV)
-    set(OPENSUBDIV_INCLUDE_DIR ${LIBDIR}/opensubdiv/include)
-    set(OPENSUBDIV_LIBPATH ${LIBDIR}/opensubdiv/lib)
-    set(OPENSUBDIV_LIBRARIES    optimized ${OPENSUBDIV_LIBPATH}/osdCPU.lib 
-                                optimized ${OPENSUBDIV_LIBPATH}/osdGPU.lib
-                                debug ${OPENSUBDIV_LIBPATH}/osdCPU_d.lib 
-                                debug ${OPENSUBDIV_LIBPATH}/osdGPU_d.lib
-                                )
-    set(OPENSUBDIV_HAS_OPENMP TRUE)
-	set(OPENSUBDIV_HAS_TBB FALSE)
-	set(OPENSUBDIV_HAS_OPENCL TRUE)
-	set(OPENSUBDIV_HAS_CUDA FALSE)
-	set(OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK TRUE)
-	set(OPENSUBDIV_HAS_GLSL_COMPUTE TRUE)
-    windows_find_package(OpenSubdiv)
+	set(OPENSUBDIV_INCLUDE_DIR ${LIBDIR}/opensubdiv/include)
+	set(OPENSUBDIV_LIBPATH ${LIBDIR}/opensubdiv/lib)
+	set(OPENSUBDIV_LIBRARIES ${OPENSUBDIV_LIBPATH}/osdCPU.lib ${OPENSUBDIV_LIBPATH}/osdGPU.lib)
+	find_package(OpenSubdiv)
 endif()

 if(WITH_SDL)
--- a/doc/python_api/rst/bge.texture.rst
+++ b/doc/python_api/rst/bge.texture.rst
@@ -681,7 +681,7 @@ Image classes

   .. attribute:: zbuff

-      Use depth component of render as grayscale color - suitable for texture source.
+      Use depth component of render as grayscale color - suitable for texture source.

      :type: bool

@@ -817,7 +817,7 @@ Image classes

   .. attribute:: zbuff

-      Use depth component of viewport as grayscale color - suitable for texture source.
+      Use depth component of viewport as grayscale color - suitable for texture source.

      :type: bool

@@ -1260,8 +1260,8 @@ Filter classes

 .. class:: FilterGray

-   Filter for grayscale effect.
-   Proportions of R, G and B contributions in the output grayscale are 28:151:77.
+   Filter for gray scale effect.
+   Proportions of R, G and B contributions in the output gray scale are 28:151:77.

   .. attribute:: previous

--- a/doc/python_api/rst/bge_types/bge.types.KX_GameObject.rst
+++ b/doc/python_api/rst/bge_types/bge.types.KX_GameObject.rst
@@ -405,7 +405,7 @@ base class --- :class:`SCA_IObject`

      .. note::

-         This attribute is experimental and may be removed (but probably wont be).
+         This attribute is experimental and may be removed (but probably won't be).

      .. note::

@@ -419,7 +419,7 @@ base class --- :class:`SCA_IObject`

      .. note::

-         This attribute is experimental and may be removed (but probably wont be).
+         This attribute is experimental and may be removed (but probably won't be).

      .. note::

@@ -453,7 +453,7 @@ base class --- :class:`SCA_IObject`

   .. attribute:: childrenRecursive

-      all children of this object including children's children, (read-only).
+      all children of this object including children's children, (read-only).

      :type: :class:`CListValue` of :class:`KX_GameObject`'s

@@ -536,7 +536,7 @@ base class --- :class:`SCA_IObject`

   .. method:: getAxisVect(vect)

-      Returns the axis vector rotates by the object's worldspace orientation.
+      Returns the axis vector rotated by the object's worldspace orientation.
      This is the equivalent of multiplying the vector by the orientation matrix.

      :arg vect: a vector to align the axis.
@@ -596,7 +596,7 @@ base class --- :class:`SCA_IObject`

      Gets the game object's linear velocity.

-      This method returns the game object's velocity through it's center of mass, ie no angular velocity component.
+      This method returns the game object's velocity through its center of mass, i.e. no angular velocity component.

      :arg local:
         * False: you get the "global" velocity ie: relative to world orientation.
@@ -609,7 +609,7 @@ base class --- :class:`SCA_IObject`

      Sets the game object's linear velocity.

-      This method sets game object's velocity through it's center of mass,
+      This method sets the game object's velocity through its center of mass,
      ie no angular velocity component.

      This requires a dynamic object.
@@ -814,7 +814,7 @@ base class --- :class:`SCA_IObject`
            # do something
            pass

-      The face parameter determines the orientation of the normal.
+      The face parameter determines the orientation of the normal.

      * 0 => hit normal is always oriented towards the ray origin (as if you casted the ray from outside)
      * 1 => hit normal is the real face normal (only for mesh object, otherwise face has no effect)
@@ -911,7 +911,7 @@ base class --- :class:`SCA_IObject`

      .. note::

-         The gameObject argument has an advantage that it can convert from a mesh with modifiers applied (such as the Subdivision Surface modifier).
+         The gameObject argument has an advantage that it can convert from a mesh with modifiers applied (such as subsurf).

      .. warning::

@@ -919,7 +919,7 @@ base class --- :class:`SCA_IObject`

      .. warning::

-         If the object is a part of a compound object it will fail (parent or child)
+         If the object is a part of a compound object it will fail (parent or child)

      .. warning::

--- a/doc/python_api/rst/bgl.rst
+++ b/doc/python_api/rst/bgl.rst
@@ -12,7 +12,7 @@ contents: dir(bgl).  A simple search on the web can point to more
 than enough material to teach OpenGL programming, from books to many
 collections of tutorials.

-Here is a comprehensive `list of books <https://www.khronos.org/developers/books/>`__ (non free).
+Here is a comprehensive `list of books <https://www.opengl.org/documentation/books/>`__ (non free).
 The `arcsynthesis tutorials <https://web.archive.org/web/20150225192611/http://www.arcsynthesis.org/gltut/index.html>`__
 is one of the best resources to learn modern OpenGL and
 `g-truc <http://www.g-truc.net/post-opengl-samples.html#menu>`__
@@ -2067,7 +2067,7 @@ offers a set of extensive examples, including advanced features.
   :arg length: Returns the length of the string returned in source (excluding the null terminator).
   :type source: :class:`bgl.Buffer` char.
   :arg source: Specifies an array of characters that is used to return the source code string.
-
+   

 .. function:: glShaderSource(shader, shader_string):

--- a/doc/python_api/rst/info_api_reference.rst
+++ b/doc/python_api/rst/info_api_reference.rst
@@ -204,7 +204,7 @@ Lets say we want to access the texture of a brush via Python, to adjust its ``co

 - Start in the default scene and enable 'Sculpt' mode from the 3D-View header.
 - From the toolbar expand the **Texture** panel and add a new texture.
-  *Notice the texture button its self doesn't have very useful links (you can check the tooltips).*
+  *Notice the texture button its self doesn't have very useful links (you can check the tool-tips).*
 - The contrast setting isn't exposed in the sculpt toolbar, so view the texture in the properties panel...

  - In the properties button select the Texture context.
--- a/doc/python_api/rst/info_overview.rst
+++ b/doc/python_api/rst/info_overview.rst
@@ -19,7 +19,7 @@ This is a typical Python environment so tutorials on how to write Python scripts
 will work running the scripts in Blender too.
 Blender provides the :mod:`bpy` module to the Python interpreter.
 This module can be imported in a script and gives access to Blender data, classes, and functions.
-Scripts that deal with Blender data will need to import this module.
+Scripts that deal with Blender data will need to import this module. 

 Here is a simple example of moving a vertex of the object named **Cube**:

@@ -80,7 +80,7 @@ To run as modules:


 Add-ons
-------
+------

 Some of Blenders functionality is best kept optional,
 alongside scripts loaded at startup we have add-ons which are kept in their own directory ``scripts/addons``,
@@ -213,7 +213,7 @@ A simple Blender/Python module can look like this:
       bpy.utils.register_class(SimpleOperator)

   def unregister():
-       bpy.utils.unregister_class(SimpleOperator)
+       bpy.utils.unregister_class(SimpleOperator)    

   if __name__ == "__main__":
       register()
@@ -327,7 +327,7 @@ Say you want to store material settings for a custom engine.
 .. note::

   *The class must be registered before being used in a property, failing to do so will raise an error:*
-
+   
   ``ValueError: bpy_struct "Material" registration error: my_custom_props could not register``


@@ -429,3 +429,4 @@ Calling these operators:
   >>> bpy.ops.object.operator_2()
   Hello World OBJECT_OT_operator_2
   {'FINISHED'}
+
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -427,9 +427,9 @@ if BLENDER_REVISION != "Unknown":
    BLENDER_VERSION_DOTS += " " + BLENDER_REVISION          # '2.62.1 SHA1'

 BLENDER_VERSION_PATH = "_".join(blender_version_strings)    # '2_62_1'
-if bpy.app.version_cycle in {"rc", "release"}:
-    # '2_62a_release'
-    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]), bpy.app.version_char)
+if bpy.app.version_cycle == "release":
+    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]),
+                                             bpy.app.version_char)   # '2_62_release'

 # --------------------------DOWNLOADABLE FILES----------------------------------

@@ -1565,9 +1565,9 @@ def pyrna2sphinx(basepath):

    # operators
    def write_ops():
-        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts "
-        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
-        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
+        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts/ "
+        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA/"
+        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC/"

        op_modules = {}
        for op in ops.values():
@@ -1632,9 +1632,13 @@ def write_sphinx_conf_py(basepath):
    file = open(filepath, "w", encoding="utf-8")
    fw = file.write

-    fw("import sys, os\n\n")
-    fw("extensions = ['sphinx.ext.intersphinx']\n\n")
-    fw("intersphinx_mapping = {'blender_manual': ('https://docs.blender.org/manual/en/dev/', None)}\n\n")
+    fw("import sys, os\n")
+    fw("\n")
+    fw("extensions = ['sphinx.ext.intersphinx']\n")
+    fw("\n")
+    fw("intersphinx_mapping = {'blender_manual': ('https://www.blender.org/manual/', None)}\n")
+    fw("\n")
+
    fw("project = 'Blender'\n")
    # fw("master_doc = 'index'\n")
    fw("copyright = u'Blender Foundation'\n")
@@ -1651,16 +1655,12 @@ def write_sphinx_conf_py(basepath):

    # not helpful since the source is generated, adds to upload size.
    fw("html_copy_source = False\n")
-    fw("html_show_sphinx = False\n")
    fw("html_split_index = True\n")
    fw("\n")

    # needed for latex, pdf gen
-    fw("latex_elements = {\n")
-    fw("  'papersize': 'a4paper',\n")
-    fw("}\n\n")
-
    fw("latex_documents = [ ('contents', 'contents.tex', 'Blender Index', 'Blender Foundation', 'manual'), ]\n")
+    fw("latex_paper_size = 'a4paper'\n")
    file.close()


--- a/doc/python_api/sphinx_doc_update.py
+++ b/doc/python_api/sphinx_doc_update.py
@@ -41,9 +41,9 @@ import tempfile
 import zipfile


-DEFAULT_RSYNC_SERVER = "docs.blender.org"
+DEFAULT_RSYNC_SERVER = "www.blender.org"
 DEFAULT_RSYNC_ROOT = "/api/"
-DEFAULT_SYMLINK_ROOT = "/data/www/vhosts/docs.blender.org/api"
+DEFAULT_SYMLINK_ROOT = "/data/www/vhosts/www.blender.org/api"


 def argparse_create():
@@ -96,11 +96,6 @@ def main():

    rsync_base = "rsync://%s@%s:%s" % (args.user, args.rsync_server, args.rsync_root)

-    blenver = blenver_zip = ""
-    api_name = ""
-    branch = ""
-    is_release = False
-
    # I) Update local mirror using rsync.
    rsync_mirror_cmd = ("rsync", "--delete-after", "-avzz", rsync_base, args.mirror_dir)
    subprocess.run(rsync_mirror_cmd, env=dict(os.environ, RSYNC_PASSWORD=args.password))
@@ -113,24 +108,19 @@ def main():
        subprocess.run(doc_gen_cmd)

        # III) Get Blender version info.
+        blenver = blenver_zip = ""
        getver_file = os.path.join(tmp_dir, "blendver.txt")
        getver_script = (""
            "import sys, bpy\n"
            "with open(sys.argv[-1], 'w') as f:\n"
-            "    is_release = bpy.app.version_cycle in {'rc', 'release'}\n"
-            "    branch = bpy.app.build_branch.split()[0].decode()\n"
-            "    f.write('%d\\n' % is_release)\n"
-            "    f.write('%s\\n' % branch)\n"
-            "    f.write('%d.%d%s\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
-            "            if is_release else '%s\\n' % branch)\n"
-            "    f.write('%d_%d%s_release' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
-            "            if is_release else '%d_%d_%d' % bpy.app.version)\n")
+            "    f.write('%d_%d%s_release\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
+            "            if bpy.app.version_cycle in {'rc', 'release'} else '%d_%d_%d\\n' % bpy.app.version)\n"
+            "    f.write('%d_%d_%d' % bpy.app.version)\n")
        get_ver_cmd = (args.blender, "--background", "-noaudio", "--factory-startup", "--python-exit-code", "1",
                       "--python-expr", getver_script, "--", getver_file)
        subprocess.run(get_ver_cmd)
        with open(getver_file) as f:
-            is_release, branch, blenver, blenver_zip = f.read().split("\n")
-            is_release = bool(int(is_release))
+            blenver, blenver_zip = f.read().split("\n")
        os.remove(getver_file)

        # IV) Build doc.
@@ -142,7 +132,7 @@ def main():
        os.chdir(curr_dir)

        # V) Cleanup existing matching dir in server mirror (if any), and copy new doc.
-        api_name = blenver
+        api_name = "blender_python_api_%s" % blenver
        api_dir = os.path.join(args.mirror_dir, api_name)
        if os.path.exists(api_dir):
            shutil.rmtree(api_dir)
@@ -160,15 +150,19 @@ def main():
    os.rename(zip_path, os.path.join(api_dir, "%s.zip" % zip_name))

    # VII) Create symlinks and html redirects.
+    #~ os.symlink(os.path.join(DEFAULT_SYMLINK_ROOT, api_name, "contents.html"), os.path.join(api_dir, "index.html"))
    os.symlink("./contents.html", os.path.join(api_dir, "index.html"))
-    if is_release:
-        symlink = os.path.join(args.mirror_dir, "current")
+    if blenver.endswith("release"):
+        symlink = os.path.join(args.mirror_dir, "blender_python_api_current")
        os.remove(symlink)
        os.symlink("./%s" % api_name, symlink)
        with open(os.path.join(args.mirror_dir, "250PythonDoc/index.html"), 'w') as f:
            f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                    "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
-    elif branch == "master":
+    else:
+        symlink = os.path.join(args.mirror_dir, "blender_python_api_master")
+        os.remove(symlink)
+        os.symlink("./%s" % api_name, symlink)
        with open(os.path.join(args.mirror_dir, "blender_python_api/index.html"), 'w') as f:
            f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                    "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
--- a/extern/Eigen3/Eigen/src/StlSupport/StdVector.h
+++ b/extern/Eigen3/Eigen/src/StlSupport/StdVector.h
@@ -77,7 +77,7 @@ namespace std {
  void resize(size_type new_size)
  { resize(new_size, T()); }

-#if defined(_VECTOR_) && (_MSC_VER<1910)
+#if defined(_VECTOR_)
  // workaround MSVC std::vector implementation
  void resize(size_type new_size, const value_type& x)
  {
@@ -110,7 +110,7 @@ namespace std {
      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
  }
 #else
-  // either GCC 4.1, MSVC2017 or non-GCC
+  // either GCC 4.1 or non-GCC
  // default implementation which should always work.
  void resize(size_type new_size, const value_type& x)
  {
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -114,7 +114,7 @@ extern "C" {
 #define cuGLGetDevices cuGLGetDevices_v2

 /* Types. */
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
 typedef unsigned long long CUdeviceptr;
 #else
 typedef unsigned int CUdeviceptr;
--- a/extern/curve_fit_nd/curve_fit_nd.h
+++ b/extern/curve_fit_nd/curve_fit_nd.h
@@ -137,7 +137,7 @@ int curve_fit_cubic_to_points_refit_db(
        const double          error_threshold,
        const unsigned int    calc_flag,
        const unsigned int   *corners,
-        const unsigned int    corners_len,
+        unsigned int          corners_len,
        const double          corner_angle,

        double **r_cubic_array, unsigned int *r_cubic_array_len,
--- a/extern/gflags/README.blender
+++ b/extern/gflags/README.blender
@@ -18,8 +18,6 @@ Local modifications:
 - Applied some modifications from fork https://github.com/Nazg-Gul/gflags.git
  (see https://github.com/gflags/gflags/pull/129)

- Avoid attempt of acquiring mutex lock in FlagRegistry::GlobalRegistry when
+- Avoid attemot of acquiring mutex lock in FlagRegistry::GlobalRegistry when
  doing static flags initialization. See d81dd2d in Blender repository.

- Made `google::{anonymous}::FlagValue::ValueSize() const` inlined, so it does
-  not trigger strict compiler warning.
--- a/extern/gflags/src/gflags.cc
+++ b/extern/gflags/src/gflags.cc
@@ -218,7 +218,7 @@ class FlagValue {
  bool Equal(const FlagValue& x) const;
  FlagValue* New() const;   // creates a new one with default value
  void CopyFrom(const FlagValue& x);
-  inline int ValueSize() const;
+  int ValueSize() const;

  // Calls the given validate-fn on value_buffer_, and returns
  // whatever it returns.  But first casts validate_fn_proto to a
@@ -443,7 +443,7 @@ void FlagValue::CopyFrom(const FlagValue& x) {
  }
 }

-inline int FlagValue::ValueSize() const {
+int FlagValue::ValueSize() const {
  if (type_ > FV_MAX_INDEX) {
    assert(false);  // unknown type
    return 0;
--- a/extern/gtest/include/gtest/internal/gtest-internal.h
+++ b/extern/gtest/include/gtest/internal/gtest-internal.h
@@ -60,10 +60,6 @@
 #include <string>
 #include <vector>

-#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-# include <type_traits>
-#endif
-
 #include "gtest/gtest-message.h"
 #include "gtest/internal/gtest-string.h"
 #include "gtest/internal/gtest-filepath.h"
@@ -858,7 +854,6 @@ struct AddReference<T&> { typedef T& type; };  // NOLINT
 template <typename From, typename To>
 class ImplicitlyConvertible {
 private:
-#if !((__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
  // We need the following helper functions only for their types.
  // They have no implementations.

@@ -879,7 +874,6 @@ class ImplicitlyConvertible {
  // implicitly converted to type To.
  static char Helper(To);
  static char (&Helper(...))[2];  // NOLINT
-#endif

  // We have to put the 'public' section after the 'private' section,
  // or MSVC refuses to compile the code.
@@ -889,8 +883,6 @@ class ImplicitlyConvertible {
  // instantiation.  The simplest workaround is to use its C++0x type traits
  // functions (C++Builder 2009 and above only).
  static const bool value = __is_convertible(From, To);
-#elif (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-  static const bool value = std::is_convertible<From, To>::value;
 #else
  // MSVC warns about implicitly converting from double to int for
  // possible loss of data, so we need to temporarily disable the
--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -34,7 +34,7 @@ add_subdirectory(mikktspace)
 add_subdirectory(glew-mx)
 add_subdirectory(eigen)

-if(WITH_GAMEENGINE_DECKLINK)
+if (WITH_GAMEENGINE_DECKLINK)
 	add_subdirectory(decklink)
 endif()

@@ -62,7 +62,7 @@ if(WITH_IK_ITASC)
 	add_subdirectory(itasc)
 endif()

-if(WITH_GAMEENGINE)
+if(WITH_IK_SOLVER OR WITH_GAMEENGINE OR WITH_MOD_BOOLEAN)
 	add_subdirectory(moto)
 endif()

--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -101,11 +101,11 @@ ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new);

-ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x);
-ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x);
-ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x);
-ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x);
-ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new);
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x);
+ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);

 /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
 *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -113,58 +113,58 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)

 /******************************************************************************/
 /* unsigned operations. */
-ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x)
+ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x)
 {
-	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }

-ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x)
+ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x)
 {
-	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }

-ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x)
+ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x)
 {
-	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }

-ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x)
+ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x)
 {
-	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }

-ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new)
+ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
 {
-	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned int)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
+	return (unsigned)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned int)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
+	return (unsigned)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
 #endif
 }

--- a/intern/audaspace/FX/AUD_LimiterReader.cpp
+++ b/intern/audaspace/FX/AUD_LimiterReader.cpp
@@ -110,10 +110,10 @@ void AUD_LimiterReader::read(int& length, bool& eos, sample_t* buffer)
 			eos = true;
 		}

-		if(position < int(m_start * rate))
+		if(position < m_start * rate)
 		{
 			int len2 = length;
-			for(int len = int(m_start * rate) - position;
+			for(int len = m_start * rate - position;
 				len2 == length && !eos;
 				len -= length)
 			{
--- a/intern/audaspace/intern/AUD_SoftwareDevice.cpp
+++ b/intern/audaspace/intern/AUD_SoftwareDevice.cpp
@@ -365,7 +365,6 @@ bool AUD_SoftwareDevice::AUD_SoftwareHandle::seek(float position)
 	if(!m_status)
 		return false;

-	m_pitch->setPitch(m_user_pitch);
 	m_reader->seek((int)(position * m_reader->getSpecs().rate));

 	if(m_status == AUD_STATUS_STOPPED)
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -74,7 +74,7 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
@@ -90,7 +90,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()

 if(CXX_HAS_SSE)
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -72,17 +72,20 @@ static void session_print(const string& str)

 static void session_print_status()
 {
+	int sample, tile;
+	double total_time, sample_time, render_time;
 	string status, substatus;

 	/* get status */
-	float progress = options.session->progress.get_progress();
+	sample = options.session->progress.get_sample();
+	options.session->progress.get_tile(tile, total_time, sample_time, render_time);
 	options.session->progress.get_status(status, substatus);

 	if(substatus != "")
 		status += ": " + substatus;

 	/* print status */
-	status = string_printf("Progress %05.2f   %s", (double) progress*100, status.c_str());
+	status = string_printf("Sample %d   %s", sample, status.c_str());
 	session_print(status);
 }

@@ -164,12 +167,13 @@ static void display_info(Progress& progress)
 	latency = (elapsed - last);
 	last = elapsed;

-	double total_time, sample_time;
+	int sample, tile;
+	double total_time, sample_time, render_time;
 	string status, substatus;

-	progress.get_time(total_time, sample_time);
+	sample = progress.get_sample();
+	progress.get_tile(tile, total_time, sample_time, render_time);
 	progress.get_status(status, substatus);
-	float progress_val = progress.get_progress();

 	if(substatus != "")
 		status += ": " + substatus;
@@ -180,10 +184,10 @@ static void display_info(Progress& progress)
 	        "%s"
 	        "        Time: %.2f"
 	        "        Latency: %.4f"
-	        "        Progress: %05.2f"
+	        "        Sample: %d"
 	        "        Average: %.4f"
 	        "        Interactive: %s",
-	        status.c_str(), total_time, latency, (double) progress_val*100, sample_time, interactive.c_str());
+	        status.c_str(), total_time, latency, sample, sample_time, interactive.c_str());

 	view_display_info(str.c_str());

--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -523,7 +523,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)

 	/* we don't yet support arbitrary attributes, for now add vertex
 	 * coordinates as generated coordinates if requested */
-	if(mesh->need_attribute(state.scene, ATTR_STD_GENERATED)) {
+	if (mesh->need_attribute(state.scene, ATTR_STD_GENERATED)) {
 		Attribute *attr = mesh->attributes.add(ATTR_STD_GENERATED);
 		memcpy(attr->data_float3(), mesh->verts.data(), sizeof(float3)*mesh->verts.size());
 	}
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -25,7 +25,6 @@ set(SRC
 	blender_camera.cpp
 	blender_mesh.cpp
 	blender_object.cpp
-	blender_object_cull.cpp
 	blender_particles.cpp
 	blender_curves.cpp
 	blender_logging.cpp
@@ -36,7 +35,6 @@ set(SRC
 	blender_texture.cpp

 	CCL_api.h
-	blender_object_cull.h
 	blender_sync.h
 	blender_session.h
 	blender_texture.h
--- a/intern/cycles/blender/addon/init.py
+++ b/intern/cycles/blender/addon/init.py
@@ -23,25 +23,11 @@ bl_info = {
    "location": "Info header, render engine menu",
    "description": "Cycles Render Engine integration",
    "warning": "",
-    "wiki_url": "https://docs.blender.org/manual/en/dev/render/cycles/",
+    "wiki_url": "https://www.blender.org/manual/render/cycles/index.html",
    "tracker_url": "",
    "support": 'OFFICIAL',
    "category": "Render"}

-# Support 'reload' case.
-if "bpy" in locals():
-    import importlib
-    if "engine" in locals():
-        importlib.reload(engine)
-    if "version_update" in locals():
-        importlib.reload(version_update)
-    if "ui" in locals():
-        importlib.reload(ui)
-    if "properties" in locals():
-        importlib.reload(properties)
-    if "presets" in locals():
-        importlib.reload(presets)
-
 import bpy

 from . import (
@@ -107,13 +93,7 @@ def engine_exit():
    engine.exit()


-classes = (
-    CyclesRender,
-)
-
-
 def register():
-    from bpy.utils import register_class
    from . import ui
    from . import properties
    from . import presets
@@ -128,15 +108,12 @@ def register():
    properties.register()
    ui.register()
    presets.register()
-
-    for cls in classes:
-        register_class(cls)
+    bpy.utils.register_module(__name__)

    bpy.app.handlers.version_update.append(version_update.do_versions)


 def unregister():
-    from bpy.utils import unregister_class
    from . import ui
    from . import properties
    from . import presets
@@ -147,6 +124,4 @@ def unregister():
    ui.unregister()
    properties.unregister()
    presets.unregister()
-
-    for cls in classes:
-        unregister_class(cls)
+    bpy.utils.unregister_module(__name__)
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -50,24 +50,6 @@ def _workaround_buggy_drivers():
            _cycles.opencl_disable()


-def _configure_argument_parser():
-    import argparse
-    parser = argparse.ArgumentParser(description="Cycles Addon argument parser")
-    parser.add_argument("--cycles-resumable-num-chunks",
-                        help="Number of chunks to split sample range into",
-                        default=None)
-    parser.add_argument("--cycles-resumable-current-chunk",
-                        help="Current chunk of samples range to render",
-                        default=None)
-    parser.add_argument("--cycles-resumable-start-chunk",
-                        help="Start chunk to render",
-                        default=None)
-    parser.add_argument("--cycles-resumable-end-chunk",
-                        help="End chunk to render",
-                        default=None)
-    return parser
-
-
 def _parse_command_line():
    import sys

@@ -75,22 +57,25 @@ def _parse_command_line():
    if "--" not in argv:
        return

-    parser = _configure_argument_parser()
-    args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:])
+    argv = argv[argv.index("--") + 1:]

-    if args.cycles_resumable_num_chunks is not None:
-        if args.cycles_resumable_current_chunk is not None:
-            import _cycles
-            _cycles.set_resumable_chunk(
-                    int(args.cycles_resumable_num_chunks),
-                    int(args.cycles_resumable_current_chunk))
-        elif args.cycles_resumable_start_chunk is not None and \
-             args.cycles_resumable_end_chunk:
-            import _cycles
-            _cycles.set_resumable_chunk_range(
-                    int(args.cycles_resumable_num_chunks),
-                    int(args.cycles_resumable_start_chunk),
-                    int(args.cycles_resumable_end_chunk))
+    num_resumable_chunks = None
+    current_resumable_chunk = None
+
+    # TODO(sergey): Add some nice error ptins if argument is not used properly.
+    idx = 0
+    while idx < len(argv) - 1:
+        arg = argv[idx]
+        if arg == '--cycles-resumable-num-chunks':
+            num_resumable_chunks = int(argv[idx + 1])
+        elif arg == '--cycles-resumable-current-chunk':
+            current_resumable_chunk = int(argv[idx + 1])
+        idx += 1
+
+    if num_resumable_chunks is not None and current_resumable_chunk is not None:
+        import _cycles
+        _cycles.set_resumable_chunks(num_resumable_chunks,
+                                     current_resumable_chunk)


 def init():
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -82,23 +82,12 @@ class AddPresetSampling(AddPresetBase, Operator):
    preset_subdir = "cycles/sampling"


-classes = (
-    AddPresetIntegrator,
-    AddPresetSampling,
-)
-
-
 def register():
-    from bpy.utils import register_class
-    for cls in classes:
-        register_class(cls)
+    pass


 def unregister():
-    from bpy.utils import unregister_class
-    for cls in classes:
-        unregister_class(cls)
-
+    pass

 if __name__ == "__main__":
    register()
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -288,7 +288,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
                            "Zero disables the test and never ignores lights",
                min=0.0, max=1.0,
-                default=0.01,
+                default=0.05,
                )

        cls.caustics_reflective = BoolProperty(
@@ -528,12 +528,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                description="Use special type BVH optimized for hair (uses more ram but renders faster)",
                default=True,
                )
-        cls.debug_bvh_time_steps = IntProperty(
-                name="BVH Time Steps",
-                description="Split BVH primitives by this number of time steps to speed up render time in cost of memory",
-                default=0,
-                min=0, max=16,
-                )
        cls.tile_order = EnumProperty(
                name="Tile Order",
                description="Tile order for rendering",
@@ -638,20 +632,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
            items=enum_texture_limit
            )

-        cls.ao_bounces = IntProperty(
-            name="AO Bounces",
-            default=0,
-            description="Approximate indirect light with background tinted ambient occlusion at the specified bounce, 0 disables this feature",
-            min=0, max=1024,
-            )
-
-        cls.ao_bounces_render = IntProperty(
-            name="AO Bounces Render",
-            default=0,
-            description="Approximate indirect light with background tinted ambient occlusion at the specified bounce, 0 disables this feature",
-            min=0, max=1024,
-            )
-
        # Various fine-tuning debug flags

        def devices_update_callback(self, context):
@@ -665,10 +645,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
        cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
        cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
-        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
-        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_opencl_kernel_type = EnumProperty(
            name="OpenCL Kernel Type",
@@ -695,8 +673,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
            update=devices_update_callback
            )

-        cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback);
-
        cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)

    @classmethod
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -86,10 +86,12 @@ def use_sample_all_lights(context):

    return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect

-def show_device_active(context):
-    cscene = context.scene.cycles
-    if cscene.device != 'GPU':
+def show_device_selection(context):
+    type = get_device_type(context)
+    if type == 'NETWORK':
        return True
+    if not type in {'CUDA', 'OPENCL'}:
+        return False
    return context.user_preferences.addons[__package__].preferences.has_active_device()


@@ -215,7 +217,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
        draw_samples_info(layout, context)


-class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
+class CyclesRender_PT_geometery(CyclesButtonsPanel, Panel):
    bl_label = "Geometry"
    bl_options = {'DEFAULT_CLOSED'}

@@ -224,7 +226,6 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):

        scene = context.scene
        cscene = scene.cycles
-        ccscene = scene.cycles_curves

        if cscene.feature_set == 'EXPERIMENTAL':
            split = layout.split()
@@ -251,25 +252,6 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
            row.prop(cscene, "volume_step_size")
            row.prop(cscene, "volume_max_steps")

-        layout.prop(ccscene, "use_curves", text="Use Hair")
-        col = layout.column()
-        col.active = ccscene.use_curves
-
-        col.prop(ccscene, "primitive", text="Primitive")
-        col.prop(ccscene, "shape", text="Shape")
-
-        if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
-            col.prop(ccscene, "cull_backfacing", text="Cull back-faces")
-
-        if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
-            col.prop(ccscene, "resolution", text="Resolution")
-        elif ccscene.primitive == 'CURVE_SEGMENTS':
-            col.prop(ccscene, "subdivisions", text="Curve subdivisions")
-
-        row = col.row()
-        row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
-

 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
    bl_label = "Light Paths"
@@ -430,10 +412,6 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
        col.prop(cscene, "debug_use_spatial_splits")
        col.prop(cscene, "debug_use_hair_bvh")

-        row = col.row()
-        row.active = not cscene.debug_use_spatial_splits
-        row.prop(cscene, "debug_bvh_time_steps")
-

 class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
    bl_label = "Layer"
@@ -789,13 +767,10 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
        col = layout.column()
        col.label(text="Performance:")
        row = col.row()
-        sub = row.row()
-        sub.active = scene.render.use_simplify and cscene.use_camera_cull
-        sub.prop(cob, "use_camera_cull")
-
-        sub = row.row()
-        sub.active = scene.render.use_simplify and cscene.use_distance_cull
-        sub.prop(cob, "use_distance_cull")
+        row.active = scene.render.use_simplify and cscene.use_camera_cull
+        row.prop(cob, "use_camera_cull")
+        row.active = scene.render.use_simplify and cscene.use_distance_cull
+        row.prop(cob, "use_distance_cull")


 class CYCLES_OT_use_shading_nodes(Operator):
@@ -1036,11 +1011,10 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
        layout = self.layout

        light = context.world.light_settings
-        scene = context.scene

        row = layout.row()
        sub = row.row()
-        sub.active = light.use_ambient_occlusion or scene.render.use_simplify
+        sub.active = light.use_ambient_occlusion
        sub.prop(light, "ao_factor", text="Factor")
        row.prop(light, "distance", text="Distance")

@@ -1417,6 +1391,43 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
            layout.template_ID(slot, "texture", new="texture.new")


+class CyclesRender_PT_CurveRendering(CyclesButtonsPanel, Panel):
+    bl_label = "Cycles Hair Rendering"
+    bl_context = "particle"
+
+    @classmethod
+    def poll(cls, context):
+        psys = context.particle_system
+        return CyclesButtonsPanel.poll(context) and psys and psys.settings.type == 'HAIR'
+
+    def draw_header(self, context):
+        ccscene = context.scene.cycles_curves
+        self.layout.prop(ccscene, "use_curves", text="")
+
+    def draw(self, context):
+        layout = self.layout
+
+        scene = context.scene
+        ccscene = scene.cycles_curves
+
+        layout.active = ccscene.use_curves
+
+        layout.prop(ccscene, "primitive", text="Primitive")
+        layout.prop(ccscene, "shape", text="Shape")
+
+        if not (ccscene.primitive in {'CURVE_SEGMENTS', 'LINE_SEGMENTS'} and ccscene.shape == 'RIBBONS'):
+            layout.prop(ccscene, "cull_backfacing", text="Cull back-faces")
+
+        if ccscene.primitive == 'TRIANGLES' and ccscene.shape == 'THICK':
+            layout.prop(ccscene, "resolution", text="Resolution")
+        elif ccscene.primitive == 'CURVE_SEGMENTS':
+            layout.prop(ccscene, "subdivisions", text="Curve subdivisions")
+
+        row = layout.row()
+        row.prop(ccscene, "minimum_width", text="Min Pixels")
+        row.prop(ccscene, "maximum_width", text="Max Ext.")
+
+
 class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
    bl_label = "Bake"
    bl_context = "render"
@@ -1516,18 +1527,15 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
        row.prop(cscene, "debug_use_cpu_avx", toggle=True)
        row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
        col.prop(cscene, "debug_use_qbvh")
-        col.prop(cscene, "debug_use_cpu_split_kernel")

        col = layout.column()
        col.label('CUDA Flags:')
        col.prop(cscene, "debug_use_cuda_adaptive_compile")
-        col.prop(cscene, "debug_use_cuda_split_kernel")

        col = layout.column()
        col.label('OpenCL Flags:')
        col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
        col.prop(cscene, "debug_opencl_device_type", text="Device")
-        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
        col.prop(cscene, "debug_use_opencl_debug", text="Debug")


@@ -1614,13 +1622,6 @@ class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
        row.active = cscene.use_distance_cull
        row.prop(cscene, "distance_cull_margin", text="Distance")

-        split = layout.split()
-        col = split.column()
-        col.prop(cscene, "ao_bounces")
-
-        col = split.column()
-        col.prop(cscene, "ao_bounces_render")
-
 def draw_device(self, context):
    scene = context.scene
    layout = self.layout
@@ -1634,7 +1635,7 @@ def draw_device(self, context):
        split = layout.split(percentage=1/3)
        split.label("Device:")
        row = split.row()
-        row.active = show_device_active(context)
+        row.active = show_device_selection(context)
        row.prop(cscene, "device", text="")

        if engine.with_osl() and use_cpu(context):
@@ -1713,75 +1714,17 @@ def get_panels():

    return panels

-
-classes = (
-    CYCLES_MT_sampling_presets,
-    CYCLES_MT_integrator_presets,
-    CyclesRender_PT_sampling,
-    CyclesRender_PT_geometry,
-    CyclesRender_PT_light_paths,
-    CyclesRender_PT_motion_blur,
-    CyclesRender_PT_film,
-    CyclesRender_PT_performance,
-    CyclesRender_PT_layer_options,
-    CyclesRender_PT_layer_passes,
-    CyclesRender_PT_views,
-    Cycles_PT_post_processing,
-    CyclesCamera_PT_dof,
-    Cycles_PT_context_material,
-    CyclesObject_PT_motion_blur,
-    CyclesObject_PT_cycles_settings,
-    CYCLES_OT_use_shading_nodes,
-    CyclesLamp_PT_preview,
-    CyclesLamp_PT_lamp,
-    CyclesLamp_PT_nodes,
-    CyclesLamp_PT_spot,
-    CyclesWorld_PT_preview,
-    CyclesWorld_PT_surface,
-    CyclesWorld_PT_volume,
-    CyclesWorld_PT_ambient_occlusion,
-    CyclesWorld_PT_mist,
-    CyclesWorld_PT_ray_visibility,
-    CyclesWorld_PT_settings,
-    CyclesMaterial_PT_preview,
-    CyclesMaterial_PT_surface,
-    CyclesMaterial_PT_volume,
-    CyclesMaterial_PT_displacement,
-    CyclesMaterial_PT_settings,
-    CyclesTexture_PT_context,
-    CyclesTexture_PT_node,
-    CyclesTexture_PT_mapping,
-    CyclesTexture_PT_colors,
-    CyclesParticle_PT_textures,
-    CyclesRender_PT_bake,
-    CyclesRender_PT_debug,
-    CyclesParticle_PT_CurveSettings,
-    CyclesScene_PT_simplify,
-)
-
-
 def register():
-    from bpy.utils import register_class
-
    bpy.types.RENDER_PT_render.append(draw_device)
    bpy.types.VIEW3D_HT_header.append(draw_pause)

    for panel in get_panels():
        panel.COMPAT_ENGINES.add('CYCLES')

-    for cls in classes:
-        register_class(cls)
-
-
 def unregister():
-    from bpy.utils import unregister_class
-
    bpy.types.RENDER_PT_render.remove(draw_device)
    bpy.types.VIEW3D_HT_header.remove(draw_pause)

    for panel in get_panels():
        if 'CYCLES' in panel.COMPAT_ENGINES:
            panel.COMPAT_ENGINES.remove('CYCLES')
-
-    for cls in classes:
-        unregister_class(cls)
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -29,6 +29,24 @@

 CCL_NAMESPACE_BEGIN

+/* Utilities */
+
+/* Hair curve functions */
+
+void curveinterp_v3_v3v3v3v3(float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4]);
+void interp_weights(float t, float data[4]);
+float shaperadius(float shape, float root, float tip, float time);
+void InterpolateKeySegments(int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData);
+bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int uv_num);
+bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num);
+bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background);
+void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData);
+void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
+                               float3 RotCam, bool is_ortho);
+void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution);
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata);
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata);
+
 ParticleCurveData::ParticleCurveData()
 {
 }
@@ -37,7 +55,7 @@ ParticleCurveData::~ParticleCurveData()
 {
 }

-static void interp_weights(float t, float data[4])
+void interp_weights(float t, float data[4])
 {
 	/* Cardinal curve interpolation */
 	float t2 = t * t;
@@ -50,19 +68,17 @@ static void interp_weights(float t, float data[4])
 	data[3] =  fc          * t3  - fc * t2;
 }

-static void curveinterp_v3_v3v3v3v3(float3 *p,
-                                    float3 *v1, float3 *v2, float3 *v3, float3 *v4,
-                                    const float w[4])
+void curveinterp_v3_v3v3v3v3(float3 *p, float3 *v1, float3 *v2, float3 *v3, float3 *v4, const float w[4])
 {
 	p->x = v1->x * w[0] + v2->x * w[1] + v3->x * w[2] + v4->x * w[3];
 	p->y = v1->y * w[0] + v2->y * w[1] + v3->y * w[2] + v4->y * w[3];
 	p->z = v1->z * w[0] + v2->z * w[1] + v3->z * w[2] + v4->z * w[3];
 }

-static float shaperadius(float shape, float root, float tip, float time)
+float shaperadius(float shape, float root, float tip, float time)
 {
 	float radius = 1.0f - time;
-
+	
 	if(shape != 0.0f) {
 		if(shape < 0.0f)
 			radius = powf(radius, 1.0f + shape);
@@ -74,13 +90,7 @@ static float shaperadius(float shape, float root, float tip, float time)

 /* curve functions */

-static void InterpolateKeySegments(int seg,
-                                   int segno,
-                                   int key,
-                                   int curve,
-                                   float3 *keyloc,
-                                   float *time,
-                                   ParticleCurveData *CData)
+void InterpolateKeySegments(int seg, int segno, int key, int curve, float3 *keyloc, float *time, ParticleCurveData *CData)
 {
 	float3 ckey_loc1 = CData->curvekey_co[key];
 	float3 ckey_loc2 = ckey_loc1;
@@ -109,11 +119,7 @@ static void InterpolateKeySegments(int seg,
 		curveinterp_v3_v3v3v3v3(keyloc, &ckey_loc1, &ckey_loc2, &ckey_loc3, &ckey_loc4, t);
 }

-static bool ObtainCacheParticleData(Mesh *mesh,
-                                    BL::Mesh *b_mesh,
-                                    BL::Object *b_ob,
-                                    ParticleCurveData *CData,
-                                    bool background)
+bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background)
 {
 	int curvenum = 0;
 	int keyno = 0;
@@ -137,7 +143,7 @@ static bool ObtainCacheParticleData(Mesh *mesh,
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-
+				
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;

@@ -155,7 +161,7 @@ static bool ObtainCacheParticleData(Mesh *mesh,
 				CData->psys_shader.push_back_slow(shader);

 				float radius = get_float(cpsys, "radius_scale") * 0.5f;
-
+	
 				CData->psys_rootradius.push_back_slow(radius * get_float(cpsys, "root_width"));
 				CData->psys_tipradius.push_back_slow(radius * get_float(cpsys, "tip_width"));
 				CData->psys_shape.push_back_slow(get_float(cpsys, "shape"));
@@ -175,7 +181,7 @@ static bool ObtainCacheParticleData(Mesh *mesh,
 				for(; pa_no < totparts+totchild; pa_no++) {
 					int keynum = 0;
 					CData->curve_firstkey.push_back_slow(keyno);
-
+					
 					float curve_length = 0.0f;
 					float3 pcKey;
 					for(int step_no = 0; step_no < ren_step; step_no++) {
@@ -207,12 +213,7 @@ static bool ObtainCacheParticleData(Mesh *mesh,
 	return true;
 }

-static bool ObtainCacheParticleUV(Mesh *mesh,
-                                  BL::Mesh *b_mesh,
-                                  BL::Object *b_ob,
-                                  ParticleCurveData *CData,
-                                  bool background,
-                                  int uv_num)
+bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int uv_num)
 {
 	if(!(mesh && b_mesh && b_ob && CData))
 		return false;
@@ -230,7 +231,7 @@ static bool ObtainCacheParticleUV(Mesh *mesh,
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-
+				
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;

@@ -266,12 +267,7 @@ static bool ObtainCacheParticleUV(Mesh *mesh,
 	return true;
 }

-static bool ObtainCacheParticleVcol(Mesh *mesh,
-                                    BL::Mesh *b_mesh,
-                                    BL::Object *b_ob,
-                                    ParticleCurveData *CData,
-                                    bool background,
-                                    int vcol_num)
+bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, ParticleCurveData *CData, bool background, int vcol_num)
 {
 	if(!(mesh && b_mesh && b_ob && CData))
 		return false;
@@ -289,7 +285,7 @@ static bool ObtainCacheParticleVcol(Mesh *mesh,
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
 				int totcurves = totchild;
-
+				
 				if(b_part.child_type() == 0 || totchild == 0)
 					totcurves += totparts;

@@ -337,16 +333,16 @@ static void set_resolution(BL::Object *b_ob, BL::Scene *scene, bool render)
 	}
 }

-static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
-                                      float3 RotCam, bool is_ortho)
+void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
+                               float3 RotCam, bool is_ortho)
 {
 	int vertexno = mesh->verts.size();
 	int vertexindex = vertexno;
 	int numverts = 0, numtris = 0;

 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -358,8 +354,8 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);

 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -384,7 +380,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,

 				if(curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1)
 					v1 = CData->curvekey_co[curvekey] - CData->curvekey_co[max(curvekey - 1, CData->curve_firstkey[curve])];
-				else
+				else 
 					v1 = CData->curvekey_co[curvekey + 1] - CData->curvekey_co[curvekey - 1];

 				time = CData->curvekey_time[curvekey]/CData->curve_length[curve];
@@ -411,7 +407,6 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}

-	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -421,30 +416,28 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 	/* texture coords still needed */
 }

-static void ExportCurveTriangleGeometry(Mesh *mesh,
-                                        ParticleCurveData *CData,
-                                        int resolution)
+void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resolution)
 {
 	int vertexno = mesh->verts.size();
 	int vertexindex = vertexno;
 	int numverts = 0, numtris = 0;

 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

-			numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution;
-			numtris += (CData->curve_keynum[curve] - 1)*2*resolution;
+			numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution;
+			numtris += (CData->curve_keynum[curve] - 2)*resolution;
 		}
 	}

 	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);

 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -546,7 +539,6 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 		}
 	}

-	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -556,7 +548,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 	/* texture coords still needed */
 }

-static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
+void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 {
 	int num_keys = 0;
 	int num_curves = 0;
@@ -565,13 +557,13 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
 		return;

 	Attribute *attr_intercept = NULL;
-
+	
 	if(mesh->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
 		attr_intercept = mesh->curve_attributes.add(ATTR_STD_CURVE_INTERCEPT);

 	/* compute and reserve size of arrays */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -590,8 +582,8 @@ static void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CDa
 	num_curves = 0;

 	/* actually export */
-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -685,13 +677,8 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 	/* in case of new attribute, we verify if there really was any motion */
 	if(new_attribute) {
 		if(i != numkeys || !have_motion) {
-			/* No motion or hair "topology" changed, remove attributes again. */
-			if(i != numkeys) {
-				VLOG(1) << "Hair topology changed, removing attribute.";
-			}
-			else {
-				VLOG(1) << "No motion, removing attribute.";
-			}
+			/* no motion, remove attributes again */
+			VLOG(1) << "No motion, removing attribute";
 			mesh->curve_attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 		}
 		else if(time_index > 0) {
@@ -711,10 +698,7 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 	}
 }

-static void ExportCurveTriangleUV(ParticleCurveData *CData,
-                                  int vert_offset,
-                                  int resol,
-                                  float3 *uvdata)
+void ExportCurveTriangleUV(ParticleCurveData *CData, int vert_offset, int resol, float3 *uvdata)
 {
 	if(uvdata == NULL)
 		return;
@@ -724,8 +708,8 @@ static void ExportCurveTriangleUV(ParticleCurveData *CData,

 	int vertexindex = vert_offset;

-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -759,18 +743,15 @@ static void ExportCurveTriangleUV(ParticleCurveData *CData,
 	}
 }

-static void ExportCurveTriangleVcol(ParticleCurveData *CData,
-                                    int vert_offset,
-                                    int resol,
-                                    uchar4 *cdata)
+void ExportCurveTriangleVcol(ParticleCurveData *CData, int vert_offset, int resol, uchar4 *cdata)
 {
 	if(cdata == NULL)
 		return;

 	int vertexindex = vert_offset;

-	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
-		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys]; curve++) {
+	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
+		for(int curve = CData->psys_firstcurve[sys]; curve < CData->psys_firstcurve[sys] + CData->psys_curvenum[sys] ; curve++) {
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

@@ -892,7 +873,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}

 	/* obtain general settings */
-	const bool use_curves = scene->curve_system_manager->use_curves;
+	bool use_curves = scene->curve_system_manager->use_curves;

 	if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
 		if(!motion)
@@ -900,11 +881,11 @@ void BlenderSync::sync_curves(Mesh *mesh,
 		return;
 	}

-	const int primitive = scene->curve_system_manager->primitive;
-	const int triangle_method = scene->curve_system_manager->triangle_method;
-	const int resolution = scene->curve_system_manager->resolution;
-	const size_t vert_num = mesh->verts.size();
-	const size_t tri_num = mesh->num_triangles();
+	int primitive = scene->curve_system_manager->primitive;
+	int triangle_method = scene->curve_system_manager->triangle_method;
+	int resolution = scene->curve_system_manager->resolution;
+	size_t vert_num = mesh->verts.size();
+	size_t tri_num = mesh->num_triangles();
 	int used_res = 1;

 	/* extract particle hair data - should be combined with connecting to mesh later*/
@@ -1063,3 +1044,4 @@ void BlenderSync::sync_curves(Mesh *mesh,
 }

 CCL_NAMESPACE_END
+
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -27,7 +27,6 @@
 #include "subd_patch.h"
 #include "subd_split.h"

-#include "util_algorithm.h"
 #include "util_foreach.h"
 #include "util_logging.h"
 #include "util_math.h"
@@ -526,177 +525,69 @@ static void attr_create_uv_map(Scene *scene,
 }

 /* Create vertex pointiness attributes. */
-
-/* Compare vertices by sum of their coordinates. */
-class VertexAverageComparator {
-public:
-	VertexAverageComparator(const array<float3>& verts)
-	        : verts_(verts) {
-	}
-
-	bool operator()(const int& vert_idx_a, const int& vert_idx_b)
-	{
-		const float3 &vert_a = verts_[vert_idx_a];
-		const float3 &vert_b = verts_[vert_idx_b];
-		if(vert_a == vert_b) {
-			/* Special case for doubles, so we ensure ordering. */
-			return vert_idx_a > vert_idx_b;
-		}
-		const float x1 = vert_a.x + vert_a.y + vert_a.z;
-		const float x2 = vert_b.x + vert_b.y + vert_b.z;
-		return x1 < x2;
-	}
-
-protected:
-	const array<float3>& verts_;
-};
-
 static void attr_create_pointiness(Scene *scene,
                                   Mesh *mesh,
                                   BL::Mesh& b_mesh,
                                   bool subdivision)
 {
-	if(!mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
-		return;
-	}
-	const int num_verts = b_mesh.vertices.length();
-	/* STEP 1: Find out duplicated vertices and point duplicates to a single
-	 *         original vertex.
-	 */
-	vector<int> sorted_vert_indeices(num_verts);
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		sorted_vert_indeices[vert_index] = vert_index;
-	}
-	VertexAverageComparator compare(mesh->verts);
-	sort(sorted_vert_indeices.begin(), sorted_vert_indeices.end(), compare);
-	/* This array stores index of the original vertex for the given vertex
-	 * index.
-	 */
-	vector<int> vert_orig_index(num_verts);
-	for(int sorted_vert_index = 0;
-	    sorted_vert_index < num_verts;
-	    ++sorted_vert_index)
-	{
-		const int vert_index = sorted_vert_indeices[sorted_vert_index];
-		const float3 &vert_co = mesh->verts[vert_index];
-		bool found = false;
-		for(int other_sorted_vert_index = sorted_vert_index + 1;
-		    other_sorted_vert_index < num_verts;
-		    ++other_sorted_vert_index)
-		{
-			const int other_vert_index =
-			        sorted_vert_indeices[other_sorted_vert_index];
-			const float3 &other_vert_co = mesh->verts[other_vert_index];
-			/* We are too far away now, we wouldn't have duplicate. */
-			if((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
-			   (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
-			{
-				break;
+	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
+		const int numverts = b_mesh.vertices.length();
+		AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+		Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
+		float *data = attr->data_float();
+		int *counter = new int[numverts];
+		float *raw_data = new float[numverts];
+		float3 *edge_accum = new float3[numverts];
+
+		/* Calculate pointiness using single ring neighborhood. */
+		memset(counter, 0, sizeof(int) * numverts);
+		memset(raw_data, 0, sizeof(float) * numverts);
+		memset(edge_accum, 0, sizeof(float3) * numverts);
+		BL::Mesh::edges_iterator e;
+		int i = 0;
+		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
+			int v0 = b_mesh.edges[i].vertices()[0],
+			    v1 = b_mesh.edges[i].vertices()[1];
+			float3 co0 = get_float3(b_mesh.vertices[v0].co()),
+			       co1 = get_float3(b_mesh.vertices[v1].co());
+			float3 edge = normalize(co1 - co0);
+			edge_accum[v0] += edge;
+			edge_accum[v1] += -edge;
+			++counter[v0];
+			++counter[v1];
+		}
+		i = 0;
+		BL::Mesh::vertices_iterator v;
+		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) {
+			if(counter[i] > 0) {
+				float3 normal = get_float3(b_mesh.vertices[i].normal());
+				float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i]));
+				raw_data[i] = angle * M_1_PI_F;
 			}
-			/* Found duplicate. */
-			if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) {
-				found = true;
-				vert_orig_index[vert_index] = other_vert_index;
-				break;
+			else {
+				raw_data[i] = 0.0f;
 			}
 		}
-		if(!found) {
-			vert_orig_index[vert_index] = vert_index;
+
+		/* Blur vertices to approximate 2 ring neighborhood. */
+		memset(counter, 0, sizeof(int) * numverts);
+		memcpy(data, raw_data, sizeof(float) * numverts);
+		i = 0;
+		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
+			int v0 = b_mesh.edges[i].vertices()[0],
+			    v1 = b_mesh.edges[i].vertices()[1];
+			data[v0] += raw_data[v1];
+			data[v1] += raw_data[v0];
+			++counter[v0];
+			++counter[v1];
 		}
-	}
-	/* Make sure we always points to the very first orig vertex. */
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		int orig_index = vert_orig_index[vert_index];
-		while(orig_index != vert_orig_index[orig_index]) {
-			orig_index = vert_orig_index[orig_index];
+		for(i = 0; i < numverts; ++i) {
+			data[i] /= counter[i] + 1;
 		}
-		vert_orig_index[vert_index] = orig_index;
-	}
-	sorted_vert_indeices.free_memory();
-	/* STEP 2: Calculate vertex normals taking into account their possible
-	 *         duplicates which gets "welded" together.
-	 */
-	vector<float3> vert_normal(num_verts, make_float3(0.0f, 0.0f, 0.0f));
-	/* First we accumulate all vertex normals in the original index. */
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		const float3 normal = get_float3(b_mesh.vertices[vert_index].normal());
-		const int orig_index = vert_orig_index[vert_index];
-		vert_normal[orig_index] += normal;
-	}
-	/* Then we normalize the accumulated result and flush it to all duplicates
-	 * as well.
-	 */
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		const int orig_index = vert_orig_index[vert_index];
-		vert_normal[vert_index] = normalize(vert_normal[orig_index]);
-	}
-	/* STEP 3: Calculate pointiness using single ring neighborhood. */
-	vector<int> counter(num_verts, 0);
-	vector<float> raw_data(num_verts, 0.0f);
-	vector<float3> edge_accum(num_verts, make_float3(0.0f, 0.0f, 0.0f));
-	BL::Mesh::edges_iterator e;
-	EdgeMap visited_edges;
-	int edge_index = 0;
-	memset(&counter[0], 0, sizeof(int) * counter.size());
-	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
-		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
-		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
-		if(visited_edges.exists(v0, v1)) {
-			continue;
-		}
-		visited_edges.insert(v0, v1);
-		float3 co0 = get_float3(b_mesh.vertices[v0].co()),
-		       co1 = get_float3(b_mesh.vertices[v1].co());
-		float3 edge = normalize(co1 - co0);
-		edge_accum[v0] += edge;
-		edge_accum[v1] += -edge;
-		++counter[v0];
-		++counter[v1];
-	}
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		const int orig_index = vert_orig_index[vert_index];
-		if(orig_index != vert_index) {
-			/* Skip duplicates, they'll be overwritten later on. */
-			continue;
-		}
-		if(counter[vert_index] > 0) {
-			const float3 normal = vert_normal[vert_index];
-			const float angle =
-			        safe_acosf(dot(normal,
-			                       edge_accum[vert_index] / counter[vert_index]));
-			raw_data[vert_index] = angle * M_1_PI_F;
-		}
-		else {
-			raw_data[vert_index] = 0.0f;
-		}
-	}
-	/* STEP 3: Blur vertices to approximate 2 ring neighborhood. */
-	AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
-	Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
-	float *data = attr->data_float();
-	memcpy(data, &raw_data[0], sizeof(float) * raw_data.size());
-	memset(&counter[0], 0, sizeof(int) * counter.size());
-	edge_index = 0;
-	visited_edges.clear();
-	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
-		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
-		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
-		if(visited_edges.exists(v0, v1)) {
-			continue;
-		}
-		visited_edges.insert(v0, v1);
-		data[v0] += raw_data[v1];
-		data[v1] += raw_data[v0];
-		++counter[v0];
-		++counter[v1];
-	}
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		data[vert_index] /= counter[vert_index] + 1;
-	}
-	/* STEP 4: Copy attribute to the duplicated vertices. */
-	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
-		const int orig_index = vert_orig_index[vert_index];
-		data[vert_index] = data[orig_index];
+
+		delete [] counter;
+		delete [] raw_data;
+		delete [] edge_accum;
 	}
 }

@@ -706,8 +597,8 @@ static void create_mesh(Scene *scene,
                        Mesh *mesh,
                        BL::Mesh& b_mesh,
                        const vector<Shader*>& used_shaders,
-                        bool subdivision = false,
-                        bool subdivide_uvs = true)
+                        bool subdivision=false,
+                        bool subdivide_uvs=true)
 {
 	/* count vertices and faces */
 	int numverts = b_mesh.vertices.length();
@@ -765,6 +656,9 @@ static void create_mesh(Scene *scene,
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
 	}

+	/* Create needed vertex attributes. */
+	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
+
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
@@ -777,19 +671,28 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 			bool smooth = f->use_smooth() || use_loop_normals;

+			/* split vertices if normal is different
+			 *
+			 * note all vertex attributes must have been set here so we can split
+			 * and copy attributes in split_vertex without remapping later */
 			if(use_loop_normals) {
 				BL::Array<float, 12> loop_normals = f->split_normals();
+
 				for(int i = 0; i < n; i++) {
-					N[vi[i]] = make_float3(loop_normals[i * 3],
-					                       loop_normals[i * 3 + 1],
-					                       loop_normals[i * 3 + 2]);
+					float3 loop_N = make_float3(loop_normals[i * 3], loop_normals[i * 3 + 1], loop_normals[i * 3 + 2]);
+
+					if(N[vi[i]] != loop_N) {
+						int new_vi = mesh->split_vertex(vi[i]);
+
+						/* set new normal and vertex index */
+						N = attr_N->data_float3();
+						N[new_vi] = loop_N;
+						vi[i] = new_vi;
+					}
 				}
 			}

-			/* Create triangles.
-			 *
-			 * NOTE: Autosmooth is already taken care about.
-			 */
+			/* create triangles */
 			if(n == 4) {
 				if(is_zero(cross(mesh->verts[vi[1]] - mesh->verts[vi[0]], mesh->verts[vi[2]] - mesh->verts[vi[0]])) ||
 				   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
@@ -821,8 +724,24 @@ static void create_mesh(Scene *scene,

 			vi.reserve(n);
 			for(int i = 0; i < n; i++) {
-				/* NOTE: Autosmooth is already taken care about. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
+
+				/* split vertices if normal is different
+				 *
+				 * note all vertex attributes must have been set here so we can split
+				 * and copy attributes in split_vertex without remapping later */
+				if(use_loop_normals) {
+					float3 loop_N = get_float3(b_mesh.loops[p->loop_start() + i].normal());
+
+					if(N[vi[i]] != loop_N) {
+						int new_vi = mesh->split_vertex(vi[i]);
+
+						/* set new normal and vertex index */
+						N = attr_N->data_float3();
+						N[new_vi] = loop_N;
+						vi[i] = new_vi;
+					}
+				}
 			}

 			/* create subd faces */
@@ -833,7 +752,6 @@ static void create_mesh(Scene *scene,
 	/* Create all needed attributes.
 	 * The calculate functions will check whether they're needed or not.
 	 */
-	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
 	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);

@@ -1043,20 +961,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,

 		mesh->subdivision_type = object_subdivision_type(b_ob, preview, experimental);

-		/* Disable adaptive subdivision while baking as the baking system
-		 * currently doesnt support the topology and will crash.
-		 */
-		if(scene->bake_manager->get_baking()) {
-			mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
-		}
-
-		BL::Mesh b_mesh = object_to_mesh(b_data,
-		                                 b_ob,
-		                                 b_scene,
-		                                 true,
-		                                 !preview,
-		                                 need_undeformed,
-		                                 mesh->subdivision_type);
+		BL::Mesh b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, need_undeformed, mesh->subdivision_type);

 		if(b_mesh) {
 			if(render_layer.use_surfaces && !hide_tris) {
@@ -1181,13 +1086,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,

 	if(ccl::BKE_object_is_deform_modified(b_ob, b_scene, preview)) {
 		/* get derived mesh */
-		b_mesh = object_to_mesh(b_data,
-		                        b_ob,
-		                        b_scene,
-		                        true,
-		                        !preview,
-		                        false,
-		                        Mesh::SUBDIVISION_NONE);
+		b_mesh = object_to_mesh(b_data, b_ob, b_scene, true, !preview, false, false);
 	}

 	if(!b_mesh) {
@@ -1258,12 +1157,10 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 			{
 				/* no motion, remove attributes again */
 				if(b_mesh.vertices.length() != numverts) {
-					VLOG(1) << "Topology differs, disabling motion blur for object "
-					        << b_ob.name();
+					VLOG(1) << "Topology differs, disabling motion blur.";
 				}
 				else {
-					VLOG(1) << "No actual deformation motion for object "
-					        << b_ob.name();
+					VLOG(1) << "No actual deformation motion for object " << b_ob.name();
 				}
 				mesh->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
 				if(attr_mN)
@@ -1294,3 +1191,4 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 }

 CCL_NAMESPACE_END
+
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -25,7 +25,6 @@
 #include "particles.h"
 #include "shader.h"

-#include "blender_object_cull.h"
 #include "blender_sync.h"
 #include "blender_util.h"

@@ -89,6 +88,143 @@ static uint object_ray_visibility(BL::Object& b_ob)
 	return flag;
 }

+/* Culling: per-object camera and distance culling, driven by the "cycles" settings of the scene and object. */
+
+class BlenderObjectCulling
+{
+public:
+	BlenderObjectCulling(Scene *scene, BL::Scene& b_scene)  /* reads the scene-level settings */
+	: use_scene_camera_cull(false),
+	  use_camera_cull(false),
+	  camera_cull_margin(0.0f),
+	  use_scene_distance_cull(false),
+	  use_distance_cull(false),
+	  distance_cull_margin(0.0f)
+	{
+		if(b_scene.render().use_simplify()) {  /* otherwise every flag keeps its disabled default */
+			PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+			use_scene_camera_cull = scene->camera->type != CAMERA_PANORAMA &&  /* never for panorama cameras ... */
+									!b_scene.render().use_multiview() &&  /* ... nor when multiview is on */
+									get_boolean(cscene, "use_camera_cull");
+			use_scene_distance_cull = scene->camera->type != CAMERA_PANORAMA &&  /* same two exclusions as above */
+									  !b_scene.render().use_multiview() &&
+									  get_boolean(cscene, "use_distance_cull");
+
+			camera_cull_margin = get_float(cscene, "camera_cull_margin");
+			distance_cull_margin = get_float(cscene, "distance_cull_margin");
+
+			if (distance_cull_margin == 0.0f) {  /* a zero margin switches distance culling off */
+				use_scene_distance_cull = false;
+			}
+		}
+	}
+
+	void init_object(Scene *scene, BL::Object& b_ob)  /* resolves the per-object flags used by test() */
+	{
+		if(!use_scene_camera_cull && !use_scene_distance_cull) {  /* nothing enabled on the scene: leave per-object flags as they are */
+			return;
+		}
+
+		PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+
+		use_camera_cull = use_scene_camera_cull && get_boolean(cobject, "use_camera_cull");  /* needs scene AND object setting */
+		use_distance_cull = use_scene_distance_cull && get_boolean(cobject, "use_distance_cull");  /* needs scene AND object setting */
+
+		if(use_camera_cull || use_distance_cull) {
+			/* Need to have proper projection matrix. */
+			scene->camera->update();
+		}
+	}
+
+	bool test(Scene *scene, BL::Object& b_ob, Transform& tfm)  /* true when the object is culled */
+	{
+		if(!use_camera_cull && !use_distance_cull) {  /* no test active for this object: never culled */
+			return false;
+		}
+
+		/* Compute world space bounding box corners. */
+		float3 bb[8];
+		BL::Array<float, 24> boundbox = b_ob.bound_box();  /* 8 corners, 3 floats each */
+		for(int i = 0; i < 8; ++i) {
+			float3 p = make_float3(boundbox[3 * i + 0],
+								   boundbox[3 * i + 1],
+								   boundbox[3 * i + 2]);
+			bb[i] = transform_point(&tfm, p);
+		}
+
+		bool camera_culled = use_camera_cull && test_camera(scene, bb);
+		bool distance_culled = use_distance_cull && test_distance(scene, bb);
+
+		return ((camera_culled && distance_culled) ||  /* both tests active: both must cull */
+		        (camera_culled && !use_distance_cull) ||  /* only the camera test active */
+		        (distance_culled && !use_camera_cull));  /* only the distance test active */
+	}
+
+private:
+	/* TODO(sergey): Not really optimal, consider approaches based on k-DOP in order
+	 * to reduce number of objects which are wrongly considered visible.
+	 */
+	bool test_camera(Scene *scene, float3 bb[8])  /* true when the projected box misses the margin-extended view */
+	{
+		Camera *cam = scene->camera;
+		Transform& worldtondc = cam->worldtondc;
+		float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
+			   bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+		bool all_behind = true;
+		for(int i = 0; i < 8; ++i) {
+			float3 p = bb[i];
+			float4 b = make_float4(p.x, p.y, p.z, 1.0f);
+			float4 c = make_float4(dot(worldtondc.x, b),  /* corner multiplied by the worldtondc rows */
+			                       dot(worldtondc.y, b),
+			                       dot(worldtondc.z, b),
+			                       dot(worldtondc.w, b));
+			p = float4_to_float3(c / c.w);  /* divide by w */
+			if(c.z < 0.0f) {  /* negative z: mirror the projected x/y around 0.5 */
+				p.x = 1.0f - p.x;
+				p.y = 1.0f - p.y;
+			}
+			if(c.z >= -camera_cull_margin) {  /* one corner at or past -margin is enough to clear all_behind */
+				all_behind = false;
+			}
+			bb_min = min(bb_min, p);
+			bb_max = max(bb_max, p);
+		}
+		if(all_behind) {
+			return true;
+		}
+		return (bb_min.x >= 1.0f + camera_cull_margin ||  /* projected bounds entirely outside [-margin, 1 + margin] in x or y */
+		        bb_min.y >= 1.0f + camera_cull_margin ||
+		        bb_max.x <= -camera_cull_margin ||
+		        bb_max.y <= -camera_cull_margin);
+	}
+
+	bool test_distance(Scene *scene, float3 bb[8])  /* true when the box is farther from the camera than the margin */
+	{
+		float3 camera_position = transform_get_column(&scene->camera->matrix, 3);  /* column 3 of the camera matrix */
+		float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
+			   bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+
+		/* Find min & max points for x & y & z on bounding box */
+		for(int i = 0; i < 8; ++i) {
+			float3 p = bb[i];
+			bb_min = min(bb_min, p);
+			bb_max = max(bb_max, p);
+		}
+
+		float3 closest_point = max(min(bb_max,camera_position),bb_min);  /* camera position clamped into the axis-aligned bounds */
+		return (len_squared(camera_position - closest_point) >  /* squared distance compared with the squared margin */
+		        distance_cull_margin * distance_cull_margin);
+	}
+
+	bool use_scene_camera_cull;  /* scene-level switch, set in the constructor */
+	bool use_camera_cull;  /* scene switch && object switch, set by init_object() */
+	float camera_cull_margin;  /* scene "camera_cull_margin" */
+	bool use_scene_distance_cull;  /* scene-level switch, set in the constructor */
+	bool use_distance_cull;  /* scene switch && object switch, set by init_object() */
+	float distance_cull_margin;  /* scene "distance_cull_margin"; 0 disables the test */
+};
+
 /* Light */

 void BlenderSync::sync_light(BL::Object& b_parent,
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -1,149 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdlib>
-
-#include "camera.h"
-
-#include "blender_object_cull.h"
-
-CCL_NAMESPACE_BEGIN
-
-BlenderObjectCulling::BlenderObjectCulling(Scene *scene, BL::Scene& b_scene)
-        : use_scene_camera_cull_(false),
-          use_camera_cull_(false),
-          camera_cull_margin_(0.0f),
-          use_scene_distance_cull_(false),
-          use_distance_cull_(false),
-          distance_cull_margin_(0.0f)
-{
-	if(b_scene.render().use_simplify()) {
-		PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
-
-		use_scene_camera_cull_ = scene->camera->type != CAMERA_PANORAMA &&
-		                         !b_scene.render().use_multiview() &&
-		                         get_boolean(cscene, "use_camera_cull");
-		use_scene_distance_cull_ = scene->camera->type != CAMERA_PANORAMA &&
-		                           !b_scene.render().use_multiview() &&
-		                           get_boolean(cscene, "use_distance_cull");
-
-		camera_cull_margin_ = get_float(cscene, "camera_cull_margin");
-		distance_cull_margin_ = get_float(cscene, "distance_cull_margin");
-
-		if(distance_cull_margin_ == 0.0f) {
-			use_scene_distance_cull_ = false;
-		}
-	}
-}
-
-void BlenderObjectCulling::init_object(Scene *scene, BL::Object& b_ob)
-{
-	if(!use_scene_camera_cull_ && !use_scene_distance_cull_) {
-		return;
-	}
-
-	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
-
-	use_camera_cull_ = use_scene_camera_cull_ && get_boolean(cobject, "use_camera_cull");
-	use_distance_cull_ = use_scene_distance_cull_ && get_boolean(cobject, "use_distance_cull");
-
-	if(use_camera_cull_ || use_distance_cull_) {
-		/* Need to have proper projection matrix. */
-		scene->camera->update();
-	}
-}
-
-bool BlenderObjectCulling::test(Scene *scene, BL::Object& b_ob, Transform& tfm)
-{
-	if(!use_camera_cull_ && !use_distance_cull_) {
-		return false;
-	}
-
-	/* Compute world space bounding box corners. */
-	float3 bb[8];
-	BL::Array<float, 24> boundbox = b_ob.bound_box();
-	for(int i = 0; i < 8; ++i) {
-		float3 p = make_float3(boundbox[3 * i + 0],
-		                       boundbox[3 * i + 1],
-		                       boundbox[3 * i + 2]);
-		bb[i] = transform_point(&tfm, p);
-	}
-
-	bool camera_culled = use_camera_cull_ && test_camera(scene, bb);
-	bool distance_culled = use_distance_cull_ && test_distance(scene, bb);
-
-	return ((camera_culled && distance_culled) ||
-	        (camera_culled && !use_distance_cull_) ||
-	        (distance_culled && !use_camera_cull_));
-}
-
-/* TODO(sergey): Not really optimal, consider approaches based on k-DOP in order
- * to reduce number of objects which are wrongly considered visible.
- */
-bool BlenderObjectCulling::test_camera(Scene *scene, float3 bb[8])
-{
-	Camera *cam = scene->camera;
-	Transform& worldtondc = cam->worldtondc;
-	float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
-	       bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
-	bool all_behind = true;
-	for(int i = 0; i < 8; ++i) {
-		float3 p = bb[i];
-		float4 b = make_float4(p.x, p.y, p.z, 1.0f);
-		float4 c = make_float4(dot(worldtondc.x, b),
-		                       dot(worldtondc.y, b),
-		                       dot(worldtondc.z, b),
-		                       dot(worldtondc.w, b));
-		p = float4_to_float3(c / c.w);
-		if(c.z < 0.0f) {
-			p.x = 1.0f - p.x;
-			p.y = 1.0f - p.y;
-		}
-		if(c.z >= -camera_cull_margin_) {
-			all_behind = false;
-		}
-		bb_min = min(bb_min, p);
-		bb_max = max(bb_max, p);
-	}
-	if(all_behind) {
-		return true;
-	}
-	return (bb_min.x >= 1.0f + camera_cull_margin_ ||
-	        bb_min.y >= 1.0f + camera_cull_margin_ ||
-	        bb_max.x <= -camera_cull_margin_ ||
-	        bb_max.y <= -camera_cull_margin_);
-}
-
-bool BlenderObjectCulling::test_distance(Scene *scene, float3 bb[8])
-{
-	float3 camera_position = transform_get_column(&scene->camera->matrix, 3);
-	float3 bb_min = make_float3(FLT_MAX, FLT_MAX, FLT_MAX),
-	       bb_max = make_float3(-FLT_MAX, -FLT_MAX, -FLT_MAX);
-
-	/* Find min & max points for x & y & z on bounding box */
-	for(int i = 0; i < 8; ++i) {
-		float3 p = bb[i];
-		bb_min = min(bb_min, p);
-		bb_max = max(bb_max, p);
-	}
-
-	float3 closest_point = max(min(bb_max,camera_position),bb_min);
-	return (len_squared(camera_position - closest_point) >
-	        distance_cull_margin_ * distance_cull_margin_);
-}
-
-CCL_NAMESPACE_END
-
--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -1,49 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __BLENDER_OBJECT_CULL_H__
-#define __BLENDER_OBJECT_CULL_H__
-
-#include "blender_sync.h"
-#include "util_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Scene;
-
-class BlenderObjectCulling
-{
-public:
-	BlenderObjectCulling(Scene *scene, BL::Scene& b_scene);
-
-	void init_object(Scene *scene, BL::Object& b_ob);
-	bool test(Scene *scene, BL::Object& b_ob, Transform& tfm);
-
-private:
-	bool test_camera(Scene *scene, float3 bb[8]);
-	bool test_distance(Scene *scene, float3 bb[8]);
-
-	bool use_scene_camera_cull_;
-	bool use_camera_cull_;
-	float camera_cull_margin_;
-	bool use_scene_distance_cull_;
-	bool use_distance_cull_;
-	float distance_cull_margin_;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __BLENDER_OBJECT_CULL_H__ */
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -67,10 +67,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
 	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
-	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
-	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
@@ -106,7 +104,6 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
-	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
 }
@@ -644,7 +641,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
 	Py_RETURN_NONE;
 }

-static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
+static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 {
 	int num_resumable_chunks, current_resumable_chunk;
 	if(!PyArg_ParseTuple(args, "ii",
@@ -679,53 +676,6 @@ static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }

-static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
-{
-	int num_chunks, start_chunk, end_chunk;
-	if(!PyArg_ParseTuple(args, "iii",
-	                     &num_chunks,
-	                     &start_chunk,
-	                     &end_chunk)) {
-		Py_RETURN_NONE;
-	}
-
-	if(num_chunks <= 0) {
-		fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
-		abort();
-		Py_RETURN_NONE;
-	}
-	if(start_chunk < 1 || start_chunk > num_chunks) {
-		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
-		abort();
-		Py_RETURN_NONE;
-	}
-	if(end_chunk < 1 || end_chunk > num_chunks) {
-		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
-		abort();
-		Py_RETURN_NONE;
-	}
-	if(start_chunk > end_chunk) {
-		fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
-		abort();
-		Py_RETURN_NONE;
-	}
-
-	VLOG(1) << "Initialized resumable render: "
-	        << "num_resumable_chunks=" << num_chunks << ", "
-	        << "start_resumable_chunk=" << start_chunk
-	        << "end_resumable_chunk=" << end_chunk;
-	BlenderSession::num_resumable_chunks = num_chunks;
-	BlenderSession::start_resumable_chunk = start_chunk;
-	BlenderSession::end_resumable_chunk = end_chunk;
-
-	printf("Cycles: Will render chunks %d to %d of %d\n",
-	       start_chunk,
-	       end_chunk,
-	       num_chunks);
-
-	Py_RETURN_NONE;
-}
-
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -765,8 +715,7 @@ static PyMethodDef methods[] = {
 	{"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""},

 	/* Resumable render */
-	{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
-	{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
+	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},

 	/* Compute Device selection */
 	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -46,8 +46,6 @@ CCL_NAMESPACE_BEGIN
 bool BlenderSession::headless = false;
 int BlenderSession::num_resumable_chunks = 0;
 int BlenderSession::current_resumable_chunk = 0;
-int BlenderSession::start_resumable_chunk = 0;
-int BlenderSession::end_resumable_chunk = 0;

 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
                               BL::UserPreferences& b_userpref,
@@ -70,7 +68,6 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = true;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
-	last_status_time = 0.0;
 }

 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
@@ -96,7 +93,6 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = false;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
-	last_status_time = 0.0;
 }

 BlenderSession::~BlenderSession()
@@ -130,8 +126,8 @@ void BlenderSession::create_session()

 	/* setup callbacks for builtin image support */
 	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
-	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4);
-	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4);
+	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3);
+	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3);

 	/* create session */
 	session = new Session(session_params);
@@ -309,16 +305,12 @@ static PassType get_pass_type(BL::RenderPass& b_pass)
 #ifdef WITH_CYCLES_DEBUG
 		case BL::RenderPass::type_DEBUG:
 		{
-			switch(b_pass.debug_type()) {
-				case BL::RenderPass::debug_type_BVH_TRAVERSED_NODES:
-					return PASS_BVH_TRAVERSED_NODES;
-				case BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES:
-					return PASS_BVH_TRAVERSED_INSTANCES;
-				case BL::RenderPass::debug_type_BVH_INTERSECTIONS:
-					return PASS_BVH_INTERSECTIONS;
-				case BL::RenderPass::debug_type_RAY_BOUNCES:
-					return PASS_RAY_BOUNCES;
-			}
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSAL_STEPS)
+				return PASS_BVH_TRAVERSAL_STEPS;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_BVH_TRAVERSED_INSTANCES)
+				return PASS_BVH_TRAVERSED_INSTANCES;
+			if(b_pass.debug_type() == BL::RenderPass::debug_type_RAY_BOUNCES)
+				return PASS_RAY_BOUNCES;
 			break;
 		}
 #endif
@@ -588,7 +580,7 @@ static void populate_bake_data(BakeData *data, const
 	BL::BakePixel bp = pixel_array;

 	int i;
-	for(i = 0; i < num_pixels; i++) {
+	for(i=0; i < num_pixels; i++) {
 		if(bp.object_id() == object_id) {
 			data->set(i, bp.primitive_id(), bp.uv(), bp.du_dx(), bp.du_dy(), bp.dv_dx(), bp.dv_dy());
 		} else {
@@ -938,13 +930,38 @@ void BlenderSession::get_status(string& status, string& substatus)

 void BlenderSession::get_progress(float& progress, double& total_time, double& render_time)
 {
-	session->progress.get_time(total_time, render_time);
-	progress = session->progress.get_progress();
+	double tile_time;  /* filled by get_tile() below, not read afterwards */
+	int tile, sample, samples_per_tile;
+	int tile_total = session->tile_manager.state.num_tiles;
+	int samples = session->tile_manager.state.sample + 1;
+	int total_samples = session->tile_manager.get_num_effective_samples();  /* same getter as samples_per_tile below */
+
+	session->progress.get_tile(tile, total_time, render_time, tile_time);  /* writes total_time and render_time for the caller */
+
+	sample = session->progress.get_sample();
+	samples_per_tile = session->tile_manager.get_num_effective_samples();
+
+	if(background && samples_per_tile && tile_total)  /* background: sample count over tiles * samples per tile */
+		progress = ((float)sample / (float)(tile_total * samples_per_tile));
+	else if(!background && samples > 0 && total_samples != INT_MAX)  /* non-background: skipped when the sample count is INT_MAX */
+		progress = ((float)samples) / total_samples;
+	else
+		progress = 0.0;  /* no ratio can be formed */
 }

 void BlenderSession::update_bake_progress()
 {
-	float progress = session->progress.get_progress();
+	float progress;
+	int sample, samples_per_task, parts_total;
+
+	sample = session->progress.get_sample();
+	samples_per_task = scene->bake_manager->num_samples;
+	parts_total = scene->bake_manager->num_parts;
+
+	if(samples_per_task)
+		progress = ((float)sample / (float)(parts_total * samples_per_task));
+	else
+		progress = 0.0;

 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -993,14 +1010,10 @@ void BlenderSession::update_status_progress()
 	if(substatus.size() > 0)
 		status += " | " + substatus;

-	double current_time = time_dt();
-	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
-	 * For headless rendering, only report when something significant changes to keep the console output readable. */
-	if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
+	if(status != last_status) {
 		b_engine.update_stats("", (timestatus + scene + status).c_str());
 		b_engine.update_memory_stats(mem_used, mem_peak);
 		last_status = status;
-		last_status_time = current_time;
 	}
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1067,13 +1080,7 @@ int BlenderSession::builtin_image_frame(const string &builtin_name)
 	return atoi(builtin_name.substr(last + 1, builtin_name.size() - last - 1).c_str());
 }

-void BlenderSession::builtin_image_info(const string &builtin_name,
-                                        void *builtin_data,
-                                        bool &is_float,
-                                        int &width,
-                                        int &height,
-                                        int &depth,
-                                        int &channels)
+void BlenderSession::builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels)
 {
 	/* empty image */
 	is_float = false;
@@ -1151,67 +1158,60 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 	}
 }

-bool BlenderSession::builtin_image_pixels(const string &builtin_name,
-                                          void *builtin_data,
-                                          unsigned char *pixels,
-                                          const size_t pixels_size)
+bool BlenderSession::builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels)  /* fills `pixels`; false only when builtin_data is NULL */
 {
-	if(!builtin_data) {
+	if(!builtin_data)
 		return false;
-	}

-	const int frame = builtin_image_frame(builtin_name);
+	int frame = builtin_image_frame(builtin_name);  /* frame number parsed from the builtin name */

 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
 	BL::Image b_image(ptr);

-	const int width = b_image.size()[0];
-	const int height = b_image.size()[1];
-	const int channels = b_image.channels();
+	int width = b_image.size()[0];
+	int height = b_image.size()[1];
+	int channels = b_image.channels();

-	unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame);
-	const size_t num_pixels = ((size_t)width) * height;
+	unsigned char *image_pixels;
+	image_pixels = image_get_pixels_for_frame(b_image, frame);  /* released with MEM_freeN() after the copy */
+	size_t num_pixels = ((size_t)width) * height;  /* widened to size_t before multiplying */

-	if(image_pixels && num_pixels * channels == pixels_size) {
-		memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
+	if(image_pixels) {
+		memcpy(pixels, image_pixels, num_pixels * channels * sizeof(unsigned char));  /* NOTE(review): no check against the size of `pixels` (the pixels_size guard is dropped here) - confirm caller sizing */
 		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
-			memset(pixels, 0, pixels_size * sizeof(unsigned char));
+			memset(pixels, 0, num_pixels * sizeof(unsigned char));  /* single channel: zero fill */
 		}
 		else {
-			const size_t num_pixels_safe = pixels_size / channels;
 			unsigned char *cp = pixels;
-			for(size_t i = 0; i < num_pixels_safe; i++, cp += channels) {
+			for(size_t i = 0; i < num_pixels; i++, cp += channels) {  /* fallback fill: 255, 0, 255 per pixel */
 				cp[0] = 255;
 				cp[1] = 0;
 				cp[2] = 255;
-				if(channels == 4) {
+				if(channels == 4)  /* fourth channel only written for 4-channel images */
 					cp[3] = 255;
-				}
 			}
 		}
 	}
-	/* Premultiply, byte images are always straight for Blender. */
+
+	/* premultiply, byte images are always straight for blender. NOTE(review): the loop below reads cp[3] for every pixel whatever `channels` is - confirm only 4-channel buffers reach it. */
 	unsigned char *cp = pixels;
 	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
 		cp[0] = (cp[0] * cp[3]) >> 8;
 		cp[1] = (cp[1] * cp[3]) >> 8;
 		cp[2] = (cp[2] * cp[3]) >> 8;
 	}
+
 	return true;
 }

-bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
-                                                void *builtin_data,
-                                                float *pixels,
-                                                const size_t pixels_size)
+bool BlenderSession::builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels)
 {
-	if(!builtin_data) {
+	if(!builtin_data)
 		return false;
-	}

 	PointerRNA ptr;
 	RNA_id_pointer_create((ID*)builtin_data, &ptr);
@@ -1222,16 +1222,16 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 		BL::Image b_image(b_id);
 		int frame = builtin_image_frame(builtin_name);

-		const int width = b_image.size()[0];
-		const int height = b_image.size()[1];
-		const int channels = b_image.channels();
+		int width = b_image.size()[0];
+		int height = b_image.size()[1];
+		int channels = b_image.channels();

 		float *image_pixels;
 		image_pixels = image_get_float_pixels_for_frame(b_image, frame);
-		const size_t num_pixels = ((size_t)width) * height;
+		size_t num_pixels = ((size_t)width) * height;

-		if(image_pixels && num_pixels * channels == pixels_size) {
-			memcpy(pixels, image_pixels, pixels_size * sizeof(float));
+		if(image_pixels) {
+			memcpy(pixels, image_pixels, num_pixels * channels * sizeof(float));
 			MEM_freeN(image_pixels);
 		}
 		else {
@@ -1239,15 +1239,13 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 				memset(pixels, 0, num_pixels * sizeof(float));
 			}
 			else {
-				const size_t num_pixels_safe = pixels_size / channels;
 				float *fp = pixels;
-				for(int i = 0; i < num_pixels_safe; i++, fp += channels) {
+				for(int i = 0; i < num_pixels; i++, fp += channels) {
 					fp[0] = 1.0f;
 					fp[1] = 0.0f;
 					fp[2] = 1.0f;
-					if(channels == 4) {
+					if(channels == 4)
 						fp[3] = 1.0f;
-					}
 				}
 			}
 		}
@@ -1259,9 +1257,8 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 		BL::Object b_ob(b_id);
 		BL::SmokeDomainSettings b_domain = object_smoke_domain_find(b_ob);

-		if(!b_domain) {
+		if(!b_domain)
 			return false;
-		}

 		int3 resolution = get_int3(b_domain.domain_resolution());
 		int length, amplify = (b_domain.use_high_resolution())? b_domain.amplify() + 1: 1;
@@ -1273,10 +1270,10 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 			amplify = 1;
 		}

-		const int width = resolution.x * amplify;
-		const int height = resolution.y * amplify;
-		const int depth = resolution.z * amplify;
-		const size_t num_pixels = ((size_t)width) * height * depth;
+		int width = resolution.x * amplify;
+		int height = resolution.y * amplify;
+		int depth = resolution.z * amplify;
+		size_t num_pixels = ((size_t)width) * height * depth;

 		if(builtin_name == Attribute::standard_name(ATTR_STD_VOLUME_DENSITY)) {
 			SmokeDomainSettings_density_grid_get_length(&b_domain.ptr, &length);
@@ -1350,21 +1347,9 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 		return;
 	}

-	const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
-
-	int range_start_sample, range_num_samples;
-	if(current_resumable_chunk != 0) {
-		/* Single chunk rendering. */
-		range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
-		range_num_samples = num_samples_per_chunk;
-	}
-	else {
-		/* Ranged-chunks. */
-		const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
-		range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
-		range_num_samples = num_chunks * num_samples_per_chunk;
-	}
-	/* Make sure we don't overshoot. */
+	int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
+	int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
+	int range_num_samples = num_samples_per_chunk;
 	if(range_start_sample + range_num_samples > num_samples) {
 		range_num_samples = num_samples - range_num_samples;
 	}
@@ -1372,9 +1357,6 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 	VLOG(1) << "Samples range start is " << range_start_sample << ", "
 	        << "number of samples to render is " << range_num_samples;

-	scene->integrator->start_sample = range_start_sample;
-	scene->integrator->tag_update(scene);
-
 	session->tile_manager.range_start_sample = range_start_sample;
 	session->tile_manager.range_num_samples = range_num_samples;
 }
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -113,7 +113,6 @@ public:
 	string last_status;
 	string last_error;
 	float last_progress;
-	double last_status_time;

 	int width, height;
 	double start_resize_time;
@@ -138,10 +137,6 @@ public:
 	/* Current resumable chunk index to render. */
 	static int current_resumable_chunk;

-	/* Alternative to single-chunk rendering to render a range of chunks. */
-	static int start_resumable_chunk;
-	static int end_resumable_chunk;
-
 protected:
 	void do_write_update_render_result(BL::RenderResult& b_rr,
 	                                   BL::RenderLayer& b_rlay,
@@ -150,21 +145,9 @@ protected:
 	void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);

 	int builtin_image_frame(const string &builtin_name);
-	void builtin_image_info(const string &builtin_name,
-	                        void *builtin_data,
-	                        bool &is_float,
-	                        int &width,
-	                        int &height,
-	                        int &depth,
-	                        int &channels);
-	bool builtin_image_pixels(const string &builtin_name,
-	                          void *builtin_data,
-	                          unsigned char *pixels,
-	                          const size_t pixels_size);
-	bool builtin_image_float_pixels(const string &builtin_name,
-	                                void *builtin_data,
-	                                float *pixels,
-	                                const size_t pixels_size);
+	void builtin_image_info(const string &builtin_name, void *builtin_data, bool &is_float, int &width, int &height, int &depth, int &channels);
+	bool builtin_image_pixels(const string &builtin_name, void *builtin_data, unsigned char *pixels);
+	bool builtin_image_float_pixels(const string &builtin_name, void *builtin_data, float *pixels);

 	/* Update tile manager to reflect resumable render settings. */
 	void update_resumable_tile_manager(int num_samples);
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -609,8 +609,7 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  (b_engine.is_preview() &&
-			                   b_image.source() != BL::Image::source_SEQUENCE);
+			                  b_engine.is_preview();

 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -641,8 +640,7 @@ static ShaderNode *add_node(Scene *scene,
 				        image->filename.string(),
 				        image->builtin_data,
 				        get_image_interpolation(b_image_node),
-				        get_image_extension(b_image_node),
-				        image->use_alpha);
+				        get_image_extension(b_image_node));
 			}
 		}
 		image->color_space = (NodeImageColorSpace)b_image_node.color_space();
@@ -663,8 +661,7 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  (b_engine.is_preview() &&
-			                   b_image.source() != BL::Image::source_SEQUENCE);
+			                  b_engine.is_preview();

 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -689,8 +686,7 @@ static ShaderNode *add_node(Scene *scene,
 				        env->filename.string(),
 				        env->builtin_data,
 				        get_image_interpolation(b_env_node),
-				        EXTENSION_REPEAT,
-				        env->use_alpha);
+				        EXTENSION_REPEAT);
 			}
 		}
 		env->color_space = (NodeImageColorSpace)b_env_node.color_space();
@@ -827,8 +823,7 @@ static ShaderNode *add_node(Scene *scene,
 			        point_density->filename.string(),
 			        point_density->builtin_data,
 			        point_density->interpolation,
-			        EXTENSION_CLIP,
-			        true);
+			        EXTENSION_CLIP);
 		}
 		node = point_density;

--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -322,15 +322,6 @@ void BlenderSync::sync_integrator()
 		integrator->volume_samples = volume_samples;
 	}

-	if(b_scene.render().use_simplify()) {
-		if(preview) {
-			integrator->ao_bounces = get_int(cscene, "ao_bounces");
-		}
-		else {
-			integrator->ao_bounces = get_int(cscene, "ao_bounces_render");
-		}
-	}
-
 	if(integrator->modified(previntegrator))
 		integrator->tag_update(scene);
 }
@@ -507,7 +498,6 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,

 	params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
 	params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
-	params.num_bvh_time_steps = RNA_int_get(&cscene, "debug_bvh_time_steps");

 	if(background && params.shadingsystem != SHADINGSYSTEM_OSL)
 		params.persistent_data = r.use_persistent_data();
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -19,7 +19,6 @@

 #include "mesh.h"

-#include "util_algorithm.h"
 #include "util_map.h"
 #include "util_path.h"
 #include "util_set.h"
@@ -49,12 +48,12 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
                                      bool apply_modifiers,
                                      bool render,
                                      bool calc_undeformed,
-                                      Mesh::SubdivisionType subdivision_type)
+                                      bool subdivision)
 {
 	bool subsurf_mod_show_render;
 	bool subsurf_mod_show_viewport;

-	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
+	if(subdivision) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];

 		subsurf_mod_show_render = subsurf_mod.show_render();
@@ -66,7 +65,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,

 	BL::Mesh me = data.meshes.new_from_object(scene, object, apply_modifiers, (render)? 2: 1, false, calc_undeformed);

-	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
+	if(subdivision) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];

 		subsurf_mod.show_render(subsurf_mod_show_render);
@@ -75,14 +74,9 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,

 	if((bool)me) {
 		if(me.use_auto_smooth()) {
-			if(subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK) {
-				me.calc_normals_split();
-			}
-			else {
-				me.split_faces(false);
-			}
+			me.calc_normals_split();
 		}
-		if(subdivision_type == Mesh::SUBDIVISION_NONE) {
+		if(!subdivision) {
 			me.calc_tessface(true);
 		}
 	}
@@ -787,35 +781,6 @@ struct ParticleSystemKey {
 	}
 };

-class EdgeMap {
-public:
-	EdgeMap() {
-	}
-
-	void clear() {
-		edges_.clear();
-	}
-
-	void insert(int v0, int v1) {
-		get_sorted_verts(v0, v1);
-		edges_.insert(std::pair<int, int>(v0, v1));
-	}
-
-	bool exists(int v0, int v1) {
-		get_sorted_verts(v0, v1);
-		return edges_.find(std::pair<int, int>(v0, v1)) != edges_.end();
-	}
-
-protected:
-	void get_sorted_verts(int& v0, int& v1) {
-		if(v0 > v1) {
-			swap(v0, v1);
-		}
-	}
-
-	set< std::pair<int, int> > edges_;
-};
-
 CCL_NAMESPACE_END

 #endif /* __BLENDER_UTIL_H__ */
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -81,7 +81,6 @@ void BVH::build(Progress& progress)
 	                   pack.prim_type,
 	                   pack.prim_index,
 	                   pack.prim_object,
-	                   pack.prim_time,
 	                   params,
 	                   progress);
 	BVHNode *root = bvh_build.run();
@@ -257,10 +256,6 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());

-	if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) {
-		pack.prim_time.resize(prim_index_size);
-	}
-
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
 	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
 	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
@@ -269,7 +264,6 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
 	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
-	float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL;

 	/* merge */
 	foreach(Object *ob, objects) {
@@ -315,7 +309,6 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 			int *bvh_prim_type = &bvh->pack.prim_type[0];
 			uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0];
 			uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0];
-			float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL;

 			for(size_t i = 0; i < bvh_prim_index_size; i++) {
 				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
@@ -331,9 +324,6 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
 				pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
 				pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
-				if(bvh_prim_time != NULL) {
-					pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i];
-				}
 				pack_prim_index_offset++;
 			}
 		}
@@ -855,8 +845,6 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 	                  bounds,
 	                  child,
 	                  e.node->m_visibility,
-	                  e.node->m_time_from,
-	                  e.node->m_time_to,
 	                  num);
 }

@@ -864,17 +852,12 @@ void QBVH::pack_aligned_node(int idx,
                             const BoundBox *bounds,
                             const int *child,
                             const uint visibility,
-                             const float time_from,
-                             const float time_to,
                             const int num)
 {
 	float4 data[BVH_QNODE_SIZE];
 	memset(data, 0, sizeof(data));

 	data[0].x = __uint_as_float(visibility & ~PATH_RAY_NODE_UNALIGNED);
-	data[0].y = time_from;
-	data[0].z = time_to;
-
 	for(int i = 0; i < num; i++) {
 		float3 bb_min = bounds[i].min;
 		float3 bb_max = bounds[i].max;
@@ -925,8 +908,6 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 	                    bounds,
 	                    child,
 	                    e.node->m_visibility,
-	                    e.node->m_time_from,
-	                    e.node->m_time_to,
 	                    num);
 }

@@ -935,16 +916,12 @@ void QBVH::pack_unaligned_node(int idx,
                               const BoundBox *bounds,
                               const int *child,
                               const uint visibility,
-                               const float time_from,
-                               const float time_to,
                               const int num)
 {
 	float4 data[BVH_UNALIGNED_QNODE_SIZE];
 	memset(data, 0, sizeof(data));

 	data[0].x = __uint_as_float(visibility | PATH_RAY_NODE_UNALIGNED);
-	data[0].y = time_from;
-	data[0].z = time_to;

 	for(int i = 0; i < num; i++) {
 		Transform space = BVHUnaligned::compute_node_transform(
@@ -1230,8 +1207,6 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 			                    child_bbox,
 			                    &c[0],
 			                    visibility,
-			                    0.0f,
-			                    1.0f,
 			                    4);
 		}
 		else {
@@ -1239,8 +1214,6 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 			                  child_bbox,
 			                  &c[0],
 			                  visibility,
-			                  0.0f,
-			                  1.0f,
 			                  4);
 		}
 	}
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -68,8 +68,6 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
-	/* Time range of BVH primitive. */
-	array<float2> prim_time;

 	/* index of the root node. */
 	int root_index;
@@ -177,8 +175,6 @@ protected:
 	                       const BoundBox *bounds,
 	                       const int *child,
 	                       const uint visibility,
-	                       const float time_from,
-	                       const float time_to,
 	                       const int num);

 	void pack_unaligned_inner(const BVHStackEntry& e,
@@ -189,8 +185,6 @@ protected:
 	                         const BoundBox *bounds,
 	                         const int *child,
 	                         const uint visibility,
-	                         const float time_from,
-	                         const float time_to,
 	                         const int num);

 	/* refit */
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -26,7 +26,6 @@
 #include "scene.h"
 #include "curves.h"

-#include "util_algorithm.h"
 #include "util_debug.h"
 #include "util_foreach.h"
 #include "util_logging.h"
@@ -93,14 +92,12 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_,
                   array<int>& prim_type_,
                   array<int>& prim_index_,
                   array<int>& prim_object_,
-                   array<float2>& prim_time_,
                   const BVHParams& params_,
                   Progress& progress_)
 : objects(objects_),
   prim_type(prim_type_),
   prim_index(prim_index_),
   prim_object(prim_object_),
-   prim_time(prim_time_),
   params(params_),
   progress(progress_),
   progress_start_time(0.0),
@@ -115,237 +112,81 @@ BVHBuild::~BVHBuild()

 /* Adding References */

-void BVHBuild::add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
+void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
 {
-	const Attribute *attr_mP = NULL;
-	if(mesh->has_motion_blur()) {
-		attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-	}
-	const size_t num_triangles = mesh->num_triangles();
-	for(uint j = 0; j < num_triangles; j++) {
-		Mesh::Triangle t = mesh->get_triangle(j);
-		const float3 *verts = &mesh->verts[0];
-		if(attr_mP == NULL) {
-			BoundBox bounds = BoundBox::empty;
-			t.bounds_grow(verts, bounds);
-			if(bounds.valid()) {
-				references.push_back(BVHReference(bounds,
-				                                  j,
-				                                  i,
-				                                  PRIMITIVE_TRIANGLE));
-				root.grow(bounds);
-				center.grow(bounds.center2());
-			}
-		}
-		else if(params.num_motion_triangle_steps == 0 || params.use_spatial_split) {
-			/* Motion triangles, simple case: single node for the whole
-			 * primitive. Lowest memory footprint and faster BVH build but
-			 * least optimal ray-tracing.
-			 */
-			/* TODO(sergey): Support motion steps for spatially split BVH. */
-			const size_t num_verts = mesh->verts.size();
-			const size_t num_steps = mesh->motion_steps;
-			const float3 *vert_steps = attr_mP->data_float3();
-			BoundBox bounds = BoundBox::empty;
-			t.bounds_grow(verts, bounds);
-			for(size_t step = 0; step < num_steps - 1; step++) {
-				t.bounds_grow(vert_steps + step*num_verts, bounds);
-			}
-			if(bounds.valid()) {
-				references.push_back(
-				        BVHReference(bounds,
-				                     j,
-				                     i,
-				                     PRIMITIVE_MOTION_TRIANGLE));
-				root.grow(bounds);
-				center.grow(bounds.center2());
-			}
-		}
-		else {
-			/* Motion triangles, trace optimized case:  we split triangle
-			 * primitives into separate nodes for each of the time steps.
-			 * This way we minimize overlap of neighbor curve primitives.
-			 */
-			const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
-			const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
-			const size_t num_verts = mesh->verts.size();
-			const size_t num_steps = mesh->motion_steps;
-			const float3 *vert_steps = attr_mP->data_float3();
-			/* Calculate bounding box of the previous time step.
-			 * Will be reused later to avoid duplicated work on
-			 * calculating BVH time step boundbox.
-			 */
-			float3 prev_verts[3];
-			t.motion_verts(verts,
-			               vert_steps,
-			               num_verts,
-			               num_steps,
-			               0.0f,
-			               prev_verts);
-			BoundBox prev_bounds = BoundBox::empty;
-			prev_bounds.grow(prev_verts[0]);
-			prev_bounds.grow(prev_verts[1]);
-			prev_bounds.grow(prev_verts[2]);
-			/* Create all primitive time steps, */
-			for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
-				const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
-				float3 curr_verts[3];
-				t.motion_verts(verts,
-				               vert_steps,
-				               num_verts,
-				               num_steps,
-				               curr_time,
-				               curr_verts);
-				BoundBox curr_bounds = BoundBox::empty;
-				curr_bounds.grow(curr_verts[0]);
-				curr_bounds.grow(curr_verts[1]);
-				curr_bounds.grow(curr_verts[2]);
-				BoundBox bounds = prev_bounds;
-				bounds.grow(curr_bounds);
-				if(bounds.valid()) {
-					const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
-					references.push_back(
-					        BVHReference(bounds,
-					                     j,
-					                     i,
-					                     PRIMITIVE_MOTION_TRIANGLE,
-					                     prev_time,
-					                     curr_time));
-					root.grow(bounds);
-					center.grow(bounds.center2());
-				}
-				/* Current time boundbox becomes previous one for the
-				 * next time step.
-				 */
-				prev_bounds = curr_bounds;
-			}
-		}
-	}
-}
+	if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
+		Attribute *attr_mP = NULL;

-void BVHBuild::add_reference_curves(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
-{
-	const Attribute *curve_attr_mP = NULL;
-	if(mesh->has_motion_blur()) {
-		curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+		if(mesh->has_motion_blur())
+			attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+		size_t num_triangles = mesh->num_triangles();
+		for(uint j = 0; j < num_triangles; j++) {
+			Mesh::Triangle t = mesh->get_triangle(j);
+			BoundBox bounds = BoundBox::empty;
+			PrimitiveType type = PRIMITIVE_TRIANGLE;
+
+			t.bounds_grow(&mesh->verts[0], bounds);
+
+			/* motion triangles */
+			if(attr_mP) {
+				size_t mesh_size = mesh->verts.size();
+				size_t steps = mesh->motion_steps - 1;
+				float3 *vert_steps = attr_mP->data_float3();
+
+				for(size_t i = 0; i < steps; i++)
+					t.bounds_grow(vert_steps + i*mesh_size, bounds);
+
+				type = PRIMITIVE_MOTION_TRIANGLE;
+			}
+
+			if(bounds.valid()) {
+				references.push_back(BVHReference(bounds, j, i, type));
+				root.grow(bounds);
+				center.grow(bounds.center2());
+			}
+		}
 	}
-	const size_t num_curves = mesh->num_curves();
-	for(uint j = 0; j < num_curves; j++) {
-		const Mesh::Curve curve = mesh->get_curve(j);
-		const float *curve_radius = &mesh->curve_radius[0];
-		for(int k = 0; k < curve.num_keys - 1; k++) {
-			if(curve_attr_mP == NULL) {
-				/* Really simple logic for static hair. */
+
+	if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
+		Attribute *curve_attr_mP = NULL;
+
+		if(mesh->has_motion_blur())
+			curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+		size_t num_curves = mesh->num_curves();
+		for(uint j = 0; j < num_curves; j++) {
+			Mesh::Curve curve = mesh->get_curve(j);
+			PrimitiveType type = PRIMITIVE_CURVE;
+
+			for(int k = 0; k < curve.num_keys - 1; k++) {
 				BoundBox bounds = BoundBox::empty;
-				curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
+				curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds);
+
+				/* motion curve */
+				if(curve_attr_mP) {
+					size_t mesh_size = mesh->curve_keys.size();
+					size_t steps = mesh->motion_steps - 1;
+					float3 *key_steps = curve_attr_mP->data_float3();
+
+					for(size_t i = 0; i < steps; i++)
+						curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds);
+
+					type = PRIMITIVE_MOTION_CURVE;
+				}
+
 				if(bounds.valid()) {
-					int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE, k);
+					int packed_type = PRIMITIVE_PACK_SEGMENT(type, k);
+
 					references.push_back(BVHReference(bounds, j, i, packed_type));
 					root.grow(bounds);
 					center.grow(bounds.center2());
 				}
 			}
-			else if(params.num_motion_curve_steps == 0 || params.use_spatial_split) {
-				/* Simple case of motion curves: single node for the while
-				 * shutter time. Lowest memory usage but less optimal
-				 * rendering.
-				 */
-				/* TODO(sergey): Support motion steps for spatially split BVH. */
-				BoundBox bounds = BoundBox::empty;
-				curve.bounds_grow(k, &mesh->curve_keys[0], curve_radius, bounds);
-				const size_t num_keys = mesh->curve_keys.size();
-				const size_t num_steps = mesh->motion_steps;
-				const float3 *key_steps = curve_attr_mP->data_float3();
-				for(size_t step = 0; step < num_steps - 1; step++) {
-					curve.bounds_grow(k,
-					                  key_steps + step*num_keys,
-					                  curve_radius,
-					                  bounds);
-				}
-				if(bounds.valid()) {
-					int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
-					references.push_back(BVHReference(bounds,
-					                                  j,
-					                                  i,
-					                                  packed_type));
-					root.grow(bounds);
-					center.grow(bounds.center2());
-				}
-			}
-			else {
-				/* Motion curves, trace optimized case:  we split curve keys
-				 * primitives into separate nodes for each of the time steps.
-				 * This way we minimize overlap of neighbor curve primitives.
-				 */
-				const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
-				const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
-				const size_t num_steps = mesh->motion_steps;
-				const float3 *curve_keys = &mesh->curve_keys[0];
-				const float3 *key_steps = curve_attr_mP->data_float3();
-				const size_t num_keys = mesh->curve_keys.size();
-				/* Calculate bounding box of the previous time step.
-				 * Will be reused later to avoid duplicated work on
-				 * calculating BVH time step boundbox.
-				 */
-				float4 prev_keys[4];
-				curve.cardinal_motion_keys(curve_keys,
-				                           curve_radius,
-				                           key_steps,
-				                           num_keys,
-				                           num_steps,
-				                           0.0f,
-				                           k - 1, k, k + 1, k + 2,
-				                           prev_keys);
-				BoundBox prev_bounds = BoundBox::empty;
-				curve.bounds_grow(prev_keys, prev_bounds);
-				/* Create all primitive time steps, */
-				for(int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
-					const float curr_time = (float)(bvh_step) * num_bvh_steps_inv_1;
-					float4 curr_keys[4];
-					curve.cardinal_motion_keys(curve_keys,
-					                           curve_radius,
-					                           key_steps,
-					                           num_keys,
-					                           num_steps,
-					                           curr_time,
-					                           k - 1, k, k + 1, k + 2,
-					                           curr_keys);
-					BoundBox curr_bounds = BoundBox::empty;
-					curve.bounds_grow(curr_keys, curr_bounds);
-					BoundBox bounds = prev_bounds;
-					bounds.grow(curr_bounds);
-					if(bounds.valid()) {
-						const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
-						int packed_type = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_MOTION_CURVE, k);
-						references.push_back(BVHReference(bounds,
-						                                  j,
-						                                  i,
-						                                  packed_type,
-						                                  prev_time,
-						                                  curr_time));
-						root.grow(bounds);
-						center.grow(bounds.center2());
-					}
-					/* Current time boundbox becomes previous one for the
-					 * next time step.
-					 */
-					prev_bounds = curr_bounds;
-				}
-			}
 		}
 	}
 }

-void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i)
-{
-	if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
-		add_reference_triangles(root, center, mesh, i);
-	}
-	if(params.primitive_mask & PRIMITIVE_ALL_CURVE) {
-		add_reference_curves(root, center, mesh, i);
-	}
-}
-
 void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i)
 {
 	references.push_back(BVHReference(ob->bounds, -1, i, 0));
@@ -359,7 +200,7 @@ static size_t count_curve_segments(Mesh *mesh)

 	for(size_t i = 0; i < num_curves; i++)
 		num += mesh->get_curve(i).num_keys - 1;
-
+	
 	return num;
 }

@@ -467,9 +308,6 @@ BVHNode* BVHBuild::run()
 	}
 	spatial_free_index = 0;

-	need_prim_time = params.num_motion_curve_steps > 0 ||
-	                 params.num_motion_triangle_steps > 0;
-
 	/* init progress updates */
 	double build_start_time;
 	build_start_time = progress_start_time = time_dt();
@@ -480,12 +318,6 @@ BVHNode* BVHBuild::run()
 	prim_type.resize(references.size());
 	prim_index.resize(references.size());
 	prim_object.resize(references.size());
-	if(need_prim_time) {
-		prim_time.resize(references.size());
-	}
-	else {
-		prim_time.resize(0);
-	}

 	/* build recursively */
 	BVHNode *rootnode;
@@ -512,7 +344,6 @@ BVHNode* BVHBuild::run()
 		else {
 			/*rotate(rootnode, 4, 5);*/
 			rootnode->update_visibility();
-			rootnode->update_time();
 		}
 		if(rootnode != NULL) {
 			VLOG(1) << "BVH build statistics:\n"
@@ -540,7 +371,7 @@ void BVHBuild::progress_update()
 {
 	if(time_dt() - progress_start_time < 0.25)
 		return;
-
+	
 	double progress_start = (double)progress_count/(double)progress_total;
 	double duplicates = (double)(progress_total - progress_original_total)/(double)progress_total;

@@ -548,7 +379,7 @@ void BVHBuild::progress_update()
 	                           progress_start * 100.0, duplicates * 100.0);

 	progress.set_substatus(msg);
-	progress_start_time = time_dt();
+	progress_start_time = time_dt(); 
 }

 void BVHBuild::thread_build_node(InnerNode *inner,
@@ -604,7 +435,6 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange& range,
 		return false;

 	size_t num_triangles = 0;
-	size_t num_motion_triangles = 0;
 	size_t num_curves = 0;
 	size_t num_motion_curves = 0;

@@ -615,16 +445,13 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange& range,
 			num_curves++;
 		if(ref.prim_type() & PRIMITIVE_MOTION_CURVE)
 			num_motion_curves++;
-		else if(ref.prim_type() & PRIMITIVE_TRIANGLE)
+		else if(ref.prim_type() & PRIMITIVE_ALL_TRIANGLE)
 			num_triangles++;
-		else if(ref.prim_type() & PRIMITIVE_MOTION_TRIANGLE)
-			num_motion_triangles++;
 	}

-	return (num_triangles <= params.max_triangle_leaf_size) &&
-	       (num_motion_triangles <= params.max_motion_triangle_leaf_size) &&
-	       (num_curves <= params.max_curve_leaf_size) &&
-	       (num_motion_curves <= params.max_motion_curve_leaf_size);
+	return (num_triangles < params.max_triangle_leaf_size) &&
+	       (num_curves < params.max_curve_leaf_size) &&
+	       (num_motion_curves < params.max_curve_leaf_size);
 }

 /* multithreaded binning builder */
@@ -860,29 +687,20 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		prim_type[start] = ref->prim_type();
 		prim_index[start] = ref->prim_index();
 		prim_object[start] = ref->prim_object();
-		if(need_prim_time) {
-			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
-		}

 		uint visibility = objects[ref->prim_object()]->visibility;
-		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
-		leaf_node->m_time_from = ref->time_from();
-		leaf_node->m_time_to = ref->time_to();
-		return leaf_node;
+		return new LeafNode(ref->bounds(), visibility, start, start+1);
 	}
 	else {
 		int mid = num/2;
-		BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid);
-		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
+		BVHNode *leaf0 = create_object_leaf_nodes(ref, start, mid); 
+		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid); 

 		BoundBox bounds = BoundBox::empty;
 		bounds.grow(leaf0->m_bounds);
 		bounds.grow(leaf1->m_bounds);

-		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
-		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
-		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
-		return inner_node;
+		return new InnerNode(bounds, leaf0, leaf1);
 	}
 }

@@ -905,13 +723,11 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 *    can not control.
 	 */
 	typedef StackAllocator<256, int> LeafStackAllocator;
-	typedef StackAllocator<256, float2> LeafTimeStackAllocator;
 	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;

 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
-	vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
 	vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];

 	/* TODO(sergey): In theory we should be able to store references. */
@@ -934,8 +750,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			p_type[type_index].push_back(ref.prim_type());
 			p_index[type_index].push_back(ref.prim_index());
 			p_object[type_index].push_back(ref.prim_object());
-			p_time[type_index].push_back(make_float2(ref.time_from(),
-			                                         ref.time_to()));

 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
@@ -965,13 +779,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
-	vector<float2, LeafTimeStackAllocator> local_prim_time;
 	local_prim_type.resize(num_new_prims);
 	local_prim_index.resize(num_new_prims);
 	local_prim_object.resize(num_new_prims);
-	if(need_prim_time) {
-		local_prim_time.resize(num_new_prims);
-	}
 	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
 		int num = (int)p_type[i].size();
 		if(num != 0) {
@@ -984,9 +794,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				local_prim_type[index] = p_type[i][j];
 				local_prim_index[index] = p_index[i][j];
 				local_prim_object[index] = p_object[i][j];
-				if(need_prim_time) {
-					local_prim_time[index] = p_time[i][j];
-				}
 				if(params.use_unaligned_nodes && !alignment_found) {
 					alignment_found =
 						unaligned_heuristic.compute_aligned_space(p_ref[i][j],
@@ -997,16 +804,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			                                   visibility[i],
 			                                   start_index,
 			                                   start_index + num);
-			if(true) {
-				float time_from = 1.0f, time_to = 0.0f;
-				for(int j = 0; j < num; ++j) {
-					const BVHReference &ref = p_ref[i][j];
-					time_from = min(time_from, ref.time_from());
-					time_to = max(time_to, ref.time_to());
-				}
-				leaf_node->m_time_from = time_from;
-				leaf_node->m_time_to = time_to;
-			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
 				leaf_node->m_bounds = BoundBox::empty;
@@ -1053,17 +850,11 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_type.reserve(reserve);
 				prim_index.reserve(reserve);
 				prim_object.reserve(reserve);
-				if(need_prim_time) {
-					prim_time.reserve(reserve);
-				}
 			}

 			prim_type.resize(range_end);
 			prim_index.resize(range_end);
 			prim_object.resize(range_end);
-			if(need_prim_time) {
-				prim_time.resize(range_end);
-			}
 		}
 		spatial_spin_lock.unlock();

@@ -1072,9 +863,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
-			if(need_prim_time) {
-				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
-			}
 		}
 	}
 	else {
@@ -1087,9 +875,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
-			if(need_prim_time) {
-				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
-			}
 		}
 	}

@@ -1133,7 +918,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
-		/* Should be doing more branches if more primitive types added. */
+		/* Shpuld be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
 		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
 		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
@@ -1166,7 +951,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	/* nothing to rotate if we reached a leaf node. */
 	if(node->is_leaf() || max_depth < 0)
 		return;
-
+	
 	InnerNode *parent = (InnerNode*)node;

 	/* rotate all children first */
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -48,7 +48,6 @@ public:
 	         array<int>& prim_type,
 	         array<int>& prim_index,
 	         array<int>& prim_object,
-	         array<float2>& prim_time,
 	         const BVHParams& params,
 	         Progress& progress);
 	~BVHBuild();
@@ -64,8 +63,6 @@ protected:
 	friend class BVHObjectBinning;

 	/* Adding references. */
-	void add_reference_triangles(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
-	void add_reference_curves(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
 	void add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh, int i);
 	void add_reference_object(BoundBox& root, BoundBox& center, Object *ob, int i);
 	void add_references(BVHRange& root);
@@ -113,9 +110,6 @@ protected:
 	array<int>& prim_type;
 	array<int>& prim_index;
 	array<int>& prim_object;
-	array<float2>& prim_time;
-
-	bool need_prim_time;

 	/* Build parameters. */
 	BVHParams params;
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -176,19 +176,6 @@ uint BVHNode::update_visibility()
 	return m_visibility;
 }

-void BVHNode::update_time()
-{
-	if(!is_leaf()) {
-		InnerNode *inner = (InnerNode*)this;
-		BVHNode *child0 = inner->children[0];
-		BVHNode *child1 = inner->children[1];
-		child0->update_time();
-		child1->update_time();
-		m_time_from = min(child0->m_time_from, child1->m_time_from);
-		m_time_to =  max(child0->m_time_to, child1->m_time_to);
-	}
-}
-
 /* Inner Node */

 void InnerNode::print(int depth) const
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -47,9 +47,7 @@ class BVHNode
 {
 public:
 	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL),
-	            m_time_from(0.0f),
-	            m_time_to(1.0f)
+	            m_aligned_space(NULL)
 	{
 	}

@@ -93,15 +91,12 @@ public:
 	void deleteSubtree();

 	uint update_visibility();
-	void update_time();

 	bool m_is_unaligned;

 	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
 	// utilities and type defines in util_transform first.
 	Transform *m_aligned_space;
-
-	float m_time_from, m_time_to;
 };

 class InnerNode : public BVHNode
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -43,9 +43,7 @@ public:
 	/* number of primitives in leaf */
 	int min_leaf_size;
 	int max_triangle_leaf_size;
-	int max_motion_triangle_leaf_size;
 	int max_curve_leaf_size;
-	int max_motion_curve_leaf_size;

 	/* object or mesh level bvh */
 	bool top_level;
@@ -61,17 +59,6 @@ public:
 	 */
 	bool use_unaligned_nodes;

-	/* Split time range to this number of steps and create leaf node for each
-	 * of this time steps.
-	 *
-	 * Speeds up rendering of motion curve primitives in the cost of higher
-	 * memory usage.
-	 */
-	int num_motion_curve_steps;
-
-	/* Same as above, but for triangle primitives. */
-	int num_motion_triangle_steps;
-
 	/* fixed parameters */
 	enum {
 		MAX_DEPTH = 64,
@@ -93,18 +80,13 @@ public:

 		min_leaf_size = 1;
 		max_triangle_leaf_size = 8;
-		max_motion_triangle_leaf_size = 8;
-		max_curve_leaf_size = 1;
-		max_motion_curve_leaf_size = 4;
+		max_curve_leaf_size = 2;

 		top_level = false;
 		use_qbvh = false;
 		use_unaligned_nodes = false;

 		primitive_mask = PRIMITIVE_ALL;
-
-		num_motion_curve_steps = 0;
-		num_motion_triangle_steps = 0;
 	}

 	/* SAH costs */
@@ -131,15 +113,8 @@ class BVHReference
 public:
 	__forceinline BVHReference() {}

-	__forceinline BVHReference(const BoundBox& bounds_,
-	                           int prim_index_,
-	                           int prim_object_,
-	                           int prim_type,
-	                           float time_from = 0.0f,
-	                           float time_to = 1.0f)
-	        : rbounds(bounds_),
-	          time_from_(time_from),
-	          time_to_(time_to)
+	__forceinline BVHReference(const BoundBox& bounds_, int prim_index_, int prim_object_, int prim_type)
+	: rbounds(bounds_)
 	{
 		rbounds.min.w = __int_as_float(prim_index_);
 		rbounds.max.w = __int_as_float(prim_object_);
@@ -150,9 +125,6 @@ public:
 	__forceinline int prim_index() const { return __float_as_int(rbounds.min.w); }
 	__forceinline int prim_object() const { return __float_as_int(rbounds.max.w); }
 	__forceinline int prim_type() const { return type; }
-	__forceinline float time_from() const { return time_from_; }
-	__forceinline float time_to() const { return time_to_; }
-

 	BVHReference& operator=(const BVHReference &arg) {
 		if(&arg != this) {
@@ -161,11 +133,9 @@ public:
 		return *this;
 	}

-
 protected:
 	BoundBox rbounds;
 	uint type;
-	float time_from_, time_to_;
 };

 /* BVH Range
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -3,7 +3,6 @@ set(INC
 	.
 	../graph
 	../kernel
-	../kernel/split
 	../kernel/svm
 	../kernel/osl
 	../util
@@ -34,7 +33,6 @@ set(SRC
 	device_cuda.cpp
 	device_multi.cpp
 	device_opencl.cpp
-	device_split_kernel.cpp
 	device_task.cpp
 )

@@ -58,7 +56,6 @@ set(SRC_HEADERS
 	device_memory.h
 	device_intern.h
 	device_network.h
-	device_split_kernel.h
 	device_task.h
 )

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -64,8 +64,6 @@ std::ostream& operator <<(std::ostream &os,
 	   << string_from_bool(requested_features.use_integrator_branched) << std::endl;
 	os << "Use Patch Evaluation: "
 	   << string_from_bool(requested_features.use_patch_evaluation) << std::endl;
-	os << "Use Transparent Shadows: "
-	   << string_from_bool(requested_features.use_transparent) << std::endl;
 	return os;
 }

@@ -80,7 +78,7 @@ Device::~Device()

 void Device::pixels_alloc(device_memory& mem)
 {
-	mem_alloc("pixels", mem, MEM_READ_WRITE);
+	mem_alloc(mem, MEM_READ_WRITE);
 }

 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -117,9 +117,6 @@ public:

 	/* Use OpenSubdiv patch evaluation */
 	bool use_patch_evaluation;
-	
-	/* Use Transparent shadows */
-	bool use_transparent;

 	DeviceRequestedFeatures()
 	{
@@ -136,7 +133,6 @@ public:
 		use_volume = false;
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
-		use_transparent = false;
 	}

 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -152,8 +148,7 @@ public:
 		         use_subsurface == requested_features.use_subsurface &&
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
-		         use_patch_evaluation == requested_features.use_patch_evaluation &&
-		         use_transparent == requested_features.use_transparent);
+		         use_patch_evaluation == requested_features.use_patch_evaluation);
 	}

 	/* Convert the requested features structure to a build options,
@@ -194,9 +189,6 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
-		if(!use_transparent && !use_volume) {
-			build_options += " -D__NO_TRANSPARENT__";
-		}
 		return build_options;
 	}
 };
@@ -228,21 +220,12 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
-	virtual void set_error(const string& error)
-	{
-		if(!have_error()) {
-			error_msg = error;
-		}
-		fprintf(stderr, "%s\n", error.c_str());
-		fflush(stderr);
-	}
-	virtual bool show_samples() const { return false; }

 	/* statistics */
 	Stats &stats;

 	/* regular memory */
-	virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
+	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
 	virtual void mem_copy_to(device_memory& mem) = 0;
 	virtual void mem_copy_from(device_memory& mem,
 		int y, int w, int h, int elem) = 0;
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -26,12 +26,10 @@

 #include "device.h"
 #include "device_intern.h"
-#include "device_split_kernel.h"

 #include "kernel.h"
 #include "kernel_compat_cpu.h"
 #include "kernel_types.h"
-#include "split/kernel_split_data.h"
 #include "kernel_globals.h"

 #include "osl_shader.h"
@@ -43,7 +41,6 @@
 #include "util_foreach.h"
 #include "util_function.h"
 #include "util_logging.h"
-#include "util_map.h"
 #include "util_opengl.h"
 #include "util_progress.h"
 #include "util_system.h"
@@ -51,93 +48,8 @@

 CCL_NAMESPACE_BEGIN

-class CPUDevice;
-
-class CPUSplitKernel : public DeviceSplitKernel {
-	CPUDevice *device;
-public:
-	explicit CPUSplitKernel(CPUDevice *device);
-
-	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
-	                                            RenderTile& rtile,
-	                                            int num_global_elements,
-	                                            device_memory& kernel_globals,
-	                                            device_memory& kernel_data_,
-	                                            device_memory& split_data,
-	                                            device_memory& ray_state,
-	                                            device_memory& queue_index,
-	                                            device_memory& use_queues_flag,
-	                                            device_memory& work_pool_wgs);
-
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
-	virtual int2 split_kernel_local_size();
-	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
-	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-};
-
 class CPUDevice : public Device
 {
-	static unordered_map<string, void*> kernel_functions;
-
-	static void register_kernel_function(const char* name, void* func)
-	{
-		kernel_functions[name] = func;
-	}
-
-	static const char* get_arch_name()
-	{
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			return "cpu_avx2";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			return "cpu_avx";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			return "cpu_sse41";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			return "cpu_sse3";
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			return "cpu_sse2";
-		}
-		else
-#endif
-		{
-			return "cpu";
-		}
-	}
-
-	template<typename F>
-	static F get_kernel_function(string name)
-	{
-		name = string("kernel_") + get_arch_name() + "_" + name;
-
-		unordered_map<string, void*>::iterator it = kernel_functions.find(name);
-
-		if(it == kernel_functions.end()) {
-			assert(!"kernel function not found");
-			return NULL;
-		}
-
-		return (F)it->second;
-	}
-
-	friend class CPUSplitKernel;
-
 public:
 	TaskPool task_pool;
 	KernelGlobals kernel_globals;
@@ -145,15 +57,10 @@ public:
 #ifdef WITH_OSL
 	OSLGlobals osl_globals;
 #endif
-
-	bool use_split_kernel;
-
-	DeviceRequestedFeatures requested_features;
 	
 	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
 	: Device(info, stats, background)
 	{
-
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
 #endif
@@ -198,28 +105,6 @@ public:
 		{
 			VLOG(1) << "Will be using regular kernels.";
 		}
-
-		use_split_kernel = DebugFlags().cpu.split_kernel;
-		if(use_split_kernel) {
-			VLOG(1) << "Will be using split kernel.";
-		}
-
-		kernel_cpu_register_functions(register_kernel_function);
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		kernel_cpu_sse2_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		kernel_cpu_sse3_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		kernel_cpu_sse41_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		kernel_cpu_avx_register_functions(register_kernel_function);
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		kernel_cpu_avx2_register_functions(register_kernel_function);
-#endif
 	}

 	~CPUDevice()
@@ -227,25 +112,9 @@ public:
 		task_pool.stop();
 	}

-	virtual bool show_samples() const
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
-		return (TaskScheduler::num_threads() == 1);
-	}
-
-	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
-	{
-		if(name) {
-			VLOG(1) << "Buffer allocate: " << name << ", "
-			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			        << string_human_readable_size(mem.memory_size()) << ")";
-		}
-
 		mem.device_pointer = mem.data_pointer;
-
-		if(!mem.device_pointer) {
-			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
-		}
-
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -270,10 +139,6 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			if(!mem.data_pointer) {
-				free((void*)mem.device_pointer);
-			}
-
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
@@ -326,14 +191,8 @@ public:

 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE) {
-			if(!use_split_kernel) {
-				thread_path_trace(*task);
-			}
-			else {
-				thread_path_trace_split(*task);
-			}
-		}
+		if(task->type == DeviceTask::PATH_TRACE)
+			thread_path_trace(*task);
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
@@ -394,7 +253,7 @@ public:
 		{
 			path_trace_kernel = kernel_cpu_path_trace;
 		}
-
+		
 		while(task.acquire_tile(this, tile)) {
 			float *render_buffer = (float*)tile.buffer;
 			uint *rng_state = (uint*)tile.rng_state;
@@ -416,7 +275,7 @@ public:

 				tile.sample = sample + 1;

-				task.update_progress(&tile, tile.w*tile.h);
+				task.update_progress(&tile);
 			}

 			task.release_tile(tile);
@@ -430,49 +289,6 @@ public:
 		thread_kernel_globals_free(&kg);
 	}

-	void thread_path_trace_split(DeviceTask& task)
-	{
-		if(task_pool.canceled()) {
-			if(task.need_finish_queue == false)
-				return;
-		}
-
-		RenderTile tile;
-
-		CPUSplitKernel split_kernel(this);
-
-		/* allocate buffer for kernel globals */
-		device_memory kgbuffer;
-		kgbuffer.resize(sizeof(KernelGlobals));
-		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
-
-		KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
-		*kg = thread_kernel_globals_init();
-
-		requested_features.max_closure = MAX_CLOSURE;
-		if(!split_kernel.load_kernels(requested_features)) {
-			thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
-			mem_free(kgbuffer);
-
-			return;
-		}
-
-		while(task.acquire_tile(this, tile)) {
-			device_memory data;
-			split_kernel.path_trace(&task, tile, kgbuffer, data);
-
-			task.release_tile(tile);
-
-			if(task_pool.canceled()) {
-				if(task.need_finish_queue == false)
-					break;
-			}
-		}
-
-		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
-		mem_free(kgbuffer);
-	}
-
 	void thread_film_convert(DeviceTask& task)
 	{
 		float sample_scale = 1.0f/(task.sample + 1);
@@ -680,10 +496,6 @@ protected:

 	inline void thread_kernel_globals_free(KernelGlobals *kg)
 	{
-		if(kg == NULL) {
-			return;
-		}
-
 		if(kg->transparent_shadow_intersections != NULL) {
 			free(kg->transparent_shadow_intersections);
 		}
@@ -698,175 +510,8 @@ protected:
 		OSLShader::thread_free(kg);
 #endif
 	}
-
-	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
-		requested_features = requested_features_;
-
-		return true;
-	}
 };

-/* split kernel */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
-public:
-	CPUDevice* device;
-	void (*func)(KernelGlobals *kg, KernelData *data);
-
-	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
-	~CPUSplitKernelFunction() {}
-
-	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
-	{
-		if(!func) {
-			return false;
-		}
-
-		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
-		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
-		for(int y = 0; y < dim.global_size[1]; y++) {
-			for(int x = 0; x < dim.global_size[0]; x++) {
-				kg->global_id = make_int2(x, y);
-
-				func(kg, (KernelData*)data.device_pointer);
-			}
-		}
-
-		return true;
-	}
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
-                                                    RenderTile& rtile,
-                                                    int num_global_elements,
-                                                    device_memory& kernel_globals,
-                                                    device_memory& data,
-                                                    device_memory& split_data,
-                                                    device_memory& ray_state,
-                                                    device_memory& queue_index,
-                                                    device_memory& use_queues_flags,
-                                                    device_memory& work_pool_wgs)
-{
-	typedef void(*data_init_t)(KernelGlobals *kg,
-	                           ccl_constant KernelData *data,
-	                           ccl_global void *split_data_buffer,
-	                           int num_elements,
-	                           ccl_global char *ray_state,
-	                           ccl_global uint *rng_state,
-	                           int start_sample,
-	                           int end_sample,
-	                           int sx, int sy, int sw, int sh, int offset, int stride,
-	                           ccl_global int *Queue_index,
-	                           int queuesize,
-	                           ccl_global char *use_queues_flag,
-	                           ccl_global unsigned int *work_pool_wgs,
-	                           unsigned int num_samples,
-	                           ccl_global float *buffer);
-
-	data_init_t data_init;
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-	if(system_cpu_support_avx2()) {
-		data_init = kernel_cpu_avx2_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-	if(system_cpu_support_avx()) {
-		data_init = kernel_cpu_avx_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-	if(system_cpu_support_sse41()) {
-		data_init = kernel_cpu_sse41_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-	if(system_cpu_support_sse3()) {
-		data_init = kernel_cpu_sse3_data_init;
-	}
-	else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-	if(system_cpu_support_sse2()) {
-		data_init = kernel_cpu_sse2_data_init;
-	}
-	else
-#endif
-	{
-		data_init = kernel_cpu_data_init;
-	}
-
-	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
-	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
-	for(int y = 0; y < dim.global_size[1]; y++) {
-		for(int x = 0; x < dim.global_size[0]; x++) {
-			kg->global_id = make_int2(x, y);
-
-			data_init((KernelGlobals*)kernel_globals.device_pointer,
-			          (KernelData*)data.device_pointer,
-			          (void*)split_data.device_pointer,
-			          num_global_elements,
-			          (char*)ray_state.device_pointer,
-			          (uint*)rtile.rng_state,
-			          rtile.start_sample,
-			          rtile.start_sample + rtile.num_samples,
-			          rtile.x,
-			          rtile.y,
-			          rtile.w,
-			          rtile.h,
-			          rtile.offset,
-			          rtile.stride,
-			          (int*)queue_index.device_pointer,
-			          dim.global_size[0] * dim.global_size[1],
-			          (char*)use_queues_flags.device_pointer,
-			          (uint*)work_pool_wgs.device_pointer,
-			          rtile.num_samples,
-			          (float*)rtile.buffer);
-		}
-	}
-
-	return true;
-}
-
-SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
-{
-	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
-	kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
-	if(!kernel->func) {
-		delete kernel;
-		return NULL;
-	}
-
-	return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
-	return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
-	return make_int2(64, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
-	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
-
-	return split_data_buffer_size(kg, num_threads);
-}
-
-unordered_map<string, void*> CPUDevice::kernel_functions;
-
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,14 +15,12 @@
 */

 #include <climits>
-#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "device.h"
 #include "device_intern.h"
-#include "device_split_kernel.h"

 #include "buffers.h"

@@ -44,8 +42,6 @@
 #include "util_types.h"
 #include "util_time.h"

-#include "split/kernel_split_data_types.h"
-
 CCL_NAMESPACE_BEGIN

 #ifndef WITH_CUDA_DYNLOAD
@@ -82,31 +78,6 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */

-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
-	CUDADevice *device;
-public:
-	explicit CUDASplitKernel(CUDADevice *device);
-
-	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-
-	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
-	                                            RenderTile& rtile,
-	                                            int num_global_elements,
-	                                            device_memory& kernel_globals,
-	                                            device_memory& kernel_data_,
-	                                            device_memory& split_data,
-	                                            device_memory& ray_state,
-	                                            device_memory& queue_index,
-	                                            device_memory& use_queues_flag,
-	                                            device_memory& work_pool_wgs);
-
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
-	virtual int2 split_kernel_local_size();
-	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
-};
-
 class CUDADevice : public Device
 {
 public:
@@ -144,12 +115,6 @@ public:
 		return path_exists(cubins_path);
 	}

-	virtual bool show_samples() const
-	{
-		/* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-		return true;
-	}
-
 /*#ifdef NDEBUG
 #define cuda_abort()
 #else
@@ -159,7 +124,7 @@ public:
 	{
 		if(first_error) {
 			fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-			fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
+			fprintf(stderr, "http://www.blender.org/manual/render/cycles/gpu_rendering.html\n\n");
 			first_error = false;
 		}
 	}
@@ -287,16 +252,11 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}

-	bool use_split_kernel()
-	{
-		return DebugFlags().cuda.split_kernel;
-	}
-
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features, bool split=false)
+	        const DeviceRequestedFeatures& requested_features)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
@@ -321,11 +281,6 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
-
-		if(split) {
-			cflags += " -D__SPLIT__";
-		}
-
 		return cflags;
 	}

@@ -359,7 +314,7 @@ public:
 		return true;
 	}

-	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features)
 	{
 		/* Compute cubin name. */
 		int major, minor;
@@ -368,8 +323,7 @@ public:

 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
-			                                                  : "lib/kernel_sm_%d%d.cubin",
+			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
 			                                            major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
@@ -379,7 +333,7 @@ public:
 		}

 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features, split);
+		        compile_kernel_get_common_cflags(requested_features);

 		/* Try to use locally compiled kernel. */
 		const string kernel_path = path_get("kernel");
@@ -390,8 +344,7 @@ public:
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);

-		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
-		                                              : "cycles_kernel_sm%d%d_%s.cubin",
+		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -426,7 +379,7 @@ public:
 		const char *nvcc = cuewCompilerPath();
 		const string kernel = path_join(kernel_path,
 		                          path_join("kernels",
-		                                    path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
+		                                    path_join("cuda", "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");

@@ -474,7 +427,7 @@ public:
 			return false;

 		/* get kernel */
-		string cubin = compile_kernel(requested_features, use_split_kernel());
+		string cubin = compile_kernel(requested_features);

 		if(cubin == "")
 			return false;
@@ -507,14 +460,8 @@ public:
 		}
 	}

-	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
-		if(name) {
-			VLOG(1) << "Buffer allocate: " << name << ", "
-			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			        << string_human_readable_size(mem.memory_size()) << ")";
-		}
-
 		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
@@ -551,9 +498,7 @@ public:

 	void mem_zero(device_memory& mem)
 	{
-		if(mem.data_pointer) {
-			memset((void*)mem.data_pointer, 0, mem.memory_size());
-		}
+		memset((void*)mem.data_pointer, 0, mem.memory_size());

 		cuda_push_context();
 		if(mem.device_pointer)
@@ -666,7 +611,7 @@ public:
 		/* Data Storage */
 		if(interpolation == INTERPOLATION_NONE) {
 			if(has_bindless_textures) {
-				mem_alloc(NULL, mem, MEM_READ_ONLY);
+				mem_alloc(mem, MEM_READ_ONLY);
 				mem_copy_to(mem);

 				cuda_push_context();
@@ -690,7 +635,7 @@ public:
 				cuda_pop_context();
 			}
 			else {
-				mem_alloc(NULL, mem, MEM_READ_ONLY);
+				mem_alloc(mem, MEM_READ_ONLY);
 				mem_copy_to(mem);

 				cuda_push_context();
@@ -1307,48 +1252,25 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();

-			if(!use_split_kernel()) {
-				/* keep rendering tiles until done */
-				while(task->acquire_tile(this, tile)) {
-					int start_sample = tile.start_sample;
-					int end_sample = tile.start_sample + tile.num_samples;
-
-					for(int sample = start_sample; sample < end_sample; sample++) {
-						if(task->get_cancel()) {
-							if(task->need_finish_queue == false)
-								break;
-						}
-
-						path_trace(tile, sample, branched);
-
-						tile.sample = sample + 1;
-
-						task->update_progress(&tile, tile.w*tile.h);
-					}
-
-					task->release_tile(tile);
-				}
-			}
-			else {
-				DeviceRequestedFeatures requested_features;
-				if(!use_adaptive_compilation()) {
-					requested_features.max_closure = 64;
-				}
-
-				CUDASplitKernel split_kernel(this);
-				split_kernel.load_kernels(requested_features);
-
-				while(task->acquire_tile(this, tile)) {
-					device_memory void_buffer;
-					split_kernel.path_trace(task, tile, void_buffer, void_buffer);
-
-					task->release_tile(tile);
+			/* keep rendering tiles until done */
+			while(task->acquire_tile(this, tile)) {
+				int start_sample = tile.start_sample;
+				int end_sample = tile.start_sample + tile.num_samples;

+				for(int sample = start_sample; sample < end_sample; sample++) {
 					if(task->get_cancel()) {
 						if(task->need_finish_queue == false)
 							break;
 					}
+
+					path_trace(tile, sample, branched);
+
+					tile.sample = sample + 1;
+
+					task->update_progress(&tile);
 				}
+
+				task->release_tile(tile);
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
@@ -1401,223 +1323,8 @@ public:
 	{
 		task_pool.cancel();
 	}
-
-	friend class CUDASplitKernelFunction;
-	friend class CUDASplitKernel;
 };

-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-#undef cuda_assert
-#define cuda_assert(stmt) \
-	{ \
-		CUresult result = stmt; \
-		\
-		if(result != CUDA_SUCCESS) { \
-			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
-			if(device->error_msg == "") \
-				device->error_msg = message; \
-			fprintf(stderr, "%s\n", message.c_str()); \
-			/*cuda_abort();*/ \
-			device->cuda_error_documentation(); \
-		} \
-	} (void)0
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction{
-	CUDADevice* device;
-	CUfunction func;
-public:
-	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
-
-	/* enqueue the kernel, returns false if there is an error */
-	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
-	{
-		return enqueue(dim, NULL);
-	}
-
-	/* enqueue the kernel, returns false if there is an error */
-	bool enqueue(const KernelDimensions &dim, void *args[])
-	{
-		device->cuda_push_context();
-
-		if(device->have_error())
-			return false;
-
-		/* we ignore dim.local_size for now, as this is faster */
-		int threads_per_block;
-		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-
-		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
-		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
-
-		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
-		cuda_assert(cuLaunchKernel(func,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
-		                           0, 0, args, 0));
-
-		device->cuda_pop_context();
-
-		return !device->have_error();
-	}
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
-{
-	device_vector<uint64_t> size_buffer;
-	size_buffer.resize(1);
-	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
-
-	device->cuda_push_context();
-
-	uint threads = num_threads;
-	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
-
-	struct args_t {
-		uint* num_threads;
-		CUdeviceptr* size;
-	};
-
-	args_t args = {
-		&threads,
-		&d_size
-	};
-
-	CUfunction state_buffer_size;
-	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
-	cuda_assert(cuLaunchKernel(state_buffer_size,
-	                           1, 1, 1,
-	                           1, 1, 1,
-	                           0, 0, &args, 0));
-
-	device->cuda_pop_context();
-
-	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
-	device->mem_free(size_buffer);
-
-	return *size_buffer.get_data();
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
-                                    RenderTile& rtile,
-                                    int num_global_elements,
-                                    device_memory& /*kernel_globals*/,
-                                    device_memory& /*kernel_data*/,
-                                    device_memory& split_data,
-                                    device_memory& ray_state,
-                                    device_memory& queue_index,
-                                    device_memory& use_queues_flag,
-                                    device_memory& work_pool_wgs)
-{
-	device->cuda_push_context();
-
-	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
-	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
-	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
-	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
-	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
-	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
-	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
-	int end_sample = rtile.start_sample + rtile.num_samples;
-	int queue_size = dim.global_size[0] * dim.global_size[1];
-
-	struct args_t {
-		CUdeviceptr* split_data_buffer;
-		int* num_elements;
-		CUdeviceptr* ray_state;
-		CUdeviceptr* rng_state;
-		int* start_sample;
-		int* end_sample;
-		int* sx;
-		int* sy;
-		int* sw;
-		int* sh;
-		int* offset;
-		int* stride;
-		CUdeviceptr* queue_index;
-		int* queuesize;
-		CUdeviceptr* use_queues_flag;
-		CUdeviceptr* work_pool_wgs;
-		int* num_samples;
-		CUdeviceptr* buffer;
-	};
-
-	args_t args = {
-		&d_split_data,
-		&num_global_elements,
-		&d_ray_state,
-		&d_rng_state,
-		&rtile.start_sample,
-		&end_sample,
-		&rtile.x,
-		&rtile.y,
-		&rtile.w,
-		&rtile.h,
-		&rtile.offset,
-		&rtile.stride,
-		&d_queue_index,
-		&queue_size,
-		&d_use_queues_flag,
-		&d_work_pool_wgs,
-		&rtile.num_samples,
-		&d_buffer
-	};
-
-	CUfunction data_init;
-	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
-	if(device->have_error()) {
-		return false;
-	}
-
-	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
-
-	device->cuda_pop_context();
-
-	return !device->have_error();
-}
-
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
-{
-	CUfunction func;
-
-	device->cuda_push_context();
-
-	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
-	if(device->have_error()) {
-		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
-		return NULL;
-	}
-
-	device->cuda_pop_context();
-
-	return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
-	return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
-{
-	/* TODO(mai): implement something here to detect ideal work size */
-	return make_int2(256, 256);
-}
-
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -48,8 +48,7 @@ enum DataType {
 	TYPE_UINT,
 	TYPE_INT,
 	TYPE_FLOAT,
-	TYPE_HALF,
-	TYPE_UINT64,
+	TYPE_HALF
 };

 static inline size_t datatype_size(DataType datatype) 
@@ -60,7 +59,6 @@ static inline size_t datatype_size(DataType datatype)
 		case TYPE_UINT: return sizeof(uint);
 		case TYPE_INT: return sizeof(int);
 		case TYPE_HALF: return sizeof(half);
-		case TYPE_UINT64: return sizeof(uint64_t);
 		default: return 0;
 	}
 }
@@ -162,11 +160,6 @@ template<> struct device_type_traits<half4> {
 	static const int num_elements = 4;
 };

-template<> struct device_type_traits<uint64_t> {
-	static const DataType data_type = TYPE_UINT64;
-	static const int num_elements = 1;
-};
-
 /* Device Memory */

 class device_memory
@@ -187,27 +180,10 @@ public:
 	/* device pointer */
 	device_ptr device_pointer;

-	device_memory()
-	{
-		data_type = device_type_traits<uchar>::data_type;
-		data_elements = device_type_traits<uchar>::num_elements;
-		data_pointer = 0;
-		data_size = 0;
-		device_size = 0;
-		data_width = 0;
-		data_height = 0;
-		data_depth = 0;
-		device_pointer = 0;
-	}
+protected:
+	device_memory() {}
 	virtual ~device_memory() { assert(!device_pointer); }

-	void resize(size_t size)
-	{
-		data_size = size;
-		data_width = size;
-	}
-
-protected:
 	/* no copying */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
@@ -222,8 +198,16 @@ public:
 	{
 		data_type = device_type_traits<T>::data_type;
 		data_elements = device_type_traits<T>::num_elements;
+		data_pointer = 0;
+		data_size = 0;
+		device_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;

 		assert(data_elements > 0);
+
+		device_pointer = 0;
 	}

 	virtual ~device_vector() {}
@@ -282,7 +266,6 @@ public:
 		data_height = 0;
 		data_depth = 0;
 		data_size = 0;
-		device_pointer = 0;
 	}

 	size_t size()
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -89,14 +89,6 @@ public:
 		return error_msg;
 	}

-	virtual bool show_samples() const
-	{
-		if(devices.size() > 1) {
-			return false;
-		}
-		return devices.front().device->show_samples();
-	}
-
 	bool load_kernels(const DeviceRequestedFeatures& requested_features)
 	{
 		foreach(SubDevice& sub, devices)
@@ -106,11 +98,11 @@ public:
 		return true;
 	}

-	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType type)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(name, mem, type);
+			sub.device->mem_alloc(mem, type);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}

@@ -162,7 +154,6 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
-		stats.mem_free(mem.device_size);

 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -171,6 +162,7 @@ public:
 		}

 		mem.device_pointer = 0;
+		stats.mem_free(mem.device_size);
 	}

 	void const_copy_to(const char *name, void *host, size_t size)
@@ -202,7 +194,6 @@ public:
 	void tex_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
-		stats.mem_free(mem.device_size);

 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -211,6 +202,7 @@ public:
 		}

 		mem.device_pointer = 0;
+		stats.mem_free(mem.device_size);
 	}

 	void pixels_alloc(device_memory& mem)
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -51,11 +51,6 @@ public:

 	thread_mutex rpc_lock;

-	virtual bool show_samples() const
-	{
-		return false;
-	}
-
 	NetworkDevice(DeviceInfo& info, Stats &stats, const char *address)
 	: Device(info, stats, true), socket(io_service)
 	{
@@ -87,14 +82,8 @@ public:
 		snd.write();
 	}

-	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
+	void mem_alloc(device_memory& mem, MemoryType type)
 	{
-		if(name) {
-			VLOG(1) << "Buffer allocate: " << name << ", "
-				    << string_human_readable_number(mem.memory_size()) << " bytes. ("
-				    << string_human_readable_size(mem.memory_size()) << ")";
-		}
-
 		thread_scoped_lock lock(rpc_lock);

 		mem.device_pointer = ++mem_counter;
@@ -487,7 +476,7 @@ protected:
 				mem.data_pointer = 0;

 			/* perform the allocation on the actual device */
-			device->mem_alloc(NULL, mem, type);
+			device->mem_alloc(mem, type);

 			/* store a mapping to/from client_pointer and real device pointer */
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -1,306 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device_split_kernel.h"
-
-#include "kernel_types.h"
-#include "kernel_split_data_types.h"
-
-#include "util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
-{
-	current_max_closure = -1;
-	first_tile = true;
-
-	avg_time_per_sample = 0.0;
-
-	kernel_path_init = NULL;
-	kernel_scene_intersect = NULL;
-	kernel_lamp_emission = NULL;
-	kernel_do_volume = NULL;
-	kernel_queue_enqueue = NULL;
-	kernel_indirect_background = NULL;
-	kernel_shader_eval = NULL;
-	kernel_holdout_emission_blurring_pathtermination_ao = NULL;
-	kernel_subsurface_scatter = NULL;
-	kernel_direct_lighting = NULL;
-	kernel_shadow_blocked_ao = NULL;
-	kernel_shadow_blocked_dl = NULL;
-	kernel_next_iteration_setup = NULL;
-	kernel_indirect_subsurface = NULL;
-	kernel_buffer_update = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
-	device->mem_free(split_data);
-	device->mem_free(ray_state);
-	device->mem_free(use_queues_flag);
-	device->mem_free(queue_index);
-	device->mem_free(work_pool_wgs);
-
-	delete kernel_path_init;
-	delete kernel_scene_intersect;
-	delete kernel_lamp_emission;
-	delete kernel_do_volume;
-	delete kernel_queue_enqueue;
-	delete kernel_indirect_background;
-	delete kernel_shader_eval;
-	delete kernel_holdout_emission_blurring_pathtermination_ao;
-	delete kernel_subsurface_scatter;
-	delete kernel_direct_lighting;
-	delete kernel_shadow_blocked_ao;
-	delete kernel_shadow_blocked_dl;
-	delete kernel_next_iteration_setup;
-	delete kernel_indirect_subsurface;
-	delete kernel_buffer_update;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
-{
-#define LOAD_KERNEL(name) \
-		kernel_##name = get_split_kernel_function(#name, requested_features); \
-		if(!kernel_##name) { \
-			return false; \
-		}
-
-	LOAD_KERNEL(path_init);
-	LOAD_KERNEL(scene_intersect);
-	LOAD_KERNEL(lamp_emission);
-	LOAD_KERNEL(do_volume);
-	LOAD_KERNEL(queue_enqueue);
-	LOAD_KERNEL(indirect_background);
-	LOAD_KERNEL(shader_eval);
-	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
-	LOAD_KERNEL(subsurface_scatter);
-	LOAD_KERNEL(direct_lighting);
-	LOAD_KERNEL(shadow_blocked_ao);
-	LOAD_KERNEL(shadow_blocked_dl);
-	LOAD_KERNEL(next_iteration_setup);
-	LOAD_KERNEL(indirect_subsurface);
-	LOAD_KERNEL(buffer_update);
-
-#undef LOAD_KERNEL
-
-	current_max_closure = requested_features.max_closure;
-
-	return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
-{
-	uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
-	return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask *task,
-                                   RenderTile& tile,
-                                   device_memory& kgbuffer,
-                                   device_memory& kernel_data)
-{
-	if(device->have_error()) {
-		return false;
-	}
-
-	/* Get local size */
-	size_t local_size[2];
-	{
-		int2 lsize = split_kernel_local_size();
-		local_size[0] = lsize[0];
-		local_size[1] = lsize[1];
-	}
-
-	/* Set gloabl size */
-	size_t global_size[2];
-	{
-		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
-		/* Make sure that set work size is a multiple of local
-		 * work size dimensions.
-		 */
-		global_size[0] = round_up(gsize[0], local_size[0]);
-		global_size[1] = round_up(gsize[1], local_size[1]);
-	}
-
-	/* Number of elements in the global state buffer */
-	int num_global_elements = global_size[0] * global_size[1];
-	assert(num_global_elements % WORK_POOL_SIZE == 0);
-
-	/* Allocate all required global memory once. */
-	if(first_tile) {
-		first_tile = false;
-
-		/* Calculate max groups */
-
-		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
-		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
-
-		/* Allocate work_pool_wgs memory. */
-		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
-		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
-
-		queue_index.resize(NUM_QUEUES * sizeof(int));
-		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
-
-		use_queues_flag.resize(sizeof(char));
-		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
-
-		ray_state.resize(num_global_elements);
-		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
-
-		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
-		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
-	}
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
-		if(device->have_error()) { \
-			return false; \
-		} \
-		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
-			return false; \
-		}
-
-	tile.sample = tile.start_sample;
-
-	/* for exponential increase between tile updates */
-	int time_multiplier = 1;
-
-	while(tile.sample < tile.start_sample + tile.num_samples) {
-		/* to keep track of how long it takes to run a number of samples */
-		double start_time = time_dt();
-
-		/* initial guess to start rolling average */
-		const int initial_num_samples = 1;
-		/* approx number of samples per second */
-		int samples_per_second = (avg_time_per_sample > 0.0) ?
-		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
-
-		RenderTile subtile = tile;
-		subtile.start_sample = tile.sample;
-		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
-
-		if(device->have_error()) {
-			return false;
-		}
-
-		/* reset state memory here as global size for data_init
-		 * kernel might not be large enough to do in kernel
-		 */
-		device->mem_zero(work_pool_wgs);
-		device->mem_zero(split_data);
-		device->mem_zero(ray_state);
-
-		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
-		                                   subtile,
-		                                   num_global_elements,
-		                                   kgbuffer,
-		                                   kernel_data,
-		                                   split_data,
-		                                   ray_state,
-		                                   queue_index,
-		                                   use_queues_flag,
-		                                   work_pool_wgs))
-		{
-			return false;
-		}
-
-		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
-		bool activeRaysAvailable = true;
-
-		while(activeRaysAvailable) {
-			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
-			for(int PathIter = 0; PathIter < 16; PathIter++) {
-				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
-				if(task->get_cancel()) {
-					return true;
-				}
-			}
-
-			/* Decide if we should exit path-iteration in host. */
-			device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
-
-			activeRaysAvailable = false;
-
-			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
-				int8_t state = ray_state.get_data()[rayStateIter];
-
-				if(state != RAY_INACTIVE) {
-					if(state == RAY_INVALID) {
-						/* Something went wrong, abort to avoid looping endlessly. */
-						device->set_error("Split kernel error: invalid ray state");
-						return false;
-					}
-
-					/* Not all rays are RAY_INACTIVE. */
-					activeRaysAvailable = true;
-					break;
-				}
-			}
-
-			if(task->get_cancel()) {
-				return true;
-			}
-		}
-
-		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
-
-		if(avg_time_per_sample == 0.0) {
-			/* start rolling average */
-			avg_time_per_sample = time_per_sample;
-		}
-		else {
-			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
-		}
-
-#undef ENQUEUE_SPLIT_KERNEL
-
-		tile.sample += subtile.num_samples;
-		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
-
-		time_multiplier = min(time_multiplier << 1, 10);
-
-		if(task->get_cancel()) {
-			return true;
-		}
-	}
-
-	return true;
-}
-
-CCL_NAMESPACE_END
-
-
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -1,132 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device.h"
-#include "buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
-public:
-	size_t global_size[2];
-	size_t local_size[2];
-
-	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
-	{
-		memcpy(global_size, global_size_, sizeof(global_size));
-		memcpy(local_size, local_size_, sizeof(local_size));
-	}
-};
-
-class SplitKernelFunction {
-public:
-	virtual ~SplitKernelFunction() {}
-
-	/* enqueue the kernel, returns false if there is an error */
-	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
-};
-
-class DeviceSplitKernel {
-private:
-	Device *device;
-
-	SplitKernelFunction *kernel_path_init;
-	SplitKernelFunction *kernel_scene_intersect;
-	SplitKernelFunction *kernel_lamp_emission;
-	SplitKernelFunction *kernel_do_volume;
-	SplitKernelFunction *kernel_queue_enqueue;
-	SplitKernelFunction *kernel_indirect_background;
-	SplitKernelFunction *kernel_shader_eval;
-	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
-	SplitKernelFunction *kernel_subsurface_scatter;
-	SplitKernelFunction *kernel_direct_lighting;
-	SplitKernelFunction *kernel_shadow_blocked_ao;
-	SplitKernelFunction *kernel_shadow_blocked_dl;
-	SplitKernelFunction *kernel_next_iteration_setup;
-	SplitKernelFunction *kernel_indirect_subsurface;
-	SplitKernelFunction *kernel_buffer_update;
-
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-	device_memory split_data;
-	device_vector<uchar> ray_state;
-	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
-
-	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	device_memory use_queues_flag;
-
-	/* Approximate time it takes to complete one sample */
-	double avg_time_per_sample;
-
-	/* Work pool with respect to each work group. */
-	device_memory work_pool_wgs;
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
-	/* Marked True in constructor and marked false at the end of path_trace(). */
-	bool first_tile;
-
-public:
-	explicit DeviceSplitKernel(Device* device);
-	virtual ~DeviceSplitKernel();
-
-	bool load_kernels(const DeviceRequestedFeatures& requested_features);
-	bool path_trace(DeviceTask *task,
-	                RenderTile& rtile,
-	                device_memory& kgbuffer,
-	                device_memory& kernel_data);
-
-	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
-	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
-
-	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
-	                                            RenderTile& rtile,
-	                                            int num_global_elements,
-	                                            device_memory& kernel_globals,
-	                                            device_memory& kernel_data_,
-	                                            device_memory& split_data,
-	                                            device_memory& ray_state,
-	                                            device_memory& queue_index,
-	                                            device_memory& use_queues_flag,
-	                                            device_memory& work_pool_wgs) = 0;
-
-	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
-	virtual int2 split_kernel_local_size() = 0;
-	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
-
-
-
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -19,8 +19,6 @@

 #include "device_task.h"

-#include "buffers.h"
-
 #include "util_algorithm.h"
 #include "util_time.h"

@@ -101,18 +99,14 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
 	}
 }

-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
+void DeviceTask::update_progress(RenderTile *rtile)
 {
 	if((type != PATH_TRACE) &&
 	   (type != SHADER))
 		return;

-	if(update_progress_sample) {
-		if(pixel_samples == -1) {
-			pixel_samples = shader_w;
-		}
-		update_progress_sample(pixel_samples, rtile? rtile->sample : 0);
-	}
+	if(update_progress_sample)
+		update_progress_sample();

 	if(update_tile_sample) {
 		double current_time = time_dt();
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -51,17 +51,15 @@ public:
 	int shader_filter;
 	int shader_x, shader_w;

-	int passes_size;
-
 	explicit DeviceTask(Type type = PATH_TRACE);

 	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);

-	void update_progress(RenderTile *rtile, int pixel_samples = -1);
+	void update_progress(RenderTile *rtile);

 	function<bool(Device *device, RenderTile&)> acquire_tile;
-	function<void(long, int)> update_progress_sample;
+	function<void(void)> update_progress_sample;
 	function<void(RenderTile&)> update_tile_sample;
 	function<void(RenderTile&)> release_tile;
 	function<bool(void)> get_cancel;
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -26,30 +26,30 @@

 CCL_NAMESPACE_BEGIN

-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */
-#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-#  undef clEnqueueNDRangeKernel
-#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
-	clFinish(a); \
-	CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
-	clFinish(a);
-
-#  undef clEnqueueWriteBuffer
-#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
-	clFinish(a); \
-	CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
-	clFinish(a);
-
-#  undef clEnqueueReadBuffer
-#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
-	clFinish(a); \
-	CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
-	clFinish(a);
-#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))

+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Adjusting the PATH_ITER_INC_FACTOR value in proportion to the number of
+ * expected ray bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
 struct OpenCLPlatformDevice {
 	OpenCLPlatformDevice(cl_platform_id platform_id,
 	                     const string& platform_name,
@@ -90,7 +90,6 @@ public:
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
-	static bool use_single_program();
 };

 /* Thread safe cache for contexts and programs.
@@ -249,7 +248,6 @@ public:

 	bool device_initialized;
 	string platform_name;
-	string device_name;

 	bool opencl_error(cl_int err);
 	void opencl_error(const string& message);
@@ -268,10 +266,10 @@ public:

 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLProgram*> &programs) = 0;

-	void mem_alloc(const char *name, device_memory& mem, MemoryType type);
+	void mem_alloc(device_memory& mem, MemoryType type);
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
@@ -328,39 +326,16 @@ protected:

 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL)
-		{
-		}
-
-		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
-		                                           pointer((void*)(&argument.device_pointer))
-		{
-		}
-
-		template<typename T>
-		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
-		                                              pointer((void*)(&argument.device_pointer))
-		{
-		}
-
-		template<typename T>
+		ArgumentWrapper() : size(0), pointer(NULL) {}
+		template <typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument)
-		{
-		}
-
+		                               pointer(&argument) { }
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value)
-		{
-		}
-
+		                                pointer(&int_value) { }
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value)
-		{
-		}
-
+		                                  pointer(&float_value) { }
 		size_t size;
 		int int_value;
 		float float_value;
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -82,10 +82,9 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
-	device_name = platform_device.device_name;
 	VLOG(2) << "Creating new Cycles device for OpenCL platform "
 	        << platform_name << ", device "
-	        << device_name << ".";
+	        << platform_device.device_name << ".";

 	{
 		/* try to use cached context */
@@ -114,16 +113,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	}

 	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-	if(opencl_error(ciErr)) {
-		opencl_error("OpenCL: Error creating command queue");
+	if(opencl_error(ciErr))
 		return;
-	}

 	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-	if(opencl_error(ciErr)) {
-		opencl_error("OpenCL: Error creating memory buffer for NULL");
+	if(opencl_error(ciErr))
 		return;
-	}

 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
@@ -196,8 +191,6 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)

 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
 {
-	VLOG(2) << "Loading kernels for platform " << platform_name
-	        << ", device " << device_name << ".";
 	/* Verify if device was initialized. */
 	if(!device_initialized) {
 		fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -213,14 +206,11 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program.add_kernel(ustring("convert_to_half_float"));
 	base_program.add_kernel(ustring("shader"));
 	base_program.add_kernel(ustring("bake"));
-	base_program.add_kernel(ustring("zero_buffer"));

 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
 	/* Call actual class to fill the vector with its programs. */
-	if(!load_kernels(requested_features, programs)) {
-		return false;
-	}
+	load_kernels(requested_features, programs);

 	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
 	 * serialize the calls internally, so it's not much use right now.
@@ -252,14 +242,8 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	return true;
 }

-void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
 {
-	if(name) {
-		VLOG(1) << "Buffer allocate: " << name << ", "
-			    << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			    << string_human_readable_size(mem.memory_size()) << ")";
-	}
-
 	size_t size = mem.memory_size();

 	cl_mem_flags mem_flag;
@@ -327,61 +311,8 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 void OpenCLDeviceBase::mem_zero(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		if(base_program.is_loaded()) {
-			cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
-			size_t global_size[] = {1024, 1024};
-			size_t num_threads = global_size[0] * global_size[1];
-
-			cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
-			cl_ulong d_offset = 0;
-			cl_ulong d_size = 0;
-
-			while(d_offset < mem.memory_size()) {
-				d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
-
-				kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
-				ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
-				                               ckZeroBuffer,
-				                               2,
-				                               NULL,
-				                               global_size,
-				                               NULL,
-				                               0,
-				                               NULL,
-				                               NULL);
-				opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
-				d_offset += d_size;
-			}
-		}
-
-		if(mem.data_pointer) {
-			memset((void*)mem.data_pointer, 0, mem.memory_size());
-		}
-
-		if(!base_program.is_loaded()) {
-			void* zero = (void*)mem.data_pointer;
-
-			if(!mem.data_pointer) {
-				zero = util_aligned_malloc(mem.memory_size(), 16);
-				memset(zero, 0, mem.memory_size());
-			}
-
-			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
-			                                   CL_MEM_PTR(mem.device_pointer),
-			                                   CL_TRUE,
-			                                   0,
-			                                   mem.memory_size(),
-			                                   zero,
-			                                   0,
-			                                   NULL, NULL));
-
-			if(!mem.data_pointer) {
-				util_aligned_free(zero);
-			}
-		}
+		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		mem_copy_to(mem);
 	}
 }

@@ -406,7 +337,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 		device_vector<uchar> *data = new device_vector<uchar>();
 		data->copy((uchar*)host, size);

-		mem_alloc(name, *data, MEM_READ_ONLY);
+		mem_alloc(*data, MEM_READ_ONLY);
 		i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
 	}
 	else {
@@ -425,7 +356,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name,
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(NULL, mem, MEM_READ_ONLY);
+	mem_alloc(mem, MEM_READ_ONLY);
 	mem_copy_to(mem);
 	assert(mem_map.find(name) == mem_map.end());
 	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -39,16 +39,11 @@ public:
 	{
 	}

-	virtual bool show_samples() const {
-		return true;
-	}
-
-	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
 		programs.push_back(&path_trace_program);
-		return true;
 	}

 	~OpenCLDeviceMegaKernel()
@@ -125,7 +120,7 @@ public:

 					tile.sample = sample + 1;

-					task->update_progress(&tile, tile.w*tile.h);
+					task->update_progress(&tile);
 				}

 				/* Complete kernel execution before release tile */
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -19,7 +19,6 @@
 #include "opencl.h"

 #include "util_logging.h"
-#include "util_md5.h"
 #include "util_path.h"
 #include "util_time.h"

@@ -310,8 +309,6 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 	string build_options;
 	build_options = device->kernel_build_options(debug_src) + kernel_build_options;

-	VLOG(1) << "Build options passed to clBuildProgram: '"
-	        << build_options << "'.";
 	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);

 	/* show warnings even if build is successful */
@@ -339,13 +336,12 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)

 bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernels/opencl/" + kernel_file + "\"\n";
+	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
 	/* We compile kernels consisting of many files. unfortunately OpenCL
 	 * kernel caches do not seem to recognize changes in included files.
 	 * so we force recompile on changes by adding the md5 hash of all files.
 	 */
 	source = path_source_replace_includes(source, path_get("kernel"));
-	source += "\n// " + util_md5_string(source) + "\n";

 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -356,10 +352,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;

 	program = clCreateProgramWithSource(device->cxContext,
-	                                    1,
-	                                    &source_str,
-	                                    &source_len,
-	                                    &ciErr);
+	                                   1,
+	                                   &source_str,
+	                                   &source_len,
+	                                   &ciErr);

 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -442,11 +438,7 @@ void OpenCLDeviceBase::OpenCLProgram::load()
 	if(!program) {
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);

-		/* need to create source to get md5 */
-		string source = "#include \"kernels/opencl/" + kernel_file + "\"\n";
-		source = path_source_replace_includes(source, path_get("kernel"));
-
-		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
 		basename = path_cache_get(path_join("kernels", basename));
 		string clbin = basename + ".clbin";

@@ -552,11 +544,6 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }

-bool OpenCLInfo::use_single_program()
-{
-	return DebugFlags().opencl.single_program;
-}
-
 bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
@@ -605,19 +592,6 @@ bool OpenCLInfo::device_supported(const string& platform_name,
 	                sizeof(cl_device_type),
 	                &device_type,
 	                NULL);
-	char device_name[1024] = "\0";
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_NAME,
-	                sizeof(device_name),
-	                &device_name,
-	                NULL);
-	/* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
-	 * (aka, it will not be on Intel framework). This isn't supported
-	 * and needs an explicit blacklist.
-	 */
-	if(strstr(device_name, "Iris")) {
-		return false;
-	}
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
@@ -774,10 +748,10 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		num_devices = 0;
 		cl_int ciErr;
 		if((ciErr = clGetDeviceIDs(platform_id,
-		                           device_type,
-		                           0,
-		                           NULL,
-		                           &num_devices)) != CL_SUCCESS || num_devices == 0)
+		                  device_type,
+		                  0,
+		                  NULL,
+		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
 			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -13,28 +13,19 @@ set(INC_SYS

 set(SRC
 	kernels/cpu/kernel.cpp
-	kernels/cpu/kernel_split.cpp
 	kernels/opencl/kernel.cl
-	kernels/opencl/kernel_state_buffer_size.cl
-	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
-	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_do_volume.cl
-	kernels/opencl/kernel_indirect_background.cl
+	kernels/opencl/kernel_background_buffer_update.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
-	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked_ao.cl
-	kernels/opencl/kernel_shadow_blocked_dl.cl
+	kernels/opencl/kernel_shadow_blocked.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_indirect_subsurface.cl
-	kernels/opencl/kernel_buffer_update.cl
+	kernels/opencl/kernel_sum_all_radiance.cl
 	kernels/cuda/kernel.cu
-	kernels/cuda/kernel_split.cu
 )

 set(SRC_BVH_HEADERS
@@ -77,7 +68,6 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
-	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -98,10 +88,6 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu_image.h
 )

-set(SRC_KERNELS_CUDA_HEADERS
-	kernels/cuda/kernel_config.h
-)
-
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@@ -178,8 +164,6 @@ set(SRC_GEOM_HEADERS
 	geom/geom_curve.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
-	geom/geom_motion_triangle_intersect.h
-	geom/geom_motion_triangle_shader.h
 	geom/geom_object.h
 	geom/geom_patch.h
 	geom/geom_primitive.h
@@ -203,25 +187,17 @@ set(SRC_UTIL_HEADERS
 )

 set(SRC_SPLIT_HEADERS
-	split/kernel_buffer_update.h
+	split/kernel_background_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
-	split/kernel_do_volume.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
-	split/kernel_indirect_background.h
-	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
-	split/kernel_path_init.h
-	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked_ao.h
-	split/kernel_shadow_blocked_dl.h
+	split/kernel_shadow_blocked.h
 	split/kernel_split_common.h
-	split/kernel_split_data.h
-	split/kernel_split_data_types.h
-	split/kernel_subsurface_scatter.h
+	split/kernel_sum_all_radiance.h
 )

 # CUDA module
@@ -249,9 +225,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()

 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
+	set(cuda_sources kernels/cuda/kernel.cu
 		${SRC_HEADERS}
-		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
@@ -260,22 +235,15 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	)
 	set(cuda_cubins)

-	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
-		if(${split})
-			set(cuda_extra_flags "-D__SPLIT__")
-			set(cuda_cubin kernel_split)
+	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
+		if(${experimental})
+			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
+			set(cuda_cubin kernel_experimental_${arch}.cubin)
 		else()
 			set(cuda_extra_flags "")
-			set(cuda_cubin kernel)
+			set(cuda_cubin kernel_${arch}.cubin)
 		endif()

-		if(${experimental})
-			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
-			set(cuda_cubin ${cuda_cubin}_experimental)
-		endif()
-
-		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
-
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@@ -288,19 +256,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")

-		if(split)
-			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
-		else()
-			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
-		endif()
-
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -327,12 +289,7 @@ if(WITH_CYCLES_CUDA_BINARIES)

 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
-
-		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
-			# Compile split kernel
-			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
-		endif()
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
 	endforeach()

 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -355,42 +312,31 @@ if(CXX_HAS_SSE)
 		kernels/cpu/kernel_sse2.cpp
 		kernels/cpu/kernel_sse3.cpp
 		kernels/cpu/kernel_sse41.cpp
-		kernels/cpu/kernel_split_sse2.cpp
-		kernels/cpu/kernel_split_sse3.cpp
-		kernels/cpu/kernel_split_sse41.cpp
 	)

 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
-	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()

 if(CXX_HAS_AVX)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx.cpp
-		kernels/cpu/kernel_split_avx.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()

 if(CXX_HAS_AVX2)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx2.cpp
-		kernels/cpu/kernel_split_avx2.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
-	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()

 add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
-	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
@@ -413,28 +359,19 @@ endif()
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)

 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -357,7 +357,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }

-#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -373,28 +373,5 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif

-#if defined(__SHADOW_RECORD_ALL__)
-ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
-{
-#ifdef __KERNEL_GPU__
-	/* Use bubble sort which has more friendly memory pattern on GPU. */
-	bool swapped;
-	do {
-		swapped = false;
-		for(int j = 0; j < num_hits - 1; ++j) {
-			if(hits[j].t > hits[j + 1].t) {
-				struct Intersection tmp = hits[j];
-				hits[j] = hits[j + 1];
-				hits[j + 1] = tmp;
-				swapped = true;
-			}
-		}
-		--num_hits;
-	} while(swapped);
-#else
-	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-#endif
-}
-#endif  /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-
 CCL_NAMESPACE_END
+
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -454,7 +454,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);

 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);
+	       aligned_dir1 = transform_direction(&space1, dir);;
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -516,7 +516,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);

 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);
+	       aligned_dir1 = transform_direction(&space1, dir);;
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -187,7 +187,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 					/* primitive intersection */
 					while(prim_addr < prim_addr2) {
-						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);

 						bool hit;

@@ -222,7 +222,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
-								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
 									                                   isect_array,
@@ -232,7 +231,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   curve_type,
+									                                   type,
 									                                   NULL,
 									                                   0, 0);
 								}
@@ -245,7 +244,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          curve_type,
+									                          type,
 									                          NULL,
 									                          0, 0);
 								}
@@ -309,9 +308,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);

 #  if BVH_FEATURE(BVH_MOTION)
-					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
 #  else
-					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif

 					triangle_intersect_precalc(dir, &isect_precalc);
@@ -362,10 +361,12 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				}
 			}
 			else {
+				float ignore_t = FLT_MAX;
+
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #  endif
 				triangle_intersect_precalc(dir, &isect_precalc);
 			}
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -72,19 +72,19 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	ss_isect->num_hits = 0;

 	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		isect_t = bvh_instance_motion_push(kg,
-		                                   subsurface_object,
-		                                   ray,
-		                                   &P,
-		                                   &dir,
-		                                   &idir,
-		                                   isect_t,
-		                                   &ob_itfm);
+		bvh_instance_motion_push(kg,
+		                         subsurface_object,
+		                         ray,
+		                         &P,
+		                         &dir,
+		                         &idir,
+		                         &isect_t,
+		                         &ob_itfm);
 #else
-		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
+		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
 #endif
 		object = subsurface_object;
 	}
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -213,7 +213,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						--stack_ptr;
 					}
 				}
-				BVH_DEBUG_NEXT_NODE();
+				BVH_DEBUG_NEXT_STEP();
 			}

 			/* if node is leaf, fetch triangle list */
@@ -235,7 +235,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
 								                      &isect_precalc,
@@ -264,7 +264,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -296,9 +296,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
-								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
@@ -309,7 +308,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   curve_type,
+									                                   type,
 									                                   lcg_state,
 									                                   difl,
 									                                   extmax);
@@ -323,7 +322,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          curve_type,
+									                          type,
 									                          lcg_state,
 									                          difl,
 									                          extmax);
@@ -354,9 +353,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);

 #  if BVH_FEATURE(BVH_MOTION)
-					isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-					isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 					triangle_intersect_precalc(dir, &isect_precalc);

@@ -391,9 +390,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif
 			triangle_intersect_precalc(dir, &isect_precalc);

--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -50,17 +50,12 @@ CCL_NAMESPACE_BEGIN
 #ifdef __KERNEL_DEBUG__
 #  define BVH_DEBUG_INIT() \
 	do { \
-		isect->num_traversed_nodes = 0; \
+		isect->num_traversal_steps = 0; \
 		isect->num_traversed_instances = 0; \
-		isect->num_intersections = 0; \
 	} while(0)
-#  define BVH_DEBUG_NEXT_NODE() \
+#  define BVH_DEBUG_NEXT_STEP() \
 	do { \
-		++isect->num_traversed_nodes; \
-	} while(0)
-#  define BVH_DEBUG_NEXT_INTERSECTION() \
-	do { \
-		++isect->num_intersections; \
+		++isect->num_traversal_steps; \
 	} while(0)
 #  define BVH_DEBUG_NEXT_INSTANCE() \
 	do { \
@@ -68,8 +63,7 @@ CCL_NAMESPACE_BEGIN
 	} while(0)
 #else  /* __KERNEL_DEBUG__ */
 #  define BVH_DEBUG_INIT()
-#  define BVH_DEBUG_NEXT_NODE()
-#  define BVH_DEBUG_NEXT_INTERSECTION()
+#  define BVH_DEBUG_NEXT_STEP()
 #  define BVH_DEBUG_NEXT_INSTANCE()
 #endif  /* __KERNEL_DEBUG__ */

--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -236,11 +236,13 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
+
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
 #  if BVH_FEATURE(BVH_MOTION)
-						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

 						triangle_intersect_precalc(dir, &isect_precalc);
@@ -281,9 +283,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

 			triangle_intersect_precalc(dir, &isect_precalc);
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -287,11 +287,13 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* instance push */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
+
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
 #  if BVH_FEATURE(BVH_MOTION)
-						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
 #  else
-						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif

 						triangle_intersect_precalc(dir, &isect_precalc);
@@ -347,10 +349,11 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				}
 			}
 			else {
+				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #  endif
 				triangle_intersect_precalc(dir, &isect_precalc);
 			}
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -106,20 +106,14 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);

-				if(false
 #ifdef __VISIBILITY_FLAG__
-				   || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
-#endif
-#if BVH_FEATURE(BVH_MOTION)
-				   || UNLIKELY(ray->time < inodes.y)
-				   || UNLIKELY(ray->time > inodes.z)
-#endif
-				) {
+				if((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
 					continue;
 				}
+#endif

 				ssef dist;
 				int child_mask = NODE_INTERSECT(kg,
@@ -268,7 +262,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 					/* Primitive intersection. */
 					while(prim_addr < prim_addr2) {
-						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
+						kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);

 						bool hit;

@@ -303,7 +297,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_HAIR)
 							case PRIMITIVE_CURVE:
 							case PRIMITIVE_MOTION_CURVE: {
-								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
 									                                   isect_array,
@@ -313,7 +306,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   curve_type,
+									                                   type,
 									                                   NULL,
 									                                   0, 0);
 								}
@@ -326,7 +319,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          curve_type,
+									                          type,
 									                          NULL,
 									                          0, 0);
 								}
@@ -390,9 +383,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);

 #  if BVH_FEATURE(BVH_MOTION)
-					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
 #  else
-					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif

 					num_hits_in_instance = 0;
@@ -445,10 +438,11 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
+				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #  endif
 			}

--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -61,19 +61,19 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	ss_isect->num_hits = 0;

 	const int object_flag = kernel_tex_fetch(__object_flag, subsurface_object);
-	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		isect_t = bvh_instance_motion_push(kg,
-		                                   subsurface_object,
-		                                   ray,
-		                                   &P,
-		                                   &dir,
-		                                   &idir,
-		                                   isect_t,
-		                                   &ob_itfm);
+		bvh_instance_motion_push(kg,
+		                         subsurface_object,
+		                         ray,
+		                         &P,
+		                         &dir,
+		                         &idir,
+		                         &isect_t,
+		                         &ob_itfm);
 #else
-		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
+		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
 #endif
 		object = subsurface_object;
 	}
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -117,10 +117,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);

 				if(UNLIKELY(node_dist > isect->t)
-#if BVH_FEATURE(BVH_MOTION)
-				   || UNLIKELY(ray->time < inodes.y)
-				   || UNLIKELY(ray->time > inodes.z)
-#endif
 #ifdef __VISIBILITY_FLAG__
 				   || (__float_as_uint(inodes.x) & visibility) == 0)
 #endif
@@ -135,7 +131,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				int child_mask;
 				ssef dist;

-				BVH_DEBUG_NEXT_NODE();
+				BVH_DEBUG_NEXT_STEP();

 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
@@ -330,7 +326,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
 								                      &isect_precalc,
@@ -351,7 +347,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(motion_triangle_intersect(kg,
 								                             isect,
@@ -375,9 +371,8 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; prim_addr < prim_addr2; prim_addr++) {
-								BVH_DEBUG_NEXT_INTERSECTION();
-								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
-								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
+								BVH_DEBUG_NEXT_STEP();
+								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
 									hit = bvh_cardinal_curve_intersect(kg,
@@ -388,7 +383,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                                   object,
 									                                   prim_addr,
 									                                   ray->time,
-									                                   curve_type,
+									                                   type,
 									                                   lcg_state,
 									                                   difl,
 									                                   extmax);
@@ -402,7 +397,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									                          object,
 									                          prim_addr,
 									                          ray->time,
-									                          curve_type,
+									                          type,
 									                          lcg_state,
 									                          difl,
 									                          extmax);
@@ -468,9 +463,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

 			qbvh_near_far_idx_calc(idir,
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -293,11 +293,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Instance push. */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
+
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
 #  if BVH_FEATURE(BVH_MOTION)
-						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

 						qbvh_near_far_idx_calc(idir,
@@ -341,9 +343,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
 #  else
-			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
+			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

 			qbvh_near_far_idx_calc(idir,
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -344,11 +344,13 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Instance push. */
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
+
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
+
 #  if BVH_FEATURE(BVH_MOTION)
-						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
 #  else
-						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif

 						qbvh_near_far_idx_calc(idir,
@@ -406,10 +408,11 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
+				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
 #  endif
 			}

--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 {
 	kernel_assert(size <= sizeof(ShaderClosure));

-	int num_closure = sd->num_closure;
-	int num_closure_extra = sd->num_closure_extra;
+	int num_closure = ccl_fetch(sd, num_closure);
+	int num_closure_extra = ccl_fetch(sd, num_closure_extra);
 	if(num_closure + num_closure_extra >= MAX_CLOSURE)
 		return NULL;

-	ShaderClosure *sc = &sd->closure[num_closure];
+	ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];

 	sc->type = type;
 	sc->weight = weight;

-	sd->num_closure++;
+	ccl_fetch(sd, num_closure)++;

 	return sc;
 }
@@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	 * This lets us keep the same fast array iteration over closures, as we
 	 * found linked list iteration and iteration with skipping to be slower. */
 	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
-	int num_closure = sd->num_closure;
-	int num_closure_extra = sd->num_closure_extra + num_extra;
+	int num_closure = ccl_fetch(sd, num_closure);
+	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;

 	if(num_closure + num_closure_extra > MAX_CLOSURE) {
 		/* Remove previous closure. */
-		sd->num_closure--;
-		sd->num_closure_extra++;
+		ccl_fetch(sd, num_closure)--;
+		ccl_fetch(sd, num_closure_extra)++;
 		return NULL;
 	}

-	sd->num_closure_extra = num_closure_extra;
-	return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
+	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
+	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
 }

 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);

-	if(sc == NULL)
+	if(!sc)
 		return NULL;

 	float sample_weight = fabsf(average(weight));
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -51,89 +51,89 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __OSL__
 		case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-			label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
+			label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
+			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -157,75 +157,75 @@ float3 bsdf_eval(KernelGlobals *kg,
 {
 	float3 eval;

-	if(dot(sd->Ng, omega_in) >= 0.0f) {
+	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __OSL__
 			case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
+				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
+				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
@@ -237,63 +237,63 @@ float3 bsdf_eval(KernelGlobals *kg,
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
+				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
+				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
 				break;
 #endif
 			default:
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -143,7 +143,6 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 	float3 N = bsdf->N;
-	int label = LABEL_REFLECT | LABEL_GLOSSY;

 	float NdotI = dot(N, I);
 	if(NdotI > 0.0f) {
@@ -212,7 +211,6 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 			/* Some high number for MIS. */
 			*pdf = 1e6f;
 			*eval = make_float3(1e6f, 1e6f, 1e6f);
-			label = LABEL_REFLECT | LABEL_SINGULAR;
 		}
 		else {
 			/* leave the rest to eval_reflect */
@@ -226,7 +224,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc, float3 Ng,
 #endif
 	}

-	return label;
+	return LABEL_REFLECT|LABEL_GLOSSY;
 }


--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -267,10 +267,7 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc, float3 Ng,

 	*eval = make_float3(*pdf, *pdf, *pdf);

-	/* TODO(sergey): Should always be negative, but seems some precision issue
-	 * is involved here.
-	 */
-	kernel_assert(dot(locy, *omega_in) < 1e-4f);
+	kernel_assert(dot(locy, *omega_in) < 0.0f);

 	return LABEL_TRANSMIT|LABEL_GLOSSY;
 }
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -266,7 +266,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
+	       ((!bsdf_a->extra && !bsdf_b->extra) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
@@ -452,7 +452,6 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 	float3 N = bsdf->N;
-	int label;

 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -478,7 +477,6 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 		/* reflection or refraction? */
 		if(!m_refractive) {
 			float cosMO = dot(m, I);
-			label = LABEL_REFLECT | LABEL_GLOSSY;

 			if(cosMO > 0) {
 				/* eq. 39 - compute actual reflected direction */
@@ -489,7 +487,6 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
-						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray */
@@ -552,8 +549,6 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 		else {
-			label = LABEL_TRANSMIT | LABEL_GLOSSY;
-
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -581,7 +576,6 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
-					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -613,10 +607,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
 			}
 		}
 	}
-	else {
-		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
-	}
-	return label;
+	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
 }

 /* Beckmann microfacet with Smith shadow-masking from:
@@ -824,7 +815,6 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 	float alpha_y = bsdf->alpha_y;
 	bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 	float3 N = bsdf->N;
-	int label;

 	float cosNO = dot(N, I);
 	if(cosNO > 0) {
@@ -849,7 +839,6 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl

 		/* reflection or refraction? */
 		if(!m_refractive) {
-			label = LABEL_REFLECT | LABEL_GLOSSY;
 			float cosMO = dot(m, I);

 			if(cosMO > 0) {
@@ -861,7 +850,6 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 						/* some high number for MIS */
 						*pdf = 1e6f;
 						*eval = make_float3(1e6f, 1e6f, 1e6f);
-						label = LABEL_REFLECT | LABEL_SINGULAR;
 					}
 					else {
 						/* microfacet normal is visible to this ray
@@ -916,8 +904,6 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 		else {
-			label = LABEL_TRANSMIT | LABEL_GLOSSY;
-
 			/* CAUTION: the i and o variables are inverted relative to the paper
 			 * eq. 39 - compute actual refractive direction */
 			float3 R, T;
@@ -945,7 +931,6 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 					/* some high number for MIS */
 					*pdf = 1e6f;
 					*eval = make_float3(1e6f, 1e6f, 1e6f);
-					label = LABEL_TRANSMIT | LABEL_SINGULAR;
 				}
 				else {
 					/* eq. 33 */
@@ -978,10 +963,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg, const ShaderCl
 			}
 		}
 	}
-	else {
-		label = (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
-	}
-	return label;
+	return (m_refractive) ? LABEL_TRANSMIT|LABEL_GLOSSY : LABEL_REFLECT|LABEL_GLOSSY;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -43,7 +43,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
-		const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
+		const float r = sqrtf(randU.x / (1.0f - randU.x));
 		const float phi = M_2PI_F * randU.y;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
@@ -83,7 +83,7 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);

-	const float2 cossin_phi = safe_normalize(make_float2(wi_11.x, wi_11.y));
+	const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
 	const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y);

--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -23,8 +23,6 @@
 #include "geom_subd_triangle.h"
 #include "geom_triangle_intersect.h"
 #include "geom_motion_triangle.h"
-#include "geom_motion_triangle_intersect.h"
-#include "geom_motion_triangle_shader.h"
 #include "geom_motion_curve.h"
 #include "geom_curve.h"
 #include "geom_volume.h"
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
 		return ATTR_PRIM_CURVE;
 	}
 	else
@@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()

 ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(sd->object == PRIM_NONE) {
+	if(ccl_fetch(sd, object) == PRIM_NONE) {
 		return attribute_not_found();
 	}

 	/* for SVM, find attribute by unique id */
-	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 	attr_offset += attribute_primitive_type(kg, sd);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
 	AttributeDescriptor desc;
 	desc.element = (AttributeElement)attr_map.y;
 	
-	if(sd->prim == PRIM_NONE &&
+	if(ccl_fetch(sd, prim) == PRIM_NONE &&
 	   desc.element != ATTR_ELEMENT_MESH &&
 	   desc.element != ATTR_ELEMENT_VOXEL &&
 	   desc.element != ATTR_ELEMENT_OBJECT)
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif

-		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
+		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;

 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif

-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif

-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;

 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));

 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = sd->du.dx*(f1 - f0);
+		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif

-		return (1.0f - sd->u)*f0 + sd->u*f1;
+		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;

-	if(sd->type & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 		int k1 = k0 + 1;

 		float4 P_curve[2];

-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}

-		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
 	}

 	return r*2.0f;
@@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)

 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;

 	float4 P_curve[2];
@@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);

-	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
+	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
 }

 /* Curve tangent normal */
@@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);

-	if(sd->type & PRIMITIVE_ALL_CURVE) {
+	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {

-		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
+		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
 		tgN = normalize(tgN);

 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * sd->dPdu);
+		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
 #endif
 	}

@@ -229,15 +229,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	float epsilon = 0.0f;
 	float r_st, r_en;
@@ -264,20 +255,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		int ka = max(k0 - 1, v00.x);
 		int kb = min(k1 + 1, v00.x + v00.y - 1);

-#ifdef __KERNEL_AVX2__
-		avxf P_curve_0_1, P_curve_2_3;
-		if(is_curve_primitive) {
-			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
-			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
-		}
-#else  /* __KERNEL_AVX2__ */
 		ssef P_curve[4];

-		if(is_curve_primitive) {
+		if(type & PRIMITIVE_CURVE) {
 			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
 			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
 			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
@@ -287,7 +267,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
 			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
 		}
-#endif  /* __KERNEL_AVX2__ */

 		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
 		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
@@ -299,33 +278,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
 		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);

-#ifdef __KERNEL_AVX2__
-		const avxf vPP = _mm256_broadcast_ps(&P.m128);
-		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
-		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
-		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
-		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_0_1 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
-		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_2_3 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
-
-		const ssef p0 = _mm256_castps256_ps128(p01);
-		const ssef p1 = _mm256_extractf128_ps(p01, 1);
-		const ssef p2 = _mm256_castps256_ps128(p23);
-		const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
-		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
-		r_st = ((float4 &)P_curve_1).w;
-		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
-		r_en = ((float4 &)P_curve_2).w;
-#else  /* __KERNEL_AVX2__ */
 		ssef htfm[] = { htfm0, htfm1, htfm2 };
 		ssef vP = load4f(P);
 		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
@@ -333,10 +285,6 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
 		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);

-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
-#endif  /* __KERNEL_AVX2__ */
-
 		float fc = 0.71f;
 		ssef vfc = ssef(fc);
 		ssef vfcxp3 = vfc * p3;
@@ -346,6 +294,8 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
 		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));

+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
 	}
 #else
 	float3 curve_coef[4];
@@ -372,7 +322,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte

 		float4 P_curve[4];

-		if(is_curve_primitive) {
+		if(type & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
@@ -433,9 +383,8 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte

 	/* begin loop */
 	while(!(tree >> (depth))) {
-		const float i_st = tree * resol;
-		const float i_en = i_st + (level * resol);
-
+		float i_st = tree * resol;
+		float i_en = i_st + (level * resol);
 #ifdef __KERNEL_SSE2__
 		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
 		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
@@ -509,23 +458,13 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte

 			if(flags & CURVE_KN_RIBBONS) {
 				float3 tg = (p_en - p_st);
-#ifdef __KERNEL_SSE__
-				const float3 tg_sq = tg * tg;
-				float w = tg_sq.x + tg_sq.y;
-#else
 				float w = tg.x * tg.x + tg.y * tg.y;
-#endif
 				if(w == 0) {
 					tree++;
 					level = tree & -tree;
 					continue;
 				}
-#ifdef __KERNEL_SSE__
-				const float3 p_sttg = p_st * tg;
-				w = -(p_sttg.x + p_sttg.y) / w;
-#else
 				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#endif
 				w = saturate(w);

 				/* compute u on the curve segment */
@@ -557,13 +496,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 				if(difl != 0.0f) {
 					mw_extension = min(difl * fabsf(bmaxz), extmax);
 					r_ext = mw_extension + r_curr;
-#ifdef __KERNEL_SSE__
-					const float3 p_curr_sq = p_curr * p_curr;
-					const float3 dxxx = _mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128));
-					float d = dxxx.x;
-#else
 					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-#endif
 					float d0 = d - r_curr;
 					float d1 = d + r_curr;
 					float inv_mw_extension = 1.0f/mw_extension;
@@ -698,15 +631,6 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #  define dot3(x, y) dot(x, y)
 #endif

-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	/* curve Intersection check */
 	int flags = kernel_data.curve.curveflags;
@@ -721,7 +645,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #ifndef __KERNEL_SSE2__
 	float4 P_curve[2];

-	if(is_curve_primitive) {
+	if(type & PRIMITIVE_CURVE) {
 		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
 		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
@@ -756,7 +680,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #else
 	ssef P_curve[2];
 	
-	if(is_curve_primitive) {
+	if(type & PRIMITIVE_CURVE) {
 		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
 		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
 	}
@@ -929,7 +853,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #  undef len3_squared
 #  undef len3
 #  undef dot3
-#endif
+#  endif
 }

 ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
@@ -966,7 +890,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con

 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
+		Transform tfm = ccl_fetch(sd, ob_itfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -979,7 +903,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);

-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
 	int k1 = k0 + 1;

 	float3 tg;
@@ -990,14 +914,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con

 		float4 P_curve[4];

-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
 		}

 		float3 p[4];
@@ -1009,43 +933,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;

 #ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = isect->u;
+		ccl_fetch(sd, v) = 0.0f;
 #endif

 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));

 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
+			ccl_fetch(sd, Ng) = normalize(P - p_curr);

 			/* adjustment for changing radius */
 			float gd = isect->v;

 			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}

 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */

-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}
 	else {
 		float4 P_curve[2];

-		if(sd->type & PRIMITIVE_CURVE) {
+		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
 		}

 		float l = 1.0f;
@@ -1056,39 +980,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);

 #ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
+		ccl_fetch(sd, u) = dot(dif,tg)/l;
+		ccl_fetch(sd, v) = 0.0f;
 #endif

 		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
+			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
+			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 		}
 		else {
 			float gd = isect->v;

 			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);

 			/* adjustment for changing radius */
 			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
+				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
+				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
 			}
 		}

-		sd->N = sd->Ng;
+		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
 	}

 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
+	ccl_fetch(sd, dPdu) = tg;
+	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
 #endif

 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
+		Transform tfm = ccl_fetch(sd, ob_tfm);
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -50,12 +50,12 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg, int object,
 ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, float4 keys[2])
 {
 	if(step == numsteps) {
-		/* center step: regular key location */
+		/* center step: regular vertex location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
 	else {
-		/* center step is not stored in this array */
+		/* center step not stored in this array */
 		if(step > numsteps)
 			step--;

@@ -97,14 +97,14 @@ ccl_device_inline void motion_curve_keys(KernelGlobals *kg, int object, int prim
 ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, int offset, int numkeys, int numsteps, int step, int k0, int k1, int k2, int k3, float4 keys[4])
 {
 	if(step == numsteps) {
-		/* center step: regular key location */
+		/* center step: regular vertex location */
 		keys[0] = kernel_tex_fetch(__curve_keys, k0);
 		keys[1] = kernel_tex_fetch(__curve_keys, k1);
 		keys[2] = kernel_tex_fetch(__curve_keys, k2);
 		keys[3] = kernel_tex_fetch(__curve_keys, k3);
 	}
 	else {
-		/* center step is not stored in this array */
+		/* center step not stored in this array */
 		if(step > numsteps)
 			step--;

@@ -118,12 +118,7 @@ ccl_device_inline void motion_cardinal_curve_keys_for_step(KernelGlobals *kg, in
 }

 /* return 2 curve key locations */
-ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
-                                                  int object,
-                                                  int prim,
-                                                  float time,
-                                                  int k0, int k1, int k2, int k3,
-                                                  float4 keys[4])
+ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg, int object, int prim, float time, int k0, int k1, int k2, int k3, float4 keys[4])
 {
 	/* get motion info */
 	int numsteps, numkeys;
@@ -152,65 +147,6 @@ ccl_device_inline void motion_cardinal_curve_keys(KernelGlobals *kg,
 	keys[3] = (1.0f - t)*keys[3] + t*next_keys[3];
 }

-#ifdef __KERNEL_AVX2__
-/* Similar to above, but returns keys as pair of two AVX registers with each
- * holding two float4.
- */
-ccl_device_inline void motion_cardinal_curve_keys_avx(KernelGlobals *kg,
-                                                      int object,
-                                                      int prim,
-                                                      float time,
-                                                      int k0, int k1,
-                                                      int k2, int k3,
-                                                      avxf *out_keys_0_1,
-                                                      avxf *out_keys_2_3)
-{
-	/* Get motion info. */
-	int numsteps, numkeys;
-	object_motion_info(kg, object, &numsteps, NULL, &numkeys);
-
-	/* Figure out which steps we need to fetch and their interpolation factor. */
-	int maxstep = numsteps * 2;
-	int step = min((int)(time*maxstep), maxstep - 1);
-	float t = time*maxstep - step;
-
-	/* Find attribute. */
-	AttributeElement elem;
-	int offset = find_attribute_curve_motion(kg,
-	                                         object,
-	                                         ATTR_STD_MOTION_VERTEX_POSITION,
-	                                         &elem);
-	kernel_assert(offset != ATTR_STD_NOT_FOUND);
-
-	/* Fetch key coordinates. */
-	float4 next_keys[4];
-	float4 keys[4];
-	motion_cardinal_curve_keys_for_step(kg,
-	                                    offset,
-	                                    numkeys,
-	                                    numsteps,
-	                                    step,
-	                                    k0, k1, k2, k3,
-	                                    keys);
-	motion_cardinal_curve_keys_for_step(kg,
-	                                    offset,
-	                                    numkeys,
-	                                    numsteps,
-	                                    step + 1,
-	                                    k0, k1, k2, k3,
-	                                    next_keys);
-
-	const avxf keys_0_1 = avxf(keys[0].m128, keys[1].m128);
-	const avxf keys_2_3 = avxf(keys[2].m128, keys[3].m128);
-	const avxf next_keys_0_1 = avxf(next_keys[0].m128, next_keys[1].m128);
-	const avxf next_keys_2_3 = avxf(next_keys[2].m128, next_keys[3].m128);
-
-	/* Interpolate between steps. */
-	*out_keys_0_1 = (1.0f - t) * keys_0_1 + t*next_keys_0_1;
-	*out_keys_2_3 = (1.0f - t) * keys_2_3 + t*next_keys_2_3;
-}
-#endif
-
 #endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -76,7 +76,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg, uint4
 		normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
 	}
 	else {
-		/* center step is not stored in this array */
+		/* center step not stored in this array */
 		if(step > numsteps)
 			step--;

@@ -117,4 +117,312 @@ ccl_device_inline void motion_triangle_vertices(KernelGlobals *kg, int object, i
 	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
 }

+/* Refine a triangle intersection to a more precise hit point. For rays that
+ * travel far the precision is often poor, so this re-intersects the primitive
+ * from a closer distance. */
+
+ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+		if(UNLIKELY(t == 0.0f)) {
+			return P;
+		}
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#  else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);
+		D = normalize_len(D, &t);
+	}
+
+	P = P + D*t;
+
+	/* compute refined intersection distance */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	/* compute refined position */
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#  ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#  else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#  endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#else
+	return P + D*t;
+#endif
+}
+
+/* Same as above, except that isect->t is assumed to be in object space for instancing */
+
+#ifdef __SUBSURFACE__
+#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
+ccl_device_noinline
+#  else
+ccl_device_inline
+#  endif
+float3 motion_triangle_refine_subsurface(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+{
+	float3 P = ray->P;
+	float3 D = ray->D;
+	float t = isect->t;
+
+#  ifdef __INTERSECTION_REFINE__
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_itfm);
+#    else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D);
+		D = normalize(D);
+	}
+
+	P = P + D*t;
+
+	/* compute refined intersection distance */
+	const float3 e1 = verts[0] - verts[2];
+	const float3 e2 = verts[1] - verts[2];
+	const float3 s1 = cross(D, e2);
+
+	const float invdivisor = 1.0f/dot(s1, e1);
+	const float3 d = P - verts[2];
+	const float3 s2 = cross(d, e1);
+	float rt = dot(e2, s2)*invdivisor;
+
+	P = P + D*rt;
+
+	if(isect->object != OBJECT_NONE) {
+#    ifdef __OBJECT_MOTION__
+		Transform tfm = ccl_fetch(sd, ob_tfm);
+#    else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#    endif
+
+		P = transform_point(&tfm, P);
+	}
+
+	return P;
+#  else
+	return P + D*t;
+#  endif
+}
+#endif
+
+/* Setup of motion triangle specific parts of ShaderData, moved into this one
+ * function to more easily share computation of interpolated positions and
+ * normals */
+
+/* fills in shader, P, Ng, N and (with __DPDU__) dPdu/dPdv on sd; returns nothing */
+ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
+{
+	/* get shader */
+	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+
+	/* get motion info */
+	int numsteps, numverts;
+	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+
+	/* figure out which steps we need to fetch and their interpolation factor */
+	int maxstep = numsteps*2;
+	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+	float t = ccl_fetch(sd, time)*maxstep - step;
+
+	/* find attribute */
+	AttributeElement elem;
+	int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+	kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+	/* fetch vertex coordinates */
+	float3 verts[3], next_verts[3];
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
+
+	/* interpolate between steps */
+	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
+	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
+	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
+
+	/* compute refined position */
+#ifdef __SUBSURFACE__
+	if(!subsurface)
+#endif
+		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+#ifdef __SUBSURFACE__
+	else
+		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+#endif
+
+	/* compute face normal */
+	float3 Ng;
+	if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
+		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
+	else
+		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
+
+	ccl_fetch(sd, Ng) = Ng;
+	ccl_fetch(sd, N) = Ng;
+
+	/* compute derivatives of P w.r.t. uv */
+#ifdef __DPDU__
+	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+#endif
+
+	/* compute smooth normal */
+	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		/* find attribute */
+		AttributeElement elem;
+		int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+		kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+		/* fetch vertex coordinates */
+		float3 normals[3], next_normals[3];
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
+		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
+
+		/* interpolate between steps */
+		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
+		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
+		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
+
+		/* interpolate between vertices */
+		float u = ccl_fetch(sd, u);
+		float v = ccl_fetch(sd, v);
+		float w = 1.0f - u - v;
+		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+	}
+}
+
+/* Ray intersection. We simply compute the vertex positions at the given ray
+ * time and do a ray intersection with the resulting triangle */
+
+ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg, Intersection *isect,
+	float3 P, float3 dir, float time, uint visibility, int object, int triAddr)
+{
+	/* primitive index for vertex location lookup */
+	int prim = kernel_tex_fetch(__prim_index, triAddr);
+	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
+
+	/* get vertex locations for intersection */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+
+	/* ray-triangle intersection, unoptimized */
+	float t, u, v;
+
+	if(ray_triangle_intersect_uv(P, dir, isect->t, verts[2], verts[0], verts[1], &u, &v, &t)) {
+#ifdef __VISIBILITY_FLAG__
+		/* visibility flag test. we do it here under the assumption
+		 * that most triangles are culled by node flags */
+		if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
+#endif
+		{
+			isect->t = t;
+			isect->u = u;
+			isect->v = v;
+			isect->prim = triAddr;
+			isect->object = object;
+			isect->type = PRIMITIVE_MOTION_TRIANGLE;
+		
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* Special ray intersection routines for subsurface scattering. In that case we
+ * only want to intersect with primitives in the same object, and if case of
+ * multiple hits we pick a single random primitive as the intersection point. */
+
+#ifdef __SUBSURFACE__
+ccl_device_inline void motion_triangle_intersect_subsurface(
+        KernelGlobals *kg,
+        SubsurfaceIntersection *ss_isect,
+        float3 P,
+        float3 dir,
+        float time,
+        int object,
+        int triAddr,
+        float tmax,
+        uint *lcg_state,
+        int max_hits)
+{
+	/* primitive index for vertex location lookup */
+	int prim = kernel_tex_fetch(__prim_index, triAddr);
+	int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, triAddr): object;
+
+	/* get vertex locations for intersection */
+	float3 verts[3];
+	motion_triangle_vertices(kg, fobject, prim, time, verts);
+
+	/* ray-triangle intersection, unoptimized */
+	float t, u, v;
+
+	if(ray_triangle_intersect_uv(P, dir, tmax, verts[2], verts[0], verts[1], &u, &v, &t)) {
+		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
+			if(ss_isect->hits[i].t == t) {
+				return;
+			}
+		}
+
+		ss_isect->num_hits++;
+
+		int hit;
+
+		if(ss_isect->num_hits <= max_hits) {
+			hit = ss_isect->num_hits - 1;
+		}
+		else {
+			/* reservoir sampling: if we are at the maximum number of
+			 * hits, randomly replace element or skip it */
+			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
+
+			if(hit >= max_hits)
+				return;
+		}
+
+		/* record intersection */
+		Intersection *isect = &ss_isect->hits[hit];
+		isect->t = t;
+		isect->u = u;
+		isect->v = v;
+		isect->prim = triAddr;
+		isect->object = object;
+		isect->type = PRIMITIVE_MOTION_TRIANGLE;
+
+		/* Record geometric normal. */
+		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
+		                                    verts[2] - verts[0]));
+	}
+}
+#endif
+
 CCL_NAMESPACE_END
+
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -1,280 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Motion Triangle Primitive
- *
- * These are stored as regular triangles, plus extra positions and normals at
- * times other than the frame center. Computing the triangle vertex positions
- * or normals at a given ray time is a matter of interpolation of the two steps
- * between which the ray time lies.
- *
- * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
- * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Refine triangle intersection to more precise hit point. For rays that travel
- * far the precision is often not so good, this reintersects the primitive from
- * a closer distance.
- */
-
-ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
-                                                ShaderData *sd,
-                                                const Intersection *isect,
-                                                const Ray *ray,
-                                                float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-		if(UNLIKELY(t == 0.0f)) {
-			return P;
-		}
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#  else
-		Transform tfm = object_fetch_transform(kg,
-		                                       isect->object,
-		                                       OBJECT_INVERSE_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	P = P + D*t;
-
-	/* Compute refined intersection distance. */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	/* Compute refined position. */
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#  ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#  else
-		Transform tfm = object_fetch_transform(kg,
-		                                       isect->object,
-		                                       OBJECT_TRANSFORM);
-#  endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#else
-	return P + D*t;
-#endif
-}
-
-/* Same as above, except that isect->t is assumed to be in object space
- * for instancing.
- */
-
-#ifdef __SUBSURFACE__
-#  if defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86))
-ccl_device_noinline
-#  else
-ccl_device_inline
-#  endif
-float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         const Intersection *isect,
-                                         const Ray *ray,
-                                         float3 verts[3])
-{
-	float3 P = ray->P;
-	float3 D = ray->D;
-	float t = isect->t;
-
-#  ifdef __INTERSECTION_REFINE__
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#    else
-		Transform tfm = object_fetch_transform(kg,
-		                                       isect->object,
-		                                       OBJECT_INVERSE_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D);
-		D = normalize(D);
-	}
-
-	P = P + D*t;
-
-	/* compute refined intersection distance */
-	const float3 e1 = verts[0] - verts[2];
-	const float3 e2 = verts[1] - verts[2];
-	const float3 s1 = cross(D, e2);
-
-	const float invdivisor = 1.0f/dot(s1, e1);
-	const float3 d = P - verts[2];
-	const float3 s2 = cross(d, e1);
-	float rt = dot(e2, s2)*invdivisor;
-
-	P = P + D*rt;
-
-	if(isect->object != OBJECT_NONE) {
-#    ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#    else
-		Transform tfm = object_fetch_transform(kg,
-		                                       isect->object,
-		                                       OBJECT_TRANSFORM);
-#    endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-#  else  /* __INTERSECTION_REFINE__ */
-	return P + D*t;
-#  endif  /* __INTERSECTION_REFINE__ */
-}
-#endif  /* __SUBSURFACE__ */
-
-
-/* Ray intersection. We simply compute the vertex positions at the given ray
- * time and do a ray intersection with the resulting triangle.
- */
-
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
-                                                 Intersection *isect,
-                                                 float3 P,
-                                                 float3 dir,
-                                                 float time,
-                                                 uint visibility,
-                                                 int object,
-                                                 int prim_addr)
-{
-	/* Primitive index for vertex location lookup. */
-	int prim = kernel_tex_fetch(__prim_index, prim_addr);
-	int fobject = (object == OBJECT_NONE)
-	                  ? kernel_tex_fetch(__prim_object, prim_addr)
-	                  : object;
-	/* Get vertex locations for intersection. */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-	/* Ray-triangle intersection, unoptimized. */
-	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             isect->t,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
-	{
-#ifdef __VISIBILITY_FLAG__
-		/* Visibility flag test. we do it here under the assumption
-		 * that most triangles are culled by node flags.
-		 */
-		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
-#endif
-		{
-			isect->t = t;
-			isect->u = u;
-			isect->v = v;
-			isect->prim = prim_addr;
-			isect->object = object;
-			isect->type = PRIMITIVE_MOTION_TRIANGLE;
-			return true;
-		}
-	}
-	return false;
-}
-
-/* Special ray intersection routines for subsurface scattering. In that case we
- * only want to intersect with primitives in the same object, and if case of
- * multiple hits we pick a single random primitive as the intersection point.
- */
-#ifdef __SUBSURFACE__
-ccl_device_inline void motion_triangle_intersect_subsurface(
-        KernelGlobals *kg,
-        SubsurfaceIntersection *ss_isect,
-        float3 P,
-        float3 dir,
-        float time,
-        int object,
-        int prim_addr,
-        float tmax,
-        uint *lcg_state,
-        int max_hits)
-{
-	/* Primitive index for vertex location lookup. */
-	int prim = kernel_tex_fetch(__prim_index, prim_addr);
-	int fobject = (object == OBJECT_NONE)
-	                  ? kernel_tex_fetch(__prim_object, prim_addr)
-	                  : object;
-	/* Get vertex locations for intersection. */
-	float3 verts[3];
-	motion_triangle_vertices(kg, fobject, prim, time, verts);
-	/* Ray-triangle intersection, unoptimized. */
-	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             tmax,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
-	{
-		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
-			if(ss_isect->hits[i].t == t) {
-				return;
-			}
-		}
-		ss_isect->num_hits++;
-		int hit;
-		if(ss_isect->num_hits <= max_hits) {
-			hit = ss_isect->num_hits - 1;
-		}
-		else {
-			/* Reservoir sampling: if we are at the maximum number of
-			 * hits, randomly replace element or skip it.
-			 */
-			hit = lcg_step_uint(lcg_state) % ss_isect->num_hits;
-
-			if(hit >= max_hits)
-				return;
-		}
-		/* Record intersection. */
-		Intersection *isect = &ss_isect->hits[hit];
-		isect->t = t;
-		isect->u = u;
-		isect->v = v;
-		isect->prim = prim_addr;
-		isect->object = object;
-		isect->type = PRIMITIVE_MOTION_TRIANGLE;
-		/* Record geometric normal. */
-		ss_isect->Ng[hit] = normalize(cross(verts[1] - verts[0],
-		                                    verts[2] - verts[0]));
-	}
-}
-#endif  /* __SUBSURFACE__ */
-
-CCL_NAMESPACE_END
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -1,123 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Motion Triangle Primitive
- *
- * These are stored as regular triangles, plus extra positions and normals at
- * times other than the frame center. Computing the triangle vertex positions
- * or normals at a given ray time is a matter of interpolation of the two steps
- * between which the ray time lies.
- *
- * The extra positions and normals are stored as ATTR_STD_MOTION_VERTEX_POSITION
- * and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Setup of motion triangle specific parts of ShaderData, moved into this one
- * function to more easily share computation of interpolated positions and
- * normals */
-
-/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
-                                                      ShaderData *sd, const
-                                                      Intersection *isect,
-                                                      const Ray *ray,
-                                                      bool subsurface)
-{
-	/* Get shader. */
-	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-	/* Get motion info. */
-	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
-	 * can we de-duplicate something here?
-	 */
-	int numsteps, numverts;
-	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
-	/* Figure out which steps we need to fetch and their interpolation factor. */
-	int maxstep = numsteps*2;
-	int step = min((int)(sd->time*maxstep), maxstep-1);
-	float t = sd->time*maxstep - step;
-	/* Find attribute. */
-	AttributeElement elem;
-	int offset = find_attribute_motion(kg, sd->object,
-	                                   ATTR_STD_MOTION_VERTEX_POSITION,
-	                                   &elem);
-	kernel_assert(offset != ATTR_STD_NOT_FOUND);
-	/* Fetch vertex coordinates. */
-	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
-	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
-	/* Interpolate between steps. */
-	verts[0] = (1.0f - t)*verts[0] + t*next_verts[0];
-	verts[1] = (1.0f - t)*verts[1] + t*next_verts[1];
-	verts[2] = (1.0f - t)*verts[2] + t*next_verts[2];
-	/* Compute refined position. */
-#ifdef __SUBSURFACE__
-	if(subsurface) {
-		sd->P = motion_triangle_refine_subsurface(kg,
-		                                                     sd,
-		                                                     isect,
-		                                                     ray,
-		                                                     verts);
-	}
-	else
-#endif  /*  __SUBSURFACE__*/
-	{
-		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
-	}
-	/* Compute face normal. */
-	float3 Ng;
-	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
-		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
-	}
-	else {
-		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
-	}
-	sd->Ng = Ng;
-	sd->N = Ng;
-	/* Compute derivatives of P w.r.t. uv. */
-#ifdef __DPDU__
-	sd->dPdu = (verts[0] - verts[2]);
-	sd->dPdv = (verts[1] - verts[2]);
-#endif
-	/* Compute smooth normal. */
-	if(sd->shader & SHADER_SMOOTH_NORMAL) {
-		/* Find attribute. */
-		AttributeElement elem;
-		int offset = find_attribute_motion(kg,
-		                                   sd->object,
-		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
-		                                   &elem);
-		kernel_assert(offset != ATTR_STD_NOT_FOUND);
-		/* Fetch vertex coordinates. */
-		float3 normals[3], next_normals[3];
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
-		motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_normals);
-		/* Interpolate between steps. */
-		normals[0] = (1.0f - t)*normals[0] + t*next_normals[0];
-		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
-		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
-		/* Interpolate between vertices. */
-		float u = sd->u;
-		float v = sd->v;
-		float w = 1.0f - u - v;
-		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
-	}
-}
-
-CCL_NAMESPACE_END
-
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Luna Rood	a2ed635a73	Fix typo and change `.enabled` to `.active`	2017-02-18 04:12:29 -02:00
Luna Rood	b3aead8fd7	Use parenthesis for bit shifts	2017-02-14 18:13:08 -02:00
Luna Rood	733b5b8c66	Remove unused `weight_components`	2017-02-14 18:13:08 -02:00
Luna Rood	5a17cb4c08	Use poll callback to disable bind operator instead of Python	2017-02-14 18:13:08 -02:00
Luna Rood	e5e44c01f2	icon	2017-02-06 01:23:18 -02:00
Luna Rood	a3e32e2ab5	Review: Multithread deform code	2017-01-25 04:05:53 -02:00
Luna Rood	e843f42e66	Review: Cleanup	2017-01-25 04:05:52 -02:00
Luna Rood	96f6ec07fb	Review: Add infinite weight flags enum	2017-01-25 04:05:52 -02:00
Luna Rood	c38e19ca67	Review: Move stuff to helper func and more cleanup	2017-01-25 04:05:52 -02:00
Luna Rood	96d66c7e4d	Review: Optimize numpoly calculation	2017-01-25 04:05:52 -02:00
Luna Rood	46821f072d	Review: Join allocations and some bpoly refactor	2017-01-25 04:05:52 -02:00
Luna Rood	f870343208	Review: Combine allocations and minor cleanup	2017-01-25 04:05:52 -02:00
Luna Rood	cf1a7e3944	Review: Report errors in UI and some more cleanup	2017-01-25 04:05:52 -02:00
Luna Rood	cf660b2a02	Review: Fix depsgraph relation	2017-01-25 04:05:52 -02:00
Luna Rood	6f3957770d	Review: Fix indentations and use MEM_SAFE_FREE	2017-01-25 04:05:52 -02:00
Luna Rood	7608f366c7	Review: Replace weight_components with individual variables	2017-01-25 04:05:52 -02:00
Luna Rood	8c220c57f9	Review: More cleanup...	2017-01-25 04:05:52 -02:00
Luna Rood	a300f80043	Review: Inline loop indices Also fixed endian switch sign, and UI Python thingy...	2017-01-25 04:05:52 -02:00
Luna Rood	22ce298d73	General cleanup (unsigned stuff and loop counter inlining)	2017-01-15 16:54:46 -02:00
Luna Rood	3469aa47c1	Remove warnings	2017-01-15 16:54:46 -02:00
Luna Rood	097a560bc9	Fix silly mistake in nearestVert	2017-01-15 16:54:45 -02:00
Luna Rood	1b7623fc06	Change angle function calls in sdef	2017-01-14 01:51:55 -02:00
Luna Rood	c546256563	Change angle function call in 3d to 2d mapping function	2017-01-14 01:51:55 -02:00
Luna Rood	5c263a9050	Split interp_weights_face_v3 into specific functions for tris and quads	2017-01-11 15:52:52 -02:00
Luna Rood	8745cd825a	Remove custom weight interp func in favor of Blender's built-in implementation	2017-01-11 15:52:32 -02:00
Luna Rood	28622ae81e	Fix VS 2015 issue (change isnanf to isnan)	2017-01-11 13:05:53 -02:00
Luna Rood	d6c7163c06	Fix 2d mapping function's name	2017-01-11 03:59:30 -02:00
Luna Rood	0bb57759ec	Replace "cent" functions from math_geom with "mid" ones from math_vector	2017-01-10 20:29:22 -02:00
Luna Rood	5e1d438d5e	Constify some stuff (for clarity and correctness)	2017-01-10 16:42:34 -02:00
Luna Rood	0721bc0ac4	Silly const mistake (missed in refactor...)	2017-01-03 20:02:49 -02:00
Luna Rood	7ca0894a17	Implement target poly influence interpolation	2017-01-03 19:27:08 -02:00
Luna Rood	751496437b	Add 3d to 2d plane mapping functions to math lib	2017-01-03 19:26:03 -02:00
Luna Rood	3014601f3b	Fix out of bounds memory access in interp_weights_face_v3 interp_weights_face_v3 required a length four array for weights even when calculating weights for a tri, otherwise, it would access unkown memory. This fix allows a weight array of size three to be passed when only calculating tri weights.	2017-01-03 19:22:08 -02:00
Luna Rood	b80971ce10	Initial Surface Deform Modifier implementation	2016-11-29 23:04:40 -02:00
Luna Rood	68f5ce194b	Add cent_poly_v3 function	2016-11-27 00:44:48 -02:00
Luna Rood	1e9003aea5	Add is_poly_convex_v3 function	2016-11-25 14:56:09 -02:00
Luna Rood	95701b0b04	Fix (unreported) looptri array not being recalculated in ccgDM and emDM In ccgDM and emDM, looptri array recalculation was being handled directly by `DM_getLoopTriArray` (`getLoopTriArray` callback), while `DM_recalcLoopTri` (`recalcLoopTri` callback) was doing nothing. This results in the array not being recalculated when other functions that depend on the array data are called. These functions, such as `getNumLoopTris`, call `recalcLoopTri` to ensure the data is up to date, but in the case of CCGDerivedMesh that was doing nothing. This moves all the recalculation code to `ccgDM_recalcLoopTri` and makes `ccgDM_getLoopTriArray` call that. Reviewed By: mont29 Differential Revision: https://developer.blender.org/D2375	2016-11-25 14:49:58 -02:00