Merge branch 'blender2.8' into transform-manipulators

Conflicts: intern/gawain/gawain/immediate.h intern/gawain/src/immediate.c source/blender/editors/physics/physics_ops.c source/blender/editors/screen/glutil.c source/blender/editors/space_view3d/space_view3d.c source/blender/editors/space_view3d/view3d_draw.c source/blender/editors/space_view3d/view3d_edit.c source/blender/editors/space_view3d/view3d_ops.c source/blender/editors/transform/transform_manipulator.c
Collada export cleanup
2017-04-04 21:39:57 +02:00 · 2017-04-04 20:44:22 +02:00 · 2017-04-04 19:17:52 +02:00 · 2017-04-04 18:43:01 +02:00 · 2017-04-04 17:45:55 +02:00 · 2017-04-04 17:04:03 +02:00
1394 changed files with 76162 additions and 32883 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,16 +1,24 @@
 [submodule "release/scripts/addons"]
 	path = release/scripts/addons
 	url = ../blender-addons.git
+	branch = blender2.8
 	ignore = all
+	branch = master
 [submodule "release/scripts/addons_contrib"]
 	path = release/scripts/addons_contrib
 	url = ../blender-addons-contrib.git
+	branch = master
 	ignore = all
+	branch = master
 [submodule "release/datafiles/locale"]
 	path = release/datafiles/locale
 	url = ../blender-translations.git
+	branch = master
 	ignore = all
+	branch = master
 [submodule "source/tools"]
 	path = source/tools
 	url = ../blender-dev-tools.git
+	branch = master
 	ignore = all
+	branch = master
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,6 +242,8 @@ endif()
 option(WITH_PLAYER        "Build Player" OFF)
 option(WITH_OPENCOLORIO   "Enable OpenColorIO color management" ${_init_OPENCOLORIO})

+option(WITH_CLAY_ENGINE    "Enable New Clay engine (Breaks Mac and Intel compatibility)" ON)
+
 # Compositor
 option(WITH_COMPOSITOR         "Enable the tile based nodal compositor" ON)

@@ -445,6 +447,7 @@ option(WITH_BOOST					"Enable features depending on boost" ON)

 # Unit testsing
 option(WITH_GTESTS "Enable GTest unit testing" OFF)
+option(WITH_OPENGL_TESTS "Enable OpenGL related unit testing (Experimental)" OFF)


 # Documentation
@@ -471,12 +474,6 @@ mark_as_advanced(
 	WITH_GL_PROFILE_ES20
 )

-if(WITH_GL_PROFILE_COMPAT)
-	set(WITH_GLU ON)
-else()
-	set(WITH_GLU OFF)
-endif()
-
 if(WIN32)
 	option(WITH_GL_ANGLE "Link with the ANGLE library, an OpenGL ES 2.0 implementation based on Direct3D, instead of the system OpenGL library." OFF)
 	mark_as_advanced(WITH_GL_ANGLE)
@@ -517,18 +514,20 @@ endif()
 option(WITH_LEGACY_DEPSGRAPH "Build Blender with legacy dependency graph" ON)
 mark_as_advanced(WITH_LEGACY_DEPSGRAPH)

-# Use hardcoded paths or find_package to find externals
-option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
-mark_as_advanced(WITH_WINDOWS_FIND_MODULES)
+if(WIN32)
+	# Use hardcoded paths or find_package to find externals
+	option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
+	mark_as_advanced(WITH_WINDOWS_FIND_MODULES)

-option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
-mark_as_advanced(WITH_WINDOWS_CODESIGN)
+	option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
+	mark_as_advanced(WITH_WINDOWS_CODESIGN)

-set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
-mark_as_advanced(WINDOWS_CODESIGN_PFX)
+	set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
+	mark_as_advanced(WINDOWS_CODESIGN_PFX)

-set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
-mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)
+	set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
+	mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)
+endif()

 # avoid using again
 option_defaults_clear()
@@ -722,7 +721,7 @@ if(NOT WITH_BOOST)
 	macro(set_and_warn
 		_setting _val)
 		if(${${_setting}})
-			message(STATUS "'WITH_BOOST' is disabled: forceing 'set(${_setting} ${_val})'")
+			message(STATUS "'WITH_BOOST' is disabled: forcing 'set(${_setting} ${_val})'")
 		endif()
 		set(${_setting} ${_val})
 	endmacro()
@@ -923,7 +922,7 @@ if(WITH_X11)
 	if(WITH_X11_ALPHA)
 		find_library(X11_Xrender_LIB Xrender  ${X11_LIB_SEARCH_PATH})
 		mark_as_advanced(X11_Xrender_LIB)
-		if (X11_Xrender_LIB)
+		if(X11_Xrender_LIB)
 			list(APPEND PLATFORM_LINKLIBS ${X11_Xrender_LIB})
 		else()
 			set(WITH_X11_ALPHA OFF)
@@ -1053,11 +1052,6 @@ endif()
 find_package(OpenGL)
 blender_include_dirs_sys("${OPENGL_INCLUDE_DIR}")

-if(WITH_GLU)
-	list(APPEND BLENDER_GL_LIBRARIES "${OPENGL_glu_LIBRARY}")
-	list(APPEND GL_DEFINITIONS -DWITH_GLU)
-endif()
-
 if(WITH_SYSTEM_GLES)
 	find_package_wrapper(OpenGLES)
 endif()
@@ -1285,9 +1279,7 @@ else()

 endif()

-if(NOT WITH_GLU)
-	list(APPEND GL_DEFINITIONS -DGLEW_NO_GLU)
-endif()
+list(APPEND GL_DEFINITIONS -DGLEW_NO_GLU)

 #-----------------------------------------------------------------------------
 # Configure Bullet
@@ -1795,7 +1787,6 @@ if(FIRST_RUN)

 	info_cfg_text("OpenGL:")
 	info_cfg_option(WITH_GLEW_ES)
-	info_cfg_option(WITH_GLU)
 	info_cfg_option(WITH_GL_EGL)
 	info_cfg_option(WITH_GL_PROFILE_COMPAT)
 	info_cfg_option(WITH_GL_PROFILE_CORE)
--- a/8
+++ b/8
@@ -1,4 +1,4 @@
-# -*- mode: gnumakefile; tab-width: 8; indent-tabs-mode: t; -*-
+# -*- mode: gnumakefile; tab-width: 4; indent-tabs-mode: t; -*-
 # vim: tabstop=4
 #
 # ##### BEGIN GPL LICENSE BLOCK #####
@@ -113,7 +113,7 @@ CMAKE_CONFIG = cmake $(BUILD_CMAKE_ARGS) \
 # X11 spesific
 ifdef DISPLAY
 	CMAKE_CONFIG_TOOL = cmake-gui
-else 
+else
 	CMAKE_CONFIG_TOOL = ccmake
 endif

@@ -127,7 +127,7 @@ all: .FORCE
 #	# if test ! -f $(BUILD_DIR)/CMakeCache.txt ; then \
 #	# 	$(CMAKE_CONFIG); \
 #	# fi
-	
+
 #	# do this always incase of failed initial build, could be smarter here...
 	@$(CMAKE_CONFIG)

@@ -402,7 +402,7 @@ update: .FORCE
 		svn update ../lib/* ; \
 	fi
 	git pull --rebase
-	git submodule foreach git pull --rebase origin master
+	git submodule update --remote


 # -----------------------------------------------------------------------------
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -355,7 +355,7 @@ OPENVDB_FORCE_REBUILD=false
 OPENVDB_SKIP=false

 # Alembic needs to be compiled for now
-ALEMBIC_VERSION="1.6.0"
+ALEMBIC_VERSION="1.7.1"
 ALEMBIC_VERSION_MIN=$ALEMBIC_VERSION
 ALEMBIC_FORCE_BUILD=false
 ALEMBIC_FORCE_REBUILD=false
@@ -787,7 +787,7 @@ CXXFLAGS_BACK=$CXXFLAGS
 if [ "$USE_CXX11" = true ]; then
  WARNING "C++11 is now mandatory for blender2.8, this *should* go smoothly with any very recent distribution.
 However, if you are experiencing linking errors (also when building Blender itself), please try the following:
-    * Re-run this script with `--build-all --force-all` options.
+    * Re-run this script with '--build-all --force-all' options.
    * Ensure your gcc version is at the very least 4.8, if possible you should really rather use gcc-5.1 or above.

 Please note that until the transition to C++11-built libraries if completed in your distribution, situation will
@@ -2228,9 +2228,6 @@ compile_ALEMBIC() {
    return
  fi

-  compile_HDF5
-  PRINT ""
-
  # To be changed each time we make edits that would modify the compiled result!
  alembic_magic=2
  _init_alembic
@@ -2258,6 +2255,12 @@ compile_ALEMBIC() {

    cmake_d="-D CMAKE_INSTALL_PREFIX=$_inst"

+    # Without Boost or TR1, Alembic requires C++11.
+    if [ "$USE_CXX11" != true ]; then
+      cmake_d="$cmake_d -D ALEMBIC_LIB_USES_BOOST=ON"
+      cmake_d="$cmake_d -D ALEMBIC_LIB_USES_TR1=OFF"
+    fi
+
    if [ -d $INST/boost ]; then
      cmake_d="$cmake_d -D BOOST_ROOT=$INST/boost"
      cmake_d="$cmake_d -D USE_STATIC_BOOST=ON"
@@ -2277,8 +2280,6 @@ compile_ALEMBIC() {
      cmake_d="$cmake_d -D USE_STATIC_HDF5=OFF"
      cmake_d="$cmake_d -D ALEMBIC_ILMBASE_LINK_STATIC=OFF"
      cmake_d="$cmake_d -D ALEMBIC_SHARED_LIBS=OFF"
-      cmake_d="$cmake_d -D ALEMBIC_LIB_USES_BOOST=ON"
-      cmake_d="$cmake_d -D ALEMBIC_LIB_USES_TR1=OFF"
      INFO "ILMBASE_ROOT=$INST/openexr"
    fi

@@ -4244,7 +4245,7 @@ print_info() {
    PRINT "  $_3"
    _buildargs="$_buildargs $_1 $_2 $_3"
    if [ -d $INST/osl ]; then
-      _1="-D CYCLES_OSL=$INST/osl"
+      _1="-D OSL_ROOT_DIR=$INST/osl"
      PRINT "  $_1"
      _buildargs="$_buildargs $_1"
    fi
--- a/build_files/buildbot/master.cfg
+++ b/build_files/buildbot/master.cfg
@@ -4,10 +4,10 @@
 # <pep8 compliant>

 # List of the branches being built automatically overnight
-NIGHT_SCHEDULE_BRANCHES = [None]
+NIGHT_SCHEDULE_BRANCHES = [None, "blender2.8"]

 # List of the branches available for force build
-FORCE_SCHEDULE_BRANCHES = ["master", "gooseberry", "experimental-build"]
+FORCE_SCHEDULE_BRANCHES = ["master", "blender2.8", "experimental-build"]

 """
 Stock Twisted directory lister doesn't provide any information about last file
@@ -127,7 +127,14 @@ def schedule_force_build(name):
                project=forcesched.FixedParameter(name="project", default="", hide=True)),
            # For now, hide other codebases.
            forcesched.CodebaseParameter(hide=True, codebase="blender-translations"),
-            forcesched.CodebaseParameter(hide=True, codebase="blender-addons"),
+            forcesched.CodebaseParameter(
+                codebase="blender-addons",
+                branch=forcesched.ChoiceStringParameter(
+                    name="branch", choices=["master", "blender2.8"], default="master"),
+                repository=forcesched.FixedParameter(name="repository", default="", hide=True),
+                project=forcesched.FixedParameter(name="project", default="", hide=True),
+                revision=forcesched.FixedParameter(name="revision", default="", hide=True),
+            ),
            forcesched.CodebaseParameter(hide=True, codebase="blender-addons-contrib"),
            forcesched.CodebaseParameter(hide=True, codebase="blender-dev-tools"),
            forcesched.CodebaseParameter(hide=True, codebase="lib svn")],
@@ -139,11 +146,15 @@ def schedule_build(name, hour, minute=0):
        scheduler_name = "nightly " + name
        if current_branch:
            scheduler_name += ' ' + current_branch
+        # Use special addons submodule branch when building blender2.8 branch.
+        addons_branch = "master"
+        if current_branch == "blender2.8":
+            addons_branch = "blender2.8"
        c['schedulers'].append(timed.Nightly(name=scheduler_name,
            codebases={
                "blender": {"repository": ""},
                "blender-translations": {"repository": "", "branch": "master"},
-                "blender-addons": {"repository": "", "branch": "master"},
+                "blender-addons": {"repository": "", "branch": addons_branch},
                "blender-addons-contrib": {"repository": "", "branch": "master"},
                "blender-dev-tools": {"repository": "", "branch": "master"},
                "lib svn": {"repository": "", "branch": "trunk"}},
@@ -225,8 +236,7 @@ def git_step(branch=''):


 def git_submodules_update():
-    command = ['git', 'submodule', 'foreach', '--recursive',
-               'git', 'pull', 'origin', 'master']
+    command = ['git', 'submodule', 'update', '--remote']
    return ShellCommand(name='Submodules Update',
                        command=command,
                        description='updating',
@@ -235,7 +245,10 @@ def git_submodules_update():


 def lib_svn_step(dir):
-    return SVN(name='lib svn',
+    name = "lib svn"
+    if dir == "darwin":
+        name = "C++11 lib svn"
+    return SVN(name=name,
               baseURL='https://svn.blender.org/svnroot/bf-blender/%%BRANCH%%/lib/' + dir,
               codebase='lib svn',
               mode='update',
@@ -264,6 +277,9 @@ def generic_builder(id, libdir='', branch='', rsync=False):
    f = BuildFactory()
    if libdir != '':
        f.addStep(lib_svn_step(libdir))
+        # Special trick to make sure we always have all the libs.
+        if libdir.startswith("darwin"):
+            f.addStep(lib_svn_step("darwin"))

    for submodule in ('blender-translations',
                      'blender-addons',
@@ -286,7 +302,7 @@ def generic_builder(id, libdir='', branch='', rsync=False):
        f.addStep(FileUpload(name='upload',
                             slavesrc='buildbot_upload.zip',
                             masterdest=filename,
-                             maxsize=150 * 1024 * 1024,
+                             maxsize=180 * 1024 * 1024,
                             workdir='install'))
    f.addStep(MasterShellCommand(name='unpack',
                                 command=['python2.7', unpack_script, filename],
--- a/build_files/buildbot/master_unpack.py
+++ b/build_files/buildbot/master_unpack.py
@@ -67,6 +67,9 @@ def get_platform(filename):


 def get_branch(filename):
+    if filename.startswith("blender-2.8"):
+        return "blender2.8"
+
    tokens = filename.split("-")
    branch = ""

--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -72,10 +72,8 @@ if 'cmake' in builder:
        # Set up OSX architecture
        if builder.endswith('x86_64_10_6_cmake'):
            cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64')
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
        cmake_extra_options.append('-DWITH_CODEC_QUICKTIME=OFF')
        cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.6')
-        build_cubins = False


    elif builder.startswith('win'):
@@ -93,7 +91,6 @@ if 'cmake' in builder:
            elif builder.startswith('win32'):
                bits = 32
                cmake_options.extend(['-G', 'Visual Studio 12 2013'])
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')

    elif builder.startswith('linux'):
        tokens = builder.split("_")
@@ -113,8 +110,6 @@ if 'cmake' in builder:
            cuda_chroot_name = 'buildbot_' + deb_name + '_x86_64'
            targets = ['player', 'blender', 'cuda']

-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')
-
    cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file))

    # Prepare CMake options needed to configure cuda binaries compilation.
--- a/build_files/buildbot/slave_pack.py
+++ b/build_files/buildbot/slave_pack.py
@@ -111,7 +111,8 @@ if builder.find('cmake') != -1:
        if builder.endswith('vc2015'):
            platform += "-vc14"
        builderified_name = 'blender-{}-{}-{}'.format(blender_full_version, git_hash, platform)
-        if branch != '':
+        # NOTE: The blender2.8 branch is already reflected in blender_full_version, so no branch prefix is added.
+        if branch != '' and branch != 'blender2.8':
            builderified_name = branch + "-" + builderified_name

        os.rename(result_file, "{}.zip".format(builderified_name))
@@ -177,7 +178,8 @@ if builder.find('cmake') != -1:
                                                      blender_hash,
                                                      blender_glibc,
                                                      blender_arch)
-        if branch != '':
+        # NOTE: The blender2.8 branch is already reflected in blender_full_version, so no branch prefix is added.
+        if branch != '' and branch != 'blender2.8':
            package_name = branch + "-" + package_name

        upload_filename = package_name + ".tar.bz2"
--- a/build_files/cmake/Modules/GTestTesting.cmake
+++ b/build_files/cmake/Modules/GTestTesting.cmake
@@ -45,7 +45,7 @@ macro(BLENDER_SRC_GTEST_EX NAME SRC EXTRA_LIBS DO_ADD_TEST)
 		                      RUNTIME_OUTPUT_DIRECTORY_DEBUG   "${TESTS_OUTPUT_DIR}"
 		                      INCLUDE_DIRECTORIES              "${TEST_INC}")
 		if(${DO_ADD_TEST})
-			add_test(${NAME}_test ${TESTS_OUTPUT_DIR}/${NAME}_test)
+			add_test(NAME ${NAME}_test COMMAND ${TESTS_OUTPUT_DIR}/${NAME}_test WORKING_DIRECTORY $<TARGET_FILE_DIR:blender>)
 		endif()
 	endif()
 endmacro()
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -56,7 +56,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
 				string(REGEX REPLACE "[\r\n]+" ";" _git_contains_branches "${_git_contains_branches}")
 				string(REGEX REPLACE ";[ \t]+" ";" _git_contains_branches "${_git_contains_branches}")
 				foreach(_branch ${_git_contains_branches})
-					if (NOT "${_branch}" MATCHES "\\(HEAD.*")
+					if(NOT "${_branch}" MATCHES "\\(HEAD.*")
 						set(MY_WC_BRANCH "${_branch}")
 						break()
 					endif()
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -416,14 +416,7 @@ function(setup_liblinks
 		target_link_libraries(${target} ${OPENCOLORIO_LIBRARIES})
 	endif()
 	if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV)
-		if(WIN32 AND NOT UNIX)
-			file_list_suffix(OPENSUBDIV_LIBRARIES_DEBUG "${OPENSUBDIV_LIBRARIES}" "_d")
-			target_link_libraries_debug(${target} "${OPENSUBDIV_LIBRARIES_DEBUG}")
-			target_link_libraries_optimized(${target} "${OPENSUBDIV_LIBRARIES}")
-			unset(OPENSUBDIV_LIBRARIES_DEBUG)
-		else()
 			target_link_libraries(${target} ${OPENSUBDIV_LIBRARIES})
-		endif()
 	endif()
 	if(WITH_OPENVDB)
 		target_link_libraries(${target} ${OPENVDB_LIBRARIES} ${TBB_LIBRARIES})
@@ -604,6 +597,8 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		bf_modifiers
 		bf_bmesh
 		bf_gpu
+		bf_draw
+		bf_intern_gawain
 		bf_blenloader
 		bf_blenkernel
 		bf_physics
@@ -1581,24 +1576,28 @@ macro(openmp_delayload
 endmacro()

+# Sign the binary produced by `target` with signtool as a POST_BUILD step.
+# Does nothing unless WITH_WINDOWS_CODESIGN is enabled. The PFX password comes from
+# WINDOWS_CODESIGN_PFX_PASSWORD or, if that is unset, the PFXPASSWORD environment variable.
 MACRO(WINDOWS_SIGN_TARGET target)
-	if (WITH_WINDOWS_CODESIGN)
-		if (!SIGNTOOL_EXE)
-			error("Codesigning is enabled, but signtool is not found")
+	if(WITH_WINDOWS_CODESIGN)
+		if(NOT SIGNTOOL_EXE)
+			message(FATAL_ERROR "Codesigning is enabled, but signtool is not found")
 		else()
-			if (WINDOWS_CODESIGN_PFX_PASSWORD)
+			if(WINDOWS_CODESIGN_PFX_PASSWORD)
 				set(CODESIGNPASSWORD /p ${WINDOWS_CODESIGN_PFX_PASSWORD})
 			else()
-				if ($ENV{PFXPASSWORD})
+				# Test for a non-empty value; if($ENV{...}) would evaluate the password text as a condition.
+				if(NOT "$ENV{PFXPASSWORD}" STREQUAL "")
 					set(CODESIGNPASSWORD /p $ENV{PFXPASSWORD})
 				else()
-					message( FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
+					message(FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
 				endif()
 			endif()
 			add_custom_command(TARGET ${target}
-						POST_BUILD
-						COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
-						VERBATIM
-				)
+				POST_BUILD
+				COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
+				VERBATIM
+			)
 		endif()
 	endif()
 ENDMACRO()
--- a/build_files/cmake/packaging.cmake
+++ b/build_files/cmake/packaging.cmake
@@ -1,5 +1,7 @@
-set(PROJECT_DESCRIPTION  "Blender is a very fast and versatile 3D modeller/renderer.")
-set(PROJECT_COPYRIGHT    "Copyright (C) 2001-2012 Blender Foundation")
+string(TIMESTAMP CURRENT_YEAR "%Y")
+
+set(PROJECT_DESCRIPTION  "Blender is the free and open source 3D creation suite software.")
+set(PROJECT_COPYRIGHT    "Copyright (C) 2001-${CURRENT_YEAR} Blender Foundation")
 set(PROJECT_CONTACT      "foundation@blender.org")
 set(PROJECT_VENDOR       "Blender Foundation")

@@ -38,8 +40,8 @@ unset(MY_WC_HASH)
 # Force Package Name
 execute_process(COMMAND date "+%Y%m%d" OUTPUT_VARIABLE CPACK_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
 string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
-if (MSVC)
-	if ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+if(MSVC)
+	if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 		set(PACKAGE_ARCH windows64)
 	else()
 		set(PACKAGE_ARCH windows32)
@@ -48,7 +50,7 @@ else(MSVC)
 	set(PACKAGE_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 endif()

-if (CPACK_OVERRIDE_PACKAGENAME)
+if(CPACK_OVERRIDE_PACKAGENAME)
 	set(CPACK_PACKAGE_FILE_NAME ${CPACK_OVERRIDE_PACKAGENAME}-${PACKAGE_ARCH})
 else()
 	set(CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME_LOWER}-${MAJOR_VERSION}.${MINOR_VERSION}.${PATCH_VERSION}-git${CPACK_DATE}.${BUILD_REV}-${PACKAGE_ARCH})
@@ -135,4 +137,3 @@ unset(MINOR_VERSION)
 unset(PATCH_VERSION)

 unset(BUILD_REV)
-
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -23,6 +23,8 @@

 # Libraries configuration for Apple.

+set(MACOSX_DEPLOYMENT_TARGET "10.9")
+
 if(NOT DEFINED LIBDIR)
 	if(WITH_CXX11)
 		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin)
--- a/build_files/cmake/platform/platform_win32_msvc.cmake
+++ b/build_files/cmake/platform/platform_win32_msvc.cmake
@@ -33,7 +33,7 @@ endmacro()
 macro(windows_find_package package_name
 	)
 	if(WITH_WINDOWS_FIND_MODULES)
-		find_package( ${package_name})
+		find_package(${package_name})
 	endif(WITH_WINDOWS_FIND_MODULES)
 endmacro()

@@ -446,10 +446,23 @@ if(WITH_MOD_CLOTH_ELTOPO)
 endif()

 if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV)
 	set(OPENSUBDIV_INCLUDE_DIR ${LIBDIR}/opensubdiv/include)
 	set(OPENSUBDIV_LIBPATH ${LIBDIR}/opensubdiv/lib)
-	set(OPENSUBDIV_LIBRARIES ${OPENSUBDIV_LIBPATH}/osdCPU.lib ${OPENSUBDIV_LIBPATH}/osdGPU.lib)
-	find_package(OpenSubdiv)
+	set(OPENSUBDIV_LIBRARIES
+		optimized ${OPENSUBDIV_LIBPATH}/osdCPU.lib
+		optimized ${OPENSUBDIV_LIBPATH}/osdGPU.lib
+		debug ${OPENSUBDIV_LIBPATH}/osdCPU_d.lib
+		debug ${OPENSUBDIV_LIBPATH}/osdGPU_d.lib
+	)
+	# Feature flags are set by hand here; windows_find_package() only calls
+	# find_package() when WITH_WINDOWS_FIND_MODULES is on.
+	set(OPENSUBDIV_HAS_OPENMP TRUE)
+	set(OPENSUBDIV_HAS_TBB FALSE)
+	set(OPENSUBDIV_HAS_OPENCL TRUE)
+	set(OPENSUBDIV_HAS_CUDA FALSE)
+	set(OPENSUBDIV_HAS_GLSL_TRANSFORM_FEEDBACK TRUE)
+	set(OPENSUBDIV_HAS_GLSL_COMPUTE TRUE)
+	windows_find_package(OpenSubdiv)
 endif()

 if(WITH_SDL)
--- a/doc/python_api/examples/gpu.offscreen.1.py
+++ b/doc/python_api/examples/gpu.offscreen.1.py
@@ -52,6 +52,7 @@ class OffScreenDraw(bpy.types.Operator):
    @staticmethod
    def _update_offscreen(context, offscreen):
        scene = context.scene
+        render_layer = context.render_layer
        render = scene.render
        camera = scene.camera

@@ -65,6 +66,7 @@ class OffScreenDraw(bpy.types.Operator):

        offscreen.draw_view3d(
                scene,
+                render_layer,
                context.space_data,
                context.region,
                projection_matrix,
--- a/doc/python_api/rst/bge.texture.rst
+++ b/doc/python_api/rst/bge.texture.rst
@@ -681,7 +681,7 @@ Image classes

   .. attribute:: zbuff

-      Use depth component of render as grey scale color -  suitable for texture source.
+      Use depth component of render as grayscale color - suitable for texture source.

      :type: bool

@@ -817,7 +817,7 @@ Image classes

   .. attribute:: zbuff

-      Use depth component of viewport as grey scale color - suitable for texture source.
+      Use depth component of viewport as grayscale color - suitable for texture source.

      :type: bool

@@ -1260,8 +1260,8 @@ Filter classes

 .. class:: FilterGray

-   Filter for gray scale effect.
-   Proportions of R, G and B contributions in the output gray scale are 28:151:77.
+   Filter for grayscale effect.
+   Proportions of R, G and B contributions in the output grayscale are 28:151:77.

   .. attribute:: previous

--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -427,9 +427,9 @@ if BLENDER_REVISION != "Unknown":
    BLENDER_VERSION_DOTS += " " + BLENDER_REVISION          # '2.62.1 SHA1'

 BLENDER_VERSION_PATH = "_".join(blender_version_strings)    # '2_62_1'
-if bpy.app.version_cycle == "release":
-    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]),
-                                             bpy.app.version_char)   # '2_62_release'
+if bpy.app.version_cycle in {"rc", "release"}:
+    # '2_62a_release'
+    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]), bpy.app.version_char)

 # --------------------------DOWNLOADABLE FILES----------------------------------

@@ -1024,6 +1024,7 @@ context_type_map = {
    "brush": ("Brush", False),
    "camera": ("Camera", False),
    "cloth": ("ClothModifier", False),
+    "collection": ("LayerCollection", False),
    "collision": ("CollisionModifier", False),
    "curve": ("Curve", False),
    "dynamic_paint": ("DynamicPaintModifier", False),
@@ -1055,6 +1056,7 @@ context_type_map = {
    "particle_system": ("ParticleSystem", False),
    "particle_system_editable": ("ParticleSystem", False),
    "pose_bone": ("PoseBone", False),
+    "render_layer": ("SceneLayer", False),
    "scene": ("Scene", False),
    "sculpt_object": ("Object", False),
    "selectable_bases": ("ObjectBase", True),
--- a/doc/python_api/sphinx_doc_update.py
+++ b/doc/python_api/sphinx_doc_update.py
@@ -96,6 +96,11 @@ def main():

    rsync_base = "rsync://%s@%s:%s" % (args.user, args.rsync_server, args.rsync_root)

+    blenver = blenver_zip = ""
+    api_name = ""
+    branch = ""
+    is_release = False
+
    # I) Update local mirror using rsync.
    rsync_mirror_cmd = ("rsync", "--delete-after", "-avzz", rsync_base, args.mirror_dir)
    subprocess.run(rsync_mirror_cmd, env=dict(os.environ, RSYNC_PASSWORD=args.password))
@@ -108,19 +113,24 @@ def main():
        subprocess.run(doc_gen_cmd)

        # III) Get Blender version info.
-        blenver = blenver_zip = ""
        getver_file = os.path.join(tmp_dir, "blendver.txt")
        getver_script = (""
            "import sys, bpy\n"
            "with open(sys.argv[-1], 'w') as f:\n"
-            "    f.write('%d_%d%s_release\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
-            "            if bpy.app.version_cycle in {'rc', 'release'} else '%d_%d_%d\\n' % bpy.app.version)\n"
-            "    f.write('%d_%d_%d' % bpy.app.version)\n")
+            "    is_release = bpy.app.version_cycle in {'rc', 'release'}\n"
+            "    branch = bpy.app.build_branch.split()[0].decode()\n"
+            "    f.write('%d\\n' % is_release)\n"
+            "    f.write('%s\\n' % branch)\n"
+            "    f.write('%d.%d%s\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
+            "            if is_release else '%s\\n' % branch)\n"
+            "    f.write('%d_%d%s_release' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
+            "            if is_release else '%d_%d_%d' % bpy.app.version)\n")
        get_ver_cmd = (args.blender, "--background", "-noaudio", "--factory-startup", "--python-exit-code", "1",
                       "--python-expr", getver_script, "--", getver_file)
        subprocess.run(get_ver_cmd)
        with open(getver_file) as f:
-            blenver, blenver_zip = f.read().split("\n")
+            is_release, branch, blenver, blenver_zip = f.read().split("\n")
+            is_release = bool(int(is_release))
        os.remove(getver_file)

        # IV) Build doc.
@@ -132,7 +142,7 @@ def main():
        os.chdir(curr_dir)

        # V) Cleanup existing matching dir in server mirror (if any), and copy new doc.
-        api_name = "blender_python_api_%s" % blenver
+        api_name = blenver
        api_dir = os.path.join(args.mirror_dir, api_name)
        if os.path.exists(api_dir):
            shutil.rmtree(api_dir)
@@ -150,19 +160,15 @@ def main():
    os.rename(zip_path, os.path.join(api_dir, "%s.zip" % zip_name))

    # VII) Create symlinks and html redirects.
-    #~ os.symlink(os.path.join(DEFAULT_SYMLINK_ROOT, api_name, "contents.html"), os.path.join(api_dir, "index.html"))
    os.symlink("./contents.html", os.path.join(api_dir, "index.html"))
-    if blenver.endswith("release"):
-        symlink = os.path.join(args.mirror_dir, "blender_python_api_current")
+    if is_release:
+        symlink = os.path.join(args.mirror_dir, "current")
        os.remove(symlink)
        os.symlink("./%s" % api_name, symlink)
        with open(os.path.join(args.mirror_dir, "250PythonDoc/index.html"), 'w') as f:
            f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                    "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
-    else:
-        symlink = os.path.join(args.mirror_dir, "blender_python_api_master")
-        os.remove(symlink)
-        os.symlink("./%s" % api_name, symlink)
+    elif branch == "master":
        with open(os.path.join(args.mirror_dir, "blender_python_api/index.html"), 'w') as f:
            f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                    "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
--- a/extern/clew/README.blender
+++ b/extern/clew/README.blender
@@ -1,5 +1,5 @@
 Project: OpenCL Wrangler
 URL: https://github.com/OpenCLWrangler/clew
 License: Apache 2.0
-Upstream version: 309a653
+Upstream version: 27a6867
 Local modifications: None
--- a/extern/clew/include/clew.h
+++ b/extern/clew/include/clew.h
@@ -369,7 +369,7 @@ typedef unsigned int cl_GLenum;
 #endif

 /* Define basic vector types */
-/* WOrkaround for ppc64el platform: conflicts with bool from C++. */
+/* Workaround for ppc64el platform: conflicts with bool from C++. */
 #if defined( __VEC__ ) && !(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
   typedef vector unsigned char     __cl_uchar16;
@@ -2765,11 +2765,40 @@ CLEW_FUN_EXPORT     PFNCLGETGLCONTEXTINFOKHR            __clewGetGLContextInfoKH
 #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV   0x4007
+#define CL_DEVICE_PCI_BUS_ID_NV                     0x4008
+#define CL_DEVICE_PCI_SLOT_ID_NV                    0x4009

 /*********************************
 * cl_amd_device_attribute_query *
 *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+#define CL_DEVICE_TOPOLOGY_AMD                      0x4037
+#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD    0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD  0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
+#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD        0x4049
+#define CL_DEVICE_GFXIP_MAJOR_AMD                   0x404A
+#define CL_DEVICE_GFXIP_MINOR_AMD                   0x404B
+#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD        0x404C
+
+#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
+#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1
+
+typedef union
+{
+    struct { cl_uint type; cl_uint data[5]; } raw;
+    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
+} cl_device_topology_amd;
+#endif

 /*********************************
 * cl_arm_printf extension
--- a/extern/clew/src/clew.c
+++ b/extern/clew/src/clew.c
@@ -15,7 +15,7 @@

    typedef HMODULE             CLEW_DYNLIB_HANDLE;

-    #define CLEW_DYNLIB_OPEN    LoadLibrary
+    #define CLEW_DYNLIB_OPEN    LoadLibraryA
    #define CLEW_DYNLIB_CLOSE   FreeLibrary
    #define CLEW_DYNLIB_IMPORT  GetProcAddress
 #else
@@ -223,7 +223,7 @@ int clewInit()
    __clewSetCommandQueueProperty       = (PFNCLSETCOMMANDQUEUEPROPERTY     )CLEW_DYNLIB_IMPORT(module, "clSetCommandQueueProperty");
 #endif
    __clewCreateBuffer                  = (PFNCLCREATEBUFFER                )CLEW_DYNLIB_IMPORT(module, "clCreateBuffer");
-    __clewCreateSubBuffer               = (PFNCLCREATESUBBUFFER             )CLEW_DYNLIB_IMPORT(module, "clCreateBuffer");
+    __clewCreateSubBuffer               = (PFNCLCREATESUBBUFFER             )CLEW_DYNLIB_IMPORT(module, "clCreateSubBuffer");
    __clewCreateImage                   = (PFNCLCREATEIMAGE                 )CLEW_DYNLIB_IMPORT(module, "clCreateImage");
    __clewRetainMemObject               = (PFNCLRETAINMEMOBJECT             )CLEW_DYNLIB_IMPORT(module, "clRetainMemObject");
    __clewReleaseMemObject              = (PFNCLRELEASEMEMOBJECT            )CLEW_DYNLIB_IMPORT(module, "clReleaseMemObject");
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -114,7 +114,7 @@ extern "C" {
 #define cuGLGetDevices cuGLGetDevices_v2

 /* Types. */
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
 typedef unsigned long long CUdeviceptr;
 #else
 typedef unsigned int CUdeviceptr;
--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -33,8 +33,9 @@ add_subdirectory(opencolorio)
 add_subdirectory(mikktspace)
 add_subdirectory(glew-mx)
 add_subdirectory(eigen)
+add_subdirectory(gawain)

-if (WITH_GAMEENGINE_DECKLINK)
+if(WITH_GAMEENGINE_DECKLINK)
 	add_subdirectory(decklink)
 endif()

@@ -62,7 +63,7 @@ if(WITH_IK_ITASC)
 	add_subdirectory(itasc)
 endif()

-if(WITH_IK_SOLVER OR WITH_GAMEENGINE OR WITH_MOD_BOOLEAN)
+if(WITH_GAMEENGINE)
 	add_subdirectory(moto)
 endif()

--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -101,11 +101,11 @@ ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new);

-ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);
+ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new);

 /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
 *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -113,58 +113,58 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)

 /******************************************************************************/
 /* unsigned operations. */
-ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }

-ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }

-ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }

-ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }

-ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
+ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);

 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
+	return (unsigned int)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
+	return (unsigned int)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
 #endif
 }

--- a/intern/audaspace/intern/AUD_SoftwareDevice.cpp
+++ b/intern/audaspace/intern/AUD_SoftwareDevice.cpp
@@ -365,6 +365,7 @@ bool AUD_SoftwareDevice::AUD_SoftwareHandle::seek(float position)
 	if(!m_status)
 		return false;

+	m_pitch->setPitch(m_user_pitch);
 	m_reader->seek((int)(position * m_reader->getSpecs().rate));

 	if(m_status == AUD_STATUS_STOPPED)
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -22,6 +22,7 @@ if(WITH_CYCLES_NATIVE_ONLY)
 		-DWITH_KERNEL_NATIVE
 	)
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+	set(CYCLES_KERNEL_FLAGS "-march=native")
 elseif(NOT WITH_CPU_SSE)
 	set(CXX_HAS_SSE FALSE)
 	set(CXX_HAS_AVX FALSE)
@@ -59,10 +60,13 @@ elseif(WIN32 AND MSVC)
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
+
+	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 elseif(CMAKE_COMPILER_IS_GNUCC)
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
@@ -74,10 +78,12 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
@@ -89,6 +95,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 endif()

 if(CXX_HAS_SSE)
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -1,14 +1,6 @@

 set(INC
-	.
-	../bvh
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../subd
-	../util
+	..
 )
 set(INC_SYS
 )
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -16,15 +16,15 @@

 #include <stdio.h>

-#include "device.h"
+#include "device/device.h"

-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_task.h"
-#include "util_logging.h"
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
+#include "util/util_logging.h"

 using namespace ccl;

--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,29 +16,29 @@

 #include <stdio.h>

-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "integrator.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/integrator.h"

-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_transform.h"
-#include "util_version.h"
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_transform.h"
+#include "util/util_version.h"

 #ifdef WITH_CYCLES_STANDALONE_GUI
-#include "util_view.h"
+#include "util/util_view.h"
 #endif

-#include "cycles_xml.h"
+#include "app/cycles_xml.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,31 +20,31 @@
 #include <algorithm>
 #include <iterator>

-#include "node_xml.h"
+#include "graph/node_xml.h"

-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "shader.h"
-#include "scene.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/shader.h"
+#include "render/scene.h"

-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_transform.h"
-#include "util_xml.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_transform.h"
+#include "util/util_xml.h"

-#include "cycles_xml.h"
+#include "app/cycles_xml.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -1,12 +1,6 @@

 set(INC
-	../graph
-	../render
-	../device
-	../kernel
-	../kernel/svm
-	../util
-	../subd
+	..
 	../../glew-mx
 	../../guardedalloc
 	../../mikktspace
--- a/intern/cycles/blender/addon/init.py
+++ b/intern/cycles/blender/addon/init.py
@@ -107,7 +107,13 @@ def engine_exit():
    engine.exit()


+classes = (
+    CyclesRender,
+)
+
+
 def register():
+    from bpy.utils import register_class
    from . import ui
    from . import properties
    from . import presets
@@ -122,12 +128,15 @@ def register():
    properties.register()
    ui.register()
    presets.register()
-    bpy.utils.register_module(__name__)
+
+    for cls in classes:
+        register_class(cls)

    bpy.app.handlers.version_update.append(version_update.do_versions)


 def unregister():
+    from bpy.utils import unregister_class
    from . import ui
    from . import properties
    from . import presets
@@ -138,4 +147,6 @@ def unregister():
    ui.unregister()
    properties.unregister()
    presets.unregister()
-    bpy.utils.unregister_module(__name__)
+
+    for cls in classes:
+        unregister_class(cls)
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -50,6 +50,24 @@ def _workaround_buggy_drivers():
            _cycles.opencl_disable()


+def _configure_argument_parser():
+    import argparse
+    parser = argparse.ArgumentParser(description="Cycles Addon argument parser")
+    parser.add_argument("--cycles-resumable-num-chunks",
+                        help="Number of chunks to split sample range into",
+                        default=None)
+    parser.add_argument("--cycles-resumable-current-chunk",
+                        help="Current chunk of samples range to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-start-chunk",
+                        help="Start chunk to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-end-chunk",
+                        help="End chunk to render",
+                        default=None)
+    return parser
+
+
 def _parse_command_line():
    import sys

@@ -57,25 +75,22 @@ def _parse_command_line():
    if "--" not in argv:
        return

-    argv = argv[argv.index("--") + 1:]
+    parser = _configure_argument_parser()
+    args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:])

-    num_resumable_chunks = None
-    current_resumable_chunk = None
-
-    # TODO(sergey): Add some nice error ptins if argument is not used properly.
-    idx = 0
-    while idx < len(argv) - 1:
-        arg = argv[idx]
-        if arg == '--cycles-resumable-num-chunks':
-            num_resumable_chunks = int(argv[idx + 1])
-        elif arg == '--cycles-resumable-current-chunk':
-            current_resumable_chunk = int(argv[idx + 1])
-        idx += 1
-
-    if num_resumable_chunks is not None and current_resumable_chunk is not None:
-        import _cycles
-        _cycles.set_resumable_chunks(num_resumable_chunks,
-                                     current_resumable_chunk)
+    if args.cycles_resumable_num_chunks is not None:
+        if args.cycles_resumable_current_chunk is not None:
+            import _cycles
+            _cycles.set_resumable_chunk(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_current_chunk))
+        elif args.cycles_resumable_start_chunk is not None and \
+             args.cycles_resumable_end_chunk is not None:
+            import _cycles
+            _cycles.set_resumable_chunk_range(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_start_chunk),
+                    int(args.cycles_resumable_end_chunk))


 def init():
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -82,12 +82,23 @@ class AddPresetSampling(AddPresetBase, Operator):
    preset_subdir = "cycles/sampling"


+classes = (
+    AddPresetIntegrator,
+    AddPresetSampling,
+)
+
+
 def register():
-    pass
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)


 def unregister():
-    pass
+    from bpy.utils import unregister_class
+    for cls in classes:
+        unregister_class(cls)
+

 if __name__ == "__main__":
    register()
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -638,6 +638,20 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
            items=enum_texture_limit
            )

+        cls.ao_bounces = IntProperty(
+            name="AO Bounces",
+            default=0,
+            description="Approximate indirect light with background tinted ambient occlusion at the specified bounce, 0 disables this feature",
+            min=0, max=1024,
+            )
+
+        cls.ao_bounces_render = IntProperty(
+            name="AO Bounces Render",
+            default=0,
+            description="Approximate indirect light with background tinted ambient occlusion at the specified bounce, 0 disables this feature",
+            min=0, max=1024,
+            )
+
        # Various fine-tuning debug flags

        def devices_update_callback(self, context):
@@ -651,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
        cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
        cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)

        cls.debug_opencl_kernel_type = EnumProperty(
            name="OpenCL Kernel Type",
@@ -679,6 +695,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
            update=devices_update_callback
            )

+        cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback)
+
        cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)

    @classmethod
@@ -1078,6 +1096,12 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
                default=1.0,
                )

+        cls.is_shadow_catcher = BoolProperty(
+                name="Shadow Catcher",
+                description="Only render shadows on this object, for compositing renders into real footage",
+                default=False,
+                )
+
    @classmethod
    def unregister(cls):
        del bpy.types.Object.cycles
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -86,12 +86,10 @@ def use_sample_all_lights(context):

    return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect

-def show_device_selection(context):
-    type = get_device_type(context)
-    if type == 'NETWORK':
+def show_device_active(context):
+    cscene = context.scene.cycles
+    if cscene.device != 'GPU':
        return True
-    if not type in {'CUDA', 'OPENCL'}:
-        return False
    return context.user_preferences.addons[__package__].preferences.has_active_device()


@@ -186,9 +184,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
            sub.label(text="AA Samples:")
            sub.prop(cscene, "aa_samples", text="Render")
            sub.prop(cscene, "preview_aa_samples", text="Preview")
-            sub.separator()
-            sub.prop(cscene, "sample_all_lights_direct")
-            sub.prop(cscene, "sample_all_lights_indirect")

            col = split.column()
            sub = col.column(align=True)
@@ -205,6 +200,10 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
            sub.prop(cscene, "subsurface_samples", text="Subsurface")
            sub.prop(cscene, "volume_samples", text="Volume")

+            col = layout.column(align=True)
+            col.prop(cscene, "sample_all_lights_direct")
+            col.prop(cscene, "sample_all_lights_indirect")
+
        if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'):
            layout.row().prop(cscene, "sampling_pattern", text="Pattern")

@@ -270,7 +269,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):

        row = col.row()
        row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
+        row.prop(ccscene, "maximum_width", text="Max Extension")


 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -788,6 +787,8 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
        if ob.type != 'LAMP':
            flow.prop(visibility, "shadow")

+        layout.prop(cob, "is_shadow_catcher")
+
        col = layout.column()
        col.label(text="Performance:")
        row = col.row()
@@ -1038,10 +1039,11 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
        layout = self.layout

        light = context.world.light_settings
+        scene = context.scene

        row = layout.row()
        sub = row.row()
-        sub.active = light.use_ambient_occlusion
+        sub.active = light.use_ambient_occlusion or scene.render.use_simplify
        sub.prop(light, "ao_factor", text="Factor")
        row.prop(light, "distance", text="Distance")

@@ -1517,15 +1519,18 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
        row.prop(cscene, "debug_use_cpu_avx", toggle=True)
        row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
        col.prop(cscene, "debug_use_qbvh")
+        col.prop(cscene, "debug_use_cpu_split_kernel")

        col = layout.column()
        col.label('CUDA Flags:')
        col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")

        col = layout.column()
        col.label('OpenCL Flags:')
        col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
        col.prop(cscene, "debug_opencl_device_type", text="Device")
+        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
        col.prop(cscene, "debug_use_opencl_debug", text="Debug")


@@ -1613,6 +1618,13 @@ class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
        row.active = cscene.use_distance_cull
        row.prop(cscene, "distance_cull_margin", text="Distance")

+        split = layout.split()
+        col = split.column()
+        col.prop(cscene, "ao_bounces")
+
+        col = split.column()
+        col.prop(cscene, "ao_bounces_render")
+
 def draw_device(self, context):
    scene = context.scene
    layout = self.layout
@@ -1626,7 +1638,7 @@ def draw_device(self, context):
        split = layout.split(percentage=1/3)
        split.label("Device:")
        row = split.row()
-        row.active = show_device_selection(context)
+        row.active = show_device_active(context)
        row.prop(cscene, "device", text="")

        if engine.with_osl() and use_cpu(context):
@@ -1705,17 +1717,75 @@ def get_panels():

    return panels

+
+classes = (
+    CYCLES_MT_sampling_presets,
+    CYCLES_MT_integrator_presets,
+    CyclesRender_PT_sampling,
+    CyclesRender_PT_geometry,
+    CyclesRender_PT_light_paths,
+    CyclesRender_PT_motion_blur,
+    CyclesRender_PT_film,
+    CyclesRender_PT_performance,
+    CyclesRender_PT_layer_options,
+    CyclesRender_PT_layer_passes,
+    CyclesRender_PT_views,
+    Cycles_PT_post_processing,
+    CyclesCamera_PT_dof,
+    Cycles_PT_context_material,
+    CyclesObject_PT_motion_blur,
+    CyclesObject_PT_cycles_settings,
+    CYCLES_OT_use_shading_nodes,
+    CyclesLamp_PT_preview,
+    CyclesLamp_PT_lamp,
+    CyclesLamp_PT_nodes,
+    CyclesLamp_PT_spot,
+    CyclesWorld_PT_preview,
+    CyclesWorld_PT_surface,
+    CyclesWorld_PT_volume,
+    CyclesWorld_PT_ambient_occlusion,
+    CyclesWorld_PT_mist,
+    CyclesWorld_PT_ray_visibility,
+    CyclesWorld_PT_settings,
+    CyclesMaterial_PT_preview,
+    CyclesMaterial_PT_surface,
+    CyclesMaterial_PT_volume,
+    CyclesMaterial_PT_displacement,
+    CyclesMaterial_PT_settings,
+    CyclesTexture_PT_context,
+    CyclesTexture_PT_node,
+    CyclesTexture_PT_mapping,
+    CyclesTexture_PT_colors,
+    CyclesParticle_PT_textures,
+    CyclesRender_PT_bake,
+    CyclesRender_PT_debug,
+    CyclesParticle_PT_CurveSettings,
+    CyclesScene_PT_simplify,
+)
+
+
 def register():
+    from bpy.utils import register_class
+
    bpy.types.RENDER_PT_render.append(draw_device)
    bpy.types.VIEW3D_HT_header.append(draw_pause)

    for panel in get_panels():
        panel.COMPAT_ENGINES.add('CYCLES')

+    for cls in classes:
+        register_class(cls)
+
+
 def unregister():
+    from bpy.utils import unregister_class
+
    bpy.types.RENDER_PT_render.remove(draw_device)
    bpy.types.VIEW3D_HT_header.remove(draw_pause)

    for panel in get_panels():
        if 'CYCLES' in panel.COMPAT_ENGINES:
            panel.COMPAT_ENGINES.remove('CYCLES')
+
+    for cls in classes:
+        unregister_class(cls)
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 */

-#include "camera.h"
-#include "scene.h"
+#include "render/camera.h"
+#include "render/scene.h"

-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"

-#include "util_logging.h"
+#include "util/util_logging.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -14,18 +14,18 @@
 * limitations under the License.
 */

-#include "attribute.h"
-#include "camera.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"

-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"

-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"

 CCL_NAMESPACE_BEGIN

@@ -411,6 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}

+	mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -434,8 +435,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;

-			numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution;
-			numtris += (CData->curve_keynum[curve] - 2)*resolution;
+			numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution;
+			numtris += (CData->curve_keynum[curve] - 1)*2*resolution;
 		}
 	}

@@ -545,6 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 		}
 	}

+	mesh->resize_mesh(mesh->verts.size(), mesh->num_triangles());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -890,7 +892,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}

 	/* obtain general settings */
-	bool use_curves = scene->curve_system_manager->use_curves;
+	const bool use_curves = scene->curve_system_manager->use_curves;

 	if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
 		if(!motion)
@@ -898,11 +900,11 @@ void BlenderSync::sync_curves(Mesh *mesh,
 		return;
 	}

-	int primitive = scene->curve_system_manager->primitive;
-	int triangle_method = scene->curve_system_manager->triangle_method;
-	int resolution = scene->curve_system_manager->resolution;
-	size_t vert_num = mesh->verts.size();
-	size_t tri_num = mesh->num_triangles();
+	const int primitive = scene->curve_system_manager->primitive;
+	const int triangle_method = scene->curve_system_manager->triangle_method;
+	const int resolution = scene->curve_system_manager->resolution;
+	const size_t vert_num = mesh->verts.size();
+	const size_t tri_num = mesh->num_triangles();
 	int used_res = 1;

 	/* extract particle hair data - should be combined with connecting to mesh later*/
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 */

-#include "CCL_api.h"
-#include "util_logging.h"
+#include "blender/CCL_api.h"
+#include "util/util_logging.h"

 void CCL_init_logging(const char *argv0)
 {
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -15,21 +15,22 @@
 */

 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/camera.h"

-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"

-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"

-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"

 #include "mikktspace.h"

@@ -525,69 +526,177 @@ static void attr_create_uv_map(Scene *scene,
 }

 /* Create vertex pointiness attributes. */
+
+/* Compare vertex indices by the sum of their coordinates; equal sums are ordered by index. */
+class VertexAverageComparator {
+public:
+	VertexAverageComparator(const array<float3>& verts)
+	        : verts_(verts) {
+	}
+
+	bool operator()(const int& vert_idx_a, const int& vert_idx_b)
+	{
+		const float3 &vert_a = verts_[vert_idx_a];
+		const float3 &vert_b = verts_[vert_idx_b];
+		const float x1 = vert_a.x + vert_a.y + vert_a.z;
+		const float x2 = vert_b.x + vert_b.y + vert_b.z;
+		if(x1 != x2) {
+			return x1 < x2;
+		}
+		/* Equal sums (doubles included): tie-break on index so sort() gets a strict weak ordering. */
+		return vert_idx_a > vert_idx_b;
+	}
+
+protected:
+	const array<float3>& verts_;
+};
+
 static void attr_create_pointiness(Scene *scene,
                                   Mesh *mesh,
                                   BL::Mesh& b_mesh,
                                   bool subdivision)
 {
-	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
-		const int numverts = b_mesh.vertices.length();
-		AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
-		Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
-		float *data = attr->data_float();
-		int *counter = new int[numverts];
-		float *raw_data = new float[numverts];
-		float3 *edge_accum = new float3[numverts];
-
-		/* Calculate pointiness using single ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memset(raw_data, 0, sizeof(float) * numverts);
-		memset(edge_accum, 0, sizeof(float3) * numverts);
-		BL::Mesh::edges_iterator e;
-		int i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			float3 co0 = get_float3(b_mesh.vertices[v0].co()),
-			       co1 = get_float3(b_mesh.vertices[v1].co());
-			float3 edge = normalize(co1 - co0);
-			edge_accum[v0] += edge;
-			edge_accum[v1] += -edge;
-			++counter[v0];
-			++counter[v1];
-		}
-		i = 0;
-		BL::Mesh::vertices_iterator v;
-		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) {
-			if(counter[i] > 0) {
-				float3 normal = get_float3(b_mesh.vertices[i].normal());
-				float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i]));
-				raw_data[i] = angle * M_1_PI_F;
+	if(!mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
+		return;
+	}
+	const int num_verts = b_mesh.vertices.length();
+	/* STEP 1: Find out duplicated vertices and point duplicates to a single
+	 *         original vertex.
+	 */
+	vector<int> sorted_vert_indeices(num_verts);
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		sorted_vert_indeices[vert_index] = vert_index;
+	}
+	VertexAverageComparator compare(mesh->verts);
+	sort(sorted_vert_indeices.begin(), sorted_vert_indeices.end(), compare);
+	/* This array stores index of the original vertex for the given vertex
+	 * index.
+	 */
+	vector<int> vert_orig_index(num_verts);
+	for(int sorted_vert_index = 0;
+	    sorted_vert_index < num_verts;
+	    ++sorted_vert_index)
+	{
+		const int vert_index = sorted_vert_indeices[sorted_vert_index];
+		const float3 &vert_co = mesh->verts[vert_index];
+		bool found = false;
+		for(int other_sorted_vert_index = sorted_vert_index + 1;
+		    other_sorted_vert_index < num_verts;
+		    ++other_sorted_vert_index)
+		{
+			const int other_vert_index =
+			        sorted_vert_indeices[other_sorted_vert_index];
+			const float3 &other_vert_co = mesh->verts[other_vert_index];
+			/* We are too far away now, we wouldn't have duplicate. */
+			if((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
+			   (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
+			{
+				break;
 			}
-			else {
-				raw_data[i] = 0.0f;
+			/* Found duplicate. */
+			if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) {
+				found = true;
+				vert_orig_index[vert_index] = other_vert_index;
+				break;
 			}
 		}
-
-		/* Blur vertices to approximate 2 ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memcpy(data, raw_data, sizeof(float) * numverts);
-		i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			data[v0] += raw_data[v1];
-			data[v1] += raw_data[v0];
-			++counter[v0];
-			++counter[v1];
+		if(!found) {
+			vert_orig_index[vert_index] = vert_index;
 		}
-		for(i = 0; i < numverts; ++i) {
-			data[i] /= counter[i] + 1;
+	}
+	/* Make sure we always point to the very first orig vertex. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		int orig_index = vert_orig_index[vert_index];
+		while(orig_index != vert_orig_index[orig_index]) {
+			orig_index = vert_orig_index[orig_index];
 		}
-
-		delete [] counter;
-		delete [] raw_data;
-		delete [] edge_accum;
+		vert_orig_index[vert_index] = orig_index;
+	}
+	sorted_vert_indeices.free_memory();
+	/* STEP 2: Calculate vertex normals taking into account their possible
+	 *         duplicates which gets "welded" together.
+	 */
+	vector<float3> vert_normal(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	/* First we accumulate all vertex normals in the original index. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const float3 normal = get_float3(b_mesh.vertices[vert_index].normal());
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[orig_index] += normal;
+	}
+	/* Then we normalize the accumulated result and flush it to all duplicates
+	 * as well.
+	 */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[vert_index] = normalize(vert_normal[orig_index]);
+	}
+	/* STEP 3: Calculate pointiness using single ring neighborhood. */
+	vector<int> counter(num_verts, 0);
+	vector<float> raw_data(num_verts, 0.0f);
+	vector<float3> edge_accum(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	BL::Mesh::edges_iterator e;
+	EdgeMap visited_edges;
+	int edge_index = 0;
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		float3 co0 = get_float3(b_mesh.vertices[v0].co()),
+		       co1 = get_float3(b_mesh.vertices[v1].co());
+		float3 edge = normalize(co1 - co0);
+		edge_accum[v0] += edge;
+		edge_accum[v1] += -edge;
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		if(orig_index != vert_index) {
+			/* Skip duplicates, they'll be overwritten later on. */
+			continue;
+		}
+		if(counter[vert_index] > 0) {
+			const float3 normal = vert_normal[vert_index];
+			const float angle =
+			        safe_acosf(dot(normal,
+			                       edge_accum[vert_index] / counter[vert_index]));
+			raw_data[vert_index] = angle * M_1_PI_F;
+		}
+		else {
+			raw_data[vert_index] = 0.0f;
+		}
+	}
+	/* STEP 4: Blur vertices to approximate 2 ring neighborhood. */
+	AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+	Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
+	float *data = attr->data_float();
+	memcpy(data, &raw_data[0], sizeof(float) * raw_data.size());
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	edge_index = 0;
+	visited_edges.clear();
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		data[v0] += raw_data[v1];
+		data[v1] += raw_data[v0];
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		data[vert_index] /= counter[vert_index] + 1;
+	}
+	/* STEP 5: Copy attribute to the duplicated vertices. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		data[vert_index] = data[orig_index];
 	}
 }

@@ -656,9 +765,6 @@ static void create_mesh(Scene *scene,
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
 	}

-	/* Create needed vertex attributes. */
-	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
-
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
@@ -671,6 +777,15 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 			bool smooth = f->use_smooth() || use_loop_normals;

+			if(use_loop_normals) {
+				BL::Array<float, 12> loop_normals = f->split_normals();
+				for(int i = 0; i < n; i++) {
+					N[vi[i]] = make_float3(loop_normals[i * 3],
+					                       loop_normals[i * 3 + 1],
+					                       loop_normals[i * 3 + 2]);
+				}
+			}
+
 			/* Create triangles.
 			 *
 			 * NOTE: Autosmooth is already taken care about.
@@ -704,7 +819,7 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(p->material_index(), 0, used_shaders.size()-1);
 			bool smooth = p->use_smooth() || use_loop_normals;

-			vi.reserve(n);
+			vi.resize(n);
 			for(int i = 0; i < n; i++) {
 				/* NOTE: Autosmooth is already taken care about. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
@@ -718,6 +833,7 @@ static void create_mesh(Scene *scene,
 	/* Create all needed attributes.
 	 * The calculate functions will check whether they're needed or not.
 	 */
+	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
 	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);

@@ -927,6 +1043,13 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,

 		mesh->subdivision_type = object_subdivision_type(b_ob, preview, experimental);

+		/* Disable adaptive subdivision while baking as the baking system
+		 * currently doesn't support the topology and will crash.
+		 */
+		if(scene->bake_manager->get_baking()) {
+			mesh->subdivision_type = Mesh::SUBDIVISION_NONE;
+		}
+
 		BL::Mesh b_mesh = object_to_mesh(b_data,
 		                                 b_ob,
 		                                 b_scene,
@@ -1171,4 +1294,3 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 }

 CCL_NAMESPACE_END
-
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -14,24 +14,24 @@
 * limitations under the License.
 */

-#include "camera.h"
-#include "integrator.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "nodes.h"
-#include "particles.h"
-#include "shader.h"
+#include "render/camera.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/nodes.h"
+#include "render/particles.h"
+#include "render/shader.h"

-#include "blender_object_cull.h"
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_object_cull.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"

-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"

 CCL_NAMESPACE_BEGIN

@@ -343,6 +343,13 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}

+	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+	bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher");
+	if(is_shadow_catcher != object->is_shadow_catcher) {
+		object->is_shadow_catcher = is_shadow_catcher;
+		object_updated = true;
+	}
+
 	/* object sync
 	 * transform comparison should not be needed, but duplis don't work perfect
 	 * in the depsgraph and may not signal changes, so this is a workaround */
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -16,9 +16,9 @@

 #include <cstdlib>

-#include "camera.h"
+#include "render/camera.h"

-#include "blender_object_cull.h"
+#include "blender/blender_object_cull.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -17,8 +17,8 @@
 #ifndef __BLENDER_OBJECT_CULL_H__
 #define __BLENDER_OBJECT_CULL_H__

-#include "blender_sync.h"
-#include "util_types.h"
+#include "blender/blender_sync.h"
+#include "util/util_types.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

-#include "mesh.h"
-#include "object.h"
-#include "particles.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/particles.h"

-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"

-#include "util_foreach.h"
+#include "util/util_foreach.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -16,21 +16,21 @@

 #include <Python.h>

-#include "CCL_api.h"
+#include "blender/CCL_api.h"

-#include "blender_sync.h"
-#include "blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"

-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_types.h"

 #ifdef WITH_OSL
-#include "osl.h"
+#include "render/osl.h"

 #include <OSL/oslquery.h>
 #include <OSL/oslconfig.h>
@@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
 	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
@@ -104,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
 }
@@ -641,7 +644,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
 	Py_RETURN_NONE;
 }

-static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
+static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
 {
 	int num_resumable_chunks, current_resumable_chunk;
 	if(!PyArg_ParseTuple(args, "ii",
@@ -676,6 +679,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }

+/* Python binding: render chunks start..end (1-based, inclusive) out of num_chunks.
+ * Takes three ints; values failing the checks below abort the process. */
+static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
+{
+	int num_chunks, start_chunk, end_chunk;
+	if(!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
+		/* PyArg_ParseTuple already set the exception; NULL propagates it. */
+		return NULL;
+	}
+
+	if(num_chunks <= 0) {
+		fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk < 1 || start_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(end_chunk < 1 || end_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for end chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk > end_chunk) {
+		fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+
+	VLOG(1) << "Initialized resumable render: "
+	        << "num_resumable_chunks=" << num_chunks << ", "
+	        << "start_resumable_chunk=" << start_chunk << ", "
+	        << "end_resumable_chunk=" << end_chunk;
+	BlenderSession::num_resumable_chunks = num_chunks;
+	BlenderSession::start_resumable_chunk = start_chunk;
+	BlenderSession::end_resumable_chunk = end_chunk;
+
+	printf("Cycles: Will render chunks %d to %d of %d\n",
+	       start_chunk,
+	       end_chunk,
+	       num_chunks);
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -715,7 +765,8 @@ static PyMethodDef methods[] = {
 	{"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""},

 	/* Resumable render */
-	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},
+	{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
+	{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},

 	/* Compute Device selection */
 	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -16,36 +16,38 @@

 #include <stdlib.h>

-#include "background.h"
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "shader.h"
+#include "render/background.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/shader.h"

-#include "util_color.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_hash.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_time.h"
+#include "util/util_color.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_time.h"

-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"

 CCL_NAMESPACE_BEGIN

 bool BlenderSession::headless = false;
 int BlenderSession::num_resumable_chunks = 0;
 int BlenderSession::current_resumable_chunk = 0;
+int BlenderSession::start_resumable_chunk = 0;
+int BlenderSession::end_resumable_chunk = 0;

 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
                               BL::UserPreferences& b_userpref,
@@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = true;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }

 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
@@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = false;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }

 BlenderSession::~BlenderSession()
@@ -989,10 +993,14 @@ void BlenderSession::update_status_progress()
 	if(substatus.size() > 0)
 		status += " | " + substatus;

-	if(status != last_status) {
+	double current_time = time_dt();
+	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
+	 * For headless rendering, only report when something significant changes to keep the console output readable. */
+	if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
 		b_engine.update_stats("", (timestatus + scene + status).c_str());
 		b_engine.update_memory_stats(mem_used, mem_peak);
 		last_status = status;
+		last_status_time = current_time;
 	}
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1342,9 +1350,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 		return;
 	}

-	int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
-	int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
-	int range_num_samples = num_samples_per_chunk;
+	const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
+
+	int range_start_sample, range_num_samples;
+	if(current_resumable_chunk != 0) {
+		/* Single chunk rendering. */
+		range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
+		range_num_samples = num_samples_per_chunk;
+	}
+	else {
+		/* Ranged-chunks. */
+		const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
+		range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
+		range_num_samples = num_chunks * num_samples_per_chunk;
+	}
+	/* Make sure we don't overshoot. */
 	if(range_start_sample + range_num_samples > num_samples) {
 		range_num_samples = num_samples - range_num_samples;
 	}
@@ -1352,6 +1372,9 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 	VLOG(1) << "Samples range start is " << range_start_sample << ", "
 	        << "number of samples to render is " << range_num_samples;

+	scene->integrator->start_sample = range_start_sample;
+	scene->integrator->tag_update(scene);
+
 	session->tile_manager.range_start_sample = range_start_sample;
 	session->tile_manager.range_num_samples = range_num_samples;
 }
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,12 +17,12 @@
 #ifndef __BLENDER_SESSION_H__
 #define __BLENDER_SESSION_H__

-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"

-#include "util_vector.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -113,6 +113,7 @@ public:
 	string last_status;
 	string last_error;
 	float last_progress;
+	double last_status_time;

 	int width, height;
 	double start_resize_time;
@@ -137,6 +138,10 @@ public:
 	/* Current resumable chunk index to render. */
 	static int current_resumable_chunk;

+	/* Alternative to single-chunk rendering to render a range of chunks. */
+	static int start_resumable_chunk;
+	static int end_resumable_chunk;
+
 protected:
 	void do_write_update_render_result(BL::RenderResult& b_rr,
 	                                   BL::RenderLayer& b_rlay,
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -14,20 +14,23 @@
 * limitations under the License.
 */

-#include "background.h"
-#include "graph.h"
-#include "light.h"
-#include "nodes.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
+#include "render/background.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/nodes.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"

-#include "blender_texture.h"
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_texture.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"

-#include "util_debug.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_set.h"
+#include "util/util_task.h"

 CCL_NAMESPACE_BEGIN

@@ -609,7 +612,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);

 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -640,7 +644,8 @@ static ShaderNode *add_node(Scene *scene,
 				        image->filename.string(),
 				        image->builtin_data,
 				        get_image_interpolation(b_image_node),
-				        get_image_extension(b_image_node));
+				        get_image_extension(b_image_node),
+				        image->use_alpha);
 			}
 		}
 		image->color_space = (NodeImageColorSpace)b_image_node.color_space();
@@ -661,7 +666,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);

 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -686,7 +692,8 @@ static ShaderNode *add_node(Scene *scene,
 				        env->filename.string(),
 				        env->builtin_data,
 				        get_image_interpolation(b_env_node),
-				        EXTENSION_REPEAT);
+				        EXTENSION_REPEAT,
+				        env->use_alpha);
 			}
 		}
 		env->color_space = (NodeImageColorSpace)b_env_node.color_space();
@@ -823,7 +830,8 @@ static ShaderNode *add_node(Scene *scene,
 			        point_density->filename.string(),
 			        point_density->builtin_data,
 			        point_density->interpolation,
-			        EXTENSION_CLIP);
+			        EXTENSION_CLIP,
+			        true);
 		}
 		node = point_density;

@@ -1159,6 +1167,9 @@ void BlenderSync::sync_materials(bool update_all)
 	/* material loop */
 	BL::BlendData::materials_iterator b_mat;

+	TaskPool pool;
+	set<Shader*> updated_shaders;
+
 	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
 		Shader *shader;

@@ -1194,9 +1205,37 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP;

 			shader->set_graph(graph);
-			shader->tag_update(scene);
+
+			/* By simplifying the shader graph as soon as possible, some
+			 * redundant shader nodes might be removed which prevents loading
+			 * unnecessary attributes later.
+			 *
+			 * However, since graph simplification also accounts for e.g. mix
+			 * weight, this would cause frequent expensive resyncs in interactive
+			 * sessions, so for those sessions optimization is only performed
+			 * right before compiling.
+			 */
+			if(!preview) {
+				pool.push(function_bind(&ShaderGraph::simplify, graph, scene));
+				/* NOTE: Update shaders out of the threads since those routines
+				 * are accessing and writing to a global context.
+				 */
+				updated_shaders.insert(shader);
+			}
+			else {
+				/* NOTE: Update tagging can access links which are being
+				 * optimized out.
+				 */
+				shader->tag_update(scene);
+			}
 		}
 	}
+
+	pool.wait_work();
+
+	foreach(Shader *shader, updated_shaders) {
+		shader->tag_update(scene);
+	}
 }

 /* Sync World */
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -14,29 +14,29 @@
 * limitations under the License.
 */

-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-#include "curves.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/curves.h"

-#include "device.h"
+#include "device/device.h"

-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_opengl.h"
-#include "util_hash.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_opengl.h"
+#include "util/util_hash.h"

 CCL_NAMESPACE_BEGIN

@@ -322,6 +322,15 @@ void BlenderSync::sync_integrator()
 		integrator->volume_samples = volume_samples;
 	}

+	if(b_scene.render().use_simplify()) {
+		if(preview) {
+			integrator->ao_bounces = get_int(cscene, "ao_bounces");
+		}
+		else {
+			integrator->ao_bounces = get_int(cscene, "ao_bounces_render");
+		}
+	}
+
 	if(integrator->modified(previntegrator))
 		integrator->tag_update(scene);
 }
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -22,15 +22,15 @@
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"

-#include "blender_util.h"
+#include "blender/blender_util.h"

-#include "scene.h"
-#include "session.h"
+#include "render/scene.h"
+#include "render/session.h"

-#include "util_map.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_texture.cpp
+++ b/intern/cycles/blender/blender_texture.cpp
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

-#include "blender_texture.h"
+#include "blender/blender_texture.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -18,7 +18,7 @@
 #define __BLENDER_TEXTURE_H__

 #include <stdlib.h>
-#include "blender_sync.h"
+#include "blender/blender_sync.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -17,14 +17,15 @@
 #ifndef __BLENDER_UTIL_H__
 #define __BLENDER_UTIL_H__

-#include "mesh.h"
+#include "render/mesh.h"

-#include "util_map.h"
-#include "util_path.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_map.h"
+#include "util/util_path.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"

 /* Hacks to hook into Blender API
 * todo: clean this up ... */
@@ -78,7 +79,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 				me.calc_normals_split();
 			}
 			else {
-				me.split_faces();
+				me.split_faces(false);
 			}
 		}
 		if(subdivision_type == Mesh::SUBDIVISION_NONE) {
@@ -173,22 +174,19 @@ static inline void curvemapping_color_to_array(BL::CurveMapping& cumap,

 	if(rgb_curve) {
 		BL::CurveMap mapI = cumap.curves[3];
-
 		for(int i = 0; i < size; i++) {
-			float t = min_x + (float)i/(float)(size-1) * range_x;
-
-			data[i][0] = mapR.evaluate(mapI.evaluate(t));
-			data[i][1] = mapG.evaluate(mapI.evaluate(t));
-			data[i][2] = mapB.evaluate(mapI.evaluate(t));
+			const float t = min_x + (float)i/(float)(size-1) * range_x;
+			data[i] = make_float3(mapR.evaluate(mapI.evaluate(t)),
+			                      mapG.evaluate(mapI.evaluate(t)),
+			                      mapB.evaluate(mapI.evaluate(t)));
 		}
 	}
 	else {
 		for(int i = 0; i < size; i++) {
 			float t = min_x + (float)i/(float)(size-1) * range_x;
-
-			data[i][0] = mapR.evaluate(t);
-			data[i][1] = mapG.evaluate(t);
-			data[i][2] = mapB.evaluate(t);
+			data[i] = make_float3(mapR.evaluate(t),
+			                      mapG.evaluate(t),
+			                      mapB.evaluate(t));
 		}
 	}
 }
@@ -786,6 +784,35 @@ struct ParticleSystemKey {
 	}
 };

+class EdgeMap {
+public:
+	EdgeMap() {
+	}
+
+	void clear() {
+		edges_.clear();
+	}
+
+	void insert(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		edges_.insert(std::pair<int, int>(v0, v1));
+	}
+
+	bool exists(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		return edges_.find(std::pair<int, int>(v0, v1)) != edges_.end();
+	}
+
+protected:
+	void get_sorted_verts(int& v0, int& v1) {
+		if(v0 > v1) {
+			swap(v0, v1);
+		}
+	}
+
+	set< std::pair<int, int> > edges_;
+};
+
 CCL_NAMESPACE_END

 #endif /* __BLENDER_UTIL_H__ */
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -1,12 +1,6 @@

 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
-	../device
+	..
 )

 set(INC_SYS
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -15,25 +15,25 @@
 * limitations under the License.
 */

-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"

-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_math.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_math.h"

 CCL_NAMESPACE_BEGIN

@@ -67,7 +67,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 	if(params.use_qbvh)
 		return new QBVH(params, objects);
 	else
-		return new RegularBVH(params, objects);
+		return new BinaryBVH(params, objects);
 }

 /* Building */
@@ -81,6 +81,7 @@ void BVH::build(Progress& progress)
 	                   pack.prim_type,
 	                   pack.prim_index,
 	                   pack.prim_object,
+	                   pack.prim_time,
 	                   params,
 	                   progress);
 	BVHNode *root = bvh_build.run();
@@ -256,6 +257,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());

+	if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) {
+		pack.prim_time.resize(prim_index_size);
+	}
+
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
 	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
 	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
@@ -264,6 +269,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
 	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
+	float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL;

 	/* merge */
 	foreach(Object *ob, objects) {
@@ -309,6 +315,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 			int *bvh_prim_type = &bvh->pack.prim_type[0];
 			uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0];
 			uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0];
+			float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL;

 			for(size_t i = 0; i < bvh_prim_index_size; i++) {
 				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
@@ -324,6 +331,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
 				pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
 				pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
+				if(bvh_prim_time != NULL) {
+					pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i];
+				}
 				pack_prim_index_offset++;
 			}
 		}
@@ -414,64 +424,64 @@ static bool node_bvh_is_unaligned(const BVHNode *node)
 {
 	const BVHNode *node0 = node->get_child(0),
 	              *node1 = node->get_child(1);
-	return node0->is_unaligned() || node1->is_unaligned();
+	return node0->is_unaligned || node1->is_unaligned;
 }

-RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_)
+BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
 {
 }

-void RegularBVH::pack_leaf(const BVHStackEntry& e,
-                           const LeafNode *leaf)
+void BinaryBVH::pack_leaf(const BVHStackEntry& e,
+                          const LeafNode *leaf)
 {
 	assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
 	float4 data[BVH_NODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}

 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
 }

-void RegularBVH::pack_inner(const BVHStackEntry& e,
-                            const BVHStackEntry& e0,
-                            const BVHStackEntry& e1)
+void BinaryBVH::pack_inner(const BVHStackEntry& e,
+                           const BVHStackEntry& e0,
+                           const BVHStackEntry& e1)
 {
-	if(e0.node->is_unaligned() || e1.node->is_unaligned()) {
+	if(e0.node->is_unaligned || e1.node->is_unaligned) {
 		pack_unaligned_inner(e, e0, e1);
 	} else {
 		pack_aligned_inner(e, e0, e1);
 	}
 }

-void RegularBVH::pack_aligned_inner(const BVHStackEntry& e,
-                                    const BVHStackEntry& e0,
-                                    const BVHStackEntry& e1)
+void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e,
+                                   const BVHStackEntry& e0,
+                                   const BVHStackEntry& e1)
 {
 	pack_aligned_node(e.idx,
-	                  e0.node->m_bounds, e1.node->m_bounds,
+	                  e0.node->bounds, e1.node->bounds,
 	                  e0.encodeIdx(), e1.encodeIdx(),
-	                  e0.node->m_visibility, e1.node->m_visibility);
+	                  e0.node->visibility, e1.node->visibility);
 }

-void RegularBVH::pack_aligned_node(int idx,
-                                   const BoundBox& b0,
-                                   const BoundBox& b1,
-                                   int c0, int c1,
-                                   uint visibility0, uint visibility1)
+void BinaryBVH::pack_aligned_node(int idx,
+                                  const BoundBox& b0,
+                                  const BoundBox& b1,
+                                  int c0, int c1,
+                                  uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -498,26 +508,26 @@ void RegularBVH::pack_aligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
 }

-void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
-                                      const BVHStackEntry& e0,
-                                      const BVHStackEntry& e1)
+void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e,
+                                     const BVHStackEntry& e0,
+                                     const BVHStackEntry& e1)
 {
 	pack_unaligned_node(e.idx,
 	                    e0.node->get_aligned_space(),
 	                    e1.node->get_aligned_space(),
-	                    e0.node->m_bounds,
-	                    e1.node->m_bounds,
+	                    e0.node->bounds,
+	                    e1.node->bounds,
 	                    e0.encodeIdx(), e1.encodeIdx(),
-	                    e0.node->m_visibility, e1.node->m_visibility);
+	                    e0.node->visibility, e1.node->visibility);
 }

-void RegularBVH::pack_unaligned_node(int idx,
-                                     const Transform& aligned_space0,
-                                     const Transform& aligned_space1,
-                                     const BoundBox& bounds0,
-                                     const BoundBox& bounds1,
-                                     int c0, int c1,
-                                     uint visibility0, uint visibility1)
+void BinaryBVH::pack_unaligned_node(int idx,
+                                    const Transform& aligned_space0,
+                                    const Transform& aligned_space1,
+                                    const BoundBox& bounds0,
+                                    const BoundBox& bounds1,
+                                    int c0, int c1,
+                                    uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -543,7 +553,7 @@ void RegularBVH::pack_unaligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
 }

-void RegularBVH::pack_nodes(const BVHNode *root)
+void BinaryBVH::pack_nodes(const BVHNode *root)
 {
 	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
 	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
@@ -620,7 +630,7 @@ void RegularBVH::pack_nodes(const BVHNode *root)
 	pack.root_index = (root->is_leaf())? -1: 0;
 }

-void RegularBVH::refit_nodes()
+void BinaryBVH::refit_nodes()
 {
 	assert(!params.top_level);

@@ -629,7 +639,7 @@ void RegularBVH::refit_nodes()
 	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }

-void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
 	if(leaf) {
 		assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
@@ -759,18 +769,18 @@ static bool node_qbvh_is_unaligned(const BVHNode *node)
 	              *node1 = node->get_child(1);
 	bool has_unaligned = false;
 	if(node0->is_leaf()) {
-		has_unaligned |= node0->is_unaligned();
+		has_unaligned |= node0->is_unaligned;
 	}
 	else {
-		has_unaligned |= node0->get_child(0)->is_unaligned();
-		has_unaligned |= node0->get_child(1)->is_unaligned();
+		has_unaligned |= node0->get_child(0)->is_unaligned;
+		has_unaligned |= node0->get_child(1)->is_unaligned;
 	}
 	if(node1->is_leaf()) {
-		has_unaligned |= node1->is_unaligned();
+		has_unaligned |= node1->is_unaligned;
 	}
 	else {
-		has_unaligned |= node1->get_child(0)->is_unaligned();
-		has_unaligned |= node1->get_child(1)->is_unaligned();
+		has_unaligned |= node1->get_child(0)->is_unaligned;
+		has_unaligned |= node1->get_child(1)->is_unaligned;
 	}
 	return has_unaligned;
 }
@@ -785,19 +795,19 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
 	float4 data[BVH_QNODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}

 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
@@ -813,7 +823,7 @@ void QBVH::pack_inner(const BVHStackEntry& e,
 	 */
 	if(params.use_unaligned_nodes) {
 		for(int i = 0; i < num; i++) {
-			if(en[i].node->is_unaligned()) {
+			if(en[i].node->is_unaligned) {
 				has_unaligned = true;
 				break;
 			}
@@ -838,15 +848,15 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 	BoundBox bounds[4];
 	int child[4];
 	for(int i = 0; i < num; ++i) {
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_aligned_node(e.idx,
 	                  bounds,
 	                  child,
-	                  e.node->m_visibility,
-	                  e.node->m_time_from,
-	                  e.node->m_time_to,
+	                  e.node->visibility,
+	                  e.node->time_from,
+	                  e.node->time_to,
 	                  num);
 }

@@ -907,16 +917,16 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 	int child[4];
 	for(int i = 0; i < num; ++i) {
 		aligned_space[i] = en[i].node->get_aligned_space();
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_unaligned_node(e.idx,
 	                    aligned_space,
 	                    bounds,
 	                    child,
-	                    e.node->m_visibility,
-	                    e.node->m_time_from,
-	                    e.node->m_time_to,
+	                    e.node->visibility,
+	                    e.node->time_from,
+	                    e.node->time_to,
 	                    num);
 }

--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_H__
 #define __BVH_H__

-#include "bvh_params.h"
+#include "bvh/bvh_params.h"

-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -68,6 +68,8 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
+	/* Time range of BVH primitive. */
+	array<float2> prim_time;

 	/* index of the root node. */
 	int root_index;
@@ -108,15 +110,15 @@ protected:
 	virtual void refit_nodes() = 0;
 };

-/* Regular BVH
+/* Binary BVH
 *
 * Typical BVH with each node having two children. */

-class RegularBVH : public BVH {
+class BinaryBVH : public BVH {
 protected:
 	/* constructor */
 	friend class BVH;
-	RegularBVH(const BVHParams& params, const vector<Object*>& objects);
+	BinaryBVH(const BVHParams& params, const vector<Object*>& objects);

 	/* pack */
 	void pack_nodes(const BVHNode *root);
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -19,11 +19,11 @@

 #include <stdlib.h>

-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"

-#include "util_algorithm.h"
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_BINNING_H__
 #define __BVH_BINNING_H__

-#include "bvh_params.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"

-#include "util_types.h"
+#include "util/util_types.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -15,25 +15,26 @@
 * limitations under the License.
 */

-#include "bvh_binning.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
 #include "bvh_split.h"

-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_stack_allocator.h"
-#include "util_simd.h"
-#include "util_time.h"
-#include "util_queue.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_stack_allocator.h"
+#include "util/util_simd.h"
+#include "util/util_time.h"
+#include "util/util_queue.h"

 CCL_NAMESPACE_BEGIN

@@ -92,12 +93,14 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_,
                   array<int>& prim_type_,
                   array<int>& prim_index_,
                   array<int>& prim_object_,
+                   array<float2>& prim_time_,
                   const BVHParams& params_,
                   Progress& progress_)
 : objects(objects_),
   prim_type(prim_type_),
   prim_index(prim_index_),
   prim_object(prim_object_),
+   prim_time(prim_time_),
   params(params_),
   progress(progress_),
   progress_start_time(0.0),
@@ -464,6 +467,9 @@ BVHNode* BVHBuild::run()
 	}
 	spatial_free_index = 0;

+	need_prim_time = params.num_motion_curve_steps > 0 ||
+	                 params.num_motion_triangle_steps > 0;
+
 	/* init progress updates */
 	double build_start_time;
 	build_start_time = progress_start_time = time_dt();
@@ -474,6 +480,12 @@ BVHNode* BVHBuild::run()
 	prim_type.resize(references.size());
 	prim_index.resize(references.size());
 	prim_object.resize(references.size());
+	if(need_prim_time) {
+		prim_time.resize(references.size());
+	}
+	else {
+		prim_time.resize(0);
+	}

 	/* build recursively */
 	BVHNode *rootnode;
@@ -848,11 +860,14 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		prim_type[start] = ref->prim_type();
 		prim_index[start] = ref->prim_index();
 		prim_object[start] = ref->prim_object();
+		if(need_prim_time) {
+			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
+		}

 		uint visibility = objects[ref->prim_object()]->visibility;
 		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
-		leaf_node->m_time_from = ref->time_from();
-		leaf_node->m_time_to = ref->time_to();
+		leaf_node->time_from = ref->time_from();
+		leaf_node->time_to = ref->time_to();
 		return leaf_node;
 	}
 	else {
@@ -861,12 +876,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);

 		BoundBox bounds = BoundBox::empty;
-		bounds.grow(leaf0->m_bounds);
-		bounds.grow(leaf1->m_bounds);
+		bounds.grow(leaf0->bounds);
+		bounds.grow(leaf1->bounds);

 		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
-		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
-		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
+		inner_node->time_from = min(leaf0->time_from, leaf1->time_from);
+		inner_node->time_to = max(leaf0->time_to, leaf1->time_to);
 		return inner_node;
 	}
 }
@@ -890,11 +905,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 *    can not control.
 	 */
 	typedef StackAllocator<256, int> LeafStackAllocator;
+	typedef StackAllocator<256, float2> LeafTimeStackAllocator;
 	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;

 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
+	vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
 	vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];

 	/* TODO(sergey): In theory we should be able to store references. */
@@ -917,6 +934,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			p_type[type_index].push_back(ref.prim_type());
 			p_index[type_index].push_back(ref.prim_index());
 			p_object[type_index].push_back(ref.prim_object());
+			p_time[type_index].push_back(make_float2(ref.time_from(),
+			                                         ref.time_to()));

 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
@@ -946,9 +965,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
+	vector<float2, LeafTimeStackAllocator> local_prim_time;
 	local_prim_type.resize(num_new_prims);
 	local_prim_index.resize(num_new_prims);
 	local_prim_object.resize(num_new_prims);
+	if(need_prim_time) {
+		local_prim_time.resize(num_new_prims);
+	}
 	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
 		int num = (int)p_type[i].size();
 		if(num != 0) {
@@ -961,6 +984,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				local_prim_type[index] = p_type[i][j];
 				local_prim_index[index] = p_index[i][j];
 				local_prim_object[index] = p_object[i][j];
+				if(need_prim_time) {
+					local_prim_time[index] = p_time[i][j];
+				}
 				if(params.use_unaligned_nodes && !alignment_found) {
 					alignment_found =
 						unaligned_heuristic.compute_aligned_space(p_ref[i][j],
@@ -978,19 +1004,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 					time_from = min(time_from, ref.time_from());
 					time_to = max(time_to, ref.time_to());
 				}
-				leaf_node->m_time_from = time_from;
-				leaf_node->m_time_to = time_to;
+				leaf_node->time_from = time_from;
+				leaf_node->time_to = time_to;
 			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
-				leaf_node->m_bounds = BoundBox::empty;
+				leaf_node->bounds = BoundBox::empty;
 				for(int j = 0; j < num; ++j) {
 					const BVHReference &ref = p_ref[i][j];
 					BoundBox ref_bounds =
 					        unaligned_heuristic.compute_aligned_prim_boundbox(
 					                ref,
 					                aligned_space);
-					leaf_node->m_bounds.grow(ref_bounds);
+					leaf_node->bounds.grow(ref_bounds);
 				}
 				/* Set alignment space. */
 				leaf_node->set_aligned_space(aligned_space);
@@ -1027,11 +1053,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_type.reserve(reserve);
 				prim_index.reserve(reserve);
 				prim_object.reserve(reserve);
+				if(need_prim_time) {
+					prim_time.reserve(reserve);
+				}
 			}

 			prim_type.resize(range_end);
 			prim_index.resize(range_end);
 			prim_object.resize(range_end);
+			if(need_prim_time) {
+				prim_time.resize(range_end);
+			}
 		}
 		spatial_spin_lock.unlock();

@@ -1040,6 +1072,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}
 	else {
@@ -1052,6 +1087,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}

@@ -1061,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 */
 	for(int i = 0; i < num_leaves; ++i) {
 		LeafNode *leaf = (LeafNode *)leaves[i];
-		leaf->m_lo += start_index;
-		leaf->m_hi += start_index;
+		leaf->lo += start_index;
+		leaf->hi += start_index;
 	}

 	/* Create leaf node for object. */
@@ -1091,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
 	}
 	else if(num_leaves == 3) {
-		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
+		BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds);
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
 		/* Should be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
-		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
-		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
+		BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds);
 		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
 		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
-		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds);
 		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
 		if(num_leaves == 5) {
 			return new InnerNode(range.bounds(), inner_c, leaves[4]);
@@ -1136,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		rotate(parent->children[c], max_depth-1);

 	/* compute current area of all children */
-	BoundBox bounds0 = parent->children[0]->m_bounds;
-	BoundBox bounds1 = parent->children[1]->m_bounds;
+	BoundBox bounds0 = parent->children[0]->bounds;
+	BoundBox bounds1 = parent->children[1]->bounds;

 	float area0 = bounds0.half_area();
 	float area1 = bounds1.half_area();
@@ -1157,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		BoundBox& other = (c == 0)? bounds1: bounds0;

 		/* transpose child bounds */
-		BoundBox target0 = child->children[0]->m_bounds;
-		BoundBox target1 = child->children[1]->m_bounds;
+		BoundBox target0 = child->children[0]->bounds;
+		BoundBox target1 = child->children[1]->bounds;

 		/* compute cost for both possible swaps */
 		float cost0 = merge(other, target1).half_area() - child_area[c];
@@ -1190,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	InnerNode *child = (InnerNode*)parent->children[best_child];

 	swap(parent->children[best_other], child->children[best_target]);
-	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
+	child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -20,13 +20,13 @@

 #include <float.h>

-#include "bvh.h"
-#include "bvh_binning.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_unaligned.h"

-#include "util_boundbox.h"
-#include "util_task.h"
-#include "util_vector.h"
+#include "util/util_boundbox.h"
+#include "util/util_task.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -48,6 +48,7 @@ public:
 	         array<int>& prim_type,
 	         array<int>& prim_index,
 	         array<int>& prim_object,
+	         array<float2>& prim_time,
 	         const BVHParams& params,
 	         Progress& progress);
 	~BVHBuild();
@@ -112,6 +113,9 @@ protected:
 	array<int>& prim_type;
 	array<int>& prim_index;
 	array<int>& prim_object;
+	array<float2>& prim_time;
+
+	bool need_prim_time;

 	/* Build parameters. */
 	BVHParams params;
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -15,12 +15,12 @@
 * limitations under the License.
 */

-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"

-#include "util_debug.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_COUNT:
-			if(!is_unaligned()) {
+			if(!is_unaligned) {
 				cnt = 1;
 			}
 			break;
 		case BVH_STAT_UNALIGNED_COUNT:
-			if(is_unaligned()) {
+			if(is_unaligned) {
 				cnt = 1;
 			}
 			break;
@@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 0: 1;
 			}
@@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 1: 0;
 			}
@@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -126,10 +126,10 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && !is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && !is_unaligned) ? 1 : 0;
 			break;
 		case BVH_STAT_UNALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && is_unaligned) ? 1 : 0;
 			break;
 		default:
 			assert(0); /* unknown mode */
@@ -157,7 +157,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons

 	for(int i = 0; i < num_children(); i++) {
 		BVHNode *child = get_child(i);
-		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area());
+		SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area());
 	}

 	return SAH;
@@ -165,15 +165,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons

 uint BVHNode::update_visibility()
 {
-	if(!is_leaf() && m_visibility == 0) {
+	if(!is_leaf() && visibility == 0) {
 		InnerNode *inner = (InnerNode*)this;
 		BVHNode *child0 = inner->children[0];
 		BVHNode *child1 = inner->children[1];

-		m_visibility = child0->update_visibility()|child1->update_visibility();
+		visibility = child0->update_visibility()|child1->update_visibility();
 	}

-	return m_visibility;
+	return visibility;
 }

 void BVHNode::update_time()
@@ -184,8 +184,8 @@ void BVHNode::update_time()
 		BVHNode *child1 = inner->children[1];
 		child0->update_time();
 		child1->update_time();
-		m_time_from = min(child0->m_time_from, child1->m_time_from);
-		m_time_to =  max(child0->m_time_to, child1->m_time_to);
+		time_from = min(child0->time_from, child1->time_from);
+		time_to =  max(child0->time_to, child1->time_to);
 	}
 }

@@ -209,7 +209,7 @@ void LeafNode::print(int depth) const
 	for(int i = 0; i < depth; i++)
 		printf("  ");
 	
-	printf("leaf node %d to %d\n", m_lo, m_hi);
+	printf("leaf node %d to %d\n", lo, hi);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_NODE_H__
 #define __BVH_NODE_H__

-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"

 CCL_NAMESPACE_BEGIN

@@ -46,16 +46,16 @@ class BVHParams;
 class BVHNode
 {
 public:
-	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL),
-	            m_time_from(0.0f),
-	            m_time_to(1.0f)
+	BVHNode() : is_unaligned(false),
+	            aligned_space(NULL),
+	            time_from(0.0f),
+	            time_to(1.0f)
 	{
 	}

 	virtual ~BVHNode()
 	{
-		delete m_aligned_space;
+		delete aligned_space;
 	}

 	virtual bool is_leaf() const = 0;
@@ -63,30 +63,26 @@ public:
 	virtual BVHNode *get_child(int i) const = 0;
 	virtual int num_triangles() const { return 0; }
 	virtual void print(int depth = 0) const = 0;
-	bool is_unaligned() const { return m_is_unaligned; }

 	inline void set_aligned_space(const Transform& aligned_space)
 	{
-		m_is_unaligned = true;
-		if(m_aligned_space == NULL) {
-			m_aligned_space = new Transform(aligned_space);
+		is_unaligned = true;  /* a node that is given an aligned space is flagged as unaligned */
+		if(this->aligned_space == NULL) {  /* this-> is required: the parameter shadows the member */
+			this->aligned_space = new Transform(aligned_space);
 		}
 		else {
-			*m_aligned_space = aligned_space;
+			*this->aligned_space = aligned_space;  /* overwrite in place, keeping the allocation */
 		}
 	}

 	inline Transform get_aligned_space() const
 	{
-		if(m_aligned_space == NULL) {
+		if(aligned_space == NULL) {
 			return transform_identity();
 		}
-		return *m_aligned_space;
+		return *aligned_space;
 	}

-	BoundBox m_bounds;
-	uint m_visibility;
-
 	// Subtree functions
 	int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const;
 	float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const;
@@ -95,13 +91,18 @@ public:
 	uint update_visibility();
 	void update_time();

-	bool m_is_unaligned;
+	// Properties.
+	BoundBox bounds;
+	uint visibility;

-	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
-	// utilities and type defines in util_transform first.
-	Transform *m_aligned_space;
+	bool is_unaligned;

-	float m_time_from, m_time_to;
+	/* TODO(sergey): Can be stored as 3x3 matrix, but better to have some
+	 * utilities and type defines in util_transform first.
+	 */
+	Transform *aligned_space;
+
+	float time_from, time_to;
 };

 class InnerNode : public BVHNode
@@ -111,20 +112,20 @@ public:
 	          BVHNode* child0,
 	          BVHNode* child1)
 	{
-		m_bounds = bounds;
+		this->bounds = bounds;
 		children[0] = child0;
 		children[1] = child1;

 		if(child0 && child1)
-			m_visibility = child0->m_visibility|child1->m_visibility;
+			visibility = child0->visibility|child1->visibility;
 		else
-			m_visibility = 0; /* happens on build cancel */
+			visibility = 0; /* happens on build cancel */
 	}

 	explicit InnerNode(const BoundBox& bounds)
 	{
-		m_bounds = bounds;
-		m_visibility = 0;
+		this->bounds = bounds;
+		visibility = 0;
 		children[0] = NULL;
 		children[1] = NULL;
 	}
@@ -140,12 +141,12 @@ public:
 class LeafNode : public BVHNode
 {
 public:
-	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) 
+	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi)
+	: lo(lo),
+	  hi(hi)
 	{
-		m_bounds = bounds;
-		m_visibility = visibility;
-		m_lo = lo;
-		m_hi = hi;
+		this->bounds = bounds;
+		this->visibility = visibility;
 	}

 	LeafNode(const LeafNode& s)
@@ -157,14 +158,13 @@ public:
 	bool is_leaf() const { return true; }
 	int num_children() const { return 0; }
 	BVHNode *get_child(int) const { return NULL; }
-	int num_triangles() const { return m_hi - m_lo; }
+	int num_triangles() const { return hi - lo; }
 	void print(int depth) const;

-	int m_lo;
-	int m_hi;
+	int lo;
+	int hi;
 };

 CCL_NAMESPACE_END

 #endif /* __BVH_NODE_H__ */
-
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_PARAMS_H__
 #define __BVH_PARAMS_H__

-#include "util_boundbox.h"
+#include "util/util_boundbox.h"

-#include "kernel_types.h"
+#include "kernel/kernel_types.h"

 CCL_NAMESPACE_BEGIN

@@ -104,6 +104,7 @@ public:
 		primitive_mask = PRIMITIVE_ALL;

 		num_motion_curve_steps = 0;
+		num_motion_triangle_steps = 0;
 	}

 	/* SAH costs */
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -15,12 +15,12 @@
 * limitations under the License.
 */

-#include "bvh_build.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_sort.h"

-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_task.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_task.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -15,14 +15,14 @@
 * limitations under the License.
 */

-#include "bvh_build.h"
-#include "bvh_split.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_split.h"
+#include "bvh/bvh_sort.h"

-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"

-#include "util_algorithm.h"
+#include "util/util_algorithm.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -18,8 +18,8 @@
 #ifndef __BVH_SPLIT_H__
 #define __BVH_SPLIT_H__

-#include "bvh_build.h"
-#include "bvh_params.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_params.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -15,17 +15,17 @@
 */


-#include "bvh_unaligned.h"
+#include "bvh/bvh_unaligned.h"

-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"

-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 #include "bvh_params.h"

-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_transform.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_transform.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -17,7 +17,7 @@
 #ifndef __BVH_UNALIGNED_H__
 #define __BVH_UNALIGNED_H__

-#include "util_vector.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,12 +1,6 @@

 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../util
-	../render
+	..
 	../../glew-mx
 )

@@ -33,6 +27,7 @@ set(SRC
 	device_cuda.cpp
 	device_multi.cpp
 	device_opencl.cpp
+	device_split_kernel.cpp
 	device_task.cpp
 )

@@ -56,6 +51,7 @@ set(SRC_HEADERS
 	device_memory.h
 	device_intern.h
 	device_network.h
+	device_split_kernel.h
 	device_task.h
 )

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,18 +17,18 @@
 #include <stdlib.h>
 #include <string.h>

-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_half.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
-#include "util_vector.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"

 CCL_NAMESPACE_BEGIN

@@ -48,11 +48,11 @@ std::ostream& operator <<(std::ostream &os,
 	os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
 	/* TODO(sergey): Decode bitflag into list of names. */
 	os << "Nodes features: " << requested_features.nodes_features << std::endl;
-	os << "Use hair: "
+	os << "Use Hair: "
 	   << string_from_bool(requested_features.use_hair) << std::endl;
-	os << "Use object motion: "
+	os << "Use Object Motion: "
 	   << string_from_bool(requested_features.use_object_motion) << std::endl;
-	os << "Use camera motion: "
+	os << "Use Camera Motion: "
 	   << string_from_bool(requested_features.use_camera_motion) << std::endl;
 	os << "Use Baking: "
 	   << string_from_bool(requested_features.use_baking) << std::endl;
@@ -80,7 +80,7 @@ Device::~Device()

 void Device::pixels_alloc(device_memory& mem)
 {
-	mem_alloc(mem, MEM_READ_WRITE);
+	mem_alloc("pixels", mem, MEM_READ_WRITE);
 }

 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -19,15 +19,15 @@

 #include <stdlib.h>

-#include "device_memory.h"
-#include "device_task.h"
+#include "device/device_memory.h"
+#include "device/device_task.h"

-#include "util_list.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -121,6 +121,9 @@ public:
 	/* Use Transparent shadows */
 	bool use_transparent;

+	/* Use various shadow tricks, such as shadow catcher. */
+	bool use_shadow_tricks;
+
 	DeviceRequestedFeatures()
 	{
 		/* TODO(sergey): Find more meaningful defaults. */
@@ -137,6 +140,7 @@ public:
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
 		use_transparent = false;
+		use_shadow_tricks = false;
 	}

 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -153,7 +157,8 @@ public:
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
 		         use_patch_evaluation == requested_features.use_patch_evaluation &&
-		         use_transparent == requested_features.use_transparent);
+		         use_transparent == requested_features.use_transparent &&
+		         use_shadow_tricks == requested_features.use_shadow_tricks);
 	}

 	/* Convert the requested features structure to a build options,
@@ -194,9 +199,12 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
-		if(!use_transparent) {
+		if(!use_transparent && !use_volume) {
 			build_options += " -D__NO_TRANSPARENT__";
 		}
+		if(!use_shadow_tricks) {
+			build_options += " -D__NO_SHADOW_TRICKS__";
+		}
 		return build_options;
 	}
 };
@@ -228,13 +236,21 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual void set_error(const string& error)  /* record and report a device error */
+	{
+		if(!have_error()) {  /* only the first error is stored; later ones are just printed */
+			error_msg = error;
+		}
+		fprintf(stderr, "%s\n", error.c_str());  /* every error is written to stderr, stored or not */
+		fflush(stderr);
+	}
 	virtual bool show_samples() const { return false; }

 	/* statistics */
 	Stats &stats;

 	/* regular memory */
-	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
+	virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
 	virtual void mem_copy_to(device_memory& mem) = 0;
 	virtual void mem_copy_from(device_memory& mem,
 		int y, int w, int h, int elem) = 0;
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -20,36 +20,124 @@
 /* So ImathMath is included before our kernel_cpu_compat. */
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif

-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"

-#include "kernel.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"

-#include "osl_shader.h"
-#include "osl_globals.h"
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"

-#include "buffers.h"
+#include "render/buffers.h"

-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_opengl.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_thread.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_thread.h"

 CCL_NAMESPACE_BEGIN

+class CPUDevice;
+/* DeviceSplitKernel specialization driven by a CPUDevice; the member functions are defined after CPUDevice. */
+class CPUSplitKernel : public DeviceSplitKernel {
+	CPUDevice *device;  /* private: class members default to private access */
+public:
+	explicit CPUSplitKernel(CPUDevice *device);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);  /* runs the data_init kernel */
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);  /* NULL if not registered */
+	virtual int2 split_kernel_local_size();  /* always 1x1 */
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);  /* always 64x1 */
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);  /* forwards to split_data_buffer_size() */
+};
+
 class CPUDevice : public Device
 {
+	static unordered_map<string, void*> kernel_functions;
+
+	static void register_kernel_function(const char* name, void* func)  /* callback passed to kernel_cpu*_register_functions() */
+	{
+		kernel_functions[name] = func;  /* registering the same name again replaces the earlier pointer */
+	}
+	/* Kernel variant name: first of AVX2, AVX, SSE4.1, SSE3, SSE2 that is compiled in and supported by the CPU, else "cpu". */
+	static const char* get_arch_name()
+	{
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		if(system_cpu_support_avx2()) {
+			return "cpu_avx2";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		if(system_cpu_support_avx()) {
+			return "cpu_avx";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		if(system_cpu_support_sse41()) {
+			return "cpu_sse41";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		if(system_cpu_support_sse3()) {
+			return "cpu_sse3";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		if(system_cpu_support_sse2()) {
+			return "cpu_sse2";
+		}
+		else
+#endif
+		{
+			return "cpu";  /* generic fallback; also closes the dangling else of the last compiled-in check */
+		}
+	}
+	/* Fetch the registered kernel named "kernel_<arch>_<name>" and cast it to the function type F. */
+	template<typename F>
+	static F get_kernel_function(string name)
+	{
+		/* Registered names carry the architecture returned by get_arch_name(). */
+		const string full_name = string("kernel_") + get_arch_name() + "_" + name;
+		unordered_map<string, void*>::iterator found = kernel_functions.find(full_name);
+
+		if(found != kernel_functions.end()) {
+			return (F)found->second;
+		}
+		/* Unknown kernel: trip the assert when asserts are enabled, return NULL otherwise. */
+		assert(!"kernel function not found");
+		return NULL;
+	}
+
+	friend class CPUSplitKernel;
+
 public:
 	TaskPool task_pool;
 	KernelGlobals kernel_globals;
@@ -57,10 +145,15 @@ public:
 #ifdef WITH_OSL
 	OSLGlobals osl_globals;
 #endif
+
+	bool use_split_kernel;
+
+	DeviceRequestedFeatures requested_features;
 	
 	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
 	: Device(info, stats, background)
 	{
+
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
 #endif
@@ -105,6 +198,28 @@ public:
 		{
 			VLOG(1) << "Will be using regular kernels.";
 		}
+
+		use_split_kernel = DebugFlags().cpu.split_kernel;
+		if(use_split_kernel) {
+			VLOG(1) << "Will be using split kernel.";
+		}
+
+		kernel_cpu_register_functions(register_kernel_function);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		kernel_cpu_sse2_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		kernel_cpu_sse3_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		kernel_cpu_sse41_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		kernel_cpu_avx_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		kernel_cpu_avx2_register_functions(register_kernel_function);
+#endif
 	}

 	~CPUDevice()
@@ -117,9 +232,20 @@ public:
 		return (TaskScheduler::num_threads() == 1);
 	}

-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {  /* a NULL name skips the log line */
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		mem.device_pointer = mem.data_pointer;
+		/* No host-side buffer: allocate one here; mem_free() releases it while data_pointer is unset. */
+		if(!mem.device_pointer) {
+			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
+		}
+		/* NOTE(review): the malloc() result is not checked before it is used as device_pointer. */
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -144,6 +270,10 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
+			if(!mem.data_pointer) {
+				free((void*)mem.device_pointer);
+			}
+
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
@@ -196,8 +326,14 @@ public:

 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE)
-			thread_path_trace(*task);
+		if(task->type == DeviceTask::PATH_TRACE) {
+			if(!use_split_kernel) {
+				thread_path_trace(*task);
+			}
+			else {
+				thread_path_trace_split(*task);
+			}
+		}
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
@@ -258,7 +394,7 @@ public:
 		{
 			path_trace_kernel = kernel_cpu_path_trace;
 		}
-		
+
 		while(task.acquire_tile(this, tile)) {
 			float *render_buffer = (float*)tile.buffer;
 			uint *rng_state = (uint*)tile.rng_state;
@@ -294,6 +430,49 @@ public:
 		thread_kernel_globals_free(&kg);
 	}

+	void thread_path_trace_split(DeviceTask& task)
+	{
+		/* Bail out right away on cancel, unless the task still wants its
+		 * queue to be finished. */
+		if(task_pool.canceled() && task.need_finish_queue == false) {
+			return;
+		}
+
+		RenderTile tile;
+
+		CPUSplitKernel split_kernel(this);
+
+		/* The kernel globals are stored in a device buffer, which is what
+		 * the split kernel receives for every tile. */
+		device_memory kgbuffer;
+		kgbuffer.resize(sizeof(KernelGlobals));
+		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+		/* Fill the freshly allocated buffer with this thread's globals. */
+		KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
+		*kg = thread_kernel_globals_init();
+
+		requested_features.max_closure = MAX_CLOSURE;
+		const bool kernels_loaded = split_kernel.load_kernels(requested_features);
+
+		/* No tile is acquired when loading the kernels failed. */
+		while(kernels_loaded && task.acquire_tile(this, tile)) {
+			device_memory data;
+			split_kernel.path_trace(&task, tile, kgbuffer, data);
+
+			task.release_tile(tile);
+
+			if(task_pool.canceled() && task.need_finish_queue == false) {
+				break;
+			}
+		}
+
+		/* Common cleanup, reached both when loading failed and when the
+		 * tile loop has ended. */
+		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+		mem_free(kgbuffer);
+	}
+
 	void thread_film_convert(DeviceTask& task)
 	{
 		float sample_scale = 1.0f/(task.sample + 1);
@@ -501,6 +680,10 @@ protected:

 	inline void thread_kernel_globals_free(KernelGlobals *kg)
 	{
+		if(kg == NULL) {
+			return;
+		}
+
 		if(kg->transparent_shadow_intersections != NULL) {
 			free(kg->transparent_shadow_intersections);
 		}
@@ -515,8 +698,175 @@ protected:
 		OSLShader::thread_free(kg);
 #endif
 	}
+
+	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {  /* only records the features; always returns true */
+		requested_features = requested_features_;  /* read back by thread_path_trace_split() */
+
+		return true;
+	}
 };

+/* split kernel */
+/* One split kernel entry point of the CPU device, invoked serially for every work item. */
+class CPUSplitKernelFunction : public SplitKernelFunction {
+public:
+	CPUDevice* device;
+	void (*func)(KernelGlobals *kg, KernelData *data);  /* NULL until assigned by the creator */
+	/* explicit, like the other single-argument constructors here: no implicit CPUDevice* conversion. */
+	explicit CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+	~CPUSplitKernelFunction() {}
+	/* Calls func once per (x, y) of dim.global_size; returns false when no function is bound. */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+	{
+		if(!func) {
+			return false;
+		}
+
+		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+		/* Work items run one after another; each call sees its own global_id in kg. */
+		for(int y = 0; y < dim.global_size[1]; y++) {
+			for(int x = 0; x < dim.global_size[0]; x++) {
+				kg->global_id = make_int2(x, y);
+
+				func(kg, (KernelData*)data.device_pointer);
+			}
+		}
+
+		return true;
+	}
+};
+
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
+{  /* the CPUDevice pointer is both handed to the base class and kept in this class */
+}
+
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                    RenderTile& rtile,
+                                                    int num_global_elements,
+                                                    device_memory& kernel_globals,
+                                                    device_memory& data,
+                                                    device_memory& split_data,
+                                                    device_memory& ray_state,
+                                                    device_memory& queue_index,
+                                                    device_memory& use_queues_flags,
+                                                    device_memory& work_pool_wgs)
+{  /* Runs the CPU data_init kernel once for every work item of dim.global_size; always returns true. */
+	typedef void(*data_init_t)(KernelGlobals *kg,
+	                           ccl_constant KernelData *data,
+	                           ccl_global void *split_data_buffer,
+	                           int num_elements,
+	                           ccl_global char *ray_state,
+	                           ccl_global uint *rng_state,
+	                           int start_sample,
+	                           int end_sample,
+	                           int sx, int sy, int sw, int sh, int offset, int stride,
+	                           ccl_global int *Queue_index,
+	                           int queuesize,
+	                           ccl_global char *use_queues_flag,
+	                           ccl_global unsigned int *work_pool_wgs,
+	                           unsigned int num_samples,
+	                           ccl_global float *buffer);
+
+	data_init_t data_init;
+	/* First match wins (AVX2, AVX, SSE4.1, SSE3, SSE2): needs the WITH_CYCLES_OPTIMIZED_KERNEL_* define and CPU support. */
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+	if(system_cpu_support_avx2()) {
+		data_init = kernel_cpu_avx2_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+	if(system_cpu_support_avx()) {
+		data_init = kernel_cpu_avx_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+	if(system_cpu_support_sse41()) {
+		data_init = kernel_cpu_sse41_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+	if(system_cpu_support_sse3()) {
+		data_init = kernel_cpu_sse3_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+	if(system_cpu_support_sse2()) {
+		data_init = kernel_cpu_sse2_data_init;
+	}
+	else
+#endif
+	{
+		data_init = kernel_cpu_data_init;
+	}
+
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+	/* Work items run one after another; global_id in kg is updated before each call. */
+	for(int y = 0; y < dim.global_size[1]; y++) {
+		for(int x = 0; x < dim.global_size[0]; x++) {
+			kg->global_id = make_int2(x, y);
+
+			data_init((KernelGlobals*)kernel_globals.device_pointer,
+			          (KernelData*)data.device_pointer,
+			          (void*)split_data.device_pointer,
+			          num_global_elements,
+			          (char*)ray_state.device_pointer,
+			          (uint*)rtile.rng_state,
+			          rtile.start_sample,
+			          rtile.start_sample + rtile.num_samples,
+			          rtile.x,
+			          rtile.y,
+			          rtile.w,
+			          rtile.h,
+			          rtile.offset,
+			          rtile.stride,
+			          (int*)queue_index.device_pointer,
+			          dim.global_size[0] * dim.global_size[1],
+			          (char*)use_queues_flags.device_pointer,
+			          (uint*)work_pool_wgs.device_pointer,
+			          rtile.num_samples,
+			          (float*)rtile.buffer);
+		}
+	}
+
+	return true;
+}
+
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	/* Resolve the entry point first; a wrapper is only created for a registered kernel. */
+	typedef void(*split_kernel_func_t)(KernelGlobals*, KernelData*);
+	split_kernel_func_t entry_point = device->get_kernel_function<split_kernel_func_t>(kernel_name);
+	if(entry_point == NULL) {
+		return NULL;
+	}
+	CPUSplitKernelFunction *wrapper = new CPUSplitKernelFunction(device);
+	wrapper->func = entry_point;
+	return wrapper;
+}
+
+int2 CPUSplitKernel::split_kernel_local_size()
+{
+	return make_int2(1, 1);  /* the local size is always 1x1 for the CPU split kernel */
+}
+
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
+	return make_int2(64, 1);  /* fixed 64x1; none of the arguments is consulted */
+}
+
+uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;  /* the buffer holds the KernelGlobals */
+
+	return split_data_buffer_size(kg, num_threads);  /* the size is computed entirely by this helper */
+}
+
+unordered_map<string, void*> CPUDevice::kernel_functions;
+
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,36 @@
 */

 #include <climits>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"

-#include "buffers.h"
+#include "render/buffers.h"

 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
-#  include "util_opengl.h"
+#  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"

 CCL_NAMESPACE_BEGIN

@@ -78,6 +82,31 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */

+class CUDADevice;
+/* DeviceSplitKernel specialization for CUDADevice, which is only forward-declared at this point. */
+class CUDASplitKernel : public DeviceSplitKernel {
+	CUDADevice *device;  /* private: class members default to private access */
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -258,16 +287,21 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}

+	bool use_split_kernel()
+	{
+		return DebugFlags().cuda.split_kernel;  /* read from the debug flags on every call, not cached */
+	}
+
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
-		const string kernel_path = path_get("kernel");
-		const string include = kernel_path;
+		const string source_path = path_get("source");
+		const string include_path = source_path;
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
@@ -276,7 +310,7 @@ public:
 		                               "-I\"%s\"",
 		                              machine,
 		                              cuda_version,
-		                              include.c_str());
+		                              include_path.c_str());
 		if(use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
@@ -287,6 +321,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}

@@ -306,21 +345,21 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return false;
 		}
-		if(cuda_version < 75) {
+		if(cuda_version < 80) {
 			printf("Unsupported CUDA version %d.%d detected, "
-			       "you need CUDA 7.5 or newer.\n",
+			       "you need CUDA 8.0 or newer.\n",
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 75 && cuda_version != 80) {
+		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 7.5 and 8.0 are officially supported.\n",
+			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}

-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		/* Compute cubin name. */
 		int major, minor;
@@ -329,7 +368,8 @@ public:

 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+			                                                  : "lib/kernel_sm_%d%d.cubin",
 			                                            major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
@@ -339,18 +379,19 @@ public:
 		}

 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, split);

 		/* Try to use locally compiled kernel. */
-		const string kernel_path = path_get("kernel");
-		const string kernel_md5 = path_files_md5_hash(kernel_path);
+		const string source_path = path_get("source");
+		const string kernel_md5 = path_files_md5_hash(source_path);

 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);

-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+		                                              : "cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -383,9 +424,10 @@ public:
 			return "";
 		}
 		const char *nvcc = cuewCompilerPath();
-		const string kernel = path_join(kernel_path,
-		                          path_join("kernels",
-		                                    path_join("cuda", "kernel.cu")));
+		const string kernel = path_join(
+		        path_join(source_path, "kernel"),
+		        path_join("kernels",
+		                  path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");

@@ -433,7 +475,7 @@ public:
 			return false;

 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
+		string cubin = compile_kernel(requested_features, use_split_kernel());

 		if(cubin == "")
 			return false;
@@ -466,8 +508,14 @@ public:
 		}
 	}

-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
@@ -504,7 +552,9 @@ public:

 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}

 		cuda_push_context();
 		if(mem.device_pointer)
@@ -617,7 +667,7 @@ public:
 		/* Data Storage */
 		if(interpolation == INTERPOLATION_NONE) {
 			if(has_bindless_textures) {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);

 				cuda_push_context();
@@ -641,7 +691,7 @@ public:
 				cuda_pop_context();
 			}
 			else {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);

 				cuda_push_context();
@@ -1258,25 +1308,48 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();

-			/* keep rendering tiles until done */
-			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+			if(!use_split_kernel()) {
+				/* keep rendering tiles until done */
+				while(task->acquire_tile(this, tile)) {
+					int start_sample = tile.start_sample;
+					int end_sample = tile.start_sample + tile.num_samples;
+
+					for(int sample = start_sample; sample < end_sample; sample++) {
+						if(task->get_cancel()) {
+							if(task->need_finish_queue == false)
+								break;
+						}
+
+						path_trace(tile, sample, branched);
+
+						tile.sample = sample + 1;
+
+						task->update_progress(&tile, tile.w*tile.h);
+					}
+
+					task->release_tile(tile);
+				}
+			}
+			else {
+				DeviceRequestedFeatures requested_features;
+				if(!use_adaptive_compilation()) {
+					requested_features.max_closure = 64;
+				}
+
+				CUDASplitKernel split_kernel(this);
+				split_kernel.load_kernels(requested_features);
+
+				while(task->acquire_tile(this, tile)) {
+					device_memory void_buffer;
+					split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+					task->release_tile(tile);

-				for(int sample = start_sample; sample < end_sample; sample++) {
 					if(task->get_cancel()) {
 						if(task->need_finish_queue == false)
 							break;
 					}
-
-					path_trace(tile, sample, branched);
-
-					tile.sample = sample + 1;
-
-					task->update_progress(&tile, tile.w*tile.h);
 				}
-
-				task->release_tile(tile);
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
@@ -1329,8 +1402,223 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
 };

+/* Redefine cuda_assert for use outside the CUDADevice class, now that its definition is
+ * complete: reports via a `device` in scope, keeps the first error, and does not abort.
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* Enqueue with no kernel arguments; kg/data are unused. Returns false on error. */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* Enqueue with explicit kernel arguments. Returns false if the device has an error. */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		device->cuda_push_context();
+		/* We ignore dim.local_size for now, as this is faster. Start from 0 so a
+		 * failed attribute query is detectable instead of leaving garbage. */
+		int threads_per_block = 0;
+		if(!device->have_error()) {
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+		}
+		/* Launch only with a usable block size: a zero here would divide by zero. */
+		if(threads_per_block > 0) {
+			int xthreads = (int)sqrt(threads_per_block);
+			int ythreads = (int)sqrt(threads_per_block);
+
+			int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+			int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+			cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+			cuda_assert(cuLaunchKernel(func,
+			                           xblocks , yblocks, 1, /* blocks */
+			                           xthreads, ythreads, 1, /* threads */
+			                           0, 0, args, 0));
+		}
+
+		/* Always pop, including on the error paths, to keep push/pop balanced. */
+		device->cuda_pop_context();
+		return !device->have_error();
+	}
+};
+
+/* Keep the typed CUDADevice pointer in addition to the Device pointer given to the base. */
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{}
+
+/* Runs kernel_cuda_state_buffer_size for num_threads and returns the uint64 it writes. */
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	/* One-element device buffer that the size kernel writes its result into. */
+	device_vector<uint64_t> size_buffer;
+	size_buffer.resize(1);
+	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+	device->cuda_push_context();
+
+	uint threads = num_threads;
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	/* Kernel parameters are passed as an array of pointers to the argument values. */
+	void *args[] = {&threads, &d_size};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	/* cuda_assert only records a failure, so check before launching: without this
+	 * an unresolved (uninitialized) function handle would be passed to the launch. */
+	if(!device->have_error()) {
+		cuda_assert(cuLaunchKernel(state_buffer_size,
+		                           1, 1, 1,
+		                           1, 1, 1,
+		                           0, 0, args, 0));
+	}
+
+	device->cuda_pop_context();
+
+	/* Read the result back before releasing the device allocation. */
+	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+	device->mem_free(size_buffer);
+
+	return *size_buffer.get_data();
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                    RenderTile& rtile,
+                                    int num_global_elements,
+                                    device_memory& /*kernel_globals*/,
+                                    device_memory& /*kernel_data*/,
+                                    device_memory& split_data,
+                                    device_memory& ray_state,
+                                    device_memory& queue_index,
+                                    device_memory& use_queues_flag,
+                                    device_memory& work_pool_wgs)
+{
+	/* Launch kernel_cuda_path_trace_data_init for this tile; returns false on device error. */
+	device->cuda_push_context();
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+	/* Passed to the launch as void**, so every field must stay a pointer, in argument order. */
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		CUdeviceptr* rng_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&d_rng_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+
+	/* Launch only if the lookup succeeded, then fall through to the pop below so the
+	 * context pushed at the top of this function is released on the failure path too. */
+	if(!device->have_error()) {
+		CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+	}
+
+	device->cuda_pop_context();
+	return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	CUfunction func;
+
+	device->cuda_push_context();
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+	/* Pop before the error check so the early return cannot leave the context pushed. */
+	device->cuda_pop_context();
+
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+		return NULL;
+	}
+
+	return new CUDASplitKernelFunction(device, func);
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1); /* fixed 32x1 local size for every kernel */
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
+{
+	/* TODO(mai): implement something here to detect ideal work size */
+	return make_int2(256, 256); /* placeholder: fixed size, all three arguments are ignored */
+}
+
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -28,10 +28,10 @@
 * other devices this is a pointer to device memory, where we will copy memory
 * to and from. */

-#include "util_debug.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

@@ -48,7 +48,8 @@ enum DataType {
 	TYPE_UINT,
 	TYPE_INT,
 	TYPE_FLOAT,
-	TYPE_HALF
+	TYPE_HALF,
+	TYPE_UINT64,
 };

 static inline size_t datatype_size(DataType datatype) 
@@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype)
 		case TYPE_UINT: return sizeof(uint);
 		case TYPE_INT: return sizeof(int);
 		case TYPE_HALF: return sizeof(half);
+		case TYPE_UINT64: return sizeof(uint64_t);
 		default: return 0;
 	}
 }
@@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> {
 	static const int num_elements = 4;
 };

+template<> struct device_type_traits<uint64_t> {
+	static const DataType data_type = TYPE_UINT64; /* sized by datatype_size() as sizeof(uint64_t) */
+	static const int num_elements = 1; /* scalar: one component per element */
+};
+
 /* Device Memory */

 class device_memory
@@ -180,10 +187,27 @@ public:
 	/* device pointer */
 	device_ptr device_pointer;

-protected:
-	device_memory() {}
+	device_memory()
+	{
+		data_type = device_type_traits<uchar>::data_type;
+		data_elements = device_type_traits<uchar>::num_elements;
+		data_pointer = 0;
+		data_size = 0;
+		device_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;
+		device_pointer = 0;
+	}
 	virtual ~device_memory() { assert(!device_pointer); }

+	void resize(size_t size)
+	{
+		data_size = size;
+		data_width = size;
+	}
+
+protected:
 	/* no copying */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
@@ -198,16 +222,8 @@ public:
 	{
 		data_type = device_type_traits<T>::data_type;
 		data_elements = device_type_traits<T>::num_elements;
-		data_pointer = 0;
-		data_size = 0;
-		device_size = 0;
-		data_width = 0;
-		data_height = 0;
-		data_depth = 0;

 		assert(data_elements > 0);
-
-		device_pointer = 0;
 	}

 	virtual ~device_vector() {}
@@ -266,6 +282,7 @@ public:
 		data_height = 0;
 		data_depth = 0;
 		data_size = 0;
+		device_pointer = 0;
 	}

 	size_t size()
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,17 +17,17 @@
 #include <stdlib.h>
 #include <sstream>

-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"

-#include "buffers.h"
+#include "render/buffers.h"

-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"

 CCL_NAMESPACE_BEGIN

@@ -106,11 +106,11 @@ public:
 		return true;
 	}

-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(mem, type);
+			sub.device->mem_alloc(name, mem, type);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}

@@ -162,6 +162,7 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);

 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -170,7 +171,6 @@ public:
 		}

 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}

 	void const_copy_to(const char *name, void *host, size_t size)
@@ -202,6 +202,7 @@ public:
 	void tex_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);

 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -210,7 +211,6 @@ public:
 		}

 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}

 	void pixels_alloc(device_memory& mem)
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 */

-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"

-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"

 #if defined(WITH_NETWORK)

@@ -87,8 +87,14 @@ public:
 		snd.write();
 	}

-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+				    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+				    << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		thread_scoped_lock lock(rpc_lock);

 		mem.device_pointer = ++mem_counter;
@@ -481,7 +487,7 @@ protected:
 				mem.data_pointer = 0;

 			/* perform the allocation on the actual device */
-			device->mem_alloc(mem, type);
+			device->mem_alloc(NULL, mem, type);

 			/* store a mapping to/from client_pointer and real device pointer */
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -33,12 +33,12 @@
 #include <sstream>
 #include <deque>

-#include "buffers.h"
+#include "render/buffers.h"

-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_string.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_string.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,12 +16,12 @@

 #ifdef WITH_OPENCL

-#include "opencl/opencl.h"
+#include "device/opencl/opencl.h"

-#include "device_intern.h"
+#include "device/device_intern.h"

-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_split_kernel.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+static const double alpha = 0.1; /* alpha for rolling average */
+
+/* All kernel handles start out NULL; load_kernels() fills them in. */
+DeviceSplitKernel::DeviceSplitKernel(Device *device)
+: device(device),
+  kernel_path_init(NULL),
+  kernel_scene_intersect(NULL),
+  kernel_lamp_emission(NULL),
+  kernel_do_volume(NULL),
+  kernel_queue_enqueue(NULL),
+  kernel_indirect_background(NULL),
+  kernel_shader_eval(NULL),
+  kernel_holdout_emission_blurring_pathtermination_ao(NULL),
+  kernel_subsurface_scatter(NULL),
+  kernel_direct_lighting(NULL),
+  kernel_shadow_blocked_ao(NULL),
+  kernel_shadow_blocked_dl(NULL),
+  kernel_next_iteration_setup(NULL),
+  kernel_indirect_subsurface(NULL),
+  kernel_buffer_update(NULL),
+  avg_time_per_sample(0.0),
+  current_max_closure(-1),
+  first_tile(true)
+{
+}
+
+DeviceSplitKernel::~DeviceSplitKernel()
+{
+	device->mem_free(split_data);
+	device->mem_free(ray_state);
+	device->mem_free(use_queues_flag);
+	device->mem_free(queue_index);
+	device->mem_free(work_pool_wgs);
+	/* NOTE(review): the frees above also run if path_trace() never allocated; confirm mem_free allows it. */
+	delete kernel_path_init;
+	delete kernel_scene_intersect;
+	delete kernel_lamp_emission;
+	delete kernel_do_volume;
+	delete kernel_queue_enqueue;
+	delete kernel_indirect_background;
+	delete kernel_shader_eval;
+	delete kernel_holdout_emission_blurring_pathtermination_ao;
+	delete kernel_subsurface_scatter;
+	delete kernel_direct_lighting;
+	delete kernel_shadow_blocked_ao;
+	delete kernel_shadow_blocked_dl;
+	delete kernel_next_iteration_setup;
+	delete kernel_indirect_subsurface;
+	delete kernel_buffer_update;
+}
+
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+/* Release any handle from a previous load first, so calling this again does not leak. */
+#define LOAD_KERNEL(name) \
+		delete kernel_##name; \
+		kernel_##name = get_split_kernel_function(#name, requested_features); \
+		if(!kernel_##name) { \
+			return false; \
+		}
+
+	LOAD_KERNEL(path_init);
+	LOAD_KERNEL(scene_intersect);
+	LOAD_KERNEL(lamp_emission);
+	LOAD_KERNEL(do_volume);
+	LOAD_KERNEL(queue_enqueue);
+	LOAD_KERNEL(indirect_background);
+	LOAD_KERNEL(shader_eval);
+	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+	LOAD_KERNEL(subsurface_scatter);
+	LOAD_KERNEL(direct_lighting);
+	LOAD_KERNEL(shadow_blocked_ao);
+	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(next_iteration_setup);
+	LOAD_KERNEL(indirect_subsurface);
+	LOAD_KERNEL(buffer_update);
+
+#undef LOAD_KERNEL
+	current_max_closure = requested_features.max_closure;
+	return true;
+}
+
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+{
+	uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; /* averaged over a 1024-thread probe */
+	return (size_per_element != 0) ? max_buffer_size / size_per_element : 0; /* 0 if the probe reported nothing */
+}
+
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+                                   RenderTile& tile,
+                                   device_memory& kgbuffer,
+                                   device_memory& kernel_data)
+{
+	/* Render all samples of `tile` with the split kernels. False on error; a cancelled task returns true early. */
+	if(device->have_error()) {
+		return false;
+	}
+
+	/* Get local size */
+	size_t local_size[2];
+	{
+		int2 lsize = split_kernel_local_size();
+		local_size[0] = lsize[0];
+		local_size[1] = lsize[1];
+	}
+
+	/* Set global size */
+	size_t global_size[2];
+	{
+		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+		/* Make sure that set work size is a multiple of local
+		 * work size dimensions.
+		 */
+		global_size[0] = round_up(gsize[0], local_size[0]);
+		global_size[1] = round_up(gsize[1], local_size[1]);
+	}
+
+	/* Number of elements in the global state buffer */
+	int num_global_elements = global_size[0] * global_size[1];
+	assert(num_global_elements % WORK_POOL_SIZE == 0);
+
+	/* Allocate all required global memory once. */
+	if(first_tile) {
+		first_tile = false;
+
+		/* Calculate max groups */
+		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
+
+		/* Allocate work_pool_wgs memory. */
+		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
+
+		queue_index.resize(NUM_QUEUES * sizeof(int));
+		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
+
+		use_queues_flag.resize(sizeof(char));
+		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
+
+		ray_state.resize(num_global_elements);
+		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
+
+		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
+	}
+
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+		if(device->have_error()) { \
+			return false; \
+		} \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+			return false; \
+		}
+
+	tile.sample = tile.start_sample;
+
+	/* for exponential increase between tile updates */
+	int time_multiplier = 1;
+
+	while(tile.sample < tile.start_sample + tile.num_samples) {
+		/* to keep track of how long it takes to run a number of samples */
+		double start_time = time_dt();
+
+		/* initial guess to start rolling average */
+		const int initial_num_samples = 1;
+		/* approx number of samples per second */
+		int samples_per_second = (avg_time_per_sample > 0.0) ?
+		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
+
+		RenderTile subtile = tile;
+		subtile.start_sample = tile.sample;
+		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
+
+		if(device->have_error()) {
+			return false;
+		}
+
+		/* reset state memory here as global size for data_init
+		 * kernel might not be large enough to do in kernel
+		 */
+		device->mem_zero(work_pool_wgs);
+		device->mem_zero(split_data);
+		device->mem_zero(ray_state);
+
+		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+		                                   subtile,
+		                                   num_global_elements,
+		                                   kgbuffer,
+		                                   kernel_data,
+		                                   split_data,
+		                                   ray_state,
+		                                   queue_index,
+		                                   use_queues_flag,
+		                                   work_pool_wgs))
+		{
+			return false;
+		}
+
+		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+		bool activeRaysAvailable = true;
+
+		while(activeRaysAvailable) {
+			/* Do path-iteration on the host: enqueue the path-iteration kernels. */
+			for(int PathIter = 0; PathIter < 16; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+				if(task->get_cancel()) {
+					return true;
+				}
+			}
+
+			/* Decide if we should exit path-iteration in host. */
+			device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
+				int8_t state = ray_state.get_data()[rayStateIter];
+
+				if(state != RAY_INACTIVE) {
+					if(state == RAY_INVALID) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(task->get_cancel()) {
+				return true;
+			}
+		}
+
+		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+		if(avg_time_per_sample == 0.0) {
+			/* start rolling average */
+			avg_time_per_sample = time_per_sample;
+		}
+		else {
+			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
+		}
+
+#undef ENQUEUE_SPLIT_KERNEL
+
+		tile.sample += subtile.num_samples;
+		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+		time_multiplier = min(time_multiplier << 1, 10); /* doubles each round, capped at 10 */
+
+		if(task->get_cancel()) {
+			return true;
+		}
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device/device.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes,
+ * since some bytes may be needed for aligning the chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+class KernelDimensions {
+public:
+	size_t global_size[2];
+	size_t local_size[2];
+	/* Copies both two-element extents; the caller's arrays are not retained. */
+	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+	{
+		global_size[0] = global_size_[0]; global_size[1] = global_size_[1];
+		local_size[0] = local_size_[0]; local_size[1] = local_size_[1];
+	}
+};
+
+class SplitKernelFunction { /* abstract handle to one split kernel; devices provide the subclass */
+public:
+	virtual ~SplitKernelFunction() {}
+
+	/* Enqueue the kernel over `dim`; returns false if there is an error. */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+class DeviceSplitKernel {
+private:
+	Device *device;
+
+	SplitKernelFunction *kernel_path_init;
+	SplitKernelFunction *kernel_scene_intersect;
+	SplitKernelFunction *kernel_lamp_emission;
+	SplitKernelFunction *kernel_do_volume;
+	SplitKernelFunction *kernel_queue_enqueue;
+	SplitKernelFunction *kernel_indirect_background;
+	SplitKernelFunction *kernel_shader_eval;
+	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+	SplitKernelFunction *kernel_subsurface_scatter;
+	SplitKernelFunction *kernel_direct_lighting;
+	SplitKernelFunction *kernel_shadow_blocked_ao;
+	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_next_iteration_setup;
+	SplitKernelFunction *kernel_indirect_subsurface;
+	SplitKernelFunction *kernel_buffer_update;
+
+	/* Global memory variables [porting]: this memory is used for
+	 * co-operation between different kernels; data written by one
+	 * kernel is available to another kernel via this global
+	 * memory.
+	 */
+	device_memory split_data;
+	device_vector<uchar> ray_state;
+	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	device_memory use_queues_flag;
+
+	/* Approximate time it takes to complete one sample */
+	double avg_time_per_sample;
+
+	/* Work pool with respect to each work group. */
+	device_memory work_pool_wgs;
+
+	/* max_closure value the kernels were last loaded with (-1 before any load). */
+	int current_max_closure;
+
+	/* True after construction; cleared by the first path_trace() call, which allocates the buffers above. */
+	bool first_tile;
+
+public:
+	explicit DeviceSplitKernel(Device* device);
+	virtual ~DeviceSplitKernel();
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	bool path_trace(DeviceTask *task,
+	                RenderTile& rtile,
+	                device_memory& kgbuffer,
+	                device_memory& kernel_data);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs) = 0;
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+	virtual int2 split_kernel_local_size() = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+
+
+
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -17,12 +17,12 @@
 #include <stdlib.h>
 #include <string.h>

-#include "device_task.h"
+#include "device/device_task.h"

-#include "buffers.h"
+#include "render/buffers.h"

-#include "util_algorithm.h"
-#include "util_time.h"
+#include "util/util_algorithm.h"
+#include "util/util_time.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -17,11 +17,11 @@
 #ifndef __DEVICE_TASK_H__
 #define __DEVICE_TASK_H__

-#include "device_memory.h"
+#include "device/device_memory.h"

-#include "util_function.h"
-#include "util_list.h"
-#include "util_task.h"
+#include "util/util_function.h"
+#include "util/util_list.h"
+#include "util/util_task.h"

 CCL_NAMESPACE_BEGIN

@@ -51,6 +51,8 @@ public:
 	int shader_filter;
 	int shader_x, shader_w;

+	int passes_size;
+
 	explicit DeviceTask(Type type = PATH_TRACE);

 	int get_subtask_count(int num, int max_size = 0);
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -16,40 +16,40 @@

 #ifdef WITH_OPENCL

-#include "device.h"
+#include "device/device.h"

-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"

 #include "clew.h"

 CCL_NAMESPACE_BEGIN

+/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing */
+#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+/* Work around AMD driver hangs by waiting for each command to finish.
+ * The enqueue call must be the first statement of the expansion so that
+ * `err = clEnqueue...(...)` stores the enqueue status, not clFinish()'s.
+ * NOTE: expands to two statements; never use as an unbraced if/else body. */
+#  undef clEnqueueNDRangeKernel
+#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+
+#  undef clEnqueueWriteBuffer
+#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+
+#  undef clEnqueueReadBuffer
+#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+	CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
+
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))

-/* Macro declarations used with split kernel */
-
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
-
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
-
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
-
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
-
 struct OpenCLPlatformDevice {
 	OpenCLPlatformDevice(cl_platform_id platform_id,
 	                     const string& platform_name,
@@ -90,6 +90,54 @@ public:
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
+	static bool use_single_program();
+
+	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
+
+	/* Platform information. */
+	static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
+	static cl_uint get_num_platforms();
+
+	static bool get_platforms(vector<cl_platform_id> *platform_ids,
+	                          cl_int *error = NULL);
+	static vector<cl_platform_id> get_platforms();
+
+	static bool get_platform_name(cl_platform_id platform_id,
+	                              string *platform_name);
+	static string get_platform_name(cl_platform_id platform_id);
+
+	static bool get_num_platform_devices(cl_platform_id platform_id,
+	                                     cl_device_type device_type,
+	                                     cl_uint *num_devices,
+	                                     cl_int *error = NULL);
+	static cl_uint get_num_platform_devices(cl_platform_id platform_id,
+	                                        cl_device_type device_type);
+
+	static bool get_platform_devices(cl_platform_id platform_id,
+	                                 cl_device_type device_type,
+	                                 vector<cl_device_id> *device_ids,
+	                                 cl_int* error = NULL);
+	static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
+	                                                 cl_device_type device_type);
+
+	/* Device information. */
+	static bool get_device_name(cl_device_id device_id,
+	                            string *device_name,
+	                            cl_int* error = NULL);
+
+	static string get_device_name(cl_device_id device_id);
+
+	static bool get_device_type(cl_device_id device_id,
+	                            cl_device_type *device_type,
+	                            cl_int* error = NULL);
+	static cl_device_type get_device_type(cl_device_id device_id);
+
+	/* Get somewhat more readable device name.
+	 * Main difference is AMD OpenCL here which only gives code name
+	 * for the regular device name. This will give more sane device
+	 * name using some extensions.
+	 */
+	static string get_readable_device_name(cl_device_id device_id);
 };

 /* Thread safe cache for contexts and programs.
@@ -248,6 +296,7 @@ public:

 	bool device_initialized;
 	string platform_name;
+	string device_name;

 	bool opencl_error(cl_int err);
 	void opencl_error(const string& message);
@@ -266,10 +315,10 @@ public:

 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLProgram*> &programs) = 0;

-	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type);
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
@@ -326,16 +375,39 @@ protected:

 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
+		ArgumentWrapper() : size(0), pointer(NULL)
+		{
+		}
+
+		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+		                                           pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+		                                              pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
+		                               pointer(&argument)
+		{
+		}
+
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value) { }
+		                                pointer(&int_value)
+		{
+		}
+
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value) { }
+		                                  pointer(&float_value)
+		{
+		}
+
 		size_t size;
 		int int_value;
 		float float_value;
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -16,15 +16,15 @@

 #ifdef WITH_OPENCL

-#include "opencl.h"
+#include "device/opencl/opencl.h"

-#include "kernel_types.h"
+#include "kernel/kernel_types.h"

-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"

 CCL_NAMESPACE_BEGIN

@@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
+	device_name = platform_device.device_name;
 	VLOG(2) << "Creating new Cycles device for OpenCL platform "
 	        << platform_name << ", device "
-	        << platform_device.device_name << ".";
+	        << device_name << ".";

 	{
 		/* try to use cached context */
@@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	}

 	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating command queue");
 		return;
+	}

 	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating memory buffer for NULL");
 		return;
+	}

 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
@@ -147,10 +152,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
 	const void * /*private_info*/, size_t /*cb*/, void *user_data)
 {
-	char name[256];
-	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-
-	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+	string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
+	fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
 }

 bool OpenCLDeviceBase::opencl_version_check()
@@ -191,6 +194,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)

 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
 {
+	VLOG(2) << "Loading kernels for platform " << platform_name
+	        << ", device " << device_name << ".";
 	/* Verify if device was initialized. */
 	if(!device_initialized) {
 		fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -206,11 +211,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program.add_kernel(ustring("convert_to_half_float"));
 	base_program.add_kernel(ustring("shader"));
 	base_program.add_kernel(ustring("bake"));
+	base_program.add_kernel(ustring("zero_buffer"));

 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
 	/* Call actual class to fill the vector with its programs. */
-	load_kernels(requested_features, programs);
+	if(!load_kernels(requested_features, programs)) {
+		return false;
+	}

 	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
 	 * serialize the calls internally, so it's not much use right now.
@@ -242,8 +250,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	return true;
 }

-void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type)
 {
+	if(name) {
+		VLOG(1) << "Buffer allocate: " << name << ", "
+			    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			    << string_human_readable_size(mem.memory_size()) << ")";
+	}
+
 	size_t size = mem.memory_size();

 	cl_mem_flags mem_flag;
@@ -311,8 +325,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 void OpenCLDeviceBase::mem_zero(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
-		mem_copy_to(mem);
+		if(base_program.is_loaded()) {
+			cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+
+			size_t global_size[] = {1024, 1024};
+			size_t num_threads = global_size[0] * global_size[1];
+
+			cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
+			cl_ulong d_offset = 0;
+			cl_ulong d_size = 0;
+
+			while(d_offset < mem.memory_size()) {
+				d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+
+				kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+
+				ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+				                               ckZeroBuffer,
+				                               2,
+				                               NULL,
+				                               global_size,
+				                               NULL,
+				                               0,
+				                               NULL,
+				                               NULL);
+				opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+
+				d_offset += d_size;
+			}
+		}
+
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
+
+		if(!base_program.is_loaded()) {
+			void* zero = (void*)mem.data_pointer;
+
+			if(!mem.data_pointer) {
+				zero = util_aligned_malloc(mem.memory_size(), 16);
+				memset(zero, 0, mem.memory_size());
+			}
+
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,
+			                                   0,
+			                                   mem.memory_size(),
+			                                   zero,
+			                                   0,
+			                                   NULL, NULL));
+
+			if(!mem.data_pointer) {
+				util_aligned_free(zero);
+			}
+		}
 	}
 }

@@ -337,7 +404,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 		device_vector<uchar> *data = new device_vector<uchar>();
 		data->copy((uchar*)host, size);

-		mem_alloc(*data, MEM_READ_ONLY);
+		mem_alloc(name, *data, MEM_READ_ONLY);
 		i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
 	}
 	else {
@@ -356,7 +423,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name,
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(mem, MEM_READ_ONLY);
+	mem_alloc(NULL, mem, MEM_READ_ONLY);
 	mem_copy_to(mem);
 	assert(mem_map.find(name) == mem_map.end());
 	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
@@ -460,7 +527,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_

 #define KERNEL_TEX(type, ttype, name) \
 set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX

 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
@@ -511,7 +578,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)

 #define KERNEL_TEX(type, ttype, name) \
 	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX

 	start_arg_index += kernel_set_args(kernel,
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -16,15 +16,15 @@

 #ifdef WITH_OPENCL

-#include "opencl.h"
+#include "device/opencl/opencl.h"

-#include "buffers.h"
+#include "render/buffers.h"

-#include "kernel_types.h"
+#include "kernel/kernel_types.h"

-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"

 CCL_NAMESPACE_BEGIN

@@ -43,11 +43,12 @@ public:
 		return true;
 	}

-	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
 		programs.push_back(&path_trace_program);
+		return true;
 	}

 	~OpenCLDeviceMegaKernel()
@@ -83,7 +84,7 @@ public:

 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX

 		start_arg_index += kernel_set_args(ckPathTraceKernel,
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,11 +16,12 @@

 #ifdef WITH_OPENCL

-#include "opencl.h"
+#include "device/opencl/opencl.h"

-#include "util_logging.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"

 using std::cerr;
 using std::endl;
@@ -234,7 +235,7 @@ string OpenCLCache::get_kernel_md5()
 	thread_scoped_lock lock(self.kernel_md5_lock);

 	if(self.kernel_md5.empty()) {
-		self.kernel_md5 = path_files_md5_hash(path_get("kernel"));
+		self.kernel_md5 = path_files_md5_hash(path_get("source"));
 	}
 	return self.kernel_md5;
 }
@@ -309,6 +310,8 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 	string build_options;
 	build_options = device->kernel_build_options(debug_src) + kernel_build_options;

+	VLOG(1) << "Build options passed to clBuildProgram: '"
+	        << build_options << "'.";
 	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);

 	/* show warnings even if build is successful */
@@ -336,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)

 bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
 	/* We compile kernels consisting of many files. unfortunately OpenCL
 	 * kernel caches do not seem to recognize changes in included files.
 	 * so we force recompile on changes by adding the md5 hash of all files.
 	 */
-	source = path_source_replace_includes(source, path_get("kernel"));
+	source = path_source_replace_includes(source, path_get("source"));
+	source += "\n// " + util_md5_string(source) + "\n";

 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -352,10 +356,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;

 	program = clCreateProgramWithSource(device->cxContext,
-	                                   1,
-	                                   &source_str,
-	                                   &source_len,
-	                                   &ciErr);
+	                                    1,
+	                                    &source_str,
+	                                    &source_len,
+	                                    &ciErr);

 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -438,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load()
 	if(!program) {
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);

-		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		/* need to create source to get md5 */
+		string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+		source = path_source_replace_includes(source, path_get("source"));
+
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
 		basename = path_cache_get(path_join("kernels", basename));
 		string clbin = basename + ".clbin";

@@ -544,6 +552,11 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }

+bool OpenCLInfo::use_single_program()
+{
+	return DebugFlags().opencl.single_program;
+}
+
 bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
@@ -587,11 +600,20 @@ bool OpenCLInfo::device_supported(const string& platform_name,
                                  const cl_device_id device_id)
 {
 	cl_device_type device_type;
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_TYPE,
-	                sizeof(cl_device_type),
-	                &device_type,
-	                NULL);
+	if(!get_device_type(device_id, &device_type)) {
+		return false;
+	}
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return false;
+	}
+	/* It is possible to have Iris GPU on AMD/Apple OpenCL framework
+	 * (aka, it will not be on Intel framework). This isn't supported
+	 * and needs an explicit blacklist.
+	 */
+	if(strstr(device_name.c_str(), "Iris")) {
+		return false;
+	}
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
@@ -705,39 +727,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		return;
 	}

+	cl_int error;
 	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
 	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;

-	/* Get devices. */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
-	   num_platforms == 0)
-	{
+	/* Get platforms. */
+	if(!get_platforms(&platform_ids, &error)) {
+		FIRST_VLOG(2) << "Error fetching platforms:"
+		              << string(clewErrorString(error));
+		first_time = false;
+		return;
+	}
+	if(platform_ids.size() == 0) {
 		FIRST_VLOG(2) << "No OpenCL platforms were found.";
 		first_time = false;
 		return;
 	}
-	platform_ids.resize(num_platforms);
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
-		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
-		first_time = false;
-		return;
-	}
 	/* Devices are numbered consecutively across platforms. */
-	for(int platform = 0; platform < num_platforms; platform++) {
+	for(int platform = 0; platform < platform_ids.size(); platform++) {
 		cl_platform_id platform_id = platform_ids[platform];
-		char pname[256];
-		if(clGetPlatformInfo(platform_id,
-		                     CL_PLATFORM_NAME,
-		                     sizeof(pname),
-		                     &pname,
-		                     NULL) != CL_SUCCESS)
-		{
+		string platform_name;
+		if(!get_platform_name(platform_id, &platform_name)) {
 			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
 			continue;
 		}
-		string platform_name = pname;
 		FIRST_VLOG(2) << "Enumerating devices for platform "
 		              << platform_name << ".";
 		if(!platform_version_check(platform_id)) {
@@ -745,39 +758,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			              << " due to too old compiler version.";
 			continue;
 		}
-		num_devices = 0;
-		cl_int ciErr;
-		if((ciErr = clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		if(!get_platform_devices(platform_id,
+		                         device_type,
+		                         &device_ids,
+		                         &error))
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			              << ", failed to fetch of devices: "
+			              << string(clewErrorString(error));
 			continue;
 		}
-		device_ids.resize(num_devices);
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  num_devices,
-		                  &device_ids[0],
-		                  NULL) != CL_SUCCESS)
-		{
+		if(device_ids.size() == 0) {
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch devices list.";
+			              << ", it has no devices.";
 			continue;
 		}
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char device_name[1024] = "\0";
-			if(clGetDeviceInfo(device_id,
-			                   CL_DEVICE_NAME,
-			                   sizeof(device_name),
-			                   &device_name,
-			                   NULL) != CL_SUCCESS)
-			{
-				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+		for(int num = 0; num < device_ids.size(); num++) {
+			const cl_device_id device_id = device_ids[num];
+			string device_name;
+			if(!get_device_name(device_id, &device_name, &error)) {
+				FIRST_VLOG(2) << "Failed to fetch device name: "
+				              << string(clewErrorString(error))
+				              << ", ignoring.";
 				continue;
 			}
 			if(!device_version_check(device_id)) {
@@ -789,24 +791,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			   device_supported(platform_name, device_id))
 			{
 				cl_device_type device_type;
-				if(clGetDeviceInfo(device_id,
-				                   CL_DEVICE_TYPE,
-				                   sizeof(cl_device_type),
-				                   &device_type,
-				                   NULL) != CL_SUCCESS)
-				{
+				if(!get_device_type(device_id, &device_type, &error)) {
 					FIRST_VLOG(2) << "Ignoring device " << device_name
-					              << ", failed to fetch device type.";
+					              << ", failed to fetch device type:"
+					              << string(clewErrorString(error));
 					continue;
 				}
-				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				string readable_device_name =
+				        get_readable_device_name(device_id);
+				if(readable_device_name != device_name) {
+					FIRST_VLOG(2) << "Using more readable device name: "
+					              << readable_device_name;
+				}
+				FIRST_VLOG(2) << "Adding new device "
+				              << readable_device_name << ".";
 				string hardware_id = get_hardware_id(platform_name, device_id);
-				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
-				                                               platform_name,
-				                                               device_id,
-				                                               device_type,
-				                                               device_name,
-				                                               hardware_id));
+				usable_devices->push_back(OpenCLPlatformDevice(
+				        platform_id,
+				        platform_name,
+				        device_id,
+				        device_type,
+				        readable_device_name,
+				        hardware_id));
 			}
 			else {
 				FIRST_VLOG(2) << "Ignoring device " << device_name
@@ -817,6 +823,252 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 	first_time = false;
 }

+/* Replace the contents of platform_ids with all available OpenCL platforms.
+ * Having no platforms at all is a success which yields an empty vector.
+ * When error is non-NULL it receives the CL status of the query; the return
+ * value is true only if that status is CL_SUCCESS. */
+bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids,
+                               cl_int *error)
+{
+	platform_ids->resize(0);
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms, error)) {
+		return false;
+	}
+	if(num_platforms == 0) {
+		/* *error is already CL_SUCCESS; at(0) below would throw here. */
+		return true;
+	}
+	/* Get actual platforms. */
+	platform_ids->resize(num_platforms);
+	cl_int err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL);
+	if(error != NULL) {
+		*error = err;
+	}
+	return err == CL_SUCCESS;
+}
+
+/* Unchecked overload: the status of the query is discarded. */
+vector<cl_platform_id> OpenCLInfo::get_platforms()
+{
+	vector<cl_platform_id> platform_ids;
+	get_platforms(&platform_ids);
+	return platform_ids;
+}
+
+/* Query the number of available OpenCL platforms. When error is non-NULL
+ * it receives the CL status of the query; on failure *num_platforms is
+ * reset to 0 and false is returned. */
+bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
+{
+	const cl_int status = clGetPlatformIDs(0, NULL, num_platforms);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(status == CL_SUCCESS) {
+		return true;
+	}
+	*num_platforms = 0;
+	return false;
+}
+
+/* Unchecked overload: any failure is reported as zero platforms and the
+ * status of the query is discarded. */
+cl_uint OpenCLInfo::get_num_platforms()
+{
+	cl_uint count = 0;
+	const bool ok = get_num_platforms(&count);
+	return ok ? count : 0;
+}
+
+/* Fetch the CL_PLATFORM_NAME string of platform_id into *platform_name.
+ * When the query fails the string is cleared and false is returned;
+ * the CL status itself is not exposed by this getter. */
+bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
+                                   string *platform_name)
+{
+	char buffer[256];
+	const cl_int status = clGetPlatformInfo(platform_id,
+	                                        CL_PLATFORM_NAME,
+	                                        sizeof(buffer),
+	                                        &buffer,
+	                                        NULL);
+	const bool ok = (status == CL_SUCCESS);
+	*platform_name = ok ? buffer : "";
+	return ok;
+}
+
+/* Unchecked overload: any failure is reported as an empty name, so the
+ * caller cannot tell a failed query from a platform without a name. */
+string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
+{
+	string name;
+	const bool ok = get_platform_name(platform_id, &name);
+	return ok ? name : string();
+}
+
+/* Query how many devices of device_type exist on platform_id. When error
+ * is non-NULL it receives the CL status of the query; on failure
+ * *num_devices is reset to 0 and false is returned. */
+bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                          cl_device_type device_type,
+                                          cl_uint *num_devices,
+                                          cl_int *error)
+{
+	const cl_int status = clGetDeviceIDs(platform_id,
+	                                     device_type,
+	                                     0,
+	                                     NULL,
+	                                     num_devices);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(status == CL_SUCCESS) {
+		return true;
+	}
+	*num_devices = 0;
+	return false;
+}
+
+/* Unchecked overload: any failure is reported as zero devices. */
+cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                             cl_device_type device_type)
+{
+	cl_uint count = 0;
+	const bool ok = get_num_platform_devices(platform_id,
+	                                         device_type,
+	                                         &count);
+	if(!ok) {
+		return 0;
+	}
+	return count;
+}
+
+/* Replace the contents of device_ids with the devices of device_type
+ * found on platform_id. When error is non-NULL it receives the CL status
+ * of the query; true is returned only if that status is CL_SUCCESS. */
+bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                      cl_device_type device_type,
+                                      vector<cl_device_id> *device_ids,
+                                      cl_int* error)
+{
+	device_ids->resize(0);
+	/* Get number of devices to pre-allocate memory. */
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices,
+	                             error))
+	{
+		return false;
+	}
+	if(num_devices == 0) {
+		/* *error is already CL_SUCCESS; at(0) below would throw here. */
+		return true;
+	}
+	/* Get actual device list. */
+	device_ids->resize(num_devices);
+	cl_int err = clGetDeviceIDs(platform_id,
+	                            device_type,
+	                            num_devices,
+	                            &device_ids->at(0),
+	                            NULL);
+	if(error != NULL) {
+		*error = err;
+	}
+	return err == CL_SUCCESS;
+}
+
+/* Unchecked overload: the status of the query is discarded. */
+vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                                      cl_device_type device_type)
+{
+	vector<cl_device_id> devices;
+	get_platform_devices(platform_id, device_type, &devices);
+	return devices;
+}
+
+/* Fetch the CL_DEVICE_NAME string of device_id into *device_name. When
+ * error is non-NULL it receives the CL status of the query; on failure
+ * the string is cleared and false is returned. */
+bool OpenCLInfo::get_device_name(cl_device_id device_id,
+                                 string *device_name,
+                                 cl_int* error)
+{
+	char buffer[1024];
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_NAME,
+	                                      sizeof(buffer),
+	                                      &buffer,
+	                                      NULL);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(status != CL_SUCCESS) {
+		*device_name = "";
+		return false;
+	}
+	*device_name = buffer;
+	return true;
+}
+
+/* Unchecked overload: any failure is reported as an empty name. */
+string OpenCLInfo::get_device_name(cl_device_id device_id)
+{
+	string name;
+	if(get_device_name(device_id, &name)) {
+		return name;
+	}
+	return string();
+}
+
+/* Fetch CL_DEVICE_TYPE of device_id into *device_type. When error is
+ * non-NULL it receives the CL status of the query; on failure
+ * *device_type is reset to 0 and false is returned. */
+bool OpenCLInfo::get_device_type(cl_device_id device_id,
+                                 cl_device_type *device_type,
+                                 cl_int* error)
+{
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_TYPE,
+	                                      sizeof(cl_device_type),
+	                                      device_type,
+	                                      NULL);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(status == CL_SUCCESS) {
+		return true;
+	}
+	*device_type = 0;
+	return false;
+}
+
+/* Unchecked overload: any failure is reported as device type 0. */
+cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
+{
+	cl_device_type type = 0;
+	if(get_device_type(device_id, &type)) {
+		return type;
+	}
+	return 0;
+}
+
+/* Return the CL_DEVICE_BOARD_NAME_AMD string of device_id when that query
+ * succeeds, otherwise whatever get_device_name() reports. */
+string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
+{
+	char board_name[1024];
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_BOARD_NAME_AMD,
+	                                      sizeof(board_name),
+	                                      &board_name,
+	                                      NULL);
+	/* Fallback to standard device name API when the query fails. */
+	return (status == CL_SUCCESS) ? string(board_name)
+	                              : get_device_name(device_id);
+}
+
 CCL_NAMESPACE_END

 #endif
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -1,7 +1,6 @@

 set(INC
-	.
-	../util
+	..
 )

 set(SRC
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 */

-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"

-#include "util_foreach.h"
-#include "util_param.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,11 +16,11 @@

 #pragma once

-#include "node_type.h"
+#include "graph/node_type.h"

-#include "util_map.h"
-#include "util_param.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -16,8 +16,8 @@

 #pragma once

-#include "util_map.h"
-#include "util_param.h"
+#include "util/util_map.h"
+#include "util/util_param.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -14,9 +14,9 @@
 * limitations under the License.
 */

-#include "node_type.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "graph/node_type.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -16,12 +16,12 @@

 #pragma once

-#include "node_enum.h"
+#include "graph/node_enum.h"

-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -14,11 +14,11 @@
 * limitations under the License.
 */

-#include "node_xml.h"
+#include "graph/node_xml.h"

-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/graph/node_xml.h
+++ b/intern/cycles/graph/node_xml.h
@@ -16,11 +16,11 @@

 #pragma once

-#include "node.h"
+#include "graph/node.h"

-#include "util_map.h"
-#include "util_string.h"
-#include "util_xml.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
+#include "util/util_xml.h"

 CCL_NAMESPACE_BEGIN

--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,10 +1,7 @@
 remove_extra_strict_flags()

 set(INC
-	.
-	../util
-	osl
-	svm
+	..
 )

 set(INC_SYS
@@ -13,19 +10,28 @@ set(INC_SYS

 set(SRC
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_split.cpp
 	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_state_buffer_size.cl
+	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_do_volume.cl
+	kernels/opencl/kernel_indirect_background.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_shadow_blocked_ao.cl
+	kernels/opencl/kernel_shadow_blocked_dl.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/opencl/kernel_indirect_subsurface.cl
+	kernels/opencl/kernel_buffer_update.cl
 	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
 )

 set(SRC_BVH_HEADERS
@@ -68,6 +74,7 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
+	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -88,6 +95,10 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu_image.h
 )

+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+)
+
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@@ -182,6 +193,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
+	../util/util_math_intersect.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
@@ -189,17 +201,25 @@ set(SRC_UTIL_HEADERS
 )

 set(SRC_SPLIT_HEADERS
-	split/kernel_background_buffer_update.h
+	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
+	split/kernel_do_volume.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_indirect_background.h
+	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
+	split/kernel_path_init.h
+	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked.h
+	split/kernel_shadow_blocked_ao.h
+	split/kernel_shadow_blocked_dl.h
 	split/kernel_split_common.h
-	split/kernel_sum_all_radiance.h
+	split/kernel_split_data.h
+	split/kernel_split_data_types.h
+	split/kernel_subsurface_scatter.h
 )

 # CUDA module
@@ -227,8 +247,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()

 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
@@ -237,15 +258,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	)
 	set(cuda_cubins)

-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
-		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+		if(${split})
+			set(cuda_extra_flags "-D__SPLIT__")
+			set(cuda_cubin kernel_split)
 		else()
 			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(cuda_cubin kernel)
 		endif()

+		if(${experimental})
+			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+			set(cuda_cubin ${cuda_cubin}_experimental)
+		endif()
+
+		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@@ -258,13 +286,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")

+		if(${split})  # macro parameters are text substitutions, not variables: expand explicitly
+			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+		else()
+			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+		endif()
+
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -272,8 +306,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 					${cuda_math_flags}
 					${cuda_extra_flags}
 					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-I${CMAKE_CURRENT_SOURCE_DIR}/..
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
@@ -291,7 +324,12 @@ if(WITH_CYCLES_CUDA_BINARIES)

 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+			# Compile split kernel
+			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+		endif()
 	endforeach()

 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -309,36 +347,50 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

+set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+
 if(CXX_HAS_SSE)
 	list(APPEND SRC
 		kernels/cpu/kernel_sse2.cpp
 		kernels/cpu/kernel_sse3.cpp
 		kernels/cpu/kernel_sse41.cpp
+		kernels/cpu/kernel_split_sse2.cpp
+		kernels/cpu/kernel_split_sse3.cpp
+		kernels/cpu/kernel_split_sse41.cpp
 	)

 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()

 if(CXX_HAS_AVX)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx.cpp
+		kernels/cpu/kernel_split_avx.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()

 if(CXX_HAS_AVX2)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx2.cpp
+		kernels/cpu/kernel_split_avx2.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()

 add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
@@ -360,24 +412,33 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)

-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)

--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -27,43 +27,43 @@

 CCL_NAMESPACE_BEGIN

-#include "bvh_types.h"
+#include "kernel/bvh/bvh_types.h"

 /* Common QBVH functions. */
 #ifdef __QBVH__
-#  include "qbvh_nodes.h"
+#  include "kernel/bvh/qbvh_nodes.h"
 #endif

 /* Regular BVH traversal */

-#include "bvh_nodes.h"
+#include "kernel/bvh/bvh_nodes.h"

 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
-#include "bvh_traversal.h"
+#include "kernel/bvh/bvh_traversal.h"

 #if defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_instancing
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif

 #if defined(__HAIR__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif

 #if defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif

 #if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif

 /* Subsurface scattering BVH traversal */
@@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SUBSURFACE__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_subsurface.h"
+#  include "kernel/bvh/bvh_subsurface.h"

 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
 #    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#    include "bvh_subsurface.h"
+#    include "kernel/bvh/bvh_subsurface.h"
 #  endif
 #endif  /* __SUBSURFACE__ */

@@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume.h"
+#  include "kernel/bvh/bvh_volume.h"

 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif

 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 #endif  /* __VOLUME__ */

@@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SHADOW_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
-#  include "bvh_shadow_all.h"
+#  include "kernel/bvh/bvh_shadow_all.h"

 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif

 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif

 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif

 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 #endif  /* __SHADOW_RECORD_ALL__ */

@@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume_all.h"
+#  include "kernel/bvh/bvh_volume_all.h"

 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif

 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 #endif  /* __VOLUME_RECORD_ALL__ */

@@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 }

 #ifdef __SUBSURFACE__
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
+                                                     const Ray ray,
                                                     SubsurfaceIntersection *ss_isect,
                                                     int subsurface_object,
                                                     uint *lcg_state,
@@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
+		                                       &ray,
 		                                       ss_isect,
 		                                       subsurface_object,
 		                                       lcg_state,
@@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 	}
 #endif /* __OBJECT_MOTION__ */
 	return bvh_intersect_subsurface(kg,
-	                                ray,
+	                                &ray,
 	                                ss_isect,
 	                                subsurface_object,
 	                                lcg_state,
@@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #endif

 #ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     int skip_object,  /* Forwarded unchanged to the selected BVH traversal. */
+                                                     uint max_hits,
+                                                     uint *num_hits)
 {
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg,
+			                                            ray,
+			                                            isect,
+			                                            skip_object,
+			                                            max_hits,
+			                                            num_hits);
+		}
 #    endif /* __HAIR__ */

-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+		return bvh_intersect_shadow_all_motion(kg,
+		                                       ray,
+		                                       isect,
+		                                       skip_object,
+		                                       max_hits,
+		                                       num_hits);
 	}
 #  endif /* __OBJECT_MOTION__ */

 #  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg,
+		                                     ray,
+		                                     isect,
+		                                     skip_object,
+		                                     max_hits,
+		                                     num_hits);
+	}
 #  endif /* __HAIR__ */

 #  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg,
+		                                           ray,
+		                                           isect,
+		                                           skip_object,
+		                                           max_hits,
+		                                           num_hits);
+	}
 #  endif /* __INSTANCING__ */

-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_shadow_all(kg,  /* Reached when none of the specialised variants above returned. */
+	                                ray,
+	                                isect,
+	                                skip_object,
+	                                max_hits,
+	                                num_hits);
 }
 #endif  /* __SHADOW_RECORD_ALL__ */

@@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }

-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -373,5 +407,38 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif

-CCL_NAMESPACE_END
+#if defined(__SHADOW_RECORD_ALL__)
+/* Sort the first num_hits entries of hits in place by ascending t.
+ *
+ * GPU builds run a bubble sort; all other builds call qsort() with
+ * intersections_compare(). Arrays with fewer than two entries are left
+ * untouched.
+ */
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+#ifdef __KERNEL_GPU__
+	/* Use bubble sort which has more friendly memory pattern on GPU. */
+	bool swapped;
+	do {
+		swapped = false;
+		/* num_hits is unsigned, so `num_hits - 1` would wrap around for an
+		 * empty array and index far past the end of hits; comparing
+		 * `j + 1` against num_hits avoids the subtraction. */
+		for(uint j = 0; j + 1 < num_hits; ++j) {
+			if(hits[j].t > hits[j + 1].t) {
+				struct Intersection tmp = hits[j];
+				hits[j] = hits[j + 1];
+				hits[j + 1] = tmp;
+				swapped = true;
+			}
+		}
+		/* The largest remaining entry is now last; skip it on the next pass. */
+		--num_hits;
+	} while(swapped);
+#else
+	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+#endif
+}
+#endif  /* __SHADOW_RECORD_ALL__ */

+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -17,8 +17,8 @@
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int node_addr,
-                                                           int child)
+                                                                int node_addr,
+                                                                int child)
 {
 	Transform space;
 	const int child_addr = node_addr + child * 3;
@@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k

 #if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int node_addr,
-                                                 const uint visibility,
-                                                 float dist[2])
+                                                      const float3 P,
+                                                      const float3 idir,
+                                                      const float t,
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {

 	/* fetch node data */
@@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 }

 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
+                                                             const float3 P,
+                                                             const float3 idir,
+                                                             const float t,
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int node_addr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {

 	/* fetch node data */
@@ -203,13 +203,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
 }

 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }

 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float3 idir,
+                                                               const float t,
+                                                               const float difl,
+                                                               const float extmax,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }

 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3 P,
+                                              const float3 dir,
+                                              const float3 idir,
+                                              const float t,
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }

 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3 P,
+                                                     const float3 dir,
+                                                     const float3 idir,
+                                                     const float t,
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(
 }

 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& isect_near,
-                                                   const ssef& isect_far,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const ssef& isect_near,
+                                                        const ssef& isect_far,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);

 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }

 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& isect_near,
-                                                          const ssef& isect_far,
-                                                          const float difl,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const ssef& isect_near,
+                                                               const ssef& isect_far,
+                                                               const float difl,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);

 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }

 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& isect_near,
-                                         const ssef& isect_far,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3& P,
+                                              const float3& dir,
+                                              const ssef& isect_near,
+                                              const ssef& isect_far,
+                                              const ssef& tsplat,
+                                              const ssef Psplat[3],
+                                              const ssef idirsplat[3],
+                                              const shuffle_swap_t shufflexyz[3],
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }

 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& isect_near,
-                                                const ssef& isect_far,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3& P,
+                                                     const float3& dir,
+                                                     const ssef& isect_near,
+                                                     const ssef& isect_far,
+                                                     const ssef& tsplat,
+                                                     const ssef Psplat[3],
+                                                     const ssef idirsplat[3],
+                                                     const shuffle_swap_t shufflexyz[3],
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -18,7 +18,7 @@
 */

 #ifdef __QBVH__
-#  include "qbvh_shadow_all.h"
+#  include "kernel/bvh/qbvh_shadow_all.h"
 #endif

 #if BVH_FEATURE(BVH_HAIR)
@@ -45,6 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                 const Ray *ray,
                                 Intersection *isect_array,
+                                 const int skip_object,
                                 const uint max_hits,
                                 uint *num_hits)
 {
@@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */

-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -189,6 +187,16 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);

+#ifdef __SHADOW_TRICKS__
+						uint tri_object = (object == OBJECT_NONE)
+						        ? kernel_tex_fetch(__prim_object, prim_addr)
+						        : object;
+						if(tri_object == skip_object) {
+							++prim_addr;
+							continue;
+						}
+#endif
+
 						bool hit;

 						/* todo: specialized intersect functions which don't fill in
@@ -198,9 +206,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         PATH_RAY_SHADOW,
 								                         object,
 								                         prim_addr);
@@ -309,12 +317,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);

 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif

-					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;

@@ -354,22 +361,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif

-				triangle_intersect_precalc(dir, &isect_precalc);
-
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}

 			isect_t = tmax;
@@ -400,6 +402,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                         const Ray *ray,
                                         Intersection *isect_array,
+                                         const int skip_object,
                                         const uint max_hits,
                                         uint *num_hits)
 {
@@ -408,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
 		                                    ray,
 		                                    isect_array,
+		                                    skip_object,
 		                                    max_hits,
 		                                    num_hits);
 	}
@@ -418,6 +422,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
 		                                   ray,
 		                                   isect_array,
+		                                   skip_object,
 		                                   max_hits,
 		                                   num_hits);
 	}
--- a/Show More
+++ b/Show More