Turn on Decklink by default

Merge remote-tracking branch 'origin/master' into decklink
Cleanup: Add comment on behavior of tweak events
2016-06-08 00:08:43 +02:00 · 2016-06-07 23:30:03 +02:00 · 2016-06-07 23:13:27 +02:00 · 2016-06-07 22:37:31 +02:00 · 2016-06-07 21:53:17 +02:00 · 2016-06-08 05:40:21 +10:00
811 changed files with 54064 additions and 17363 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,7 +172,6 @@ if(UNIX AND NOT APPLE)
 	set(_init_CODEC_FFMPEG                   OFF)
 	set(_init_CYCLES_OSL                     OFF)
 	set(_init_IMAGE_OPENEXR                  OFF)
-	set(_init_INPUT_NDOF                     OFF)
 	set(_init_JACK                           OFF)
 	set(_init_OPENCOLLADA                    OFF)
 	set(_init_OPENCOLORIO                    OFF)
@@ -218,6 +217,10 @@ if(${CMAKE_VERSION} VERSION_LESS 2.8.8)
 	# add_library OBJECT arg unsupported
 	set(WITH_BUILDINFO OFF)
 endif()
+set(BUILDINFO_OVERRIDE_DATE "" CACHE STRING "Use instead of the current date for reproducible builds (empty string disables this option)")
+set(BUILDINFO_OVERRIDE_TIME "" CACHE STRING "Use instead of the current time for reproducible builds (empty string disables this option)")
+mark_as_advanced(BUILDINFO_OVERRIDE_DATE)
+mark_as_advanced(BUILDINFO_OVERRIDE_TIME)

 option(WITH_IK_ITASC      "Enable ITASC IK solver (only disable for development & for incompatible C++ compilers)" ON)
 option(WITH_IK_SOLVER     "Enable Legacy IK solver (only disable for development)" ON)
@@ -227,6 +230,7 @@ option(WITH_SYSTEM_BULLET "Use the systems bullet library (currently unsupported
 mark_as_advanced(WITH_SYSTEM_BULLET)
 option(WITH_GAMEENGINE    "Enable Game Engine" ${_init_GAMEENGINE})
 option(WITH_PLAYER        "Build Player" OFF)
+option(WITH_DECKLINK      "Support BlackMagicDesign DeckLink cards in the BGE" ON)
 option(WITH_OPENCOLORIO   "Enable OpenColorIO color management" ${_init_OPENCOLORIO})

 # Compositor
@@ -270,6 +274,7 @@ endif()
 if(WITH_X11)
 	option(WITH_X11_XINPUT    "Enable X11 Xinput (tablet support and unicode input)"  ON)
 	option(WITH_X11_XF86VMODE "Enable X11 video mode switching"                       ON)
+	option(WITH_X11_ALPHA     "Enable X11 transparent background"                     ON)
 endif()

 if(UNIX AND NOT APPLE)
@@ -474,9 +479,19 @@ if(WIN32)
 endif()

 # Experimental support of C11 and C++11
-option(WITH_C11 "Build with C11 standard enabled, for development use only!" OFF)
+#
+# We default these options to whatever standard the current compiler uses by default.
+if(CMAKE_COMPILER_IS_GNUCC AND (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "6.0") AND (NOT WITH_CXX11))
+	set(_c11_init ON)
+	set(_cxx11_init ON)
+else()
+	set(_c11_init OFF)
+	set(_cxx11_init OFF)
+endif()
+
+option(WITH_C11 "Build with C11 standard enabled, for development use only!" ${_c11_init})
 mark_as_advanced(WITH_C11)
-option(WITH_CXX11 "Build with C++11 standard enabled, for development use only!" OFF)
+option(WITH_CXX11 "Build with C++11 standard enabled, for development use only!" ${_cxx11_init})
 mark_as_advanced(WITH_CXX11)

 # Dependency graph
@@ -515,8 +530,8 @@ if(APPLE)

 	if(NOT CMAKE_OSX_ARCHITECTURES)
 		set(CMAKE_OSX_ARCHITECTURES x86_64 CACHE STRING
-		"Choose the architecture you want to build Blender for: i386, x86_64 or ppc"
-		FORCE)
+			"Choose the architecture you want to build Blender for: i386, x86_64 or ppc"
+			FORCE)
 	endif()

 	if(NOT DEFINED OSX_SYSTEM)
@@ -526,15 +541,20 @@ if(APPLE)
 		        OUTPUT_STRIP_TRAILING_WHITESPACE)
 	endif()

-	# workaround for incorrect cmake xcode lookup for developer previews - XCODE_VERSION does not take xcode-select path into accout
-	# but would always look into /Applications/Xcode.app while dev versions are named Xcode<version>-DP<preview_number>
-	execute_process(COMMAND xcode-select --print-path  OUTPUT_VARIABLE XCODE_CHECK OUTPUT_STRIP_TRAILING_WHITESPACE)
+	# workaround for incorrect cmake xcode lookup for developer previews - XCODE_VERSION does not
+	# take xcode-select path into account but would always look into /Applications/Xcode.app
+	# while dev versions are named Xcode<version>-DP<preview_number>
+	execute_process(
+	        COMMAND xcode-select --print-path
+	        OUTPUT_VARIABLE XCODE_CHECK OUTPUT_STRIP_TRAILING_WHITESPACE)
 	string(REPLACE "/Contents/Developer" "" XCODE_BUNDLE ${XCODE_CHECK}) # truncate to bundlepath in any case
 	
 	if(${CMAKE_GENERATOR} MATCHES "Xcode")
 	
-		if(${XCODE_VERSION} VERSION_GREATER 4.2) # earlier xcode has no bundled developer dir, no sense in getting xcode path from
-			string(SUBSTRING "${XCODE_CHECK}" 14 6 DP_NAME) # reduce to XCode name without dp extension
+		# earlier xcode has no bundled developer dir, no sense in getting xcode path from
+		if(${XCODE_VERSION} VERSION_GREATER 4.2) 
+			# reduce to XCode name without dp extension
+			string(SUBSTRING "${XCODE_CHECK}" 14 6 DP_NAME) 
 			if(${DP_NAME} MATCHES Xcode5)
 				set(XCODE_VERSION 5)
 			endif()
@@ -561,25 +581,30 @@ if(APPLE)
 	message(STATUS "Detected OS X ${OSX_SYSTEM} and Xcode ${XCODE_VERSION} at ${XCODE_BUNDLE}")

 	if(${XCODE_VERSION} VERSION_LESS 4.3)
-		set(CMAKE_OSX_SYSROOT /Developer/SDKs/MacOSX${OSX_SYSTEM}.sdk CACHE PATH "" FORCE)  # use guaranteed existing sdk
+		# use guaranteed existing sdk
+		set(CMAKE_OSX_SYSROOT /Developer/SDKs/MacOSX${OSX_SYSTEM}.sdk CACHE PATH "" FORCE)
 	else()
-		# note: xcode-select path could be ambigous, cause /Applications/Xcode.app/Contents/Developer or /Applications/Xcode.app would be allowed
+		# note: xcode-select path could be ambiguous,
+		# because /Applications/Xcode.app/Contents/Developer or /Applications/Xcode.app would be allowed
 		# so i use a selfcomposed bundlepath here  
 		set(OSX_SYSROOT_PREFIX ${XCODE_BUNDLE}/Contents/Developer/Platforms/MacOSX.platform)
 		message(STATUS "OSX_SYSROOT_PREFIX: " ${OSX_SYSROOT_PREFIX})
 		set(OSX_DEVELOPER_PREFIX /Developer/SDKs/MacOSX${OSX_SYSTEM}.sdk) # use guaranteed existing sdk
 		set(CMAKE_OSX_SYSROOT ${OSX_SYSROOT_PREFIX}/${OSX_DEVELOPER_PREFIX} CACHE PATH "" FORCE)
 		if(${CMAKE_GENERATOR} MATCHES "Xcode")
-			set(CMAKE_XCODE_ATTRIBUTE_SDKROOT macosx${OSX_SYSTEM}) # to silence sdk not found warning, just overrides CMAKE_OSX_SYSROOT
+			# to silence sdk not found warning, just overrides CMAKE_OSX_SYSROOT
+			set(CMAKE_XCODE_ATTRIBUTE_SDKROOT macosx${OSX_SYSTEM})
 		endif()
 	endif()

 	if(OSX_SYSTEM MATCHES 10.9)
-		set(CMAKE_FIND_ROOT_PATH ${CMAKE_OSX_SYSROOT}) # make sure syslibs and headers are looked up in sdk ( expecially for 10.9 openGL atm. )
+		# make sure syslibs and headers are looked up in sdk ( especially for 10.9 openGL atm. )
+		set(CMAKE_FIND_ROOT_PATH ${CMAKE_OSX_SYSROOT})
 	endif()

 	if(NOT CMAKE_OSX_DEPLOYMENT_TARGET)
-		set(CMAKE_OSX_DEPLOYMENT_TARGET "10.6" CACHE STRING "" FORCE) # 10.6 is our min. target, if you use higher sdk, weak linking happens
+		# 10.6 is our min. target, if you use higher sdk, weak linking happens
+		set(CMAKE_OSX_DEPLOYMENT_TARGET "10.6" CACHE STRING "" FORCE)
 	endif()
 	
 	if(NOT ${CMAKE_GENERATOR} MATCHES "Xcode")
@@ -588,8 +613,6 @@ if(APPLE)
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
 		add_definitions("-DMACOSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET}")
 	endif()
-
-	option(WITH_LIBS10.5  "Use 10.5 libs (needed for 64bit builds)" OFF)
 endif()


@@ -597,7 +620,10 @@ endif()
 # Check for conflicting/unsupported configurations

 if(NOT WITH_BLENDER AND NOT WITH_PLAYER AND NOT WITH_CYCLES_STANDALONE)
-	message(FATAL_ERROR "At least one of WITH_BLENDER or WITH_PLAYER or WITH_CYCLES_STANDALONE must be enabled, nothing to do!")
+	message(FATAL_ERROR
+		"At least one of WITH_BLENDER or WITH_PLAYER or "
+		"WITH_CYCLES_STANDALONE must be enabled, nothing to do!"
+	)
 endif()

 if(NOT WITH_GAMEENGINE AND WITH_PLAYER)
@@ -661,7 +687,8 @@ if(NOT WITH_BOOST)
 	set_and_warn(WITH_OPENAL         OFF)  # depends on AUDASPACE
 	set_and_warn(WITH_GAMEENGINE     OFF)  # depends on AUDASPACE
 	set_and_warn(WITH_PLAYER         OFF)  # depends on GAMEENGINE
-elseif(WITH_CYCLES OR WITH_OPENIMAGEIO OR WITH_AUDASPACE OR WITH_INTERNATIONAL OR WITH_OPENVDB OR WITH_OPENCOLORIO OR WITH_MOD_BOOLEAN)
+elseif(WITH_CYCLES OR WITH_OPENIMAGEIO OR WITH_AUDASPACE OR WITH_INTERNATIONAL OR
+       WITH_OPENVDB OR WITH_OPENCOLORIO OR WITH_MOD_BOOLEAN)
 	# Keep enabled
 else()
 	# New dependency graph needs either Boost or C++11 for function bindings.
@@ -698,6 +725,7 @@ if(WITH_GHOST_SDL OR WITH_HEADLESS)
 	set(WITH_X11           OFF)
 	set(WITH_X11_XINPUT    OFF)
 	set(WITH_X11_XF86VMODE OFF)
+	set(WITH_X11_ALPHA     OFF)
 	set(WITH_GHOST_XDND    OFF)
 	set(WITH_INPUT_IME     OFF)
 endif()
@@ -758,18 +786,22 @@ endif()

 if(WITH_INTERNATIONAL)
 	if(NOT EXISTS "${CMAKE_SOURCE_DIR}/release/datafiles/locale/languages")
-		message(WARNING "Translation path '${CMAKE_SOURCE_DIR}/release/datafiles/locale' is missing, "
-						"This is a 'git submodule', which are known not to work with bridges to other version "
-						"control systems, disabling 'WITH_INTERNATIONAL'.")
+		message(WARNING
+			"Translation path '${CMAKE_SOURCE_DIR}/release/datafiles/locale' is missing, "
+			"This is a 'git submodule', which are known not to work with bridges to other version "
+			"control systems, disabling 'WITH_INTERNATIONAL'."
+		)
 		set(WITH_INTERNATIONAL OFF)
 	endif()
 endif()

 if(WITH_PYTHON)
 	if(NOT EXISTS "${CMAKE_SOURCE_DIR}/release/scripts/addons/modules")
-		message(WARNING "Addons path '${CMAKE_SOURCE_DIR}/release/scripts/addons' is missing, "
-						"This is a 'git submodule', which are known not to work with bridges to other version "
-						"control systems: * CONTINUING WITHOUT ADDONS *")
+		message(WARNING
+			"Addons path '${CMAKE_SOURCE_DIR}/release/scripts/addons' is missing, "
+			"This is a 'git submodule', which are known not to work with bridges to other version "
+			"control systems: * CONTINUING WITHOUT ADDONS *"
+		)
 	endif()
 endif()

@@ -803,21 +835,6 @@ set(PLATFORM_LINKFLAGS "")
 set(PLATFORM_LINKFLAGS_DEBUG "")


-# For alternate Python locations the commandline can be used to override detected/default cache settings, e.g:
-# On Unix:
-#   cmake ../blender \
-#         -D PYTHON_VERSION=3.5 \
-#         -D PYTHON_INCLUDE_DIR=/opt/py35/include/python3.5d \
-#         -D PYTHON_LIBRARY=/opt/py35/lib/libpython3.5d.so
-#
-# On Macs:
-#   cmake ../blender \
-#         -D PYTHON_INCLUDE_DIR=/System/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5 \
-#         -D PYTHON_LIBPATH=/System/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/config \
-#         -G Xcode
-#
-# When changing any of this remember to update the notes in doc/build_systems/cmake.txt
-
 #-----------------------------------------------------------------------------
 #Platform specifics

@@ -848,6 +865,16 @@ if(WITH_X11)
 		endif()
 	endif()

+	if(WITH_X11_ALPHA)  # locate Xrender, which this option links against
+		find_library(X11_Xrender_LIB Xrender  ${X11_LIB_SEARCH_PATH})
+		mark_as_advanced(X11_Xrender_LIB)
+		if(X11_Xrender_LIB)
+			list(APPEND PLATFORM_LINKLIBS ${X11_Xrender_LIB})
+		else()
+			set(WITH_X11_ALPHA OFF)  # Xrender library not found, disable the option
+		endif()
+	endif()
+
 endif()


@@ -1019,14 +1046,12 @@ if(UNIX AND NOT APPLE)

 	if(WITH_INPUT_NDOF)
 		find_package_wrapper(Spacenav)
-		if(NOT SPACENAV_FOUND)
-			set(WITH_INPUT_NDOF OFF)
-		endif()
-
-		# use generic names within blenders buildsystem.
 		if(SPACENAV_FOUND)
+			# use generic names within blenders buildsystem.
 			set(NDOF_INCLUDE_DIRS ${SPACENAV_INCLUDE_DIRS})
 			set(NDOF_LIBRARIES ${SPACENAV_LIBRARIES})
+		else()
+			set(WITH_INPUT_NDOF OFF)
 		endif()
 	endif()

@@ -1040,7 +1065,11 @@ if(UNIX AND NOT APPLE)
 			if(${OSL_LIBRARY_VERSION_MAJOR} EQUAL "1" AND ${OSL_LIBRARY_VERSION_MINOR} LESS "6")
 				# Note: --whole-archive is needed to force loading of all symbols in liboslexec,
 				# otherwise LLVM is missing the osl_allocate_closure_component function
-				set(OSL_LIBRARIES ${OSL_OSLCOMP_LIBRARY} -Wl,--whole-archive ${OSL_OSLEXEC_LIBRARY} -Wl,--no-whole-archive ${OSL_OSLQUERY_LIBRARY})
+				set(OSL_LIBRARIES
+					${OSL_OSLCOMP_LIBRARY}
+					-Wl,--whole-archive ${OSL_OSLEXEC_LIBRARY}
+					-Wl,--no-whole-archive ${OSL_OSLQUERY_LIBRARY}
+				)
 			endif()
 		else()
 			message(STATUS "OSL not found, disabling it from Cycles")
@@ -1111,7 +1140,13 @@ if(UNIX AND NOT APPLE)
 			set(PUGIXML_LIBRARIES "")
 		endif()

-		set(OPENIMAGEIO_LIBRARIES ${OPENIMAGEIO_LIBRARIES} ${PNG_LIBRARIES} ${JPEG_LIBRARIES} ${ZLIB_LIBRARIES} ${BOOST_LIBRARIES})
+		set(OPENIMAGEIO_LIBRARIES
+			${OPENIMAGEIO_LIBRARIES}
+			${PNG_LIBRARIES}
+			${JPEG_LIBRARIES}
+			${ZLIB_LIBRARIES}
+			${BOOST_LIBRARIES}
+		)
 		set(OPENIMAGEIO_LIBPATH)  # TODO, remove and reference the absolute path everywhere
 		set(OPENIMAGEIO_DEFINITIONS "")

@@ -1152,7 +1187,9 @@ if(UNIX AND NOT APPLE)

 	if(WITH_LLVM OR WITH_SDL_DYNLOAD)
 		# Fix for conflict with Mesa llvmpipe
-		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -Wl,--version-script='${CMAKE_SOURCE_DIR}/source/creator/blender.map'")
+		set(PLATFORM_LINKFLAGS
+			"${PLATFORM_LINKFLAGS} -Wl,--version-script='${CMAKE_SOURCE_DIR}/source/creator/blender.map'"
+		)
 	endif()

 	if(WITH_OPENSUBDIV)
@@ -1255,7 +1292,10 @@ elseif(WIN32)
 		set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /SAFESEH:NO")
 		set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} /SAFESEH:NO")

-		list(APPEND PLATFORM_LINKLIBS ws2_32 vfw32 winmm kernel32 user32 gdi32 comdlg32 advapi32 shfolder shell32 ole32 oleaut32 uuid psapi Dbghelp)
+		list(APPEND PLATFORM_LINKLIBS
+			ws2_32 vfw32 winmm kernel32 user32 gdi32 comdlg32
+			advapi32 shfolder shell32 ole32 oleaut32 uuid psapi Dbghelp
+		)

 		if(WITH_INPUT_IME)
 			list(APPEND PLATFORM_LINKLIBS imm32)
@@ -1293,7 +1333,8 @@ elseif(WIN32)
 		set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /MT")
 		set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} /MT")

-		set(PLATFORM_LINKFLAGS "/SUBSYSTEM:CONSOLE /STACK:2097152 /INCREMENTAL:NO /NODEFAULTLIB:msvcrt.lib /NODEFAULTLIB:msvcmrt.lib /NODEFAULTLIB:msvcurt.lib /NODEFAULTLIB:msvcrtd.lib")
+		set(PLATFORM_LINKFLAGS "/SUBSYSTEM:CONSOLE /STACK:2097152 /INCREMENTAL:NO")
+		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} /NODEFAULTLIB:msvcrt.lib /NODEFAULTLIB:msvcmrt.lib /NODEFAULTLIB:msvcurt.lib /NODEFAULTLIB:msvcrtd.lib")

 		# Ignore meaningless for us linker warnings.
 		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} /ignore:4049 /ignore:4217 /ignore:4221")
@@ -1347,9 +1388,9 @@ elseif(WIN32)
 		# Add each of our libraries to our cmake_prefix_path so find_package() could work
 		file(GLOB children RELATIVE ${LIBDIR} ${LIBDIR}/*)
 		foreach(child ${children})
-		if(IS_DIRECTORY ${LIBDIR}/${child})
-			list(APPEND CMAKE_PREFIX_PATH  ${LIBDIR}/${child})
-		endif()
+			if(IS_DIRECTORY ${LIBDIR}/${child})
+				list(APPEND CMAKE_PREFIX_PATH  ${LIBDIR}/${child})
+			endif()
 		endforeach()

 		set(ZLIB_INCLUDE_DIRS ${LIBDIR}/zlib/include)
@@ -1454,23 +1495,28 @@ elseif(WIN32)
 				set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${OPENEXR}/include/OpenEXR)
 				set(OPENEXR_LIBPATH ${OPENEXR}/lib)
 				set(OPENEXR_LIBRARIES
-					optimized ${OPENEXR_LIBPATH}/Iex-2_2.lib debug ${OPENEXR_LIBPATH}/Iex-2_2_d.lib
-					optimized ${OPENEXR_LIBPATH}/Half.lib debug ${OPENEXR_LIBPATH}/Half_d.lib
-					optimized ${OPENEXR_LIBPATH}/IlmImf-2_2.lib debug ${OPENEXR_LIBPATH}/IlmImf-2_2_d.lib
-					optimized ${OPENEXR_LIBPATH}/Imath-2_2.lib debug ${OPENEXR_LIBPATH}/Imath-2_2_d.lib
-					optimized ${OPENEXR_LIBPATH}/IlmThread-2_2.lib debug ${OPENEXR_LIBPATH}/IlmThread-2_2_d.lib
+					optimized ${OPENEXR_LIBPATH}/Iex-2_2.lib
+					optimized ${OPENEXR_LIBPATH}/Half.lib
+					optimized ${OPENEXR_LIBPATH}/IlmImf-2_2.lib
+					optimized ${OPENEXR_LIBPATH}/Imath-2_2.lib
+					optimized ${OPENEXR_LIBPATH}/IlmThread-2_2.lib
+					debug ${OPENEXR_LIBPATH}/Iex-2_2_d.lib
+					debug ${OPENEXR_LIBPATH}/Half_d.lib
+					debug ${OPENEXR_LIBPATH}/IlmImf-2_2_d.lib
+					debug ${OPENEXR_LIBPATH}/Imath-2_2_d.lib
+					debug ${OPENEXR_LIBPATH}/IlmThread-2_2_d.lib
 				)
 			endif()
 		endif()

 		if(WITH_IMAGE_TIFF)
-		# Try to find tiff first then complain and set static and maybe wrong paths
-		find_package(TIFF)
-		if(NOT TIFF_FOUND)
-			message(WARNING "Using HARDCODED libtiff locations")
-			set(TIFF_LIBRARY ${LIBDIR}/tiff/lib/libtiff.lib)
-			set(TIFF_INCLUDE_DIR ${LIBDIR}/tiff/include)
-		endif()
+			# Try to find tiff first then complain and set static and maybe wrong paths
+			find_package(TIFF)
+			if(NOT TIFF_FOUND)
+				message(WARNING "Using HARDCODED libtiff locations")
+				set(TIFF_LIBRARY ${LIBDIR}/tiff/lib/libtiff.lib)
+				set(TIFF_INCLUDE_DIR ${LIBDIR}/tiff/include)
+			endif()
 		endif()

 		if(WITH_JACK)
@@ -1525,12 +1571,17 @@ elseif(WIN32)
 					set(BOOST_DEBUG_POSTFIX "vc140-mt-sgd-1_60.lib")
 				endif()
 				set(BOOST_LIBRARIES
-					optimized libboost_date_time-${BOOST_POSTFIX} optimized libboost_filesystem-${BOOST_POSTFIX}
+					optimized libboost_date_time-${BOOST_POSTFIX}
+					optimized libboost_filesystem-${BOOST_POSTFIX}
 					optimized libboost_regex-${BOOST_POSTFIX}
-					optimized libboost_system-${BOOST_POSTFIX} optimized libboost_thread-${BOOST_POSTFIX}
-					debug libboost_date_time-${BOOST_DEBUG_POSTFIX} debug libboost_filesystem-${BOOST_DEBUG_POSTFIX}
+					optimized libboost_system-${BOOST_POSTFIX}
+					optimized libboost_thread-${BOOST_POSTFIX}
+					debug libboost_date_time-${BOOST_DEBUG_POSTFIX}
+					debug libboost_filesystem-${BOOST_DEBUG_POSTFIX}
 					debug libboost_regex-${BOOST_DEBUG_POSTFIX}
-					debug libboost_system-${BOOST_DEBUG_POSTFIX} debug libboost_thread-${BOOST_DEBUG_POSTFIX})
+					debug libboost_system-${BOOST_DEBUG_POSTFIX}
+					debug libboost_thread-${BOOST_DEBUG_POSTFIX}
+				)
 				if(WITH_CYCLES_OSL)
 					set(BOOST_LIBRARIES ${BOOST_LIBRARIES}
 						optimized libboost_wave-${BOOST_POSTFIX}
@@ -1659,7 +1710,8 @@ elseif(WIN32)
 			#endif
 			int main(void) { return 0; }
 			" 
-			WITH_MINGW64)
+			WITH_MINGW64
+		)
 		
 		if(NOT DEFINED LIBDIR)
 			if(WITH_MINGW64)
@@ -1680,7 +1732,10 @@ elseif(WIN32)
 			message(FATAL_ERROR "Windows requires pre-compiled libs at: '${LIBDIR}'")
 		endif()

-		list(APPEND PLATFORM_LINKLIBS -lshell32 -lshfolder -lgdi32 -lmsvcrt -lwinmm -lmingw32 -lm -lws2_32 -lz -lstdc++ -lole32 -luuid -lwsock32 -lpsapi -ldbghelp)
+		list(APPEND PLATFORM_LINKLIBS
+			-lshell32 -lshfolder -lgdi32 -lmsvcrt -lwinmm -lmingw32 -lm -lws2_32
+			-lz -lstdc++ -lole32 -luuid -lwsock32 -lpsapi -ldbghelp
+		)

 		if(WITH_INPUT_IME)
 			list(APPEND PLATFORM_LINKLIBS -limm32)
@@ -1749,7 +1804,14 @@ elseif(WIN32)
 				${OPENCOLLADA}/include/opencollada/GeneratedSaxParser
 			)
 			set(OPENCOLLADA_LIBPATH ${OPENCOLLADA}/lib/opencollada)
-			set(OPENCOLLADA_LIBRARIES OpenCOLLADAStreamWriter OpenCOLLADASaxFrameworkLoader OpenCOLLADAFramework OpenCOLLADABaseUtils GeneratedSaxParser UTF MathMLSolver buffer ftoa xml)
+			set(OPENCOLLADA_LIBRARIES
+				OpenCOLLADAStreamWriter
+				OpenCOLLADASaxFrameworkLoader
+				OpenCOLLADAFramework
+				OpenCOLLADABaseUtils
+				GeneratedSaxParser
+				UTF MathMLSolver buffer ftoa xml
+			)
 			set(PCRE_LIBRARIES pcre)
 		endif()

@@ -1822,12 +1884,14 @@ elseif(WIN32)
 			if(WITH_INTERNATIONAL)
 				set(BOOST_LIBRARIES ${BOOST_LIBRARIES}
 					optimized boost_locale-${BOOST_POSTFIX}
-					debug boost_locale-${BOOST_DEBUG_POSTFIX}) 
+					debug boost_locale-${BOOST_DEBUG_POSTFIX}
+				)
 			endif()
 			if(WITH_CYCLES_OSL)
 				set(BOOST_LIBRARIES ${BOOST_LIBRARIES}
 					optimized boost_wave-${BOOST_POSTFIX}
-					debug boost_wave-${BOOST_DEBUG_POSTFIX}) 
+					debug boost_wave-${BOOST_DEBUG_POSTFIX}
+				)
 			endif()
 			set(BOOST_LIBPATH ${BOOST}/lib)
 			set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB -DBOOST_THREAD_USE_LIB ")
@@ -1908,7 +1972,7 @@ elseif(WIN32)
 		set(OPENAL ${LIBDIR}/openal)
 		set(OPENALDIR ${LIBDIR}/openal)
 		set(OPENAL_INCLUDE_DIR ${OPENAL}/include)
-		if(MSVC12)
+		if(MSVC)
 			set(OPENAL_LIBRARY openal32)
 		else()
 			set(OPENAL_LIBRARY wrap_oal)
@@ -1936,7 +2000,14 @@ elseif(WIN32)
 		find_library(OSL_LIB_EXEC_DEBUG NAMES oslexec_d PATHS ${CYCLES_OSL}/lib)
 		find_library(OSL_LIB_COMP_DEBUG NAMES oslcomp_d PATHS ${CYCLES_OSL}/lib)
 		find_library(OSL_LIB_QUERY_DEBUG NAMES oslquery_d PATHS ${CYCLES_OSL}/lib)
-		list(APPEND OSL_LIBRARIES optimized ${OSL_LIB_COMP} optimized ${OSL_LIB_EXEC} optimized ${OSL_LIB_QUERY} debug ${OSL_LIB_EXEC_DEBUG} debug ${OSL_LIB_COMP_DEBUG} debug ${OSL_LIB_QUERY_DEBUG})
+		list(APPEND OSL_LIBRARIES
+			optimized ${OSL_LIB_COMP}
+			optimized ${OSL_LIB_EXEC}
+			optimized ${OSL_LIB_QUERY}
+			debug ${OSL_LIB_EXEC_DEBUG}
+			debug ${OSL_LIB_COMP_DEBUG}
+			debug ${OSL_LIB_QUERY_DEBUG}
+		)
 		find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
 		find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
 	
@@ -1950,20 +2021,8 @@ elseif(WIN32)

 elseif(APPLE)

-	if(${CMAKE_OSX_DEPLOYMENT_TARGET} STREQUAL "10.5" OR ${CMAKE_OSX_DEPLOYMENT_TARGET} STRGREATER "10.5")
-		set(WITH_LIBS10.5 ON CACHE BOOL "Use 10.5 libs" FORCE) # valid also for 10.6/7/8/9
-	endif()
-
 	if(NOT DEFINED LIBDIR)
-		if(WITH_LIBS10.5)
-			set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-9.x.universal)
-		else()
-			if(CMAKE_OSX_ARCHITECTURES MATCHES i386)
-				set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-8.x.i386)
-			else()
-				set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-8.0.0-powerpc)
-			endif()
-		endif()
+		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-9.x.universal)
 	else()
 		message(STATUS "Using pre-compiled LIBDIR: ${LIBDIR}")
 	endif()
@@ -2021,11 +2080,15 @@ elseif(APPLE)
 			# set(PYTHON_LINKFLAGS "-u _PyMac_Error")  # won't  build with this enabled
 		else()
 			# module must be compiled against Python framework
-			set(PYTHON_INCLUDE_DIR "/Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}/include/python${PYTHON_VERSION}m")
-			set(PYTHON_EXECUTABLE "/Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}/bin/python${PYTHON_VERSION}m")
+			set(_py_framework "/Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}")
+
+			set(PYTHON_INCLUDE_DIR "${_py_framework}/include/python${PYTHON_VERSION}m")
+			set(PYTHON_EXECUTABLE "${_py_framework}/bin/python${PYTHON_VERSION}m")
+			set(PYTHON_LIBPATH "${_py_framework}/lib/python${PYTHON_VERSION}/config-${PYTHON_VERSION}m")
 			#set(PYTHON_LIBRARY python${PYTHON_VERSION})
-			set(PYTHON_LIBPATH "/Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}/lib/python${PYTHON_VERSION}/config-${PYTHON_VERSION}m")
 			#set(PYTHON_LINKFLAGS "-u _PyMac_Error -framework Python")  # won't  build with this enabled
+
+			unset(_py_framework)
 		endif()
 		
 		# uncached vars
@@ -2067,7 +2130,10 @@ elseif(APPLE)
 	if(WITH_CODEC_FFMPEG)
 		set(FFMPEG ${LIBDIR}/ffmpeg)
 		set(FFMPEG_INCLUDE_DIRS ${FFMPEG}/include)
-		set(FFMPEG_LIBRARIES avcodec avdevice avformat avutil mp3lame swscale x264 xvidcore theora theoradec theoraenc vorbis vorbisenc vorbisfile ogg)
+		set(FFMPEG_LIBRARIES
+			avcodec avdevice avformat avutil
+			mp3lame swscale x264 xvidcore theora theoradec theoraenc vorbis vorbisenc vorbisfile ogg
+		)
 		set(FFMPEG_LIBPATH ${FFMPEG}/lib)
 	endif()

@@ -2078,13 +2144,13 @@ elseif(APPLE)
 	)
 	mark_as_advanced(SYSTEMSTUBS_LIBRARY)
 	if(SYSTEMSTUBS_LIBRARY)
-		list(APPEND PLATFORM_LINKLIBS stdc++ SystemStubs)
-	else()
-		list(APPEND PLATFORM_LINKLIBS stdc++)
+		list(APPEND PLATFORM_LINKLIBS SystemStubs)
 	endif()

 	set(PLATFORM_CFLAGS "-pipe -funsigned-char")
-	set(PLATFORM_LINKFLAGS "-fexceptions -framework CoreServices -framework Foundation -framework IOKit -framework AppKit -framework Cocoa -framework Carbon -framework AudioUnit -framework AudioToolbox -framework CoreAudio")
+	set(PLATFORM_LINKFLAGS
+		"-fexceptions -framework CoreServices -framework Foundation -framework IOKit -framework AppKit -framework Cocoa -framework Carbon -framework AudioUnit -framework AudioToolbox -framework CoreAudio"
+	)
 	if(WITH_CODEC_QUICKTIME)
 		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -framework QTKit")
 		if(CMAKE_OSX_ARCHITECTURES MATCHES i386)
@@ -2093,22 +2159,10 @@ elseif(APPLE)
 		endif()
 	endif()

-	# XXX - SOME MAC DEV PLEASE TEST WITH THE SDK INSTALLED!
-	# ALSO SHOULD BE MOVED INTO OWN MODULE WHEN FUNCTIONAL
-	if(WITH_INPUT_NDOF)
-		# This thread it *should* work and check the framework - campbell
-		# http://www.cmake.org/pipermail/cmake/2005-December/007740.html
-		find_library(3DCONNEXION_CLIENT_FRAMEWORK
-			NAMES 3DconnexionClient
-		)
-		if(NOT 3DCONNEXION_CLIENT_FRAMEWORK)
-			set(WITH_INPUT_NDOF OFF)
-		endif()
-
-		if(WITH_INPUT_NDOF)
-			set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -F/Library/Frameworks -weak_framework 3DconnexionClient")
-			set(NDOF_INCLUDE_DIRS /Library/Frameworks/3DconnexionClient.framework/Headers )
-		endif()
+	if(WITH_CXX11)
+		list(APPEND PLATFORM_LINKLIBS c++)
+	else()
+		list(APPEND PLATFORM_LINKLIBS stdc++)
 	endif()

 	if(WITH_JACK)
@@ -2116,7 +2170,8 @@ elseif(APPLE)
 	endif()
 	
 	if(WITH_PYTHON_MODULE OR WITH_PYTHON_FRAMEWORK)
-		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} /Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}/Python")# force cmake to link right framework
+		# force cmake to link right framework
+		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} /Library/Frameworks/Python.framework/Versions/${PYTHON_VERSION}/Python")
 	endif()
 	
 	if(WITH_OPENCOLLADA)
@@ -2131,10 +2186,18 @@ elseif(APPLE)
 		)

 		set(OPENCOLLADA_LIBPATH ${OPENCOLLADA}/lib)
-		set(OPENCOLLADA_LIBRARIES "OpenCOLLADASaxFrameworkLoader -lOpenCOLLADAFramework -lOpenCOLLADABaseUtils -lOpenCOLLADAStreamWriter -lMathMLSolver -lGeneratedSaxParser -lxml2 -lbuffer -lftoa")
+		set(OPENCOLLADA_LIBRARIES
+			OpenCOLLADASaxFrameworkLoader
+			-lOpenCOLLADAFramework
+			-lOpenCOLLADABaseUtils
+			-lOpenCOLLADAStreamWriter
+			-lMathMLSolver
+			-lGeneratedSaxParser
+			-lxml2 -lbuffer -lftoa
+		)
 		# Use UTF functions from collada if LLVM is not enabled
 		if(NOT WITH_LLVM)
-			set(OPENCOLLADA_LIBRARIES "${OPENCOLLADA_LIBRARIES} -lUTF")
+			list(APPEND OPENCOLLADA_LIBRARIES -lUTF)
 		endif()
 		# pcre is bundled with openCollada
 		#set(PCRE ${LIBDIR}/pcre)
@@ -2169,14 +2232,17 @@ elseif(APPLE)
 		set(TIFF_LIBPATH ${TIFF}/lib)
 	endif()

-	if(WITH_INPUT_NDOF)
-		# linker needs "-weak_framework 3DconnexionClient"
-	endif()
-
 	if(WITH_BOOST)
 		set(BOOST ${LIBDIR}/boost)
 		set(BOOST_INCLUDE_DIR ${BOOST}/include)
-		set(BOOST_LIBRARIES boost_date_time-mt boost_filesystem-mt boost_regex-mt boost_system-mt boost_thread-mt boost_wave-mt)
+		set(BOOST_LIBRARIES
+			boost_date_time-mt
+			boost_filesystem-mt
+			boost_regex-mt
+			boost_system-mt
+			boost_thread-mt
+			boost_wave-mt
+		)
 		if(WITH_INTERNATIONAL)
 			list(APPEND BOOST_LIBRARIES boost_locale-mt)
 		endif()
@@ -2197,8 +2263,22 @@ elseif(APPLE)
 	if(WITH_OPENIMAGEIO)
 		set(OPENIMAGEIO ${LIBDIR}/openimageio)
 		set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
-		set(OPENIMAGEIO_LIBRARIES ${OPENIMAGEIO}/lib/libOpenImageIO.a ${PNG_LIBRARIES} ${JPEG_LIBRARIES} ${TIFF_LIBRARY} ${OPENEXR_LIBRARIES} ${ZLIB_LIBRARIES})
-		set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib ${JPEG_LIBPATH} ${PNG_LIBPATH} ${TIFF_LIBPATH} ${OPENEXR_LIBPATH} ${ZLIB_LIBPATH})
+		set(OPENIMAGEIO_LIBRARIES
+			${OPENIMAGEIO}/lib/libOpenImageIO.a
+			${PNG_LIBRARIES}
+			${JPEG_LIBRARIES}
+			${TIFF_LIBRARY}
+			${OPENEXR_LIBRARIES}
+			${ZLIB_LIBRARIES}
+		)
+		set(OPENIMAGEIO_LIBPATH
+			${OPENIMAGEIO}/lib
+			${JPEG_LIBPATH}
+			${PNG_LIBPATH}
+			${TIFF_LIBPATH}
+			${OPENEXR_LIBPATH}
+			${ZLIB_LIBPATH}
+		)
 		set(OPENIMAGEIO_DEFINITIONS "-DOIIO_STATIC_BUILD")
 		set(OPENIMAGEIO_IDIFF "${LIBDIR}/openimageio/bin/idiff")
 	endif()
@@ -2291,8 +2371,14 @@ elseif(APPLE)
 				include_directories(${LIBDIR}/openmp/include)
 				link_directories(${LIBDIR}/openmp/lib)
 				# This is a workaround for our helperbinaries ( datatoc, masgfmt, ... ),
-				# They are linked also to omp lib, so we need it in builddir for runtime exexcution, TODO: remove all unneeded dependencies from these
-				execute_process(COMMAND ditto -arch ${CMAKE_OSX_ARCHITECTURES} ${LIBDIR}/openmp/lib/libiomp5.dylib ${CMAKE_BINARY_DIR}/Resources/lib/libiomp5.dylib) # for intermediate binaries, in respect to lib ID
+				# They are linked also to omp lib, so we need it in builddir for runtime execution,
+				# TODO: remove all unneeded dependencies from these
+
+				# for intermediate binaries, in respect to lib ID
+				execute_process(
+				        COMMAND ditto -arch ${CMAKE_OSX_ARCHITECTURES}
+				        ${LIBDIR}/openmp/lib/libiomp5.dylib
+				        ${CMAKE_BINARY_DIR}/Resources/lib/libiomp5.dylib)
 			endif()
 		endif()
 	endif()
@@ -2318,7 +2404,14 @@ elseif(APPLE)
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=1024")
 	endif()
 	# Get rid of eventually clashes, we export some symbols explicite as local
-	set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -Xlinker -unexported_symbols_list -Xlinker ${CMAKE_SOURCE_DIR}/source/creator/osx_locals.map")
+	set(PLATFORM_LINKFLAGS
+		"${PLATFORM_LINKFLAGS} -Xlinker -unexported_symbols_list -Xlinker ${CMAKE_SOURCE_DIR}/source/creator/osx_locals.map"
+	)
+
+	if(WITH_CXX11)
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
+		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -stdlib=libc++")
+	endif()

 	# Suppress ranlib "has no symbols" warnings (workaround for T48250)
 	set(CMAKE_C_ARCHIVE_CREATE   "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
@@ -2336,38 +2429,50 @@ endif()

 if(WITH_CYCLES)
 	if(NOT WITH_OPENIMAGEIO)
-		message(FATAL_ERROR "Cycles requires WITH_OPENIMAGEIO, the library may not have been found. Configure OIIO or disable WITH_CYCLES")
+		message(FATAL_ERROR
+			"Cycles requires WITH_OPENIMAGEIO, the library may not have been found. "
+			"Configure OIIO or disable WITH_CYCLES"
+		)
 	endif()
 	if(NOT WITH_BOOST)
-		message(FATAL_ERROR "Cycles requires WITH_BOOST, the library may not have been found. Configure BOOST or disable WITH_CYCLES")
+		message(FATAL_ERROR
+			"Cycles requires WITH_BOOST, the library may not have been found. "
+			"Configure BOOST or disable WITH_CYCLES"
+		)
 	endif()

 	if(WITH_CYCLES_OSL)
 		if(NOT WITH_LLVM)
-			message(FATAL_ERROR "Cycles OSL requires WITH_LLVM, the library may not have been found. Configure LLVM or disable WITH_CYCLES_OSL")
+			message(FATAL_ERROR
+				"Cycles OSL requires WITH_LLVM, the library may not have been found. "
+				"Configure LLVM or disable WITH_CYCLES_OSL"
+			)
 		endif()
 	endif()
 endif()

 if(WITH_INTERNATIONAL)
 	if(NOT WITH_BOOST)
-		message(FATAL_ERROR "Internationalization requires WITH_BOOST, the library may not have been found. Configure BOOST or disable WITH_INTERNATIONAL")
+		message(FATAL_ERROR
+			"Internationalization requires WITH_BOOST, the library may not have been found. "
+			"Configure BOOST or disable WITH_INTERNATIONAL"
+		)
 	endif()
 endif()

 # See TEST_SSE_SUPPORT() for how this is defined.

-if(WITH_RAYOPTIMIZATION)
-	if(SUPPORT_SSE_BUILD)
-		set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
-		add_definitions(-D__SSE__ -D__MMX__)
-	endif()
-	if(SUPPORT_SSE2_BUILD)
-		set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
-		add_definitions(-D__SSE2__)
-		if(NOT SUPPORT_SSE_BUILD) # dont double up
-			add_definitions(-D__MMX__)
-		endif()
+# Do it globally, SSE2 has been required for quite some time now.
+# Doing it now allows SSE/SSE2 to be used in inline headers.
+if(SUPPORT_SSE_BUILD)
+	set(PLATFORM_CFLAGS " ${COMPILER_SSE_FLAG} ${PLATFORM_CFLAGS}")
+	add_definitions(-D__SSE__ -D__MMX__)
+endif()
+if(SUPPORT_SSE2_BUILD)
+	set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
+	add_definitions(-D__SSE2__)
+	if(NOT SUPPORT_SSE_BUILD) # dont double up
+		add_definitions(-D__MMX__)
 	endif()
 endif()

@@ -2437,7 +2542,10 @@ if(WITH_GL_PROFILE_COMPAT OR WITH_GL_PROFILE_CORE)
 elseif(WITH_GL_PROFILE_ES20)
 	if(WITH_SYSTEM_GLES)
 		if(NOT OPENGLES_LIBRARY)
-			message(FATAL_ERROR "Unable to find OpenGL ES libraries.  Install them or disable WITH_SYSTEM_GLES.")
+			message(FATAL_ERROR
+				"Unable to find OpenGL ES libraries. "
+				"Install them or disable WITH_SYSTEM_GLES."
+			)
 		endif()

 		list(APPEND BLENDER_GL_LIBRARIES OPENGLES_LIBRARY)
@@ -2449,7 +2557,10 @@ elseif(WITH_GL_PROFILE_ES20)
 		list(APPEND BLENDER_GL_LIBRARIES "${OPENGLES_LIBRARY}")

 		if(NOT OPENGLES_LIBRARY)
-			message(FATAL_ERROR "To compile WITH_GL_EGL you need to set OPENGLES_LIBRARY to the file path of an OpenGL ES 2.0 library.")
+			message(FATAL_ERROR
+				"To compile WITH_GL_EGL you need to set OPENGLES_LIBRARY "
+				"to the file path of an OpenGL ES 2.0 library."
+			)
 		endif()

 	endif()
@@ -2461,7 +2572,10 @@ elseif(WITH_GL_PROFILE_ES20)
 		mark_as_advanced(OPENGLES_DLL)

 		if(NOT OPENGLES_DLL)
-			message(FATAL_ERROR "To compile WITH_GL_PROFILE_ES20 you need to set OPENGLES_DLL to the file path of an OpenGL ES 2.0 runtime dynamic link library (DLL).")
+			message(FATAL_ERROR
+				"To compile WITH_GL_PROFILE_ES20 you need to set OPENGLES_DLL to the file "
+				"path of an OpenGL ES 2.0 runtime dynamic link library (DLL)."
+			)
 		endif()

 		if(WITH_GL_ANGLE)
@@ -2475,7 +2589,10 @@ elseif(WITH_GL_PROFILE_ES20)
 			mark_as_advanced(D3DCOMPILER_DLL)

 			if(D3DCOMPILER_DLL STREQUAL "")
-				message(FATAL_ERROR "To compile WITH_GL_ANGLE you need to set D3DCOMPILER_DLL to the file path of a copy of the DirectX redistributable DLL file: D3DCompiler_46.dll")
+				message(FATAL_ERROR
+					"To compile WITH_GL_ANGLE you need to set D3DCOMPILER_DLL to the file "
+					"path of a copy of the DirectX redistributable DLL file: D3DCompiler_46.dll"
+				)
 			endif()

 		endif()
@@ -2489,7 +2606,10 @@ if(WITH_GL_EGL)

 	if(WITH_SYSTEM_GLES)
 		if(NOT OPENGLES_EGL_LIBRARY)
-			message(FATAL_ERROR "Unable to find OpenGL ES libraries.  Install them or disable WITH_SYSTEM_GLES.")
+			message(FATAL_ERROR
+				"Unable to find OpenGL ES libraries. "
+				"Install them or disable WITH_SYSTEM_GLES."
+			)
 		endif()

 		list(APPEND BLENDER_GL_LIBRARIES OPENGLES_EGL_LIBRARY)
@@ -2501,7 +2621,10 @@ if(WITH_GL_EGL)
 		list(APPEND BLENDER_GL_LIBRARIES "${OPENGLES_LIBRARY}" "${OPENGLES_EGL_LIBRARY}")

 		if(NOT OPENGLES_EGL_LIBRARY)
-			message(FATAL_ERROR "To compile WITH_GL_EGL you need to set OPENGLES_EGL_LIBRARY to the file path of an EGL library.")
+			message(FATAL_ERROR
+				"To compile WITH_GL_EGL you need to set OPENGLES_EGL_LIBRARY "
+				"to the file path of an EGL library."
+			)
 		endif()

 	endif()
@@ -2513,7 +2636,10 @@ if(WITH_GL_EGL)
 		mark_as_advanced(OPENGLES_EGL_DLL)

 		if(NOT OPENGLES_EGL_DLL)
-			message(FATAL_ERROR "To compile WITH_GL_EGL you need to set OPENGLES_EGL_DLL to the file path of an EGL runtime dynamic link library (DLL).")
+			message(FATAL_ERROR
+				"To compile WITH_GL_EGL you need to set OPENGLES_EGL_DLL "
+				"to the file path of an EGL runtime dynamic link library (DLL)."
+			)
 		endif()

 	endif()
@@ -2678,7 +2804,9 @@ endif()
 if(WITH_LIBMV)
 	set(CERES_DEFINES)

-	if(SHARED_PTR_FOUND)
+	if(WITH_CXX11)
+		# nothing to be done
+	elseif(SHARED_PTR_FOUND)
 		if(SHARED_PTR_TR1_MEMORY_HEADER)
 			list(APPEND CERES_DEFINES -DCERES_TR1_MEMORY_HEADER)
 		endif()
@@ -2689,7 +2817,9 @@ if(WITH_LIBMV)
 		message(FATAL_ERROR "Ceres: Unable to find shared_ptr.")
 	endif()

-	if(HAVE_STD_UNORDERED_MAP_HEADER)
+	if(WITH_CXX11)
+		list(APPEND CERES_DEFINES -DCERES_STD_UNORDERED_MAP)
+	elseif(HAVE_STD_UNORDERED_MAP_HEADER)
 		if(HAVE_UNORDERED_MAP_IN_STD_NAMESPACE)
 			list(APPEND CERES_DEFINES -DCERES_STD_UNORDERED_MAP)
 		else()
@@ -2891,10 +3021,12 @@ endif()
 # be most problematic.
 if(WITH_PYTHON)
 	if(NOT EXISTS "${PYTHON_INCLUDE_DIR}/Python.h")
-		message(FATAL_ERROR "Missing: \"${PYTHON_INCLUDE_DIR}/Python.h\",\n"
-							"Set the cache entry 'PYTHON_INCLUDE_DIR' to point "
-							"to a valid python include path. Containing "
-							"Python.h for python version \"${PYTHON_VERSION}\"")
+		message(FATAL_ERROR
+			"Missing: \"${PYTHON_INCLUDE_DIR}/Python.h\",\n"
+			"Set the cache entry 'PYTHON_INCLUDE_DIR' to point "
+			"to a valid python include path. Containing "
+			"Python.h for python version \"${PYTHON_VERSION}\""
+		)
 	endif()

 	if(WIN32 OR APPLE)
@@ -2912,12 +3044,22 @@ endif()

 if(WITH_CXX11)
 	if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+		# TODO(sergey): Do we want c++11 or gnu-c++11 here?
 		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 	elseif(MSVC12)
 		# Nothing special is needed, C++11 features are available by default.
 	else()
 		message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER_ID} is not supported for C++11 build yet")
 	endif()
+else()
+	# GCC-6 switched to C++11 by default, which would break linking with existing
+	# libraries. So we explicitly disable C++11 for newer GCC so that no linking issues happen.
+	if(CMAKE_COMPILER_IS_GNUCC AND (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "6.0"))
+		set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++98")
+		# We also disable any use of the C++11 ABI, so we don't even try to
+		# link against symbols from the std::__cxx11 namespace.
+		add_definitions("-D_GLIBCXX_USE_CXX11_ABI=0")
+	endif()
 endif()

 # Visual Studio has all standards it supports available by default
@@ -3032,7 +3174,7 @@ if(FIRST_RUN)
 		string(LENGTH "${_msg}" _len)
 		while("32" GREATER "${_len}")
 			set(_msg "${_msg} ")
-			 math(EXPR _len "${_len} + 1")
+			math(EXPR _len "${_len} + 1")
 		endwhile()

 		set(_config_msg "${_config_msg}\n${_msg}${${_setting}}" PARENT_SCOPE)
@@ -3070,6 +3212,7 @@ if(FIRST_RUN)

 	info_cfg_text("System Options:")
 	info_cfg_option(WITH_INSTALL_PORTABLE)
+	info_cfg_option(WITH_X11_ALPHA)
 	info_cfg_option(WITH_X11_XF86VMODE)
 	info_cfg_option(WITH_X11_XINPUT)
 	info_cfg_option(WITH_MEM_JEMALLOC)
--- a/74
+++ b/74
@@ -120,7 +120,7 @@ endif

 # -----------------------------------------------------------------------------
 # Build Blender
-all: FORCE
+all: .FORCE
 	@echo
 	@echo Configuring Blender in \"$(BUILD_DIR)\" ...

@@ -149,13 +149,13 @@ bpy: all

 # -----------------------------------------------------------------------------
 # Configuration (save some cd'ing around)
-config: FORCE
+config: .FORCE
 	$(CMAKE_CONFIG_TOOL) "$(BUILD_DIR)"


 # -----------------------------------------------------------------------------
 # Help for build targets
-help: FORCE
+help: .FORCE
 	@echo ""
 	@echo "Convenience targets provided for building blender, (multiple at once can be used)"
 	@echo "  * debug     - build a debug binary"
@@ -228,13 +228,13 @@ help: FORCE
 # -----------------------------------------------------------------------------
 # Packages
 #
-package_debian: FORCE
+package_debian: .FORCE
 	cd build_files/package_spec ; DEB_BUILD_OPTIONS="parallel=$(NPROCS)" sh ./build_debian.sh

-package_pacman: FORCE
+package_pacman: .FORCE
 	cd build_files/package_spec/pacman ; MAKEFLAGS="-j$(NPROCS)" makepkg

-package_archive: FORCE
+package_archive: .FORCE
 	make -C "$(BUILD_DIR)" -s package_archive
 	@echo archive in "$(BUILD_DIR)/release"

@@ -242,24 +242,24 @@ package_archive: FORCE
 # -----------------------------------------------------------------------------
 # Tests
 #
-test: FORCE
+test: .FORCE
 	cd $(BUILD_DIR) ; ctest . --output-on-failure

 # run pep8 check check on scripts we distribute.
-test_pep8: FORCE
+test_pep8: .FORCE
 	$(PYTHON) tests/python/pep8.py > test_pep8.log 2>&1
 	@echo "written: test_pep8.log"

 # run some checks on our cmakefiles.
-test_cmake: FORCE
+test_cmake: .FORCE
 	$(PYTHON) build_files/cmake/cmake_consistency_check.py > test_cmake_consistency.log 2>&1
 	@echo "written: test_cmake_consistency.log"

 # run deprecation tests, see if we have anything to remove.
-test_deprecated: FORCE
+test_deprecated: .FORCE
 	$(PYTHON) tests/check_deprecated.py

-test_style_c: FORCE
+test_style_c: .FORCE
 	# run our own checks on C/C++ style
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_style_c.py" \
@@ -267,7 +267,7 @@ test_style_c: FORCE
 	    "$(BLENDER_DIR)/source/creator" \
 	    --no-length-check

-test_style_c_qtc: FORCE
+test_style_c_qtc: .FORCE
 	# run our own checks on C/C++ style
 	USE_QTC_TASK=1 \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
@@ -280,7 +280,7 @@ test_style_c_qtc: FORCE
 	@echo "written: test_style.tasks"


-test_style_osl: FORCE
+test_style_osl: .FORCE
 	# run our own checks on C/C++ style
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_style_c.py" \
@@ -288,7 +288,7 @@ test_style_osl: FORCE
 	    "$(BLENDER_DIR)/release/scripts/templates_osl"


-test_style_osl_qtc: FORCE
+test_style_osl_qtc: .FORCE
 	# run our own checks on C/C++ style
 	USE_QTC_TASK=1 \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
@@ -303,13 +303,13 @@ test_style_osl_qtc: FORCE
 # Project Files
 #

-project_qtcreator: FORCE
+project_qtcreator: .FORCE
 	$(PYTHON) build_files/cmake/cmake_qtcreator_project.py "$(BUILD_DIR)"

-project_netbeans: FORCE
+project_netbeans: .FORCE
 	$(PYTHON) build_files/cmake/cmake_netbeans_project.py "$(BUILD_DIR)"

-project_eclipse: FORCE
+project_eclipse: .FORCE
 	cmake -G"Eclipse CDT4 - Unix Makefiles" -H"$(BLENDER_DIR)" -B"$(BUILD_DIR)"


@@ -317,40 +317,40 @@ project_eclipse: FORCE
 # Static Checking
 #

-check_cppcheck: FORCE
+check_cppcheck: .FORCE
 	$(CMAKE_CONFIG)
 	cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_cppcheck.py" 2> \
 	    "$(BLENDER_DIR)/check_cppcheck.txt"
 	@echo "written: check_cppcheck.txt"

-check_clang_array: FORCE
+check_clang_array: .FORCE
 	$(CMAKE_CONFIG)
 	cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_clang_array.py"

-check_splint: FORCE
+check_splint: .FORCE
 	$(CMAKE_CONFIG)
 	cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_splint.py"

-check_sparse: FORCE
+check_sparse: .FORCE
 	$(CMAKE_CONFIG)
 	cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_sparse.py"

-check_smatch: FORCE
+check_smatch: .FORCE
 	$(CMAKE_CONFIG)
 	cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_smatch.py"

-check_spelling_py: FORCE
+check_spelling_py: .FORCE
 	cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
 	    "$(BLENDER_DIR)/release/scripts"

-check_spelling_c: FORCE
+check_spelling_c: .FORCE
 	cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
@@ -359,7 +359,7 @@ check_spelling_c: FORCE
 	    "$(BLENDER_DIR)/intern/guardedalloc" \
 	    "$(BLENDER_DIR)/intern/ghost" \

-check_spelling_c_qtc: FORCE
+check_spelling_c_qtc: .FORCE
 	cd "$(BUILD_DIR)" ; USE_QTC_TASK=1 \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
@@ -370,13 +370,13 @@ check_spelling_c_qtc: FORCE
 	    > \
 	    "$(BLENDER_DIR)/check_spelling_c.tasks"

-check_spelling_osl: FORCE
+check_spelling_osl: .FORCE
 	cd "$(BUILD_DIR)" ;\
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
 	    "$(BLENDER_DIR)/intern/cycles/kernel/shaders"

-check_descriptions: FORCE
+check_descriptions: .FORCE
 	"$(BUILD_DIR)/bin/blender" --background -noaudio --factory-startup --python \
 	    "$(BLENDER_DIR)/source/tools/check_source/check_descriptions.py"

@@ -384,14 +384,14 @@ check_descriptions: FORCE
 # Utilities
 #

-tgz: FORCE
+tgz: .FORCE
 	./build_files/utils/build_tgz.sh

-icons: FORCE
+icons: .FORCE
 	"$(BLENDER_DIR)/release/datafiles/blender_icons_update.py"
 	"$(BLENDER_DIR)/release/datafiles/prvicons_update.py"

-update: FORCE
+update: .FORCE
 	if [ -d "../lib" ]; then \
 		svn update ../lib/* ; \
 	fi
@@ -404,23 +404,23 @@ update: FORCE
 #

 # Simple version of ./doc/python_api/sphinx_doc_gen.sh with no PDF generation.
-doc_py: FORCE
+doc_py: .FORCE
 	"$(BUILD_DIR)/bin/blender" --background -noaudio --factory-startup --python doc/python_api/sphinx_doc_gen.py
 	cd doc/python_api ; sphinx-build -b html sphinx-in sphinx-out
 	@echo "docs written into: '$(BLENDER_DIR)/doc/python_api/sphinx-out/contents.html'"

-doc_doxy: FORCE
+doc_doxy: .FORCE
 	cd doc/doxygen; doxygen Doxyfile
 	@echo "docs written into: '$(BLENDER_DIR)/doc/doxygen/html/index.html'"

-doc_dna: FORCE
+doc_dna: .FORCE
 	"$(BUILD_DIR)/bin/blender" --background -noaudio --factory-startup --python doc/blender_file_format/BlendFileDnaExporter_25.py
 	@echo "docs written into: '$(BLENDER_DIR)/doc/blender_file_format/dna.html'"

-doc_man: FORCE
+doc_man: .FORCE
 	$(PYTHON) doc/manpage/blender.1.py "$(BUILD_DIR)/bin/blender"

-help_features: FORCE
+help_features: .FORCE
 	@$(PYTHON) -c \
 		"import re; \
 		print('\n'.join([ \
@@ -431,9 +431,9 @@ help_features: FORCE
 		if w.startswith('WITH_')]))" | uniq


-clean: FORCE
+clean: .FORCE
 	$(MAKE) -C "$(BUILD_DIR)" clean

 .PHONY: all

-FORCE:
+.FORCE:
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -372,6 +372,9 @@ MP3LAME_DEV=""
 OPENJPEG_USE=false
 OPENJPEG_DEV=""

+# Whether to use system GLEW or not (OpenSubDiv needs recent glew to work).
+NO_SYSTEM_GLEW=false
+
 # Switch to english language, else some things (like check_package_DEB()) won't work!
 LANG_BACK=$LANG
 LANG=""
@@ -1108,6 +1111,8 @@ compile_Boost() {
    OIIO_FORCE_REBUILD=true
    OSL_FORCE_BUILD=true
    OSL_FORCE_REBUILD=true
+    OPENVDB_FORCE_BUILD=true
+    OPENVDB_FORCE_REBUILD=true

    prepare_opt

@@ -1115,7 +1120,7 @@ compile_Boost() {
      INFO "Downloading Boost-$BOOST_VERSION"
      mkdir -p $SRC
      download BOOST_SOURCE[@] $_src.tar.bz2
-      tar -C $SRC --transform "s,(.*/?)boost_1_[^/]+(.*),\1boost-$BOOST_VERSION\2,x" -xf $_src.tar.bz2
+      tar -C $SRC --transform "s,\w*,boost-$BOOST_VERSION,x" -xf $_src.tar.bz2
    fi

    cd $_src
@@ -1339,7 +1344,7 @@ clean_OPENEXR() {

 compile_OPENEXR() {
  # To be changed each time we make edits that would modify the compiled result!
-  openexr_magic=13
+  openexr_magic=14

  # Clean install if needed!
  magic_compile_check openexr-$OPENEXR_VERSION $openexr_magic
@@ -1418,7 +1423,7 @@ compile_OPENEXR() {
    if [ -d $_inst ]; then
      _create_inst_shortcut
      # Copy ilmbase files here (blender expects same dir for ilmbase and openexr :/).
-      cp -Lrn $_ilmbase_inst/* $_inst_shortcut
+      cp -an $_ilmbase_inst/* $_inst_shortcut
    else
      ERROR "OpenEXR-$OPENEXR_VERSION failed to compile, exiting"
      exit 1
@@ -1959,7 +1964,7 @@ compile_BLOSC() {
    INFO "Done compiling Blosc-$OPENVDB_BLOSC_VERSION!"
  else
    INFO "Own Blosc-$OPENVDB_BLOSC_VERSION is up to date, nothing to do!"
-    INFO "If you want to force rebuild of this lib (and openexr), use the --force-openvdb option."
+    INFO "If you want to force rebuild of this lib (and openvdb), use the --force-openvdb option."
  fi

  magic_compile_set blosc-$OPENVDB_BLOSC_VERSION $blosc_magic
@@ -1985,7 +1990,7 @@ compile_OPENVDB() {
  PRINT ""

  # To be changed each time we make edits that would modify the compiled result!
-  openvdb_magic=0
+  openvdb_magic=1
  _init_openvdb

  # Clean install if needed!
@@ -2004,8 +2009,6 @@ compile_OPENVDB() {
      download OPENVDB_SOURCE[@] "$_src.tar.gz"

      INFO "Unpacking OpenVDB-$OPENVDB_VERSION"
-      #~ tar -C $SRC --transform "s,(.*/?)OpenShadingLanguage-[^/]*(.*),\1OpenShadingLanguage-$OPENVDB_VERSION\2,x" \
-          #~ -xf $_src.tar.gz
      tar -C $SRC -xf $_src.tar.gz
    fi

@@ -2020,48 +2023,40 @@ compile_OPENVDB() {
      #~ git reset --hard
    #~ fi

-    cd openvdb  # Grrrrrr...
-
-    # Always refresh the whole build!
-    if [ -d build ]; then
-      rm -rf build
-    fi
-    mkdir build
-    cd build
+    # Source builds here
+    cd openvdb

    make_d="DESTDIR=$_inst"
+    make_d="$make_d HDSO=/usr"

    if [ -d $INST/boost ]; then
-      make_d="$make_d -D BOOST_ROOT=$INST/boost -D Boost_NO_SYSTEM_PATHS=ON"
+      make_d="$make_d BOOST_INCL_DIR=$INST/boost/include BOOST_LIB_DIR=$INST/boost/lib"
    fi

-    #~ if [ "$_with_built_openexr" = true ]; then
-      #~ cmake_d="$cmake_d -D ILMBASE_HOME=$INST/openexr"
-      #~ cmake_d="$cmake_d -D OPENEXR_HOME=$INST/openexr"
-      #~ INFO "ILMBASE_HOME=$INST/openexr"
-    #~ fi
+    if [ "$_with_built_openexr" = true ]; then
+      make_d="$make_d ILMBASE_INCL_DIR=$INST/openexr/include ILMBASE_LIB_DIR=$INST/openexr/lib"
+      make_d="$make_d EXR_INCL_DIR=$INST/openexr/include EXR_LIB_DIR=$INST/openexr/lib"
+      INFO "ILMBASE_HOME=$INST/openexr"
+    fi

-    #~ cmake_d="-D CMAKE_BUILD_TYPE=Release"
-    #~ cmake_d="$cmake_d -D CMAKE_INSTALL_PREFIX=$_inst"
-    #~ # ptex is only needed when nicholas bishop is ready
-    #~ cmake_d="$cmake_d -D NO_PTEX=1"
-    #~ cmake_d="$cmake_d -D NO_CLEW=1"
-    #~ # maya plugin, docs, tutorials, regression tests and examples are not needed
-    #~ cmake_d="$cmake_d -D NO_MAYA=1 -D NO_DOC=1 -D NO_TUTORIALS=1 -D NO_REGRESSION=1 -DNO_EXAMPLES=1"
+    if [ -d $INST/blosc ]; then
+      make_d="$make_d BLOSC_INCL_DIR=$INST/blosc/include BLOSC_LIB_DIR=$INST/blosc/lib"
+    fi

-    #~ cmake $cmake_d ..
+    # Build without log4cplus, glfw, python module & docs
+    make_d="$make_d LOG4CPLUS_INCL_DIR= GLFW_INCL_DIR= PYTHON_VERSION= DOXYGEN="

-    #~ make -j$THREADS && make install
-    #~ make clean
+    make -j$THREADS lib $make_d install
+    make clean

-    #~ if [ -d $_inst ]; then
-      #~ _create_inst_shortcut
-    #~ else
-      #~ ERROR "OpenSubdiv-$OSD_VERSION failed to compile, exiting"
-      #~ exit 1
-    #~ fi
+    if [ -d $_inst ]; then
+      _create_inst_shortcut
+    else
+      ERROR "OpenVDB-$OPENVDB_VERSION failed to compile, exiting"
+      exit 1
+    fi

-    #~ magic_compile_set osd-$OSD_VERSION $osd_magic
+    magic_compile_set openvdb-$OPENVDB_VERSION $openvdb_magic

    cd $CWD
    INFO "Done compiling OpenVDB-$OPENVDB_VERSION!"
@@ -2167,7 +2162,7 @@ clean_FFmpeg() {

 compile_FFmpeg() {
  # To be changed each time we make edits that would modify the compiled result!
-  ffmpeg_magic=4
+  ffmpeg_magic=5
  _init_ffmpeg

  # Clean install if needed!
@@ -2234,7 +2229,7 @@ compile_FFmpeg() {
        --disable-postproc --disable-librtmp --disable-libopencore-amrnb \
        --disable-libopencore-amrwb --disable-libdc1394 --disable-version3 --disable-outdev=sdl \
        --disable-libxcb \
-        --disable-outdev=xv \
+        --disable-outdev=xv --disable-indev=sndio --disable-outdev=sndio \
        --disable-outdev=alsa --disable-indev=sdl --disable-indev=alsa --disable-indev=jack \
        --disable-indev=lavfi $extra

@@ -2261,7 +2256,7 @@ compile_FFmpeg() {

 #### Install on DEB-like ####
 get_package_version_DEB() {
-    dpkg-query -W -f '${Version}' $1 | sed -r 's/.*:\s*([0-9]+:)(([0-9]+\.?)+).*/\2/'
+    dpkg-query -W -f '${Version}' $1 | sed -r 's/([0-9]+:)?(([0-9]+\.?)+([0-9]+)).*/\2/'
 }

 check_package_DEB() {
@@ -2321,7 +2316,7 @@ install_packages_DEB() {
  if [ ! $SUDO ]; then
    WARNING "--no-sudo enabled, impossible to run apt-get install for $@, you'll have to do it yourself..."
  else
-    $SUDO apt-get install -y --force-yes $@
+    $SUDO apt-get install -y $@
    if [ $? -ge 1 ]; then
      ERROR "apt-get failed to install requested packages, exiting."
      exit 1
@@ -2341,30 +2336,6 @@ install_DEB() {
    [ "$(echo ${REPLY:=Y} | tr [:upper:] [:lower:])" != "y" ] && exit
  fi

-  if [ ! -z "`cat /etc/debian_version | grep ^6`"  ]; then
-    if [ -z "`cat /etc/apt/sources.list | grep backports.debian.org`"  ]; then
-      WARNING "Looks like you're using Debian Squeeze which does have broken CMake"
-      PRINT "It is highly recommended to install cmake from backports, otherwise"
-      PRINT "compilation of some libraries could fail"
-      PRINT ""
-      PRINT "You could install newer CMake from debian-backports repository"
-      PRINT "Add this this line to your /etc/apt/sources.lixt:"
-      PRINT ""
-      PRINT "deb http://backports.debian.org/debian-backports squeeze-backports main"
-      PRINT ""
-      PRINT "and then run:"
-      PRINT ""
-      PRINT "sudo apt-get update && sudo apt-get install cmake=2.8.7-4~bpo60+1 sudo apt-get install cmake=2.8.7-4~bpo60+1"
-      PRINT ""
-      PRINT "(you could also add this reporisotry using GUI like synaptic)"
-      PRINT ""
-      PRINT "Hit Enter to continue running the script, or hit Ctrl-C to abort the script"
-
-      read
-      PRINT ""
-    fi
-  fi
-
  if [ ! $SUDO ]; then
    WARNING "--no-sudo enabled, impossible to run apt-get update, you'll have to do it yourself..."
  else
@@ -2377,35 +2348,23 @@ install_DEB() {
  OGG_DEV="libogg-dev"
  THEORA_DEV="libtheora-dev"

-  _packages="gawk cmake cmake-curses-gui build-essential libjpeg-dev libpng-dev \
-             libfreetype6-dev libx11-dev \
+  _packages="gawk cmake cmake-curses-gui build-essential libjpeg-dev libpng-dev libtiff-dev \
+             git libfreetype6-dev libx11-dev flex bison libtbb-dev libxxf86vm-dev \
             libxcursor-dev libxi-dev wget libsqlite3-dev libxrandr-dev libxinerama-dev \
             libbz2-dev libncurses5-dev libssl-dev liblzma-dev libreadline-dev $OPENJPEG_DEV \
             libopenal-dev libglew-dev libglewmx-dev yasm $THEORA_DEV $VORBIS_DEV $OGG_DEV \
-             libsdl1.2-dev libfftw3-dev patch bzip2 libxml2-dev libtinyxml-dev"
+             libsdl1.2-dev libfftw3-dev patch bzip2 libxml2-dev libtinyxml-dev libjemalloc-dev"

  OPENJPEG_USE=true
  VORBIS_USE=true
  OGG_USE=true
  THEORA_USE=true

-  PRINT "$LIBYAML_CPP_VER"
+  PRINT ""
  # Some not-so-old distro (ubuntu 12.4) do not have it, do not fail in this case, just warn.
  YAMLCPP_DEV="libyaml-cpp-dev"
  check_package_DEB $YAMLCPP_DEV
  if [ $? -eq 0 ]; then
-    # Another Ubuntu hack - in 14.4, ocio uses (old) 0.3, while default is now 0.5... grrrrr.
-    if $LIBYAML_CPP_VER_DEFINED; then
-      YAMLCPP_VER_DEV="libyaml-cpp$LIBYAML_CPP_VER-dev"
-      check_package_DEB $YAMLCPP_VER_DEV
-      if [ $? -eq 0 ]; then
-        YAMLCPP_DEV=$YAMLCPP_VER_DEV
-      else
-        PRINT ""
-        WARNING "libyaml-cpp$LIBYAML_CPP_VER-dev not found!"
-        PRINT ""
-      fi
-    fi
    _packages="$_packages $YAMLCPP_DEV"
  else
    PRINT ""
@@ -2413,37 +2372,6 @@ install_DEB() {
    PRINT ""
  fi

-  # Install newest libtiff-dev in debian/ubuntu.
-  TIFF="libtiff"
-  check_package_DEB $TIFF
-  if [ $? -eq 0 ]; then
-    _packages="$_packages $TIFF-dev"
-  else
-    TIFF="libtiff5"
-    check_package_DEB $TIFF
-    if [ $? -eq 0 ]; then
-      _packages="$_packages $TIFF-dev"
-    else
-      TIFF="libtiff4"  # Some old distro, like e.g. ubuntu 10.04 :/
-      check_package_DEB $TIFF
-      if [ $? -eq 0 ]; then
-        _packages="$_packages $TIFF-dev"
-      fi
-    fi
-  fi
-
-  GIT="git"
-  check_package_DEB $GIT
-  if [ $? -eq 0 ]; then
-    _packages="$_packages $GIT"
-  else
-    GIT="git-core"  # Some old distro, like e.g. ubuntu 10.04 :/
-    check_package_DEB $GIT
-    if [ $? -eq 0 ]; then
-      _packages="$_packages $GIT"
-    fi
-  fi
-
  if [ "$WITH_ALL" = true ]; then
    _packages="$_packages libspnav-dev"
    # Only install jack if jack2 is not already installed!
@@ -2477,20 +2405,11 @@ install_DEB() {

  if [ "$WITH_ALL" = true ]; then
    PRINT ""
-    # Grmpf, debian is libxvidcore-dev and ubuntu libxvidcore4-dev!
-    # Note: not since ubuntu 10.04
    XVID_DEV="libxvidcore-dev"
    check_package_DEB $XVID_DEV
    if [ $? -eq 0 ]; then
      install_packages_DEB $XVID_DEV
      XVID_USE=true
-    else
-      XVID_DEV="libxvidcore4-dev"
-      check_package_DEB $XVID_DEV
-      if [ $? -eq 0 ]; then
-        install_packages_DEB $XVID_DEV
-        XVID_USE=true
-      fi
    fi

    PRINT ""
@@ -2510,6 +2429,44 @@ install_DEB() {
    fi
  fi

+  # Check cmake/glew versions and disable features for older distros.
+  # This is so Blender can at least compile.
+  PRINT ""
+  _cmake=`get_package_version_DEB cmake`
+  version_ge $_cmake "2.8.10"
+  if [ $? -eq 1 ]; then
+    version_ge $_cmake "2.8.8"
+    if [ $? -eq 1 ]; then
+      WARNING "OpenVDB and OpenCOLLADA disabled because cmake-$_cmake is not enough"
+      OPENVDB_SKIP=true
+      OPENCOLLADA_SKIP=true
+    else
+      WARNING "OpenVDB disabled because cmake-$_cmake is not enough"
+      OPENVDB_SKIP=true
+    fi
+  fi
+
+  PRINT ""
+  _glew=`get_package_version_DEB libglew-dev`
+  if [ -z "$_glew" ]; then
+    # Virtual package in Ubuntu 12.04 doesn't show a version number, fall back to apt-cache...
+    _glew=`apt-cache showpkg libglew-dev|tail -n1|awk '{print $2}'|sed 's/-.*//'`
+  fi
+  version_ge $_glew "1.9.0"
+  if [ $? -eq 1 ]; then
+    version_ge $_glew "1.7.0"
+    if [ $? -eq 1 ]; then
+      WARNING "OpenSubdiv disabled because GLEW-$_glew is not enough"
+      WARNING "Blender will not use system GLEW library"
+      OSD_SKIP=true
+      NO_SYSTEM_GLEW=true
+    else
+      WARNING "OpenSubdiv will compile with GLEW-$_glew but with limited capability"
+      WARNING "Blender will not use system GLEW library"
+      NO_SYSTEM_GLEW=true
+    fi
+  fi
+

  PRINT ""
  _do_compile_python=false
@@ -2564,15 +2521,8 @@ install_DEB() {

      boost_version=$(echo `get_package_version_DEB libboost-dev` | sed -r 's/^([0-9]+\.[0-9]+).*/\1/')

-      check_package_DEB libboost-locale$boost_version-dev
-      if [ $? -eq 0 ]; then
-        install_packages_DEB libboost-locale$boost_version-dev libboost-filesystem$boost_version-dev \
-                             libboost-regex$boost_version-dev libboost-system$boost_version-dev \
-                             libboost-thread$boost_version-dev libboost-wave$boost_version-dev
-        clean_Boost
-      else
-        compile_Boost
-      fi
+      install_packages_DEB libboost-{filesystem,iostreams,locale,regex,system,thread,wave}$boost_version-dev
+      clean_Boost
    else
      compile_Boost
    fi
@@ -2623,13 +2573,14 @@ install_DEB() {
    INFO "Forced OpenImageIO building, as requested..."
    compile_OIIO
  else
-    check_package_version_ge_lt_DEB libopenimageio-dev $OIIO_VERSION_MIN $OIIO_VERSION_MAX
-    if [ $? -eq 0 -a "$_with_built_openexr" = false ]; then
-      install_packages_DEB libopenimageio-dev
-      clean_OIIO
-    else
+    # XXX Debian Testing / Ubuntu 16.04 pulls in WAY too many deps (gtk2/opencv ?!) incl. OCIO build against libyaml-cpp0.3 so build for now...
+    #check_package_version_ge_lt_DEB libopenimageio-dev $OIIO_VERSION_MIN $OIIO_VERSION_MAX
+    #if [ $? -eq 0 -a "$_with_built_openexr" = false ]; then
+    #  install_packages_DEB libopenimageio-dev
+    #  clean_OIIO
+    #else
      compile_OIIO
-    fi
+    #fi
  fi


@@ -2638,6 +2589,7 @@ install_DEB() {
  _do_compile_llvm=false
  if [ "$LLVM_SKIP" = true ]; then
    WARNING "Skipping LLVM installation, as requested (this also implies skipping OSL!)..."
+    OSL_SKIP=true
  elif [ "$LLVM_FORCE_BUILD" = true ]; then
    INFO "Forced LLVM building, as requested..."
    _do_compile_llvm=true
@@ -2649,15 +2601,7 @@ install_DEB() {
      LLVM_VERSION_FOUND=$LLVM_VERSION
      clean_LLVM
    else
-      #~ check_package_version_ge_DEB llvm-dev $LLVM_VERSION_MIN
-      #~ if [ $? -eq 0 ]; then
-        #~ install_packages_DEB llvm-dev clang
-        #~ have_llvm=true
-        #~ LLVM_VERSION_FOUND=""  # Using default one, no need to specify it!
-        #~ clean_LLVM
-      #~ else
      _do_compile_llvm=true
-      #~ fi
    fi
  fi

@@ -2680,13 +2624,12 @@ install_DEB() {
    INFO "Forced OpenShadingLanguage building, as requested..."
    _do_compile_osl=true
  else
-      # No package currently!
+    # No package currently!
    _do_compile_osl=true
  fi

  if [ "$_do_compile_osl" = true ]; then
    if [ "$have_llvm" = true ]; then
-      install_packages_DEB flex bison libtbb-dev
      PRINT ""
      compile_OSL
    else
@@ -2696,23 +2639,33 @@ install_DEB() {


  PRINT ""
-  _do_compile_osd=false
  if [ "$OSD_SKIP" = true ]; then
    WARNING "Skipping OpenSubdiv installation, as requested..."
  elif [ "$OSD_FORCE_BUILD" = true ]; then
    INFO "Forced OpenSubdiv building, as requested..."
-    _do_compile_osd=true
+    compile_OSD
  else
-      # No package currently!
-    _do_compile_osd=true
-  fi
-
-  if [ "$_do_compile_osd" = true ]; then
-    install_packages_DEB flex bison libtbb-dev
+    # No package currently!
    PRINT ""
    compile_OSD
  fi

+  PRINT ""
+  if [ "$OPENVDB_SKIP" = true ]; then
+    WARNING "Skipping OpenVDB installation, as requested..."
+  elif [ "$OPENVDB_FORCE_BUILD" = true ]; then
+    INFO "Forced OpenVDB building, as requested..."
+    compile_OPENVDB
+  else
+    check_package_version_ge_DEB libopenvdb-dev $OPENVDB_VERSION_MIN
+    if [ $? -eq 0 ]; then
+      install_packages_DEB libopenvdb-dev libblosc-dev
+      clean_OPENVDB
+    else
+      compile_OPENVDB
+    fi
+  fi
+

  if [ "$WITH_OPENCOLLADA" = true ]; then
    _do_compile_collada=false
@@ -2723,7 +2676,7 @@ install_DEB() {
      INFO "Forced OpenCollada building, as requested..."
      _do_compile_collada=true
    else
-        # No package currently!
+      # No package currently!
      _do_compile_collada=true
    fi

@@ -2745,23 +2698,19 @@ install_DEB() {
    INFO "Forced FFMpeg building, as requested..."
    compile_FFmpeg
  else
-#    XXX Debian features libav packages as ffmpeg, those are not really compatible with blender code currently :/
-#        So for now, always build our own ffmpeg.
-#    check_package_DEB ffmpeg
-#    if [ $? -eq 0 ]; then
-#      install_packages_DEB ffmpeg
-#      ffmpeg_version=`get_package_version_DEB ffmpeg`
-#      PRINT "ffmpeg version: $ffmpeg_version"
-#      if [ ! -z "$ffmpeg_version" ]; then
-#        if  dpkg --compare-versions $ffmpeg_version gt 0.7.2; then
-#          install_packages_DEB libavfilter-dev libavcodec-dev libavdevice-dev libavformat-dev libavutil-dev libswscale-dev
-#          clean_FFmpeg
-#        else
-#          compile_FFmpeg
-#        fi
-#      fi
-#    fi
-    compile_FFmpeg
+    # XXX Debian Testing / Ubuntu 16.04 finally includes FFmpeg, so check as usual
+    check_package_DEB ffmpeg
+    if [ $? -eq 0 ]; then
+      check_package_version_ge_DEB ffmpeg $FFMPEG_VERSION_MIN
+      if [ $? -eq 0 ]; then
+        install_packages_DEB libavdevice-dev
+        clean_FFmpeg
+      else
+        compile_FFmpeg
+      fi
+    else
+      compile_FFmpeg
+    fi
  fi
 }

@@ -2769,7 +2718,7 @@ install_DEB() {
 #### Install on RPM-like ####
 rpm_flavour() {
  if [ -f /etc/redhat-release ]; then
-    if [ "`grep '6\.' /etc/redhat-release`" ]; then
+    if [ "`grep '[6-7]\.' /etc/redhat-release`" ]; then
      RPM="RHEL"
    else
      RPM="FEDORA"
@@ -2781,8 +2730,10 @@ rpm_flavour() {

 get_package_version_RPM() {
  rpm_flavour
-  if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
+  if [ "$RPM" = "RHEL" ]; then
    yum info $1 | grep Version | tail -n 1 | sed -r 's/.*:\s+(([0-9]+\.?)+).*/\1/'
+  elif [ "$RPM" = "FEDORA" ]; then
+    dnf info $1 | grep Version | tail -n 1 | sed -r 's/.*:\s+(([0-9]+\.?)+).*/\1/'
  elif [ "$RPM" = "SUSE" ]; then
    zypper info $1 | grep Version | tail -n 1 | sed -r 's/.*:\s+(([0-9]+\.?)+).*/\1/'
  fi
@@ -2790,8 +2741,10 @@ get_package_version_RPM() {

 check_package_RPM() {
  rpm_flavour
-  if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
+  if [ "$RPM" = "RHEL" ]; then
    r=`yum info $1 | grep -c 'Summary'`
+  elif [ "$RPM" = "FEDORA" ]; then
+    r=`dnf info $1 | grep -c 'Summary'`
  elif [ "$RPM" = "SUSE" ]; then
    r=`zypper info $1 | grep -c 'Summary'`
  fi
@@ -2838,26 +2791,28 @@ check_package_version_ge_lt_RPM() {

 install_packages_RPM() {
  rpm_flavour
-  if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
-    if [ ! $SUDO ]; then
-      WARNING "--no-sudo enabled, impossible to run yum install for $@, you'll have to do it yourself..."
-    else
-      $SUDO yum install -y $@
-      if [ $? -ge 1 ]; then
-        ERROR "yum failed to install requested packages, exiting."
-        exit 1
-      fi
+  if [ ! $SUDO ]; then
+    WARNING "--no-sudo enabled, impossible to install $@, you'll have to do it yourself..."
+  fi
+  if [ "$RPM" = "RHEL" ]; then
+    $SUDO yum install -y $@
+    if [ $? -ge 1 ]; then
+      ERROR "yum failed to install requested packages, exiting."
+      exit 1
+    fi
+
+  elif [ "$RPM" = "FEDORA" ]; then
+    $SUDO dnf install -y $@
+    if [ $? -ge 1 ]; then
+      ERROR "dnf failed to install requested packages, exiting."
+      exit 1
    fi

  elif [ "$RPM" = "SUSE" ]; then
-    if [ ! $SUDO ]; then
-      WARNING "--no-sudo enabled, impossible to run zypper install for $@, you'll have to do it yourself..."
-    else
-      $SUDO zypper --non-interactive install --auto-agree-with-licenses $@
-      if [ $? -ge 1 ]; then
-        ERROR "zypper failed to install requested packages, exiting."
-        exit 1
-      fi
+    $SUDO zypper --non-interactive install --auto-agree-with-licenses $@
+    if [ $? -ge 1 ]; then
+      ERROR "zypper failed to install requested packages, exiting."
+      exit 1
    fi
  fi
 }
@@ -2881,49 +2836,39 @@ install_RPM() {
    rpm_flavour
    if [ "$RPM" = "FEDORA" ]; then
      _fedora_rel="`egrep "[0-9]{1,}" /etc/fedora-release -o`"
-      $SUDO yum -y localinstall --nogpgcheck \
+      $SUDO dnf -y install --nogpgcheck \
      http://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$_fedora_rel.noarch.rpm \
      http://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$_fedora_rel.noarch.rpm

-      $SUDO yum -y update
-
-      # Install cmake now because of difference with RHEL
-      $SUDO yum -y install cmake
+      $SUDO dnf -y update

    elif [ "$RPM" = "RHEL" ]; then
-      $SUDO yum -y localinstall --nogpgcheck \
-      http://download.fedoraproject.org/pub/epel/6/$(uname -i)/epel-release-6-8.noarch.rpm \
-      http://download1.rpmfusion.org/free/el/updates/6/$(uname -i)/rpmfusion-free-release-6-1.noarch.rpm \
-      http://download1.rpmfusion.org/nonfree/el/updates/6/$(uname -i)/rpmfusion-nonfree-release-6-1.noarch.rpm
-
-      $SUDO yum -y update
-
-      # Install cmake 2.8 from other repo
-      mkdir -p $SRC
-      if [ -f $SRC/cmake-2.8.8-4.el6.$(uname -m).rpm ]; then
-        PRINT ""
-        INFO "Special cmake already installed"
+      if [ "`grep '6\.' /etc/redhat-release`" ]; then
+        ERROR "Building with GCC 4.4 is not supported!"
+        exit 1
      else
-        curl -O ftp://ftp.pbone.net/mirror/atrpms.net/el6-$(uname -i)/atrpms/testing/cmake-2.8.8-4.el6.$(uname -m).rpm
-        mv cmake-2.8.8-4.el6.$(uname -m).rpm $SRC/
-        $SUDO rpm -ihv $SRC/cmake-2.8.8-4.el6.$(uname -m).rpm
+        $SUDO yum -y install --nogpgcheck \
+        http://download.fedoraproject.org/pub/epel/7/$(uname -i)/e/epel-release-7-6.noarch.rpm \
+        http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
+
+        $SUDO yum -y update
      fi

    elif [ "$RPM" = "SUSE" ]; then
-      # Install this now to avoid using the version from packman repository...
-      if [ "$WITH_ALL" = true ]; then
-        install_packages_RPM libjack-devel
+      # Packman repo now includes name in link...
+      _suse_rel="`grep -w VERSION /etc/os-release | sed 's/[^0-9.]*//g'`"
+      _suse_name="`grep -w NAME /etc/os-release | gawk '{print $2}' | sed 's/\"//'`"
+      if [ $_suse_name ]; then
+        _suse_rel="${_suse_name}_${_suse_rel}"
      fi

-      _suse_rel="`grep VERSION /etc/SuSE-release | gawk '{print $3}'`"
-
      PRINT ""
      INFO "About to add 'packman' repository from http://packman.inode.at/suse/openSUSE_$_suse_rel/"
      INFO "This is only needed if you do not already have a packman repository enabled..."
      read -p "Do you want to add this repo (Y/n)?"
      if [ "$(echo ${REPLY:=Y} | tr [:upper:] [:lower:])" == "y" ]; then
        INFO "    Installing packman..."
-        $SUDO zypper ar --refresh --name 'Packman Repository' http://ftp.gwdg.de/pub/linux/packman/suse/openSUSE_$_suse_rel/ ftp.gwdg.de-suse
+        $SUDO zypper ar -f -n packman http://ftp.gwdg.de/pub/linux/misc/packman/suse/openSUSE_$_suse_rel/ packman
        INFO "    Done."
      else
        INFO "    Skipping packman installation."
@@ -2938,11 +2883,12 @@ install_RPM() {
  OGG_DEV="libogg-devel"
  THEORA_DEV="libtheora-devel"

-  _packages="gcc gcc-c++ git make cmake libtiff-devel libjpeg-devel\
-             libpng-devel libX11-devel libXi-devel libXcursor-devel libXrandr-devel libXinerama-devel \
+  _packages="gcc gcc-c++ git make cmake tar bzip2 xz findutils flex bison \
+             libtiff-devel libjpeg-devel libpng-devel sqlite-devel fftw-devel SDL-devel \
+             libX11-devel libXi-devel libXcursor-devel libXrandr-devel libXinerama-devel \
             wget ncurses-devel readline-devel $OPENJPEG_DEV openal-soft-devel \
             glew-devel yasm $THEORA_DEV $VORBIS_DEV $OGG_DEV patch \
-             libxml2-devel yaml-cpp-devel tinyxml-devel"
+             libxml2-devel yaml-cpp-devel tinyxml-devel jemalloc-devel"

  OPENJPEG_USE=true
  VORBIS_USE=true
@@ -2950,9 +2896,7 @@ install_RPM() {
  THEORA_USE=true

  if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
-    OPENEXR_DEV="openexr-devel"
-
-    _packages="$_packages freetype-devel libsqlite3x-devel fftw-devel SDL-devel"
+    _packages="$_packages freetype-devel tbb-devel"

    if [ "$WITH_ALL" = true ]; then
      _packages="$_packages jack-audio-connection-kit-devel"
@@ -2988,13 +2932,22 @@ install_RPM() {
    fi

  elif [ "$RPM" = "SUSE" ]; then
-    OPENEXR_DEV="libopenexr-devel"
-
-    _packages="$_packages cmake freetype2-devel sqlite3-devel fftw3-devel libSDL-devel"
+    _packages="$_packages freetype2-devel"

    PRINT ""
    install_packages_RPM $_packages

+    PRINT ""
+    # Install TBB on openSUSE, from temporary repo
+    check_package_RPM tbb-devel
+    if [ $? -eq 0 ]; then
+      install_packages_RPM tbb-devel
+    else
+      $SUDO zypper ar -f http://download.opensuse.org/repositories/devel:/libraries:/c_c++/openSUSE_$_suse_rel/devel:libraries:c_c++.repo
+      $SUDO zypper -n --gpg-auto-import-keys install tbb-devel
+      $SUDO zypper rr devel_libraries_c_c++
+    fi
+
    PRINT ""
    X264_DEV="libx264-devel"
    check_package_version_ge_RPM $X264_DEV $X264_VERSION_MIN
@@ -3083,21 +3036,32 @@ install_RPM() {


  PRINT ""
+  _do_compile_boost=false
  if [ "$BOOST_SKIP" = true ]; then
    WARNING "Skipping Boost installation, as requested..."
  elif [ "$BOOST_FORCE_BUILD" = true ]; then
    INFO "Forced Boost building, as requested..."
-    compile_Boost
+    _do_compile_boost=true
  else
    check_package_version_ge_RPM boost-devel $BOOST_VERSION_MIN
    if [ $? -eq 0 ]; then
      install_packages_RPM boost-devel
      clean_Boost
    else
-      compile_Boost
+      _do_compile_boost=true
    fi
  fi

+  if [ "$_do_compile_boost" = true ]; then
+    if [ "$RPM" = "SUSE" ]; then
+      install_packages_RPM gcc-fortran
+    else
+      install_packages_RPM libquadmath-devel bzip2-devel
+    fi
+    PRINT ""
+    compile_Boost
+  fi
+

  PRINT ""
  if [ "$OCIO_SKIP" = true ]; then
@@ -3106,14 +3070,18 @@ install_RPM() {
    INFO "Forced OpenColorIO building, as requested..."
    compile_OCIO
  else
-    # XXX Always force build of own OCIO, until linux distro guys update their package to default libyaml-cpp ver (0.5)!
-    #check_package_version_ge_RPM OpenColorIO-devel $OCIO_VERSION_MIN
-    #if [ $? -eq 0 ]; then
-      #install_packages_RPM OpenColorIO-devel
-      #clean_OCIO
-    #else
+    if [ "$RPM" = "SUSE" ]; then
+      check_package_version_ge_RPM OpenColorIO-devel $OCIO_VERSION_MIN
+      if [ $? -eq 0 ]; then
+        install_packages_RPM OpenColorIO-devel
+        clean_OCIO
+      else
+        compile_OCIO
+      fi
+    # XXX Fedora/RHEL OCIO still depends on libyaml-cpp v0.3 even when system default is v0.5!
+    else
      compile_OCIO
-    #fi
+    fi
  fi

  PRINT ""
@@ -3123,10 +3091,10 @@ install_RPM() {
    INFO "Forced ILMBase/OpenEXR building, as requested..."
    compile_OPENEXR
  else
-    check_package_version_ge_RPM $OPENEXR_DEV $OPENEXR_VERSION_MIN
+    check_package_version_ge_RPM openexr-devel $OPENEXR_VERSION_MIN
    if [ $? -eq 0 ]; then
-      install_packages_RPM $OPENEXR_DEV
-      OPENEXR_VERSION=`get_package_version_RPM $OPENEXR_DEV`
+      install_packages_RPM openexr-devel
+      OPENEXR_VERSION=`get_package_version_RPM openexr-devel`
      ILMBASE_VERSION=$OPENEXR_VERSION
      clean_OPENEXR
    else
@@ -3141,13 +3109,14 @@ install_RPM() {
    INFO "Forced OpenImageIO building, as requested..."
    compile_OIIO
  else
-    check_package_version_ge_lt_RPM OpenImageIO-devel $OIIO_VERSION_MIN $OIIO_VERSION_MAX
-    if [ $? -eq 0 -a $_with_built_openexr == false ]; then
-      install_packages_RPM OpenImageIO-devel
-      clean_OIIO
-    else
+    # XXX RPM distros pull in too much and depend on old libs, so better to build for now...
+    #check_package_version_ge_lt_RPM OpenImageIO-devel $OIIO_VERSION_MIN $OIIO_VERSION_MAX
+    #if [ $? -eq 0 -a $_with_built_openexr == false ]; then
+    #  install_packages_RPM OpenImageIO-devel
+    #  clean_OIIO
+    #else
      compile_OIIO
-    fi
+    #fi
  fi


@@ -3156,34 +3125,26 @@ install_RPM() {
  _do_compile_llvm=false
  if [ "$LLVM_SKIP" = true ]; then
    WARNING "Skipping LLVM installation, as requested (this also implies skipping OSL!)..."
+    OSL_SKIP=true
  elif [ "$LLVM_FORCE_BUILD" = true ]; then
    INFO "Forced LLVM building, as requested..."
    _do_compile_llvm=true
  else
-    # Problem compiling with LLVM 3.2 so match version 3.1 ...
    if [ "$RPM" = "SUSE" ]; then
-      check_package_version_match_RPM llvm-clang-devel $LLVM_VERSION
-      if [ $? -eq 0 ]; then
-        install_packages_RPM llvm-devel llvm-clang-devel
-        have_llvm=true
-        LLVM_VERSION_FOUND=$LLVM_VERSION
-        clean_LLVM
-      else
-        # Better to compile it than use minimum version from repo...
-        _do_compile_llvm=true
-      fi
+      CLANG_DEV="llvm-clang-devel"
    else
-      check_package_version_match_RPM clang-devel $LLVM_VERSION
-      if [ $? -eq 0 ]; then
-        install_packages_RPM llvm-devel clang-devel
-        have_llvm=true
-        LLVM_VERSION_FOUND=$LLVM_VERSION
-        clean_LLVM
-      else
-        # Better to compile it than use minimum version from repo...
-        _do_compile_llvm=true
-      fi
+      CLANG_DEV="clang-devel"
    fi
+    # XXX RHEL has 3.4 in repo but OSL complains about not finding MCJIT_LIBRARY, so compile for now...
+    #check_package_version_match_RPM $CLANG_DEV $LLVM_VERSION
+    #if [ $? -eq 0 ]; then
+    #  install_packages_RPM llvm-devel $CLANG_DEV
+    #  have_llvm=true
+    #  LLVM_VERSION_FOUND=$LLVM_VERSION
+    #  clean_LLVM
+    #else
+      _do_compile_llvm=true
+    #fi
  fi

  if [ "$_do_compile_llvm" = true ]; then
@@ -3211,10 +3172,6 @@ install_RPM() {

  if [ "$_do_compile_osl" = true ]; then
    if [ "$have_llvm" = true ]; then
-      install_packages_RPM flex bison
-      if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
-        install_packages_RPM tbb-devel
-      fi
      PRINT ""
      compile_OSL
    else
@@ -3224,24 +3181,26 @@ install_RPM() {


  PRINT ""
-  _do_compile_osd=false
  if [ "$OSD_SKIP" = true ]; then
    WARNING "Skipping OpenSubdiv installation, as requested..."
  elif [ "$OSD_FORCE_BUILD" = true ]; then
    INFO "Forced OpenSubdiv building, as requested..."
-    _do_compile_osd=true
+    compile_OSD
  else
    # No package currently!
-    _do_compile_osd=true
+    compile_OSD
  fi

-  if [ "$_do_compile_osd" = true ]; then
-    install_packages_RPM flex bison
-    if [ "$RPM" = "FEDORA" -o "$RPM" = "RHEL" ]; then
-      install_packages_RPM tbb-devel
-    fi
-    PRINT ""
-    compile_OSD
+
+  PRINT ""
+  if [ "$OPENVDB_SKIP" = true ]; then
+    WARNING "Skipping OpenVDB installation, as requested..."
+  elif [ "$OPENVDB_FORCE_BUILD" = true ]; then
+    INFO "Forced OpenVDB building, as requested..."
+    compile_OPENVDB
+  else
+    # No package currently!
+    compile_OPENVDB
  fi


@@ -3275,7 +3234,7 @@ install_RPM() {
    INFO "Forced FFMpeg building, as requested..."
    compile_FFmpeg
  else
-    check_package_version_ge_RPM ffmpeg $FFMPEG_VERSION_MIN
+    check_package_version_ge_RPM ffmpeg-devel $FFMPEG_VERSION_MIN
    if [ $? -eq 0 ]; then
      install_packages_RPM ffmpeg ffmpeg-devel
      clean_FFmpeg
@@ -3383,7 +3342,15 @@ install_ARCH() {
  OGG_DEV="libogg"
  THEORA_DEV="libtheora"

-  _packages="base-devel git cmake \
+  BASE_DEVEL="base-devel"
+
+  # Avoid conflicts when gcc-multilib is installed
+  pacman -Qi gcc-multilib &>/dev/null
+  if [ $? -eq 0 ]; then
+    BASE_DEVEL=`pacman -Sgq base-devel | sed -e 's/^gcc$/gcc-multilib/g' | paste -s -d' '`
+  fi
+
+  _packages="$BASE_DEVEL git cmake \
             libxi libxcursor libxrandr libxinerama glew libpng libtiff wget openal \
             $OPENJPEG_DEV $VORBIS_DEV $OGG_DEV $THEORA_DEV yasm sdl fftw intel-tbb \
             libxml2 yaml-cpp tinyxml python-requests jemalloc"
@@ -3557,6 +3524,7 @@ install_ARCH() {
  _do_compile_llvm=false
  if [ "$LLVM_SKIP" = true ]; then
    WARNING "Skipping LLVM installation, as requested (this also implies skipping OSL!)..."
+    OSL_SKIP=true
  elif [ "$LLVM_FORCE_BUILD" = true ]; then
    INFO "Forced LLVM building, as requested..."
    _do_compile_llvm=true
@@ -3565,7 +3533,7 @@ install_ARCH() {
    if [ $? -eq 0 ]; then
      install_packages_ARCH llvm35 clang35
      have_llvm=true
-      LLVM_VERSION=`get_package_version_ARCH llvm`
+      LLVM_VERSION=`get_package_version_ARCH llvm35`
      LLVM_VERSION_FOUND=$LLVM_VERSION
      clean_LLVM
    else
@@ -3629,6 +3597,23 @@ install_ARCH() {
  fi


+  PRINT ""
+  if [ "$OPENVDB_SKIP" = true ]; then
+    WARNING "Skipping OpenVDB installation, as requested..."
+  elif [ "$OPENVDB_FORCE_BUILD" = true ]; then
+    INFO "Forced OpenVDB building, as requested..."
+    compile_OPENVDB
+  else
+    check_package_version_ge_ARCH openvdb $OPENVDB_VERSION_MIN
+    if [ $? -eq 0 ]; then
+      install_packages_ARCH openvdb
+      clean_OPENVDB
+    else
+      compile_OPENVDB
+    fi
+  fi
+
+
  if [ "$WITH_OPENCOLLADA" = true ]; then
    PRINT ""
    _do_compile_collada=false
@@ -3935,7 +3920,7 @@ print_info() {

  _buildargs="-U *SNDFILE* -U *PYTHON* -U *BOOST* -U *Boost*"
  _buildargs="$_buildargs -U *OPENCOLORIO* -U *OPENEXR* -U *OPENIMAGEIO* -U *LLVM* -U *CYCLES*"
-  _buildargs="$_buildargs -U *OPENSUBDIV* -U *COLLADA* -U *FFMPEG*"
+  _buildargs="$_buildargs -U *OPENSUBDIV* -U *OPENVDB* -U *COLLADA* -U *FFMPEG*"

  _1="-D WITH_CODEC_SNDFILE=ON"
  PRINT "  $_1"
@@ -4022,12 +4007,31 @@ print_info() {
    fi
  fi

+  if [ "$OPENVDB_SKIP" = false ]; then
+    _1="-D WITH_OPENVDB=ON"
+    _2="-D WITH_OPENVDB_BLOSC=ON"
+    PRINT "  $_1"
+    PRINT "  $_2"
+    _buildargs="$_buildargs $_1 $_2"
+    if [ -d $INST/openvdb ]; then
+      _1="-D OPENVDB_ROOT_DIR=$INST/openvdb"
+      PRINT "  $_1"
+      _buildargs="$_buildargs $_1"
+    fi
+  fi
+
  if [ "$WITH_OPENCOLLADA" = true ]; then
    _1="-D WITH_OPENCOLLADA=ON"
    PRINT "  $_1"
    _buildargs="$_buildargs $_1"
  fi

+  if [ "$NO_SYSTEM_GLEW" = true ]; then
+    _1="-D WITH_SYSTEM_GLEW=OFF"
+    PRINT "  $_1"
+    _buildargs="$_buildargs $_1"
+  fi
+
  if [ "$FFMPEG_SKIP" = false ]; then
    _1="-D WITH_CODEC_FFMPEG=ON"
    _2="-D FFMPEG_LIBRARIES='avformat;avcodec;avutil;avdevice;swscale;swresample;lzma;rt;`print_info_ffmpeglink`'"
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -56,7 +56,6 @@ if 'cmake' in builder:
    chroot_name = None  # If not None command will be delegated to that chroot
    cuda_chroot_name = None  # If not None cuda compilationcommand will be delegated to that chroot
    build_cubins = True  # Whether to build Cycles CUDA kernels
-    remove_install_dir = False  # Remove installation folder before building
    bits = 64

    # Config file to be used (relative to blender's sources root)
@@ -70,19 +69,24 @@ if 'cmake' in builder:
    cuda_cmake_options = []

    if builder.startswith('mac'):
-        install_dir = None
        # Set up OSX architecture
        if builder.endswith('x86_64_10_6_cmake'):
            cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64')
        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-hack/bin/nvcc')

    elif builder.startswith('win'):
-        install_dir = None
+      if builder.endswith('_vc2015'):
        if builder.startswith('win64'):
-            cmake_options.append(['-G', '"Visual Studio 12 2013 Win64"'])
+            cmake_options.extend(['-G', 'Visual Studio 14 2015 Win64', '-DWITH_CYCLES_CUDA_BINARIES=0'])
        elif builder.startswith('win32'):
            bits = 32
-            cmake_options.append(['-G', '"Visual Studio 12 2013"'])
+            cmake_options.extend(['-G', 'Visual Studio 14 2015', '-DWITH_CYCLES_CUDA_BINARIES=0'])
+      else:
+        if builder.startswith('win64'):
+            cmake_options.extend(['-G', 'Visual Studio 12 2013 Win64'])
+        elif builder.startswith('win32'):
+            bits = 32
+            cmake_options.extend(['-G', 'Visual Studio 12 2013'])

    elif builder.startswith('linux'):
        tokens = builder.split("_")
@@ -91,7 +95,6 @@ if 'cmake' in builder:
            deb_name = "jessie"
        elif glibc == 'glibc211':
            deb_name = "squeeze"
-        remove_install_dir = True
        cmake_config_file = "build_files/buildbot/config/blender_linux.cmake"
        cmake_player_config_file = "build_files/buildbot/config/blender_linux_player.cmake"
        if builder.endswith('x86_64_cmake'):
@@ -117,8 +120,7 @@ if 'cmake' in builder:
    if 'cuda' not in targets:
        cmake_options += cuda_cmake_options

-    if install_dir:
-        cmake_options.append("-DCMAKE_INSTALL_PREFIX=%s" % (install_dir))
+    cmake_options.append("-DCMAKE_INSTALL_PREFIX=%s" % (install_dir))

    cmake_options += cmake_extra_options

@@ -133,10 +135,8 @@ if 'cmake' in builder:
        cuda_chroot_prefix = chroot_prefix[:]

    # Make sure no garbage remained from the previous run
-    # (only do it if builder requested this)
-    if remove_install_dir:
-        if os.path.isdir(install_dir):
-            shutil.rmtree(install_dir)
+    if os.path.isdir(install_dir):
+        shutil.rmtree(install_dir)

    for target in targets:
        print("Building target %s" % (target))
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -1,5 +1,10 @@
-# This is called by cmake as an extermal process from
+# This is called by cmake as an external process from
 # ./source/creator/CMakeLists.txt to write ./source/creator/buildinfo.h
+# Caller must define:
+#   SOURCE_DIR
+# Optional overrides:
+#   BUILD_DATE
+#   BUILD_TIME

 # Extract working copy information for SOURCE_DIR into MY_XXX variables
 # with a default in case anything fails, for example when using git-svn
@@ -128,12 +133,19 @@ endif()
 # BUILD_PLATFORM and BUILD_PLATFORM are taken from CMake
 # but BUILD_DATE and BUILD_TIME are platform dependent
 if(UNIX)
-	execute_process(COMMAND date "+%Y-%m-%d" OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
-	execute_process(COMMAND date "+%H:%M:%S" OUTPUT_VARIABLE BUILD_TIME OUTPUT_STRIP_TRAILING_WHITESPACE)
-endif()
-if(WIN32)
-	execute_process(COMMAND cmd /c date /t OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
-	execute_process(COMMAND cmd /c time /t OUTPUT_VARIABLE BUILD_TIME OUTPUT_STRIP_TRAILING_WHITESPACE)
+	if(NOT BUILD_DATE)
+		execute_process(COMMAND date "+%Y-%m-%d" OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
+	endif()
+	if(NOT BUILD_TIME)
+		execute_process(COMMAND date "+%H:%M:%S" OUTPUT_VARIABLE BUILD_TIME OUTPUT_STRIP_TRAILING_WHITESPACE)
+	endif()
+elseif(WIN32)
+	if(NOT BUILD_DATE)
+		execute_process(COMMAND cmd /c date /t OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
+	endif()
+	if(NOT BUILD_TIME)
+		execute_process(COMMAND cmd /c time /t OUTPUT_VARIABLE BUILD_TIME OUTPUT_STRIP_TRAILING_WHITESPACE)
+	endif()
 endif()

 # Write a file with the BUILD_HASH define
--- a/build_files/cmake/config/blender_full.cmake
+++ b/build_files/cmake/config/blender_full.cmake
@@ -54,7 +54,7 @@ set(WITH_PLAYER              ON  CACHE BOOL "" FORCE)
 set(WITH_MEM_JEMALLOC        ON  CACHE BOOL "" FORCE)


-# platform dependant options
+# platform dependent options
 if(UNIX AND NOT APPLE)
 	set(WITH_JACK                ON  CACHE BOOL "" FORCE)
 	set(WITH_DOC_MANPAGE         ON  CACHE BOOL "" FORCE)
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -435,9 +435,6 @@ function(setup_liblinks
 	if(WITH_MEM_JEMALLOC)
 		target_link_libraries(${target} ${JEMALLOC_LIBRARIES})
 	endif()
-	if(WITH_INPUT_NDOF)
-		target_link_libraries(${target} ${NDOF_LIBRARIES})
-	endif()
 	if(WITH_MOD_CLOTH_ELTOPO)
 		target_link_libraries(${target} ${LAPACK_LIBRARIES})
 	endif()
@@ -451,6 +448,9 @@ function(setup_liblinks
 		if(WITH_OPENMP_STATIC)
 			target_link_libraries(${target} ${OpenMP_LIBRARIES})
 		endif()
+		if(WITH_INPUT_NDOF)
+			target_link_libraries(${target} ${NDOF_LIBRARIES})
+		endif()
 	endif()

 	# We put CLEW and CUEW here because OPENSUBDIV_LIBRARIES dpeends on them..
@@ -487,6 +487,7 @@ function(SETUP_BLENDER_SORTED_LIBS)
 	if(WITH_CYCLES)
 		list(APPEND BLENDER_LINK_LIBS
 			cycles_render
+			cycles_graph
 			cycles_bvh
 			cycles_device
 			cycles_kernel
@@ -551,11 +552,11 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		bf_modifiers
 		bf_bmesh
 		bf_gpu
+		bf_blenloader
 		bf_blenkernel
 		bf_physics
 		bf_nodes
 		bf_rna
-		bf_blenloader
 		bf_imbuf
 		bf_blenlib
 		bf_depsgraph
@@ -600,6 +601,7 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		bf_intern_dualcon
 		bf_intern_cycles
 		cycles_render
+		cycles_graph
 		cycles_bvh
 		cycles_device
 		cycles_kernel
@@ -659,10 +661,6 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		list(APPEND BLENDER_SORTED_LIBS bf_quicktime)
 	endif()

-	if(WITH_INPUT_NDOF)
-		list(APPEND BLENDER_SORTED_LIBS bf_intern_ghostndof3dconnexion)
-	endif()
-	
 	if(WITH_MOD_BOOLEAN)
 		list(APPEND BLENDER_SORTED_LIBS extern_carve)
 	endif()
@@ -687,6 +685,14 @@ function(SETUP_BLENDER_SORTED_LIBS)
 		list_insert_after(BLENDER_SORTED_LIBS "ge_logic_ngnetwork" "extern_bullet")
 	endif()

+	if(WITH_DECKLINK)
+		list(APPEND BLENDER_SORTED_LIBS bf_intern_decklink)
+	endif()
+
+	if(WIN32)
+		list(APPEND BLENDER_SORTED_LIBS bf_intern_gpudirect)
+	endif()
+
 	if(WITH_OPENSUBDIV)
 		list(APPEND BLENDER_SORTED_LIBS bf_intern_opensubdiv)
 	endif()
@@ -803,7 +809,15 @@ macro(TEST_UNORDERED_MAP_SUPPORT)
 	#  UNORDERED_MAP_NAMESPACE, namespace for unordered_map, if found

 	include(CheckIncludeFileCXX)
-	CHECK_INCLUDE_FILE_CXX("unordered_map" HAVE_STD_UNORDERED_MAP_HEADER)
+
+	# Workaround for newer GCC (6.x+) where C++11 is enabled by default, which leads to
+	# a situation where the <unordered_map> header is found but can't be used unless
+	# C++11 is enabled.
+	if(CMAKE_COMPILER_IS_GNUCC AND (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "6.0") AND (NOT WITH_CXX11))
+		set(HAVE_STD_UNORDERED_MAP_HEADER False)
+	else()
+		CHECK_INCLUDE_FILE_CXX("unordered_map" HAVE_STD_UNORDERED_MAP_HEADER)
+	endif()
 	if(HAVE_STD_UNORDERED_MAP_HEADER)
 		# Even so we've found unordered_map header file it doesn't
 		# mean unordered_map and unordered_set will be declared in
@@ -873,8 +887,16 @@ macro(TEST_SHARED_PTR_SUPPORT)
 	# otherwise it's assumed to be defined in std namespace.

 	include(CheckIncludeFileCXX)
+	include(CheckCXXSourceCompiles)
 	set(SHARED_PTR_FOUND FALSE)
-	CHECK_INCLUDE_FILE_CXX(memory HAVE_STD_MEMORY_HEADER)
+	# Workaround for newer GCC (6.x+) where C++11 is enabled by default, which leads to
+	# a situation where the <memory> header is found but std::shared_ptr can't be used
+	# unless C++11 is enabled.
+	if(CMAKE_COMPILER_IS_GNUCC AND (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "6.0") AND (NOT WITH_CXX11))
+		set(HAVE_STD_MEMORY_HEADER False)
+	else()
+		CHECK_INCLUDE_FILE_CXX(memory HAVE_STD_MEMORY_HEADER)
+	endif()
 	if(HAVE_STD_MEMORY_HEADER)
 		# Finding the memory header doesn't mean that shared_ptr is in std
 		# namespace.
@@ -882,7 +904,6 @@ macro(TEST_SHARED_PTR_SUPPORT)
 		# In particular, MSVC 2008 has shared_ptr declared in std::tr1.  In
 		# order to support this, we do an extra check to see which namespace
 		# should be used.
-		include(CheckCXXSourceCompiles)
 		CHECK_CXX_SOURCE_COMPILES("#include <memory>
 		                           int main() {
 		                             std::shared_ptr<int> int_ptr;
@@ -1050,6 +1071,19 @@ macro(remove_strict_flags_file

 endmacro()

+# External libs may need 'signed char' to be default: drop the 'unsigned char' flag.
+macro(remove_cc_flag_unsigned_char)
+	if(CMAKE_C_COMPILER_ID MATCHES "^(GNU|Clang|Intel)$")
+		remove_cc_flag("-funsigned-char")
+	elseif(MSVC)
+		remove_cc_flag("/J")
+	else()
+		# message() joins its arguments with no separator, hence the trailing space.
+		message(WARNING
+			"Compiler '${CMAKE_C_COMPILER_ID}' failed to disable 'unsigned char' flag. "
+			"Build files need updating.")
+	endif()
+endmacro()

 function(ADD_CHECK_C_COMPILER_FLAG
 	_CFLAGS
--- a/doc/python_api/examples/bge.texture.2.py
+++ b/doc/python_api/examples/bge.texture.2.py
@@ -0,0 +1,239 @@
+"""
+Video Capture with DeckLink
+++++++++++++++++++++++++++
+Video frames captured with DeckLink cards have pixel formats that are generally not directly
+usable by OpenGL, they must be processed by a shader. The three shaders presented here should
+cover all common video capture cases.
+
+This file reflects the current video transfer method implemented in the Decklink module:
+whenever possible the video images are transferred as float texture because this is more
+compatible with GPUs. Of course, only the pixel formats that have a corresponding GL format
+can be transferred as float. Look for fg_shaders in this file for an exhaustive list.
+
+Other pixel formats will be transferred as 32-bit integer red-channel textures, but this
+won't work with certain GPUs (Intel GMA); the corresponding shaders are not shown here.
+However, it should not be necessary to use any of them as the list below covers all practical
+cases of video capture with all types of Decklink products.
+
+In other words, only use one of the pixel formats below and you will be fine. Note that depending
+on the video stream, only certain pixel formats will be allowed (others will throw an exception).
+For example, to capture a PAL video stream, you must use one of the YUV formats.
+
+To find which pixel format is suitable for a particular video stream, use the 'Media Express'
+utility that comes with the Decklink software : if you see the video in the 'Log and Capture'
+Window, you have selected the right pixel format and you can use the same in Blender.
+
+Notes: * these shaders only decode the RGB channel and set the alpha channel to a fixed
+value (look for color.a = ). It's up to you to add postprocessing to the color.
+       * these shaders are compatible with 2D and 3D video stream
+"""
+import bge
+from bge import logic
+from bge import texture as vt
+
+# The default vertex shader, because we need one
+#
+VertexShader = """
+#version 130
+   void main()
+   {
+      gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;
+      gl_TexCoord[0] = gl_MultiTexCoord0;
+   }
+    
+"""
+
+# For use with RGB video stream: the pixel is directly usable
+#
+FragmentShader_R10l = """
+    #version 130
+    uniform sampler2D tex;
+    // stereo = 1.0 if 2D image, =0.5 if 3D (left eye below, right eye above)
+    uniform float stereo;
+    // eye = 0.0 for the left eye, 0.5 for the right eye
+    uniform float eye;
+
+    void main(void)
+    {
+        vec4 color;
+        float tx, ty;
+        tx = gl_TexCoord[0].x;
+        ty = eye+gl_TexCoord[0].y*stereo;
+        color = texture(tex, vec2(tx,ty));
+        color.a = 0.7;
+        gl_FragColor = color; 
+    }
+"""
+
+# For use with YUV video stream
+#
+FragmentShader_2vuy = """
+    #version 130
+    uniform sampler2D tex; 
+    // stereo = 1.0 if 2D image, =0.5 if 3D (left eye below, right eye above)
+    uniform float stereo;
+    // eye = 0.0 for the left eye, 0.5 for the right eye
+    uniform float eye;
+
+    void main(void) 
+    {
+        vec4 color;
+        float tx, ty, width, Y, Cb, Cr; 
+        int px;
+        tx = gl_TexCoord[0].x; 
+        ty = eye+gl_TexCoord[0].y*stereo;
+        width = float(textureSize(tex, 0).x);
+        color = texture(tex, vec2(tx, ty));
+        px = int(floor(fract(tx*width)*2.0));
+        switch (px) {
+        case 0:
+            Y = color.g;
+            break;
+        case 1:
+            Y = color.a;
+            break;
+        }
+        Y = (Y - 0.0625) * 1.168949772; 
+        Cb = (color.b - 0.0625) * 1.142857143 - 0.5; 
+        Cr = (color.r - 0.0625) * 1.142857143 - 0.5; 
+        color.r = Y + 1.5748 * Cr; 
+        color.g = Y - 0.1873 * Cb - 0.4681 * Cr;
+        color.b = Y + 1.8556 * Cb;
+        color.a = 0.7;
+        gl_FragColor = color; 
+    }
+"""
+
+# For use with high resolution YUV
+#
+FragmentShader_v210 = """
+    #version 130
+    uniform sampler2D tex; 
+    // stereo = 1.0 if 2D image, =0.5 if 3D (left eye below, right eye above)
+    uniform float stereo;
+    // eye = 0.0 for the left eye, 0.5 for the right eye
+    uniform float eye;
+
+    void main(void) 
+    {
+        vec4 color, color1, color2, color3;
+        int px;
+        float tx, ty, width, sx, dx, bx, Y, Cb, Cr; 
+        tx = gl_TexCoord[0].x; 
+        ty = eye+gl_TexCoord[0].y*stereo; 
+        width = float(textureSize(tex, 0).x);
+        // to sample macro pixels (6 pixels in 4 words)
+        sx = tx*width*0.25+0.01;
+        // index of display pixel in the macro pixel 0..5
+        px = int(floor(fract(sx)*6.0));
+        // increment as we sample the macro pixel
+        dx = 1.0/width;
+        // base x coord of macro pixel
+        bx = (floor(sx)+0.01)*dx*4.0;
+        color = texture(tex, vec2(bx, ty));
+        color1 = texture(tex, vec2(bx+dx, ty));
+        color2 = texture(tex, vec2(bx+dx*2.0, ty));
+        color3 = texture(tex, vec2(bx+dx*3.0, ty));
+        switch (px) {
+        case 0:
+        case 1:
+            Cb = color.b;
+            Cr = color.r;
+            break;
+        case 2:
+        case 3:
+            Cb = color1.g;
+            Cr = color2.b;
+            break;
+        default:
+            Cb = color2.r;
+            Cr = color3.g;
+            break;
+        }
+        switch (px) {
+        case 0:
+            Y = color.g;
+            break;
+        case 1:
+            Y = color1.b;
+            break;
+        case 2:
+            Y = color1.r;
+            break;
+        case 3:
+            Y = color2.g;
+            break;
+        case 4:
+            Y = color3.b;
+            break;
+        default:
+            Y = color3.r;
+            break;
+        }
+        Y = (Y - 0.0625) * 1.168949772; 
+        Cb = (Cb - 0.0625) * 1.142857143 - 0.5; 
+        Cr = (Cr - 0.0625) * 1.142857143 - 0.5; 
+        color.r = Y + 1.5748 * Cr; 
+        color.g = Y - 0.1873 * Cb - 0.4681 * Cr;
+        color.b = Y + 1.8556 * Cb;
+        color.a = 0.7;
+        gl_FragColor = color; 
+    }
+"""
+
+# The exhaustive list of pixel formats that are transferred as float texture.
+# Only use those for greater efficiency and compatibility.
+#
+fg_shaders = {
+    '2vuy'       :FragmentShader_2vuy,
+    '8BitYUV'    :FragmentShader_2vuy,
+    'v210'       :FragmentShader_v210,
+    '10BitYUV'   :FragmentShader_v210,
+    '8BitBGRA'   :FragmentShader_R10l,
+    'BGRA'       :FragmentShader_R10l,
+    '8BitARGB'   :FragmentShader_R10l,
+    '10BitRGBXLE':FragmentShader_R10l,
+    'R10l'       :FragmentShader_R10l
+    }
+
+
+    
+
+# Helper function to attach a pixel shader to the material that receives the video frame.
+# Raises ValueError (a bare string cannot be raised in Python 3) for an unknown 'pixel';
+# the texture that is created is stored in obj["video"], which play() refreshes.
+
+def config_video(obj, format, pixel, is3D=False, mat=0, card=0):
+    if pixel not in fg_shaders:
+        raise ValueError('Unsupported shader')
+    shader = obj.meshes[0].materials[mat].getShader()
+    if shader is not None and not shader.isValid():
+        shader.setSource(VertexShader, fg_shaders[pixel], True)
+        shader.setSampler('tex', 0)
+        shader.setUniformEyef("eye")
+        shader.setUniform1f("stereo", 0.5 if is3D else 1.0)
+    tex = vt.Texture(obj, mat)
+    tex.source = vt.VideoDeckLink(format + "/" + pixel + ("/3D" if is3D else ""), card)
+    print("frame rate: ", tex.source.framerate)
+    tex.source.play()
+    obj["video"] = tex
+
+#
+# Attach this function to an object that has a material with texture
+# and call it once to initialize the object
+# 
+def init(cont):
+    #config_video(cont.owner, 'HD720p5994', '8BitBGRA')    
+    #config_video(cont.owner, 'HD720p5994', '8BitYUV')    
+    #config_video(cont.owner, 'pal ', '10BitYUV')    
+    config_video(cont.owner, 'pal ', '8BitYUV')    
+       
+
+#
+# To be called on every frame: refreshes the texture that config_video() stored in obj["video"]
+#
+def play(cont):
+    obj = cont.owner
+    if "video" in obj:  # stored with obj[...], so test the key, not an attribute
+        obj["video"].refresh(True)
+
--- a/doc/python_api/rst/bge.logic.rst
+++ b/doc/python_api/rst/bge.logic.rst
@@ -378,6 +378,27 @@ General functions

   Render next frame (if Python has control)

+.. function:: setRender(render)
+
+   Sets the global flag that controls the render of the scene. 
+   If True, the render is done after the logic frame.
+   If False, the render is skipped and another logic frame starts immediately.
+
+   .. note::
+      GPU VSync no longer limits the number of frames per second when render is off,
+      but the 'Use Frame Rate' option still regulates the fps. To run as many frames
+      as possible, untick this option (Render Properties, System panel)
+
+   :arg render: the render flag
+   :type render: bool
+
+.. function:: getRender()
+
+   Get the current value of the global render flag
+
+   :return: The flag value
+   :rtype: bool
+
 **********************
 Time related functions
 **********************
--- a/doc/python_api/rst/bge.render.rst
+++ b/doc/python_api/rst/bge.render.rst
@@ -90,6 +90,43 @@ Constants

   Right eye being used during stereoscopic rendering.

+.. data:: RAS_OFS_RENDER_BUFFER
+
+   The pixel buffer for offscreen render is a RenderBuffer. Argument to :func:`offScreenCreate`
+
+.. data:: RAS_OFS_RENDER_TEXTURE
+
+   The pixel buffer for offscreen render is a Texture. Argument to :func:`offScreenCreate`
+
+*****
+Types
+*****
+
+.. class:: RASOffScreen
+
+   An off-screen render buffer object. 
+
+   Use :func:`offScreenCreate` to create it.
+   Currently it can only be used in the :class:`bge.texture.ImageRender` constructor to render on a FBO rather than the 
+   default viewport.
+
+   .. attribute:: width
+
+      The width in pixels of the FBO
+
+      :type: integer
+
+   .. attribute:: height
+
+      The height in pixels of the FBO
+
+      :type: integer
+
+   .. attribute:: color
+
+      The underlying OpenGL bind code of the texture object that holds the rendered image, 0 if the FBO is using RenderBuffer. The choice between RenderBuffer and Texture is determined by the target argument of :func:`offScreenCreate`.
+
+      :type: integer

 *********
 Functions
@@ -362,3 +399,18 @@ Functions
   Get the current vsync value

   :rtype: One of VSYNC_OFF, VSYNC_ON, VSYNC_ADAPTIVE
+
+.. function:: offScreenCreate(width,height[,samples=0][,target=bge.render.RAS_OFS_RENDER_BUFFER])
+
+   Create an off-screen render buffer object.
+
+   :arg width: the width of the buffer in pixels
+   :type width: integer
+   :arg height: the height of the buffer in pixels
+   :type height: integer
+   :arg samples: the number of samples for multisample anti-aliasing (MSAA), 0 to disable MSAA
+   :type samples: integer
+   :arg target: the pixel storage: :data:`RAS_OFS_RENDER_BUFFER` to render on RenderBuffers (the default), :data:`RAS_OFS_RENDER_TEXTURE` to render on texture. The latter is useful if you want to access the texture directly (see :attr:`RASOffScreen.color`). Otherwise the default is preferable as it's more widely supported by GPUs and more efficient. If the GPU does not support MSAA+Texture (e.g. Intel HD GPU), MSAA will be disabled.
+   :type target: integer
+   :rtype: :class:`RASOffScreen`
+
--- a/doc/python_api/rst/bge.texture.rst
+++ b/doc/python_api/rst/bge.texture.rst
@@ -51,6 +51,13 @@ When the texture object is deleted, the new texture is deleted and the old textu
   :lines: 8-
   
   
+.. include:: ../examples/bge.texture.2.py
+   :start-line: 1
+   :end-line: 6
+
+.. literalinclude:: ../examples/bge.texture.2.py
+   :lines: 8-
+
 *************
 Video classes
 *************
@@ -173,12 +180,17 @@ Video classes
      :return: Whether the video was playing.
      :rtype: bool

-   .. method:: refresh()
+   .. method:: refresh(buffer=None, format="RGBA", ts=-1.0)

-      Refresh video - get its status.
-      
-      :value: see `FFmpeg Video and Image Status`_.
+      Refresh video - get its status and optionally copy the frame to an external buffer.

+      :arg buffer: An optional object that implements the buffer protocol. If specified, the image is copied to the buffer, which must be big enough or an exception is thrown.
+      :type buffer: any buffer type
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str
+      :arg ts: An optional timestamp (in seconds from the start of the movie) of the frame to be copied to the buffer.
+      :type ts: float
+      :return: see `FFmpeg Video and Image Status`_.
      :rtype: int

 *************
@@ -244,12 +256,15 @@ Image classes
         * :class:`FilterRGB24`
         * :class:`FilterRGBA32`

-   .. method:: refresh()
+   .. method:: refresh(buffer=None, format="RGBA")

-      Refresh image, i.e. load it.
+      Refresh image, get its status and optionally copy the frame to an external buffer.
      
-      :value: see `FFmpeg Video and Image Status`_.
-
+      :arg buffer: An optional object that implements the buffer protocol. If specified, the image is copied to the buffer, which must be big enough or an exception is thrown.
+      :type buffer: any buffer type
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str
+      :return: see `FFmpeg Video and Image Status`_.
      :rtype: int

   .. method:: reload(newname=None)
@@ -411,9 +426,14 @@ Image classes
      
      :type: :class:`~bgl.Buffer` or None

-   .. method:: refresh()
+   .. method:: refresh(buffer=None, format="RGBA")

-      Refresh image - invalidate its current content.
+      Refresh image - render and copy the image to an external buffer (optional) then invalidate its current content.
+
+      :arg buffer: An optional object that implements the buffer protocol. If specified, the image is rendered and copied to the buffer, which must be big enough or an exception is thrown.
+      :type buffer: any buffer type
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str

   .. attribute:: scale

@@ -498,9 +518,14 @@ Image classes
      
      :type: :class:`~bgl.Buffer` or None

-   .. method:: refresh()
+   .. method:: refresh(buffer=None, format="RGBA")

-      Refresh image - invalidate its current content.
+      Refresh image - calculate and copy the image to an external buffer (optional) then invalidate its current content.
+
+      :arg buffer: An optional object that implements the buffer protocol. If specified, the image is calculated and copied to the buffer, which must be big enough or an exception is thrown.
+      :type buffer: any buffer type
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str

   .. attribute:: scale

@@ -545,14 +570,17 @@ Image classes

      :type: bool

-.. class:: ImageRender(scene, camera)
+.. class:: ImageRender(scene, camera, fbo=None)

-   Image source from render.
+   Image source from render. The render is done on a custom framebuffer object if fbo is specified, otherwise on 
+   the default framebuffer.
   
   :arg scene: Scene in which the image has to be taken.
   :type scene: :class:`~bge.types.KX_Scene`
   :arg camera: Camera from which the image has to be taken.
   :type camera: :class:`~bge.types.KX_Camera`
+   :arg fbo: Off-screen render buffer object (optional)
+   :type fbo: :class:`~bge.render.RASOffScreen`

   .. attribute:: alpha

@@ -599,10 +627,6 @@ Image classes
      
      :type: :class:`~bgl.Buffer` or None

-   .. method:: refresh()
-
-      Refresh image - invalidate its current content.
-
   .. attribute:: scale

      Fast scale of image (near neighbour).
@@ -640,6 +664,25 @@ Image classes
      
      :type: bool

+   .. method:: render()
+
+      Render the scene but do not extract the pixels yet. The function returns as soon as the render commands have been sent to the GPU. The render will proceed asynchronously in the GPU while the host can perform other tasks. To complete the render, you can either call :func:`refresh` directly or refresh the texture of which this object is the source. This method is useful to implement asynchronous render for optimal performance: call render() on frame n and refresh() on frame n+1 to give as much time as possible to the GPU to render the frame while the game engine can perform other tasks.
+
+      :return: True if the render was initiated, False if the render cannot be performed (e.g. the camera is active)
+      :rtype: bool
+
+   .. method:: refresh()
+   .. method:: refresh(buffer, format="RGBA")
+
+      Refresh image - render and optionally copy the image to an external buffer then invalidate its current content. The render may have been started earlier with the :func:`render` method, in which case this function simply waits for the render operations to complete. When called without argument, the pixels are not extracted but the render is guaranteed to be completed when the function returns. This only makes sense with offscreen render on texture target (see :func:`~bge.render.offScreenCreate`).
+
+      :arg buffer: An object that implements the buffer protocol. If specified, the image is copied to the buffer, which must be big enough or an exception is thrown. The transfer to the buffer is optimal if no processing of the image is needed. This is the case if flip=False, alpha=True, scale=False, whole=True, depth=False, zbuff=False and no filter is set.
+      :type buffer: any buffer type of sufficient size
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str
+      :return: True if the render is complete, False if the render cannot be performed (e.g. the camera is active)
+      :rtype: bool
+
 .. class:: ImageViewport

   Image source from viewport.
@@ -689,9 +732,14 @@ Image classes
      
      :type: sequence of two ints

-   .. method:: refresh()
+   .. method:: refresh(buffer=None, format="RGBA")

-      Refresh image - invalidate its current content.
+      Refresh image - copy the viewport to an external buffer (optional) then invalidate its current content.
+
+      :arg buffer: An optional object that implements the buffer protocol. If specified, the image is copied to the buffer, which must be big enough or an exception is thrown. The transfer to the buffer is optimal if no processing of the image is needed. This is the case if flip=False, alpha=True, scale=False, whole=True, depth=False, zbuff=False and no filter is set.
+      :type buffer: any buffer type
+      :arg format: An optional image format specifier for the image that will be copied to the buffer. Only valid values are "RGBA" or "BGRA"
+      :type format: str

   .. attribute:: scale

@@ -730,7 +778,174 @@ Image classes
      
      :type: bool

+.. class:: VideoDeckLink(format, capture=0)
+
+   Image source from an external video stream captured with a DeckLink video card from 
+   Black Magic Design.
+   Before this source can be used, a DeckLink hardware device must be installed (it can be a PCIe card
+   or a USB device) and the 'Desktop Video' software package (version 10.4 or above) must be installed
+   on the host as described in the DeckLink documentation.
+   If in addition you have a recent NVIDIA Quadro card, you can benefit from the 'GPUDirect' technology
+   to push the captured video frame very efficiently to the GPU. For this you need to install the 
+   'DeckLink SDK' version 10.4 or above and copy the 'dvp.dll' runtime library to Blender's 
+   installation directory or to any other place where Blender can load a DLL from.
   
+   :arg format: string describing the video format to be captured. 
+   :type format: str
+   :arg capture: Card number from which the input video must be captured.
+   :type capture: int
+
+   The format argument must be written as “<displayMode>/<pixelFormat>[/3D][:<cacheSize>]” where <displayMode> 
+   describes the frame size and rate and <pixelFormat> the encoding of the pixels. 
+   The optional /3D suffix is to be used if the video stream is stereo with a left and right eye feed.
+   The optional :<cacheSize> suffix determines the number of the video frames kept in cache, by default 8. 
+   Some DeckLink cards won't work below a certain cache size.  The default value 8 should be sufficient for all cards.
+   You may try to reduce the cache size to reduce the memory footprint. For example, the 4K Extreme is known
+   to work with 3 frames only, the Extreme 2 needs 4 frames and the Intensity Shuttle needs 6 frames, etc. 
+   Reducing the cache size may be useful when Decklink is used in conjunction with GPUDirect: all frames must be locked
+   in memory in that case and that puts a lot of pressure on memory.  If you reduce the cache size too much, 
+   you'll get no error but no video feed either.
+   
+   The valid <displayMode> values are copied from the 'BMDDisplayMode' enum in the DeckLink API 
+   without the 'bmdMode' prefix. In case a mode that is not in this list is added in a later version 
+   of the SDK, it is also possible to specify the 4 letters of the internal code for that mode. 
+   You will find the internal code in the DeckLinkAPIModes.h file that is part of the SDK.
+   Here is for reference the full list of supported display modes with their equivalent internal code:
+    * NTSC 'ntsc'
+    * NTSC2398 	'nt23'
+    * PAL		'pal '
+    * NTSCp		'ntsp'
+    * PALp		'palp'
+   HD 1080 Modes:
+    * HD1080p2398	'23ps'
+    * HD1080p24	'24ps'
+    * HD1080p25	'Hp25'
+    * HD1080p2997	'Hp29'
+    * HD1080p30	'Hp30'
+    * HD1080i50	'Hi50'
+    * HD1080i5994	'Hi59'
+    * HD1080i6000	'Hi60'
+    * HD1080p50	'Hp50'
+    * HD1080p5994	'Hp59'
+    * HD1080p6000	'Hp60'
+   HD 720 Modes:
+    * HD720p50	'hp50'
+    * HD720p5994	'hp59'
+    * HD720p60	'hp60'
+   2k Modes
+    * 2k2398	'2k23'
+    * 2k24		'2k24'
+    * 2k25		'2k25'
+   4k Modes
+    * 4K2160p2398	'4k23'
+    * 4K2160p24	'4k24'
+    * 4K2160p25	'4k25'
+    * 4K2160p2997	'4k29'
+    * 4K2160p30	'4k30'
+    * 4K2160p50	'4k50'
+    * 4K2160p5994	'4k59'
+    * 4K2160p60	'4k60'
+   Most of the names are self-explanatory. If necessary refer to the DeckLink API documentation for more information.
+
+   Similarly, <pixelFormat> is copied from the BMDPixelFormat enum. 
+   Here is for reference the full list of supported pixel formats and their equivalent internal code:
+    * 8BitYUV	'2vuy'
+    * 10BitYUV	'v210'
+    * 8BitARGB	* no equivalent code *
+    * 8BitBGRA	'BGRA'
+    * 10BitRGB	'r210'
+    * 12BitRGB	'R12B'
+    * 12BitRGBLE	'R12L'
+    * 10BitRGBXLE	'R10l'
+    * 10BitRGBX	'R10b'
+   Refer to the DeckLink SDK documentation for a full description of these pixel formats.
+   It is important to understand them as the decoding of the pixels is NOT done in VideoTexture
+   for performance reasons. Instead a specific shader must be used to decode the pixel in the GPU.
+   Only the '8BitARGB', '8BitBGRA' and '10BitRGBXLE' pixel formats are mapped directly to OpenGL RGB float textures.
+   The '8BitYUV' and '10BitYUV' pixel formats are mapped to OpenGL RGB float texture but require a shader to decode.
+   The other pixel formats are sent as a 'GL_RED_INTEGER' texture (i.e. a texture with only the 
+   red channel coded as an unsigned 32 bit integer) and are not recommended for use.
+
+   Example: “HD1080p24/10BitYUV/3D:4” is equivalent to “24ps/v210/3D:4” and represents a full HD stereo feed at 24 frames per second and 4 frames cache size.
+
+   Although video format auto detection is possible with certain DeckLink devices, the corresponding 
+   API is NOT implemented in the BGE. Therefore it is important to specify the format string that 
+   matches exactly the video feed. If the format is wrong, no frame will be captured. 
+   It should be noted that the pixel format that you need to specify is not necessarily the actual 
+   format in the video feed. For example, the 4K Extreme card delivers 8bit RGBs pixels in the 
+   '10BitRGBXLE' format. Use the 'Media Express' application included in 'Desktop Video' to discover 
+   which pixel format works for a particular video stream.
+
+   .. attribute:: status
+
+      Status of the capture: 1=ready to use, 2=capturing, 3=stopped
+      
+      :type: int
+   
+   .. attribute:: framerate
+
+      Capture frame rate as computed from the video format.
+
+      :type: float
+
+   .. attribute:: valid
+
+      Tells if the image attribute can be used to retrieve the image. 
+      Always False in this implementation (the image is not available at python level)
+
+      :type: bool
+
+   .. attribute:: image
+
+      The image data. Always None in this implementation.
+
+      :type: :class:`~bgl.Buffer` or None
+
+   .. attribute:: size
+
+      The size of the frame in pixel.
+      Stereo frames have double the height of the video frame, i.e. 3D is delivered to the GPU
+      as a single image in top-bottom order, left eye on top.
+
+      :type: (int,int)
+
+   .. attribute:: scale
+
+      Not used in this object.
+
+      :type: bool
+
+   .. attribute:: flip
+
+      Not used in this object.
+
+      :type: bool
+
+   .. attribute:: filter
+
+      Not used in this object.
+
+   .. method:: play()
+
+      Kick-off the capture after creation of the object.
+
+      :return: True if the capture could be started, False otherwise.
+      :rtype: bool
+
+   .. method:: pause()
+
+      Temporarily stops the capture. Use play() to restart it.
+
+      :return: True if the capture could be paused, False otherwise.
+      :rtype: bool
+
+   .. method:: stop()
+
+      Stops the capture.
+
+      :return: True if the capture could be stopped, False otherwise.
+      :rtype: bool
+
 ***************
 Texture classes
 ***************
@@ -782,6 +997,7 @@ Texture classes
      :type: one of...
      
         * :class:`VideoFFmpeg`
+         * :class:`VideoDeckLink`
         * :class:`ImageFFmpeg`
         * :class:`ImageBuff`
         * :class:`ImageMirror`
@@ -789,7 +1005,125 @@ Texture classes
         * :class:`ImageRender`
         * :class:`ImageViewport`

+.. class:: DeckLink(cardIdx=0, format="")
+
+   Certain DeckLink devices can be used to playback video: the host sends video frames regularly
+   for immediate or scheduled playback. The video feed is outputted on HDMI or SDI interfaces.
+   This class supports the immediate playback mode: it has a source attribute that is assigned
+   one of the source objects in the bge.texture module. Refreshing the DeckLink object causes
+   the image source to be computed and sent to the DeckLink device for immediate transmission
+   on the output interfaces.  Keying is supported: it allows compositing the frame with an
+   input video feed that transits through the DeckLink card.
+
+   :arg cardIdx: Number of the card to be used for output (0=first card). It should be noted that DeckLink devices are usually half duplex: they can either be used for capture or playback but not both at the same time.
+   :type cardIdx: int
+   :arg format: String representing the display mode of the output feed.
+   :type format: str
+
+   The default value of the format argument is reserved for auto detection but it is currently
+   not supported (it will generate a runtime error) and thus the video format must be explicitly
+   specified. If keying is the goal (see keying attributes), the format must match exactly the
+   input video feed, otherwise it can be any format supported by the device (there will be a
+   runtime error if not).
+   The format of the string is “<displayMode>[/3D]”. 
+
+   Refer to :class:`VideoDeckLink` to get the list of acceptable <displayMode>. 
+   The optional “/3D” suffix is used to create a stereo 3D feed. In that case the 'right' attribute
+   must also be set to specify the image source for the right eye.
+  
+   Note: The pixel format is not specified here because it is always BGRA. The alpha channel is
+   used in keying to mix the source with the input video feed, otherwise it is not used.  
+   If a conversion is needed to match the native video format, it is done inside the DeckLink driver
+   or device. 
+
+   .. attribute:: source
+
+      This attribute must be set to one of the image sources. If the image size does not fit exactly
+      the frame size, the extend attribute determines what to do. 
+
+      For best performance, the source image should match exactly the size of the output frame.
+      A further optimization is achieved if the image source object is ImageViewport or ImageRender
+      set for whole viewport, flip disabled and no filter: the GL frame buffer is copied directly
+      to the image buffer and directly from there to the DeckLink card (hence no buffer to buffer
+      copy inside VideoTexture). 
+
+      :type: one of...
+         * :class:`VideoFFmpeg`
+         * :class:`VideoDeckLink`
+         * :class:`ImageFFmpeg`
+         * :class:`ImageBuff`
+         * :class:`ImageMirror`
+         * :class:`ImageMix`
+         * :class:`ImageRender`
+         * :class:`ImageViewport`
   
+   .. attribute:: right
+
+      If the video format is stereo 3D, this attribute should be set to an image source object
+      that will produce the right eye images.  If the goal is to render the BGE scene in 3D, 
+      it can be achieved with 2 cameras, one for each eye, used by 2 ImageRender with an offscreen
+      render buffer that is just the size of the video frame.
+
+      :type: one of...
+         * :class:`VideoFFmpeg`
+         * :class:`VideoDeckLink`
+         * :class:`ImageFFmpeg`
+         * :class:`ImageBuff`
+         * :class:`ImageMirror`
+         * :class:`ImageMix`
+         * :class:`ImageRender`
+         * :class:`ImageViewport`
+
+   .. attribute:: keying
+
+      Specify if keying is enabled. False (default): the output frame is sent unmodified on
+      the output interface (in that case no input video is required). True: the output frame
+      is mixed with the input video, using the alpha channel to blend the two images and the
+      combination is sent on the output interface. 
+
+      :type: bool
+
+   .. attribute:: level
+
+      If keying is enabled, sets the keying level from 0 to 255. This value is a global alpha value
+      that multiplies the alpha channel of the image source. Use 255 (the default) to keep the alpha
+      channel unmodified, 0 to make the output frame totally transparent. 
+
+      :type: int
+
+   .. attribute:: extend
+
+      Determines how the image source should be mapped if the size does not fit the video frame size.
+      * False (the default): map the image pixel by pixel. 
+      If the image size is smaller than the frame size, extra space around the image is filled with
+      0-alpha black. If it is larger, the image is cropped to fit the frame size. 
+      * True: the image is scaled by the nearest neighbor algorithm to fit the frame size. 
+      The scaling is fast but poor quality. For best results, always adjust the image source to
+      match the size of the output video.
+
+      :type: bool
+
+   .. method:: close()
+
+      Close the DeckLink device and release all resources. After calling this method, 
+      the object cannot be reactivated, it must be destroyed and a new DeckLink object
+      created from fresh to restart the output.
+
+   .. method:: refresh(refresh_source,ts)
+
+      This method must be called frequently to update the output frame in the DeckLink device.
+
+      :arg refresh_source: True if the source objects image buffer should be invalidated after being
+                           used to compute the output frame. This triggers the recomputing of the
+                           source image on next refresh, which is normally the desired effect. 
+                           False if the image source buffer should stay valid and reused on next refresh.
+                           Note that the DeckLink device stores the output frame and replays until a
+                           new frame is sent from the host. Thus, it is not necessary to refresh the
+                           DeckLink object if it is known that the image source has not changed.
+      :type refresh_source: bool
+      :arg ts: The timestamp value passed to the image source object to compute the image. If unspecified, the BGE clock is used.
+      :type ts: float
+
 **************
 Filter classes
 **************
--- a/doc/python_api/rst/bge_types/bge.types.BL_Shader.rst
+++ b/doc/python_api/rst/bge_types/bge.types.BL_Shader.rst
@@ -214,6 +214,16 @@ base class --- :class:`PyObjectPlus`
      :arg iList: a list (2, 3 or 4 elements) of integer values
      :type iList: list[integer]

+   .. method:: setUniformEyef(name)
+
+      Set a uniform with a float value that reflects the eye being rendered in stereo mode:
+      0.0 for the left eye, 0.5 for the right eye. In non stereo mode, the value of the uniform
+      is fixed to 0.0. The typical use of this uniform is in stereo mode to sample stereo textures
+      containing the left and right eye images in a top-bottom order. 
+
+      :arg name: the uniform name
+      :type name: string
+
   .. method:: validate()

      Validate the shader object.
--- a/doc/python_api/rst/bge_types/bge.types.KX_LightObject.rst
+++ b/doc/python_api/rst/bge_types/bge.types.KX_LightObject.rst
@@ -60,37 +60,37 @@ base class --- :class:`KX_GameObject`

      :type: float (read only)

-   ..attribute:: shadowFrustumSize
+   .. attribute:: shadowFrustumSize

      Size of the frustum used for creating the shadowmap.

      :type: float (read only)

-   ..attribute:: shadowBindId
+   .. attribute:: shadowBindId

      The OpenGL shadow texture bind number/id.

      :type: int (read only)

-   ..attribute:: shadowMapType
+   .. attribute:: shadowMapType

      The shadow shadow map type (0 -> Simple; 1 -> Variance)

      :type: int (read only)

-   ..attribute:: shadowBias
+   .. attribute:: shadowBias

      The shadow buffer sampling bias.

      :type: float (read only)

-   ..attribute:: shadowBleedBias
+   .. attribute:: shadowBleedBias

      The bias for reducing light-bleed on variance shadow maps.

      :type: float (read only)

-   ..attribute:: useShadow
+   .. attribute:: useShadow

      Returns True if the light has Shadow option activated, else returns False.

--- a/doc/python_api/sphinx_doc_gen.sh
+++ b/doc/python_api/sphinx_doc_gen.sh
@@ -61,7 +61,7 @@ if $DO_EXE_BLENDER ; then
 		--python-exit-code 1 \
 		--python $SPHINXBASE/sphinx_doc_gen.py

-	if (($? == 1)) ; then
+	if (($? != 0)) ; then
 		echo "Generating documentation failed, aborting"
 		exit 1
 	fi
--- a/extern/Eigen3/README.blender
+++ b/extern/Eigen3/README.blender
@@ -0,0 +1,6 @@
+Project: Eigen, template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms
+URL: http://eigen.tuxfamily.org/index.php?title=Main_Page
+License: GPLv3+
+Upstream version: 3.2.7
+Local modifications:
+- OpenMP fix for MSVC2015, see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1131
--- a/extern/binreloc/README.blender
+++ b/extern/binreloc/README.blender
@@ -0,0 +1,6 @@
+Project: AutoPackage
+URL: http://autopackage.org/docs/binreloc (original, defunct)
+     http://alien.cern.ch/cache/autopackage-1.0/site/docs/binreloc/ (cache)
+License: Public Domain
+Upstream version: Unknown (Last Release)
+Local modifications: None
--- a/extern/carve/README.blender
+++ b/extern/carve/README.blender
@@ -0,0 +1,4 @@
+Project: Carve, CSG library
+URL: https://code.google.com/archive/p/carve/
+Upstream version: 9a85d733a43d
+Local modifications: See patches/ folder
--- a/extern/ceres/README.blender
+++ b/extern/ceres/README.blender
@@ -0,0 +1,4 @@
+Project: Ceres Solver
+URL: http://ceres-solver.org/
+Upstream version: 1.11 (aef9c9563b08d5f39eee1576af133a84749d1b48)
+Local modifications: None
--- a/extern/clew/README.blender
+++ b/extern/clew/README.blender
@@ -0,0 +1,5 @@
+Project: OpenCL Wrangler
+URL: https://github.com/OpenCLWrangler/clew
+License: Apache 2.0
+Upstream version: 277db43
+Local modifications: None
--- a/extern/cuew/README.blender
+++ b/extern/cuew/README.blender
@@ -0,0 +1,5 @@
+Project: Cuda Wrangler
+URL: https://github.com/CudaWrangler/cuew
+License: Apache 2.0
+Upstream version: e2e0315
+Local modifications: None
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -131,8 +131,8 @@ typedef struct CUsurfref_st* CUsurfref;
 typedef struct CUevent_st* CUevent;
 typedef struct CUstream_st* CUstream;
 typedef struct CUgraphicsResource_st* CUgraphicsResource;
-typedef unsigned CUtexObject;
-typedef unsigned CUsurfObject;
+typedef unsigned long long CUtexObject;
+typedef unsigned long long CUsurfObject;

 typedef struct CUuuid_st {
  char bytes[16];
@@ -603,7 +603,7 @@ typedef struct CUDA_ARRAY_DESCRIPTOR_st {
  size_t Width;
  size_t Height;
  CUarray_format Format;
-  unsigned NumChannels;
+  unsigned int NumChannels;
 } CUDA_ARRAY_DESCRIPTOR;

 typedef struct CUDA_ARRAY3D_DESCRIPTOR_st {
@@ -611,8 +611,8 @@ typedef struct CUDA_ARRAY3D_DESCRIPTOR_st {
  size_t Height;
  size_t Depth;
  CUarray_format Format;
-  unsigned NumChannels;
-  unsigned Flags;
+  unsigned int NumChannels;
+  unsigned int Flags;
 } CUDA_ARRAY3D_DESCRIPTOR;

 typedef struct CUDA_RESOURCE_DESC_st {
@@ -627,13 +627,13 @@ typedef struct CUDA_RESOURCE_DESC_st {
    struct {
      CUdeviceptr devPtr;
      CUarray_format format;
-      unsigned numChannels;
+      unsigned int numChannels;
      size_t sizeInBytes;
    } linear;
    struct {
      CUdeviceptr devPtr;
      CUarray_format format;
-      unsigned numChannels;
+      unsigned int numChannels;
      size_t width;
      size_t height;
      size_t pitchInBytes;
@@ -642,14 +642,14 @@ typedef struct CUDA_RESOURCE_DESC_st {
      int reserved[32];
    } reserved;
  } res;
-  unsigned flags;
+  unsigned int flags;
 } CUDA_RESOURCE_DESC;

 typedef struct CUDA_TEXTURE_DESC_st {
  CUaddress_mode addressMode[3];
  CUfilter_mode filterMode;
-  unsigned flags;
-  unsigned maxAnisotropy;
+  unsigned int flags;
+  unsigned int maxAnisotropy;
  CUfilter_mode mipmapFilterMode;
  float mipmapLevelBias;
  float minMipmapLevelClamp;
@@ -700,19 +700,19 @@ typedef struct CUDA_RESOURCE_VIEW_DESC_st {
  size_t width;
  size_t height;
  size_t depth;
-  unsigned firstMipmapLevel;
-  unsigned lastMipmapLevel;
-  unsigned firstLayer;
-  unsigned lastLayer;
-  unsigned reserved[16];
+  unsigned int firstMipmapLevel;
+  unsigned int lastMipmapLevel;
+  unsigned int firstLayer;
+  unsigned int lastLayer;
+  unsigned int reserved[16];
 } CUDA_RESOURCE_VIEW_DESC;

 typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
-  unsigned p2pToken;
-  unsigned vaSpaceToken;
+  unsigned long long p2pToken;
+  unsigned int vaSpaceToken;
 } CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
-typedef unsigned GLenum;
-typedef unsigned GLuint;
+typedef unsigned int GLenum;
+typedef unsigned int GLuint;
 typedef int GLint;

 typedef enum CUGLDeviceList_enum {
@@ -751,7 +751,7 @@ typedef struct _nvrtcProgram* nvrtcProgram;
 /* Function types. */
 typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char* pStr);
 typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char* pStr);
-typedef CUresult CUDAAPI tcuInit(unsigned Flags);
+typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
 typedef CUresult CUDAAPI tcuDriverGetVersion(int* driverVersion);
 typedef CUresult CUDAAPI tcuDeviceGet(CUdevice* device, int ordinal);
 typedef CUresult CUDAAPI tcuDeviceGetCount(int* count);
@@ -762,17 +762,17 @@ typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop* prop, CUdevice dev);
 typedef CUresult CUDAAPI tcuDeviceComputeCapability(int* major, int* minor, CUdevice dev);
 typedef CUresult CUDAAPI tcuDevicePrimaryCtxRetain(CUcontext* pctx, CUdevice dev);
 typedef CUresult CUDAAPI tcuDevicePrimaryCtxRelease(CUdevice dev);
-typedef CUresult CUDAAPI tcuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned flags);
-typedef CUresult CUDAAPI tcuDevicePrimaryCtxGetState(CUdevice dev, unsigned* flags, int* active);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+typedef CUresult CUDAAPI tcuDevicePrimaryCtxGetState(CUdevice dev, unsigned int* flags, int* active);
 typedef CUresult CUDAAPI tcuDevicePrimaryCtxReset(CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext* pctx, unsigned flags, CUdevice dev);
+typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext* pctx, unsigned int flags, CUdevice dev);
 typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
 typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext ctx);
 typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext* pctx);
 typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
 typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext* pctx);
 typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice* device);
-typedef CUresult CUDAAPI tcuCtxGetFlags(unsigned* flags);
+typedef CUresult CUDAAPI tcuCtxGetFlags(unsigned int* flags);
 typedef CUresult CUDAAPI tcuCtxSynchronize(void);
 typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
 typedef CUresult CUDAAPI tcuCtxGetLimit(size_t* pvalue, CUlimit limit);
@@ -780,43 +780,43 @@ typedef CUresult CUDAAPI tcuCtxGetCacheConfig(CUfunc_cache* pconfig);
 typedef CUresult CUDAAPI tcuCtxSetCacheConfig(CUfunc_cache config);
 typedef CUresult CUDAAPI tcuCtxGetSharedMemConfig(CUsharedconfig* pConfig);
 typedef CUresult CUDAAPI tcuCtxSetSharedMemConfig(CUsharedconfig config);
-typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned* version);
+typedef CUresult CUDAAPI tcuCtxGetApiVersion(CUcontext ctx, unsigned int* version);
 typedef CUresult CUDAAPI tcuCtxGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
-typedef CUresult CUDAAPI tcuCtxAttach(CUcontext* pctx, unsigned flags);
+typedef CUresult CUDAAPI tcuCtxAttach(CUcontext* pctx, unsigned int flags);
 typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
 typedef CUresult CUDAAPI tcuModuleLoad(CUmodule* module, const char* fname);
 typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule* module, const void* image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule* module, const void* image, unsigned numOptions, CUjit_option* options, void* optionValues);
+typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, CUjit_option* options, void* optionValues);
 typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule* module, const void* fatCubin);
 typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
 typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetGlobal_v2(CUdeviceptr* dptr, size_t* bytes, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref* pTexRef, CUmodule hmod, const char* name);
 typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref* pSurfRef, CUmodule hmod, const char* name);
-typedef CUresult CUDAAPI tcuLinkCreate_v2(unsigned numOptions, CUjit_option* options, void* optionValues, CUlinkState* stateOut);
-typedef CUresult CUDAAPI tcuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned numOptions, CUjit_option* options, void* optionValues);
-typedef CUresult CUDAAPI tcuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned numOptions, CUjit_option* options, void* optionValues);
+typedef CUresult CUDAAPI tcuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void* optionValues, CUlinkState* stateOut);
+typedef CUresult CUDAAPI tcuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void* optionValues);
+typedef CUresult CUDAAPI tcuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char* path, unsigned int numOptions, CUjit_option* options, void* optionValues);
 typedef CUresult CUDAAPI tcuLinkComplete(CUlinkState state, void* cubinOut, size_t* sizeOut);
 typedef CUresult CUDAAPI tcuLinkDestroy(CUlinkState state);
 typedef CUresult CUDAAPI tcuMemGetInfo_v2(size_t* free, size_t* total);
 typedef CUresult CUDAAPI tcuMemAlloc_v2(CUdeviceptr* dptr, size_t bytesize);
-typedef CUresult CUDAAPI tcuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned ElementSizeBytes);
+typedef CUresult CUDAAPI tcuMemAllocPitch_v2(CUdeviceptr* dptr, size_t* pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
 typedef CUresult CUDAAPI tcuMemFree_v2(CUdeviceptr dptr);
 typedef CUresult CUDAAPI tcuMemGetAddressRange_v2(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr);
 typedef CUresult CUDAAPI tcuMemAllocHost_v2(void* pp, size_t bytesize);
 typedef CUresult CUDAAPI tcuMemFreeHost(void* p);
-typedef CUresult CUDAAPI tcuMemHostAlloc(void* pp, size_t bytesize, unsigned Flags);
-typedef CUresult CUDAAPI tcuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned Flags);
-typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned* pFlags, void* p);
-typedef CUresult CUDAAPI tcuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned flags);
+typedef CUresult CUDAAPI tcuMemHostAlloc(void* pp, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetDevicePointer_v2(CUdeviceptr* pdptr, void* p, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int* pFlags, void* p);
+typedef CUresult CUDAAPI tcuMemAllocManaged(CUdeviceptr* dptr, size_t bytesize, unsigned int flags);
 typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice* dev, const char* pciBusId);
 typedef CUresult CUDAAPI tcuDeviceGetPCIBusId(char* pciBusId, int len, CUdevice dev);
 typedef CUresult CUDAAPI tcuIpcGetEventHandle(CUipcEventHandle* pHandle, CUevent event);
 typedef CUresult CUDAAPI tcuIpcOpenEventHandle(CUevent* phEvent, CUipcEventHandle handle);
 typedef CUresult CUDAAPI tcuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned Flags);
+typedef CUresult CUDAAPI tcuIpcOpenMemHandle(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags);
 typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
-typedef CUresult CUDAAPI tcuMemHostRegister_v2(void* p, size_t bytesize, unsigned Flags);
+typedef CUresult CUDAAPI tcuMemHostRegister_v2(void* p, size_t bytesize, unsigned int Flags);
 typedef CUresult CUDAAPI tcuMemHostUnregister(void* p);
 typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
 typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
@@ -842,40 +842,40 @@ typedef CUresult CUDAAPI tcuMemcpyAtoHAsync_v2(void* dstHost, CUarray srcArray,
 typedef CUresult CUDAAPI tcuMemcpy2DAsync_v2(const CUDA_MEMCPY2D* pCopy, CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpy3DAsync_v2(const CUDA_MEMCPY3D* pCopy, CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER* pCopy, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD8_v2(CUdeviceptr dstDevice, unsigned uc, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD16_v2(CUdeviceptr dstDevice, unsigned us, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD32_v2(CUdeviceptr dstDevice, unsigned ui, size_t N);
-typedef CUresult CUDAAPI tcuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned uc, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned us, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned ui, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD8Async(CUdeviceptr dstDevice, unsigned uc, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD16Async(CUdeviceptr dstDevice, unsigned us, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD32Async(CUdeviceptr dstDevice, unsigned ui, size_t N, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned uc, size_t Width, size_t Height, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned us, size_t Width, size_t Height, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+typedef CUresult CUDAAPI tcuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult CUDAAPI tcuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
 typedef CUresult CUDAAPI tcuArrayCreate_v2(CUarray* pHandle, const CUDA_ARRAY_DESCRIPTOR* pAllocateArray);
 typedef CUresult CUDAAPI tcuArrayGetDescriptor_v2(CUDA_ARRAY_DESCRIPTOR* pArrayDescriptor, CUarray hArray);
 typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
 typedef CUresult CUDAAPI tcuArray3DCreate_v2(CUarray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pAllocateArray);
 typedef CUresult CUDAAPI tcuArray3DGetDescriptor_v2(CUDA_ARRAY3D_DESCRIPTOR* pArrayDescriptor, CUarray hArray);
-typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned numMipmapLevels);
-typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned level);
+typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray* pHandle, const CUDA_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray* pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
 typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
 typedef CUresult CUDAAPI tcuPointerGetAttribute(void* data, CUpointer_attribute attribute, CUdeviceptr ptr);
 typedef CUresult CUDAAPI tcuPointerSetAttribute(const void* value, CUpointer_attribute attribute, CUdeviceptr ptr);
-typedef CUresult CUDAAPI tcuPointerGetAttributes(unsigned numAttributes, CUpointer_attribute* attributes, void* data, CUdeviceptr ptr);
-typedef CUresult CUDAAPI tcuStreamCreate(CUstream* phStream, unsigned Flags);
-typedef CUresult CUDAAPI tcuStreamCreateWithPriority(CUstream* phStream, unsigned flags, int priority);
+typedef CUresult CUDAAPI tcuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute* attributes, void* data, CUdeviceptr ptr);
+typedef CUresult CUDAAPI tcuStreamCreate(CUstream* phStream, unsigned int Flags);
+typedef CUresult CUDAAPI tcuStreamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority);
 typedef CUresult CUDAAPI tcuStreamGetPriority(CUstream hStream, int* priority);
-typedef CUresult CUDAAPI tcuStreamGetFlags(CUstream hStream, unsigned* flags);
-typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned Flags);
-typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void* userData, unsigned flags);
-typedef CUresult CUDAAPI tcuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned flags);
+typedef CUresult CUDAAPI tcuStreamGetFlags(CUstream hStream, unsigned int* flags);
+typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void* userData, unsigned int flags);
+typedef CUresult CUDAAPI tcuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
 typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
 typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
 typedef CUresult CUDAAPI tcuStreamDestroy_v2(CUstream hStream);
-typedef CUresult CUDAAPI tcuEventCreate(CUevent* phEvent, unsigned Flags);
+typedef CUresult CUDAAPI tcuEventCreate(CUevent* phEvent, unsigned int Flags);
 typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
 typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
 typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
@@ -884,23 +884,23 @@ typedef CUresult CUDAAPI tcuEventElapsedTime(float* pMilliseconds, CUevent hStar
 typedef CUresult CUDAAPI tcuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc);
 typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
 typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
-typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned gridDimX, unsigned gridDimY, unsigned gridDimZ, unsigned blockDimX, unsigned blockDimY, unsigned blockDimZ, unsigned sharedMemBytes, CUstream hStream, void* kernelParams, void* extra);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void* kernelParams, void* extra);
 typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
-typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned bytes);
-typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned numbytes);
-typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned value);
+typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
 typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
-typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void* ptr, unsigned numbytes);
+typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void* ptr, unsigned int numbytes);
 typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
 typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
 typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
 typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
-typedef CUresult CUDAAPI tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned flags);
+typedef CUresult CUDAAPI tcuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
 typedef CUresult CUDAAPI tcuOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
-typedef CUresult CUDAAPI tcuOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned flags);
-typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned Flags);
-typedef CUresult CUDAAPI tcuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned Flags);
+typedef CUresult CUDAAPI tcuOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+typedef CUresult CUDAAPI tcuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
 typedef CUresult CUDAAPI tcuTexRefSetAddress_v2(size_t* ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
 typedef CUresult CUDAAPI tcuTexRefSetAddress2D_v3(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR* desc, CUdeviceptr dptr, size_t Pitch);
 typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
@@ -909,8 +909,8 @@ typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode
 typedef CUresult CUDAAPI tcuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
 typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
 typedef CUresult CUDAAPI tcuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
-typedef CUresult CUDAAPI tcuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned maxAniso);
-typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned Flags);
+typedef CUresult CUDAAPI tcuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
 typedef CUresult CUDAAPI tcuTexRefGetAddress_v2(CUdeviceptr* pdptr, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray* phArray, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetMipmappedArray(CUmipmappedArray* phMipmappedArray, CUtexref hTexRef);
@@ -921,10 +921,10 @@ typedef CUresult CUDAAPI tcuTexRefGetMipmapFilterMode(CUfilter_mode* pfm, CUtexr
 typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelBias(float* pbias, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetMipmapLevelClamp(float* pminMipmapLevelClamp, float* pmaxMipmapLevelClamp, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefGetMaxAnisotropy(int* pmaxAniso, CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned* pFlags, CUtexref hTexRef);
+typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int* pFlags, CUtexref hTexRef);
 typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref* pTexRef);
 typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
-typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned Flags);
+typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
 typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray* phArray, CUsurfref hSurfRef);
 typedef CUresult CUDAAPI tcuTexObjectCreate(CUtexObject* pTexObject, const CUDA_RESOURCE_DESC* pResDesc, const CUDA_TEXTURE_DESC* pTexDesc, const CUDA_RESOURCE_VIEW_DESC* pResViewDesc);
 typedef CUresult CUDAAPI tcuTexObjectDestroy(CUtexObject texObject);
@@ -935,27 +935,27 @@ typedef CUresult CUDAAPI tcuSurfObjectCreate(CUsurfObject* pSurfObject, const CU
 typedef CUresult CUDAAPI tcuSurfObjectDestroy(CUsurfObject surfObject);
 typedef CUresult CUDAAPI tcuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC* pResDesc, CUsurfObject surfObject);
 typedef CUresult CUDAAPI tcuDeviceCanAccessPeer(int* canAccessPeer, CUdevice dev, CUdevice peerDev);
-typedef CUresult CUDAAPI tcuCtxEnablePeerAccess(CUcontext peerContext, unsigned Flags);
+typedef CUresult CUDAAPI tcuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
 typedef CUresult CUDAAPI tcuCtxDisablePeerAccess(CUcontext peerContext);
 typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned arrayIndex, unsigned mipLevel);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
 typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray* pMipmappedArray, CUgraphicsResource resource);
 typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer_v2(CUdeviceptr* pDevPtr, size_t* pSize, CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned flags);
-typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned count, CUgraphicsResource* resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags_v2(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
 typedef CUresult CUDAAPI tcuGetExportTable(const void* ppExportTable, const CUuuid* pExportTableId);

-typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource* pCudaResource, GLuint buffer, unsigned Flags);
-typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned Flags);
-typedef CUresult CUDAAPI tcuGLGetDevices_v2(unsigned* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned cudaDeviceCount, CUGLDeviceList deviceList);
-typedef CUresult CUDAAPI tcuGLCtxCreate_v2(CUcontext* pCtx, unsigned Flags, CUdevice device);
+typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer(CUgraphicsResource* pCudaResource, GLuint buffer, unsigned int Flags);
+typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+typedef CUresult CUDAAPI tcuGLGetDevices_v2(unsigned int* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult CUDAAPI tcuGLCtxCreate_v2(CUcontext* pCtx, unsigned int Flags, CUdevice device);
 typedef CUresult CUDAAPI tcuGLInit(void);
 typedef CUresult CUDAAPI tcuGLRegisterBufferObject(GLuint buffer);
 typedef CUresult CUDAAPI tcuGLMapBufferObject_v2(CUdeviceptr* dptr, size_t* size, GLuint buffer);
 typedef CUresult CUDAAPI tcuGLUnmapBufferObject(GLuint buffer);
 typedef CUresult CUDAAPI tcuGLUnregisterBufferObject(GLuint buffer);
-typedef CUresult CUDAAPI tcuGLSetBufferObjectMapFlags(GLuint buffer, unsigned Flags);
+typedef CUresult CUDAAPI tcuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
 typedef CUresult CUDAAPI tcuGLMapBufferObjectAsync_v2(CUdeviceptr* dptr, size_t* size, GLuint buffer, CUstream hStream);
 typedef CUresult CUDAAPI tcuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);

--- a/extern/curve_fit_nd/curve_fit_nd.h
+++ b/extern/curve_fit_nd/curve_fit_nd.h
@@ -25,8 +25,8 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#ifndef __SPLINE_FIT__
-#define __SPLINE_FIT__
+#ifndef __CURVE_FIT_ND_H__
+#define __CURVE_FIT_ND_H__

 /** \file curve_fit_nd.h
 *  \ingroup curve_fit
@@ -79,6 +79,43 @@ int curve_fit_cubic_to_points_fl(
        unsigned int **r_cubic_orig_index,
        unsigned int **r_corners_index_array, unsigned int *r_corners_index_len);

+/**
+ * Takes a flat array of points and evaluates them to calculate handle lengths.
+ *
+ * \param points, points_len: The array of points to calculate a cubic from.
+ * \param dims: The number of dimensions for each element in \a points.
+ * \param error_threshold: The error threshold to allow for.
+ * \param tan_l, tan_r: Normalized tangents the handles will be aligned to.
+ * Note that both tangents must point along the direction of the \a points,
+ * so \a tan_l points in the same direction as the resulting handle,
+ * whereas \a tan_r points in the opposite direction to its handle.
+ *
+ * \param r_handle_l, r_handle_r: Resulting calculated handles.
+ * \param r_error_sq: The maximum distance (squared) this curve diverges from \a points.
+ */
+int curve_fit_cubic_to_points_single_db(
+        const double      *points,
+        const unsigned int points_len,
+        const unsigned int dims,
+        const double       error_threshold,
+        const double       tan_l[],
+        const double       tan_r[],
+
+        double  r_handle_l[],
+        double  r_handle_r[],
+        double *r_error_sq);
+
+int curve_fit_cubic_to_points_single_fl(
+        const float       *points,
+        const unsigned int points_len,
+        const unsigned int dims,
+        const float        error_threshold,
+        const float        tan_l[],
+        const float        tan_r[],
+
+        float   r_handle_l[],
+        float   r_handle_r[],
+        float  *r_error_sq);

 /* curve_fit_corners_detect.c */

@@ -122,4 +159,4 @@ int curve_fit_corners_detect_fl(
        unsigned int **r_corners,
        unsigned int  *r_corners_len);

-#endif  /* __SPLINE_FIT__ */
+#endif  /* __CURVE_FIT_ND_H__ */
--- a/extern/curve_fit_nd/intern/curve_fit_corners_detect.c
+++ b/extern/curve_fit_nd/intern/curve_fit_corners_detect.c
@@ -382,9 +382,9 @@ int curve_fit_corners_detect_db(
 				uint i_best = i_span_start;
 				while (i_next < points_len) {
 					if ((points_angle[i_next] == 0.0) ||
-					   (len_squared_vnvn(
-					        &points[(i_next - 1) * dims],
-					        &points[i_next * dims], dims) > radius_min_sq))
+					    (len_squared_vnvn(
+					         &points[(i_next - 1) * dims],
+					         &points[i_next * dims], dims) > radius_min_sq))
 					{
 						break;
 					}
--- a/extern/curve_fit_nd/intern/curve_fit_cubic.c
+++ b/extern/curve_fit_nd/intern/curve_fit_cubic.c
@@ -29,6 +29,10 @@
 *  \ingroup curve_fit
 */

+#ifdef _MSC_VER
+#  define _USE_MATH_DEFINES
+#endif
+
 #include <math.h>
 #include <float.h>
 #include <stdbool.h>
@@ -39,11 +43,14 @@

 #include "../curve_fit_nd.h"

+/* Take curvature into account when the least-squares solution isn't usable. */
+#define USE_CIRCULAR_FALLBACK
+
 /* avoid re-calculating lengths multiple times */
 #define USE_LENGTH_CACHE

 /* store the indices in the cubic data so we can return the original indices,
- * useful when the caller has data assosiated with the curve. */
+ * useful when the caller has data associated with the curve. */
 #define USE_ORIG_INDEX_DATA

 typedef unsigned int uint;
@@ -109,9 +116,19 @@ typedef struct Cubic {
 	*_p3 = _p2 + (dims); ((void)0)


+static size_t cubic_alloc_size(const uint dims)
+{
+	return sizeof(Cubic) + (sizeof(double) * 4 * dims);
+}
+
 static Cubic *cubic_alloc(const uint dims)
 {
-	return malloc(sizeof(Cubic) + (sizeof(double) * 4 * dims));
+	return malloc(cubic_alloc_size(dims));
+}
+
+static void cubic_copy(Cubic *cubic_dst, const Cubic *cubic_src, const uint dims)
+{
+	memcpy(cubic_dst, cubic_src, cubic_alloc_size(dims));
 }

 static void cubic_init(
@@ -278,7 +295,7 @@ static void cubic_calc_acceleration(
        double r_v[])
 {
 	CUBIC_VARS_CONST(cubic, dims, p0, p1, p2, p3);
-    const double s = 1.0 - t;
+	const double s = 1.0 - t;
 	for (uint j = 0; j < dims; j++) {
 		r_v[j] = 6.0 * ((p2[j] - 2.0 * p1[j] + p0[j]) * s +
 		                (p3[j] - 2.0 * p2[j] + p1[j]) * t);
@@ -286,20 +303,19 @@ static void cubic_calc_acceleration(
 }

 /**
- * Returns a 'measure' of the maximal discrepancy of the points specified
+ * Returns a 'measure' of the maximum distance (squared) of the points specified
 * by points_offset from the corresponding cubic(u[]) points.
 */
-static void cubic_calc_error(
+static double cubic_calc_error(
        const Cubic *cubic,
        const double *points_offset,
        const uint points_offset_len,
        const double *u,
        const uint dims,

-        double *r_error_sq_max,
        uint *r_error_index)
 {
-	double error_sq_max = 0.0;
+	double error_max_sq = 0.0;
 	uint   error_index = 0;

 	const double *pt_real = points_offset + dims;
@@ -313,14 +329,14 @@ static void cubic_calc_error(
 		cubic_evaluate(cubic, u[i], dims, pt_eval);

 		const double err_sq = len_squared_vnvn(pt_real, pt_eval, dims);
-		if (err_sq >= error_sq_max) {
-			error_sq_max = err_sq;
+		if (err_sq >= error_max_sq) {
+			error_max_sq = err_sq;
 			error_index = i;
 		}
 	}

-	*r_error_sq_max   = error_sq_max;
 	*r_error_index = error_index;
+	return error_max_sq;
 }

 /**
@@ -388,12 +404,141 @@ static void points_calc_center_weighted(
 	}
 }

+#ifdef USE_CIRCULAR_FALLBACK
+
+/**
+ * Return a scale value, used to calculate how much the curve handles should be increased,
+ *
+ * This works by placing each end-point on an imaginary circle,
+ * the placement on the circle is based on the tangent vectors,
+ * where larger differences in tangent angle cover a larger part of the circle.
+ *
+ * Return the scale representing how much larger the distance around the circle is.
+ */
+static double points_calc_circumference_factor(
+        const double  tan_l[],
+        const double  tan_r[],
+        const uint dims)
+{
+	const double dot = dot_vnvn(tan_l, tan_r, dims);
+	const double len_tangent = dot < 0.0 ? len_vnvn(tan_l, tan_r, dims) : len_negated_vnvn(tan_l, tan_r, dims);
+	if (len_tangent > DBL_EPSILON) {
+		/* only clamp to avoid precision error */
+		double angle = acos(max(-fabs(dot), -1.0));
+		/* Angle may be less than the length when the tangents define >180 degrees of the circle,
+		 * (tangents that point away from each other).
+		 * We could try to support this, but it will likely cause extreme >1 scales which could cause other issues. */
+		// assert(angle >= len_tangent);
+		double factor = (angle / len_tangent);
+		assert(factor < (M_PI / 2) + DBL_EPSILON);
+		return factor;
+	}
+	else {
+		/* tangents are exactly aligned (think two opposite sides of a circle). */
+		return (M_PI / 2);
+	}
+}
+
+/**
+ * Return the value which the distance between points will need to be scaled by,
+ * to define a handle, given both points are on a perfect circle.
+ *
+ * \note the return value will need to be multiplied by 1.3... for correct results.
+ */
+static double points_calc_circle_tangent_factor(
+        const double  tan_l[],
+        const double  tan_r[],
+        const uint dims)
+{
+	const double eps = 1e-8;
+	const double tan_dot = dot_vnvn(tan_l, tan_r, dims);
+	if (tan_dot > 1.0 - eps) {
+		/* no angle difference (use fallback, length won't make any difference) */
+		return (1.0 / 3.0) * 0.75;
+	}
+	else if (tan_dot < -1.0 + eps) {
+		/* anti-parallel tangents (half-circle) */
+		return (1.0 / 2.0);
+	}
+	else {
+		/* non-aligned tangents, calculate handle length */
+		const double angle = acos(tan_dot) / 2.0;
+
+		/* could also use 'angle_sin = len_vnvn(tan_l, tan_r, dims) / 2.0' */
+		const double angle_sin = sin(angle);
+		const double angle_cos = cos(angle);
+		return ((1.0 - angle_cos) / (angle_sin * 2.0)) / angle_sin;
+	}
+}
+
+/**
+ * Calculate the scale of the handles, which serves as a best-guess
+ * used as a fallback when the least-square solution fails.
+ */
+static double points_calc_cubic_scale(
+        const double v_l[], const double v_r[],
+        const double  tan_l[],
+        const double  tan_r[],
+        const double coords_length, uint dims)
+{
+	const double len_direct = len_vnvn(v_l, v_r, dims);
+	const double len_circle_factor = points_calc_circle_tangent_factor(tan_l, tan_r, dims);
+
+	/* if this curve is a circle, this value doesn't need modification */
+	const double len_circle_handle = (len_direct * (len_circle_factor / 0.75));
+
+	/* scale by the difference from the circumference distance */
+	const double len_circle = len_direct * points_calc_circumference_factor(tan_l, tan_r, dims);
+	double scale_handle = (coords_length / len_circle);
+
+	/* Could investigate an accurate calculation here,
+	 * though this gives close results */
+	scale_handle = ((scale_handle - 1.0) * 1.75) + 1.0;
+
+	return len_circle_handle * scale_handle;
+}
+
+static void cubic_from_points_fallback(
+        const double *points_offset,
+        const uint    points_offset_len,
+        const double  tan_l[],
+        const double  tan_r[],
+        const uint dims,
+
+        Cubic *r_cubic)
+{
+	const double *p0 = &points_offset[0];
+	const double *p3 = &points_offset[(points_offset_len - 1) * dims];
+
+	double alpha = len_vnvn(p0, p3, dims) / 3.0;
+
+	double *p1 = CUBIC_PT(r_cubic, 1, dims);
+	double *p2 = CUBIC_PT(r_cubic, 2, dims);
+
+	copy_vnvn(CUBIC_PT(r_cubic, 0, dims), p0, dims);
+	copy_vnvn(CUBIC_PT(r_cubic, 3, dims), p3, dims);
+
+#ifdef USE_ORIG_INDEX_DATA
+	r_cubic->orig_span = (points_offset_len - 1);
+#endif
+
+	/* p1 = p0 - (tan_l * alpha_l);
+	 * p2 = p3 + (tan_r * alpha_r);
+	 */
+	msub_vn_vnvn_fl(p1, p0, tan_l, alpha, dims);
+	madd_vn_vnvn_fl(p2, p3, tan_r, alpha, dims);
+}
+#endif  /* USE_CIRCULAR_FALLBACK */
+
 /**
 * Use least-squares method to find Bezier control points for region.
 */
 static void cubic_from_points(
        const double *points_offset,
        const uint    points_offset_len,
+#ifdef USE_CIRCULAR_FALLBACK
+        const double  points_offset_coords_length,
+#endif
        const double *u_prime,
        const double  tan_l[],
        const double  tan_r[],
@@ -467,11 +612,20 @@ static void cubic_from_points(
 	 * so only problems absurd of approximation and not for bugs in the code.
 	 */

+	bool use_clamp = true;
+
 	/* flip check to catch nan values */
 	if (!(alpha_l >= 0.0) ||
 	    !(alpha_r >= 0.0))
 	{
+#ifdef USE_CIRCULAR_FALLBACK
+		alpha_l = alpha_r = points_calc_cubic_scale(p0, p3, tan_l, tan_r, points_offset_coords_length, dims);
+#else
 		alpha_l = alpha_r = len_vnvn(p0, p3, dims) / 3.0;
+#endif
+
+		/* skip clamping when we're using default handles */
+		use_clamp = false;
 	}

 	double *p1 = CUBIC_PT(r_cubic, 1, dims);
@@ -493,64 +647,69 @@ static void cubic_from_points(
 	/* ------------------------------------
 	 * Clamping (we could make it optional)
 	 */
+	if (use_clamp) {
 #ifdef USE_VLA
-	double center[dims];
+		double center[dims];
 #else
-	double *center = alloca(sizeof(double) * dims);
+		double *center = alloca(sizeof(double) * dims);
 #endif
-	points_calc_center_weighted(points_offset, points_offset_len, dims, center);
+		points_calc_center_weighted(points_offset, points_offset_len, dims, center);

-	const double clamp_scale = 3.0;  /* clamp to 3x */
-	double dist_sq_max = 0.0;
+		const double clamp_scale = 3.0;  /* clamp to 3x */
+		double dist_sq_max = 0.0;

-	{
-		const double *pt = points_offset;
-		for (uint i = 0; i < points_offset_len; i++, pt += dims) {
+		{
+			const double *pt = points_offset;
+			for (uint i = 0; i < points_offset_len; i++, pt += dims) {
 #if 0
-			double dist_sq_test = sq(len_vnvn(center, pt, dims) * clamp_scale);
+				double dist_sq_test = sq(len_vnvn(center, pt, dims) * clamp_scale);
 #else
-			/* do inline */
-			double dist_sq_test = 0.0;
-			for (uint j = 0; j < dims; j++) {
-				dist_sq_test += sq((pt[j] - center[j]) * clamp_scale);
-			}
+				/* do inline */
+				double dist_sq_test = 0.0;
+				for (uint j = 0; j < dims; j++) {
+					dist_sq_test += sq((pt[j] - center[j]) * clamp_scale);
+				}
 #endif
-			dist_sq_max = max(dist_sq_max, dist_sq_test);
-		}
-	}
-
-	double p1_dist_sq = len_squared_vnvn(center, p1, dims);
-	double p2_dist_sq = len_squared_vnvn(center, p2, dims);
-
-	if (p1_dist_sq > dist_sq_max ||
-	    p2_dist_sq > dist_sq_max)
-	{
-
-		alpha_l = alpha_r = len_vnvn(p0, p3, dims) / 3.0;
-
-		/*
-		 * p1 = p0 - (tan_l * alpha_l);
-		 * p2 = p3 + (tan_r * alpha_r);
-		 */
-		for (uint j = 0; j < dims; j++) {
-			p1[j] = p0[j] - (tan_l[j] * alpha_l);
-			p2[j] = p3[j] + (tan_r[j] * alpha_r);
+				dist_sq_max = max(dist_sq_max, dist_sq_test);
+			}
 		}

-		p1_dist_sq = len_squared_vnvn(center, p1, dims);
-		p2_dist_sq = len_squared_vnvn(center, p2, dims);
-	}
+		double p1_dist_sq = len_squared_vnvn(center, p1, dims);
+		double p2_dist_sq = len_squared_vnvn(center, p2, dims);

-	/* clamp within the 3x radius */
-	if (p1_dist_sq > dist_sq_max) {
-		isub_vnvn(p1, center, dims);
-		imul_vn_fl(p1, sqrt(dist_sq_max) / sqrt(p1_dist_sq), dims);
-		iadd_vnvn(p1, center, dims);
-	}
-	if (p2_dist_sq > dist_sq_max) {
-		isub_vnvn(p2, center, dims);
-		imul_vn_fl(p2, sqrt(dist_sq_max) / sqrt(p2_dist_sq), dims);
-		iadd_vnvn(p2, center, dims);
+		if (p1_dist_sq > dist_sq_max ||
+		    p2_dist_sq > dist_sq_max)
+		{
+#ifdef USE_CIRCULAR_FALLBACK
+			alpha_l = alpha_r = points_calc_cubic_scale(p0, p3, tan_l, tan_r, points_offset_coords_length, dims);
+#else
+			alpha_l = alpha_r = len_vnvn(p0, p3, dims) / 3.0;
+#endif
+
+			/*
+			 * p1 = p0 - (tan_l * alpha_l);
+			 * p2 = p3 + (tan_r * alpha_r);
+			 */
+			for (uint j = 0; j < dims; j++) {
+				p1[j] = p0[j] - (tan_l[j] * alpha_l);
+				p2[j] = p3[j] + (tan_r[j] * alpha_r);
+			}
+
+			p1_dist_sq = len_squared_vnvn(center, p1, dims);
+			p2_dist_sq = len_squared_vnvn(center, p2, dims);
+		}
+
+		/* clamp within the 3x radius */
+		if (p1_dist_sq > dist_sq_max) {
+			isub_vnvn(p1, center, dims);
+			imul_vn_fl(p1, sqrt(dist_sq_max) / sqrt(p1_dist_sq), dims);
+			iadd_vnvn(p1, center, dims);
+		}
+		if (p2_dist_sq > dist_sq_max) {
+			isub_vnvn(p2, center, dims);
+			imul_vn_fl(p2, sqrt(dist_sq_max) / sqrt(p2_dist_sq), dims);
+			iadd_vnvn(p2, center, dims);
+		}
 	}
 	/* end clamping */
 }
@@ -574,8 +733,10 @@ static void points_calc_coord_length_cache(
 }
 #endif  /* USE_LENGTH_CACHE */

-
-static void points_calc_coord_length(
+/**
+ * \return the accumulated length of \a points_offset.
+ */
+static double points_calc_coord_length(
        const double *points_offset,
        const uint    points_offset_len,
        const uint    dims,
@@ -608,6 +769,7 @@ static void points_calc_coord_length(
 	for (uint i = 0; i < points_offset_len; i++) {
 		r_u[i] /= w;
 	}
+	return w;
 }

 /**
@@ -620,10 +782,10 @@ static void points_calc_coord_length(
 * \note Return value may be `nan` caller must check for this.
 */
 static double cubic_find_root(
-		const Cubic *cubic,
-		const double p[],
-		const double u,
-		const uint dims)
+        const Cubic *cubic,
+        const double p[],
+        const double u,
+        const uint dims)
 {
 	/* Newton-Raphson Method. */
 	/* all vectors */
@@ -695,7 +857,7 @@ static bool cubic_reparameterize(
 }


-static void fit_cubic_to_points(
+static bool fit_cubic_to_points(
        const double *points_offset,
        const uint    points_offset_len,
 #ifdef USE_LENGTH_CACHE
@@ -703,19 +865,15 @@ static void fit_cubic_to_points(
 #endif
        const double  tan_l[],
        const double  tan_r[],
-        const double  error_threshold,
+        const double  error_threshold_sq,
        const uint    dims,
-        /* fill in the list */
-        CubicList *clist)
+
+        Cubic *r_cubic, double *r_error_max_sq, uint *r_split_index)
 {
 	const uint iteration_max = 4;
-	const double error_sq = sq(error_threshold);
-
-	Cubic *cubic;

 	if (points_offset_len == 2) {
-		cubic = cubic_alloc(dims);
-		CUBIC_VARS(cubic, dims, p0, p1, p2, p3);
+		CUBIC_VARS(r_cubic, dims, p0, p1, p2, p3);

 		copy_vnvn(p0, &points_offset[0 * dims], dims);
 		copy_vnvn(p3, &points_offset[1 * dims], dims);
@@ -725,14 +883,16 @@ static void fit_cubic_to_points(
 		madd_vn_vnvn_fl(p2, p3, tan_r, dist, dims);

 #ifdef USE_ORIG_INDEX_DATA
-		cubic->orig_span = 1;
+		r_cubic->orig_span = 1;
 #endif
-
-		cubic_list_prepend(clist, cubic);
-		return;
+		return true;
 	}

 	double *u = malloc(sizeof(double) * points_offset_len);
+
+#ifdef USE_CIRCULAR_FALLBACK
+	const double points_offset_coords_length  =
+#endif
 	points_calc_coord_length(
 	        points_offset, points_offset_len, dims,
 #ifdef USE_LENGTH_CACHE
@@ -740,55 +900,127 @@ static void fit_cubic_to_points(
 #endif
 	        u);

-	cubic = cubic_alloc(dims);
-
-	double error_sq_max;
+	double error_max_sq;
 	uint split_index;

 	/* Parameterize points, and attempt to fit curve */
 	cubic_from_points(
-	        points_offset, points_offset_len, u, tan_l, tan_r, dims, cubic);
+	        points_offset, points_offset_len,
+#ifdef USE_CIRCULAR_FALLBACK
+	        points_offset_coords_length,
+#endif
+	        u, tan_l, tan_r, dims, r_cubic);

 	/* Find max deviation of points to fitted curve */
-	cubic_calc_error(
-	        cubic, points_offset, points_offset_len, u, dims,
-	        &error_sq_max, &split_index);
+	error_max_sq = cubic_calc_error(
+	        r_cubic, points_offset, points_offset_len, u, dims,
+	        &split_index);

-	if (error_sq_max < error_sq) {
+	Cubic *cubic_test = alloca(cubic_alloc_size(dims));
+
+#ifdef USE_CIRCULAR_FALLBACK
+	if (!(error_max_sq < error_threshold_sq)) {
+		/* Don't use the cubic calculated above, instead calculate a new fallback cubic,
+		 * since this tends to give more balanced split_index along the curve.
+		 * This is because the attempt to calculate the cubic may contain spikes
+		 * along the curve which may give a lop-sided maximum distance. */
+		cubic_from_points_fallback(
+		        points_offset, points_offset_len,
+		        tan_l, tan_r, dims, cubic_test);
+		const double error_max_sq_test = cubic_calc_error(
+		        cubic_test, points_offset, points_offset_len, u, dims,
+		        &split_index);
+
+		/* intentionally use the newly calculated 'split_index',
+		 * even if the 'error_max_sq_test' is worse. */
+		if (error_max_sq > error_max_sq_test) {
+			error_max_sq = error_max_sq_test;
+			cubic_copy(r_cubic, cubic_test, dims);
+		}
+	}
+#endif
+
+	*r_error_max_sq = error_max_sq;
+	*r_split_index  = split_index;
+
+	if (error_max_sq < error_threshold_sq) {
 		free(u);
-		cubic_list_prepend(clist, cubic);
-		return;
+		return true;
 	}
 	else {
+		cubic_copy(cubic_test, r_cubic, dims);
+
 		/* If error not too large, try some reparameterization and iteration */
 		double *u_prime = malloc(sizeof(double) * points_offset_len);
 		for (uint iter = 0; iter < iteration_max; iter++) {
 			if (!cubic_reparameterize(
-			        cubic, points_offset, points_offset_len, u, dims, u_prime))
+			        cubic_test, points_offset, points_offset_len, u, dims, u_prime))
 			{
 				break;
 			}

 			cubic_from_points(
-			        points_offset, points_offset_len, u_prime,
-			        tan_l, tan_r, dims, cubic);
-			cubic_calc_error(
-			        cubic, points_offset, points_offset_len, u_prime, dims,
-			        &error_sq_max, &split_index);
+			        points_offset, points_offset_len,
+#ifdef USE_CIRCULAR_FALLBACK
+			        points_offset_coords_length,
+#endif
+			        u_prime, tan_l, tan_r, dims, cubic_test);
+			error_max_sq = cubic_calc_error(
+			        cubic_test, points_offset, points_offset_len, u_prime, dims,
+			        &split_index);

-			if (error_sq_max < error_sq) {
+			if (error_max_sq < error_threshold_sq) {
 				free(u_prime);
 				free(u);
-				cubic_list_prepend(clist, cubic);
-				return;
+
+				cubic_copy(r_cubic, cubic_test, dims);
+				*r_error_max_sq = error_max_sq;
+				*r_split_index  = split_index;
+				return true;
+			}
+			else if (error_max_sq < *r_error_max_sq) {
+				cubic_copy(r_cubic, cubic_test, dims);
+				*r_error_max_sq = error_max_sq;
+				*r_split_index = split_index;
 			}

 			SWAP(double *, u, u_prime);
 		}
 		free(u_prime);
-	}
+		free(u);

-	free(u);
+		return false;
+	}
+}
+
+static void fit_cubic_to_points_recursive(
+        const double *points_offset,
+        const uint    points_offset_len,
+#ifdef USE_LENGTH_CACHE
+        const double *points_length_cache,
+#endif
+        const double  tan_l[],
+        const double  tan_r[],
+        const double  error_threshold_sq,
+        const uint    dims,
+        /* fill in the list */
+        CubicList *clist)
+{
+	Cubic *cubic = cubic_alloc(dims);
+	uint split_index;
+	double error_max_sq;
+
+	if (fit_cubic_to_points(
+	        points_offset, points_offset_len,
+#ifdef USE_LENGTH_CACHE
+	        points_length_cache,
+#endif
+	        tan_l, tan_r, error_threshold_sq, dims,
+	        cubic, &error_max_sq, &split_index))
+	{
+		cubic_list_prepend(clist, cubic);
+		return;
+	}
 	cubic_free(cubic);


@@ -814,21 +1046,35 @@ static void fit_cubic_to_points(
 		pt_a += dims;
 	}

-	/* tan_center = (pt_a - pt_b).normalized() */
-	normalize_vn_vnvn(tan_center, pt_a, pt_b, dims);
+	{
+#ifdef USE_VLA
+		double tan_center_a[dims];
+		double tan_center_b[dims];
+#else
+		double *tan_center_a = alloca(sizeof(double) * dims);
+		double *tan_center_b = alloca(sizeof(double) * dims);
+#endif
+		const double *pt   = &points_offset[split_index * dims];

-	fit_cubic_to_points(
+		/* tan_center = ((pt_a - pt).normalized() + (pt - pt_b).normalized()).normalized() */
+		normalize_vn_vnvn(tan_center_a, pt_a, pt, dims);
+		normalize_vn_vnvn(tan_center_b, pt, pt_b, dims);
+		add_vn_vnvn(tan_center, tan_center_a, tan_center_b, dims);
+		normalize_vn(tan_center, dims);
+	}
+
+	fit_cubic_to_points_recursive(
 	        points_offset, split_index + 1,
 #ifdef USE_LENGTH_CACHE
 	        points_length_cache,
 #endif
-	        tan_l, tan_center, error_threshold, dims, clist);
-	fit_cubic_to_points(
+	        tan_l, tan_center, error_threshold_sq, dims, clist);
+	fit_cubic_to_points_recursive(
 	        &points_offset[split_index * dims], points_offset_len - split_index,
 #ifdef USE_LENGTH_CACHE
 	        points_length_cache + split_index,
 #endif
-	        tan_center, tan_r, error_threshold, dims, clist);
+	        tan_center, tan_r, error_threshold_sq, dims, clist);

 }

@@ -890,6 +1136,8 @@ int curve_fit_cubic_to_points_db(
 		corner_index_array[corner_index++] = corners[0];
 	}

+	const double error_threshold_sq = sq(error_threshold);
+
 	for (uint i = 1; i < corners_len; i++) {
 		const uint points_offset_len = corners[i] - corners[i - 1] + 1;
 		const uint first_point = corners[i - 1];
@@ -919,12 +1167,12 @@ int curve_fit_cubic_to_points_db(
 			        points_length_cache);
 #endif

-			fit_cubic_to_points(
+			fit_cubic_to_points_recursive(
 			        &points[first_point * dims], points_offset_len,
 #ifdef USE_LENGTH_CACHE
 			        points_length_cache,
 #endif
-			        tan_l, tan_r, error_threshold, dims, &clist);
+			        tan_l, tan_r, error_threshold_sq, dims, &clist);
 		}
 		else if (points_len == 1) {
 			assert(points_offset_len == 1);
@@ -1001,9 +1249,7 @@ int curve_fit_cubic_to_points_fl(
 	const uint points_flat_len = points_len * dims;
 	double *points_db = malloc(sizeof(double) * points_flat_len);

-	for (uint i = 0; i < points_flat_len; i++) {
-		points_db[i] = (double)points[i];
-	}
+	copy_vndb_vnfl(points_db, points, points_flat_len);

 	double *cubic_array_db = NULL;
 	float  *cubic_array_fl = NULL;
@@ -1031,4 +1277,100 @@ int curve_fit_cubic_to_points_fl(
 	return result;
 }

+/**
+ * Fit a single cubic to points, writing both handles and the squared max error to the r_* arguments.
+ */
+int curve_fit_cubic_to_points_single_db(
+        const double *points,
+        const uint    points_len,
+        const uint    dims,
+        const double  error_threshold,
+        const double tan_l[],
+        const double tan_r[],
+
+        double  r_handle_l[],
+        double  r_handle_r[],
+        double  *r_error_max_sq)
+{
+	Cubic *cubic = alloca(cubic_alloc_size(dims));
+	/* required by fit_cubic_to_points(), the value isn't used here */
+	uint split_index;
+
+	/* in this instance there's no advantage in using length cache,
+	 * since we're not recursively calculating values. */
+#ifdef USE_LENGTH_CACHE
+	double *points_length_cache = malloc(sizeof(double) * points_len);
+	points_calc_coord_length_cache(
+	        points, points_len, dims,
+	        points_length_cache);
+#endif
+	/* errors are compared squared, so the threshold must be squared too */
+	fit_cubic_to_points(
+	        points, points_len,
+#ifdef USE_LENGTH_CACHE
+	        points_length_cache,
+#endif
+	        tan_l, tan_r, sq(error_threshold), dims,
+
+	        cubic, r_error_max_sq, &split_index);
+
+#ifdef USE_LENGTH_CACHE
+	free(points_length_cache);
+#endif
+
+	copy_vnvn(r_handle_l, CUBIC_PT(cubic, 1, dims), dims);
+	copy_vnvn(r_handle_r, CUBIC_PT(cubic, 2, dims), dims);
+	/* the fit result is used whether or not the threshold was met */
+	return 0;
+}
+
+int curve_fit_cubic_to_points_single_fl(
+        const float  *points,
+        const uint    points_len,
+        const uint    dims,
+        const float   error_threshold,
+        const float   tan_l[],
+        const float   tan_r[],
+
+        float   r_handle_l[],
+        float   r_handle_r[],
+        float  *r_error_sq)
+{
+	const uint points_flat_len = points_len * dims;
+	double *points_db = malloc(sizeof(double) * points_flat_len);
+
+	copy_vndb_vnfl(points_db, points, points_flat_len);
+
+#ifdef USE_VLA
+	double tan_l_db[dims];
+	double tan_r_db[dims];
+	double r_handle_l_db[dims];
+	double r_handle_r_db[dims];
+#else
+	double *tan_l_db = alloca(sizeof(double) * dims);
+	double *tan_r_db = alloca(sizeof(double) * dims);
+	double *r_handle_l_db = alloca(sizeof(double) * dims);
+	double *r_handle_r_db = alloca(sizeof(double) * dims);
+#endif
+	double r_error_sq_db;
+
+	copy_vndb_vnfl(tan_l_db, tan_l, dims);
+	copy_vndb_vnfl(tan_r_db, tan_r, dims);
+
+	int result = curve_fit_cubic_to_points_single_db(
+	        points_db, points_len, dims,
+	        (double)error_threshold,
+	        tan_l_db, tan_r_db,
+	        r_handle_l_db, r_handle_r_db,
+	        &r_error_sq_db);
+
+	free(points_db);
+
+	copy_vnfl_vndb(r_handle_l, r_handle_l_db, dims);
+	copy_vnfl_vndb(r_handle_r, r_handle_r_db, dims);
+	*r_error_sq = (float)r_error_sq_db;
+
+	return result;
+}
+
 /** \} */
--- a/extern/curve_fit_nd/intern/curve_fit_inline.h
+++ b/extern/curve_fit_nd/intern/curve_fit_inline.h
@@ -57,7 +57,7 @@ MINLINE double max(const double a, const double b)
 #endif

 MINLINE void zero_vn(
-		double v0[], const uint dims)
+        double v0[], const uint dims)
 {
 	for (uint j = 0; j < dims; j++) {
 		v0[j] = 0.0;
@@ -65,7 +65,7 @@ MINLINE void zero_vn(
 }

 MINLINE void flip_vn_vnvn(
-		double v_out[], const double v0[], const double v1[], const uint dims)
+        double v_out[], const double v0[], const double v1[], const uint dims)
 {
 	for (uint j = 0; j < dims; j++) {
 		v_out[j] = v0[j] + (v0[j] - v1[j]);
@@ -80,6 +80,22 @@ MINLINE void copy_vnvn(
 	}
 }

+MINLINE void copy_vnfl_vndb(
+        float v0[], const double v1[], const uint dims)
+{
+	for (uint j = 0; j < dims; j++) {
+		v0[j] = (float)v1[j];
+	}
+}
+
+MINLINE void copy_vndb_vnfl(
+        double v0[], const float v1[], const uint dims)
+{
+	for (uint j = 0; j < dims; j++) {
+		v0[j] = (double)v1[j];
+	}
+}
+
 MINLINE double dot_vnvn(
        const double v0[], const double v1[], const uint dims)
 {
@@ -178,7 +194,7 @@ MINLINE void imul_vn_fl(double v0[], const double f, const uint dims)


 MINLINE double len_squared_vnvn(
-		const double v0[], const double v1[], const uint dims)
+        const double v0[], const double v1[], const uint dims)
 {
 	double d = 0.0;
 	for (uint j = 0; j < dims; j++) {
@@ -203,13 +219,29 @@ MINLINE double len_vnvn(
 	return sqrt(len_squared_vnvn(v0, v1, dims));
 }

-#if 0
-static double len_vn(
-		const double v0[], const uint dims)
+MINLINE double len_vn(
+        const double v0[], const uint dims)
 {
 	return sqrt(len_squared_vn(v0, dims));
 }

+/* special case, save us negating a copy, then getting the length */
+MINLINE double len_squared_negated_vnvn(
+        const double v0[], const double v1[], const uint dims)
+{
+	double d = 0.0;
+	for (uint j = 0; j < dims; j++) {
+		d += sq(v0[j] + v1[j]);
+	}
+	return d;
+}
+
+MINLINE double len_negated_vnvn(
+        const double v0[], const double v1[], const uint dims)
+{
+	return sqrt(len_squared_negated_vnvn(v0, v1, dims));
+}
+
 MINLINE double normalize_vn(
        double v0[], const uint dims)
 {
@@ -219,7 +251,6 @@ MINLINE double normalize_vn(
 	}
 	return d;
 }
-#endif

 /* v_out = (v0 - v1).normalized() */
 MINLINE double normalize_vn_vnvn(
--- a/extern/gflags/README.blender
+++ b/extern/gflags/README.blender
@@ -1,5 +1,5 @@
 Project: Google Flags
-URL: http://code.google.com/p/google-gflags/
+URL: https://github.com/gflags/gflags
 License: New BSD
 Upstream version: 2.2.0 (9db82895)
 Local modifications:
@@ -17,3 +17,7 @@ Local modifications:

 - Applied some modifications from fork https://github.com/Nazg-Gul/gflags.git
  (see https://github.com/gflags/gflags/pull/129)
+
+- Avoid attempting to acquire the mutex lock in FlagRegistry::GlobalRegistry when
+  doing static flags initialization. See d81dd2d in Blender repository.
+
--- a/extern/glog/README.blender
+++ b/extern/glog/README.blender
@@ -1,5 +1,5 @@
 Project: Google Logging
-URL: http://code.google.com/p/google-glog/
+URL: https://github.com/google/glog
 License: New BSD
 Upstream version: 0.3.4, 4d391fe
 Local modifications:
--- a/extern/gtest/README.blender
+++ b/extern/gtest/README.blender
@@ -1,7 +1,5 @@
 Project: Google C++ Testing Framework
-URL: http://code.google.com/p/googletest
+URL: https://github.com/google/googletest
 License: New BSD
 Upstream version: 1.7.0
-Local modifications:
-
-None.
+Local modifications: None
--- a/extern/libopenjpeg/README.blender
+++ b/extern/libopenjpeg/README.blender
@@ -0,0 +1,5 @@
+Project: OpenJPEG
+URL: http://www.openjpeg.org
+License: BSD 2-Clause
+Upstream version: 1.5.2
+Local modifications:
--- a/extern/rangetree/README.blender
+++ b/extern/rangetree/README.blender
@@ -0,0 +1,5 @@
+Project: RangeTree
+URL: https://github.com/nicholasbishop/RangeTree
+License: GPLv2+
+Upstream version: c4ecf6bb7dfd
+Local modifications: None
--- a/extern/recastnavigation/CMakeLists.txt
+++ b/extern/recastnavigation/CMakeLists.txt
@@ -23,9 +23,11 @@
 #
 # ***** END GPL LICENSE BLOCK *****

+remove_cc_flag_unsigned_char()
+
 set(INC 
-		Recast/Include
-		Detour/Include
+	Recast/Include
+	Detour/Include
 )

 set(INC_SYS
@@ -33,38 +35,38 @@ set(INC_SYS
 )

 set(SRC 
-		recast-capi.cpp
-		recast-capi.h
+	recast-capi.cpp
+	recast-capi.h


-		Detour/Source/DetourCommon.cpp
-		Detour/Source/DetourNode.cpp
-		Detour/Source/DetourStatNavMesh.cpp
-		Detour/Source/DetourStatNavMeshBuilder.cpp
-		Detour/Source/DetourTileNavMesh.cpp
-		Detour/Source/DetourTileNavMeshBuilder.cpp
+	Detour/Source/DetourCommon.cpp
+	Detour/Source/DetourNode.cpp
+	Detour/Source/DetourStatNavMesh.cpp
+	Detour/Source/DetourStatNavMeshBuilder.cpp
+	Detour/Source/DetourTileNavMesh.cpp
+	Detour/Source/DetourTileNavMeshBuilder.cpp

-		Detour/Include/DetourCommon.h
-		Detour/Include/DetourNode.h
-		Detour/Include/DetourStatNavMesh.h
-		Detour/Include/DetourStatNavMeshBuilder.h
-		Detour/Include/DetourTileNavMesh.h
-		Detour/Include/DetourTileNavMeshBuilder.h
+	Detour/Include/DetourCommon.h
+	Detour/Include/DetourNode.h
+	Detour/Include/DetourStatNavMesh.h
+	Detour/Include/DetourStatNavMeshBuilder.h
+	Detour/Include/DetourTileNavMesh.h
+	Detour/Include/DetourTileNavMeshBuilder.h

-		Recast/Source/Recast.cpp
-		Recast/Source/RecastAlloc.cpp
-		Recast/Source/RecastArea.cpp
-		Recast/Source/RecastContour.cpp
-		Recast/Source/RecastFilter.cpp
-		Recast/Source/RecastLayers.cpp
-		Recast/Source/RecastMesh.cpp
-		Recast/Source/RecastMeshDetail.cpp
-		Recast/Source/RecastRasterization.cpp
-		Recast/Source/RecastRegion.cpp
+	Recast/Source/Recast.cpp
+	Recast/Source/RecastAlloc.cpp
+	Recast/Source/RecastArea.cpp
+	Recast/Source/RecastContour.cpp
+	Recast/Source/RecastFilter.cpp
+	Recast/Source/RecastLayers.cpp
+	Recast/Source/RecastMesh.cpp
+	Recast/Source/RecastMeshDetail.cpp
+	Recast/Source/RecastRasterization.cpp
+	Recast/Source/RecastRegion.cpp

-		Recast/Include/Recast.h
-		Recast/Include/RecastAlloc.h
-		Recast/Include/RecastAssert.h
+	Recast/Include/Recast.h
+	Recast/Include/RecastAlloc.h
+	Recast/Include/RecastAssert.h
 )

 blender_add_lib(extern_recastnavigation "${SRC}" "${INC}" "${INC_SYS}")
--- a/extern/sdlew/README.blender
+++ b/extern/sdlew/README.blender
@@ -0,0 +1,5 @@
+Project: SDL Extension Wrangler
+URL: https://github.com/SDLWrangler/sdlew
+License: Apache 2.0
+Upstream version: 15edf8e
+Local modifications: None
--- a/extern/wcwidth/README.blender
+++ b/extern/wcwidth/README.blender
@@ -0,0 +1,5 @@
+Project: WC Width
+URL: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+License: ICS
+Upstream version: 2007-05-26
+Local modifications: None
--- a/extern/xdnd/README.blender
+++ b/extern/xdnd/README.blender
@@ -0,0 +1,8 @@
+Project: X Drag and Drop
+URL: http://www.newplanetsoftware.com/xdnd/ (defunct)
+     https://freedesktop.org/wiki/Specifications/XDND/ (cache)
+License: GPLv2+
+Upstream version: 2000-08-08
+Local modifications:
+* Fix T33192
+  Opening Blender breaks drag-and-drop support on the KDE desktop.
--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -34,6 +34,10 @@ add_subdirectory(mikktspace)
 add_subdirectory(glew-mx)
 add_subdirectory(eigen)

+if (WITH_DECKLINK)
+	add_subdirectory(decklink)
+endif()
+
 if(WITH_AUDASPACE)
 	add_subdirectory(audaspace)
 endif()
@@ -79,8 +83,10 @@ if(WITH_OPENSUBDIV)
 endif()

 # only windows needs utf16 converter
+# gpudirect is a runtime interface to nVidia's DVP driver, only for windows
 if(WIN32)
 	add_subdirectory(utfconv)
+	add_subdirectory(gpudirect)
 endif()

 if(WITH_OPENVDB)
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -1,11 +1,11 @@
 /*
- * Adopted from jemalloc with this license:
+ * Original code from jemalloc with this license:
 *
 * Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
 * All rights reserved.
 * Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
 * Copyright (C) 2009-2013 Facebook, Inc.  All rights reserved.
-
+ *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright notice(s),
@@ -13,7 +13,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice(s),
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
-
+ *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
@@ -24,63 +24,59 @@
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2016 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: adapted from jemalloc.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/**
+ * \file atomic_ops.h
+ * \ingroup Atomic
+ *
+ * \author Copyright (C) 2016 Blender Foundation, adapted from jemalloc.
+ * \brief Provides a wrapper around system-specific atomic primitives, and some extensions (faked-atomic operations
+ *        over float numbers).
 */

 #ifndef __ATOMIC_OPS_H__
 #define __ATOMIC_OPS_H__

-#include <assert.h>
-
-#if defined (__APPLE__)
-#  include <libkern/OSAtomic.h>
-#elif defined(_MSC_VER)
-#  define NOGDI
-#  ifndef NOMINMAX
-#    define NOMINMAX
-#  endif
-#  define WIN32_LEAN_AND_MEAN
-#  include <windows.h>
-#elif defined(__arm__)
+#if defined(__arm__)
 /* Attempt to fix compilation error on Debian armel kernel.
 * arm7 architecture does have both 32 and 64bit atomics, however
 * it's gcc doesn't have __GCC_HAVE_SYNC_COMPARE_AND_SWAP_n defined.
 */
-#  define JE_FORCE_SYNC_COMPARE_AND_SWAP_8
+#  define JE_FORCE_SYNC_COMPARE_AND_SWAP_1
 #  define JE_FORCE_SYNC_COMPARE_AND_SWAP_4
+#  define JE_FORCE_SYNC_COMPARE_AND_SWAP_8
 #endif

-/* needed for int types */
-#include "../../source/blender/blenlib/BLI_sys_types.h"
-#include <stdlib.h>
-#include <stddef.h>
+#include "intern/atomic_ops_utils.h"

-/* little macro so inline keyword works */
-#if defined(_MSC_VER)
-#  define ATOMIC_INLINE static __forceinline
-#else
-#  if (defined(__APPLE__) && defined(__ppc__))
-/* static inline __attribute__ here breaks osx ppc gcc42 build */
-#    define ATOMIC_INLINE static __attribute__((always_inline))
-#  else
-#    define ATOMIC_INLINE static inline __attribute__((always_inline))
-#  endif
-#endif
-
-/* This is becoming a bit nastier that it was originally foreseen,
- * consider using autoconfig detection instead.
- */
-#if defined(_M_X64) || defined(__amd64__) || defined(__x86_64__) || defined(__s390x__) || defined(__powerpc64__) || defined(__aarch64__) || (defined(__sparc__) && defined(__arch64__)) || defined(__alpha__) || defined(__mips64)
-#  define LG_SIZEOF_PTR 3
-#  define LG_SIZEOF_INT 2
-#else
-#  define LG_SIZEOF_PTR 2
-#  define LG_SIZEOF_INT 2
-#endif
-
-/************************/
+/******************************************************************************/
 /* Function prototypes. */

-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
+#if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
 ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x);
 ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x);
 ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new);
@@ -90,6 +86,9 @@ ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x);
 ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new);

+ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x);
+
+ATOMIC_INLINE uint8_t atomic_fetch_and_or_uint8(uint8_t *p, uint8_t b);
 ATOMIC_INLINE uint8_t atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b);

 ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x);
@@ -100,395 +99,22 @@ ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x);
 ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x);
 ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);

-/******************************************************************************/
-/* 64-bit operations. */
-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-#  ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	return __sync_add_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	return __sync_sub_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	return __sync_val_compare_and_swap(v, old, _new);
-}
-#elif (defined(_MSC_VER))
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	return InterlockedExchangeAdd64((int64_t *)p, (int64_t)x) + x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	return InterlockedExchangeAdd64((int64_t *)p, -((int64_t)x)) - x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	return InterlockedCompareExchange64((int64_t *)v, _new, old);
-}
-#elif (defined(__APPLE__))
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	return (uint64_t)OSAtomicAdd64((int64_t)x, (int64_t *)p);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	return (uint64_t)OSAtomicAdd64(-((int64_t)x), (int64_t *)p);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	uint64_t init_val = *v;
-	OSAtomicCompareAndSwap64((int64_t)old, (int64_t)_new, (int64_t *)v);
-	return init_val;
-}
-#  elif (defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-	return x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	x = (uint64_t)(-(int64_t)x);
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-	return x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	uint64_t ret;
-	asm volatile (
-	    "lock; cmpxchgq %2,%1"
-	    : "=a" (ret), "+m" (*v)
-	    : "r" (_new), "0" (old)
-	    : "memory");
-	return ret;
-}
-
-#  elif (defined(JEMALLOC_ATOMIC9))
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	/*
-	 * atomic_fetchadd_64() doesn't exist, but we only ever use this
-	 * function on LP64 systems, so atomic_fetchadd_long() will do.
-	 */
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return atomic_fetchadd_long(p, (unsigned long)x) + x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x;
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return atomic_cmpset_long(v, old, _new);
-}
-#  elif (defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
-ATOMIC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	return __sync_add_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	return __sync_sub_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint64_t
-atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
-{
-	return __sync_val_compare_and_swap(v, old, _new);
-}
-#  else
-#    error "Missing implementation for 64-bit atomic operations"
-#  endif
-#endif
+/* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
+ *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
+ *          working on the same pointer at the same time is very low). */
+ATOMIC_INLINE float atomic_add_fl(float *p, const float x);

 /******************************************************************************/
-/* 32-bit operations. */
-#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	return __sync_add_and_fetch(p, x);
-}
+/* Include system-dependent implementations. */

-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	return __sync_sub_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-   return __sync_val_compare_and_swap(v, old, _new);
-}
-#elif (defined(_MSC_VER))
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	return InterlockedExchangeAdd(p, x) + x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	return InterlockedExchangeAdd(p, -((int32_t)x)) - x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-	return InterlockedCompareExchange((long *)v, _new, old);
-}
-#elif (defined(__APPLE__))
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	return (uint32_t)OSAtomicAdd32((int32_t)x, (int32_t *)p);
-}
-
-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	return (uint32_t)OSAtomicAdd32(-((int32_t)x), (int32_t *)p);
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-	uint32_t init_val = *v;
-	OSAtomicCompareAndSwap32((int32_t)old, (int32_t)_new, (int32_t *)v);
-	return init_val;
-}
-#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-	return x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	x = (uint32_t)(-(int32_t)x);
-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (x), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-	return x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-	uint32_t ret;
-	asm volatile (
-	    "lock; cmpxchgl %2,%1"
-	    : "=a" (ret), "+m" (*v)
-	    : "r" (_new), "0" (old)
-	    : "memory");
-	return ret;
-}
-#elif (defined(JEMALLOC_ATOMIC9))
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	return atomic_fetchadd_32(p, x) + x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	return atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x;
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-	return atomic_cmpset_32(v, old, _new);
-}
-#elif defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4)
-ATOMIC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	return __sync_add_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	return __sync_sub_and_fetch(p, x);
-}
-
-ATOMIC_INLINE uint32_t
-atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
-{
-	return __sync_val_compare_and_swap(v, old, _new);
-}
+/* Note that we are using _unix flavor as fallback here (it will raise preprocessor errors as needed). */
+#if defined(_MSC_VER)
+#  include "intern/atomic_ops_msvc.h"
 #else
-#  error "Missing implementation for 32-bit atomic operations"
+#  include "intern/atomic_ops_unix.h"
 #endif

-/******************************************************************************/
-/* 8-bit operations. */
-#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-ATOMIC_INLINE uint8_t
-atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b)
-{
-	return __sync_fetch_and_and(p, b);
-}
-#elif (defined(_MSC_VER))
-#include <intrin.h>
-#pragma intrinsic(_InterlockedAnd8)
-ATOMIC_INLINE uint8_t
-atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b)
-{
-#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-	return InterlockedAnd8((char *)p, (char)b);
-#else
-	return _InterlockedAnd8((char *)p, (char)b);
-#endif
-}
-#else
-#  error "Missing implementation for 8-bit atomic operations"
-#endif
-
-/******************************************************************************/
-/* size_t operations. */
-ATOMIC_INLINE size_t
-atomic_add_z(size_t *p, size_t x)
-{
-	assert(sizeof(size_t) == 1 << LG_SIZEOF_PTR);
-
-#if (LG_SIZEOF_PTR == 3)
-	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-ATOMIC_INLINE size_t
-atomic_sub_z(size_t *p, size_t x)
-{
-	assert(sizeof(size_t) == 1 << LG_SIZEOF_PTR);
-
-#if (LG_SIZEOF_PTR == 3)
-	return (size_t)atomic_add_uint64((uint64_t *)p,
-	                                 (uint64_t)-((int64_t)x));
-#elif (LG_SIZEOF_PTR == 2)
-	return (size_t)atomic_add_uint32((uint32_t *)p,
-	                                 (uint32_t)-((int32_t)x));
-#endif
-}
-
-ATOMIC_INLINE size_t
-atomic_cas_z(size_t *v, size_t old, size_t _new)
-{
-	assert(sizeof(size_t) == 1 << LG_SIZEOF_PTR);
-
-#if (LG_SIZEOF_PTR == 3)
-	return (size_t)atomic_cas_uint64((uint64_t *)v,
-	                                 (uint64_t)old,
-	                                 (uint64_t)_new);
-#elif (LG_SIZEOF_PTR == 2)
-	return (size_t)atomic_cas_uint32((uint32_t *)v,
-	                                 (uint32_t)old,
-	                                 (uint32_t)_new);
-#endif
-}
-
-/******************************************************************************/
-/* unsigned operations. */
-ATOMIC_INLINE unsigned
-atomic_add_u(unsigned *p, unsigned x)
-{
-	assert(sizeof(unsigned) == 1 << LG_SIZEOF_INT);
-
-#if (LG_SIZEOF_INT == 3)
-	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_INT == 2)
-	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-ATOMIC_INLINE unsigned
-atomic_sub_u(unsigned *p, unsigned x)
-{
-	assert(sizeof(unsigned) == 1 << LG_SIZEOF_INT);
-
-#if (LG_SIZEOF_INT == 3)
-	return (unsigned)atomic_add_uint64((uint64_t *)p,
-	                                   (uint64_t)-((int64_t)x));
-#elif (LG_SIZEOF_INT == 2)
-	return (unsigned)atomic_add_uint32((uint32_t *)p,
-	                                   (uint32_t)-((int32_t)x));
-#endif
-}
-
-ATOMIC_INLINE unsigned
-atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
-{
-	assert(sizeof(unsigned) == 1 << LG_SIZEOF_INT);
-
-#if (LG_SIZEOF_PTR == 3)
-	return (unsigned)atomic_cas_uint64((uint64_t *)v,
-	                                   (uint64_t)old,
-	                                   (uint64_t)_new);
-#elif (LG_SIZEOF_PTR == 2)
-	return (unsigned)atomic_cas_uint32((uint32_t *)v,
-	                                   (uint32_t)old,
-	                                   (uint32_t)_new);
-#endif
-}
+/* Include 'fake' atomic extensions, built over real atomic primitives. */
+#include "intern/atomic_ops_ext.h"

 #endif /* __ATOMIC_OPS_H__ */
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -0,0 +1,146 @@
+/*
+ * Original code from jemalloc with this license:
+ *
+ * Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
+ * All rights reserved.
+ * Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
+ * Copyright (C) 2009-2013 Facebook, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice(s),
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice(s),
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2016 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: adapted from jemalloc.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#ifndef __ATOMIC_OPS_EXT_H__
+#define __ATOMIC_OPS_EXT_H__
+
+#include "atomic_ops_utils.h"
+
+/******************************************************************************/
+/* size_t operations. */
+ATOMIC_INLINE size_t atomic_add_z(size_t *p, size_t x)
+{
+	/* Atomic add on a size_t: forwarded to the fixed-width add whose width is LG_SIZEOF_PTR bytes. */
+	assert(LG_SIZEOF_PTR == sizeof(size_t));
+#if (LG_SIZEOF_PTR == 4)
+	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+#elif (LG_SIZEOF_PTR == 8)
+	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+#endif
+}
+
+ATOMIC_INLINE size_t atomic_sub_z(size_t *p, size_t x)
+{
+	/* Atomic subtract on a size_t, expressed as a fixed-width atomic add of the negated operand. */
+	assert(LG_SIZEOF_PTR == sizeof(size_t));
+#if (LG_SIZEOF_PTR == 4)
+	return (size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+#elif (LG_SIZEOF_PTR == 8)
+	return (size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+#endif
+}
+
+ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)
+{
+	/* Compare-and-swap on a size_t: forwarded to the fixed-width CAS of matching width, whose result is returned. */
+	assert(LG_SIZEOF_PTR == sizeof(size_t));
+#if (LG_SIZEOF_PTR == 4)
+	return (size_t)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
+#elif (LG_SIZEOF_PTR == 8)
+	return (size_t)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
+#endif
+}
+
+/******************************************************************************/
+/* unsigned operations. */
+ATOMIC_INLINE unsigned atomic_add_u(unsigned *p, unsigned x)
+{
+	/* Atomic add on an unsigned int: forwarded to the fixed-width add whose width is LG_SIZEOF_INT bytes. */
+	assert(LG_SIZEOF_INT == sizeof(unsigned));
+#if (LG_SIZEOF_INT == 4)
+	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x);
+#elif (LG_SIZEOF_INT == 8)
+	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x);
+#endif
+}
+
+ATOMIC_INLINE unsigned atomic_sub_u(unsigned *p, unsigned x)
+{
+	/* Atomic subtract on an unsigned int, expressed as a fixed-width atomic add of the negated operand. */
+	assert(LG_SIZEOF_INT == sizeof(unsigned));
+#if (LG_SIZEOF_INT == 4)
+	return (unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+#elif (LG_SIZEOF_INT == 8)
+	return (unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+#endif
+}
+
+ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
+{
+	/* Compare-and-swap on an unsigned int: forwarded to the fixed-width CAS of matching width, whose result is returned. */
+	assert(LG_SIZEOF_INT == sizeof(unsigned));
+#if (LG_SIZEOF_INT == 4)
+	return (unsigned)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
+#elif (LG_SIZEOF_INT == 8)
+	return (unsigned)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
+#endif
+}
+
+/******************************************************************************/
+/* float operations. */
+
+ATOMIC_INLINE float atomic_add_fl(float *p, const float x)
+{
+	/* Adds x to *p via a CAS retry loop and returns the stored sum; unions avoid strict-aliasing-unsafe pointer casts. */
+	union { float f; uint32_t u; } oldval, newval;
+	uint32_t prevval;
+	assert(sizeof(float) == sizeof(uint32_t));
+
+	do {  /* Note that since collisions are unlikely, loop will nearly always run once. */
+		oldval.f = *p;
+		newval.f = oldval.f + x;
+		prevval = atomic_cas_uint32((uint32_t *)p, oldval.u, newval.u);  /* Stores only if *p still holds oldval's bits. */
+	} while (UNLIKELY(prevval != oldval.u));  /* Another thread changed *p in between: recompute and retry. */
+
+	return newval.f;
+}
+
+#endif /* __ATOMIC_OPS_EXT_H__ */
--- a/intern/atomic/intern/atomic_ops_msvc.h
+++ b/intern/atomic/intern/atomic_ops_msvc.h
@@ -0,0 +1,107 @@
+/*
+ * Adopted from jemalloc with this license:
+ *
+ * Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
+ * All rights reserved.
+ * Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
+ * Copyright (C) 2009-2013 Facebook, Inc.  All rights reserved.
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice(s),
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice(s),
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ATOMIC_OPS_MSVC_H__
+#define __ATOMIC_OPS_MSVC_H__
+
+#include "atomic_ops_utils.h"
+
+#define NOGDI
+#ifndef NOMINMAX
+#  define NOMINMAX
+#endif
+#define WIN32_LEAN_AND_MEAN
+
+#include <windows.h>
+#include <intrin.h>
+
+/******************************************************************************/
+/* 64-bit operations. */
+#if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)  /* 64-bit primitives are only built when pointers or ints are 8 bytes. */
+ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+	return InterlockedExchangeAdd64((int64_t *)p, (int64_t)x) + x;  /* Intrinsic yields the prior value; + x gives the updated one. */
+}
+
+ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+	return InterlockedExchangeAdd64((int64_t *)p, -((int64_t)x)) - x;  /* Adds -x, then turns the prior value into the updated one. */
+}
+
+ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
+{
+	return InterlockedCompareExchange64((int64_t *)v, _new, old);  /* Argument order is (destination, exchange, comparand). */
+}
+#endif
+
+/******************************************************************************/
+/* 32-bit operations. */
+ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+	return InterlockedExchangeAdd(p, x) + x;  /* Intrinsic yields the prior value; + x gives the updated one. */
+}
+
+ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+	return InterlockedExchangeAdd(p, -((int32_t)x)) - x;  /* Adds -x, then turns the prior value into the updated one. */
+}
+
+ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
+{
+	return InterlockedCompareExchange((long *)v, _new, old);  /* Argument order is (destination, exchange, comparand). */
+}
+
+ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x)
+{
+	return InterlockedExchangeAdd(p, x);  /* Fetch-and-add: the prior value is returned unchanged. */
+}
+
+/******************************************************************************/
+/* 8-bit operations. */
+
+#pragma intrinsic(_InterlockedAnd8)
+ATOMIC_INLINE uint8_t atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b)
+{
+#if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)  /* NOTE(review): presumably picks the spelling available on 64-bit targets -- confirm. */
+	return InterlockedAnd8((char *)p, (char)b);  /* Atomic *p &= b; the result is the value before the AND. */
+#else
+	return _InterlockedAnd8((char *)p, (char)b);  /* Same operation through the underscore-prefixed intrinsic. */
+#endif
+}
+
+#pragma intrinsic(_InterlockedOr8)
+ATOMIC_INLINE uint8_t atomic_fetch_and_or_uint8(uint8_t *p, uint8_t b)
+{
+#if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)  /* NOTE(review): presumably picks the spelling available on 64-bit targets -- confirm. */
+	return InterlockedOr8((char *)p, (char)b);  /* Atomic *p |= b; the result is the value before the OR. */
+#else
+	return _InterlockedOr8((char *)p, (char)b);  /* Same operation through the underscore-prefixed intrinsic. */
+#endif
+}
+
+#endif /* __ATOMIC_OPS_MSVC_H__ */
--- a/intern/atomic/intern/atomic_ops_unix.h
+++ b/intern/atomic/intern/atomic_ops_unix.h
@@ -0,0 +1,191 @@
+/*
+ * Original code from jemalloc with this license:
+ *
+ * Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
+ * All rights reserved.
+ * Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
+ * Copyright (C) 2009-2013 Facebook, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice(s),
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice(s),
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2016 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: adapted from jemalloc.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#ifndef __ATOMIC_OPS_UNIX_H__
+#define __ATOMIC_OPS_UNIX_H__
+
+#include "atomic_ops_utils.h"
+
+/******************************************************************************/
+/* 64-bit operations. */
+#if (LG_SIZEOF_PTR == 8 || LG_SIZEOF_INT == 8)
+#  if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
+ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+	return __sync_add_and_fetch(p, x);  /* Add-and-fetch: the result is the value of *p after the addition. */
+}
+
+ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+	return __sync_sub_and_fetch(p, x);  /* Sub-and-fetch: the result is the value of *p after the subtraction. */
+}
+
+ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
+{
+	return __sync_val_compare_and_swap(v, old, _new);  /* Writes _new only if *v == old; returns what *v held beforehand. */
+}
+#  elif (defined(__amd64__) || defined(__x86_64__))
+ATOMIC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x)
+{
+	uint64_t ret = x;  /* xadd exchanges its operands: afterwards ret holds the previous *p. */
+	asm volatile (
+	    "lock; xaddq %0, %1;"
+	    : "+r" (ret), "=m" (*p) /* Outputs. */
+	    : "m" (*p)); /* Inputs. */
+	return ret + x;  /* Add-and-fetch: report the updated value, matching the __sync_add_and_fetch() branch. */
+}
+
+ATOMIC_INLINE uint64_t atomic_sub_uint64(uint64_t *p, uint64_t x)
+{
+	uint64_t ret = (uint64_t)(-(int64_t)x);  /* Subtract by atomically adding the negated operand. */
+	asm volatile (
+	    "lock; xaddq %0, %1;"
+	    : "+r" (ret), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	return ret - x;  /* xadd left the previous *p in ret; previous - x is the updated value (sub-and-fetch). */
+}
+
+ATOMIC_INLINE uint64_t atomic_cas_uint64(uint64_t *v, uint64_t old, uint64_t _new)
+{
+	uint64_t ret;  /* Bound to the accumulator ("=a"), which cmpxchg compares against *v. */
+	asm volatile (
+	    "lock; cmpxchgq %2,%1"
+	    : "=a" (ret), "+m" (*v)
+	    : "r" (_new), "0" (old) /* "0" loads `old` into the same register as operand 0 (ret). */
+	    : "memory");
+	return ret;  /* Value observed in *v: equals `old` when the swap took place. */
+}
+#  else
+#    error "Missing implementation for 64-bit atomic operations"
+#  endif
+#endif
+
+/******************************************************************************/
+/* 32-bit operations. */
+#if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
+ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+	return __sync_add_and_fetch(p, x);  /* Add-and-fetch: the result is the value of *p after the addition. */
+}
+
+ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+	return __sync_sub_and_fetch(p, x);  /* Sub-and-fetch: the result is the value of *p after the subtraction. */
+}
+
+ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
+{
+   return __sync_val_compare_and_swap(v, old, _new);  /* Writes _new only if *v == old; returns what *v held beforehand. */
+}
+#elif (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
+ATOMIC_INLINE uint32_t atomic_add_uint32(uint32_t *p, uint32_t x)
+{
+	uint32_t ret = x;  /* Separate copy: xadd overwrites its register operand with the previous *p. */
+	asm volatile (
+	    "lock; xaddl %0, %1;"
+	    : "+r" (ret), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	return ret+x;  /* previous value + x, i.e. the updated value (add-and-fetch). */
+}
+
+ATOMIC_INLINE uint32_t atomic_sub_uint32(uint32_t *p, uint32_t x)
+{
+	uint32_t ret = (uint32_t)(-(int32_t)x);  /* Declared here (was missing); subtract by adding the negated operand. */
+	asm volatile (
+	    "lock; xaddl %0, %1;"
+	    : "+r" (ret), "=m" (*p) /* Outputs. */
+	    : "m" (*p) /* Inputs. */
+	    );
+	return ret-x;  /* xadd left the previous *p in ret; previous - x is the updated value (sub-and-fetch). */
+}
+
+ATOMIC_INLINE uint32_t atomic_cas_uint32(uint32_t *v, uint32_t old, uint32_t _new)
+{
+	uint32_t ret;  /* Bound to the accumulator ("=a"), which cmpxchg compares against *v. */
+	asm volatile (
+	    "lock; cmpxchgl %2,%1"
+	    : "=a" (ret), "+m" (*v)
+	    : "r" (_new), "0" (old) /* "0" loads `old` into the same register as operand 0 (ret). */
+	    : "memory");
+	return ret;  /* Value observed in *v: equals `old` when the swap took place. */
+}
+#else
+#  error "Missing implementation for 32-bit atomic operations"
+#endif
+
+/* Atomically adds x to *p and returns the value *p held before the addition. */
+ATOMIC_INLINE uint32_t atomic_fetch_and_add_uint32(uint32_t *p, uint32_t x)
+{
+#if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
+	return __sync_fetch_and_add(p, x);
+#else  /* x86 asm fallback: atomic_add_uint32() returns the updated value, so subtract x (unsigned wrap is exact). */
+	return atomic_add_uint32(p, x) - x;
+#endif
+}
+
+/******************************************************************************/
+/* 8-bit operations. */
+#if (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1) || defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_1))
+ATOMIC_INLINE uint8_t atomic_fetch_and_and_uint8(uint8_t *p, uint8_t b)
+{
+	return __sync_fetch_and_and(p, b);  /* Atomic *p &= b; the result is the value before the AND. */
+}
+ATOMIC_INLINE uint8_t atomic_fetch_and_or_uint8(uint8_t *p, uint8_t b)
+{
+	return __sync_fetch_and_or(p, b);  /* Atomic *p |= b; the result is the value before the OR. */
+}
+#else  /* No other 8-bit backend is provided in this header. */
+#  error "Missing implementation for 8-bit atomic operations"
+#endif
+
+#endif /* __ATOMIC_OPS_UNIX_H__ */
--- a/intern/atomic/intern/atomic_ops_utils.h
+++ b/intern/atomic/intern/atomic_ops_utils.h
@@ -0,0 +1,110 @@
+/*
+ * Original code from jemalloc with this license:
+ *
+ * Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
+ * All rights reserved.
+ * Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
+ * Copyright (C) 2009-2013 Facebook, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1. Redistributions of source code must retain the above copyright notice(s),
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice(s),
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2016 Blender Foundation.
+ * All rights reserved.
+ *
+ * The Original Code is: adapted from jemalloc.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#ifndef __ATOMIC_OPS_UTILS_H__
+#define __ATOMIC_OPS_UTILS_H__
+
+/* needed for int types */
+#include "../../../source/blender/blenlib/BLI_sys_types.h"
+#include <stdlib.h>
+#include <limits.h>
+
+#include <assert.h>
+
+/* ATOMIC_INLINE: per-compiler spelling of "static and force-inlined", used as the storage class of the atomic functions. */
+#if defined(_MSC_VER)
+#  define ATOMIC_INLINE static __forceinline
+#else
+#  if (defined(__APPLE__) && defined(__ppc__))
+/* static inline __attribute__ here breaks osx ppc gcc42 build, so the `inline` keyword is left out there */
+#    define ATOMIC_INLINE static __attribute__((always_inline))
+#  else
+#    define ATOMIC_INLINE static inline __attribute__((always_inline))
+#  endif
+#endif
+
+#ifndef LIKELY  /* This single guard also covers UNLIKELY: the pair is assumed to be defined (or missing) together. */
+#  ifdef __GNUC__  /* Branch-prediction hints; !!(x) normalizes the condition to 0 or 1. */
+#    define LIKELY(x)       __builtin_expect(!!(x), 1)
+#    define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#  else  /* Other compilers: plain pass-through, no hint. */
+#    define LIKELY(x)       (x)
+#    define UNLIKELY(x)     (x)
+#  endif
+#endif
+
+#ifdef UINTPTR_MAX  /* LG_SIZEOF_PTR is the pointer size in BYTES (4 or 8), despite the LG_ prefix. */
+#  if (UINTPTR_MAX == 0xFFFFFFFF)
+#    define LG_SIZEOF_PTR 4
+#  elif (UINTPTR_MAX == 0xFFFFFFFFFFFFFFFF)
+#    define LG_SIZEOF_PTR 8
+#  endif
+#elif defined(__WORDSIZE)  /* Fallback for older glibc and cpp */
+#  if (__WORDSIZE == 32)
+#    define LG_SIZEOF_PTR 4
+#  elif (__WORDSIZE == 64)
+#    define LG_SIZEOF_PTR 8
+#  endif
+#endif
+
+#ifndef LG_SIZEOF_PTR  /* Neither probe above matched a 32- or 64-bit width. */
+#  error "Cannot find pointer size"
+#endif
+
+#if (UINT_MAX == 0xFFFFFFFF)  /* LG_SIZEOF_INT is the size of `unsigned` in BYTES (4 or 8), derived from UINT_MAX. */
+#  define LG_SIZEOF_INT 4
+#elif (UINT_MAX == 0xFFFFFFFFFFFFFFFF)
+#  define LG_SIZEOF_INT 8
+#else  /* Any other int width is unsupported by these headers. */
+#  error "Cannot find int size"
+#endif
+
+#endif /* __ATOMIC_OPS_UTILS_H__ */
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -153,7 +153,9 @@ set(WITH_CYCLES_DEVICE_MULTI TRUE)
 if(CYCLES_STANDALONE_REPOSITORY)
 	TEST_UNORDERED_MAP_SUPPORT()
 endif()
-if(HAVE_STD_UNORDERED_MAP_HEADER)
+if(WITH_CXX11)
+	add_definitions(-DCYCLES_STD_UNORDERED_MAP)
+elseif(HAVE_STD_UNORDERED_MAP_HEADER)
 	if(HAVE_UNORDERED_MAP_IN_STD_NAMESPACE)
 		add_definitions(-DCYCLES_STD_UNORDERED_MAP)
 	else()
@@ -235,6 +237,7 @@ endif()
 add_subdirectory(bvh)
 add_subdirectory(device)
 add_subdirectory(doc)
+add_subdirectory(graph)
 add_subdirectory(kernel)
 add_subdirectory(render)
 add_subdirectory(subd)
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -1,13 +1,14 @@

 set(INC
 	.
+	../bvh
 	../device
+	../graph
 	../kernel
 	../kernel/svm
-	../bvh
-	../util
 	../render
 	../subd
+	../util
 )
 set(INC_SYS
 )
@@ -20,6 +21,7 @@ set(LIBRARIES
 	cycles_render
 	cycles_bvh
 	cycles_subd
+	cycles_graph
 	cycles_util
 	${BLENDER_GL_LIBRARIES}
 	${CYCLES_APP_GLEW_LIBRARY}
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,6 +20,8 @@
 #include <algorithm>
 #include <iterator>

+#include "node_xml.h"
+
 #include "background.h"
 #include "camera.h"
 #include "film.h"
@@ -29,6 +31,7 @@
 #include "mesh.h"
 #include "nodes.h"
 #include "object.h"
+#include "osl.h"
 #include "shader.h"
 #include "scene.h"

@@ -48,11 +51,11 @@ CCL_NAMESPACE_BEGIN

 /* XML reading state */

-struct XMLReadState {
+struct XMLReadState : public XMLReader {
 	Scene *scene;		/* scene pointer */
 	Transform tfm;		/* current transform state */
 	bool smooth;		/* smooth normal state */
-	int shader;			/* current shader */
+	Shader *shader;		/* current shader */
 	string base;		/* base path to current file*/
 	float dicing_rate;	/* current dicing rate */
 	Mesh::DisplacementMethod displacement_method;
@@ -60,7 +63,7 @@ struct XMLReadState {
 	XMLReadState()
 	  : scene(NULL),
 	    smooth(false),
-	    shader(0),
+	    shader(NULL),
 	    dicing_rate(0.0f),
 	    displacement_method(Mesh::DISPLACE_BUMP)
 	{
@@ -212,7 +215,7 @@ static bool xml_equal_string(pugi::xml_node node, const char *name, const char *
 	return false;
 }

-static bool xml_read_enum(ustring *str, ShaderEnum& enm, pugi::xml_node node, const char *name)
+static bool xml_read_enum_value(int *value, NodeEnum& enm, pugi::xml_node node, const char *name)
 {
 	pugi::xml_attribute attr = node.attribute(name);

@@ -220,7 +223,7 @@ static bool xml_read_enum(ustring *str, ShaderEnum& enm, pugi::xml_node node, co
 		ustring ustr(attr.value());

 		if(enm.exists(ustr)) {
-			*str = ustr;
+			*value = enm[ustr];
 			return true;
 		}
 		else
@@ -230,141 +233,16 @@ static bool xml_read_enum(ustring *str, ShaderEnum& enm, pugi::xml_node node, co
 	return false;
 }

-static ShaderSocketType xml_read_socket_type(pugi::xml_node node, const char *name)
-{
-	pugi::xml_attribute attr = node.attribute(name);
-
-	if(attr) {
-		string value = attr.value();
-		if(string_iequals(value, "float"))
-			return SHADER_SOCKET_FLOAT;
-		else if(string_iequals(value, "int"))
-			return SHADER_SOCKET_INT;
-		else if(string_iequals(value, "color"))
-			return SHADER_SOCKET_COLOR;
-		else if(string_iequals(value, "vector"))
-			return SHADER_SOCKET_VECTOR;
-		else if(string_iequals(value, "point"))
-			return SHADER_SOCKET_POINT;
-		else if(string_iequals(value, "normal"))
-			return SHADER_SOCKET_NORMAL;
-		else if(string_iequals(value, "closure color"))
-			return SHADER_SOCKET_CLOSURE;
-		else if(string_iequals(value, "string"))
-			return SHADER_SOCKET_STRING;
-		else
-			fprintf(stderr, "Unknown shader socket type \"%s\" for attribute \"%s\".\n", value.c_str(), name);
-	}
-	
-	return SHADER_SOCKET_UNDEFINED;
-}
-
-/* Film */
-
-static void xml_read_film(const XMLReadState& state, pugi::xml_node node)
-{
-	Film *film = state.scene->film;
-	
-	xml_read_float(&film->exposure, node, "exposure");
-
-	/* ToDo: Filter Type */
-	xml_read_float(&film->filter_width, node, "filter_width");
-}
-
-/* Integrator */
-
-static void xml_read_integrator(const XMLReadState& state, pugi::xml_node node)
-{
-	Integrator *integrator = state.scene->integrator;
-	
-	/* Branched Path */
-	bool branched = false;
-	xml_read_bool(&branched, node, "branched");
-
-	if(branched) {
-		integrator->method = Integrator::BRANCHED_PATH;
-
-		xml_read_int(&integrator->diffuse_samples, node, "diffuse_samples");
-		xml_read_int(&integrator->glossy_samples, node, "glossy_samples");
-		xml_read_int(&integrator->transmission_samples, node, "transmission_samples");
-		xml_read_int(&integrator->ao_samples, node, "ao_samples");
-		xml_read_int(&integrator->mesh_light_samples, node, "mesh_light_samples");
-		xml_read_int(&integrator->subsurface_samples, node, "subsurface_samples");
-		xml_read_int(&integrator->volume_samples, node, "volume_samples");
-		xml_read_bool(&integrator->sample_all_lights_direct, node, "sample_all_lights_direct");
-		xml_read_bool(&integrator->sample_all_lights_indirect, node, "sample_all_lights_indirect");
-	}
-	
-	/* Bounces */
-	xml_read_int(&integrator->min_bounce, node, "min_bounce");
-	xml_read_int(&integrator->max_bounce, node, "max_bounce");
-	
-	xml_read_int(&integrator->max_diffuse_bounce, node, "max_diffuse_bounce");
-	xml_read_int(&integrator->max_glossy_bounce, node, "max_glossy_bounce");
-	xml_read_int(&integrator->max_transmission_bounce, node, "max_transmission_bounce");
-	xml_read_int(&integrator->max_volume_bounce, node, "max_volume_bounce");
-	
-	/* Transparency */
-	xml_read_int(&integrator->transparent_min_bounce, node, "transparent_min_bounce");
-	xml_read_int(&integrator->transparent_max_bounce, node, "transparent_max_bounce");
-	xml_read_bool(&integrator->transparent_shadows, node, "transparent_shadows");
-	
-	/* Volume */
-	xml_read_float(&integrator->volume_step_size, node, "volume_step_size");
-	xml_read_int(&integrator->volume_max_steps, node, "volume_max_steps");
-	
-	/* Various Settings */
-	xml_read_bool(&integrator->caustics_reflective, node, "caustics_reflective");
-	xml_read_bool(&integrator->caustics_refractive, node, "caustics_refractive");
-	xml_read_float(&integrator->filter_glossy, node, "filter_glossy");
-	
-	xml_read_int(&integrator->seed, node, "seed");
-	xml_read_float(&integrator->sample_clamp_direct, node, "sample_clamp_direct");
-	xml_read_float(&integrator->sample_clamp_indirect, node, "sample_clamp_indirect");
-}
-
 /* Camera */

-static void xml_read_camera(const XMLReadState& state, pugi::xml_node node)
+static void xml_read_camera(XMLReadState& state, pugi::xml_node node)
 {
 	Camera *cam = state.scene->camera;

 	xml_read_int(&cam->width, node, "width");
 	xml_read_int(&cam->height, node, "height");

-	if(xml_read_float(&cam->fov, node, "fov"))
-		cam->fov = DEG2RADF(cam->fov);
-
-	xml_read_float(&cam->nearclip, node, "nearclip");
-	xml_read_float(&cam->farclip, node, "farclip");
-	xml_read_float(&cam->aperturesize, node, "aperturesize"); // 0.5*focallength/fstop
-	xml_read_float(&cam->focaldistance, node, "focaldistance");
-	xml_read_float(&cam->shuttertime, node, "shuttertime");
-	xml_read_float(&cam->aperture_ratio, node, "aperture_ratio");
-
-	if(xml_equal_string(node, "type", "orthographic"))
-		cam->type = CAMERA_ORTHOGRAPHIC;
-	else if(xml_equal_string(node, "type", "perspective"))
-		cam->type = CAMERA_PERSPECTIVE;
-	else if(xml_equal_string(node, "type", "panorama"))
-		cam->type = CAMERA_PANORAMA;
-
-	if(xml_equal_string(node, "panorama_type", "equirectangular"))
-		cam->panorama_type = PANORAMA_EQUIRECTANGULAR;
-	else if(xml_equal_string(node, "panorama_type", "fisheye_equidistant"))
-		cam->panorama_type = PANORAMA_FISHEYE_EQUIDISTANT;
-	else if(xml_equal_string(node, "panorama_type", "fisheye_equisolid"))
-		cam->panorama_type = PANORAMA_FISHEYE_EQUISOLID;
-
-	xml_read_float(&cam->fisheye_fov, node, "fisheye_fov");
-	xml_read_float(&cam->fisheye_lens, node, "fisheye_lens");
-
-	xml_read_bool(&cam->use_spherical_stereo, node, "use_spherical_stereo");
-	xml_read_float(&cam->interocular_distance, node, "interocular_distance");
-	xml_read_float(&cam->convergence_distance, node, "convergence_distance");
-
-	xml_read_float(&cam->sensorwidth, node, "sensorwidth");
-	xml_read_float(&cam->sensorheight, node, "sensorheight");
+	xml_read_node(state, cam, node);

 	cam->matrix = state.tfm;

@@ -385,8 +263,11 @@ static string xml_socket_name(const char *name)
 	return sname;
 }

-static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pugi::xml_node graph_node)
+static void xml_read_shader_graph(XMLReadState& state, Shader *shader, pugi::xml_node graph_node)
 {
+	xml_read_node(state, shader, graph_node);
+
+	ShaderManager *manager = state.scene->shader_manager;
 	ShaderGraph *graph = new ShaderGraph();

 	map<string, ShaderNode*> nodemap;
@@ -406,8 +287,8 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_string(&img->filename, node, "src");
 			img->filename = path_join(state.base, img->filename);
 			
-			xml_read_enum(&img->color_space, ImageTextureNode::color_space_enum, node, "color_space");
-			xml_read_enum(&img->projection, ImageTextureNode::projection_enum, node, "projection");
+			xml_read_enum_value((int*)&img->color_space, ImageTextureNode::color_space_enum, node, "color_space");
+			xml_read_enum_value((int*)&img->projection, ImageTextureNode::projection_enum, node, "projection");
 			xml_read_float(&img->projection_blend, node, "projection_blend");

 			/* ToDo: Interpolation */
@@ -420,58 +301,40 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			xml_read_string(&env->filename, node, "src");
 			env->filename = path_join(state.base, env->filename);
 			
-			xml_read_enum(&env->color_space, EnvironmentTextureNode::color_space_enum, node, "color_space");
-			xml_read_enum(&env->projection, EnvironmentTextureNode::projection_enum, node, "projection");
+			xml_read_enum_value((int*)&env->color_space, EnvironmentTextureNode::color_space_enum, node, "color_space");
+			xml_read_enum_value((int*)&env->projection, EnvironmentTextureNode::projection_enum, node, "projection");

 			snode = env;
 		}
+#ifdef WITH_OSL
 		else if(string_iequals(node.name(), "osl_shader")) {
-			OSLScriptNode *osl = new OSLScriptNode();
+			if(manager->use_osl()) {
+				std::string filepath;

-			/* Source */
-			xml_read_string(&osl->filepath, node, "src");
-			if(path_is_relative(osl->filepath)) {
-				osl->filepath = path_join(state.base, osl->filepath);
-			}
+				if(xml_read_string(&filepath, node, "src")) {
+					if(path_is_relative(filepath)) {
+						filepath = path_join(state.base, filepath);
+					}

-			/* Generate inputs/outputs from node sockets
-			 *
-			 * Note: ShaderInput/ShaderOutput store shallow string copies only!
-			 * Socket names must be stored in the extra lists instead. */
-			/* read input values */
-			for(pugi::xml_node param = node.first_child(); param; param = param.next_sibling()) {
-				if(string_iequals(param.name(), "input")) {
-					string name;
-					if(!xml_read_string(&name, param, "name"))
-						continue;
-					
-					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if(type == SHADER_SOCKET_UNDEFINED)
-						continue;
-					
-					osl->input_names.push_back(ustring(name));
-					osl->add_input(osl->input_names.back().c_str(), type);
+					snode = ((OSLShaderManager*)manager)->osl_node(filepath);
+
+					if(!snode) {
+						fprintf(stderr, "Failed to create OSL node from \"%s\".\n", filepath.c_str());
+					}
 				}
-				else if(string_iequals(param.name(), "output")) {
-					string name;
-					if(!xml_read_string(&name, param, "name"))
-						continue;
-					
-					ShaderSocketType type = xml_read_socket_type(param, "type");
-					if(type == SHADER_SOCKET_UNDEFINED)
-						continue;
-					
-					osl->output_names.push_back(ustring(name));
-					osl->add_output(osl->output_names.back().c_str(), type);
+				else {
+					fprintf(stderr, "OSL node missing \"src\" attribute.\n");
 				}
 			}
-			
-			snode = osl;
+			else {
+				fprintf(stderr, "OSL node without using --shadingsys osl.\n");
+			}
 		}
+#endif
 		else if(string_iequals(node.name(), "sky_texture")) {
 			SkyTextureNode *sky = new SkyTextureNode();
 			
-			xml_read_enum(&sky->type, SkyTextureNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&sky->type, SkyTextureNode::type_enum, node, "type");
 			xml_read_float3(&sky->sun_direction, node, "sun_direction");
 			xml_read_float(&sky->turbidity, node, "turbidity");
 			xml_read_float(&sky->ground_albedo, node, "ground_albedo");
@@ -496,17 +359,17 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		}
 		else if(string_iequals(node.name(), "gradient_texture")) {
 			GradientTextureNode *blend = new GradientTextureNode();
-			xml_read_enum(&blend->type, GradientTextureNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&blend->type, GradientTextureNode::type_enum, node, "type");
 			snode = blend;
 		}
 		else if(string_iequals(node.name(), "voronoi_texture")) {
 			VoronoiTextureNode *voronoi = new VoronoiTextureNode();
-			xml_read_enum(&voronoi->coloring, VoronoiTextureNode::coloring_enum, node, "coloring");
+			xml_read_enum_value((int*)&voronoi->coloring, VoronoiTextureNode::coloring_enum, node, "coloring");
 			snode = voronoi;
 		}
 		else if(string_iequals(node.name(), "musgrave_texture")) {
 			MusgraveTextureNode *musgrave = new MusgraveTextureNode();
-			xml_read_enum(&musgrave->type, MusgraveTextureNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&musgrave->type, MusgraveTextureNode::type_enum, node, "type");
 			snode = musgrave;
 		}
 		else if(string_iequals(node.name(), "magic_texture")) {
@@ -516,8 +379,8 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		}
 		else if(string_iequals(node.name(), "wave_texture")) {
 			WaveTextureNode *wave = new WaveTextureNode();
-			xml_read_enum(&wave->type, WaveTextureNode::type_enum, node, "type");
-			xml_read_enum(&wave->profile, WaveTextureNode::profile_enum, node, "profile");
+			xml_read_enum_value((int*)&wave->type, WaveTextureNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&wave->profile, WaveTextureNode::profile_enum, node, "profile");
 			snode = wave;
 		}
 		else if(string_iequals(node.name(), "normal")) {
@@ -531,11 +394,28 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			snode = bump;
 		}
 		else if(string_iequals(node.name(), "mapping")) {
-			snode = new MappingNode();
+			MappingNode *map = new MappingNode();
+
+			TextureMapping *texmap = &map->tex_mapping;
+			xml_read_enum_value((int*) &texmap->type, TextureMapping::type_enum, node, "type");
+			xml_read_enum_value((int*) &texmap->projection, TextureMapping::projection_enum, node, "projection");
+			xml_read_enum_value((int*) &texmap->x_mapping, TextureMapping::mapping_enum, node, "x_mapping");
+			xml_read_enum_value((int*) &texmap->y_mapping, TextureMapping::mapping_enum, node, "y_mapping");
+			xml_read_enum_value((int*) &texmap->z_mapping, TextureMapping::mapping_enum, node, "z_mapping");
+			xml_read_bool(&texmap->use_minmax, node, "use_minmax");
+			if(texmap->use_minmax) {
+				xml_read_float3(&texmap->min, node, "min");
+				xml_read_float3(&texmap->max, node, "max");
+			}
+			xml_read_float3(&texmap->translation, node, "translation");
+			xml_read_float3(&texmap->rotation, node, "rotation");
+			xml_read_float3(&texmap->scale, node, "scale");
+
+			snode = map;
 		}
 		else if(string_iequals(node.name(), "anisotropic_bsdf")) {
 			AnisotropicBsdfNode *aniso = new AnisotropicBsdfNode();
-			xml_read_enum(&aniso->distribution, AnisotropicBsdfNode::distribution_enum, node, "distribution");
+			xml_read_enum_value((int*)&aniso->distribution, AnisotropicBsdfNode::distribution_enum, node, "distribution");
 			snode = aniso;
 		}
 		else if(string_iequals(node.name(), "diffuse_bsdf")) {
@@ -552,27 +432,27 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		}
 		else if(string_iequals(node.name(), "toon_bsdf")) {
 			ToonBsdfNode *toon = new ToonBsdfNode();
-			xml_read_enum(&toon->component, ToonBsdfNode::component_enum, node, "component");
+			xml_read_enum_value((int*)&toon->component, ToonBsdfNode::component_enum, node, "component");
 			snode = toon;
 		}
 		else if(string_iequals(node.name(), "glossy_bsdf")) {
 			GlossyBsdfNode *glossy = new GlossyBsdfNode();
-			xml_read_enum(&glossy->distribution, GlossyBsdfNode::distribution_enum, node, "distribution");
+			xml_read_enum_value((int*)&glossy->distribution, GlossyBsdfNode::distribution_enum, node, "distribution");
 			snode = glossy;
 		}
 		else if(string_iequals(node.name(), "glass_bsdf")) {
 			GlassBsdfNode *diel = new GlassBsdfNode();
-			xml_read_enum(&diel->distribution, GlassBsdfNode::distribution_enum, node, "distribution");
+			xml_read_enum_value((int*)&diel->distribution, GlassBsdfNode::distribution_enum, node, "distribution");
 			snode = diel;
 		}
 		else if(string_iequals(node.name(), "refraction_bsdf")) {
 			RefractionBsdfNode *diel = new RefractionBsdfNode();
-			xml_read_enum(&diel->distribution, RefractionBsdfNode::distribution_enum, node, "distribution");
+			xml_read_enum_value((int*)&diel->distribution, RefractionBsdfNode::distribution_enum, node, "distribution");
 			snode = diel;
 		}
 		else if(string_iequals(node.name(), "hair_bsdf")) {
 			HairBsdfNode *hair = new HairBsdfNode();
-			xml_read_enum(&hair->component, HairBsdfNode::component_enum, node, "component");
+			xml_read_enum_value((int*)&hair->component, HairBsdfNode::component_enum, node, "component");
 			snode = hair;
 		}
 		else if(string_iequals(node.name(), "emission")) {
@@ -650,7 +530,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		else if(string_iequals(node.name(), "mix")) {
 			/* ToDo: Tag Mix case for optimization */
 			MixNode *mix = new MixNode();
-			xml_read_enum(&mix->type, MixNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&mix->type, MixNode::type_enum, node, "type");
 			xml_read_bool(&mix->use_clamp, node, "use_clamp");
 			snode = mix;
 		}
@@ -714,32 +594,32 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 		else if(string_iequals(node.name(), "normal_map")) {
 			NormalMapNode *nmap = new NormalMapNode;
 			xml_read_ustring(&nmap->attribute, node, "attribute");
-			xml_read_enum(&nmap->space, NormalMapNode::space_enum, node, "space");
+			xml_read_enum_value((int*)&nmap->space, NormalMapNode::space_enum, node, "space");
 			snode = nmap;
 		}
 		else if(string_iequals(node.name(), "tangent")) {
 			TangentNode *tangent = new TangentNode;
 			xml_read_ustring(&tangent->attribute, node, "attribute");
-			xml_read_enum(&tangent->direction_type, TangentNode::direction_type_enum, node, "direction_type");
-			xml_read_enum(&tangent->axis, TangentNode::axis_enum, node, "axis");
+			xml_read_enum_value((int*)&tangent->direction_type, TangentNode::direction_type_enum, node, "direction_type");
+			xml_read_enum_value((int*)&tangent->axis, TangentNode::axis_enum, node, "axis");
 			snode = tangent;
 		}
 		else if(string_iequals(node.name(), "math")) {
 			MathNode *math = new MathNode();
-			xml_read_enum(&math->type, MathNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&math->type, MathNode::type_enum, node, "type");
 			xml_read_bool(&math->use_clamp, node, "use_clamp");
 			snode = math;
 		}
 		else if(string_iequals(node.name(), "vector_math")) {
 			VectorMathNode *vmath = new VectorMathNode();
-			xml_read_enum(&vmath->type, VectorMathNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&vmath->type, VectorMathNode::type_enum, node, "type");
 			snode = vmath;
 		}
 		else if(string_iequals(node.name(), "vector_transform")) {
 			VectorTransformNode *vtransform = new VectorTransformNode();
-			xml_read_enum(&vtransform->type, VectorTransformNode::type_enum, node, "type");
-			xml_read_enum(&vtransform->convert_from, VectorTransformNode::convert_space_enum, node, "convert_from");
-			xml_read_enum(&vtransform->convert_to, VectorTransformNode::convert_space_enum, node, "convert_to");
+			xml_read_enum_value((int*)&vtransform->type, VectorTransformNode::type_enum, node, "type");
+			xml_read_enum_value((int*)&vtransform->convert_from, VectorTransformNode::convert_space_enum, node, "convert_from");
+			xml_read_enum_value((int*)&vtransform->convert_to, VectorTransformNode::convert_space_enum, node, "convert_to");
 			snode = vtransform;
 		}
 		else if(string_iequals(node.name(), "connect")) {
@@ -758,7 +638,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 					ShaderNode *fromnode = nodemap[from_tokens[0]];

 					foreach(ShaderOutput *out, fromnode->outputs)
-						if(string_iequals(xml_socket_name(out->name), from_tokens[1]))
+						if(string_iequals(xml_socket_name(out->name().c_str()), from_tokens[1]))
 							output = out;

 					if(!output)
@@ -771,7 +651,7 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 					ShaderNode *tonode = nodemap[to_tokens[0]];

 					foreach(ShaderInput *in, tonode->inputs)
-						if(string_iequals(xml_socket_name(in->name), to_tokens[1]))
+						if(string_iequals(xml_socket_name(in->name().c_str()), to_tokens[1]))
 							input = in;

 					if(!input)
@@ -803,20 +683,20 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 			/* read input values */
 			for(pugi::xml_attribute attr = node.first_attribute(); attr; attr = attr.next_attribute()) {
 				foreach(ShaderInput *in, snode->inputs) {
-					if(string_iequals(in->name, attr.name())) {
-						switch(in->type) {
-							case SHADER_SOCKET_FLOAT:
-							case SHADER_SOCKET_INT:
-								xml_read_float(&in->value.x, node, attr.name());
+					if(string_iequals(in->name().c_str(), attr.name())) {
+						switch(in->type()) {
+							case SocketType::FLOAT:
+							case SocketType::INT:
+								xml_read_float(&in->value_float(), node, attr.name());
 								break;
-							case SHADER_SOCKET_COLOR:
-							case SHADER_SOCKET_VECTOR:
-							case SHADER_SOCKET_POINT:
-							case SHADER_SOCKET_NORMAL:
-								xml_read_float3(&in->value, node, attr.name());
+							case SocketType::COLOR:
+							case SocketType::VECTOR:
+							case SocketType::POINT:
+							case SocketType::NORMAL:
+								xml_read_float3(&in->value(), node, attr.name());
 								break;
-							case SHADER_SOCKET_STRING:
-								xml_read_ustring( &in->value_string, node, attr.name() );
+							case SocketType::STRING:
+								xml_read_ustring( &in->value_string(), node, attr.name() );
 								break;
 							default:
 								break;
@@ -831,54 +711,22 @@ static void xml_read_shader_graph(const XMLReadState& state, Shader *shader, pug
 	shader->tag_update(state.scene);
 }

-static void xml_read_shader(const XMLReadState& state, pugi::xml_node node)
+static void xml_read_shader(XMLReadState& state, pugi::xml_node node)
 {
 	Shader *shader = new Shader();
-
-	xml_read_string(&shader->name, node, "name");
-	xml_read_bool(&shader->use_mis, node, "use_mis");
-	xml_read_bool(&shader->use_transparent_shadow, node, "use_transparent_shadow");
-
-	/* Volume */
-	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
-	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
-
-	if(xml_equal_string(node, "volume_sampling_method", "distance"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
-	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
-	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
-
 	xml_read_shader_graph(state, shader, node);
 	state.scene->shaders.push_back(shader);
 }

 /* Background */

-static void xml_read_background(const XMLReadState& state, pugi::xml_node node)
+static void xml_read_background(XMLReadState& state, pugi::xml_node node)
 {
 	/* Background Settings */
-	Background *bg = state.scene->background;
-
-	xml_read_float(&bg->ao_distance, node, "ao_distance");
-	xml_read_float(&bg->ao_factor, node, "ao_factor");
-
-	xml_read_bool(&bg->transparent, node, "transparent");
+	xml_read_node(state, state.scene->background, node);

 	/* Background Shader */
-	Shader *shader = state.scene->shaders[state.scene->default_background];
-	
-	xml_read_bool(&shader->heterogeneous_volume, node, "heterogeneous_volume");
-	xml_read_int(&shader->volume_interpolation_method, node, "volume_interpolation_method");
-
-	if(xml_equal_string(node, "volume_sampling_method", "distance"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_DISTANCE;
-	else if(xml_equal_string(node, "volume_sampling_method", "equiangular"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_EQUIANGULAR;
-	else if(xml_equal_string(node, "volume_sampling_method", "multiple_importance"))
-		shader->volume_sampling_method = VOLUME_SAMPLING_MULTIPLE_IMPORTANCE;
-
+	Shader *shader = state.scene->default_background;
 	xml_read_shader_graph(state, shader, node);
 }

@@ -906,7 +754,7 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 	mesh->used_shaders.push_back(state.shader);

 	/* read state */
-	int shader = state.shader;
+	int shader = 0;
 	bool smooth = state.smooth;

 	mesh->displacement_method = state.displacement_method;
@@ -967,6 +815,11 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 		/* create vertices */
 		mesh->verts = P;

+		size_t num_triangles = 0;
+		for(size_t i = 0; i < nverts.size(); i++)
+			num_triangles += nverts[i]-2;
+		mesh->reserve_mesh(mesh->verts.size(), num_triangles);
+
 		/* create triangles */
 		int index_offset = 0;

@@ -995,9 +848,9 @@ static void xml_read_mesh(const XMLReadState& state, pugi::xml_node node)
 			index_offset = 0;
 			for(size_t i = 0; i < nverts.size(); i++) {
 				for(int j = 0; j < nverts[i]-2; j++) {
-					int v0 = verts[index_offset];
-					int v1 = verts[index_offset + j + 1];
-					int v2 = verts[index_offset + j + 2];
+					int v0 = index_offset;
+					int v1 = index_offset + j + 1;
+					int v2 = index_offset + j + 2;

 					assert(v0*2+1 < (int)UV.size());
 					assert(v1*2+1 < (int)UV.size());
@@ -1066,7 +919,7 @@ static void xml_read_patch(const XMLReadState& state, pugi::xml_node node)
 		mesh->used_shaders.push_back(state.shader);

 		/* split */
-		SubdParams sdparams(mesh, state.shader, state.smooth);
+		SubdParams sdparams(mesh, 0, state.smooth);
 		xml_read_float(&sdparams.dicing_rate, node, "dicing_rate");

 		DiagSplit dsplit(sdparams);
@@ -1081,47 +934,12 @@ static void xml_read_patch(const XMLReadState& state, pugi::xml_node node)

 /* Light */

-static void xml_read_light(const XMLReadState& state, pugi::xml_node node)
+static void xml_read_light(XMLReadState& state, pugi::xml_node node)
 {
 	Light *light = new Light();
+
 	light->shader = state.shader;
-
-	/* Light Type
-	 * 0: Point, 1: Sun, 3: Area, 5: Spot */
-	int type = 0;
-	xml_read_int(&type, node, "type");
-	light->type = (LightType)type;
-
-	/* Spot Light */
-	xml_read_float(&light->spot_angle, node, "spot_angle");
-	xml_read_float(&light->spot_smooth, node, "spot_smooth");
-
-	/* Area Light */
-	xml_read_float(&light->sizeu, node, "sizeu");
-	xml_read_float(&light->sizev, node, "sizev");
-	xml_read_float3(&light->axisu, node, "axisu");
-	xml_read_float3(&light->axisv, node, "axisv");
-
-	/* Portal? (Area light only) */
-	xml_read_bool(&light->is_portal, node, "is_portal");
-
-	/* Generic */
-	xml_read_float(&light->size, node, "size");
-	xml_read_float3(&light->dir, node, "dir");
-	xml_read_float3(&light->co, node, "P");
-	light->co = transform_point(&state.tfm, light->co);
-
-	/* Settings */
-	xml_read_bool(&light->cast_shadow, node, "cast_shadow");
-	xml_read_bool(&light->use_mis, node, "use_mis");
-	xml_read_int(&light->samples, node, "samples");
-	xml_read_int(&light->max_bounces, node, "max_bounces");
-
-	/* Ray Visibility */
-	xml_read_bool(&light->use_diffuse, node, "use_diffuse");
-	xml_read_bool(&light->use_glossy, node, "use_glossy");
-	xml_read_bool(&light->use_transmission, node, "use_transmission");
-	xml_read_bool(&light->use_scatter, node, "use_scatter");
+	xml_read_node(state, light, node);

 	state.scene->lights.push_back(light);
 }
@@ -1163,17 +981,14 @@ static void xml_read_state(XMLReadState& state, pugi::xml_node node)
 	string shadername;

 	if(xml_read_string(&shadername, node, "shader")) {
-		int i = 0;
 		bool found = false;

 		foreach(Shader *shader, state.scene->shaders) {
 			if(shader->name == shadername) {
-				state.shader = i;
+				state.shader = shader;
 				found = true;
 				break;
 			}
-
-			i++;
 		}

 		if(!found)
@@ -1199,16 +1014,16 @@ static void xml_read_state(XMLReadState& state, pugi::xml_node node)

 /* Scene */

-static void xml_read_include(const XMLReadState& state, const string& src);
+static void xml_read_include(XMLReadState& state, const string& src);

-static void xml_read_scene(const XMLReadState& state, pugi::xml_node scene_node)
+static void xml_read_scene(XMLReadState& state, pugi::xml_node scene_node)
 {
 	for(pugi::xml_node node = scene_node.first_child(); node; node = node.next_sibling()) {
 		if(string_iequals(node.name(), "film")) {
-			xml_read_film(state, node);
+			xml_read_node(state, state.scene->film, node);
 		}
 		else if(string_iequals(node.name(), "integrator")) {
-			xml_read_integrator(state, node);
+			xml_read_node(state, state.scene->integrator, node);
 		}
 		else if(string_iequals(node.name(), "camera")) {
 			xml_read_camera(state, node);
@@ -1253,7 +1068,7 @@ static void xml_read_scene(const XMLReadState& state, pugi::xml_node scene_node)

 /* Include */

-static void xml_read_include(const XMLReadState& state, const string& src)
+static void xml_read_include(XMLReadState& state, const string& src)
 {
 	/* open XML document */
 	pugi::xml_document doc;
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -1,5 +1,6 @@

 set(INC
+	../graph
 	../render
 	../device
 	../kernel
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -193,57 +193,57 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.aa_samples = IntProperty(
                name="AA Samples",
                description="Number of antialiasing samples to render for each pixel",
-                min=1, max=10000,
+                min=1, max=2097151,
                default=4,
                )
        cls.preview_aa_samples = IntProperty(
                name="AA Samples",
                description="Number of antialiasing samples to render in the viewport, unlimited if 0",
-                min=0, max=10000,
+                min=0, max=2097151,
                default=4,
                )
        cls.diffuse_samples = IntProperty(
                name="Diffuse Samples",
                description="Number of diffuse bounce samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )
        cls.glossy_samples = IntProperty(
                name="Glossy Samples",
                description="Number of glossy bounce samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )
        cls.transmission_samples = IntProperty(
                name="Transmission Samples",
                description="Number of transmission bounce samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )
        cls.ao_samples = IntProperty(
                name="Ambient Occlusion Samples",
                description="Number of ambient occlusion samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )
        cls.mesh_light_samples = IntProperty(
                name="Mesh Light Samples",
                description="Number of mesh emission light samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )

        cls.subsurface_samples = IntProperty(
                name="Subsurface Samples",
                description="Number of subsurface scattering samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )

        cls.volume_samples = IntProperty(
                name="Volume Samples",
                description="Number of volume scattering samples to render for each AA sample",
-                min=1, max=10000,
+                min=1, max=1024,
                default=1,
                )

@@ -359,7 +359,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                description="Distance between volume shader samples when rendering the volume "
                            "(lower values give more accurate and detailed results, but also increased render time)",
                default=0.1,
-                min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0
+                min=0.0000001, max=100000.0, soft_min=0.01, soft_max=1.0, precision=4
                )

        cls.volume_max_steps = IntProperty(
@@ -594,6 +594,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
        cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)

+        cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+
        cls.debug_opencl_kernel_type = EnumProperty(
            name="OpenCL Kernel Type",
            default='DEFAULT',
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -76,9 +76,8 @@ def use_cuda(context):

 def use_branched_path(context):
    cscene = context.scene.cycles
-    device_type = context.user_preferences.system.compute_device_type

-    return (cscene.progressive == 'BRANCHED_PATH' and device_type != 'OPENCL')
+    return (cscene.progressive == 'BRANCHED_PATH' and not use_opencl(context))


 def use_sample_all_lights(context):
@@ -704,7 +703,7 @@ class Cycles_PT_mesh_displacement(CyclesButtonsPanel, Panel):

        col = split.column()
        sub = col.column(align=True)
-        sub.label(text="Displacment:")
+        sub.label(text="Displacement:")
        sub.prop(cdata, "displacement_method", text="")

        col = split.column()
@@ -1553,6 +1552,10 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
        row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
        col.prop(cscene, "debug_use_qbvh")

+        col = layout.column()
+        col.label('CUDA Flags:')
+        col.prop(cscene, "debug_use_cuda_adaptive_compile")
+
        col = layout.column()
        col.label('OpenCL Flags:')
        col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -37,7 +37,7 @@ struct BlenderCamera {
 	float lens;
 	float shuttertime;
 	Camera::MotionPosition motion_position;
-	float shutter_curve[RAMP_TABLE_SIZE];
+	array<float> shutter_curve;

 	Camera::RollingShutterType rolling_shutter_type;
 	float rolling_shutter_duration;
@@ -65,6 +65,9 @@ struct BlenderCamera {
 	bool use_spherical_stereo;
 	float interocular_distance;
 	float convergence_distance;
+	bool use_pole_merge;
+	float pole_merge_angle_from;
+	float pole_merge_angle_to;

 	enum { AUTO, HORIZONTAL, VERTICAL } sensor_fit;
 	float sensor_width;
@@ -105,10 +108,6 @@ static void blender_camera_init(BlenderCamera *bcam,
 	/* render resolution */
 	bcam->full_width = render_resolution_x(b_render);
 	bcam->full_height = render_resolution_y(b_render);
-
-	/* pixel aspect */
-	bcam->pixelaspect.x = b_render.pixel_aspect_x();
-	bcam->pixelaspect.y = b_render.pixel_aspect_y();
 }

 static float blender_camera_focal_distance(BL::RenderEngine& b_engine,
@@ -183,6 +182,10 @@ static void blender_camera_from_object(BlenderCamera *bcam,
 		}
 		bcam->use_spherical_stereo = b_engine.use_spherical_stereo(b_ob);

+		bcam->use_pole_merge = b_camera.stereo().use_pole_merge();
+		bcam->pole_merge_angle_from = b_camera.stereo().pole_merge_angle_from();
+		bcam->pole_merge_angle_to = b_camera.stereo().pole_merge_angle_to();
+
 		bcam->ortho_scale = b_camera.ortho_scale();

 		bcam->lens = b_camera.lens();
@@ -427,6 +430,10 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 			cam->stereo_eye = Camera::STEREO_NONE;
 	}

+	cam->use_pole_merge = bcam->use_pole_merge;
+	cam->pole_merge_angle_from = bcam->pole_merge_angle_from;
+	cam->pole_merge_angle_to = bcam->pole_merge_angle_to;
+
 	/* anamorphic lens bokeh */
 	cam->aperture_ratio = bcam->aperture_ratio;

@@ -453,7 +460,7 @@ static void blender_camera_sync(Camera *cam, BlenderCamera *bcam, int width, int
 	cam->rolling_shutter_type = bcam->rolling_shutter_type;
 	cam->rolling_shutter_duration = bcam->rolling_shutter_duration;

-	memcpy(cam->shutter_curve, bcam->shutter_curve, sizeof(cam->shutter_curve));
+	cam->shutter_curve = bcam->shutter_curve;

 	/* border */
 	cam->border = bcam->border;
@@ -552,6 +559,10 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
 		float aspectratio, sensor_size;
 		blender_camera_init(&bcam, b_render);

+		/* TODO(sergey): Consider making it a part of blender_camera_init(). */
+		bcam.pixelaspect.x = b_render.pixel_aspect_x();
+		bcam.pixelaspect.y = b_render.pixel_aspect_y();
+
 		blender_camera_from_object(&bcam, b_engine, b_ob);
 		blender_camera_viewplane(&bcam,
 		                         width, height,
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -138,8 +138,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 			BL::ParticleSettings b_part((const PointerRNA)b_psys.settings().ptr);

 			if((b_part.render_type() == BL::ParticleSettings::render_type_PATH) && (b_part.type() == BL::ParticleSettings::type_HAIR)) {
-				int mi = clamp(b_part.material()-1, 0, mesh->used_shaders.size()-1);
-				int shader = mesh->used_shaders[mi];
+				int shader = clamp(b_part.material()-1, 0, mesh->used_shaders.size()-1);
 				int draw_step = background ? b_part.render_step() : b_part.draw_step();
 				int totparts = b_psys.particles.length();
 				int totchild = background ? b_psys.child_particles.length() : (int)((float)b_psys.child_particles.length() * (float)b_part.draw_percentage() / 100.0f);
@@ -157,16 +156,16 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par

 				PointerRNA cpsys = RNA_pointer_get(&b_part.ptr, "cycles");

-				CData->psys_firstcurve.push_back(curvenum);
-				CData->psys_curvenum.push_back(totcurves);
-				CData->psys_shader.push_back(shader);
+				CData->psys_firstcurve.push_back_slow(curvenum);
+				CData->psys_curvenum.push_back_slow(totcurves);
+				CData->psys_shader.push_back_slow(shader);

 				float radius = get_float(cpsys, "radius_scale") * 0.5f;
 	
-				CData->psys_rootradius.push_back(radius * get_float(cpsys, "root_width"));
-				CData->psys_tipradius.push_back(radius * get_float(cpsys, "tip_width"));
-				CData->psys_shape.push_back(get_float(cpsys, "shape"));
-				CData->psys_closetip.push_back(get_boolean(cpsys, "use_closetip"));
+				CData->psys_rootradius.push_back_slow(radius * get_float(cpsys, "root_width"));
+				CData->psys_tipradius.push_back_slow(radius * get_float(cpsys, "tip_width"));
+				CData->psys_shape.push_back_slow(get_float(cpsys, "shape"));
+				CData->psys_closetip.push_back_slow(get_boolean(cpsys, "use_closetip"));

 				int pa_no = 0;
 				if(!(b_part.child_type() == 0) && totchild != 0)
@@ -181,7 +180,7 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par

 				for(; pa_no < totparts+totchild; pa_no++) {
 					int keynum = 0;
-					CData->curve_firstkey.push_back(keyno);
+					CData->curve_firstkey.push_back_slow(keyno);
 					
 					float curve_length = 0.0f;
 					float3 pcKey;
@@ -196,15 +195,15 @@ bool ObtainCacheParticleData(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 								continue;
 							curve_length += step_length;
 						}
-						CData->curvekey_co.push_back(cKey);
-						CData->curvekey_time.push_back(curve_length);
+						CData->curvekey_co.push_back_slow(cKey);
+						CData->curvekey_time.push_back_slow(curve_length);
 						pcKey = cKey;
 						keynum++;
 					}
 					keyno += keynum;

-					CData->curve_keynum.push_back(keynum);
-					CData->curve_length.push_back(curve_length);
+					CData->curve_keynum.push_back_slow(keynum);
+					CData->curve_length.push_back_slow(curve_length);
 					curvenum++;
 				}
 			}
@@ -256,7 +255,7 @@ bool ObtainCacheParticleUV(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Parti
 					float3 uv = make_float3(0.0f, 0.0f, 0.0f);
 					if(b_mesh->tessface_uv_textures.length())
 						b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x);
-					CData->curve_uv.push_back(uv);
+					CData->curve_uv.push_back_slow(uv);

 					if(pa_no < totparts && b_pa != b_psys.particles.end())
 						++b_pa;
@@ -310,7 +309,7 @@ bool ObtainCacheParticleVcol(Mesh *mesh, BL::Mesh *b_mesh, BL::Object *b_ob, Par
 					float3 vcol = make_float3(0.0f, 0.0f, 0.0f);
 					if(b_mesh->tessface_vertex_colors.length())
 						b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
-					CData->curve_vcol.push_back(vcol);
+					CData->curve_vcol.push_back_slow(vcol);

 					if(pa_no < totparts && b_pa != b_psys.particles.end())
 						++b_pa;
@@ -352,10 +351,7 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}

-	mesh->verts.reserve(mesh->verts.size() + numverts);
-	mesh->triangles.reserve(mesh->triangles.size() + numtris);
-	mesh->shader.reserve(mesh->shader.size() + numtris);
-	mesh->smooth.reserve(mesh->smooth.size() + numtris);
+	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);

 	/* actually export */
 	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
@@ -375,8 +371,8 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 				xbasis = normalize(cross(RotCam - ickey_loc, v1));
 			float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
 			float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
-			mesh->verts.push_back(ickey_loc_shfl);
-			mesh->verts.push_back(ickey_loc_shfr);
+			mesh->add_vertex(ickey_loc_shfl);
+			mesh->add_vertex(ickey_loc_shfr);
 			vertexindex += 2;

 			for(int curvekey = CData->curve_firstkey[curve] + 1; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve]; curvekey++) {
@@ -402,8 +398,8 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 					xbasis = normalize(cross(RotCam - ickey_loc, v1));
 				float3 ickey_loc_shfl = ickey_loc - radius * xbasis;
 				float3 ickey_loc_shfr = ickey_loc + radius * xbasis;
-				mesh->verts.push_back(ickey_loc_shfl);
-				mesh->verts.push_back(ickey_loc_shfr);
+				mesh->add_vertex(ickey_loc_shfl);
+				mesh->add_vertex(ickey_loc_shfr);
 				mesh->add_triangle(vertexindex-2, vertexindex, vertexindex-1, CData->psys_shader[sys], true);
 				mesh->add_triangle(vertexindex+1, vertexindex-1, vertexindex, CData->psys_shader[sys], true);
 				vertexindex += 2;
@@ -411,7 +407,6 @@ void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}

-	mesh->reserve(mesh->verts.size(), mesh->triangles.size(), 0, 0);
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -438,10 +433,7 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 		}
 	}

-	mesh->verts.reserve(mesh->verts.size() + numverts);
-	mesh->triangles.reserve(mesh->triangles.size() + numtris);
-	mesh->shader.reserve(mesh->shader.size() + numtris);
-	mesh->smooth.reserve(mesh->smooth.size() + numtris);
+	mesh->reserve_mesh(mesh->verts.size() + numverts, mesh->num_triangles() + numtris);

 	/* actually export */
 	for(int sys = 0; sys < CData->psys_firstcurve.size() ; sys++) {
@@ -530,7 +522,7 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 					float angle = M_2PI_F / (float)resolution;
 					for(int section = 0; section < resolution; section++) {
 						float3 ickey_loc_shf = ickey_loc + radius * (cosf(angle * section) * xbasis + sinf(angle * section) * ybasis);
-						mesh->verts.push_back(ickey_loc_shf);
+						mesh->add_vertex(ickey_loc_shf);
 					}

 					if(subv != 0) {
@@ -547,7 +539,6 @@ void ExportCurveTriangleGeometry(Mesh *mesh, ParticleCurveData *CData, int resol
 		}
 	}

-	mesh->reserve(mesh->verts.size(), mesh->triangles.size(), 0, 0);
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -562,7 +553,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 	int num_keys = 0;
 	int num_curves = 0;

-	if(!(mesh->curves.empty() && mesh->curve_keys.empty()))
+	if(mesh->num_curves())
 		return;

 	Attribute *attr_intercept = NULL;
@@ -585,8 +576,7 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 		VLOG(1) << "Exporting curve segments for mesh " << mesh->name;
 	}

-	mesh->curve_keys.reserve(mesh->curve_keys.size() + num_keys);
-	mesh->curves.reserve(mesh->curves.size() + num_curves);
+	mesh->reserve_curves(mesh->num_curves() + num_curves, mesh->curve_keys.size() + num_keys);

 	num_keys = 0;
 	num_curves = 0;
@@ -614,18 +604,16 @@ void ExportCurveSegments(Scene *scene, Mesh *mesh, ParticleCurveData *CData)
 				num_curve_keys++;
 			}

-			mesh->add_curve(num_keys, num_curve_keys, CData->psys_shader[sys]);
+			mesh->add_curve(num_keys, CData->psys_shader[sys]);
 			num_keys += num_curve_keys;
 			num_curves++;
 		}
 	}

 	/* check allocation */
-	if((mesh->curve_keys.size() != num_keys) || (mesh->curves.size() != num_curves)) {
+	if((mesh->curve_keys.size() != num_keys) || (mesh->num_curves() != num_curves)) {
 		VLOG(1) << "Allocation failed, clearing data";
-		mesh->curve_keys.clear();
-		mesh->curves.clear();
-		mesh->curve_attributes.clear();
+		mesh->clear();
 	}
 }

@@ -668,13 +656,16 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 					if(CData->psys_closetip[sys] && (curvekey == CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1))
 						radius = 0.0f;

+					/* curve motion keys store both position and radius in float4 */
 					mP[i] = float3_to_float4(ickey_loc);
 					mP[i].w = radius;

 					/* unlike mesh coordinates, these tend to be slightly different
 					 * between frames due to particle transforms into/out of object
 					 * space, so we use an epsilon to detect actual changes */
-					if(len_squared(mP[i] - mesh->curve_keys[i]) > 1e-5f*1e-5f)
+					float4 curve_key = float3_to_float4(mesh->curve_keys[i]);
+					curve_key.w = mesh->curve_radius[i];
+					if(len_squared(mP[i] - curve_key) > 1e-5f*1e-5f)
 						have_motion = true;
 				}

@@ -698,8 +689,10 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 			for(int step = 0; step < time_index; step++) {
 				float4 *mP = attr_mP->data_float4() + step*numkeys;

-				for(int key = 0; key < numkeys; key++)
-					mP[key] = mesh->curve_keys[key];
+				for(int key = 0; key < numkeys; key++) {
+					mP[key] = float3_to_float4(mesh->curve_keys[key]);
+					mP[key].w = mesh->curve_radius[key];
+				}
 			}
 		}
 	}
@@ -873,7 +866,9 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	if(!motion) {
 		/* Clear stored curve data */
 		mesh->curve_keys.clear();
-		mesh->curves.clear();
+		mesh->curve_radius.clear();
+		mesh->curve_first_key.clear();
+		mesh->curve_shader.clear();
 		mesh->curve_attributes.clear();
 	}

@@ -890,7 +885,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	int triangle_method = scene->curve_system_manager->triangle_method;
 	int resolution = scene->curve_system_manager->resolution;
 	size_t vert_num = mesh->verts.size();
-	size_t tri_num = mesh->triangles.size();
+	size_t tri_num = mesh->num_triangles();
 	int used_res = 1;

 	/* extract particle hair data - should be combined with connecting to mesh later*/
@@ -951,11 +946,10 @@ void BlenderSync::sync_curves(Mesh *mesh,
 			else {
 				Attribute *attr_generated = mesh->curve_attributes.add(ATTR_STD_GENERATED);
 				float3 *generated = attr_generated->data_float3();
-				size_t i = 0;

-				foreach(Mesh::Curve& curve, mesh->curves) {
-					float3 co = float4_to_float3(mesh->curve_keys[curve.first_key]);
-					generated[i++] = co*size - loc;
+				for(size_t i = 0; i < mesh->num_curves(); i++) {
+					float3 co = mesh->curve_keys[mesh->get_curve(i).first_key];
+					generated[i] = co*size - loc;
 				}
 			}
 		}
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -532,7 +532,7 @@ static void attr_create_pointiness(Scene *scene,
 static void create_mesh(Scene *scene,
                        Mesh *mesh,
                        BL::Mesh& b_mesh,
-                        const vector<uint>& used_shaders)
+                        const vector<Shader*>& used_shaders)
 {
 	/* count vertices and faces */
 	int numverts = b_mesh.vertices.length();
@@ -548,13 +548,12 @@ static void create_mesh(Scene *scene,
 		numtris += (vi[3] == 0)? 1: 2;
 	}

-	/* reserve memory */
-	mesh->reserve(numverts, numtris, 0, 0);
+	/* allocate memory */
+	mesh->reserve_mesh(numverts, numtris);

 	/* create vertex coordinates and normals */
-	int i = 0;
-	for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i)
-		mesh->verts[i] = get_float3(v->co());
+	for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v)
+		mesh->add_vertex(get_float3(v->co()));

 	Attribute *attr_N = mesh->attributes.add(ATTR_STD_VERTEX_NORMAL);
 	float3 *N = attr_N->data_float3();
@@ -583,13 +582,12 @@ static void create_mesh(Scene *scene,
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
-	int fi = 0, ti = 0;
+	int fi = 0;

 	for(b_mesh.tessfaces.begin(f); f != b_mesh.tessfaces.end(); ++f, ++fi) {
 		int4 vi = get_int4(f->vertices_raw());
 		int n = (vi[3] == 0)? 3: 4;
-		int mi = clamp(f->material_index(), 0, used_shaders.size()-1);
-		int shader = used_shaders[mi];
+		int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 		bool smooth = f->use_smooth() || use_loop_normals;

 		/* split vertices if normal is different
@@ -619,18 +617,18 @@ static void create_mesh(Scene *scene,
 			   is_zero(cross(mesh->verts[vi[2]] - mesh->verts[vi[0]], mesh->verts[vi[3]] - mesh->verts[vi[0]])))
 			{
 				// TODO(mai): order here is probably wrong
-				mesh->set_triangle(ti++, vi[0], vi[1], vi[3], shader, smooth, true);
-				mesh->set_triangle(ti++, vi[2], vi[3], vi[1], shader, smooth, true);
+				mesh->add_triangle(vi[0], vi[1], vi[3], shader, smooth, true);
+				mesh->add_triangle(vi[2], vi[3], vi[1], shader, smooth, true);
 				face_flags[fi] |= FACE_FLAG_DIVIDE_24;
 			}
 			else {
-				mesh->set_triangle(ti++, vi[0], vi[1], vi[2], shader, smooth, true);
-				mesh->set_triangle(ti++, vi[0], vi[2], vi[3], shader, smooth, true);
+				mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth, true);
+				mesh->add_triangle(vi[0], vi[2], vi[3], shader, smooth, true);
 				face_flags[fi] |= FACE_FLAG_DIVIDE_13;
 			}
 		}
 		else
-			mesh->set_triangle(ti++, vi[0], vi[1], vi[2], shader, smooth, false);
+			mesh->add_triangle(vi[0], vi[1], vi[2], shader, smooth, false);

 		nverts[fi] = n;
 	}
@@ -660,14 +658,14 @@ static void create_subd_mesh(Scene *scene,
                             BL::Object& b_ob,
                             BL::Mesh& b_mesh,
                             PointerRNA *cmesh,
-                             const vector<uint>& used_shaders,
+                             const vector<Shader*>& used_shaders,
                             float dicing_rate,
                             int max_subdivisions)
 {
 	Mesh basemesh;
 	create_mesh(scene, &basemesh, b_mesh, used_shaders);

-	SubdParams sdparams(mesh, used_shaders[0], true, false);
+	SubdParams sdparams(mesh, 0, true, false);
 	sdparams.dicing_rate = max(0.1f, RNA_float_get(cmesh, "dicing_rate") * dicing_rate);
 	sdparams.max_level = max_subdivisions;

@@ -700,7 +698,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	BL::Material material_override = render_layer.material_override;

 	/* find shader indices */
-	vector<uint> used_shaders;
+	vector<Shader*> used_shaders;

 	BL::Object::material_slots_iterator slot;
 	for(b_ob.material_slots.begin(slot); slot != b_ob.material_slots.end(); ++slot) {
@@ -742,8 +740,8 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 			 * because the shader needs different mesh attributes */
 			bool attribute_recalc = false;

-			foreach(uint shader, mesh->used_shaders)
-				if(scene->shaders[shader]->need_update_attributes)
+			foreach(Shader *shader, mesh->used_shaders)
+				if(shader->need_update_attributes)
 					attribute_recalc = true;

 			if(!attribute_recalc)
@@ -760,11 +758,12 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	/* create derived mesh */
 	PointerRNA cmesh = RNA_pointer_get(&b_ob_data.ptr, "cycles");

-	vector<Mesh::Triangle> oldtriangle = mesh->triangles;
+	array<int> oldtriangle = mesh->triangles;
 	
 	/* compares curve_keys rather than strands in order to handle quick hair
 	 * adjustments in dynamic BVH - other methods could probably do this better*/
-	vector<float4> oldcurve_keys = mesh->curve_keys;
+	array<float3> oldcurve_keys = mesh->curve_keys;
+	array<float> oldcurve_radius = mesh->curve_radius;

 	mesh->clear();
 	mesh->used_shaders = used_shaders;
@@ -828,14 +827,21 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 	if(oldtriangle.size() != mesh->triangles.size())
 		rebuild = true;
 	else if(oldtriangle.size()) {
-		if(memcmp(&oldtriangle[0], &mesh->triangles[0], sizeof(Mesh::Triangle)*oldtriangle.size()) != 0)
+		if(memcmp(&oldtriangle[0], &mesh->triangles[0], sizeof(int)*oldtriangle.size()) != 0)
 			rebuild = true;
 	}

 	if(oldcurve_keys.size() != mesh->curve_keys.size())
 		rebuild = true;
 	else if(oldcurve_keys.size()) {
-		if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(float4)*oldcurve_keys.size()) != 0)
+		if(memcmp(&oldcurve_keys[0], &mesh->curve_keys[0], sizeof(float3)*oldcurve_keys.size()) != 0)
+			rebuild = true;
+	}
+
+	if(oldcurve_radius.size() != mesh->curve_radius.size())
+		rebuild = true;
+	else if(oldcurve_radius.size()) {
+		if(memcmp(&oldcurve_radius[0], &mesh->curve_radius[0], sizeof(float)*oldcurve_radius.size()) != 0)
 			rebuild = true;
 	}
 	
@@ -932,8 +938,8 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 			Attribute *attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);

 			if(attr_mP) {
-				float4 *keys = &mesh->curve_keys[0];
-				memcpy(attr_mP->data_float4() + time_index*numkeys, keys, sizeof(float4)*numkeys);
+				float3 *keys = &mesh->curve_keys[0];
+				memcpy(attr_mP->data_float3() + time_index*numkeys, keys, sizeof(float3)*numkeys);
 			}
 		}

--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -155,13 +155,8 @@ void BlenderSync::sync_light(BL::Object& b_parent,
 	light->dir = -transform_get_column(&tfm, 2);

 	/* shader */
-	vector<uint> used_shaders;
-
+	vector<Shader*> used_shaders;
 	find_shader(b_lamp, used_shaders, scene->default_light);
-
-	if(used_shaders.size() == 0)
-		used_shaders.push_back(scene->default_light);
-
 	light->shader = used_shaders[0];

 	/* shadow */
@@ -370,13 +365,12 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	}

 	/* make holdout objects on excluded layer invisible for non-camera rays */
-	if(use_holdout && (layer_flag & render_layer.exclude_layer))
+	if(use_holdout && (layer_flag & render_layer.exclude_layer)) {
 		visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
+	}

-	/* camera flag is not actually used, instead is tested against render layer
-	 * flags */
-	if(visibility & PATH_RAY_CAMERA) {
-		visibility |= layer_flag << PATH_RAY_LAYER_SHIFT;
+	/* hide objects not on render layer from camera rays */
+	if(!(layer_flag & render_layer.layer)) {
 		visibility &= ~PATH_RAY_CAMERA;
 	}

@@ -577,7 +571,6 @@ void BlenderSync::sync_objects(BL::SpaceView3D& b_v3d, float motion_time)
 			bool hide = (render_layer.use_viewport_visibility)? b_ob.hide(): b_ob.hide_render();
 			uint ob_layer = get_layer(b_base->layers(),
 			                          b_base->layers_local_view(),
-			                          render_layer.use_localview,
 			                          object_is_light(b_ob),
 			                          scene_layers);
 			hide = hide || !(ob_layer & scene_layer);
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -76,7 +76,7 @@ bool BlenderSync::sync_dupli_particle(BL::Object& b_ob,
 	pa.velocity = get_float3(b_pa.velocity());
 	pa.angular_velocity = get_float3(b_pa.angular_velocity());

-	psys->particles.push_back(pa);
+	psys->particles.push_back_slow(pa);

 	if(object->particle_index != psys->particles.size() - 1)
 		scene->object_manager->tag_update(scene);
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -70,6 +70,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
 	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+	/* Synchronize CUDA flags. */
+	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -473,7 +473,7 @@ void BlenderSession::render()
 		BL::RenderLayer b_rlay = *b_single_rlay;

 		/* add passes */
-		vector<Pass> passes;
+		array<Pass> passes;
 		Pass::add(PASS_COMBINED, passes);

 		if(session_params.device.advanced_shading) {
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -32,23 +32,18 @@ CCL_NAMESPACE_BEGIN

 typedef map<void*, ShaderInput*> PtrInputMap;
 typedef map<void*, ShaderOutput*> PtrOutputMap;
-typedef map<std::string, ProxyNode*> ProxyMap;
+typedef map<std::string, ConvertNode*> ProxyMap;

 /* Find */

 void BlenderSync::find_shader(BL::ID& id,
-                              vector<uint>& used_shaders,
-                              int default_shader)
+                              vector<Shader*>& used_shaders,
+                              Shader *default_shader)
 {
-	Shader *shader = (id)? shader_map.find(id): scene->shaders[default_shader];
+	Shader *shader = (id)? shader_map.find(id): default_shader;

-	for(size_t i = 0; i < scene->shaders.size(); i++) {
-		if(scene->shaders[i] == shader) {
-			used_shaders.push_back(i);
-			scene->shaders[i]->tag_used(scene);
-			break;
-		}
-	}
+	used_shaders.push_back(shader);
+	shader->tag_used(scene);
 }

 /* RNA translation utilities */
@@ -132,82 +127,57 @@ static float3 get_node_output_vector(BL::Node& b_node, const string& name)
 	return make_float3(value[0], value[1], value[2]);
 }

-static ShaderSocketType convert_socket_type(BL::NodeSocket& b_socket)
+static SocketType::Type convert_socket_type(BL::NodeSocket& b_socket)
 {
 	switch(b_socket.type()) {
 		case BL::NodeSocket::type_VALUE:
-			return SHADER_SOCKET_FLOAT;
+			return SocketType::FLOAT;
 		case BL::NodeSocket::type_INT:
-			return SHADER_SOCKET_INT;
+			return SocketType::INT;
 		case BL::NodeSocket::type_VECTOR:
-			return SHADER_SOCKET_VECTOR;
+			return SocketType::VECTOR;
 		case BL::NodeSocket::type_RGBA:
-			return SHADER_SOCKET_COLOR;
+			return SocketType::COLOR;
 		case BL::NodeSocket::type_STRING:
-			return SHADER_SOCKET_STRING;
+			return SocketType::STRING;
 		case BL::NodeSocket::type_SHADER:
-			return SHADER_SOCKET_CLOSURE;
+			return SocketType::CLOSURE;
 		
 		default:
-			return SHADER_SOCKET_UNDEFINED;
+			return SocketType::UNDEFINED;
 	}
 }

-#ifdef WITH_OSL
-static ShaderSocketType convert_osl_socket_type(OSL::OSLQuery& query,
-                                                BL::NodeSocket& b_socket)
-{
-	ShaderSocketType socket_type = convert_socket_type(b_socket);
-	if(socket_type == SHADER_SOCKET_VECTOR) {
-		/* TODO(sergey): Do we need compatible_name() here? */
-		const OSL::OSLQuery::Parameter *param = query.getparam(b_socket.name());
-		assert(param != NULL);
-		if(param != NULL) {
-			if(param->type.vecsemantics == TypeDesc::POINT) {
-				socket_type = SHADER_SOCKET_POINT;
-			}
-			else if(param->type.vecsemantics == TypeDesc::NORMAL) {
-				socket_type = SHADER_SOCKET_NORMAL;
-			}
-		}
-	}
-
-	return socket_type;
-}
-#endif  /* WITH_OSL */
-
 static void set_default_value(ShaderInput *input,
                              BL::NodeSocket& b_sock,
                              BL::BlendData& b_data,
                              BL::ID& b_id)
 {
 	/* copy values for non linked inputs */
-	switch(input->type) {
-		case SHADER_SOCKET_FLOAT: {
+	switch(input->type()) {
+		case SocketType::FLOAT: {
 			input->set(get_float(b_sock.ptr, "default_value"));
 			break;
 		}
-		case SHADER_SOCKET_INT: {
-			input->set((float)get_int(b_sock.ptr, "default_value"));
+		case SocketType::INT: {
+			input->set(get_int(b_sock.ptr, "default_value"));
 			break;
 		}
-		case SHADER_SOCKET_COLOR: {
+		case SocketType::COLOR: {
 			input->set(float4_to_float3(get_float4(b_sock.ptr, "default_value")));
 			break;
 		}
-		case SHADER_SOCKET_NORMAL:
-		case SHADER_SOCKET_POINT:
-		case SHADER_SOCKET_VECTOR: {
+		case SocketType::NORMAL:
+		case SocketType::POINT:
+		case SocketType::VECTOR: {
 			input->set(get_float3(b_sock.ptr, "default_value"));
 			break;
 		}
-		case SHADER_SOCKET_STRING: {
+		case SocketType::STRING: {
 			input->set((ustring)blender_absolute_path(b_data, b_id, get_string(b_sock.ptr, "default_value")));
 			break;
 		}
-
-		case SHADER_SOCKET_CLOSURE:
-		case SHADER_SOCKET_UNDEFINED:
+		default:
 			break;
 	}
 }
@@ -291,7 +261,7 @@ static ShaderNode *add_node(Scene *scene,
 		RGBRampNode *ramp = new RGBRampNode();
 		BL::ShaderNodeValToRGB b_ramp_node(b_node);
 		BL::ColorRamp b_color_ramp(b_ramp_node.color_ramp());
-		colorramp_to_array(b_color_ramp, ramp->ramp, RAMP_TABLE_SIZE);
+		colorramp_to_array(b_color_ramp, ramp->ramp, ramp->ramp_alpha, RAMP_TABLE_SIZE);
 		ramp->interpolate = b_color_ramp.interpolation() != BL::ColorRamp::interpolation_CONSTANT;
 		node = ramp;
 	}
@@ -320,11 +290,7 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeMixRGB)) {
 		BL::ShaderNodeMixRGB b_mix_node(b_node);
 		MixNode *mix = new MixNode();
-		mix->type = MixNode::type_enum[b_mix_node.blend_type()];
-		/* Tag if it's Mix */
-		if(b_mix_node.blend_type() == 0) 
-			mix->special_type = SHADER_SPECIAL_TYPE_MIX_RGB;
-
+		mix->type = (NodeMix)b_mix_node.blend_type();
 		mix->use_clamp = b_mix_node.use_clamp();
 		node = mix;
 	}
@@ -350,27 +316,27 @@ static ShaderNode *add_node(Scene *scene,
 		node = new HSVNode();
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeRGBToBW)) {
-		node = new ConvertNode(SHADER_SOCKET_COLOR, SHADER_SOCKET_FLOAT);
+		node = new RGBToBWNode();
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeMath)) {
 		BL::ShaderNodeMath b_math_node(b_node);
 		MathNode *math = new MathNode();
-		math->type = MathNode::type_enum[b_math_node.operation()];
+		math->type = (NodeMath)b_math_node.operation();
 		math->use_clamp = b_math_node.use_clamp();
 		node = math;
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeVectorMath)) {
 		BL::ShaderNodeVectorMath b_vector_math_node(b_node);
 		VectorMathNode *vmath = new VectorMathNode();
-		vmath->type = VectorMathNode::type_enum[b_vector_math_node.operation()];
+		vmath->type = (NodeVectorMath)b_vector_math_node.operation();
 		node = vmath;
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeVectorTransform)) {
 		BL::ShaderNodeVectorTransform b_vector_transform_node(b_node);
 		VectorTransformNode *vtransform = new VectorTransformNode();
-		vtransform->type = VectorTransformNode::type_enum[b_vector_transform_node.vector_type()];
-		vtransform->convert_from = VectorTransformNode::convert_space_enum[b_vector_transform_node.convert_from()];
-		vtransform->convert_to = VectorTransformNode::convert_space_enum[b_vector_transform_node.convert_to()];
+		vtransform->type = (NodeVectorTransformType)b_vector_transform_node.vector_type();
+		vtransform->convert_from = (NodeVectorTransformConvertSpace)b_vector_transform_node.convert_from();
+		vtransform->convert_to = (NodeVectorTransformConvertSpace)b_vector_transform_node.convert_to();
 		node = vtransform;
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeNormal)) {
@@ -419,13 +385,13 @@ static ShaderNode *add_node(Scene *scene,

 		switch(b_aniso_node.distribution()) {
 			case BL::ShaderNodeBsdfAnisotropic::distribution_BECKMANN:
-				aniso->distribution = ustring("Beckmann");
+				aniso->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
 				break;
 			case BL::ShaderNodeBsdfAnisotropic::distribution_GGX:
-				aniso->distribution = ustring("GGX");
+				aniso->distribution = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
 				break;
 			case BL::ShaderNodeBsdfAnisotropic::distribution_ASHIKHMIN_SHIRLEY:
-				aniso->distribution = ustring("Ashikhmin-Shirley");
+				aniso->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID;
 				break;
 		}

@@ -441,13 +407,13 @@ static ShaderNode *add_node(Scene *scene,

 		switch(b_subsurface_node.falloff()) {
 			case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC:
-				subsurface->closure = CLOSURE_BSSRDF_CUBIC_ID;
+				subsurface->falloff = CLOSURE_BSSRDF_CUBIC_ID;
 				break;
 			case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN:
-				subsurface->closure = CLOSURE_BSSRDF_GAUSSIAN_ID;
+				subsurface->falloff = CLOSURE_BSSRDF_GAUSSIAN_ID;
 				break;
 			case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY:
-				subsurface->closure = CLOSURE_BSSRDF_BURLEY_ID;
+				subsurface->falloff = CLOSURE_BSSRDF_BURLEY_ID;
 				break;
 		}

@@ -459,16 +425,16 @@ static ShaderNode *add_node(Scene *scene,
 		
 		switch(b_glossy_node.distribution()) {
 			case BL::ShaderNodeBsdfGlossy::distribution_SHARP:
-				glossy->distribution = ustring("Sharp");
+				glossy->distribution = CLOSURE_BSDF_REFLECTION_ID;
 				break;
 			case BL::ShaderNodeBsdfGlossy::distribution_BECKMANN:
-				glossy->distribution = ustring("Beckmann");
+				glossy->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
 				break;
 			case BL::ShaderNodeBsdfGlossy::distribution_GGX:
-				glossy->distribution = ustring("GGX");
+				glossy->distribution = CLOSURE_BSDF_MICROFACET_GGX_ID;
 				break;
 			case BL::ShaderNodeBsdfGlossy::distribution_ASHIKHMIN_SHIRLEY:
-				glossy->distribution = ustring("Ashikhmin-Shirley");
+				glossy->distribution = CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID;
 				break;
 		}
 		node = glossy;
@@ -478,13 +444,13 @@ static ShaderNode *add_node(Scene *scene,
 		GlassBsdfNode *glass = new GlassBsdfNode();
 		switch(b_glass_node.distribution()) {
 			case BL::ShaderNodeBsdfGlass::distribution_SHARP:
-				glass->distribution = ustring("Sharp");
+				glass->distribution = CLOSURE_BSDF_SHARP_GLASS_ID;
 				break;
 			case BL::ShaderNodeBsdfGlass::distribution_BECKMANN:
-				glass->distribution = ustring("Beckmann");
+				glass->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID;
 				break;
 			case BL::ShaderNodeBsdfGlass::distribution_GGX:
-				glass->distribution = ustring("GGX");
+				glass->distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
 				break;
 		}
 		node = glass;
@@ -494,13 +460,13 @@ static ShaderNode *add_node(Scene *scene,
 		RefractionBsdfNode *refraction = new RefractionBsdfNode();
 		switch(b_refraction_node.distribution()) {
 			case BL::ShaderNodeBsdfRefraction::distribution_SHARP:
-				refraction->distribution = ustring("Sharp");
+				refraction->distribution = CLOSURE_BSDF_REFRACTION_ID;
 				break;
 			case BL::ShaderNodeBsdfRefraction::distribution_BECKMANN:
-				refraction->distribution = ustring("Beckmann");
+				refraction->distribution = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
 				break;
 			case BL::ShaderNodeBsdfRefraction::distribution_GGX:
-				refraction->distribution = ustring("GGX");
+				refraction->distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
 				break;
 		}
 		node = refraction;
@@ -510,10 +476,10 @@ static ShaderNode *add_node(Scene *scene,
 		ToonBsdfNode *toon = new ToonBsdfNode();
 		switch(b_toon_node.component()) {
 			case BL::ShaderNodeBsdfToon::component_DIFFUSE:
-				toon->component = ustring("Diffuse");
+				toon->component = CLOSURE_BSDF_DIFFUSE_TOON_ID;
 				break;
 			case BL::ShaderNodeBsdfToon::component_GLOSSY:
-				toon->component = ustring("Glossy");
+				toon->component = CLOSURE_BSDF_GLOSSY_TOON_ID;
 				break;
 		}
 		node = toon;
@@ -523,10 +489,10 @@ static ShaderNode *add_node(Scene *scene,
 		HairBsdfNode *hair = new HairBsdfNode();
 		switch(b_hair_node.component()) {
 			case BL::ShaderNodeBsdfHair::component_Reflection:
-				hair->component = ustring("Reflection");
+				hair->component = CLOSURE_BSDF_HAIR_REFLECTION_ID;
 				break;
 			case BL::ShaderNodeBsdfHair::component_Transmission:
-				hair->component = ustring("Transmission");
+				hair->component = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
 				break;
 		}
 		node = hair;
@@ -593,62 +559,17 @@ static ShaderNode *add_node(Scene *scene,
 		if(scene->shader_manager->use_osl()) {
 			/* create script node */
 			BL::ShaderNodeScript b_script_node(b_node);
-			OSLScriptNode *script_node = new OSLScriptNode();

 			OSLShaderManager *manager = (OSLShaderManager*)scene->shader_manager;
 			string bytecode_hash = b_script_node.bytecode_hash();

-			/* Gather additional information from the shader, such as
-			 * input/output type info needed for proper node construction.
-			 */
-			OSL::OSLQuery query;
-
 			if(!bytecode_hash.empty()) {
-				query.open_bytecode(b_script_node.bytecode());
+				node = manager->osl_node("", bytecode_hash, b_script_node.bytecode());
 			}
 			else {
-				OSLShaderManager::osl_query(query, b_script_node.filepath());
+				string absolute_filepath = blender_absolute_path(b_data, b_ntree, b_script_node.filepath());
+				node = manager->osl_node(absolute_filepath, "");
 			}
-			/* TODO(sergey): Add proper query info error parsing. */
-
-			/* Generate inputs/outputs from node sockets
-			 *
-			 * Note: the node sockets are generated from OSL parameters,
-			 * so the names match those of the corresponding parameters exactly.
-			 *
-			 * Note 2: ShaderInput/ShaderOutput store shallow string copies only!
-			 * Socket names must be stored in the extra lists instead. */
-			BL::Node::inputs_iterator b_input;
-
-			for(b_script_node.inputs.begin(b_input); b_input != b_script_node.inputs.end(); ++b_input) {
-				script_node->input_names.push_back(ustring(b_input->name()));
-				ShaderInput *input = script_node->add_input(script_node->input_names.back().c_str(),
-				                                            convert_osl_socket_type(query, *b_input));
-				set_default_value(input, *b_input, b_data, b_ntree);
-			}
-
-			BL::Node::outputs_iterator b_output;
-
-			for(b_script_node.outputs.begin(b_output); b_output != b_script_node.outputs.end(); ++b_output) {
-				script_node->output_names.push_back(ustring(b_output->name()));
-				script_node->add_output(script_node->output_names.back().c_str(),
-				                        convert_osl_socket_type(query, *b_output));
-			}
-
-			/* load bytecode or filepath */
-			if(!bytecode_hash.empty()) {
-				/* loaded bytecode if not already done */
-				if(!manager->shader_test_loaded(bytecode_hash))
-					manager->shader_load_bytecode(bytecode_hash, b_script_node.bytecode());
-
-				script_node->bytecode_hash = bytecode_hash;
-			}
-			else {
-				/* set filepath */
-				script_node->filepath = blender_absolute_path(b_data, b_ntree, b_script_node.filepath());
-			}
-
-			node = script_node;
 		}
 #else
 		(void)b_data;
@@ -701,8 +622,8 @@ static ShaderNode *add_node(Scene *scene,
 				        get_image_extension(b_image_node));
 			}
 		}
-		image->color_space = ImageTextureNode::color_space_enum[(int)b_image_node.color_space()];
-		image->projection = ImageTextureNode::projection_enum[(int)b_image_node.projection()];
+		image->color_space = (NodeImageColorSpace)b_image_node.color_space();
+		image->projection = (NodeImageProjection)b_image_node.projection();
 		image->interpolation = get_image_interpolation(b_image_node);
 		image->extension = get_image_extension(b_image_node);
 		image->projection_blend = b_image_node.projection_blend();
@@ -732,10 +653,10 @@ static ShaderNode *add_node(Scene *scene,
 				env->filename = image_user_file_path(b_image_user,
 				                                     b_image,
 				                                     b_scene.frame_current());
-				env->animated = b_env_node.image_user().use_auto_refresh();
 				env->builtin_data = NULL;
 			}

+			env->animated = b_env_node.image_user().use_auto_refresh();
 			env->use_alpha = b_image.use_alpha();

 			/* TODO(sergey): Does not work properly when we change builtin type. */
@@ -747,9 +668,9 @@ static ShaderNode *add_node(Scene *scene,
 				        EXTENSION_REPEAT);
 			}
 		}
-		env->color_space = EnvironmentTextureNode::color_space_enum[(int)b_env_node.color_space()];
+		env->color_space = (NodeImageColorSpace)b_env_node.color_space();
 		env->interpolation = get_image_interpolation(b_env_node);
-		env->projection = EnvironmentTextureNode::projection_enum[(int)b_env_node.projection()];
+		env->projection = (NodeEnvironmentProjection)b_env_node.projection();
 		BL::TexMapping b_texture_mapping(b_env_node.texture_mapping());
 		get_tex_mapping(&env->tex_mapping, b_texture_mapping);
 		node = env;
@@ -757,7 +678,7 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeTexGradient)) {
 		BL::ShaderNodeTexGradient b_gradient_node(b_node);
 		GradientTextureNode *gradient = new GradientTextureNode();
-		gradient->type = GradientTextureNode::type_enum[(int)b_gradient_node.gradient_type()];
+		gradient->type = (NodeGradientType)b_gradient_node.gradient_type();
 		BL::TexMapping b_texture_mapping(b_gradient_node.texture_mapping());
 		get_tex_mapping(&gradient->tex_mapping, b_texture_mapping);
 		node = gradient;
@@ -765,7 +686,7 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeTexVoronoi)) {
 		BL::ShaderNodeTexVoronoi b_voronoi_node(b_node);
 		VoronoiTextureNode *voronoi = new VoronoiTextureNode();
-		voronoi->coloring = VoronoiTextureNode::coloring_enum[(int)b_voronoi_node.coloring()];
+		voronoi->coloring = (NodeVoronoiColoring)b_voronoi_node.coloring();
 		BL::TexMapping b_texture_mapping(b_voronoi_node.texture_mapping());
 		get_tex_mapping(&voronoi->tex_mapping, b_texture_mapping);
 		node = voronoi;
@@ -781,8 +702,8 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeTexWave)) {
 		BL::ShaderNodeTexWave b_wave_node(b_node);
 		WaveTextureNode *wave = new WaveTextureNode();
-		wave->type = WaveTextureNode::type_enum[(int)b_wave_node.wave_type()];
-		wave->profile = WaveTextureNode::profile_enum[(int)b_wave_node.wave_profile()];
+		wave->type = (NodeWaveType)b_wave_node.wave_type();
+		wave->profile = (NodeWaveProfile)b_wave_node.wave_profile();
 		BL::TexMapping b_texture_mapping(b_wave_node.texture_mapping());
 		get_tex_mapping(&wave->tex_mapping, b_texture_mapping);
 		node = wave;
@@ -815,7 +736,7 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeTexMusgrave)) {
 		BL::ShaderNodeTexMusgrave b_musgrave_node(b_node);
 		MusgraveTextureNode *musgrave = new MusgraveTextureNode();
-		musgrave->type = MusgraveTextureNode::type_enum[(int)b_musgrave_node.musgrave_type()];
+		musgrave->type = (NodeMusgraveType)b_musgrave_node.musgrave_type();
 		BL::TexMapping b_texture_mapping(b_musgrave_node.texture_mapping());
 		get_tex_mapping(&musgrave->tex_mapping, b_texture_mapping);
 		node = musgrave;
@@ -833,7 +754,7 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeTexSky)) {
 		BL::ShaderNodeTexSky b_sky_node(b_node);
 		SkyTextureNode *sky = new SkyTextureNode();
-		sky->type = SkyTextureNode::type_enum[(int)b_sky_node.sky_type()];
+		sky->type = (NodeSkyType)b_sky_node.sky_type();
 		sky->sun_direction = normalize(get_float3(b_sky_node.sun_direction()));
 		sky->turbidity = b_sky_node.turbidity();
 		sky->ground_albedo = b_sky_node.ground_albedo();
@@ -844,15 +765,15 @@ static ShaderNode *add_node(Scene *scene,
 	else if(b_node.is_a(&RNA_ShaderNodeNormalMap)) {
 		BL::ShaderNodeNormalMap b_normal_map_node(b_node);
 		NormalMapNode *nmap = new NormalMapNode();
-		nmap->space = NormalMapNode::space_enum[(int)b_normal_map_node.space()];
+		nmap->space = (NodeNormalMapSpace)b_normal_map_node.space();
 		nmap->attribute = b_normal_map_node.uv_map();
 		node = nmap;
 	}
 	else if(b_node.is_a(&RNA_ShaderNodeTangent)) {
 		BL::ShaderNodeTangent b_tangent_node(b_node);
 		TangentNode *tangent = new TangentNode();
-		tangent->direction_type = TangentNode::direction_type_enum[(int)b_tangent_node.direction_type()];
-		tangent->axis = TangentNode::axis_enum[(int)b_tangent_node.axis()];
+		tangent->direction_type = (NodeTangentDirectionType)b_tangent_node.direction_type();
+		tangent->axis = (NodeTangentAxis)b_tangent_node.axis();
 		tangent->attribute = b_tangent_node.uv_map();
 		node = tangent;
 	}
@@ -867,8 +788,7 @@ static ShaderNode *add_node(Scene *scene,
 		BL::ShaderNodeTexPointDensity b_point_density_node(b_node);
 		PointDensityTextureNode *point_density = new PointDensityTextureNode();
 		point_density->filename = b_point_density_node.name();
-		point_density->space =
-		        PointDensityTextureNode::space_enum[(int)b_point_density_node.space()];
+		point_density->space = (NodeTexVoxelSpace)b_point_density_node.space();
 		point_density->interpolation = get_image_interpolation(b_point_density_node);
 		point_density->builtin_data = b_point_density_node.ptr.data;

@@ -1029,7 +949,8 @@ static void add_nodes(Scene *scene,
 			BL::Node::internal_links_iterator b_link;
 			for(b_node->internal_links.begin(b_link); b_link != b_node->internal_links.end(); ++b_link) {
 				BL::NodeSocket to_socket(b_link->to_socket());
-				ProxyNode *proxy = new ProxyNode(convert_socket_type(to_socket));
+				SocketType::Type to_socket_type = convert_socket_type(to_socket);
+				ConvertNode *proxy = new ConvertNode(to_socket_type, to_socket_type, true);

 				input_map[b_link->from_socket().ptr.data] = proxy->inputs[0];
 				output_map[b_link->to_socket().ptr.data] = proxy->outputs[0];
@@ -1051,7 +972,8 @@ static void add_nodes(Scene *scene,
 			 * so that links have something to connect to and assert won't fail.
 			 */
 			for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
-				ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_input));
+				SocketType::Type input_type = convert_socket_type(*b_input);
+				ConvertNode *proxy = new ConvertNode(input_type, input_type, true);
 				graph->add(proxy);

 				/* register the proxy node for internal binding */
@@ -1062,7 +984,8 @@ static void add_nodes(Scene *scene,
 				set_default_value(proxy->inputs[0], *b_input, b_data, b_ntree);
 			}
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
-				ProxyNode *proxy = new ProxyNode(convert_socket_type(*b_output));
+				SocketType::Type output_type = convert_socket_type(*b_output);
+				ConvertNode *proxy = new ConvertNode(output_type, output_type, true);
 				graph->add(proxy);

 				/* register the proxy node for internal binding */
@@ -1088,7 +1011,7 @@ static void add_nodes(Scene *scene,
 			for(b_node->outputs.begin(b_output); b_output != b_node->outputs.end(); ++b_output) {
 				ProxyMap::const_iterator proxy_it = proxy_input_map.find(b_output->identifier());
 				if(proxy_it != proxy_input_map.end()) {
-					ProxyNode *proxy = proxy_it->second;
+					ConvertNode *proxy = proxy_it->second;

 					output_map[b_output->ptr.data] = proxy->outputs[0];
 				}
@@ -1102,7 +1025,7 @@ static void add_nodes(Scene *scene,
 				for(b_node->inputs.begin(b_input); b_input != b_node->inputs.end(); ++b_input) {
 					ProxyMap::const_iterator proxy_it = proxy_output_map.find(b_input->identifier());
 					if(proxy_it != proxy_output_map.end()) {
-						ProxyNode *proxy = proxy_it->second;
+						ConvertNode *proxy = proxy_it->second;

 						input_map[b_input->ptr.data] = proxy->inputs[0];

@@ -1208,7 +1131,7 @@ static void add_nodes(Scene *scene,

 void BlenderSync::sync_materials(bool update_all)
 {
-	shader_map.set_default(scene->shaders[scene->default_surface]);
+	shader_map.set_default(scene->default_surface);

 	/* material loop */
 	BL::BlendData::materials_iterator b_mat;
@@ -1233,7 +1156,7 @@ void BlenderSync::sync_materials(bool update_all)
 				ShaderNode *closure, *out;

 				closure = graph->add(new DiffuseBsdfNode());
-				closure->input("Color")->value = get_float3(b_mat->diffuse_color());
+				closure->input("Color")->set(get_float3(b_mat->diffuse_color()));
 				out = graph->output();

 				graph->connect(closure->output("BSDF"), out->input("Surface"));
@@ -1263,7 +1186,7 @@ void BlenderSync::sync_world(bool update_all)
 	BL::World b_world = b_scene.world();

 	if(world_recalc || update_all || b_world.ptr.data != world_map) {
-		Shader *shader = scene->shaders[scene->default_background];
+		Shader *shader = scene->default_background;
 		ShaderGraph *graph = new ShaderGraph();

 		/* create nodes */
@@ -1282,7 +1205,7 @@ void BlenderSync::sync_world(bool update_all)
 			ShaderNode *closure, *out;

 			closure = graph->add(new BackgroundNode());
-			closure->input("Color")->value = get_float3(b_world.horizon_color());
+			closure->input("Color")->set(get_float3(b_world.horizon_color()));
 			out = graph->output();

 			graph->connect(closure->output("Background"), out->input("Surface"));
@@ -1343,7 +1266,7 @@ void BlenderSync::sync_world(bool update_all)

 void BlenderSync::sync_lamps(bool update_all)
 {
-	shader_map.set_default(scene->shaders[scene->default_light]);
+	shader_map.set_default(scene->default_light);

 	/* lamp loop */
 	BL::BlendData::lamps_iterator b_lamp;
@@ -1375,8 +1298,8 @@ void BlenderSync::sync_lamps(bool update_all)
 				}

 				closure = graph->add(new EmissionNode());
-				closure->input("Color")->value = get_float3(b_lamp->color());
-				closure->input("Strength")->value.x = strength;
+				closure->input("Color")->set(get_float3(b_lamp->color()));
+				closure->input("Strength")->set(strength);
 				out = graph->output();

 				graph->connect(closure->output("Emission"), out->input("Surface"));
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -175,8 +175,8 @@ bool BlenderSync::sync_recalc()
 				world_recalc = true;
 			}
 			else if(b_world->node_tree() && b_world->use_nodes()) {
-				Shader *shader = scene->shaders[scene->default_background];
-				if(has_updated_objects && shader != NULL && shader->has_object_dependency) {
+				Shader *shader = scene->default_background;
+				if(has_updated_objects && shader->has_object_dependency) {
 					world_recalc = true;
 				}
 			}
@@ -269,8 +269,6 @@ void BlenderSync::sync_integrator()
 	        SAMPLING_NUM_PATTERNS,
 	        SAMPLING_PATTERN_SOBOL);

-	integrator->layer_flag = render_layer.layer;
-
 	integrator->sample_clamp_direct = get_float(cscene, "sample_clamp_direct");
 	integrator->sample_clamp_indirect = get_float(cscene, "sample_clamp_indirect");
 #ifdef __CAMERA_MOTION__
@@ -377,8 +375,7 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer)
 			layer = layername.c_str();
 		}
 		else {
-			render_layer.use_localview = (b_v3d.local_view() ? true : false);
-			render_layer.scene_layer = get_layer(b_v3d.layers(), b_v3d.layers_local_view(), render_layer.use_localview);
+			render_layer.scene_layer = get_layer(b_v3d.layers(), b_v3d.layers_local_view());
 			render_layer.layer = render_layer.scene_layer;
 			render_layer.exclude_layer = 0;
 			render_layer.holdout_layer = 0;
@@ -421,7 +418,6 @@ void BlenderSync::sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer)
 			render_layer.use_surfaces = b_rlay->use_solid();
 			render_layer.use_hair = b_rlay->use_strand();
 			render_layer.use_viewport_visibility = false;
-			render_layer.use_localview = false;

 			render_layer.bound_samples = (use_layer_samples == 1);
 			if(use_layer_samples != 2) {
@@ -631,9 +627,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	else
 		params.threads = 0;

-	params.cancel_timeout = get_float(cscene, "debug_cancel_timeout");
-	params.reset_timeout = get_float(cscene, "debug_reset_timeout");
-	params.text_timeout = get_float(cscene, "debug_text_timeout");
+	params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout");
+	params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout");
+	params.text_timeout = (double)get_float(cscene, "debug_text_timeout");

 	params.progressive_refine = get_boolean(cscene, "use_progressive_refine");

--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -146,7 +146,7 @@ private:
 	void sync_images();

 	/* util */
-	void find_shader(BL::ID& id, vector<uint>& used_shaders, int default_shader);
+	void find_shader(BL::ID& id, vector<Shader*>& used_shaders, Shader *default_shader);
 	bool BKE_object_is_modified(BL::Object& b_ob);
 	bool object_is_mesh(BL::Object& b_ob);
 	bool object_is_light(BL::Object& b_ob);
@@ -185,7 +185,6 @@ private:
 		  use_surfaces(true),
 		  use_hair(true),
 		  use_viewport_visibility(false),
-		  use_localview(false),
 		  samples(0), bound_samples(false)
 		{}

@@ -200,7 +199,6 @@ private:
 		bool use_surfaces;
 		bool use_hair;
 		bool use_viewport_visibility;
-		bool use_localview;
 		int samples;
 		bool bound_samples;
 	} render_layer;
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -58,14 +58,19 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 }

 static inline void colorramp_to_array(BL::ColorRamp& ramp,
-                                      float4 *data,
+                                      array<float3>& ramp_color,
+                                      array<float>& ramp_alpha,
                                      int size)
 {
+	ramp_color.resize(size);
+	ramp_alpha.resize(size);
+
 	for(int i = 0; i < size; i++) {
 		float color[4];

 		ramp.evaluate((float)i/(float)(size-1), color);
-		data[i] = make_float4(color[0], color[1], color[2], color[3]);
+		ramp_color[i] = make_float3(color[0], color[1], color[2]);
+		ramp_alpha[i] = color[3];
 	}
 }

@@ -93,11 +98,12 @@ static inline void curvemapping_minmax(/*const*/ BL::CurveMapping& cumap,
 }

 static inline void curvemapping_to_array(BL::CurveMapping& cumap,
-                                         float *data,
+                                         array<float>& data,
                                         int size)
 {
 	cumap.update();
 	BL::CurveMap curve = cumap.curves[0];
+	data.resize(size);
 	for(int i = 0; i < size; i++) {
 		float t = (float)i/(float)(size-1);
 		data[i] = curve.evaluate(t);
@@ -105,7 +111,7 @@ static inline void curvemapping_to_array(BL::CurveMapping& cumap,
 }

 static inline void curvemapping_color_to_array(BL::CurveMapping& cumap,
-                                               float4 *data,
+                                               array<float3>& data,
                                               int size,
                                               bool rgb_curve)
 {
@@ -132,6 +138,8 @@ static inline void curvemapping_color_to_array(BL::CurveMapping& cumap,
 	BL::CurveMap mapG = cumap.curves[1];
 	BL::CurveMap mapB = cumap.curves[2];

+	data.resize(size);
+
 	if(rgb_curve) {
 		BL::CurveMap mapI = cumap.curves[3];

@@ -268,7 +276,6 @@ static inline uint get_layer(const BL::Array<int, 20>& array)

 static inline uint get_layer(const BL::Array<int, 20>& array,
                             const BL::Array<int, 8>& local_array,
-                             bool use_local,
                             bool is_light = false,
                             uint scene_layers = (1 << 20) - 1)
 {
@@ -293,13 +300,6 @@ static inline uint get_layer(const BL::Array<int, 20>& array,
 				layer |= (1 << (20+i));
 	}

-	/* we don't have spare bits for localview (normally 20-28) because
-	 * PATH_RAY_LAYER_SHIFT uses 20-32. So - check if we have localview and if
-	 * so, shift local view bits down to 1-8, since this is done for the view
-	 * port only - it should be OK and not conflict with render layers. */
-	if(use_local)
-		layer >>= 20;
-
 	return layer;
 }

--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -1,6 +1,7 @@

 set(INC
 	.
+	../graph
 	../kernel
 	../kernel/svm
 	../render
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -128,11 +128,11 @@ void BVH::pack_triangle(int idx, float4 storage[3])
 	const Mesh *mesh = objects[tob]->mesh;

 	int tidx = pack.prim_index[idx];
-	const int *vidx = mesh->triangles[tidx].v;
+	Mesh::Triangle t = mesh->get_triangle(tidx);
 	const float3* vpos = &mesh->verts[0];
-	float3 v0 = vpos[vidx[0]];
-	float3 v1 = vpos[vidx[1]];
-	float3 v2 = vpos[vidx[2]];
+	float3 v0 = vpos[t.v[0]];
+	float3 v1 = vpos[t.v[1]];
+	float3 v2 = vpos[t.v[2]];

 	storage[0] = float3_to_float4(v0);
 	storage[1] = float3_to_float4(v1);
@@ -506,10 +506,10 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
 					/* curves */
 					int str_offset = (params.top_level)? mesh->curve_offset: 0;
-					const Mesh::Curve& curve = mesh->curves[pidx - str_offset];
+					Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
 					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);

-					curve.bounds_grow(k, &mesh->curve_keys[0], bbox);
+					curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);

 					visibility |= PATH_RAY_CURVE;

@@ -520,17 +520,17 @@ void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility
 						if(attr) {
 							size_t mesh_size = mesh->curve_keys.size();
 							size_t steps = mesh->motion_steps - 1;
-							float4 *key_steps = attr->data_float4();
+							float3 *key_steps = attr->data_float3();

 							for(size_t i = 0; i < steps; i++)
-								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
+								curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
 						}
 					}
 				}
 				else {
 					/* triangles */
 					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
-					const Mesh::Triangle& triangle = mesh->triangles[pidx - tri_offset];
+					Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
 					const float3 *vpos = &mesh->verts[0];

 					triangle.bounds_grow(vpos, bbox);
@@ -770,10 +770,10 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 				if(pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
 					/* Curves. */
 					int str_offset = (params.top_level)? mesh->curve_offset: 0;
-					const Mesh::Curve& curve = mesh->curves[pidx - str_offset];
+					Mesh::Curve curve = mesh->get_curve(pidx - str_offset);
 					int k = PRIMITIVE_UNPACK_SEGMENT(pack.prim_type[prim]);

-					curve.bounds_grow(k, &mesh->curve_keys[0], bbox);
+					curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bbox);

 					visibility |= PATH_RAY_CURVE;

@@ -784,17 +784,17 @@ void QBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 						if(attr) {
 							size_t mesh_size = mesh->curve_keys.size();
 							size_t steps = mesh->motion_steps - 1;
-							float4 *key_steps = attr->data_float4();
+							float3 *key_steps = attr->data_float3();

 							for(size_t i = 0; i < steps; i++)
-								curve.bounds_grow(k, key_steps + i*mesh_size, bbox);
+								curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bbox);
 						}
 					}
 				}
 				else {
 					/* Triangles. */
 					int tri_offset = (params.top_level)? mesh->tri_offset: 0;
-					const Mesh::Triangle& triangle = mesh->triangles[pidx - tri_offset];
+					Mesh::Triangle triangle = mesh->get_triangle(pidx - tri_offset);
 					const float3 *vpos = &mesh->verts[0];

 					triangle.bounds_grow(vpos, bbox);
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -117,8 +117,9 @@ void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh,
 	if(mesh->has_motion_blur())
 		attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);

-	for(uint j = 0; j < mesh->triangles.size(); j++) {
-		Mesh::Triangle t = mesh->triangles[j];
+	size_t num_triangles = mesh->num_triangles();
+	for(uint j = 0; j < num_triangles; j++) {
+		Mesh::Triangle t = mesh->get_triangle(j);
 		BoundBox bounds = BoundBox::empty;
 		PrimitiveType type = PRIMITIVE_TRIANGLE;

@@ -148,22 +149,23 @@ void BVHBuild::add_reference_mesh(BoundBox& root, BoundBox& center, Mesh *mesh,
 	if(mesh->has_motion_blur())
 		curve_attr_mP = mesh->curve_attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);

-	for(uint j = 0; j < mesh->curves.size(); j++) {
-		Mesh::Curve curve = mesh->curves[j];
+	size_t num_curves = mesh->num_curves();
+	for(uint j = 0; j < num_curves; j++) {
+		Mesh::Curve curve = mesh->get_curve(j);
 		PrimitiveType type = PRIMITIVE_CURVE;

 		for(int k = 0; k < curve.num_keys - 1; k++) {
 			BoundBox bounds = BoundBox::empty;
-			curve.bounds_grow(k, &mesh->curve_keys[0], bounds);
+			curve.bounds_grow(k, &mesh->curve_keys[0], &mesh->curve_radius[0], bounds);

 			/* motion curve */
 			if(curve_attr_mP) {
 				size_t mesh_size = mesh->curve_keys.size();
 				size_t steps = mesh->motion_steps - 1;
-				float4 *key_steps = curve_attr_mP->data_float4();
+				float3 *key_steps = curve_attr_mP->data_float3();

 				for(size_t i = 0; i < steps; i++)
-					curve.bounds_grow(k, key_steps + i*mesh_size, bounds);
+					curve.bounds_grow(k, key_steps + i*mesh_size, &mesh->curve_radius[0], bounds);

 				type = PRIMITIVE_MOTION_CURVE;
 			}
@@ -188,10 +190,10 @@ void BVHBuild::add_reference_object(BoundBox& root, BoundBox& center, Object *ob

 static size_t count_curve_segments(Mesh *mesh)
 {
-	size_t num = 0, num_curves = mesh->curves.size();
+	size_t num = 0, num_curves = mesh->num_curves();

 	for(size_t i = 0; i < num_curves; i++)
-		num += mesh->curves[i].num_keys - 1;
+		num += mesh->get_curve(i).num_keys - 1;
 	
 	return num;
 }
@@ -203,15 +205,18 @@ void BVHBuild::add_references(BVHRange& root)

 	foreach(Object *ob, objects) {
 		if(params.top_level) {
+			if(!ob->is_traceable()) {
+				continue;
+			}
 			if(!ob->mesh->is_instanced()) {
-				num_alloc_references += ob->mesh->triangles.size();
+				num_alloc_references += ob->mesh->num_triangles();
 				num_alloc_references += count_curve_segments(ob->mesh);
 			}
 			else
 				num_alloc_references++;
 		}
 		else {
-			num_alloc_references += ob->mesh->triangles.size();
+			num_alloc_references += ob->mesh->num_triangles();
 			num_alloc_references += count_curve_segments(ob->mesh);
 		}
 	}
@@ -224,6 +229,9 @@ void BVHBuild::add_references(BVHRange& root)

 	foreach(Object *ob, objects) {
 		if(params.top_level) {
+			if(!ob->is_traceable()) {
+				continue;
+			}
 			if(!ob->mesh->is_instanced())
 				add_reference_mesh(bounds, center, ob->mesh, i);
 			else
@@ -326,11 +334,11 @@ BVHNode* BVHBuild::run()
 			VLOG(1) << "BVH build statistics:\n"
 			        << "  Build time: " << time_dt() - build_start_time << "\n"
 			        << "  Total number of nodes: "
-			        << rootnode->getSubtreeSize(BVH_STAT_NODE_COUNT) << "\n"
+			        << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_NODE_COUNT)) << "\n"
 			        << "  Number of inner nodes: "
-			        << rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)  << "\n"
+			        << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_INNER_COUNT)) << "\n"
 			        << "  Number of leaf nodes: "
-			        << rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)  << "\n"
+			        << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_LEAF_COUNT)) << "\n"
 			        << "  Allocation slop factor: "
 			               << ((prim_type.capacity() != 0)
 			                       ? (float)prim_type.size() / prim_type.capacity()
@@ -607,8 +615,10 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
+
 	/* TODO(sergey): In theory we should be able to store references. */
-	vector<BVHReference, LeafStackAllocator> object_references;
+	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
+	vector<BVHReference, LeafReferenceStackAllocator> object_references;

 	uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
 	/* NOTE: Keep initializtion in sync with actual number of primitives. */
@@ -629,6 +639,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,

 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
+			if(ref.prim_type() & PRIMITIVE_ALL_CURVE) {
+				visibility[type_index] |= PATH_RAY_CURVE;
+			}
 			++num_new_prims;
 		}
 		else {
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -75,7 +75,7 @@ public:
 			m_visibility = 0; /* happens on build cancel */
 	}

-	InnerNode(const BoundBox& bounds)
+	explicit InnerNode(const BoundBox& bounds)
 	{
 		m_bounds = bounds;
 		m_visibility = 0;
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -40,7 +40,7 @@ struct BVHReferenceCompare {
 public:
 	int dim;

-	BVHReferenceCompare(int dim_)
+	explicit BVHReferenceCompare(int dim_)
 	{
 		dim = dim_;
 	}
@@ -125,7 +125,7 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
 		if(compare.compare(data[left], data[right]) > 0) {
 			swap(data[left], data[right]);
 		}
-		if (compare.compare(data[center], data[right]) > 0) {
+		if(compare.compare(data[center], data[right]) > 0) {
 			swap(data[center], data[right]);
 		}
 		swap(data[center], data[right - 1]);
@@ -184,7 +184,7 @@ void bvh_reference_sort(int start, int end, BVHReference *data, int dim)
 	}
 	else {
 		TaskPool task_pool;
-		bvh_reference_sort_threaded(&task_pool, data, start, end - 1, dim);
+		bvh_reference_sort_threaded(&task_pool, data, start, end - 1, compare);
 		task_pool.wait_work();
 	}
 }
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -292,13 +292,13 @@ void BVHSpatialSplit::split_triangle_primitive(const Mesh *mesh,
                                               BoundBox& left_bounds,
                                               BoundBox& right_bounds)
 {
-	const int *inds = mesh->triangles[prim_index].v;
+	Mesh::Triangle t = mesh->get_triangle(prim_index);
 	const float3 *verts = &mesh->verts[0];
-	float3 v1 = tfm ? transform_point(tfm, verts[inds[2]]) : verts[inds[2]];
+	float3 v1 = tfm ? transform_point(tfm, verts[t.v[2]]) : verts[t.v[2]];

 	for(int i = 0; i < 3; i++) {
 		float3 v0 = v1;
-		int vindex = inds[i];
+		int vindex = t.v[i];
 		v1 = tfm ? transform_point(tfm, verts[vindex]) : verts[vindex];
 		float v0p = v0[dim];
 		float v1p = v1[dim];
@@ -329,12 +329,11 @@ void BVHSpatialSplit::split_curve_primitive(const Mesh *mesh,
                                            BoundBox& right_bounds)
 {
 	/* curve split: NOTE - Currently ignores curve width and needs to be fixed.*/
-	const int k0 = mesh->curves[prim_index].first_key + segment_index;
+	Mesh::Curve curve = mesh->get_curve(prim_index);
+	const int k0 = curve.first_key + segment_index;
 	const int k1 = k0 + 1;
-	const float4& key0 = mesh->curve_keys[k0];
-	const float4& key1 = mesh->curve_keys[k1];
-	float3 v0 = float4_to_float3(key0);
-	float3 v1 = float4_to_float3(key1);
+	float3 v0 = mesh->curve_keys[k0];
+	float3 v1 = mesh->curve_keys[k1];

 	if(tfm != NULL) {
 		v0 = transform_point(tfm, v0);
@@ -405,7 +404,7 @@ void BVHSpatialSplit::split_object_reference(const Object *object,
                                             BoundBox& right_bounds)
 {
 	Mesh *mesh = object->mesh;
-	for(int tri_idx = 0; tri_idx < mesh->triangles.size(); ++tri_idx) {
+	for(int tri_idx = 0; tri_idx < mesh->num_triangles(); ++tri_idx) {
 		split_triangle_primitive(mesh,
 		                         &object->tfm,
 		                         tri_idx,
@@ -414,8 +413,8 @@ void BVHSpatialSplit::split_object_reference(const Object *object,
 		                         left_bounds,
 		                         right_bounds);
 	}
-	for(int curve_idx = 0; curve_idx < mesh->curves.size(); ++curve_idx) {
-		Mesh::Curve &curve = mesh->curves[curve_idx];
+	for(int curve_idx = 0; curve_idx < mesh->num_curves(); ++curve_idx) {
+		Mesh::Curve curve = mesh->get_curve(curve_idx);
 		for(int segment_idx = 0;
 		    segment_idx < curve.num_keys - 1;
 		    ++segment_idx)
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,6 +1,7 @@

 set(INC
 	.
+	../graph
 	../kernel
 	../kernel/svm
 	../kernel/osl
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -56,6 +56,8 @@ std::ostream& operator <<(std::ostream &os,
 	   << string_from_bool(requested_features.use_camera_motion)  << std::endl;
 	os << "Use Baking: "
 	   << string_from_bool(requested_features.use_baking)  << std::endl;
+	os << "Use Volume: "
+	   << string_from_bool(requested_features.use_volume)  << std::endl;
 	return os;
 }

--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -54,7 +54,7 @@ public:
 	bool display_device;
 	bool advanced_shading;
 	bool pack_images;
-	bool extended_images; /* flag for GPU and Multi device */
+	bool has_bindless_textures; /* flag for GPU and Multi device */
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;

@@ -66,7 +66,7 @@ public:
 		display_device = false;
 		advanced_shading = true;
 		pack_images = false;
-		extended_images = false;
+		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
 };
@@ -103,6 +103,9 @@ public:
 	/* Use subsurface scattering materials. */
 	bool use_subsurface;

+	/* Use volume materials. */
+	bool use_volume;
+
 	/* Use branched integrator. */
 	bool use_integrator_branched;

@@ -118,6 +121,7 @@ public:
 		use_camera_motion = false;
 		use_baking = false;
 		use_subsurface = false;
+		use_volume = false;
 		use_integrator_branched = false;
 	}

@@ -132,6 +136,7 @@ public:
 		         use_camera_motion == requested_features.use_camera_motion &&
 		         use_baking == requested_features.use_baking &&
 		         use_subsurface == requested_features.use_subsurface &&
+		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched);
 	}

@@ -161,6 +166,9 @@ public:
 		if(!use_baking) {
 			build_options += " -D__NO_BAKING__";
 		}
+		if(!use_volume) {
+			build_options += " -D__NO_VOLUME__";
+		}
 		if(!use_subsurface) {
 			build_options += " -D__NO_SUBSURFACE__";
 		}
@@ -222,6 +230,7 @@ public:
 		(void)interpolation;  /* Ignored. */
 		(void)extension;  /* Ignored. */
 	};
+
 	virtual void tex_free(device_memory& /*mem*/) {};

 	/* pixel memory */
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -155,7 +155,9 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		VLOG(1) << "Texture allocate: " << name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";
 		kernel_tex_copy(&kernel_globals,
 		                name,
 		                mem.data_pointer,
@@ -213,12 +215,7 @@ public:
 				return;
 		}

-		KernelGlobals kg = kernel_globals;
-
-#ifdef WITH_OSL
-		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
-
+		KernelGlobals kg = thread_kernel_globals_init();
 		RenderTile tile;

 		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
@@ -289,9 +286,7 @@ public:
 			}
 		}

-#ifdef WITH_OSL
-		OSLShader::thread_free(&kg);
-#endif
+		thread_kernel_globals_free(&kg);
 	}

 	void thread_film_convert(DeviceTask& task)
@@ -481,6 +476,40 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+protected:
+	/* Copy kernel_globals for one thread, with its buffer pointers cleared. */
+	inline KernelGlobals thread_kernel_globals_init()
+	{
+		KernelGlobals thread_kg = kernel_globals;
+		const int num_step_buffers = sizeof(thread_kg.decoupled_volume_steps) /
+		                             sizeof(thread_kg.decoupled_volume_steps[0]);
+		for(int step = 0; step < num_step_buffers; ++step)
+			thread_kg.decoupled_volume_steps[step] = NULL;
+		thread_kg.decoupled_volume_steps_index = 0;
+		thread_kg.transparent_shadow_intersections = NULL;
+#ifdef WITH_OSL
+		OSLShader::thread_init(&thread_kg, &kernel_globals, &osl_globals);
+#endif
+		return thread_kg;
+	}
+
+	/* Free the buffers referenced by a per-thread KernelGlobals copy:
+	 * the transparent shadow intersections, every decoupled volume
+	 * step buffer and, when built with OSL, the OSL thread data. */
+	inline void thread_kernel_globals_free(KernelGlobals *kg)
+	{
+		/* free(NULL) is a no-op, so pointers still NULL need no guard. */
+		free(kg->transparent_shadow_intersections);
+		const int num_step_buffers = sizeof(kg->decoupled_volume_steps) /
+		                             sizeof(kg->decoupled_volume_steps[0]);
+		for(int step = 0; step < num_step_buffers; ++step) {
+			free(kg->decoupled_volume_steps[step]);
+		}
+#ifdef WITH_OSL
+		OSLShader::thread_free(kg);
+#endif
+	}
 };

 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -41,11 +41,6 @@
 #include "util_types.h"
 #include "util_time.h"

-/* use feature-adaptive kernel compilation.
- * Requires CUDA toolkit to be installed and currently only works on Linux.
- */
-/* #define KERNEL_USE_ADAPTIVE */
-
 CCL_NAMESPACE_BEGIN

 #ifndef WITH_CUDA_DYNLOAD
@@ -90,10 +85,10 @@ public:
 	CUcontext cuContext;
 	CUmodule cuModule;
 	map<device_ptr, bool> tex_interp_map;
+	map<device_ptr, uint> tex_bindless_map;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
-	bool use_texture_storage;

 	struct PixelMem {
 		GLuint cuPBO;
@@ -104,6 +99,10 @@ public:

 	map<device_ptr, PixelMem> pixel_mem_map;

+	/* Bindless Textures */
+	device_vector<uint> bindless_mapping;
+	bool need_bindless_mapping;
+
 	CUdeviceptr cuda_device_ptr(device_ptr mem)
 	{
 		return (CUdeviceptr)mem;
@@ -181,12 +180,13 @@ public:
 	{
 		first_error = true;
 		background = background_;
-		use_texture_storage = true;

 		cuDevId = info.num;
 		cuDevice = 0;
 		cuContext = 0;

+		need_bindless_mapping = false;
+
 		/* intialize */
 		if(cuda_error(cuInit(0)))
 			return;
@@ -216,11 +216,6 @@ public:
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;

-		/* In order to use full 6GB of memory on Titan cards, use arrays instead
-		 * of textures. On earlier cards this seems slower, but on Titan it is
-		 * actually slightly faster in tests. */
-		use_texture_storage = (cuDevArchitecture < 300);
-
 		cuda_pop_context();
 	}

@@ -228,6 +223,10 @@ public:
 	{
 		task_pool.stop();

+		if(info.has_bindless_textures) {
+			tex_free(bindless_mapping);
+		}
+
 		cuda_assert(cuCtxDestroy(cuContext));
 	}

@@ -245,40 +244,52 @@ public:
 		return true;
 	}

+	bool use_adaptive_compilation()  /* True when the CUDA adaptive_compile debug flag is set. */
+	{
+		return DebugFlags().cuda.adaptive_compile;
+	}
+
 	string compile_kernel(const DeviceRequestedFeatures& requested_features)
 	{
-		/* compute cubin name */
+		/* Compute cubin name. */
 		int major, minor;
 		cuDeviceComputeCapability(&major, &minor, cuDevId);
 		string cubin;

-		/* attempt to use kernel provided with blender */
-		cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
-		VLOG(1) << "Testing for pre-compiled kernel " << cubin;
-		if(path_exists(cubin)) {
-			VLOG(1) << "Using precompiled kernel";
-			return cubin;
+		/* Adaptive Compile.
+		 * If enabled, always use that */
+		bool use_adaptive_compile = use_adaptive_compilation();
+
+		/* Attempt to use kernel provided with Blender. */
+		if(!use_adaptive_compile) {
+			cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
+			VLOG(1) << "Testing for pre-compiled kernel " << cubin;
+			if(path_exists(cubin)) {
+				VLOG(1) << "Using precompiled kernel";
+				return cubin;
+			}
 		}

-		/* not found, try to use locally compiled kernel */
+		/* Try to use locally compiled kernel. */
 		string kernel_path = path_get("kernel");
 		string md5 = path_files_md5_hash(kernel_path);

-#ifdef KERNEL_USE_ADAPTIVE
-		string feature_build_options = requested_features.get_build_options();
-		string device_md5 = util_md5_string(feature_build_options);
-		cubin = string_printf("cycles_kernel_%s_sm%d%d_%s.cubin",
-		                      device_md5.c_str(),
-		                      major, minor,
-		                      md5.c_str());
-#else
-		(void)requested_features;
-		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
-#endif
+		string feature_build_options;
+		if(use_adaptive_compile) {
+			feature_build_options = requested_features.get_build_options();
+			string device_md5 = util_md5_string(feature_build_options);
+			cubin = string_printf("cycles_kernel_%s_sm%d%d_%s.cubin",
+		                          device_md5.c_str(),
+		                          major, minor,
+		                          md5.c_str());
+		}
+		else {
+			cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
+		}

 		cubin = path_user_get(path_join("cache", cubin));
 		VLOG(1) << "Testing for locally compiled kernel " << cubin;
-		/* if exists already, use it */
+		/* If exists already, use it. */
 		if(path_exists(cubin)) {
 			VLOG(1) << "Using locally compiled kernel";
 			return cubin;
@@ -294,7 +305,7 @@ public:
 		}
 #endif

-		/* if not, find CUDA compiler */
+		/* If not, find CUDA compiler. */
 		const char *nvcc = cuewCompilerPath();

 		if(nvcc == NULL) {
@@ -316,7 +327,7 @@ public:
 		else if(cuda_version != 75)
 			printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10);

-		/* compile */
+		/* Compile. */
 		string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
 		string include = kernel_path;
 		const int machine = system_cpu_bits();
@@ -331,9 +342,8 @@ public:
 			"-DNVCC -D__KERNEL_CUDA_VERSION__=%d",
 			nvcc, major, minor, machine, kernel.c_str(), cubin.c_str(), include.c_str(), cuda_version);

-#ifdef KERNEL_USE_ADAPTIVE
-		command += " " + feature_build_options;
-#endif
+		if(use_adaptive_compile)
+			command += " " + feature_build_options;

 		const char* extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
 		if(extra_cflags) {
@@ -351,7 +361,7 @@ public:
 			return "";
 		}

-		/* verify if compilation succeeded */
+		/* Verify if compilation succeeded */
 		if(!path_exists(cubin)) {
 			cuda_error_message("CUDA kernel compilation failed, see console for details.");
 			return "";
@@ -397,6 +407,15 @@ public:
 		return (result == CUDA_SUCCESS);
 	}

+	void load_bindless_mapping()
+	{
+		/* Nothing to upload unless tex_alloc() touched the bindless table. */
+		if(!info.has_bindless_textures || !need_bindless_mapping) return;
+		tex_free(bindless_mapping);
+		tex_alloc("__bindless_mapping", bindless_mapping, INTERPOLATION_NONE, EXTENSION_REPEAT);
+		need_bindless_mapping = false;
+	}
+
 	void mem_alloc(device_memory& mem, MemoryType /*type*/)
 	{
 		cuda_push_context();
@@ -474,128 +493,103 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		VLOG(1) << "Texture allocate: " << name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";

+		/* Check if we are on sm_30 or above.
+		 * We use arrays and bindless textures for storage there. */
+		bool has_bindless_textures = info.has_bindless_textures;
+
+		/* General variables for both architectures */
 		string bind_name = name;
-		if(mem.data_depth > 1) {
-			/* Kernel uses different bind names for 2d and 3d float textures,
-			 * so we have to adjust couple of things here.
-			 */
-			vector<string> tokens;
-			string_split(tokens, name, "_");
-			bind_name = string_printf("__tex_image_%s3d_%s",
-			                          tokens[2].c_str(),
-			                          tokens[3].c_str());
-		}
-
-		/* determine format */
-		CUarray_format_enum format;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
-		bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;

-		if(use_texture) {
+		CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+		switch(extension) {
+			case EXTENSION_REPEAT:
+				address_mode = CU_TR_ADDRESS_MODE_WRAP;
+				break;
+			case EXTENSION_EXTEND:
+				address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+				break;
+			case EXTENSION_CLIP:
+				address_mode = CU_TR_ADDRESS_MODE_BORDER;
+				break;
+			default:
+				assert(0);
+				break;
+		}

-			switch(mem.data_type) {
-				case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-				case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-				case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-				case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-				default: assert(0); return;
+		CUfilter_mode filter_mode;
+		if(interpolation == INTERPOLATION_CLOSEST) {
+			filter_mode = CU_TR_FILTER_MODE_POINT;
+		}
+		else {
+			filter_mode = CU_TR_FILTER_MODE_LINEAR;
+		}
+
+		CUarray_format_enum format;
+		switch(mem.data_type) {
+			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+			default: assert(0); return;
+		}
+
+		/* General variables for Fermi */
+		CUtexref texref = NULL;
+
+		if(!has_bindless_textures) {
+			if(mem.data_depth > 1) {
+				/* Kernel uses different bind names for 2d and 3d float textures,
+				 * so we have to adjust couple of things here.
+				 */
+				vector<string> tokens;
+				string_split(tokens, name, "_");
+				bind_name = string_printf("__tex_image_%s_3d_%s",
+				                          tokens[2].c_str(),
+				                          tokens[3].c_str());
 			}

-			CUtexref texref = NULL;
-
 			cuda_push_context();
 			cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
+			cuda_pop_context();

 			if(!texref) {
-				cuda_pop_context();
 				return;
 			}
+		}

-			if(interpolation != INTERPOLATION_NONE) {
-				CUarray handle = NULL;
+		/* Data Storage */
+		if(interpolation == INTERPOLATION_NONE) {
+			if(has_bindless_textures) {
+				mem_alloc(mem, MEM_READ_ONLY);
+				mem_copy_to(mem);

-				if(mem.data_depth > 1) {
-					CUDA_ARRAY3D_DESCRIPTOR desc;
+				cuda_push_context();

-					desc.Width = mem.data_width;
-					desc.Height = mem.data_height;
-					desc.Depth = mem.data_depth;
-					desc.Format = format;
-					desc.NumChannels = mem.data_elements;
-					desc.Flags = 0;
+				CUdeviceptr cumem;
+				size_t cubytes;

-					cuda_assert(cuArray3DCreate(&handle, &desc));
+				cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+
+				if(cubytes == 8) {
+					/* 64 bit device pointer */
+					uint64_t ptr = mem.device_pointer;
+					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}
 				else {
-					CUDA_ARRAY_DESCRIPTOR desc;
-
-					desc.Width = mem.data_width;
-					desc.Height = mem.data_height;
-					desc.Format = format;
-					desc.NumChannels = mem.data_elements;
-
-					cuda_assert(cuArrayCreate(&handle, &desc));
+					/* 32 bit device pointer */
+					uint32_t ptr = (uint32_t)mem.device_pointer;
+					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}

-				if(!handle) {
-					cuda_pop_context();
-					return;
-				}
-
-				if(mem.data_depth > 1) {
-					CUDA_MEMCPY3D param;
-					memset(&param, 0, sizeof(param));
-					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-					param.dstArray = handle;
-					param.srcMemoryType = CU_MEMORYTYPE_HOST;
-					param.srcHost = (void*)mem.data_pointer;
-					param.srcPitch = mem.data_width*dsize*mem.data_elements;
-					param.WidthInBytes = param.srcPitch;
-					param.Height = mem.data_height;
-					param.Depth = mem.data_depth;
-
-					cuda_assert(cuMemcpy3D(&param));
-				}
-				if(mem.data_height > 1) {
-					CUDA_MEMCPY2D param;
-					memset(&param, 0, sizeof(param));
-					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-					param.dstArray = handle;
-					param.srcMemoryType = CU_MEMORYTYPE_HOST;
-					param.srcHost = (void*)mem.data_pointer;
-					param.srcPitch = mem.data_width*dsize*mem.data_elements;
-					param.WidthInBytes = param.srcPitch;
-					param.Height = mem.data_height;
-
-					cuda_assert(cuMemcpy2D(&param));
-				}
-				else
-					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
-
-				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
-
-				if(interpolation == INTERPOLATION_CLOSEST) {
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
-				}
-				else if(interpolation == INTERPOLATION_LINEAR) {
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-				}
-				else {/* CUBIC and SMART are unsupported for CUDA */
-					cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR));
-				}
-				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
-
-				mem.device_pointer = (device_ptr)handle;
-				mem.device_size = size;
-
-				stats.mem_alloc(size);
+				cuda_pop_context();
 			}
 			else {
-				cuda_pop_context();
-
 				mem_alloc(mem, MEM_READ_ONLY);
 				mem_copy_to(mem);

@@ -604,23 +598,137 @@ public:
 				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
 				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
+
+				cuda_pop_context();
+			}
+		}
+		/* Texture Storage */
+		else {
+			CUarray handle = NULL;
+
+			cuda_push_context();
+
+			if(mem.data_depth > 1) {
+				CUDA_ARRAY3D_DESCRIPTOR desc;
+
+				desc.Width = mem.data_width;
+				desc.Height = mem.data_height;
+				desc.Depth = mem.data_depth;
+				desc.Format = format;
+				desc.NumChannels = mem.data_elements;
+				desc.Flags = 0;
+
+				cuda_assert(cuArray3DCreate(&handle, &desc));
+			}
+			else {
+				CUDA_ARRAY_DESCRIPTOR desc;
+
+				desc.Width = mem.data_width;
+				desc.Height = mem.data_height;
+				desc.Format = format;
+				desc.NumChannels = mem.data_elements;
+
+				cuda_assert(cuArrayCreate(&handle, &desc));
 			}

-			CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-			switch(extension) {
-				case EXTENSION_REPEAT:
-					address_mode = CU_TR_ADDRESS_MODE_WRAP;
-					break;
-				case EXTENSION_EXTEND:
-					address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-					break;
-				case EXTENSION_CLIP:
-					address_mode = CU_TR_ADDRESS_MODE_BORDER;
-					break;
-				default:
-					assert(0);
-					break;
+			if(!handle) {
+				cuda_pop_context();
+				return;
 			}
+
+			/* Copy the 3D, 2D or 1D host data into the array */
+			if(mem.data_depth > 1) {
+				CUDA_MEMCPY3D param;
+				memset(&param, 0, sizeof(param));
+				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+				param.dstArray = handle;
+				param.srcMemoryType = CU_MEMORYTYPE_HOST;
+				param.srcHost = (void*)mem.data_pointer;
+				param.srcPitch = mem.data_width*dsize*mem.data_elements;
+				param.WidthInBytes = param.srcPitch;
+				param.Height = mem.data_height;
+				param.Depth = mem.data_depth;
+
+				cuda_assert(cuMemcpy3D(&param));
+			}
+			else if(mem.data_height > 1) {
+				CUDA_MEMCPY2D param;
+				memset(&param, 0, sizeof(param));
+				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+				param.dstArray = handle;
+				param.srcMemoryType = CU_MEMORYTYPE_HOST;
+				param.srcHost = (void*)mem.data_pointer;
+				param.srcPitch = mem.data_width*dsize*mem.data_elements;
+				param.WidthInBytes = param.srcPitch;
+				param.Height = mem.data_height;
+
+				cuda_assert(cuMemcpy2D(&param));
+			}
+			else
+				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size));
+
+			/* Fermi and Kepler */
+			mem.device_pointer = (device_ptr)handle;
+			mem.device_size = size;
+
+			stats.mem_alloc(size);
+
+			/* Bindless Textures - Kepler */
+			if(has_bindless_textures) {
+				int flat_slot = 0;
+				if(string_startswith(name, "__tex_image")) {
+					int pos =  string(name).rfind("_");
+					flat_slot = atoi(name + pos + 1);
+				}
+				else {
+					assert(0);
+				}
+
+				CUDA_RESOURCE_DESC resDesc;
+				memset(&resDesc, 0, sizeof(resDesc));
+				resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+				resDesc.res.array.hArray = handle;
+				resDesc.flags = 0;
+
+				CUDA_TEXTURE_DESC texDesc;
+				memset(&texDesc, 0, sizeof(texDesc));
+				texDesc.addressMode[0] = address_mode;
+				texDesc.addressMode[1] = address_mode;
+				texDesc.addressMode[2] = address_mode;
+				texDesc.filterMode = filter_mode;
+				texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+				CUtexObject tex = 0;
+				cuda_assert(cuTexObjectCreate(&tex, &resDesc, &texDesc, NULL));
+
+				/* Safety check: the handle is stored as a 32 bit uint below. */
+				if(tex > UINT_MAX) {
+					assert(0);
+				}
+
+				/* Resize once */
+				if(flat_slot >= bindless_mapping.size())
+					bindless_mapping.resize(4096); /*TODO(dingto): Make this a variable */
+
+				/* Set Mapping and tag that we need to (re-)upload to device */
+				bindless_mapping.get_data()[flat_slot] = (uint)tex;
+				tex_bindless_map[mem.device_pointer] = (uint)tex;
+				need_bindless_mapping = true;
+			}
+			/* Regular Textures - Fermi */
+			else {
+				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT));
+				cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
+			}
+
+			cuda_pop_context();
+		}
+
+		/* Fermi, Data and Image Textures */
+		if(!has_bindless_textures) {
+			cuda_push_context();
+
 			cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
 			cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
 			if(mem.data_depth > 1) {
@@ -631,31 +739,8 @@ public:

 			cuda_pop_context();
 		}
-		else {
-			mem_alloc(mem, MEM_READ_ONLY);
-			mem_copy_to(mem);
-
-			cuda_push_context();
-
-			CUdeviceptr cumem;
-			size_t cubytes;
-
-			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
-			if(cubytes == 8) {
-				/* 64 bit device pointer */
-				uint64_t ptr = mem.device_pointer;
-				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-			}
-			else {
-				/* 32 bit device pointer */
-				uint32_t ptr = (uint32_t)mem.device_pointer;
-				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-			}
-
-			cuda_pop_context();
-		}

+		/* Fermi and Kepler */
 		tex_interp_map[mem.device_pointer] = (interpolation != INTERPOLATION_NONE);
 	}

@@ -667,6 +752,12 @@ public:
 				cuArrayDestroy((CUarray)mem.device_pointer);
 				cuda_pop_context();

+				/* Free CUtexObject (Bindless Textures), with the context current. */
+				if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
+					cuda_push_context();
+					cuTexObjectDestroy(tex_bindless_map[mem.device_pointer]);
+					cuda_pop_context();
+				}
 				tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 				mem.device_pointer = 0;

@@ -723,8 +814,8 @@ public:
 		printf("threads_per_block %d\n", threads_per_block);
 		printf("num_registers %d\n", num_registers);*/

-		int xthreads = (int)sqrt((float)threads_per_block);
-		int ythreads = (int)sqrt((float)threads_per_block);
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (rtile.w + xthreads - 1)/xthreads;
 		int yblocks = (rtile.h + ythreads - 1)/ythreads;

@@ -777,8 +868,8 @@ public:
 		int threads_per_block;
 		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));

-		int xthreads = (int)sqrt((float)threads_per_block);
-		int ythreads = (int)sqrt((float)threads_per_block);
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
 		int xblocks = (task.w + xthreads - 1)/xthreads;
 		int yblocks = (task.h + ythreads - 1)/ythreads;

@@ -1108,6 +1199,9 @@ public:
 			RenderTile tile;
 			
 			bool branched = task->integrator_branched;
+
+			/* Upload Bindless Mapping */
+			load_bindless_mapping();
 			
 			/* keep rendering tiles until done */
 			while(task->acquire_tile(this, tile)) {
@@ -1131,6 +1225,9 @@ public:
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
+			/* Upload Bindless Mapping */
+			load_bindless_mapping();
+
 			shader(*task);

 			cuda_push_context();
@@ -1266,11 +1363,12 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.num = num;

 		info.advanced_shading = (major >= 2);
-		info.extended_images = (major >= 3);
+		info.has_bindless_textures = (major >= 3);
 		info.pack_images = false;

 		/* if device has a kernel timeout, assume it is used for display */
 		if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
+			info.description += " (Display)";
 			info.display_device = true;
 			display_devices.push_back(info);
 		}
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -35,7 +35,7 @@ class MultiDevice : public Device
 {
 public:
 	struct SubDevice {
-		SubDevice(Device *device_)
+		explicit SubDevice(Device *device_)
 		: device(device_) {}

 		Device *device;
@@ -175,7 +175,9 @@ public:
 	               interpolation,
 	               ExtensionType extension)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		VLOG(1) << "Texture allocate: " << name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";

 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
@@ -352,7 +354,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool

 	info.advanced_shading = with_advanced_shading;
 	info.pack_images = false;
-	info.extended_images = true;
+	info.has_bindless_textures = true;

 	foreach(DeviceInfo& subinfo, devices) {
 		if(subinfo.type == type) {
@@ -376,7 +378,7 @@ static bool device_multi_add(vector<DeviceInfo>& devices, DeviceType type, bool
 			if(subinfo.display_device)
 				info.display_device = true;
 			info.pack_images = info.pack_images || subinfo.pack_images;
-			info.extended_images = info.extended_images && subinfo.extended_images;
+			info.has_bindless_textures = info.has_bindless_textures && subinfo.has_bindless_textures;
 			num_added++;
 		}
 	}
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -168,7 +168,9 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		VLOG(1) << "Texture allocate: " << name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";

 		thread_scoped_lock lock(rpc_lock);

--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -322,7 +322,7 @@ protected:

 class ServerDiscovery {
 public:
-	ServerDiscovery(bool discover = false)
+	explicit ServerDiscovery(bool discover = false)
 	: listen_socket(io_service), collect_servers(false)
 	{
 		/* setup listen socket */
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1187,7 +1187,9 @@ public:
 	               InterpolationType /*interpolation*/,
 	               ExtensionType /*extension*/)
 	{
-		VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
+		VLOG(1) << "Texture allocate: " << name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";
 		mem_alloc(mem, MEM_READ_ONLY);
 		mem_copy_to(mem);
 		assert(mem_map.find(name) == mem_map.end());
@@ -1222,18 +1224,28 @@ public:
 			CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
 		clGetDeviceInfo(cdDevice,
 			CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
-	
-		/* try to divide evenly over 2 dimensions */
+
+		/* Try to divide evenly over 2 dimensions. */
 		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
 		size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};

-		/* some implementations have max size 1 on 2nd dimension */
+		/* Some implementations have max size 1 on 2nd dimension. */
 		if(local_size[1] > max_work_items[1]) {
 			local_size[0] = workgroup_size/max_work_items[1];
 			local_size[1] = max_work_items[1];
 		}

-		size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
+		size_t global_size[2] = {global_size_round_up(local_size[0], w),
+		                         global_size_round_up(local_size[1], h)};
+
+		/* Vertical size of 1 is coming from bake/shade kernels where we should
+		 * not round anything up because otherwise we'll either be doing too
+		 * much work per pixel (if we don't check global ID on Y axis) or will
+		 * be checking for global ID to always have Y of 0.
+		 */
+		if(h == 1) {
+			global_size[1] = 1;
+		}

 		/* run kernel */
 		opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
@@ -1318,48 +1330,49 @@ public:
 		else
 			kernel = ckShaderKernel;

+		cl_uint start_arg_index =
+			kernel_set_args(kernel,
+			                0,
+			                d_data,
+			                d_input,
+			                d_output);
+
+		if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_output_luma);
+		}
+
+#define KERNEL_TEX(type, ttype, name) \
+		set_kernel_arg_mem(kernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_shader_eval_type);
+		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_shader_filter);
+		}
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_shader_x,
+		                                   d_shader_w,
+		                                   d_offset);
+
 		for(int sample = 0; sample < task.num_samples; sample++) {

 			if(task.get_cancel())
 				break;

-			cl_int d_sample = sample;
-
-			cl_uint start_arg_index =
-				kernel_set_args(kernel,
-				                0,
-				                d_data,
-				                d_input,
-				                d_output);
-
-			if(task.shader_eval_type < SHADER_EVAL_BAKE) {
-				start_arg_index += kernel_set_args(kernel,
-				                                   start_arg_index,
-				                                   d_output_luma);
-			}
-
-#define KERNEL_TEX(type, ttype, name) \
-			set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-
-			start_arg_index += kernel_set_args(kernel,
-			                                   start_arg_index,
-			                                   d_shader_eval_type);
-			if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-				start_arg_index += kernel_set_args(kernel,
-				                                   start_arg_index,
-				                                   d_shader_filter);
-			}
-			start_arg_index += kernel_set_args(kernel,
-			                                   start_arg_index,
-			                                   d_shader_x,
-			                                   d_shader_w,
-			                                   d_offset,
-			                                   d_sample);
+			kernel_set_args(kernel, start_arg_index, sample);

 			enqueue_kernel(kernel, task.shader_w, 1);

+			clFinish(cqCommandQueue);
+
 			task.update_progress(NULL);
 		}
 	}
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -51,7 +51,7 @@ public:
 	int shader_filter;
 	int shader_x, shader_w;

-	DeviceTask(Type type = PATH_TRACE);
+	explicit DeviceTask(Type type = PATH_TRACE);

 	int get_subtask_count(int num, int max_size = 0);
 	void split(list<DeviceTask>& tasks, int num, int max_size = 0);
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -0,0 +1,24 @@
+
+set(INC
+	.
+	../util
+)
+
+set(SRC
+	node.cpp
+	node_type.cpp
+	node_xml.cpp
+)
+
+set(SRC_HEADERS
+	node.h
+	node_enum.h
+	node_type.h
+	node_xml.h
+)
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+add_library(cycles_graph ${SRC} ${SRC_HEADERS})
+
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -0,0 +1,395 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node.h"
+#include "node_type.h"
+
+#include "util_foreach.h"
+#include "util_param.h"
+#include "util_transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Node */
+
+Node::Node(const NodeType *type_, ustring name_)
+: name(name_), type(type_)
+{
+	assert(type);
+
+	/* assign non-empty name, convenient for debugging */
+	if(name.empty()) {
+		name = type->name;
+	}
+
+	/* initialize default values */
+	foreach(const SocketType& socket, type->inputs) {
+		set_default_value(socket);
+	}
+}
+
+Node::~Node()
+{
+}
+
+template<typename T>
+static T& get_socket_value(const Node *node, const SocketType& socket)
+{
+	return (T&)*(((char*)node) + socket.struct_offset);
+}
+
+#ifndef NDEBUG
+static bool is_socket_float3(const SocketType& socket)
+{
+	return socket.type == SocketType::COLOR ||
+	       socket.type == SocketType::POINT ||
+		   socket.type == SocketType::VECTOR ||
+		   socket.type == SocketType::NORMAL;
+}
+
+static bool is_socket_array_float3(const SocketType& socket)
+{
+	return socket.type == SocketType::COLOR_ARRAY ||
+	       socket.type == SocketType::POINT_ARRAY ||
+		   socket.type == SocketType::VECTOR_ARRAY ||
+		   socket.type == SocketType::NORMAL_ARRAY;
+}
+#endif
+
+/* set values */
+void Node::set(const SocketType& input, bool value)
+{
+	assert(input.type == SocketType::BOOLEAN);
+	get_socket_value<bool>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, int value)
+{
+	assert((input.type == SocketType::INT || input.type == SocketType::ENUM));
+	get_socket_value<int>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, float value)
+{
+	assert(input.type == SocketType::FLOAT);
+	get_socket_value<float>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, float2 value)
+{
+	assert(input.type == SocketType::POINT2);
+	get_socket_value<float2>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, float3 value)
+{
+	assert(is_socket_float3(input));
+	get_socket_value<float3>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, const char *value)
+{
+	set(input, ustring(value));
+}
+
+void Node::set(const SocketType& input, ustring value)
+{
+	if(input.type == SocketType::STRING) {
+		get_socket_value<ustring>(this, input) = value;
+	}
+	else if(input.type == SocketType::ENUM) {
+		const NodeEnum& enm = *input.enum_values;
+		if(enm.exists(value)) {
+			get_socket_value<int>(this, input) = enm[value];
+		}
+		else {
+			assert(0);
+		}
+	}
+	else {
+		assert(0);
+	}
+}
+
+void Node::set(const SocketType& input, const Transform& value)
+{
+	assert(input.type == SocketType::TRANSFORM);
+	get_socket_value<Transform>(this, input) = value;
+}
+
+void Node::set(const SocketType& input, Node *value)
+{
+	assert(input.type == SocketType::NODE);
+	get_socket_value<Node*>(this, input) = value;
+}
+
+/* set array values */
+void Node::set(const SocketType& input, array<bool>& value)
+{
+	assert(input.type == SocketType::BOOLEAN_ARRAY);
+	get_socket_value<array<bool> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<int>& value)
+{
+	assert(input.type == SocketType::INT_ARRAY);
+	get_socket_value<array<int> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<float>& value)
+{
+	assert(input.type == SocketType::FLOAT_ARRAY);
+	get_socket_value<array<float> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<float2>& value)
+{
+	assert(input.type == SocketType::POINT2_ARRAY);
+	get_socket_value<array<float2> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<float3>& value)
+{
+	assert(is_socket_array_float3(input));
+	get_socket_value<array<float3> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<ustring>& value)
+{
+	assert(input.type == SocketType::STRING_ARRAY);
+	get_socket_value<array<ustring> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<Transform>& value)
+{
+	assert(input.type == SocketType::TRANSFORM_ARRAY);
+	get_socket_value<array<Transform> >(this, input).steal_data(value);
+}
+
+void Node::set(const SocketType& input, array<Node*>& value)
+{
+	assert(input.type == SocketType::NODE_ARRAY);
+	get_socket_value<array<Node*> >(this, input).steal_data(value);
+}
+
+/* get values */
+bool Node::get_bool(const SocketType& input) const
+{
+	assert(input.type == SocketType::BOOLEAN);
+	return get_socket_value<bool>(this, input);
+}
+
+int Node::get_int(const SocketType& input) const
+{
+	assert(input.type == SocketType::INT || input.type == SocketType::ENUM);
+	return get_socket_value<int>(this, input);
+}
+
+float Node::get_float(const SocketType& input) const
+{
+	assert(input.type == SocketType::FLOAT);
+	return get_socket_value<float>(this, input);
+}
+
+float2 Node::get_float2(const SocketType& input) const
+{
+	assert(input.type == SocketType::POINT2);
+	return get_socket_value<float2>(this, input);
+}
+
+float3 Node::get_float3(const SocketType& input) const
+{
+	assert(is_socket_float3(input));
+	return get_socket_value<float3>(this, input);
+}
+
+ustring Node::get_string(const SocketType& input) const
+{
+	if(input.type == SocketType::STRING) {
+		return get_socket_value<ustring>(this, input);
+	}
+	else if(input.type == SocketType::ENUM) {
+		const NodeEnum& enm = *input.enum_values;
+		int intvalue = get_socket_value<int>(this, input);
+		return (enm.exists(intvalue)) ? enm[intvalue] : ustring();
+	}
+	else {
+		assert(0);
+		return ustring();
+	}
+}
+
+Transform Node::get_transform(const SocketType& input) const
+{
+	assert(input.type == SocketType::TRANSFORM);
+	return get_socket_value<Transform>(this, input);
+}
+
+Node *Node::get_node(const SocketType& input) const
+{
+	assert(input.type == SocketType::NODE);
+	return get_socket_value<Node*>(this, input);
+}
+
+/* get array values */
+const array<bool>& Node::get_bool_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::BOOLEAN_ARRAY);
+	return get_socket_value<array<bool> >(this, input);
+}
+
+const array<int>& Node::get_int_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::INT_ARRAY);
+	return get_socket_value<array<int> >(this, input);
+}
+
+const array<float>& Node::get_float_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::FLOAT_ARRAY);
+	return get_socket_value<array<float> >(this, input);
+}
+
+const array<float2>& Node::get_float2_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::POINT2_ARRAY);
+	return get_socket_value<array<float2> >(this, input);
+}
+
+const array<float3>& Node::get_float3_array(const SocketType& input) const
+{
+	assert(is_socket_array_float3(input));
+	return get_socket_value<array<float3> >(this, input);
+}
+
+const array<ustring>& Node::get_string_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::STRING_ARRAY);
+	return get_socket_value<array<ustring> >(this, input);
+}
+
+const array<Transform>& Node::get_transform_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::TRANSFORM_ARRAY);
+	return get_socket_value<array<Transform> >(this, input);
+}
+
+const array<Node*>& Node::get_node_array(const SocketType& input) const
+{
+	assert(input.type == SocketType::NODE_ARRAY);
+	return get_socket_value<array<Node*> >(this, input);
+}
+
+/* generic value operations */
+
+bool Node::has_default_value(const SocketType& input) const
+{
+	const void *src = input.default_value;
+	void *dst = &get_socket_value<char>(this, input);
+	return memcmp(dst, src, input.size()) == 0;
+}
+
+void Node::set_default_value(const SocketType& socket)
+{
+	const void *src = socket.default_value;
+	void *dst = ((char*)this) + socket.struct_offset;
+	memcpy(dst, src, socket.size());
+}
+
+template<typename T>
+static void copy_array(const Node *node, const SocketType& socket, const Node *other, const SocketType& other_socket)
+{
+	const array<T>* src = (const array<T>*)(((char*)other) + other_socket.struct_offset);
+	array<T>* dst = (array<T>*)(((char*)node) + socket.struct_offset);
+	*dst = *src;
+}
+
+void Node::copy_value(const SocketType& socket, const Node& other, const SocketType& other_socket)
+{
+	assert(socket.type == other_socket.type);
+
+	if(socket.is_array()) {
+		switch(socket.type) {
+			case SocketType::BOOLEAN_ARRAY: copy_array<bool>(this, socket, &other, other_socket); break;
+			case SocketType::FLOAT_ARRAY: copy_array<float>(this, socket, &other, other_socket); break;
+			case SocketType::INT_ARRAY: copy_array<int>(this, socket, &other, other_socket); break;
+			case SocketType::COLOR_ARRAY: copy_array<float3>(this, socket, &other, other_socket); break;
+			case SocketType::VECTOR_ARRAY: copy_array<float3>(this, socket, &other, other_socket); break;
+			case SocketType::POINT_ARRAY: copy_array<float3>(this, socket, &other, other_socket); break;
+			case SocketType::NORMAL_ARRAY: copy_array<float3>(this, socket, &other, other_socket); break;
+			case SocketType::POINT2_ARRAY: copy_array<float2>(this, socket, &other, other_socket); break;
+			case SocketType::STRING_ARRAY: copy_array<ustring>(this, socket, &other, other_socket); break;
+			case SocketType::TRANSFORM_ARRAY: copy_array<Transform>(this, socket, &other, other_socket); break;
+			case SocketType::NODE_ARRAY: copy_array<void*>(this, socket, &other, other_socket); break;
+			default: assert(0); break;
+		}
+	}
+	else {
+		const void *src = ((char*)&other) + other_socket.struct_offset;
+		void *dst = ((char*)this) + socket.struct_offset;
+		memcpy(dst, src, socket.size());
+	}
+}
+
+template<typename T>
+static bool is_array_equal(const Node *node, const Node *other, const SocketType& socket)
+{
+	const array<T>* a = (const array<T>*)(((char*)node) + socket.struct_offset);
+	const array<T>* b = (const array<T>*)(((char*)other) + socket.struct_offset);
+	return *a == *b;
+}
+
+bool Node::equals_value(const Node& other, const SocketType& socket) const
+{
+	if(socket.is_array()) {
+		switch(socket.type) {
+			case SocketType::BOOLEAN_ARRAY: return is_array_equal<bool>(this, &other, socket);
+			case SocketType::FLOAT_ARRAY: return is_array_equal<float>(this, &other, socket);
+			case SocketType::INT_ARRAY: return is_array_equal<int>(this, &other, socket);
+			case SocketType::COLOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
+			case SocketType::VECTOR_ARRAY: return is_array_equal<float3>(this, &other, socket);
+			case SocketType::POINT_ARRAY: return is_array_equal<float3>(this, &other, socket);
+			case SocketType::NORMAL_ARRAY: return is_array_equal<float3>(this, &other, socket);
+			case SocketType::POINT2_ARRAY: return is_array_equal<float2>(this, &other, socket);
+			case SocketType::STRING_ARRAY: return is_array_equal<ustring>(this, &other, socket);
+			case SocketType::TRANSFORM_ARRAY: return is_array_equal<Transform>(this, &other, socket);
+			case SocketType::NODE_ARRAY: return is_array_equal<void*>(this, &other, socket);
+			default: assert(0); return true;
+		}
+	}
+	else {
+		const void *a = ((char*)this) + socket.struct_offset;
+		const void *b = ((char*)&other) + socket.struct_offset;
+		return (memcmp(a, b, socket.size()) == 0);
+	}
+}
+
+/* equals */
+
+bool Node::equals(const Node& other) const
+{
+	assert(type == other.type);
+
+	foreach(const SocketType& socket, type->inputs) {
+		if(!equals_value(other, socket))
+			return false;
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "node_type.h"
+
+#include "util_map.h"
+#include "util_param.h"
+#include "util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct Node;
+struct NodeType;
+struct Transform;
+
+/* Node */
+
+struct Node
+{
+	explicit Node(const NodeType *type, ustring name = ustring());
+	virtual ~Node();
+
+	/* set values */
+	void set(const SocketType& input, bool value);
+	void set(const SocketType& input, int value);
+	void set(const SocketType& input, float value);
+	void set(const SocketType& input, float2 value);
+	void set(const SocketType& input, float3 value);
+	void set(const SocketType& input, const char *value);
+	void set(const SocketType& input, ustring value);
+	void set(const SocketType& input, const Transform& value);
+	void set(const SocketType& input, Node *value);
+
+	/* set array values. the memory from the input array will be taken over
+	 * by the node and the input array will be empty after return */
+	void set(const SocketType& input, array<bool>& value);
+	void set(const SocketType& input, array<int>& value);
+	void set(const SocketType& input, array<float>& value);
+	void set(const SocketType& input, array<float2>& value);
+	void set(const SocketType& input, array<float3>& value);
+	void set(const SocketType& input, array<ustring>& value);
+	void set(const SocketType& input, array<Transform>& value);
+	void set(const SocketType& input, array<Node*>& value);
+
+	/* get values */
+	bool get_bool(const SocketType& input) const;
+	int get_int(const SocketType& input) const;
+	float get_float(const SocketType& input) const;
+	float2 get_float2(const SocketType& input) const;
+	float3 get_float3(const SocketType& input) const;
+	ustring get_string(const SocketType& input) const;
+	Transform get_transform(const SocketType& input) const;
+	Node *get_node(const SocketType& input) const;
+
+	/* get array values */
+	const array<bool>& get_bool_array(const SocketType& input) const;
+	const array<int>& get_int_array(const SocketType& input) const;
+	const array<float>& get_float_array(const SocketType& input) const;
+	const array<float2>& get_float2_array(const SocketType& input) const;
+	const array<float3>& get_float3_array(const SocketType& input) const;
+	const array<ustring>& get_string_array(const SocketType& input) const;
+	const array<Transform>& get_transform_array(const SocketType& input) const;
+	const array<Node*>& get_node_array(const SocketType& input) const;
+
+	/* generic values operations */
+	bool has_default_value(const SocketType& input) const;
+	void set_default_value(const SocketType& input);
+	bool equals_value(const Node& other, const SocketType& input) const;
+	void copy_value(const SocketType& input, const Node& other, const SocketType& other_input);
+
+	/* equals */
+	bool equals(const Node& other) const;
+
+	ustring name;
+	const NodeType *type;
+};
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util_map.h"
+#include "util_param.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Enum
+ *
+ * Utility class for enum values. */
+
+struct NodeEnum {
+	bool empty() const { return left.empty(); }
+	void insert(const char *x, int y) {
+		left[ustring(x)] = y;
+		right[y] = ustring(x);
+	}
+
+	bool exists(ustring x) const { return left.find(x) != left.end(); }
+	bool exists(int y) const { return right.find(y) != right.end(); }
+
+	int operator[](const char *x) const { return left.find(ustring(x))->second; }
+	int operator[](ustring x) const { return left.find(x)->second; }
+	ustring operator[](int y) const { return right.find(y)->second; }
+
+private:
+	unordered_map<ustring, int, ustringHash> left;
+	unordered_map<int, ustring> right;
+};
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_type.h"
+#include "util_foreach.h"
+#include "util_transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Node Socket Type */
+
+size_t SocketType::size() const
+{
+	return size(type);
+}
+
+bool SocketType::is_array() const
+{
+	return (type >= BOOLEAN_ARRAY);
+}
+
+size_t SocketType::size(Type type)
+{
+	switch(type)
+	{
+		case UNDEFINED: return 0;
+
+		case BOOLEAN: return sizeof(bool);
+		case FLOAT: return sizeof(float);
+		case INT: return sizeof(int);
+		case COLOR: return sizeof(float3);
+		case VECTOR: return sizeof(float3);
+		case POINT: return sizeof(float3);
+		case NORMAL: return sizeof(float3);
+		case POINT2: return sizeof(float2);
+		case CLOSURE: return 0;
+		case STRING: return sizeof(ustring);
+		case ENUM: return sizeof(int);
+		case TRANSFORM: return sizeof(Transform);
+		case NODE: return sizeof(void*);
+
+		case BOOLEAN_ARRAY: return sizeof(array<bool>);
+		case FLOAT_ARRAY: return sizeof(array<float>);
+		case INT_ARRAY: return sizeof(array<int>);
+		case COLOR_ARRAY: return sizeof(array<float3>);
+		case VECTOR_ARRAY: return sizeof(array<float3>);
+		case POINT_ARRAY: return sizeof(array<float3>);
+		case NORMAL_ARRAY: return sizeof(array<float3>);
+		case POINT2_ARRAY: return sizeof(array<float2>);
+		case STRING_ARRAY: return sizeof(array<ustring>);
+		case TRANSFORM_ARRAY: return sizeof(array<Transform>);
+		case NODE_ARRAY: return sizeof(array<void*>);
+	}
+
+	assert(0);
+	return 0;
+}
+
+size_t SocketType::max_size()
+{
+	return sizeof(Transform);
+}
+
+void *SocketType::zero_default_value()
+{
+	static Transform zero_transform = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+	return &zero_transform;
+}
+
+ustring SocketType::type_name(Type type)
+{
+	static ustring names[] = {
+		ustring("undefined"),
+
+		ustring("boolean"),
+		ustring("float"),
+		ustring("int"),
+		ustring("color"),
+		ustring("vector"),
+		ustring("point"),
+		ustring("normal"),
+		ustring("point2"),
+		ustring("closure"),
+		ustring("string"),
+		ustring("enum"),
+		ustring("transform"),
+		ustring("node"),
+
+		ustring("array_boolean"),
+		ustring("array_float"),
+		ustring("array_int"),
+		ustring("array_color"),
+		ustring("array_vector"),
+		ustring("array_point"),
+		ustring("array_normal"),
+		ustring("array_point2"),
+		ustring("array_string"),
+		ustring("array_transform"),
+		ustring("array_node")};
+
+	return names[(int)type];
+}
+
+bool SocketType::is_float3(Type type)
+{
+	return (type == COLOR || type == VECTOR || type == POINT || type == NORMAL);
+}
+
+/* Node Type */
+
+NodeType::NodeType(Type type_)
+: type(type_)
+{
+}
+
+NodeType::~NodeType()
+{
+}
+
+void NodeType::register_input(ustring name, ustring ui_name, SocketType::Type type, int struct_offset,
+                              const void *default_value, const NodeEnum *enum_values,
+							  const NodeType **node_type, int flags, int extra_flags)
+{
+	SocketType socket;
+	socket.name = name;
+	socket.ui_name = ui_name;
+	socket.type = type;
+	socket.struct_offset = struct_offset;
+	socket.default_value = default_value;
+	socket.enum_values = enum_values;
+	socket.node_type = node_type;
+	socket.flags = flags | extra_flags;
+	inputs.push_back(socket);
+}
+
+void NodeType::register_output(ustring name, ustring ui_name, SocketType::Type type)
+{
+	SocketType socket;
+	socket.name = name;
+	socket.ui_name = ui_name;
+	socket.type = type;
+	socket.struct_offset = 0;
+	socket.default_value = NULL;
+	socket.enum_values = NULL;
+	socket.node_type = NULL;
+	socket.flags = SocketType::LINKABLE;
+	outputs.push_back(socket);
+}
+
+const SocketType *NodeType::find_input(ustring name) const
+{
+	foreach(const SocketType& socket, inputs) {
+		if(socket.name == name) {
+			return &socket;
+		}
+	}
+
+	return NULL;
+}
+
+const SocketType *NodeType::find_output(ustring name) const
+{
+	foreach(const SocketType& socket, outputs) {
+		if(socket.name == name) {
+			return &socket;
+		}
+	}
+
+	return NULL;
+}
+
+/* Node Type Registry */
+
+unordered_map<ustring, NodeType, ustringHash>& NodeType::types()
+{
+	static unordered_map<ustring, NodeType, ustringHash> _types;
+	return _types;
+}
+
+NodeType *NodeType::add(const char *name_, CreateFunc create_, Type type_)
+{
+	ustring name(name_);
+
+	if(types().find(name) != types().end()) {
+		fprintf(stderr, "Node type %s registered twice!\n", name_);
+		assert(0);
+		return NULL;
+	}
+
+	types()[name] = NodeType(type_);
+
+	NodeType *type = &types()[name];
+	type->name = name;
+	type->create = create_;
+	return type;
+}
+
+const NodeType *NodeType::find(ustring name)
+{
+	unordered_map<ustring, NodeType, ustringHash>::iterator it = types().find(name);
+	return (it == types().end()) ? NULL : &it->second;
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "node_enum.h"
+
+#include "util_map.h"
+#include "util_param.h"
+#include "util_string.h"
+#include "util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct Node;
+struct NodeType;
+
+/* Socket Type */
+
+struct SocketType
+{
+	enum Type
+	{
+		UNDEFINED,
+
+		BOOLEAN,
+		FLOAT,
+		INT,
+		COLOR,
+		VECTOR,
+		POINT,
+		NORMAL,
+		POINT2,
+		CLOSURE,
+		STRING,
+		ENUM,
+		TRANSFORM,
+		NODE,
+
+		BOOLEAN_ARRAY,
+		FLOAT_ARRAY,
+		INT_ARRAY,
+		COLOR_ARRAY,
+		VECTOR_ARRAY,
+		POINT_ARRAY,
+		NORMAL_ARRAY,
+		POINT2_ARRAY,
+		STRING_ARRAY,
+		TRANSFORM_ARRAY,
+		NODE_ARRAY,
+	};
+
+	enum Flags {
+		LINKABLE               = (1 << 0),
+		ANIMATABLE             = (1 << 1),
+
+		SVM_INTERNAL           = (1 << 2),
+		OSL_INTERNAL           = (1 << 3),
+		INTERNAL               = (1 << 2) | (1 << 3),
+
+		LINK_TEXTURE_GENERATED = (1 << 4),
+		LINK_TEXTURE_UV        = (1 << 5),
+		LINK_INCOMING          = (1 << 6),
+		LINK_NORMAL            = (1 << 7),
+		LINK_POSITION          = (1 << 8),
+		LINK_TANGENT           = (1 << 9),
+		DEFAULT_LINK_MASK      = (1 << 4) | (1 << 5) | (1 << 6) | (1 << 7) | (1 << 8) | (1 << 9)
+	};
+
+	ustring name;
+	Type type;
+	int struct_offset;
+	const void *default_value;
+	const NodeEnum *enum_values;
+	const NodeType **node_type;
+	int flags;
+	ustring ui_name;
+
+	size_t size() const;
+	bool is_array() const;
+	static size_t size(Type type);
+	static size_t max_size();
+	static ustring type_name(Type type);
+	static void *zero_default_value();
+	static bool is_float3(Type type);
+};
+
+/* Node Type */
+
+struct NodeType
+{
+	enum Type {
+		NONE,
+		SHADER
+	};
+
+	explicit NodeType(Type type = NONE);
+	~NodeType();
+
+	void register_input(ustring name, ustring ui_name, SocketType::Type type,
+	                    int struct_offset, const void *default_value,
+						const NodeEnum *enum_values = NULL,
+						const NodeType **node_type = NULL,
+						int flags = 0, int extra_flags = 0);
+	void register_output(ustring name, ustring ui_name, SocketType::Type type);
+
+	const SocketType *find_input(ustring name) const;
+	const SocketType *find_output(ustring name) const;
+
+	typedef Node *(*CreateFunc)(const NodeType *type);
+
+	ustring name;
+	Type type;
+	std::vector<SocketType> inputs;
+	std::vector<SocketType> outputs;
+	CreateFunc create;
+
+	static NodeType *add(const char *name, CreateFunc create, Type type = NONE);
+	static const NodeType *find(ustring name);
+	static unordered_map<ustring, NodeType, ustringHash>& types();
+};
+
+/* Node Definition Macros: NODE_DECLARE declares the static members that NODE_DEFINE(structname) defines. */
+
+#define NODE_DECLARE                       \
+template<typename T>                       \
+static const NodeType *register_type();    \
+static Node *create(const NodeType *type); \
+static const NodeType *node_type;
+/* NODE_DEFINE ends on the register_type<T>() signature, so the function body must follow the macro use. */
+#define NODE_DEFINE(structname)                                                  \
+const NodeType *structname::node_type = structname::register_type<structname>(); \
+Node *structname::create(const NodeType*) { return new structname(); }           \
+template<typename T>                                                             \
+const NodeType *structname::register_type()
+
+/* Socket Definition Macros: each expects a type name `T` and a pointer `type` (with register_input/register_output) in scope. */
+/* Offset and size of member `name` of T, computed through the fake non-null pointer (T *)1. */
+#define SOCKET_OFFSETOF(T, name) (((char *)&(((T *)1)->name)) - (char *)1)
+#define SOCKET_SIZEOF(T, name) (sizeof(((T *)1)->name))
+#define SOCKET_DEFINE(name, ui_name, default_value, datatype, TYPE, flags, ...) \
+	{ \
+		static datatype defval = default_value; \
+		assert(SOCKET_SIZEOF(T, name) == sizeof(datatype)); \
+		type->register_input(ustring(#name), ustring(ui_name), TYPE, SOCKET_OFFSETOF(T, name), &defval, NULL, NULL, flags, ##__VA_ARGS__); \
+	}
+/* Plain inputs: SOCKET_DEFINE with flags = 0; ENUM and NODE register directly and forward optional flags via __VA_ARGS__. */
+#define SOCKET_BOOLEAN(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, bool, SocketType::BOOLEAN, 0, ##__VA_ARGS__)
+#define SOCKET_INT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, int, SocketType::INT, 0, ##__VA_ARGS__)
+#define SOCKET_FLOAT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float, SocketType::FLOAT, 0, ##__VA_ARGS__)
+#define SOCKET_COLOR(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::COLOR, 0, ##__VA_ARGS__)
+#define SOCKET_VECTOR(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::VECTOR, 0, ##__VA_ARGS__)
+#define SOCKET_POINT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::POINT, 0, ##__VA_ARGS__)
+#define SOCKET_NORMAL(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::NORMAL, 0, ##__VA_ARGS__)
+#define SOCKET_POINT2(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float2, SocketType::POINT2, 0, ##__VA_ARGS__)
+#define SOCKET_STRING(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, ustring, SocketType::STRING, 0, ##__VA_ARGS__)
+#define SOCKET_TRANSFORM(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, Transform, SocketType::TRANSFORM, 0, ##__VA_ARGS__)
+#define SOCKET_ENUM(name, ui_name, values, default_value, ...) \
+	{ \
+		static int defval = default_value; \
+		assert(SOCKET_SIZEOF(T, name) == sizeof(int)); \
+		type->register_input(ustring(#name), ustring(ui_name), SocketType::ENUM, SOCKET_OFFSETOF(T, name), &defval, &values, NULL, ##__VA_ARGS__); \
+	}
+#define SOCKET_NODE(name, ui_name, node_type, ...) \
+	{ \
+	    static Node *defval = NULL; \
+		assert(SOCKET_SIZEOF(T, name) == sizeof(Node*)); \
+		type->register_input(ustring(#name), ustring(ui_name), SocketType::NODE, SOCKET_OFFSETOF(T, name), &defval, NULL, node_type, ##__VA_ARGS__); \
+	}
+/* Array inputs: same as above with datatype array<...> and the *_ARRAY socket type, flags = 0. */
+#define SOCKET_BOOLEAN_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<bool>, SocketType::BOOLEAN_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_INT_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<int>, SocketType::INT_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_FLOAT_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float>, SocketType::FLOAT_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_COLOR_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float3>, SocketType::COLOR_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_VECTOR_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float3>, SocketType::VECTOR_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_POINT_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float3>, SocketType::POINT_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_NORMAL_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float3>, SocketType::NORMAL_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_POINT2_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<float2>, SocketType::POINT2_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_STRING_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<ustring>, SocketType::STRING_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_TRANSFORM_ARRAY(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, array<Transform>, SocketType::TRANSFORM_ARRAY, 0, ##__VA_ARGS__)
+#define SOCKET_NODE_ARRAY(name, ui_name, node_type, ...) \
+	{ /* NODE_ARRAY members are array<Node*> (as read by get_node_array()), so the default and size check use that type. */ \
+		static array<Node*> defval; \
+		assert(SOCKET_SIZEOF(T, name) == sizeof(array<Node*>)); \
+		type->register_input(ustring(#name), ustring(ui_name), SocketType::NODE_ARRAY, SOCKET_OFFSETOF(T, name), &defval, NULL, node_type, ##__VA_ARGS__); \
+	}
+
+#define SOCKET_IN_BOOLEAN(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, bool, SocketType::BOOLEAN, SocketType::LINKABLE, ##__VA_ARGS__) /* SOCKET_IN_*: like the plain macros, but flags = SocketType::LINKABLE */
+#define SOCKET_IN_INT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, int, SocketType::INT, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_FLOAT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float, SocketType::FLOAT, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_COLOR(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::COLOR, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_VECTOR(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::VECTOR, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_POINT(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::POINT, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_NORMAL(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, float3, SocketType::NORMAL, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_STRING(name, ui_name, default_value, ...) \
+	SOCKET_DEFINE(name, ui_name, default_value, ustring, SocketType::STRING, SocketType::LINKABLE, ##__VA_ARGS__)
+#define SOCKET_IN_CLOSURE(name, ui_name, ...) \
+	type->register_input(ustring(#name), ustring(ui_name), SocketType::CLOSURE, 0, NULL, NULL, NULL, SocketType::LINKABLE, ##__VA_ARGS__) /* closures: offset 0 and no default value; unlike the others this expands to a bare expression, not a { } block */
+
+#define SOCKET_OUT_BOOLEAN(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::BOOLEAN); } /* SOCKET_OUT_*: outputs register only name, UI name and socket type */
+#define SOCKET_OUT_INT(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::INT); }
+#define SOCKET_OUT_FLOAT(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::FLOAT); }
+#define SOCKET_OUT_COLOR(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::COLOR); }
+#define SOCKET_OUT_VECTOR(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::VECTOR); }
+#define SOCKET_OUT_POINT(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::POINT); }
+#define SOCKET_OUT_NORMAL(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::NORMAL); }
+#define SOCKET_OUT_CLOSURE(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::CLOSURE); }
+#define SOCKET_OUT_STRING(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::STRING); }
+#define SOCKET_OUT_ENUM(name, ui_name) \
+	{ type->register_output(ustring(#name), ustring(ui_name), SocketType::ENUM); }
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -0,0 +1,452 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "node_xml.h"
+
+#include "util_foreach.h"
+#include "util_string.h"
+#include "util_transform.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool xml_read_boolean(const char *value)
+{
+	return string_iequals(value, "true") || (atoi(value) != 0);  /* "true" per string_iequals(), or text atoi() reads as non-zero */
+}
+
+static const char *xml_write_boolean(bool value)
+{
+	return (value) ? "true" : "false";  /* string literals, so the returned pointer needs no freeing */
+}
+
+/* Fill value with floats parsed from attr, VECTOR_SIZE consecutive string_split() tokens per element. */
+template<int VECTOR_SIZE, typename T>
+static void xml_read_float_array(T& value, pugi::xml_attribute attr)
+{
+	vector<string> parts;
+	string_split(parts, attr.value());
+
+	/* Only a token count that fills whole elements is accepted; otherwise value is left as passed in. */
+	if(parts.size() % VECTOR_SIZE == 0) {
+		value.resize(parts.size() / VECTOR_SIZE);
+		size_t token = 0;
+		for(size_t elem = 0; elem < value.size(); elem++) {
+			float *components = (float*)&value[elem];
+			for(size_t c = 0; c < VECTOR_SIZE; c++, token++)
+				components[c] = (float)atof(parts[token].c_str());
+		}
+	}
+}
+
+/* Set each input socket of node from the XML attribute named after it (sockets without an attribute are skipped), then record node in reader.node_map. */
+void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node)
+{
+	pugi::xml_attribute name_attr = xml_node.attribute("name");
+	if(name_attr) {
+		node->name = ustring(name_attr.value());
+	}
+
+	foreach(const SocketType& socket, node->type->inputs) {
+		if(socket.type == SocketType::CLOSURE || socket.type == SocketType::UNDEFINED) {
+			continue;
+		}
+		if(socket.flags & SocketType::INTERNAL) {
+			continue;
+		}
+
+		pugi::xml_attribute attr = xml_node.attribute(socket.name.c_str());
+		if(!attr) {
+			continue;
+		}
+
+		switch(socket.type)
+		{
+			case SocketType::BOOLEAN:
+			{
+				node->set(socket, xml_read_boolean(attr.value()));
+				break;
+			}
+			case SocketType::BOOLEAN_ARRAY:
+			{
+				vector<string> tokens;
+				string_split(tokens, attr.value());
+
+				array<bool> value;
+				value.resize(tokens.size());
+				for(size_t i = 0; i < value.size(); i++)
+					value[i] = xml_read_boolean(tokens[i].c_str());
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::FLOAT:
+			{
+				node->set(socket, (float)atof(attr.value()));
+				break;
+			}
+			case SocketType::FLOAT_ARRAY:
+			{
+				array<float> value;
+				xml_read_float_array<1>(value, attr);
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::INT:
+			{
+				node->set(socket, (int)atoi(attr.value()));
+				break;
+			}
+			case SocketType::INT_ARRAY:
+			{
+				vector<string> tokens;
+				string_split(tokens, attr.value());
+
+				array<int> value;
+				value.resize(tokens.size());
+				for(size_t i = 0; i < value.size(); i++) {
+					value[i] = (int)atoi(tokens[i].c_str());  /* per token; parsing attr.value() here would repeat the first integer */
+				}
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::COLOR:
+			case SocketType::VECTOR:
+			case SocketType::POINT:
+			case SocketType::NORMAL:
+			{
+				array<float3> value;
+				xml_read_float_array<3>(value, attr);
+				if(value.size() == 1) {  /* exactly one float3, else the socket is left untouched */
+					node->set(socket, value[0]);
+				}
+				break;
+			}
+			case SocketType::COLOR_ARRAY:
+			case SocketType::VECTOR_ARRAY:
+			case SocketType::POINT_ARRAY:
+			case SocketType::NORMAL_ARRAY:
+			{
+				array<float3> value;
+				xml_read_float_array<3>(value, attr);
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::POINT2:
+			{
+				array<float2> value;
+				xml_read_float_array<2>(value, attr);
+				if(value.size() == 1) {
+					node->set(socket, value[0]);
+				}
+				break;
+			}
+			case SocketType::POINT2_ARRAY:
+			{
+				array<float2> value;
+				xml_read_float_array<2>(value, attr);
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::STRING:
+			{
+				node->set(socket, attr.value());
+				break;
+			}
+			case SocketType::ENUM:
+			{
+				ustring value(attr.value());
+				if(socket.enum_values->exists(value)) {
+					node->set(socket, value);
+				}
+				else {
+					fprintf(stderr, "Unknown value \"%s\" for attribute \"%s\".\n", value.c_str(), socket.name.c_str());
+				}
+				break;
+			}
+			case SocketType::STRING_ARRAY:
+			{
+				vector<string> tokens;
+				string_split(tokens, attr.value());
+
+				array<ustring> value;
+				value.resize(tokens.size());
+				for(size_t i = 0; i < value.size(); i++) {
+					value[i] = ustring(tokens[i]);
+				}
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::TRANSFORM:
+			{
+				array<Transform> value;
+				xml_read_float_array<16>(value, attr);
+				if(value.size() == 1) {
+					node->set(socket, value[0]);
+				}
+				break;
+			}
+			case SocketType::TRANSFORM_ARRAY:
+			{
+				array<Transform> value;
+				xml_read_float_array<16>(value, attr);
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::NODE:
+			{
+				ustring value(attr.value());
+				map<ustring, Node*>::iterator it = reader.node_map.find(value);
+				if(it != reader.node_map.end())
+				{
+					Node *value_node = it->second;
+					if(value_node->type == *(socket.node_type))  /* only nodes of the socket's declared type are accepted */
+						node->set(socket, it->second);
+				}
+				break;
+			}
+			case SocketType::NODE_ARRAY:
+			{
+				vector<string> tokens;
+				string_split(tokens, attr.value());
+
+				array<Node*> value;
+				value.resize(tokens.size());
+				for(size_t i = 0; i < value.size(); i++)
+				{
+					map<ustring, Node*>::iterator it = reader.node_map.find(ustring(tokens[i]));
+					if(it != reader.node_map.end())
+					{
+						Node *value_node = it->second;
+						value[i] = (value_node->type == *(socket.node_type)) ? value_node : NULL;
+					}
+					else
+					{
+						value[i] = NULL;
+					}
+				}
+				node->set(socket, value);
+				break;
+			}
+			case SocketType::CLOSURE:
+			case SocketType::UNDEFINED:
+				break;
+		}
+	}
+
+	if(node->name)
+		reader.node_map[node->name] = node;
+}
+
+pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root)
+{
+	pugi::xml_node xml_node = xml_root.append_child(node->type->name.c_str());
+	/* The new child element is named after the node type and always gets a "name" attribute. */
+	xml_node.append_attribute("name") = node->name.c_str();
+	/* Remaining attributes: one per input socket that passes the filters below and for which has_default_value() is false. */
+	foreach(const SocketType& socket, node->type->inputs) {
+		if(socket.type == SocketType::CLOSURE || socket.type == SocketType::UNDEFINED) {
+			continue;
+		}
+		if(socket.flags & SocketType::INTERNAL) {
+			continue;
+		}
+		if(node->has_default_value(socket)) {
+			continue;
+		}
+
+		pugi::xml_attribute attr = xml_node.append_attribute(socket.name.c_str());
+
+		switch(socket.type)
+		{
+			case SocketType::BOOLEAN:
+			{
+				attr = xml_write_boolean(node->get_bool(socket));
+				break;
+			}
+			case SocketType::BOOLEAN_ARRAY:
+			{
+				std::stringstream ss;
+				const array<bool>& value = node->get_bool_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << xml_write_boolean(value[i]);
+					if(i != value.size() - 1)
+						ss << " ";
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::FLOAT:
+			{
+				attr = (double)node->get_float(socket);
+				break;
+			}
+			case SocketType::FLOAT_ARRAY:
+			{
+				std::stringstream ss;
+				const array<float>& value = node->get_float_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << value[i];
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::INT:
+			{
+				attr = node->get_int(socket);
+				break;
+			}
+			case SocketType::INT_ARRAY:
+			{
+				std::stringstream ss;
+				const array<int>& value = node->get_int_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << value[i];
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::COLOR:
+			case SocketType::VECTOR:
+			case SocketType::POINT:
+			case SocketType::NORMAL:
+			{
+				float3 value = node->get_float3(socket);
+				attr = string_printf("%g %g %g", (double)value.x, (double)value.y, (double)value.z).c_str();
+				break;
+			}
+			case SocketType::COLOR_ARRAY:
+			case SocketType::VECTOR_ARRAY:
+			case SocketType::POINT_ARRAY:
+			case SocketType::NORMAL_ARRAY:
+			{
+				std::stringstream ss;
+				const array<float3>& value = node->get_float3_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << string_printf("%g %g %g", (double)value[i].x, (double)value[i].y, (double)value[i].z);
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::POINT2:
+			{
+				float2 value = node->get_float2(socket);
+				attr = string_printf("%g %g", (double)value.x, (double)value.y).c_str();
+				break;
+			}
+			case SocketType::POINT2_ARRAY:
+			{
+				std::stringstream ss;
+				const array<float2>& value = node->get_float2_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << string_printf("%g %g", (double)value[i].x, (double)value[i].y);
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::STRING:
+			case SocketType::ENUM:
+			{
+				attr = node->get_string(socket).c_str();
+				break;
+			}
+			case SocketType::STRING_ARRAY:
+			{
+				std::stringstream ss;
+				const array<ustring>& value = node->get_string_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					ss << value[i];
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::TRANSFORM:
+			{
+				Transform tfm = node->get_transform(socket);
+				std::stringstream ss;
+				for(int i = 0; i < 4; i++) {
+					ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
+					if(i != 3) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::TRANSFORM_ARRAY:
+			{
+				std::stringstream ss;
+				const array<Transform>& value = node->get_transform_array(socket);
+				for(size_t j = 0; j < value.size(); j++) {
+					const Transform& tfm = value[j];
+
+					for(int i = 0; i < 4; i++) {
+						ss << string_printf("%g %g %g %g", (double)tfm[i][0], (double)tfm[i][1], (double)tfm[i][2], (double)tfm[i][3]);
+						if(j != value.size() - 1 || i != 3) {
+							ss << " ";
+						}
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::NODE:
+			{
+				Node *value = node->get_node(socket);
+				if(value) {  /* NOTE(review): a NULL node leaves the attribute appended above without a value */
+					attr = value->name.c_str();
+				}
+				break;
+			}
+			case SocketType::NODE_ARRAY:
+			{
+				std::stringstream ss;
+				const array<Node*>& value = node->get_node_array(socket);
+				for(size_t i = 0; i < value.size(); i++) {
+					if(value[i]) {
+						ss << value[i]->name.c_str();
+					}
+					if(i != value.size() - 1) {
+						ss << " ";
+					}
+				}
+				attr = ss.str().c_str();
+				break;
+			}
+			case SocketType::CLOSURE:
+			case SocketType::UNDEFINED:
+				break;
+		}
+	}
+
+	return xml_node;  /* the element appended to xml_root */
+}
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/graph/node_xml.h
+++ b/intern/cycles/graph/node_xml.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "node.h"
+
+#include "util_map.h"
+#include "util_string.h"
+#include "util_xml.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct XMLReader {
+	map<ustring, Node*> node_map;  /* nodes read so far, keyed by name; xml_read_node() fills it and resolves NODE/NODE_ARRAY references from it */
+};
+
+void xml_read_node(XMLReader& reader, Node *node, pugi::xml_node xml_node);  /* set node's input sockets from xml_node's attributes */
+pugi::xml_node xml_write_node(Node *node, pugi::xml_node xml_root);  /* append node as a child of xml_root and return that child */
+
+CCL_NAMESPACE_END
+
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -67,6 +67,7 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernel.h
 	kernels/cpu/kernel_cpu.h
 	kernels/cpu/kernel_cpu_impl.h
+	kernels/cpu/kernel_cpu_image.h
 )

 set(SRC_CLOSURE_HEADERS
@@ -195,8 +196,8 @@ if(WITH_CYCLES_CUDA_BINARIES)

 	# CUDA version
 	execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT)
-	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR ${NVCC_OUT})
-	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR ${NVCC_OUT})
+	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" CUDA_VERSION_MAJOR "${NVCC_OUT}")
+	string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" CUDA_VERSION_MINOR "${NVCC_OUT}")
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")

 	# warn for other versions
@@ -233,6 +234,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			OUTPUT ${cuda_cubin}
 			COMMAND ${CUDA_NVCC_EXECUTABLE}
 					-arch=${arch}
+					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
 					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
--- a/intern/cycles/kernel/geom/geom_bvh.h
+++ b/intern/cycles/kernel/geom/geom_bvh.h
@@ -48,6 +48,28 @@ CCL_NAMESPACE_BEGIN

 #define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)

+/* Debugging helpers: counters on the `isect` in scope at the call site; they expand to nothing without __KERNEL_DEBUG__. */
+#ifdef __KERNEL_DEBUG__
+#  define BVH_DEBUG_INIT() \
+	do { \
+		isect->num_traversal_steps = 0; \
+		isect->num_traversed_instances = 0; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_STEP() \
+	do { \
+		++isect->num_traversal_steps; \
+	} while(0)
+#  define BVH_DEBUG_NEXT_INSTANCE() \
+	do { \
+		++isect->num_traversed_instances; \
+	} while(0)
+#else  /* __KERNEL_DEBUG__ */
+#  define BVH_DEBUG_INIT()
+#  define BVH_DEBUG_NEXT_STEP()
+#  define BVH_DEBUG_NEXT_INSTANCE()
+#endif  /* __KERNEL_DEBUG__ */
+
+
 /* Common QBVH functions. */
 #ifdef __QBVH__
 #  include "geom_qbvh.h"
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -74,10 +74,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	isect->prim = PRIM_NONE;
 	isect->object = OBJECT_NONE;

-#if defined(__KERNEL_DEBUG__)
-	isect->num_traversal_steps = 0;
-	isect->num_traversed_instances = 0;
-#endif
+	BVH_DEBUG_INIT();

 #if defined(__KERNEL_SSE2__)
 	const shuffle_swap_t shuf_identity = shuffle_swap_identity();
@@ -241,10 +238,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						--stackPtr;
 					}
 				}
-
-#if defined(__KERNEL_DEBUG__)
-				isect->num_traversal_steps++;
-#endif
+				BVH_DEBUG_NEXT_STEP();
 			}

 			/* if node is leaf, fetch triangle list */
@@ -266,9 +260,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
 									/* shadow ray early termination */
@@ -287,9 +279,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#  if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#  endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
 									/* shadow ray early termination */
@@ -310,9 +300,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#  if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#  endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
@@ -364,9 +352,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,

 					nodeAddr = kernel_tex_fetch(__object_node, object);

-#  if defined(__KERNEL_DEBUG__)
-					isect->num_traversed_instances++;
-#  endif
+					BVH_DEBUG_NEXT_INSTANCE();
 				}
 			}
 #endif  /* FEATURE(BVH_INSTANCING) */
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -538,7 +538,7 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
                                                      float *t_fac,
                                                      Transform *itfm)
 {
-	*t_fac /= len(transform_direction(itfm, ray->D));
+	*t_fac = 1.0f / len(transform_direction(itfm, ray->D));
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
--- a/intern/cycles/kernel/geom/geom_qbvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_qbvh_traversal.h
@@ -78,10 +78,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	isect->prim = PRIM_NONE;
 	isect->object = OBJECT_NONE;

-#if defined(__KERNEL_DEBUG__)
-	isect->num_traversal_steps = 0;
-	isect->num_traversed_instances = 0;
-#endif
+	BVH_DEBUG_INIT();

 	ssef tnear(0.0f), tfar(ray->t);
 	sse3f idir4(ssef(idir.x), ssef(idir.y), ssef(idir.z));
@@ -120,9 +117,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				int traverseChild;
 				ssef dist;

-#if defined(__KERNEL_DEBUG__)
-				isect->num_traversal_steps++;
-#endif
+				BVH_DEBUG_NEXT_STEP();

 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
 				if(difl != 0.0f) {
@@ -295,9 +290,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					switch(type & PRIMITIVE_ALL) {
 						case PRIMITIVE_TRIANGLE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								if(triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, primAddr)) {
 									tfar = ssef(isect->t);
@@ -311,9 +304,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 #if BVH_FEATURE(BVH_MOTION)
 						case PRIMITIVE_MOTION_TRIANGLE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#  if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#  endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								if(motion_triangle_intersect(kg, isect, P, dir, ray->time, visibility, object, primAddr)) {
 									tfar = ssef(isect->t);
@@ -329,9 +320,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						case PRIMITIVE_CURVE:
 						case PRIMITIVE_MOTION_CURVE: {
 							for(; primAddr < primAddr2; primAddr++) {
-#  if defined(__KERNEL_DEBUG__)
-								isect->num_traversal_steps++;
-#  endif
+								BVH_DEBUG_NEXT_STEP();
 								kernel_assert(kernel_tex_fetch(__prim_type, primAddr) == type);
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE)
@@ -381,9 +370,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,

 					nodeAddr = kernel_tex_fetch(__object_node, object);

-#  if defined(__KERNEL_DEBUG__)
-					isect->num_traversed_instances++;
-#  endif
+					BVH_DEBUG_NEXT_INSTANCE();
 				}
 			}
 #endif  /* FEATURE(BVH_INSTANCING) */
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -159,16 +159,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	if(kernel_tex_fetch(__prim_visibility, triAddr) & visibility)
 #endif
 	{
-#ifdef __KERNEL_GPU__
-		float4 a = tri_b - tri_a, b = tri_c - tri_a;
-		if(len_squared(make_float3(a.y*b.z - a.z*b.y,
-		                           a.z*b.x - a.x*b.z,
-		                           a.x*b.y - a.y*b.x)) == 0.0f)
-		{
+#ifdef __KERNEL_CUDA__
+		if(A == B && B == C) {
 			return false;
 		}
 #endif
-
 		/* Normalize U, V, W, and T. */
 		const float inv_det = 1.0f / det;
 		isect->prim = triAddr;
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -29,16 +29,16 @@ CCL_NAMESPACE_BEGIN

 /* Return position normalized to 0..1 in mesh bounds */

-#ifdef __KERNEL_GPU__
+#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ < 300
 ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 {
 	float4 r;
 	switch(id) {
-		case 0: r = kernel_tex_image_interp_3d(__tex_image_float3d_000, x, y, z); break;
-		case 1: r = kernel_tex_image_interp_3d(__tex_image_float3d_001, x, y, z); break;
-		case 2: r = kernel_tex_image_interp_3d(__tex_image_float3d_002, x, y, z); break;
-		case 3: r = kernel_tex_image_interp_3d(__tex_image_float3d_003, x, y, z); break;
-		case 4: r = kernel_tex_image_interp_3d(__tex_image_float3d_004, x, y, z); break;
+		case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
+		case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break;
+		case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break;
+		case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break;
+		case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break;
 	}
 	return r;
 }
@@ -65,7 +65,13 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+	float f = kernel_tex_image_interp_3d_float(tex, P.x, P.y, P.z);
+	float4 r = make_float4(f, f, f, 1.0);
+#  else
 	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
@@ -77,7 +83,6 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 	if(dx) *dx = 0.0f;
 	if(dy) *dy = 0.0f;

-	/* todo: support float textures to lower memory usage for single floats */
 	return average(float4_to_float3(r));
 }

@@ -85,7 +90,12 @@ ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *s
 {
 	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_GPU__
+#  if __CUDA_ARCH__ >= 300
+	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
+	float4 r = kernel_tex_image_interp_3d_float4(tex, P.x, P.y, P.z);
+#  else
 	float4 r = volume_image_texture_3d(id, P.x, P.y, P.z);
+#  endif
 #else
 	float4 r;
 	if(sd->flag & SD_VOLUME_CUBIC)
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -16,7 +16,7 @@

 CCL_NAMESPACE_BEGIN

-#ifndef __NO_BAKING__
+#ifdef __BAKING__

 ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, RNG rng,
                                   int pass_filter, int sample)
@@ -30,6 +30,9 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 	Ray ray;
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);

+	/* emission and indirect shader data memory used by various functions */
+	ShaderData emission_sd, indirect_sd;
+
 	ray.P = sd->P + sd->Ng;
 	ray.D = -sd->Ng;
 	ray.t = FLT_MAX;
@@ -41,7 +44,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);

 	/* init path state */
-	path_state_init(kg, &state, &rng, sample, NULL);
+	path_state_init(kg, &emission_sd, &state, &rng, sample, NULL);

 	/* evaluate surface shader */
 	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
@@ -56,7 +59,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian

 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
 		}

 		/* sample emission */
@@ -75,6 +78,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 			kernel_path_subsurface_init_indirect(&ss_indirect);
 			if(kernel_path_subsurface_scatter(kg,
 			                                  sd,
+			                                  &emission_sd,
 			                                  &L_sample,
 			                                  &state,
 			                                  &rng,
@@ -90,6 +94,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 					                                      &L_sample,
 					                                      &throughput);
 					kernel_path_indirect(kg,
+					                     &indirect_sd,
+					                     &emission_sd,
 					                     &rng,
 					                     &ray,
 					                     throughput,
@@ -105,14 +111,14 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian

 		/* sample light and BSDF */
 		if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
-			kernel_path_surface_connect_light(kg, &rng, sd, throughput, &state, &L_sample);
+			kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample);

 			if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
 #ifdef __LAMP_MIS__
 				state.ray_t = 0.0f;
 #endif
 				/* compute indirect light */
-				kernel_path_indirect(kg, &rng, &ray, throughput, 1, &state, &L_sample);
+				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);

 				/* sum and reset indirect light pass variables for the next samples */
 				path_radiance_sum_indirect(&L_sample);
@@ -126,7 +132,7 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian

 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_branched_path_ao(kg, sd, &L_sample, &state, &rng, throughput);
+			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
 		}

 		/* sample emission */
@@ -139,7 +145,8 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 		/* sample subsurface scattering */
 		if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
-			kernel_branched_path_subsurface_scatter(kg, sd, &L_sample, &state, &rng, &ray, throughput);
+			kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
+				&emission_sd, &L_sample, &state, &rng, &ray, throughput);
 		}
 #endif

@@ -150,13 +157,13 @@ ccl_device void compute_light_pass(KernelGlobals *kg, ShaderData *sd, PathRadian
 			if(kernel_data.integrator.use_direct_light) {
 				int all = kernel_data.integrator.sample_all_lights_direct;
 				kernel_branched_path_surface_connect_light(kg, &rng,
-					sd, &state, throughput, 1.0f, &L_sample, all);
+					sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
 			}
 #endif

 			/* indirect light */
 			kernel_branched_path_surface_indirect_light(kg, &rng,
-				sd, throughput, 1.0f, &state, &L_sample);
+				sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
 		}
 	}
 #endif
@@ -242,11 +249,11 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
 	}

 	if(is_direct) {
-		out += safe_divide_color(direct, color);
+		out += safe_divide_even_color(direct, color);
 	}

 	if(is_indirect) {
-		out += safe_divide_color(indirect, color);
+		out += safe_divide_even_color(indirect, color);
 	}

 	return out;
@@ -475,15 +482,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	}

 	/* write output */
-	float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+	const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+	const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;

-	if(sample == 0)
-		output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
-	else
-		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+	output[i] = (sample == 0)?  scaled_result: output[i] + scaled_result;
 }

-#endif  /* __NO_BAKING__ */
+#endif  /* __BAKING__ */

 ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
                                       ccl_global uint4 *input,
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -40,6 +40,7 @@
 #include "util_simd.h"
 #include "util_half.h"
 #include "util_types.h"
+#include "util_texture.h"

 #define ccl_addr_space

@@ -108,6 +109,19 @@ template<typename T> struct texture_image  {
 		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
 	}

+	ccl_always_inline float4 read(uchar r)  /* single-channel byte texel -> float4 */
+	{
+		float f = r*(1.0f/255.0f);  /* scale 0..255 to 0..1 */
+		return make_float4(f, f, f, 1.0f);  /* same value in x/y/z, w fixed at 1 */
+	}
+
+	ccl_always_inline float4 read(float r)
+	{
+		/* Replicates the scalar into x/y/z and sets w to 1.
+		 * TODO(dingto): Optimize this, so interpolation happens on float instead of float4 */
+		return make_float4(r, r, r, 1.0f);
+	}
+
 	ccl_always_inline int wrap_periodic(int x, int width)
 	{
 		x %= width;
@@ -470,6 +484,8 @@ typedef texture<uint> texture_uint;
 typedef texture<int> texture_int;
 typedef texture<uint4> texture_uint4;
 typedef texture<uchar4> texture_uchar4;
+typedef texture_image<float> texture_image_float;
+typedef texture_image<uchar> texture_image_uchar;
 typedef texture_image<float4> texture_image_float4;
 typedef texture_image<uchar4> texture_image_uchar4;

@@ -479,9 +495,10 @@ typedef texture_image<uchar4> texture_image_uchar4;
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_image_interp(tex, x, y) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp(x, y) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp(x, y))
-#define kernel_tex_image_interp_3d(tex, x, y, z) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d(x, y, z) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d(x, y, z))
-#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) ((tex < MAX_FLOAT_IMAGES) ? kg->texture_float_images[tex].interp_3d_ex(x, y, z, interpolation) : kg->texture_byte_images[tex - MAX_FLOAT_IMAGES].interp_3d_ex(x, y, z, interpolation))
+
+#define kernel_tex_image_interp(tex, x, y) kernel_tex_image_interp_impl(kg, tex, x, y)
+#define kernel_tex_image_interp_3d(tex, x, y, z) kernel_tex_image_interp_3d_impl(kg, tex, x, y, z)
+#define kernel_tex_image_interp_3d_ex(tex, x, y, z, interpolation) kernel_tex_image_interp_3d_ex_impl(kg, tex, x, y, z, interpolation)

 #define kernel_data (kg->__data)

--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -67,20 +67,29 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;

 /* Macros to handle different memory storage on different devices */

-/* In order to use full 6GB of memory on Titan cards, use arrays instead
- * of textures. On earlier cards this seems slower, but on Titan it is
- * actually slightly faster in tests. */
+/* On Fermi cards (4xx and 5xx), we use regular textures for both data and images.
+ * On Kepler (6xx) and above, we use Bindless Textures for images and arrays for data.
+ *
+ * Arrays are necessary in order to use the full VRAM on newer cards, and it's slightly faster.
+ * Using arrays on Fermi turned out to be slower. */
+
+/* Fermi */
 #if __CUDA_ARCH__ < 300
 #  define __KERNEL_CUDA_TEX_STORAGE__
-#endif
-
-#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #  define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+
+#  define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
+#  define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
+
+/* Kepler */
 #else
 #  define kernel_tex_fetch(t, index) t[(index)]
+
+#  define kernel_tex_image_interp_float4(t, x, y) tex2D<float4>(t, x, y)
+#  define kernel_tex_image_interp_float(t, x, y) tex2D<float>(t, x, y)
+#  define kernel_tex_image_interp_3d_float4(t, x, y, z) tex3D<float4>(t, x, y, z)
+#  define kernel_tex_image_interp_3d_float(t, x, y, z) tex3D<float>(t, x, y, z)
 #endif
-#define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
-#define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)

 #define kernel_data __data

--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -18,6 +18,7 @@ CCL_NAMESPACE_BEGIN

 /* Direction Emission */
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
+                                                ShaderData *emission_sd,
                                                LightSample *ls,
                                                ccl_addr_space PathState *state,
                                                float3 I,
@@ -26,12 +27,6 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
                                                float time)
 {
 	/* setup shading at emitter */
-#ifdef __SPLIT_KERNEL__
-	ShaderData *sd = kg->sd_input;
-#else
-	ShaderData sd_object;
-	ShaderData *sd = &sd_object;
-#endif
 	float3 eval;

 #ifdef __BACKGROUND_MIS__
@@ -46,28 +41,28 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.dP = differential3_zero();
 		ray.dD = dI;

-		shader_setup_from_background(kg, sd, &ray);
+		shader_setup_from_background(kg, emission_sd, &ray);

 		path_state_modify_bounce(state, true);
-		eval = shader_eval_background(kg, sd, state, 0, SHADER_CONTEXT_EMISSION);
+		eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION);
 		path_state_modify_bounce(state, false);
 	}
 	else
 #endif
 	{
-		shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time);
+		shader_setup_from_sample(kg, emission_sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time);

-		ls->Ng = ccl_fetch(sd, Ng);
+		ls->Ng = ccl_fetch(emission_sd, Ng);

 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
 		path_state_modify_bounce(state, true);
-		shader_eval_surface(kg, sd, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, emission_sd, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 		path_state_modify_bounce(state, false);

 		/* evaluate emissive closure */
-		if(ccl_fetch(sd, flag) & SD_EMISSION)
-			eval = shader_emissive_eval(kg, sd);
+		if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
+			eval = shader_emissive_eval(kg, emission_sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -79,6 +74,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,

 ccl_device_noinline bool direct_emission(KernelGlobals *kg,
                                         ShaderData *sd,
+                                         ShaderData *emission_sd,
                                         LightSample *ls,
                                         ccl_addr_space PathState *state,
                                         Ray *ray,
@@ -94,6 +90,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	/* evaluate closure */

 	float3 light_eval = direct_emissive_eval(kg,
+	                                         emission_sd,
 	                                         ls,
 	                                         state,
 	                                         -ls->D,
@@ -198,6 +195,7 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 /* Indirect Lamp Emission */

 ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
+                                                ShaderData *emission_sd,
                                                ccl_addr_space PathState *state,
                                                Ray *ray,
                                                float3 *emission)
@@ -225,6 +223,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
 #endif

 		float3 L = direct_emissive_eval(kg,
+		                                emission_sd,
 		                                &ls,
 		                                state,
 		                                -ray->D,
@@ -238,7 +237,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
 			Ray volume_ray = *ray;
 			volume_ray.t = ls.t;
 			float3 volume_tp = make_float3(1.0f, 1.0f, 1.0f);
-			kernel_volume_shadow(kg, state, &volume_ray, &volume_tp);
+			kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
 			L *= volume_tp;
 		}
 #endif
@@ -260,6 +259,7 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg,
 /* Indirect Background */

 ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
+                                               ShaderData *emission_sd,
                                               ccl_addr_space PathState *state,
                                               ccl_addr_space Ray *ray)
 {
@@ -280,19 +280,14 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 	/* evaluate background closure */
 #  ifdef __SPLIT_KERNEL__
 	Ray priv_ray = *ray;
-	shader_setup_from_background(kg, kg->sd_input, &priv_ray);
-
-	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, kg->sd_input, state, state->flag, SHADER_CONTEXT_EMISSION);
-	path_state_modify_bounce(state, false);
+	shader_setup_from_background(kg, emission_sd, &priv_ray);
 #  else
-	ShaderData sd;
-	shader_setup_from_background(kg, &sd, ray);
+	shader_setup_from_background(kg, emission_sd, ray);
+#  endif

 	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, &sd, state, state->flag, SHADER_CONTEXT_EMISSION);
+	float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION);
 	path_state_modify_bounce(state, false);
-#  endif

 #ifdef __BACKGROUND_MIS__
 	/* check if background light exists or if we should skip pdf */
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -31,12 +31,14 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif

-#  define MAX_BYTE_IMAGES   1024
-#  define MAX_FLOAT_IMAGES  1024
+struct Intersection;
+struct VolumeStep;

 typedef struct KernelGlobals {
-	texture_image_uchar4 texture_byte_images[MAX_BYTE_IMAGES];
-	texture_image_float4 texture_float_images[MAX_FLOAT_IMAGES];
+	texture_image_uchar4 texture_byte4_images[TEX_NUM_BYTE4_CPU];
+	texture_image_float4 texture_float4_images[TEX_NUM_FLOAT4_CPU];
+	texture_image_float texture_float_images[TEX_NUM_FLOAT_CPU];
+	texture_image_uchar texture_byte_images[TEX_NUM_BYTE_CPU];

 #  define KERNEL_TEX(type, ttype, name) ttype name;
 #  define KERNEL_IMAGE_TEX(type, ttype, name)
@@ -52,6 +54,14 @@ typedef struct KernelGlobals {
 	OSLThreadData *osl_tdata;
 #  endif

+	/* **** Run-time data ****  */
+
+	/* Heap-allocated storage for transparent shadows intersections. */
+	Intersection *transparent_shadow_intersections;
+
+	/* Storage for decoupled volume steps. */
+	VolumeStep *decoupled_volume_steps[2];
+	int decoupled_volume_steps_index;
 } KernelGlobals;

 #endif  /* __KERNEL_CPU__ */
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -175,7 +175,7 @@ ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
 #else
 	int m = float_to_int(sqrtf(N));
 #endif
-	int n = (N + m - 1)/m;
+	int n = (N - 1)/m + 1;
 	float invN = 1.0f/N;
 	float invm = 1.0f/m;
 	float invn = 1.0f/n;
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -291,24 +291,13 @@ ccl_device float background_portal_pdf(KernelGlobals *kg,
 		}
 		num_possible++;

-		float t = -(dot(P, dir) - dot(lightpos, dir)) / dot(direction, dir);
-		if(t <= 1e-4f) {
-			/* Either behind the portal or too close. */
-			continue;
-		}
-
 		float4 data1 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 1);
 		float4 data2 = kernel_tex_fetch(__light_data, (p + kernel_data.integrator.portal_offset)*LIGHT_SIZE + 2);

 		float3 axisu = make_float3(data1.y, data1.z, data1.w);
 		float3 axisv = make_float3(data2.y, data2.z, data2.w);

-		float3 hit = P + t*direction;
-		float3 inplane = hit - lightpos;
-		/* Skip if the the ray doesn't pass through portal. */
-		if(fabsf(dot(inplane, axisu) / dot(axisu, axisu)) > 0.5f)
-			continue;
-		if(fabsf(dot(inplane, axisv) / dot(axisv, axisv)) > 0.5f)
+		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL))
 			continue;

 		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
@@ -729,8 +718,8 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,

 		float3 light_P = make_float3(data0.y, data0.z, data0.w);

-		if(!ray_quad_intersect(P, D, t,
-		                       light_P, axisu, axisv, &ls->P, &ls->t))
+		if(!ray_quad_intersect(P, D, 0.0f, t,
+		                       light_P, axisu, axisv, Ng, &ls->P, &ls->t))
 		{
 			return false;
 		}
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -53,6 +53,8 @@
 CCL_NAMESPACE_BEGIN

 ccl_device void kernel_path_indirect(KernelGlobals *kg,
+                                     ShaderData *sd,
+                                     ShaderData *emission_sd,
                                     RNG *rng,
                                     Ray *ray,
                                     float3 throughput,
@@ -87,7 +89,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,

 			/* intersect with lamp */
 			float3 emission;
-			if(indirect_lamp_emission(kg, state, &light_ray, &emission)) {
+			if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
 				path_radiance_accum_emission(L,
 				                             throughput,
 				                             emission,
@@ -115,15 +117,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			if(decoupled) {
 				/* cache steps along volume for repeated sampling */
 				VolumeSegment volume_segment;
-				ShaderData volume_sd;

 				shader_setup_from_volume(kg,
-				                         &volume_sd,
+				                         sd,
 				                         &volume_ray);
 				kernel_volume_decoupled_record(kg,
 				                               state,
 				                               &volume_ray,
-				                               &volume_sd,
+				                               sd,
 				                               &volume_segment,
 				                               heterogeneous);

@@ -146,7 +147,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 					/* direct light sampling */
 					kernel_branched_path_volume_connect_light(kg,
 					                                          rng,
-					                                          &volume_sd,
+					                                          sd,
+					                                          emission_sd,
 					                                          throughput,
 					                                          state,
 					                                          L,
@@ -163,7 +165,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 					result = kernel_volume_decoupled_scatter(kg,
 					                                         state,
 					                                         &volume_ray,
-					                                         &volume_sd,
+					                                         sd,
 					                                         &throughput,
 					                                         rphase,
 					                                         rscatter,
@@ -178,7 +180,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				if(result == VOLUME_PATH_SCATTERED) {
 					if(kernel_path_volume_bounce(kg,
 					                             rng,
-					                             &volume_sd,
+					                             sd,
 					                             &throughput,
 					                             state,
 					                             L,
@@ -198,16 +200,16 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #  endif
 			{
 				/* integrate along volume segment with distance sampling */
-				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, state, &volume_sd, &volume_ray, L, &throughput, rng, heterogeneous);
+					kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);

 #  ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
 					kernel_path_volume_connect_light(kg,
 					                                 rng,
-					                                 &volume_sd,
+					                                 sd,
+					                                 emission_sd,
 					                                 throughput,
 					                                 state,
 					                                 L);
@@ -215,7 +217,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 					/* indirect light bounce */
 					if(kernel_path_volume_bounce(kg,
 					                             rng,
-					                             &volume_sd,
+					                             sd,
 					                             &throughput,
 					                             state,
 					                             L,
@@ -235,7 +237,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		if(!hit) {
 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, state, ray);
+			float3 L_background = indirect_background(kg, emission_sd, state, ray);
 			path_radiance_accum_background(L,
 			                               throughput,
 			                               L_background,
@@ -246,15 +248,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		}

 		/* setup shading */
-		ShaderData sd;
 		shader_setup_from_ray(kg,
-		                      &sd,
+		                      sd,
 		                      &isect,
 		                      ray);
 		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, &sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
+		shader_eval_surface(kg, sd, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
 #ifdef __BRANCHED_PATH__
-		shader_merge_closures(&sd);
+		shader_merge_closures(sd);
 #endif

 		/* blurring of bsdf after bounces, for rays that have a small likelihood
@@ -264,15 +265,15 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,

 			if(blur_pdf < 1.0f) {
 				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, &sd, blur_roughness);
+				shader_bsdf_blur(kg, sd, blur_roughness);
 			}
 		}

 #ifdef __EMISSION__
 		/* emission */
-		if(sd.flag & SD_EMISSION) {
+		if(sd->flag & SD_EMISSION) {
 			float3 emission = indirect_primitive_emission(kg,
-			                                              &sd,
+			                                              sd,
 			                                              isect.t,
 			                                              state->flag,
 			                                              state->ray_pdf);
@@ -302,33 +303,33 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,

 #ifdef __AO__
 		/* ambient occlusion */
-		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
+		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
 			float bsdf_u, bsdf_v;
 			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);

 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
-			float3 ao_bsdf = shader_bsdf_ao(kg, &sd, ao_factor, &ao_N);
+			float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
 			float3 ao_D;
 			float ao_pdf;
 			float3 ao_alpha = make_float3(0.0f, 0.0f, 0.0f);

 			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);

-			if(dot(sd.Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+			if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 				Ray light_ray;
 				float3 ao_shadow;

-				light_ray.P = ray_offset(sd.P, sd.Ng);
+				light_ray.P = ray_offset(sd->P, sd->Ng);
 				light_ray.D = ao_D;
 				light_ray.t = kernel_data.background.ao_distance;
 #  ifdef __OBJECT_MOTION__
-				light_ray.time = sd.time;
+				light_ray.time = sd->time;
 #  endif
-				light_ray.dP = sd.dP;
+				light_ray.dP = sd->dP;
 				light_ray.dD = differential3_zero();

-				if(!shadow_blocked(kg, state, &light_ray, &ao_shadow)) {
+				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
 					path_radiance_accum_ao(L,
 					                       throughput,
 					                       ao_alpha,
@@ -343,9 +344,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
-		if(sd.flag & SD_BSSRDF) {
+		if(sd->flag & SD_BSSRDF) {
 			float bssrdf_probability;
-			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, &sd, &bssrdf_probability);
+			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);

 			/* modify throughput for picking bssrdf or bsdf */
 			throughput *= bssrdf_probability;
@@ -361,7 +362,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 				                  PRNG_BSDF_U,
 				                  &bssrdf_u, &bssrdf_v);
 				subsurface_scatter_step(kg,
-				                        &sd,
+				                        sd,
 				                        state,
 				                        state->flag,
 				                        sc,
@@ -377,7 +378,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			int all = kernel_data.integrator.sample_all_lights_indirect;
 			kernel_branched_path_surface_connect_light(kg,
 			                                           rng,
-			                                           &sd,
+			                                           sd,
+			                                           emission_sd,
 			                                           state,
 			                                           throughput,
 			                                           1.0f,
@@ -386,13 +388,14 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		}
 #endif

-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, state, L, ray))
+		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
 			break;
 	}
 }

 ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                        ShaderData *sd,
+                                        ShaderData *emission_sd,
                                        PathRadiance *L,
                                        PathState *state,
                                        RNG *rng,
@@ -425,7 +428,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 		light_ray.dP = ccl_fetch(sd, dP);
 		light_ray.dD = differential3_zero();

-		if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
 			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
 	}
 }
@@ -435,6 +438,7 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 ccl_device bool kernel_path_subsurface_scatter(
        KernelGlobals *kg,
        ShaderData *sd,
+        ShaderData *emission_sd,
        PathRadiance *L,
        PathState *state,
        RNG *rng,
@@ -503,7 +507,7 @@ ccl_device bool kernel_path_subsurface_scatter(
 			hit_L->direct_throughput = L->direct_throughput;
 			path_radiance_copy_indirect(hit_L, L);

-			kernel_path_surface_connect_light(kg, rng, sd, *hit_tp, state, hit_L);
+			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);

 			if(kernel_path_surface_bounce(kg,
 			                              rng,
@@ -526,6 +530,7 @@ ccl_device bool kernel_path_subsurface_scatter(

 					kernel_volume_stack_update_for_subsurface(
 					    kg,
+					    emission_sd,
 					    &volume_ray,
 					    hit_state->volume_stack);
 				}
@@ -604,8 +609,13 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,

 	path_radiance_init(&L, kernel_data.film.use_light_pass);

+	/* shader data memory used for both volumes and surfaces, saves stack space */
+	ShaderData sd;
+	/* shader data used by emission, shadows, volume stacks */
+	ShaderData emission_sd;
+
 	PathState state;
-	path_state_init(kg, &state, rng, sample, &ray);
+	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);

 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
@@ -669,7 +679,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			/* intersect with lamp */
 			float3 emission;

-			if(indirect_lamp_emission(kg, &state, &light_ray, &emission))
+			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
 				path_radiance_accum_emission(&L, throughput, emission, state.bounce);
 		}
 #endif
@@ -689,11 +699,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			if(decoupled) {
 				/* cache steps along volume for repeated sampling */
 				VolumeSegment volume_segment;
-				ShaderData volume_sd;

-				shader_setup_from_volume(kg, &volume_sd, &volume_ray);
+				shader_setup_from_volume(kg, &sd, &volume_ray);
 				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+					&volume_ray, &sd, &volume_segment, heterogeneous);

 				volume_segment.sampling_method = sampling_method;

@@ -708,8 +717,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 					int all = false;

 					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-						throughput, &state, &L, all, &volume_ray, &volume_segment);
+					kernel_branched_path_volume_connect_light(kg, rng, &sd,
+						&emission_sd, throughput, &state, &L, all,
+						&volume_ray, &volume_segment);

 					/* indirect sample. if we use distance sampling and take just
 					 * one sample for direct and indirect light, we could share
@@ -718,7 +728,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);

 					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &volume_sd, &throughput,
+						&state, &volume_ray, &sd, &throughput,
 						rphase, rscatter, &volume_segment, NULL, true);
 				}

@@ -726,7 +736,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 				kernel_volume_decoupled_free(kg, &volume_segment);

 				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
 						continue;
 					else
 						break;
@@ -739,17 +749,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #  endif
 			{
 				/* integrate along volume segment with distance sampling */
-				ShaderData volume_sd;
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+					kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);

 #  ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);

 					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
 						continue;
 					else
 						break;
@@ -772,7 +781,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,

 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
+			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
 #endif

@@ -780,7 +789,6 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		}

 		/* setup shading */
-		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
 		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
@@ -848,7 +856,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &L, &state, rng, throughput);
+			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
 		}
 #endif

@@ -858,6 +866,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		if(sd.flag & SD_BSSRDF) {
 			if(kernel_path_subsurface_scatter(kg,
 			                                  &sd,
+			                                  &emission_sd,
 			                                  &L,
 			                                  &state,
 			                                  rng,
@@ -871,7 +880,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */

 		/* direct lighting */
-		kernel_path_surface_connect_light(kg, rng, &sd, throughput, &state, &L);
+		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);

 		/* compute direct lighting and next bounce */
 		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -18,7 +18,13 @@ CCL_NAMESPACE_BEGIN

 #ifdef __BRANCHED_PATH__

-ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathRadiance *L, PathState *state, RNG *rng, float3 throughput)
+ccl_device void kernel_branched_path_ao(KernelGlobals *kg,
+                                        ShaderData *sd,
+                                        ShaderData *emission_sd,
+                                        PathRadiance *L,
+                                        PathState *state,
+                                        RNG *rng,
+                                        float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
 	float num_samples_inv = 1.0f/num_samples;
@@ -49,7 +55,7 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathR
 			light_ray.dP = ccl_fetch(sd, dP);
 			light_ray.dD = differential3_zero();

-			if(!shadow_blocked(kg, state, &light_ray, &ao_shadow))
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
 				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
 		}
 	}
@@ -58,8 +64,8 @@ ccl_device void kernel_branched_path_ao(KernelGlobals *kg, ShaderData *sd, PathR

 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, float3 throughput, float num_samples_adjust,
-	PathState *state, PathRadiance *L)
+	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
 	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
 		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
@@ -106,6 +112,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			}

 			kernel_path_indirect(kg,
+			                     indirect_sd,
+			                     emission_sd,
 			                     rng,
 			                     &bsdf_ray,
 			                     tp*num_samples_inv,
@@ -124,6 +132,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 #ifdef __SUBSURFACE__
 ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                        ShaderData *sd,
+                                                        ShaderData *indirect_sd,
+                                                        ShaderData *emission_sd,
                                                        PathRadiance *L,
                                                        PathState *state,
                                                        RNG *rng,
@@ -186,6 +196,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,

 					kernel_volume_stack_update_for_subsurface(
 					    kg,
+					    emission_sd,
 					    &volume_ray,
 					    hit_state.volume_stack);
 				}
@@ -199,6 +210,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					        kg,
 					        rng,
 					        &bssrdf_sd,
+					        emission_sd,
 					        &hit_state,
 					        throughput,
 					        num_samples_inv,
@@ -212,6 +224,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				        kg,
 				        rng,
 				        &bssrdf_sd,
+				        indirect_sd,
+				        emission_sd,
 				        throughput,
 				        num_samples_inv,
 				        &hit_state,
@@ -231,8 +245,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in

 	path_radiance_init(&L, kernel_data.film.use_light_pass);

+	/* shader data memory used for both volumes and surfaces, saves stack space */
+	ShaderData sd;
+	/* shader data used by emission, shadows, volume stacks, indirect path */
+	ShaderData emission_sd, indirect_sd;
+
 	PathState state;
-	path_state_init(kg, &state, rng, sample, &ray);
+	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);

 #ifdef __KERNEL_DEBUG__
 	DebugData debug_data;
@@ -287,11 +306,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in

 			/* cache steps along volume for repeated sampling */
 			VolumeSegment volume_segment;
-			ShaderData volume_sd;

-			shader_setup_from_volume(kg, &volume_sd, &volume_ray);
+			shader_setup_from_volume(kg, &sd, &volume_ray);
 			kernel_volume_decoupled_record(kg, &state,
-				&volume_ray, &volume_sd, &volume_segment, heterogeneous);
+				&volume_ray, &sd, &volume_segment, heterogeneous);

 			/* direct light sampling */
 			if(volume_segment.closure_flag & SD_SCATTER) {
@@ -299,8 +317,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in

 				int all = kernel_data.integrator.sample_all_lights_direct;

-				kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
-					throughput, &state, &L, all, &volume_ray, &volume_segment);
+				kernel_branched_path_volume_connect_light(kg, rng, &sd,
+					&emission_sd, throughput, &state, &L, all,
+					&volume_ray, &volume_segment);

 				/* indirect light sampling */
 				int num_samples = kernel_data.integrator.volume_samples;
@@ -326,20 +345,22 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 					float rscatter = path_state_rng_1D_for_decision(kg, &tmp_rng, &ps, PRNG_SCATTER_DISTANCE);

 					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
-						&ps, &pray, &volume_sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
+						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);

 					(void)result;
 					kernel_assert(result == VOLUME_PATH_SCATTERED);

 					if(kernel_path_volume_bounce(kg,
 					                             rng,
-					                             &volume_sd,
+					                             &sd,
 					                             &tp,
 					                             &ps,
 					                             &L,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
+						                     &indirect_sd,
+						                     &emission_sd,
 						                     rng,
 						                     &pray,
 						                     tp*num_samples_inv,
@@ -373,30 +394,31 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			for(int j = 0; j < num_samples; j++) {
 				PathState ps = state;
 				Ray pray = ray;
-				ShaderData volume_sd;
 				float3 tp = throughput * num_samples_inv;

 				/* branch RNG state */
 				path_state_branch(&ps, j, num_samples);

 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &volume_sd, &volume_ray, &L, &tp, rng, heterogeneous);
+					kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);

 #ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: support equiangular, MIS and all light sampling.
 					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &volume_sd, tp, &state, &L);
+					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);

 					if(kernel_path_volume_bounce(kg,
 					                             rng,
-					                             &volume_sd,
+					                             &sd,
 					                             &tp,
 					                             &ps,
 					                             &L,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
+						                     &indirect_sd,
+						                     &emission_sd,
 						                     rng,
 						                     &pray,
 						                     tp,
@@ -414,7 +436,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}

 			/* todo: avoid this calculation using decoupled ray marching */
-			kernel_volume_shadow(kg, &state, &volume_ray, &throughput);
+			kernel_volume_shadow(kg, &emission_sd, &state, &volume_ray, &throughput);
 #endif
 		}
 #endif
@@ -432,7 +454,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in

 #ifdef __BACKGROUND__
 			/* sample background shader */
-			float3 L_background = indirect_background(kg, &state, &ray);
+			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
 			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
 #endif

@@ -440,7 +462,6 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		}

 		/* setup shading */
-		ShaderData sd;
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
 		shader_eval_surface(kg, &sd, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
 		shader_merge_closures(&sd);
@@ -499,15 +520,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
 		}
 #endif

 #ifdef __SUBSURFACE__
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
-			kernel_branched_path_subsurface_scatter(kg, &sd, &L, &state,
-			                                        rng, &ray, throughput);
+			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
+			                                        &L, &state, rng, &ray, throughput);
 		}
 #endif

@@ -519,13 +540,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			if(kernel_data.integrator.use_direct_light) {
 				int all = kernel_data.integrator.sample_all_lights_direct;
 				kernel_branched_path_surface_connect_light(kg, rng,
-					&sd, &hit_state, throughput, 1.0f, &L, all);
+					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
 			}
 #endif

 			/* indirect light */
 			kernel_branched_path_surface_indirect_light(kg, rng,
-				&sd, throughput, 1.0f, &hit_state, &L);
+				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);

 			/* continue in case of transparency */
 			throughput *= shader_bsdf_transparency(kg, &sd);
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -16,7 +16,12 @@

 CCL_NAMESPACE_BEGIN

-ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space RNG *rng, int sample, ccl_addr_space Ray *ray)
+ccl_device_inline void path_state_init(KernelGlobals *kg,
+                                       ShaderData *stack_sd,
+                                       ccl_addr_space PathState *state,
+                                       ccl_addr_space RNG *rng,
+                                       int sample,
+                                       ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;

@@ -41,7 +46,7 @@ ccl_device_inline void path_state_init(KernelGlobals *kg, ccl_addr_space PathSta

 	if(kernel_data.integrator.use_volumes) {
 		/* initialize volume stack with volume we are inside of */
-		kernel_volume_stack_init(kg, ray, state->volume_stack);
+		kernel_volume_stack_init(kg, stack_sd, ray, state->volume_stack);
 		/* seed RNG for cases where we can't use stratified samples */
 		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
 	}
@@ -131,9 +136,6 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	/* todo: this is not supported as its own ray visibility yet */
 	if(state->flag & PATH_RAY_VOLUME_SCATTER)
 		flag |= PATH_RAY_DIFFUSE;
-	/* for camera visibility, use render layer flags */
-	if(flag & PATH_RAY_CAMERA)
-		flag |= kernel_data.integrator.layer_flag;

 	return flag;
 }
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -20,7 +20,8 @@ CCL_NAMESPACE_BEGIN

 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
 ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, PathState *state, float3 throughput, float num_samples_adjust, PathRadiance *L, int sample_all_lights)
+	ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput,
+	float num_samples_adjust, PathRadiance *L, int sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
@@ -55,11 +56,11 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 				LightSample ls;
 				lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls);

-				if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
 					float3 shadow;

-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
@@ -87,11 +88,11 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 				LightSample ls;
 				light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

-				if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
 					float3 shadow;

-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
@@ -109,11 +110,11 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 		light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

 		/* sample random light */
-		if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 			/* trace shadow ray */
 			float3 shadow;

-			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
 			}
@@ -184,7 +185,8 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 #ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
 ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
-	ShaderData *sd, float3 throughput, ccl_addr_space PathState *state, PathRadiance *L)
+	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
+	PathRadiance *L)
 {
 #ifdef __EMISSION__
 	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
@@ -206,11 +208,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 	LightSample ls;
 	light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);

-	if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+	if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 		/* trace shadow ray */
 		float3 shadow;

-		if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 			/* accumulate */
 			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 		}
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
 #ifdef __VOLUME_SCATTER__

 ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L)
+	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -44,11 +44,11 @@ ccl_device void kernel_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
 	if(ls.pdf == 0.0f)
 		return;
 	
-	if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+	if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 		/* trace shadow ray */
 		float3 shadow;

-		if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 			/* accumulate */
 			path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 		}
@@ -106,7 +106,7 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 }

 ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 throughput, PathState *state, PathRadiance *L,
+	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L,
 	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
@@ -160,11 +160,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				if(ls.pdf == 0.0f)
 					continue;

-				if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
 					float3 shadow;

-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
@@ -211,11 +211,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 				if(ls.pdf == 0.0f)
 					continue;

-				if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+				if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 					/* trace shadow ray */
 					float3 shadow;

-					if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+					if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 						/* accumulate */
 						path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 					}
@@ -251,11 +251,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 			return;

 		/* sample random light */
-		if(direct_emission(kg, sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
 			/* trace shadow ray */
 			float3 shadow;

-			if(!shadow_blocked(kg, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 			}
--- a/Show More
+++ b/Show More