Buildbot: Port configuration to Blender2.7 branch

Fix (unreported) missing remapping of proxy_from pointer.
That would break proxy behavior after a library reload. The usual super-annoying loop-back pointers... At least that one is easily detectable and can be fixed in-place. Found while investigating T64764.
2019-06-27 12:41:52 +02:00 · 2019-05-23 10:19:30 +02:00 · 2019-05-22 21:10:36 +02:00 · 2019-05-22 21:08:51 +02:00 · 2019-05-04 15:12:55 +02:00 · 2019-04-25 14:05:33 +02:00
140 changed files with 3182 additions and 1204 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1517,6 +1517,8 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
 		# warning level:
 		"/W3"
 		"/w34062"  # switch statement contains 'default' but no 'case' labels
+		"/w34115"  # 'type' : named type definition in parentheses
+		"/w34189"  # local variable is initialized but not referenced
 		# disable:
 		"/wd4018"  # signed/unsigned mismatch
 		"/wd4146"  # unary minus operator applied to unsigned type, result still unsigned
--- a/build_files/build_environment/CMakeLists.txt
+++ b/build_files/build_environment/CMakeLists.txt
@@ -25,16 +25,16 @@
 # Windows and macOS, and install_deps.sh on Linux.
 #
 # WINDOWS USAGE:
-#   Don't call this cmake file your self, use build_deps.cmd
+#   Don't call this cmake file yourself, use build_deps.cmd
 #   build_deps 2013 x64 / build_deps 2013 x86
 #   build_deps 2015 x64 / build_deps 2015 x86
 #
 # MAC OS X USAGE:
-#	Install with homebrew: brew install autoconf automake libtool yasm nasm
+#   Install with homebrew: brew install cmake autoconf automake libtool yasm nasm
 #   Run "make deps" from main Blender directory
 #
 # LINUX USAGE:
-#   Install compiler, cmake, autoconf, automake, libtool, yasm
+#   Install compiler cmake autoconf automake libtool yasm nasm tcl
 #   Run "make deps" from main Blender directory
 #
 ####################################################################################################
--- a/build_files/build_environment/cmake/llvm.cmake
+++ b/build_files/build_environment/cmake/llvm.cmake
@@ -47,7 +47,8 @@ if(MSVC)
 		set(LLVM_HARVEST_COMMAND ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/llvm/ ${HARVEST_TARGET}/llvm/ )
 	else()
 		set(LLVM_HARVEST_COMMAND
-			${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/llvm/lib/ ${HARVEST_TARGET}/llvm/debug/lib/
+			${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/llvm/lib/ ${HARVEST_TARGET}/llvm/debug/lib/ &&
+			${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/llvm/include/ ${HARVEST_TARGET}/llvm/debug/include/
 		)
 	endif()
 	ExternalProject_Add_Step(ll after_install
--- a/build_files/build_environment/cmake/openal.cmake
+++ b/build_files/build_environment/cmake/openal.cmake
@@ -18,18 +18,31 @@

 if(BUILD_MODE STREQUAL Release)
 	set(OPENAL_EXTRA_ARGS
-		-DALSOFT_UTILS=Off
-		-DALSOFT_NO_CONFIG_UTIL=On
-		-DALSOFT_EXAMPLES=Off
-		-DALSOFT_TESTS=Off
-		-DALSOFT_CONFIG=Off
-		-DALSOFT_HRTF_DEFS=Off
-		-DALSOFT_INSTALL=On
-		-DALSOFT_BACKEND_SNDIO=Off
+		-DALSOFT_UTILS=OFF
+		-DALSOFT_NO_CONFIG_UTIL=ON
+		-DALSOFT_EXAMPLES=OFF
+		-DALSOFT_TESTS=OFF
+		-DALSOFT_CONFIG=OFF
+		-DALSOFT_HRTF_DEFS=OFF
+		-DALSOFT_INSTALL=ON
+		-DALSOFT_BACKEND_SNDIO=OFF
 	)

 	if(UNIX)
-		set(OPENAL_EXTRA_ARGS ${OPENAL_EXTRA_ARGS} -DLIBTYPE=STATIC)
+		set(OPENAL_EXTRA_ARGS
+			${OPENAL_EXTRA_ARGS}
+			-DLIBTYPE=STATIC
+		)
+	endif()
+
+	if(UNIX AND NOT APPLE)
+		# Ensure we have backends for playback.
+		set(OPENAL_EXTRA_ARGS
+			${OPENAL_EXTRA_ARGS}
+			-DALSOFT_REQUIRE_ALSA=ON
+			-DALSOFT_REQUIRE_OSS=ON
+			-DALSOFT_REQUIRE_PULSEAUDIO=ON
+		)
 	endif()

 	ExternalProject_Add(external_openal
--- a/build_files/build_environment/cmake/openmp.cmake
+++ b/build_files/build_environment/cmake/openmp.cmake
@@ -23,8 +23,8 @@ ExternalProject_Add(external_openmp
 	URL_HASH MD5=${OPENMP_HASH}
 	PREFIX ${BUILD_DIR}/openmp
 	CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/openmp ${DEFAULT_CMAKE_FLAGS}
-	INSTALL_COMMAND cd ${BUILD_DIR}/openmp/src/external_openmp-build && install_name_tool -id '@executable_path/../Resources/lib/libomp.dylib' runtime/src/libomp.dylib && make install
-	INSTALL_PATH ${LIBDIR}/openmp
+	INSTALL_COMMAND cd ${BUILD_DIR}/openmp/src/external_openmp-build && install_name_tool -id @executable_path/../Resources/lib/libomp.dylib runtime/src/libomp.dylib && make install
+	INSTALL_DIR ${LIBDIR}/openmp
 )

 add_dependencies(
--- a/build_files/build_environment/cmake/options.cmake
+++ b/build_files/build_environment/cmake/options.cmake
@@ -127,8 +127,7 @@ else()
 		)
 		set(OSX_ARCHITECTURES x86_64)
 		set(OSX_DEPLOYMENT_TARGET 10.9)
-		set(OSX_SDK_VERSION 10.13)
-		set(OSX_SYSROOT ${XCODE_DEV_PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX${OSX_SDK_VERSION}.sdk)
+		set(OSX_SYSROOT ${XCODE_DEV_PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk)

 		set(PLATFORM_CFLAGS "-isysroot ${OSX_SYSROOT} -mmacosx-version-min=${OSX_DEPLOYMENT_TARGET}")
 		set(PLATFORM_CXXFLAGS "-isysroot ${OSX_SYSROOT} -mmacosx-version-min=${OSX_DEPLOYMENT_TARGET} -std=c++11 -stdlib=libc++")
--- a/build_files/build_environment/cmake/osl.cmake
+++ b/build_files/build_environment/cmake/osl.cmake
@@ -81,11 +81,6 @@ if(WIN32)
 		${OSL_EXTRA_ARGS}
 		-DPUGIXML_HOME=${LIBDIR}/pugixml
 	)
-elseif(UNIX AND NOT APPLE)
-	set(OSL_EXTRA_ARGS
-		${OSL_EXTRA_ARGS}
-		-DPUGIXML_HOME=${LIBDIR}/pugixml
-	)
 elseif(APPLE)
 	# Make symbol hiding consistent with OIIO which defaults to OFF,
 	# avoids linker warnings on macOS
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -251,7 +251,7 @@ set(JEMALLOC_URI https://github.com/jemalloc/jemalloc/releases/download/${JEMALL
 set(JEMALLOC_HASH 507f7b6b882d868730d644510491d18f)

 set(XML2_VERSION 2.9.4)
-set(XML2_URI ftp://xmlsoft.org/libxml2/libxml2-${XML2_VERSION}.tar.gz)
+set(XML2_URI http://xmlsoft.org/sources/libxml2-${XML2_VERSION}.tar.gz)
 set(XML2_HASH ae249165c173b1ff386ee8ad676815f5)

 set(TINYXML_VERSION 2_6_2)
@@ -284,7 +284,7 @@ set(BZIP2_URI http://http.debian.net/debian/pool/main/b/bzip2/bzip2_${BZIP2_VERS
 set(BZIP2_HASH d70a9ccd8bdf47e302d96c69fecd54925f45d9c7b966bb4ef5f56b770960afa7)

 set(FFI_VERSION 3.2.1)
-set(FFI_URI ftp://sourceware.org/pub/libffi/libffi-${FFI_VERSION}.tar.gz)
+set(FFI_URI https://sourceware.org/pub/libffi/libffi-${FFI_VERSION}.tar.gz)
 set(FFI_HASH d06ebb8e1d9a22d19e38d63fdb83954253f39bedc5d46232a05645685722ca37)

 set(LZMA_VERSION 5.2.4)
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -4529,18 +4529,33 @@ print_info() {
      PRINT "  $_1"
      _buildargs="$_buildargs $_1"
    fi
+    if [ -d $INST/blosc ]; then
+      _1="-D BLOSC_ROOT_DIR=$INST/blosc"
+      PRINT "  $_1"
+      _buildargs="$_buildargs $_1"
+    fi
  fi

  if [ "$WITH_OPENCOLLADA" = true ]; then
    _1="-D WITH_OPENCOLLADA=ON"
    PRINT "  $_1"
    _buildargs="$_buildargs $_1"
+    if [ -d $INST/opencollada ]; then
+      _1="-D OPENCOLLADA_ROOT_DIR=$INST/opencollada"
+      PRINT "  $_1"
+      _buildargs="$_buildargs $_1"
+    fi
  fi

  if [ "$WITH_EMBREE" = true ]; then
    _1="-D WITH_CYCLES_EMBREE=ON"
    PRINT "  $_1"
    _buildargs="$_buildargs $_1"
+    if [ -d $INST/embree ]; then
+      _1="-D EMBREE_ROOT_DIR=$INST/embree"
+      PRINT "  $_1"
+      _buildargs="$_buildargs $_1"
+    fi
  fi

  if [ "$WITH_JACK" = true ]; then
--- a/build_files/buildbot/config/blender_linux.cmake
+++ b/build_files/buildbot/config/blender_linux.cmake
@@ -2,33 +2,22 @@

 include("${CMAKE_CURRENT_LIST_DIR}/../../cmake/config/blender_release.cmake")

-# Detect which libc we'll be linking against.
-# Some of the paths will depend on this
-
+# For libc-2.24 we are using chroot which runs on a 64bit system.
+# There we cannot use a CPU bitness check since it is always 64bit. So instead
+# we check for specific libraries.
+#
+# The other builders run in a bare virtual machine, and the libraries
+# are installed to /opt/.
+# We assume that only 64bit builders exist in such a configuration.
 if(EXISTS "/lib/x86_64-linux-gnu/libc-2.24.so")
 	message(STATUS "Building in GLibc-2.24 environment")
-	set(GLIBC "2.24")
-	set(MULTILIB "/x86_64-linux-gnu")
 	set(LIBDIR_NAME "linux_x86_64")
 elseif(EXISTS "/lib/i386-linux-gnu//libc-2.24.so")
 	message(STATUS "Building in GLibc-2.24 environment")
-	set(GLIBC "2.24")
-	set(MULTILIB "/i386-linux-gnu")
 	set(LIBDIR_NAME "linux_i686")
-elseif(EXISTS "/lib/x86_64-linux-gnu/libc-2.19.so")
-	message(STATUS "Building in GLibc-2.19 environment")
-	set(GLIBC "2.19")
-	set(MULTILIB "/x86_64-linux-gnu")
-elseif(EXISTS "/lib/i386-linux-gnu//libc-2.19.so")
-	message(STATUS "Building in GLibc-2.19 environment")
-	set(GLIBC "2.19")
-	set(MULTILIB "/i386-linux-gnu")
-elseif(EXISTS "/lib/libc-2.11.3.so")
-	message(STATUS "Building in GLibc-2.11 environment")
-	set(GLIBC "2.11")
-	set(MULTILIB "")
 else()
-	message(FATAL_ERROR "Unknown build environment")
+	message(STATUS "Building in generic 64bit environment")
+	set(LIBDIR_NAME "linux_x86_64")
 endif()

 # Default to only build Blender, not the player
@@ -52,122 +41,8 @@ set(WITH_PYTHON_INSTALL_REQUESTS ON CACHE BOOL "" FORCE)

 # ######## Release environment specific settings ########

-if (NOT ${GLIBC} STREQUAL "2.24")
-
-# All the hardcoded library paths and such
-
-# LLVM libraries
-set(LLVM_VERSION             "3.4"  CACHE STRING "" FORCE)
-set(LLVM_ROOT_DIR            "/opt/lib/llvm-${LLVM_VERSION}"  CACHE STRING "" FORCE)
-set(LLVM_STATIC              ON  CACHE BOOL "" FORCE)
-
-# BOOST libraries
-set(BOOST_ROOT               "/opt/lib/boost" CACHE STRING "" FORCE)
-set(Boost_USE_STATIC_LIBS    ON CACHE BOOL "" FORCE)
-
-# FFmpeg libraries
-set(FFMPEG                   "/opt/lib/ffmpeg" CACHE STRING "" FORCE)
-set(FFMPEG_LIBRARIES
-	avdevice avformat avcodec avutil avfilter swscale swresample
-	/usr/lib${MULTILIB}/libxvidcore.a
-	/usr/lib${MULTILIB}/libx264.a
-	/usr/lib${MULTILIB}/libmp3lame.a
-	/usr/lib${MULTILIB}/libvpx.a
-	/usr/lib${MULTILIB}/libvorbis.a
-	/usr/lib${MULTILIB}/libogg.a
-	/usr/lib${MULTILIB}/libvorbisenc.a
-	/usr/lib${MULTILIB}/libtheora.a
-	/usr/lib${MULTILIB}/libschroedinger-1.0.a
-	/usr/lib${MULTILIB}/liborc-0.4.a
-	CACHE STRING "" FORCE
-)
-
-# SndFile libraries
-set(SNDFILE_LIBRARY          "/usr/lib${MULTILIB}/libsndfile.a;/usr/lib${MULTILIB}/libFLAC.a" CACHE STRING "" FORCE)
-
-# OpenAL libraries
-set(OPENAL_ROOT_DIR           "/opt/lib/openal" CACHE STRING "" FORCE)
-set(OPENAL_INCLUDE_DIR        "${OPENAL_ROOT_DIR}/include/AL" CACHE STRING "" FORCE)
-set(OPENAL_LIBRARY
-	${OPENAL_ROOT_DIR}/lib/libopenal.a
-	${OPENAL_ROOT_DIR}/lib/libcommon.a
-	CACHE STRING "" FORCE
-)
-
-# OpenCollada libraries
-set(OPENCOLLADA_UTF_LIBRARY   ""                              CACHE STRING "" FORCE)
-set(PCRE_INCLUDE_DIR          "/usr/include"                  CACHE STRING "" FORCE)
-set(PCRE_LIBRARY              "/usr/lib${MULTILIB}/libpcre.a" CACHE STRING "" FORCE)
-set(XML2_INCLUDE_DIR          "/usr/include"                  CACHE STRING "" FORCE)
-set(XML2_LIBRARY              "/usr/lib${MULTILIB}/libxml2.a" CACHE STRING "" FORCE)
-
-# OpenColorIO libraries
-set(OPENCOLORIO_ROOT_DIR      "/opt/lib/ocio" CACHE STRING "" FORCE)
-set(OPENCOLORIO_OPENCOLORIO_LIBRARY "${OPENCOLORIO_ROOT_DIR}/lib/libOpenColorIO.a" CACHE STRING "" FORCE)
-set(OPENCOLORIO_TINYXML_LIBRARY "${OPENCOLORIO_ROOT_DIR}/lib/libtinyxml.a"         CACHE STRING "" FORCE)
-set(OPENCOLORIO_YAML-CPP_LIBRARY "${OPENCOLORIO_ROOT_DIR}/lib/libyaml-cpp.a"       CACHE STRING "" FORCE)
-
-# Freetype
-set(FREETYPE_INCLUDE_DIRS "/usr/include/freetype2"       CACHE STRING "" FORCE)
-set(FREETYPE_LIBRARY "/usr/lib${MULTILIB}/libfreetype.a" CACHE STRING "" FORCE)
-
-# OpenImageIO
-if(GLIBC EQUAL "2.19")
-	set(OPENIMAGEIO_LIBRARY
-		/opt/lib/oiio/lib/libOpenImageIO.a
-		/opt/lib/oiio/lib/libOpenImageIO_Util.a
-		/usr/lib${MULTILIB}/libwebp.a
-		/usr/lib${MULTILIB}/liblzma.a
-		/usr/lib${MULTILIB}/libjbig.a
-		${FREETYPE_LIBRARY}
-		CACHE STRING "" FORCE
-	)
-endif()
-
-# OpenSubdiv libraries
-set(OPENSUBDIV_ROOT_DIR "/opt/lib/opensubdiv" CACHE STRING "" FORCE)
-set(OPENSUBDIV_OSDCPU_LIBRARY "${OPENSUBDIV_ROOT_DIR}/lib/libosdCPU.a" CACHE STRING "" FORCE)
-set(OPENSUBDIV_OSDGPU_LIBRARY "${OPENSUBDIV_ROOT_DIR}/lib/libosdGPU.a" CACHE STRING "" FORCE)
-
-# OpenEXR libraries
-set(OPENEXR_ROOT_DIR          "/opt/lib/openexr"                    CACHE STRING "" FORCE)
-set(OPENEXR_HALF_LIBRARY      "/opt/lib/openexr/lib/libHalf.a"      CACHE STRING "" FORCE)
-set(OPENEXR_IEX_LIBRARY       "/opt/lib/openexr/lib/libIex.a"       CACHE STRING "" FORCE)
-set(OPENEXR_ILMIMF_LIBRARY    "/opt/lib/openexr/lib/libIlmImf.a"    CACHE STRING "" FORCE)
-set(OPENEXR_ILMTHREAD_LIBRARY "/opt/lib/openexr/lib/libIlmThread.a" CACHE STRING "" FORCE)
-set(OPENEXR_IMATH_LIBRARY     "/opt/lib/openexr/lib/libImath.a"     CACHE STRING "" FORCE)
-
-# JeMalloc library
-set(JEMALLOC_LIBRARY    "/opt/lib/jemalloc/lib/libjemalloc.a" CACHE STRING "" FORCE)
-
-# Space navigation
-set(SPACENAV_ROOT_DIR       "/opt/lib/libspnav" CACHE STRING "" FORCE)
-
-# Force some system libraries to be static
-set(FFTW3_LIBRARY       "/usr/lib${MULTILIB}/libfftw3.a" CACHE STRING "" FORCE)
-set(JPEG_LIBRARY        "/usr/lib${MULTILIB}/libjpeg.a"  CACHE STRING "" FORCE)
-set(PNG_LIBRARY         "/usr/lib${MULTILIB}/libpng.a"   CACHE STRING "" FORCE)
-set(TIFF_LIBRARY        "/usr/lib${MULTILIB}/libtiff.a"  CACHE STRING "" FORCE)
-set(ZLIB_LIBRARY        "/usr/lib${MULTILIB}/libz.a"     CACHE STRING "" FORCE)
-
-# OpenVDB
-set(OPENVDB_LIBRARY
-	/opt/lib/openvdb/lib/libopenvdb.a
-	CACHE BOOL "" FORCE
-)
-
-set(BLOSC_LIBRARY
-	/opt/lib/blosc/lib/libblosc.a
-	CACHE BOOL "" FORCE
-)
-
-else()
-
 set(LIBDIR "/opt/blender-deps/${LIBDIR_NAME}" CACHE BOOL "" FORCE)

-# TODO(sergey): Remove once Python is oficially bumped to 3.7.
-set(PYTHON_VERSION    3.7 CACHE BOOL "" FORCE)
-
 # Platform specific configuration, to ensure static linking against everything.

 set(Boost_USE_STATIC_LIBS    ON CACHE BOOL "" FORCE)
@@ -178,7 +53,5 @@ set(Boost_USE_STATIC_LIBS    ON CACHE BOOL "" FORCE)
 set(PCRE_INCLUDE_DIR          "/usr/include"                        CACHE STRING "" FORCE)
 set(PCRE_LIBRARY              "${LIBDIR}/opencollada/lib/libpcre.a" CACHE STRING "" FORCE)

-endif()
-
 # Additional linking libraries
 set(CMAKE_EXE_LINKER_FLAGS   "-lrt -static-libstdc++ -no-pie"  CACHE STRING "" FORCE)
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -52,21 +52,17 @@ if 'cmake' in builder:
    build_dir = os.path.abspath(os.path.join('..', 'build', builder))
    install_dir = os.path.abspath(os.path.join('..', 'install', builder))
    targets = ['blender']
+    command_prefix = []

-    chroot_name = None  # If not None command will be delegated to that chroot
-    cuda_chroot_name = None  # If not None cuda compilationcommand will be delegated to that chroot
-    build_cubins = True  # Whether to build Cycles CUDA kernels
    bits = 64

    # Config file to be used (relative to blender's sources root)
    cmake_config_file = "build_files/cmake/config/blender_release.cmake"
    cmake_player_config_file = None
-    cmake_cuda_config_file = None

    # Set build options.
    cmake_options = []
    cmake_extra_options = ['-DCMAKE_BUILD_TYPE:STRING=Release']
-    cuda_cmake_options = []

    if builder.startswith('mac'):
        # Set up OSX architecture
@@ -75,94 +71,42 @@ if 'cmake' in builder:
        cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.9')

    elif builder.startswith('win'):
-        if builder.endswith('_vs2017'):
-            if builder.startswith('win64'):
-                cmake_options.extend(['-G', 'Visual Studio 15 2017 Win64'])
-            elif builder.startswith('win32'):
-                bits = 32
-                cmake_options.extend(['-G', 'Visual Studio 15 2017'])
-        elif builder.endswith('_vc2015'):
-            if builder.startswith('win64'):
-                cmake_options.extend(['-G', 'Visual Studio 14 2015 Win64'])
-            elif builder.startswith('win32'):
-                bits = 32
-                cmake_options.extend(['-G', 'Visual Studio 14 2015'])
-            cmake_extra_options.append('-DCUDA_NVCC_FLAGS=--cl-version;2013;' +
-                '--compiler-bindir;C:\\Program Files (x86)\\Microsoft Visual Studio 12.0\\VC\\bin')
-        else:
-            if builder.startswith('win64'):
-                cmake_options.extend(['-G', 'Visual Studio 12 2013 Win64'])
-            elif builder.startswith('win32'):
-                bits = 32
-                cmake_options.extend(['-G', 'Visual Studio 12 2013'])
+        if builder.startswith('win64'):
+            cmake_options.extend(['-G', 'Visual Studio 15 2017 Win64'])
+        elif builder.startswith('win32'):
+            bits = 32
+            cmake_options.extend(['-G', 'Visual Studio 15 2017'])

    elif builder.startswith('linux'):
+        cmake_config_file = "build_files/buildbot/config/blender_linux.cmake"
+        cmake_player_config_file = "build_files/buildbot/config/blender_linux_player.cmake"
+        targets = ['player', 'blender']
        tokens = builder.split("_")
        glibc = tokens[1]
        if glibc == 'glibc224':
            deb_name = "stretch"
-        elif glibc == 'glibc219':
-            deb_name = "jessie"
-        elif glibc == 'glibc211':
-            deb_name = "squeeze"
-        cmake_config_file = "build_files/buildbot/config/blender_linux.cmake"
-        cmake_player_config_file = "build_files/buildbot/config/blender_linux_player.cmake"
-        if builder.endswith('x86_64_cmake'):
-            chroot_name = 'buildbot_' + deb_name + '_x86_64'
-            targets = ['player', 'blender']
-        elif builder.endswith('i686_cmake'):
-            bits = 32
-            chroot_name = 'buildbot_' + deb_name + '_i686'
-            targets = ['player', 'blender']
-        if deb_name != "stretch":
-            cmake_extra_options.extend(["-DCMAKE_C_COMPILER=/usr/bin/gcc-7",
-                                        "-DCMAKE_CXX_COMPILER=/usr/bin/g++-7"])
-
-    # Workaround to build only sm_7x kernels with CUDA 10, until
-    # older kernels work well with this version.
-    if builder.startswith('win'):
-        cmake_extra_options.append('-DCUDA_VERSION=9.1')
-        cmake_extra_options.append('-DCUDA_TOOLKIT_INCLUDE:PATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.1/include')
-        cmake_extra_options.append('-DCUDA_TOOLKIT_ROOT_DIR:PATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.1')
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.1/bin/nvcc.exe')
-        cmake_extra_options.append('-DCUDA10_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0/bin/nvcc.exe')
-        cmake_extra_options.append('-DCUDA10_TOOLKIT_ROOT_DIR:PATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0')
-    elif builder.startswith('linux'):
-        cmake_extra_options.append('-DCUDA_VERSION=9.1')
-        cmake_extra_options.append('-DCUDA_TOOLKIT_INCLDUE:PATH=/usr/local/cuda-9.1/include')
-        cmake_extra_options.append('-DCUDA_TOOLKIT_ROOT_DIR:PATH=/usr/local/cuda-9.1')
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=/usr/local/cuda-9.1/bin/nvcc')
-        cmake_extra_options.append('-DCUDA10_NVCC_EXECUTABLE:FILEPATH=/usr/local/cuda-10.0/bin/nvcc')
-        cmake_extra_options.append('-DCUDA10_TOOLKIT_ROOT_DIR:PATH=/usr/local/cuda-10.0')
+            if builder.endswith('x86_64_cmake'):
+                chroot_name = 'buildbot_' + deb_name + '_x86_64'
+            elif builder.endswith('i686_cmake'):
+                bits = 32
+                chroot_name = 'buildbot_' + deb_name + '_i686'
+            command_prefix = ['schroot', '-c', chroot_name, '--']
+        elif glibc == 'glibc217':
+            command_prefix = ['scl', 'enable', 'devtoolset-6', '--']

    cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file))

    # Prepare CMake options needed to configure cuda binaries compilation, 64bit only.
    if bits == 64:
-        cuda_cmake_options.append("-DWITH_CYCLES_CUDA_BINARIES=%s" % ('ON' if build_cubins else 'OFF'))
-        if build_cubins or 'cuda' in targets:
-            cuda_cmake_options.append("-DCUDA_64_BIT_DEVICE_CODE=ON")
-
-        # Only modify common cmake options if cuda doesn't require separate target.
-        if 'cuda' not in targets:
-            cmake_options += cuda_cmake_options
+        cmake_options.append("-DWITH_CYCLES_CUDA_BINARIES=ON")
+        cmake_options.append("-DCUDA_64_BIT_DEVICE_CODE=ON")
    else:
-        cuda_cmake_options.append("-DWITH_CYCLES_CUDA_BINARIES=OFF")
+        cmake_options.append("-DWITH_CYCLES_CUDA_BINARIES=OFF")

    cmake_options.append("-DCMAKE_INSTALL_PREFIX=%s" % (install_dir))

    cmake_options += cmake_extra_options

-    # Prepare chroot command prefix if needed
-    if chroot_name:
-        chroot_prefix = ['schroot', '-c', chroot_name, '--']
-    else:
-        chroot_prefix = []
-    if cuda_chroot_name:
-        cuda_chroot_prefix = ['schroot', '-c', cuda_chroot_name, '--']
-    else:
-        cuda_chroot_prefix = chroot_prefix[:]
-
    # Make sure no garbage remained from the previous run
    if os.path.isdir(install_dir):
        shutil.rmtree(install_dir)
@@ -171,7 +115,7 @@ if 'cmake' in builder:
        print("Building target %s" % (target))
        # Construct build directory name based on the target
        target_build_dir = build_dir
-        target_chroot_prefix = chroot_prefix[:]
+        target_command_prefix = command_prefix[:]
        if target != 'blender':
            target_build_dir += '_' + target
        target_name = 'install'
@@ -179,21 +123,13 @@ if 'cmake' in builder:
        target_cmake_options = cmake_options[:]
        if target == 'player':
            target_cmake_options.append("-C" + os.path.join(blender_dir, cmake_player_config_file))
-        elif target == 'cuda':
-            target_cmake_options += cuda_cmake_options
-            target_chroot_prefix = cuda_chroot_prefix[:]
-            target_name = 'cycles_kernel_cuda'
-        # If cuda binaries are compiled as a separate target, make sure
-        # other targets don't compile cuda binaries.
-        if 'cuda' in targets and target != 'cuda':
-            target_cmake_options.append("-DWITH_CYCLES_CUDA_BINARIES=OFF")
        # Do extra git fetch because not all platform/git/buildbot combinations
        # update the origin remote, causing buildinfo to detect local changes.
        os.chdir(blender_dir)
        print("Fetching remotes")
        command = ['git', 'fetch', '--all']
        print(command)
-        retcode = subprocess.call(target_chroot_prefix + command)
+        retcode = subprocess.call(target_command_prefix + command)
        if retcode != 0:
            sys.exit(retcode)
        # Make sure build directory exists and enter it
@@ -206,7 +142,13 @@ if 'cmake' in builder:
        if os.path.exists('CMakeCache.txt'):
            print("Removing CMake cache")
            os.remove('CMakeCache.txt')
-        retcode = subprocess.call(target_chroot_prefix + ['cmake', blender_dir] + target_cmake_options)
+        # Remove buildinfo files to force buildbot to re-generate them.
+        for buildinfo in ('buildinfo.h', 'buildinfo.h.txt', ):
+            full_path = os.path.join('source', 'creator', buildinfo)
+            if os.path.exists(full_path):
+                print("Removing {}" . format(buildinfo))
+                os.remove(full_path)
+        retcode = subprocess.call(target_command_prefix + ['cmake', blender_dir] + target_cmake_options)
        if retcode != 0:
            print('Configuration FAILED!')
            sys.exit(retcode)
@@ -218,21 +160,11 @@ if 'cmake' in builder:

        print("Executing command:")
        print(command)
-        retcode = subprocess.call(target_chroot_prefix + command)
+        retcode = subprocess.call(target_command_prefix + command)

        if retcode != 0:
            sys.exit(retcode)

-        if builder.startswith('linux') and target == 'cuda':
-            blender_h = os.path.join(blender_dir, "source", "blender", "blenkernel", "BKE_blender_version.h")
-            blender_version = int(parse_header_file(blender_h, 'BLENDER_VERSION'))
-            blender_version = "%d.%d" % (blender_version // 100, blender_version % 100)
-            kernels = os.path.join(target_build_dir, 'intern', 'cycles', 'kernel')
-            install_kernels = os.path.join(install_dir, blender_version, 'scripts', 'addons', 'cycles', 'lib')
-            os.mkdir(install_kernels)
-            print("Copying cuda binaries from %s to %s" % (kernels, install_kernels))
-            os.system('cp %s/*.cubin %s' % (kernels, install_kernels))
-
 else:
    print("Unknown building system")
    sys.exit(1)
--- a/build_files/buildbot/slave_pack.py
+++ b/build_files/buildbot/slave_pack.py
@@ -126,7 +126,6 @@ if builder.find('cmake') != -1:

    elif builder.startswith('linux_'):
        blender = os.path.join(install_dir, 'blender')
-        blenderplayer = os.path.join(install_dir, 'blenderplayer')

        buildinfo_h = os.path.join(build_dir, "source", "creator", "buildinfo.h")
        blender_h = os.path.join(blender_dir, "source", "blender", "blenkernel", "BKE_blender_version.h")
@@ -136,24 +135,28 @@ if builder.find('cmake') != -1:
        blender_version = "%d.%d" % (blender_version // 100, blender_version % 100)
        blender_hash = parse_header_file(buildinfo_h, 'BUILD_HASH')[1:-1]
        blender_glibc = builder.split('_')[1]
+        command_prefix = []
+        bits = 64
+        blender_arch = 'x86_64'

-        if builder.endswith('x86_64_cmake'):
-            chroot_name = 'buildbot_jessie_x86_64'
-            bits = 64
-            blender_arch = 'x86_64'
-        elif builder.endswith('i686_cmake'):
-            chroot_name = 'buildbot_jessie_i686'
-            bits = 32
-            blender_arch = 'i686'
+        if blender_glibc == 'glibc224':
+            if builder.endswith('x86_64_cmake'):
+                chroot_name = 'buildbot_stretch_x86_64'
+            elif builder.endswith('i686_cmake'):
+                chroot_name = 'buildbot_stretch_i686'
+                bits = 32
+                blender_arch = 'i686'
+            command_prefix = ['schroot', '-c', chroot_name, '--']
+        elif blender_glibc == 'glibc217':
+            command_prefix = ['scl', 'enable', 'devtoolset-6', '--']

        # Strip all unused symbols from the binaries
        print("Stripping binaries...")
-        chroot_prefix = ['schroot', '-c', chroot_name, '--']
-        subprocess.call(chroot_prefix + ['strip', '--strip-all', blender, blenderplayer])
+        subprocess.call(command_prefix + ['strip', '--strip-all', blender])

        print("Stripping python...")
        py_target = os.path.join(install_dir, blender_version)
-        subprocess.call(chroot_prefix + ['find', py_target, '-iname', '*.so', '-exec', 'strip', '-s', '{}', ';'])
+        subprocess.call(command_prefix + ['find', py_target, '-iname', '*.so', '-exec', 'strip', '-s', '{}', ';'])

        # Copy all specific files which are too specific to be copied by
        # the CMake rules themselves
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -51,9 +51,10 @@ set(WITH_SDL                 ON  CACHE BOOL "" FORCE)
 set(WITH_X11_XINPUT          ON  CACHE BOOL "" FORCE)
 set(WITH_X11_XF86VMODE       ON  CACHE BOOL "" FORCE)

-set(WITH_PLAYER              ON  CACHE BOOL "" FORCE)
-set(WITH_MEM_JEMALLOC        ON  CACHE BOOL "" FORCE)
-set(WITH_CYCLES_CUDA_BINARIES ON  CACHE BOOL "" FORCE)
+set(WITH_PLAYER                ON  CACHE BOOL "" FORCE)
+set(WITH_MEM_JEMALLOC          ON  CACHE BOOL "" FORCE)
+set(WITH_CYCLES_CUDA_BINARIES  ON  CACHE BOOL "" FORCE)
+set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
 set(CYCLES_CUDA_BINARIES_ARCH sm_30;sm_35;sm_37;sm_50;sm_52;sm_60;sm_61;sm_70;sm_75 CACHE STRING "" FORCE)

 # platform dependent options
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -390,9 +390,9 @@ if(WITH_OPENMP)
 		message(STATUS "Using ${LIBDIR}/openmp for OpenMP")
 		set(OPENMP_CUSTOM ON)
 		set(OPENMP_FOUND ON)
-		set(OpenMP_C_FLAGS "-Xclang -fopenmp -I\"${LIBDIR}/openmp/include\"")
-		set(OpenMP_CXX_FLAGS "-Xclang -fopenmp -I\"${LIBDIR}/openmp/include\"")
-		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L\"${LIBDIR}/openmp/lib\" -lomp")
+		set(OpenMP_C_FLAGS "-Xclang -fopenmp -I'${LIBDIR}/openmp/include'")
+		set(OpenMP_CXX_FLAGS "-Xclang -fopenmp -I'${LIBDIR}/openmp/include'")
+		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L'${LIBDIR}/openmp/lib' -lomp")

 		# Copy libomp.dylib to allow executables like datatoc to work.
 		execute_process(
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -430,6 +430,7 @@ endif()

 if(WITH_LLVM)
 	set(LLVM_ROOT_DIR ${LIBDIR}/llvm CACHE PATH	"Path to the LLVM installation")
+	set(LLVM_INCLUDE_DIRS ${LLVM_ROOT_DIR}/$<$<CONFIG:Debug>:Debug>/include CACHE PATH	"Path to the LLVM include directory")
 	file(GLOB LLVM_LIBRARY_OPTIMIZED ${LLVM_ROOT_DIR}/lib/*.lib)

 	if(EXISTS ${LLVM_ROOT_DIR}/debug/lib)
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -301,6 +301,8 @@ if(WITH_CYCLES_CUDA_BINARIES AND (NOT WITH_CYCLES_CUBIN_COMPILER))
 			set(MAX_MSVC 1911)
 		elseif(${CUDA_VERSION} EQUAL "10.0")
 			set(MAX_MSVC 1999)
+		elseif(${CUDA_VERSION} EQUAL "10.1")
+			set(MAX_MSVC 1999)
 		endif()
 		if(NOT MSVC_VERSION LESS ${MAX_MSVC} OR CMAKE_C_COMPILER_ID MATCHES "Clang")
 			message(STATUS "nvcc not supported for this compiler version, using cycles_cubin_cc instead.")
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -58,7 +58,12 @@ link_directories(${OPENIMAGEIO_LIBPATH}
                 ${JPEG_LIBPATH}
                 ${ZLIB_LIBPATH}
                 ${TIFF_LIBPATH}
-                 ${OPENEXR_LIBPATH})
+                 ${OPENEXR_LIBPATH}
+                 ${OPENJPEG_LIBPATH})
+
+if(WITH_OPENCOLORIO)
+	link_directories(${OPENCOLORIO_LIBPATH})
+endif()

 add_definitions(${GL_DEFINITIONS})

@@ -84,13 +89,13 @@ macro(cycles_target_link_libraries target)
 		target_link_libraries(${target} ${OPENSUBDIV_LIBRARIES})
 	endif()
 	if(WITH_OPENCOLORIO)
-		link_directories(${OPENCOLORIO_LIBPATH})
 		target_link_libraries(${target} ${OPENCOLORIO_LIBRARIES})
 	endif()
 	target_link_libraries(
 		${target}
 		${OPENIMAGEIO_LIBRARIES}
 		${OPENEXR_LIBRARIES}
+		${OPENJPEG_LIBRARIES}
 		${PUGIXML_LIBRARIES}
 		${BOOST_LIBRARIES}
 		${CMAKE_DL_LIBS}
@@ -148,6 +153,7 @@ if(WITH_CYCLES_CUBIN_COMPILER)
 			extern_cuew
 			${OPENIMAGEIO_LIBRARIES}
 			${OPENEXR_LIBRARIES}
+			${OPENJPEG_LIBRARIES}
 			${PUGIXML_LIBRARIES}
 			${BOOST_LIBRARIES}
 			${PLATFORM_LINKLIBS}
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -442,7 +442,7 @@ static void xml_read_mesh(const XMLReadState& state, xml_node node)
 		if(xml_read_float_array(UV, node, "UV")) {
 			ustring name = ustring("UVMap");
 			Attribute *attr = mesh->attributes.add(ATTR_STD_UV, name);
-			float3 *fdata = attr->data_float3();
+			float2 *fdata = attr->data_float2();

 			/* loop over the triangles */
 			index_offset = 0;
@@ -456,9 +456,9 @@ static void xml_read_mesh(const XMLReadState& state, xml_node node)
 					assert(v1*2+1 < (int)UV.size());
 					assert(v2*2+1 < (int)UV.size());

-					fdata[0] = make_float3(UV[v0*2], UV[v0*2+1], 0.0);
-					fdata[1] = make_float3(UV[v1*2], UV[v1*2+1], 0.0);
-					fdata[2] = make_float3(UV[v2*2], UV[v2*2+1], 0.0);
+					fdata[0] = make_float2(UV[v0*2], UV[v0*2+1]);
+					fdata[1] = make_float2(UV[v1*2], UV[v1*2+1]);
+					fdata[2] = make_float2(UV[v2*2], UV[v2*2+1]);
 					fdata += 3;
 				}

@@ -516,8 +516,6 @@ static void xml_read_mesh(const XMLReadState& state, xml_node node)
 		xml_read_float(&sdparams.dicing_rate, node, "dicing_rate");
 		sdparams.dicing_rate = std::max(0.1f, sdparams.dicing_rate);

-		state.scene->camera->update(state.scene);
-		sdparams.camera = state.scene->camera;
 		sdparams.objecttoworld = state.tfm;
 	}

--- a/intern/cycles/app/io_export_cycles_xml.py
+++ b/intern/cycles/app/io_export_cycles_xml.py
@@ -16,7 +16,6 @@

 # XML exporter for generating test files, not intended for end users

-import os
 import xml.etree.ElementTree as etree
 import xml.dom.minidom as dom

--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -253,8 +253,6 @@ def register_passes(engine, scene, srl):
    if crl.use_pass_volume_direct:             engine.register_pass(scene, srl, "VolumeDir",                     3, "RGB", 'COLOR')
    if crl.use_pass_volume_indirect:           engine.register_pass(scene, srl, "VolumeInd",                     3, "RGB", 'COLOR')

-    cscene = scene.cycles
-
    if crl.use_pass_crypto_object:
        for i in range(0, crl.pass_crypto_depth, 2):
            engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR')
--- a/intern/cycles/blender/addon/operators.py
+++ b/intern/cycles/blender/addon/operators.py
@@ -124,9 +124,48 @@ class CYCLES_OT_denoise_animation(Operator):
        return {'FINISHED'}


+class CYCLES_OT_merge_images(Operator):
+    "Combine OpenEXR multilayer images rendered with different sample " \
+    "ranges into one image with reduced noise"
+    bl_idname = "cycles.merge_images"
+    bl_label = "Merge Images"
+
+    input_filepath1: StringProperty(
+        name='Input Filepath',
+        description='File path for image to merge',
+        default='',
+        subtype='FILE_PATH')
+
+    input_filepath2: StringProperty(
+        name='Input Filepath',
+        description='File path for image to merge',
+        default='',
+        subtype='FILE_PATH')
+
+    output_filepath: StringProperty(
+        name='Output Filepath',
+        description='File path for merged image',
+        default='',
+        subtype='FILE_PATH')
+
+    def execute(self, context):
+        in_filepaths = [self.input_filepath1, self.input_filepath2]
+        out_filepath = self.output_filepath
+
+        import _cycles
+        try:
+            _cycles.merge(input=in_filepaths, output=out_filepath)
+        except Exception as e:
+            self.report({'ERROR'}, str(e))
+            return {'CANCELLED'}  # the merge raised, so do not signal success
+
+        return {'FINISHED'}
+
+
 classes = (
    CYCLES_OT_use_shading_nodes,
-    CYCLES_OT_denoise_animation
+    CYCLES_OT_denoise_animation,
+    CYCLES_OT_merge_images
 )

 def register():
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -194,13 +194,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        cls.samples = IntProperty(
            name="Samples",
            description="Number of samples to render for each pixel",
-            min=1, max=2147483647,
+            min=1, max=(1 << 24),
            default=128,
        )
        cls.preview_samples = IntProperty(
            name="Preview Samples",
            description="Number of samples to render in the viewport, unlimited if 0",
-            min=0, max=2147483647,
+            min=0, max=(1 << 24),
            default=32,
        )
        cls.preview_pause = BoolProperty(
@@ -724,12 +724,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
            update=devices_update_callback
        )

-        cls.debug_opencl_kernel_single_program = BoolProperty(
-            name="Single Program",
-            default=False,
-            update=devices_update_callback,
-        )
-
        cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)

        cls.debug_opencl_mem_limit = IntProperty(name="Memory limit", default=0,
@@ -1481,10 +1475,11 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                # Update name in case it changed
                entry.name = device[0]

-    def get_devices(self):
+    # Gets all device types by default.
+    def get_devices(self, compute_device_type=''):
        import _cycles
        # Layout of the device tuples: (Name, Type, Persistent ID)
-        device_list = _cycles.available_devices(self.compute_device_type)
+        device_list = _cycles.available_devices(compute_device_type)
        # Make sure device entries are up to date and not referenced before
        # we know we don't add new devices. This way we guarantee to not
        # hold pointers to a resized array.
@@ -1541,7 +1536,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        row = layout.row()
        row.prop(self, "compute_device_type", expand=True)

-        cuda_devices, opencl_devices = self.get_devices()
+        cuda_devices, opencl_devices = self.get_devices(self.compute_device_type)
        row = layout.row()
        if self.compute_device_type == 'CUDA':
            self._draw_devices(row, 'CUDA', cuda_devices)
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1635,7 +1635,6 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
        col = layout.column()
        col.label('OpenCL Flags:')
        col.prop(cscene, "debug_opencl_device_type", text="Device")
-        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
        col.prop(cscene, "debug_use_opencl_debug", text="Debug")
        col.prop(cscene, "debug_opencl_mem_limit")

--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -254,7 +254,7 @@ def do_versions(self):
                pass

            # Init device list for UI
-            prop.get_devices()
+            prop.get_devices(prop.compute_device_type)

    # We don't modify startup file because it assumes to
    # have all the default values only.
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -254,7 +254,7 @@ static bool ObtainCacheParticleUV(Mesh *mesh,
 					BL::Mesh::tessface_uv_textures_iterator l;
 					b_mesh->tessface_uv_textures.begin(l);

-					float3 uv = make_float3(0.0f, 0.0f, 0.0f);
+					float2 uv = make_float2(0.0f, 0.0f);
 					if(b_mesh->tessface_uv_textures.length())
 						b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x);
 					CData->curve_uv.push_back_slow(uv);
@@ -776,14 +776,10 @@ static void ExportCurveSegmentsMotion(Mesh *mesh, ParticleCurveData *CData, int
 static void ExportCurveTriangleUV(ParticleCurveData *CData,
                                  int vert_offset,
                                  int resol,
-                                  float3 *uvdata)
+                                  float2 *uvdata)
 {
 	if(uvdata == NULL)
 		return;
-
-	float time = 0.0f;
-	float prevtime = 0.0f;
-
 	int vertexindex = vert_offset;

 	for(int sys = 0; sys < CData->psys_firstcurve.size(); sys++) {
@@ -792,30 +788,20 @@ static void ExportCurveTriangleUV(ParticleCurveData *CData,
 				continue;

 			for(int curvekey = CData->curve_firstkey[curve]; curvekey < CData->curve_firstkey[curve] + CData->curve_keynum[curve] - 1; curvekey++) {
-				time = CData->curvekey_time[curvekey]/CData->curve_length[curve];
-
 				for(int section = 0; section < resol; section++) {
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = prevtime;
 					vertexindex++;
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = time;
 					vertexindex++;
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = prevtime;
 					vertexindex++;
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = time;
 					vertexindex++;
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = prevtime;
 					vertexindex++;
 					uvdata[vertexindex] = CData->curve_uv[curve];
-					uvdata[vertexindex].z = time;
 					vertexindex++;
 				}
-
-				prevtime = time;
 			}
 		}
 	}
@@ -1094,9 +1080,9 @@ void BlenderSync::sync_curves(Mesh *mesh,
 					if(active_render)
 						attr_uv = mesh->attributes.add(std, name);
 					else
-						attr_uv = mesh->attributes.add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+						attr_uv = mesh->attributes.add(name, TypeFloat2, ATTR_ELEMENT_CORNER);

-					float3 *uv = attr_uv->data_float3();
+					float2 *uv = attr_uv->data_float2();

 					ExportCurveTriangleUV(&CData, tri_num * 3, used_res, uv);
 				}
@@ -1104,9 +1090,9 @@ void BlenderSync::sync_curves(Mesh *mesh,
 					if(active_render)
 						attr_uv = mesh->curve_attributes.add(std, name);
 					else
-						attr_uv = mesh->curve_attributes.add(name, TypeDesc::TypePoint,  ATTR_ELEMENT_CURVE);
+						attr_uv = mesh->curve_attributes.add(name, TypeFloat2,  ATTR_ELEMENT_CURVE);

-					float3 *uv = attr_uv->data_float3();
+					float2 *uv = attr_uv->data_float2();

 					if(uv) {
 						size_t i = 0;
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -106,7 +106,7 @@ struct MikkUserData {
 		else {
 			Attribute *attr_uv = attributes.find(ustring(layer_name));
 			if(attr_uv != NULL) {
-				texface = attr_uv->data_float3();
+				texface = attr_uv->data_float2();
 			}
 		}
 	}
@@ -115,7 +115,7 @@ struct MikkUserData {
 	int num_faces;

 	float3 *vertex_normal;
-	float3 *texface;
+	float2 *texface;
 	float3 *orco;
 	float3 orco_loc, orco_size;

@@ -190,7 +190,7 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context,
 	const Mesh *mesh = userdata->mesh;
 	if(userdata->texface != NULL) {
 		const int corner_index = mikk_corner_index(mesh, face_num, vert_num);
-		float3 tfuv = userdata->texface[corner_index];
+		float2 tfuv = userdata->texface[corner_index];
 		uv[0] = tfuv.x;
 		uv[1] = tfuv.y;
 	}
@@ -489,24 +489,24 @@ static void attr_create_uv_map(Scene *scene,
 				}
 				else {
 					uv_attr = mesh->attributes.add(uv_name,
-					                               TypeDesc::TypePoint,
+					                               TypeFloat2,
 					                               ATTR_ELEMENT_CORNER);
 				}

 				BL::MeshTextureFaceLayer::data_iterator t;
-				float3 *fdata = uv_attr->data_float3();
+				float2 *fdata = uv_attr->data_float2();
 				size_t i = 0;

 				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
 					int tri_a[3], tri_b[3];
 					face_split_tri_indices(face_flags[i], tri_a, tri_b);

-					float3 uvs[4];
-					uvs[0] = get_float3(t->uv1());
-					uvs[1] = get_float3(t->uv2());
-					uvs[2] = get_float3(t->uv3());
+					float2 uvs[4];
+					uvs[0] = get_float2(t->uv1());
+					uvs[1] = get_float2(t->uv2());
+					uvs[2] = get_float2(t->uv3());
 					if(nverts[i] == 4) {
-						uvs[3] = get_float3(t->uv4());
+						uvs[3] = get_float2(t->uv4());
 					}

 					fdata[0] = uvs[tri_a[0]];
@@ -586,19 +586,19 @@ static void attr_create_subd_uv_map(Scene *scene,
 				if(active_render)
 					uv_attr = mesh->subd_attributes.add(uv_std, uv_name);
 				else
-					uv_attr = mesh->subd_attributes.add(uv_name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+					uv_attr = mesh->subd_attributes.add(uv_name, TypeFloat2, ATTR_ELEMENT_CORNER);

 				if(subdivide_uvs) {
 					uv_attr->flags |= ATTR_SUBDIVIDED;
 				}

 				BL::Mesh::polygons_iterator p;
-				float3 *fdata = uv_attr->data_float3();
+				float2 *fdata = uv_attr->data_float2();

 				for(b_mesh.polygons.begin(p); p != b_mesh.polygons.end(); ++p) {
 					int n = p->loop_total();
 					for(int j = 0; j < n; j++) {
-						*(fdata++) = get_float3(l->data[p->loop_start() + j].uv());
+						*(fdata++) = get_float2(l->data[p->loop_start() + j].uv());
 					}
 				}
 			}
@@ -1027,8 +1027,6 @@ static void create_subd_mesh(Scene *scene,
 	sdparams.dicing_rate = max(0.1f, RNA_float_get(&cobj, "dicing_rate") * dicing_rate);
 	sdparams.max_level = max_subdivisions;

-	scene->dicing_camera->update(scene);
-	sdparams.camera = scene->dicing_camera;
 	sdparams.objecttoworld = get_transform(b_ob.matrix_world());
 }

--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -23,6 +23,7 @@
 #include "blender/blender_session.h"

 #include "render/denoising.h"
+#include "render/merge.h"

 #include "util/util_debug.h"
 #include "util/util_foreach.h"
@@ -104,7 +105,6 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
 	flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit"))*1024*1024;
-	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type;
 }

@@ -639,9 +639,8 @@ static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args)
 }
 #endif

-static bool denoise_parse_filepaths(PyObject *pyfilepaths, vector<string>& filepaths)
+static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string>& filepaths)
 {
-
 	if(PyUnicode_Check(pyfilepaths)) {
 		const char *filepath = PyUnicode_AsUTF8(pyfilepaths);
 		filepaths.push_back(filepath);
@@ -710,12 +709,12 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
 	/* Parse file paths list. */
 	vector<string> input, output;

-	if(!denoise_parse_filepaths(pyinput, input)) {
+	if(!image_parse_filepaths(pyinput, input)) {
 		return NULL;
 	}

 	if(pyoutput) {
-		if(!denoise_parse_filepaths(pyoutput, output)) {
+		if(!image_parse_filepaths(pyoutput, output)) {
 			return NULL;
 		}
 	}
@@ -754,6 +753,42 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
 	Py_RETURN_NONE;
 }

+static PyObject *merge_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
+{
+	static const char *keyword_list[] = {"input", "output", NULL};
+	PyObject *pyinput, *pyoutput = NULL;
+
+	if(!PyArg_ParseTupleAndKeywords(args, keywords, "OO", (char**)keyword_list, &pyinput, &pyoutput)) {
+		return NULL;
+	}
+
+	/* Parse input list. */
+	vector<string> input;
+	if(!image_parse_filepaths(pyinput, input)) {
+		return NULL;
+	}
+
+	/* Parse output string, PyUnicode_AsUTF8() returns NULL with an exception set on failure. */
+	const char *output = PyUnicode_Check(pyoutput) ? PyUnicode_AsUTF8(pyoutput) : NULL;
+	if(output == NULL) {
+		if(!PyErr_Occurred())
+			PyErr_SetString(PyExc_ValueError, "Output must be a string.");
+		return NULL;
+	}
+
+	/* Merge. */
+	ImageMerger merger;
+	merger.input = input;
+	merger.output = output;
+
+	if(!merger.run()) {
+		PyErr_SetString(PyExc_ValueError, merger.error.c_str());
+		return NULL;
+	}
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
 {
 	PyObject *pyscene;
@@ -917,6 +952,7 @@ static PyMethodDef methods[] = {

 	/* Standalone denoising */
 	{"denoise", (PyCFunction)denoise_func, METH_VARARGS|METH_KEYWORDS, ""},
+	{"merge", (PyCFunction)merge_func, METH_VARARGS|METH_KEYWORDS, ""},

 	/* Debugging routines */
 	{"debug_flags_update", debug_flags_update_func, METH_VARARGS, ""},
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -433,7 +433,7 @@ void BlenderSession::render()
 		BL::RenderLayer b_rlay = *b_single_rlay;

 		/* add passes */
-		vector<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter, session_params);
+		vector<Pass> passes = sync->sync_render_passes(b_rlay, *b_layer_iter);
 		buffer_params.passes = passes;

 		PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
@@ -933,6 +933,11 @@ void BlenderSession::get_status(string& status, string& substatus)
 	session->progress.get_status(status, substatus);
 }

+void BlenderSession::get_kernel_status(string& kernel_status)
+{
+	session->progress.get_kernel_status(kernel_status);
+}
+
 void BlenderSession::get_progress(float& progress, double& total_time, double& render_time)
 {
 	session->progress.get_time(total_time, render_time);
@@ -951,15 +956,15 @@ void BlenderSession::update_bake_progress()

 void BlenderSession::update_status_progress()
 {
-	string timestatus, status, substatus;
+	string timestatus, status, substatus, kernel_status;
 	string scene = "";
 	float progress;
 	double total_time, remaining_time = 0, render_time;
-	char time_str[128];
 	float mem_used = (float)session->stats.mem_used / 1024.0f / 1024.0f;
 	float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;

 	get_status(status, substatus);
+	get_kernel_status(kernel_status);
 	get_progress(progress, total_time, render_time);

 	if(progress > 0)
@@ -974,13 +979,11 @@ void BlenderSession::update_status_progress()
 			scene += ", " + b_rview_name;
 	}
 	else {
-		BLI_timecode_string_from_time_simple(time_str, sizeof(time_str), total_time);
-		timestatus = "Time:" + string(time_str) + " | ";
+		timestatus = "Time:" + time_human_readable_from_seconds(total_time) + " | ";
 	}

 	if(remaining_time > 0) {
-		BLI_timecode_string_from_time_simple(time_str, sizeof(time_str), remaining_time);
-		timestatus += "Remaining:" + string(time_str) + " | ";
+		timestatus += "Remaining:" + time_human_readable_from_seconds(remaining_time) + " | ";
 	}

 	timestatus += string_printf("Mem:%.2fM, Peak:%.2fM", (double)mem_used, (double)mem_peak);
@@ -989,6 +992,8 @@ void BlenderSession::update_status_progress()
 		status = " | " + status;
 	if(substatus.size() > 0)
 		status += " | " + substatus;
+	if(kernel_status.size() > 0)
+		status += " | " + kernel_status;

 	double current_time = time_dt();
 	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -90,6 +90,7 @@ public:
 	void tag_redraw();
 	void tag_update();
 	void get_status(string& status, string& substatus);
+	void get_kernel_status(string& kernel_status);
 	void get_progress(float& progress, double& total_time, double& render_time);
 	void test_cancel();
 	void update_status_progress();
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -104,10 +104,10 @@ bool BlenderSync::sync_recalc()
 		if(b_lamp->is_updated() || (b_lamp->node_tree() && b_lamp->node_tree().is_updated()))
 			shader_map.set_recalc(*b_lamp);

-	bool dicing_prop_changed = false;
-
 	if(experimental) {
+		/* Mark all meshes as needing to be exported again if dicing changed. */
 		PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+		bool dicing_prop_changed = false;

 		float updated_dicing_rate = preview ? RNA_float_get(&cscene, "preview_dicing_rate")
 		                                    : RNA_float_get(&cscene, "dicing_rate");
@@ -123,6 +123,15 @@ bool BlenderSync::sync_recalc()
 			max_subdivisions = updated_max_subdivisions;
 			dicing_prop_changed = true;
 		}
+
+		if(dicing_prop_changed) {
+			for(const pair<void*, Mesh*>& iter: mesh_map.key_to_scene_data()) {
+				Mesh *mesh = iter.second;
+				if(mesh->subdivision_type != Mesh::SUBDIVISION_NONE) {
+					mesh_map.set_recalc(iter.first);
+				}
+			}
+		}
 	}

 	BL::BlendData::objects_iterator b_ob;
@@ -134,9 +143,7 @@ bool BlenderSync::sync_recalc()
 		}

 		if(object_is_mesh(*b_ob)) {
-			if(b_ob->is_updated_data() || b_ob->data().is_updated() ||
-			   (dicing_prop_changed && object_subdivision_type(*b_ob, preview, experimental) != Mesh::SUBDIVISION_NONE))
-			{
+			if(b_ob->is_updated_data() || b_ob->data().is_updated()) {
 				BL::ID key = BKE_object_is_modified(*b_ob)? *b_ob: b_ob->data();
 				mesh_map.set_recalc(key);
 			}
@@ -553,16 +560,11 @@ int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
 }

 vector<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
-                                             BL::SceneRenderLayer& b_srlay,
-                                             const SessionParams &session_params)
+                                             BL::SceneRenderLayer& b_srlay)
 {
 	vector<Pass> passes;
 	Pass::add(PASS_COMBINED, passes);

-	if(!session_params.device.advanced_shading) {
-		return passes;
-	}
-
 	/* loop over passes */
 	BL::RenderLayer::passes_iterator b_pass_iter;

@@ -794,6 +796,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 		}
 	}

+	/* Clamp samples. */
+	params.samples = min(params.samples, Integrator::MAX_SAMPLES);
+
 	/* tiles */
 	const bool is_cpu = (params.device.type == DEVICE_CPU);
 	if(!is_cpu && !background) {
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -67,8 +67,7 @@ public:
 	               const char *layer = 0);
 	void sync_render_layers(BL::SpaceView3D& b_v3d, const char *layer);
 	vector<Pass> sync_render_passes(BL::RenderLayer& b_rlay,
-	                                BL::SceneRenderLayer& b_srlay,
-	                                const SessionParams &session_params);
+	                                BL::SceneRenderLayer& b_srlay);
 	void sync_integrator();
 	void sync_camera(BL::RenderSettings& b_render,
 	                 BL::Object& b_override,
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -32,7 +32,6 @@
 * todo: clean this up ... */

 extern "C" {
-size_t BLI_timecode_string_from_time_simple(char *str, size_t maxlen, double time_seconds);
 void BKE_image_user_frame_calc(void *iuser, int cfra, int fieldnr);
 void BKE_image_user_file_path(void *iuser, void *ima, char *path);
 unsigned char *BKE_image_get_pixels_for_frame(void *image, int frame);
@@ -645,6 +644,11 @@ public:
 		b_recalc.insert(id.ptr.data);
 	}

+	void set_recalc(void *id_ptr)
+	{
+		b_recalc.insert(id_ptr);
+	}
+
 	bool has_recalc()
 	{
 		return !(b_recalc.empty());
@@ -740,6 +744,11 @@ public:
 		return deleted;
 	}

+	const map<K, T*>& key_to_scene_data()
+	{
+		return b_map;
+	}
+
 protected:
 	vector<T*> *scene_data;
 	map<K, T*> b_map;
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -149,6 +149,13 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments* args)
 				break;
 			}

+			/* Ignore curves. */
+			if(hit->geomID & 1) {
+				/* This tells Embree to continue tracing. */
+				*args->valid = 0;
+				break;
+			}
+
 			/* See triangle_intersect_subsurface() for the native equivalent. */
 			for(int i = min(ctx->max_hits, ctx->ss_isect->num_hits) - 1; i >= 0; --i) {
 				if(ctx->ss_isect->hits[i].t == ray->tfar) {
@@ -389,6 +396,45 @@ void BVHEmbree::build(Progress& progress, Stats *stats_)
 	               (params.use_spatial_split ? RTC_BUILD_QUALITY_HIGH : RTC_BUILD_QUALITY_MEDIUM);
 	rtcSetSceneBuildQuality(scene, build_quality);

+	/* Count triangles and curves first, reserve arrays once. */
+	size_t prim_count = 0;
+
+	foreach(Object *ob, objects) {
+		if (params.top_level) {
+			if (!ob->is_traceable()) {
+				continue;
+			}
+			if (!ob->mesh->is_instanced()) {
+				if(params.primitive_mask & PRIMITIVE_ALL_TRIANGLE) {
+					prim_count += ob->mesh->num_triangles();
+				}
+				if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
+					for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
+						prim_count += ob->mesh->get_curve(j).num_segments();
+					}
+				}
+			}
+			else {
+				++prim_count;
+			}
+		}
+		else {
+			if (params.primitive_mask & PRIMITIVE_ALL_TRIANGLE && ob->mesh->num_triangles() > 0) {
+				prim_count += ob->mesh->num_triangles();
+			}
+			if (params.primitive_mask & PRIMITIVE_ALL_CURVE) {
+				for (size_t j = 0; j < ob->mesh->num_curves(); ++j) {
+					prim_count += ob->mesh->get_curve(j).num_segments();
+				}
+			}
+		}
+	}
+
+	pack.prim_object.reserve(prim_count);
+	pack.prim_type.reserve(prim_count);
+	pack.prim_index.reserve(prim_count);
+	pack.prim_tri_index.reserve(prim_count);
+
 	int i = 0;

 	pack.object_node.clear();
@@ -530,15 +576,20 @@ void BVHEmbree::add_triangles(Object *ob, int i)

 	update_tri_vertex_buffer(geom_id, mesh);

-	pack.prim_object.reserve(pack.prim_object.size() + num_triangles);
-	pack.prim_type.reserve(pack.prim_type.size() + num_triangles);
-	pack.prim_index.reserve(pack.prim_index.size() + num_triangles);
-	pack.prim_tri_index.reserve(pack.prim_index.size() + num_triangles);
+	size_t prim_object_size = pack.prim_object.size();
+	pack.prim_object.resize(prim_object_size + num_triangles);
+	size_t prim_type_size = pack.prim_type.size();
+	pack.prim_type.resize(prim_type_size + num_triangles);
+	size_t prim_index_size = pack.prim_index.size();
+	pack.prim_index.resize(prim_index_size + num_triangles);
+	pack.prim_tri_index.resize(prim_index_size + num_triangles);
+	int prim_type = (num_motion_steps > 1 ? PRIMITIVE_MOTION_TRIANGLE : PRIMITIVE_TRIANGLE);
+
 	for(size_t j = 0; j < num_triangles; ++j) {
-		pack.prim_object.push_back_reserved(i);
-		pack.prim_type.push_back_reserved(num_motion_steps > 1 ? PRIMITIVE_MOTION_TRIANGLE : PRIMITIVE_TRIANGLE);
-		pack.prim_index.push_back_reserved(j);
-		pack.prim_tri_index.push_back_reserved(j);
+		pack.prim_object[prim_object_size + j] = i;
+		pack.prim_type[prim_type_size + j] = prim_type;
+		pack.prim_index[prim_index_size + j] = j;
+		pack.prim_tri_index[prim_index_size + j] = j;
 	}

 	rtcSetGeometryUserData(geom_id, (void*) prim_offset);
@@ -629,7 +680,7 @@ void BVHEmbree::update_curve_vertex_buffer(RTCGeometry geom_id, const Mesh* mesh
 		float4 *rtc_tangents = NULL;
 		if(use_curves) {
 			rtc_tangents = (float4*)rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_TANGENT, t,
-																RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys);
+			                                                RTC_FORMAT_FLOAT4, sizeof (float) * 4, num_keys);
 			assert(rtc_tangents);
 		}
 		assert(rtc_verts);
@@ -691,10 +742,14 @@ void BVHEmbree::add_curves(Object *ob, int i)
 	}

 	/* Make room for Cycles specific data. */
-	pack.prim_object.reserve(pack.prim_object.size() + num_segments);
-	pack.prim_type.reserve(pack.prim_type.size() + num_segments);
-	pack.prim_index.reserve(pack.prim_index.size() + num_segments);
-	pack.prim_tri_index.reserve(pack.prim_index.size() + num_segments);
+	size_t prim_object_size = pack.prim_object.size();
+	pack.prim_object.resize(prim_object_size + num_segments);
+	size_t prim_type_size = pack.prim_type.size();
+	pack.prim_type.resize(prim_type_size + num_segments);
+	size_t prim_index_size = pack.prim_index.size();
+	pack.prim_index.resize(prim_index_size + num_segments);
+	size_t prim_tri_index_size = pack.prim_tri_index.size();  /* Own size: prim_index was already grown above. */
+	pack.prim_tri_index.resize(prim_tri_index_size + num_segments);

 	enum RTCGeometryType type = (!use_curves) ? RTC_GEOMETRY_TYPE_FLAT_LINEAR_CURVE :
 	                            (use_ribbons ? RTC_GEOMETRY_TYPE_FLAT_HERMITE_CURVE :
@@ -703,18 +758,18 @@ void BVHEmbree::add_curves(Object *ob, int i)
 	RTCGeometry geom_id = rtcNewGeometry(rtc_shared_device, type);
 	rtcSetGeometryTessellationRate(geom_id, curve_subdivisions);
 	unsigned *rtc_indices = (unsigned*) rtcSetNewGeometryBuffer(geom_id, RTC_BUFFER_TYPE_INDEX, 0,
-																RTC_FORMAT_UINT, sizeof (int), num_segments);
+	                                                            RTC_FORMAT_UINT, sizeof (int), num_segments);
 	size_t rtc_index = 0;
 	for(size_t j = 0; j < num_curves; ++j) {
 		Mesh::Curve c = mesh->get_curve(j);
 		for(size_t k = 0; k < c.num_segments(); ++k) {
 			rtc_indices[rtc_index] = c.first_key + k;
 			/* Cycles specific data. */
-			pack.prim_object.push_back_reserved(i);
-			pack.prim_type.push_back_reserved(PRIMITIVE_PACK_SEGMENT(num_motion_steps > 1 ?
-																	 PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k));
-			pack.prim_index.push_back_reserved(j);
-			pack.prim_tri_index.push_back_reserved(rtc_index);
+			pack.prim_object[prim_object_size + rtc_index] = i;
+			pack.prim_type[prim_type_size + rtc_index] = (PRIMITIVE_PACK_SEGMENT(num_motion_steps > 1 ?
+			                                              PRIMITIVE_MOTION_CURVE : PRIMITIVE_CURVE, k));
+			pack.prim_index[prim_index_size + rtc_index] = j;
+			pack.prim_tri_index[prim_tri_index_size + rtc_index] = rtc_index;

 			++rtc_index;
 		}
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -56,6 +56,14 @@ enum DeviceTypeMask {
 	DEVICE_MASK_ALL = ~0
 };

+enum DeviceKernelStatus {
+	DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0,
+	DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
+	DEVICE_KERNEL_USING_FEATURE_KERNEL,
+	DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
+	DEVICE_KERNEL_UNKNOWN,
+};
+
 #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)

 class DeviceInfo {
@@ -65,7 +73,6 @@ public:
 	string id; /* used for user preferences, should stay fixed with changing hardware config */
 	int num;
 	bool display_device;            /* GPU is used as a display device. */
-	bool advanced_shading;          /* Supports full shading system. */
 	bool has_half_images;           /* Support half-float textures. */
 	bool has_volume_decoupled;      /* Decoupled volume shading. */
 	bool has_osl;                   /* Support Open Shading Language. */
@@ -81,7 +88,6 @@ public:
 		num = 0;
 		cpu_threads = 0;
 		display_device = false;
-		advanced_shading = true;
 		has_half_images = false;
 		has_volume_decoupled = false;
 		has_osl = false;
@@ -321,6 +327,20 @@ public:
 	        const DeviceRequestedFeatures& /*requested_features*/)
 	{ return true; }

+	/* Wait for device to become available to upload data and receive tasks.
+	 * This method is used by the OpenCL device to load the
+	 * optimized kernels or, when not (yet) available, load the
+	 * generic kernels (only during foreground rendering). */
+	virtual bool wait_for_availability(
+	        const DeviceRequestedFeatures& /*requested_features*/)
+	{ return true; }
+	/* Check if there are 'better' kernels available to be used.
+	 * We can switch over to these kernels.
+	 * This method is used to determine if we can switch the preview kernels
+	 * to regular kernels. */
+	virtual DeviceKernelStatus get_active_kernel_switch_state()
+	{ return DEVICE_KERNEL_USING_FEATURE_KERNEL; }
+
 	/* tasks */
 	virtual int get_split_task_count(DeviceTask& task) = 0;
 	virtual void task_add(DeviceTask& task) = 0;
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -761,8 +761,8 @@ public:
 		int start_sample = tile.start_sample;
 		int end_sample = tile.start_sample + tile.num_samples;

-		_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-		_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+		/* Needed for Embree. */
+		SIMD_SET_FLUSH_TO_ZERO;

 		for(int sample = start_sample; sample < end_sample; sample++) {
 			if(task.get_cancel() || task_pool.canceled()) {
@@ -968,6 +968,7 @@ protected:
 			kg.decoupled_volume_steps[i] = NULL;
 		}
 		kg.decoupled_volume_steps_index = 0;
+		kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
@@ -1122,7 +1123,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.description = system_cpu_brand_string();
 	info.id = "CPU";
 	info.num = 0;
-	info.advanced_shading = true;
 	info.has_volume_decoupled = true;
 	info.has_osl = true;
 	info.has_half_images = true;
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -400,9 +400,9 @@ public:
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 80) {
+		else if(cuda_version != 101) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 8.0 is officially supported.\n",
+			       "CUDA 10.1 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
@@ -2512,7 +2512,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 		info.description = string(name);
 		info.num = num;

-		info.advanced_shading = (major >= 3);
 		info.has_half_images = (major >= 3);
 		info.has_volume_decoupled = false;

--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -120,6 +120,38 @@ public:
 		return true;
 	}

+	bool wait_for_availability(const DeviceRequestedFeatures& requested_features)
+	{
+		foreach(SubDevice& sub, devices)
+			if(!sub.device->wait_for_availability(requested_features))
+				return false;
+
+		return true;
+	}
+
+	DeviceKernelStatus get_active_kernel_switch_state()
+	{
+		DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
+
+		foreach(SubDevice& sub, devices) {
+			DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
+			switch (subresult) {
+				case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL:
+					result = subresult;
+					break;
+
+				case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
+				case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
+					return subresult;
+
+				case DEVICE_KERNEL_USING_FEATURE_KERNEL:
+				case DEVICE_KERNEL_UNKNOWN:
+					break;
+			}
+		}
+		return result;
+	}
+
 	void mem_alloc(device_memory& mem)
 	{
 		device_ptr key = unique_key++;
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -308,7 +308,6 @@ void device_network_info(vector<DeviceInfo>& devices)
 	info.num = 0;

 	/* todo: get this info from device */
-	info.advanced_shading = true;
 	info.has_volume_decoupled = false;
 	info.has_osl = false;

--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -119,7 +119,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		info.num = num_devices;
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
-		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
 		info.use_split_kernel = true;
 		info.has_volume_decoupled = false;
 		info.id = id;
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -84,7 +84,6 @@ class OpenCLInfo
 public:
 	static cl_device_type device_type();
 	static bool use_debug();
-	static bool kernel_use_advanced_shading(const string& platform_name);
 	static bool device_supported(const string& platform_name,
 	                             const cl_device_id device_id);
 	static bool platform_version_check(cl_platform_id platform,
@@ -95,7 +94,6 @@ public:
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
-	static bool use_single_program();

 	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */

@@ -262,16 +260,22 @@ class OpenCLDevice : public Device
 {
 public:
 	DedicatedTaskPool task_pool;
+
+	/* Task pool for required kernels (base, AO kernels during foreground rendering) */
+	TaskPool load_required_kernel_task_pool;
+	/* Task pool for optional kernels (feature kernels during foreground rendering) */
+	TaskPool load_kernel_task_pool;
 	cl_context cxContext;
 	cl_command_queue cqCommandQueue;
 	cl_platform_id cpPlatform;
 	cl_device_id cdDevice;
 	cl_int ciErr;
 	int device_num;
+	bool use_preview_kernels;

 	class OpenCLProgram {
 	public:
-		OpenCLProgram() : loaded(false), program(NULL), device(NULL) {}
+		OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) {}
 		OpenCLProgram(OpenCLDevice *device,
 		              const string& program_name,
 		              const string& kernel_name,
@@ -280,12 +284,24 @@ public:
 		~OpenCLProgram();

 		void add_kernel(ustring name);
-		void load();
+
+		/* Try to load the program from device cache or disk */
+		bool load();
+		/* Compile the kernel (first separate, fall back to local) */
+		void compile();
+		/* Create the OpenCL kernels after loading or compiling */
+		void create_kernels();

 		bool is_loaded() const { return loaded; }
 		const string& get_log() const { return log; }
 		void report_error();

+		/* Wait until this kernel is available to be used
+		 * It will return true when the kernel is available.
+		 * It will return false when the kernel is not available
+		 * or could not be loaded. */
+		bool wait_for_availability();
+
 		cl_kernel operator()();
 		cl_kernel operator()(ustring name);

@@ -309,6 +325,8 @@ public:
 		void add_error(const string& msg);

 		bool loaded;
+		bool needs_compiling;
+
 		cl_program program;
 		OpenCLDevice *device;

@@ -324,26 +342,42 @@ public:
 		map<ustring, cl_kernel> kernels;
 	};

+	/* Container for all types of split programs. */
+	class OpenCLSplitPrograms {
+		public:
+			OpenCLDevice *device;
+			OpenCLProgram program_split;
+			OpenCLProgram program_lamp_emission;
+			OpenCLProgram program_do_volume;
+			OpenCLProgram program_indirect_background;
+			OpenCLProgram program_shader_eval;
+			OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
+			OpenCLProgram program_subsurface_scatter;
+			OpenCLProgram program_direct_lighting;
+			OpenCLProgram program_shadow_blocked_ao;
+			OpenCLProgram program_shadow_blocked_dl;
+
+			OpenCLSplitPrograms(OpenCLDevice *device);
+			~OpenCLSplitPrograms();
+
+			/* Load the kernels and put the created kernels in the given `programs`
+			 * parameter. */
+			void load_kernels(vector<OpenCLProgram*> &programs,
+			                  const DeviceRequestedFeatures& requested_features,
+			                  bool is_preview=false);
+	};
+
 	DeviceSplitKernel *split_kernel;

-	OpenCLProgram program_split;
-
-	OpenCLProgram program_lamp_emission;
-	OpenCLProgram program_do_volume;
-	OpenCLProgram program_indirect_background;
-	OpenCLProgram program_shader_eval;
-	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
-	OpenCLProgram program_subsurface_scatter;
-	OpenCLProgram program_direct_lighting;
-	OpenCLProgram program_shadow_blocked_ao;
-	OpenCLProgram program_shadow_blocked_dl;
-
 	OpenCLProgram base_program;
 	OpenCLProgram bake_program;
 	OpenCLProgram displace_program;
 	OpenCLProgram background_program;
 	OpenCLProgram denoising_program;

+	OpenCLSplitPrograms kernel_programs;
+	OpenCLSplitPrograms preview_programs;
+
 	typedef map<string, device_vector<uchar>*> ConstMemMap;
 	typedef map<string, device_ptr> MemMap;

@@ -359,22 +393,32 @@ public:
 	void opencl_error(const string& message);
 	void opencl_assert_err(cl_int err, const char* where);

-	OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_);
+	OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
 	~OpenCLDevice();

 	static void CL_CALLBACK context_notify_callback(const char *err_info,
 		const void * /*private_info*/, size_t /*cb*/, void *user_data);

 	bool opencl_version_check();
+	OpenCLSplitPrograms* get_split_programs();

 	string device_md5_hash(string kernel_custom_build_options = "");
 	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	void load_required_kernels(const DeviceRequestedFeatures& requested_features);
+	void load_preview_kernels();
+
+	bool wait_for_availability(const DeviceRequestedFeatures& requested_features);
+	DeviceKernelStatus get_active_kernel_switch_state();

 	/* Get the name of the opencl program for the given kernel */
-	const string get_opencl_program_name(bool single_program, const string& kernel_name);
+	const string get_opencl_program_name(const string& kernel_name);
 	/* Get the program file name to compile (*.cl) for the given kernel */
-	const string get_opencl_program_filename(bool single_program, const string& kernel_name);
-	string get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name);
+	const string get_opencl_program_filename(const string& kernel_name);
+	string get_build_options(const DeviceRequestedFeatures& requested_features,
+	                         const string& opencl_program_name,
+	                         bool preview_kernel=false);
+	/* Enable the default features to reduce recompilation events */
+	void enable_default_features(DeviceRequestedFeatures& features);

 	void mem_alloc(device_memory& mem);
 	void mem_copy_to(device_memory& mem);
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -40,7 +40,13 @@ struct texture_slot_t {
 	int slot;
 };

-static const string fast_compiled_kernels =
+static const string NON_SPLIT_KERNELS =
+	"denoising "
+	"base "
+	"background "
+	"displace ";
+
+static const string SPLIT_BUNDLE_KERNELS =
 	"data_init "
 	"path_init "
 	"state_buffer_size "
@@ -53,37 +59,52 @@ static const string fast_compiled_kernels =
 	"indirect_subsurface "
 	"buffer_update";

-const string OpenCLDevice::get_opencl_program_name(bool single_program, const string& kernel_name)
+const string OpenCLDevice::get_opencl_program_name(const string& kernel_name)
 {
-	if (single_program) {
-		return "split";
+	if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
+		return kernel_name;
+	}
+	else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
+		return "split_bundle";
 	}
 	else {
-		if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
-			return "split_bundle";
-		}
-		else {
-			return "split_" + kernel_name;
-		}
+		return "split_" + kernel_name;
 	}
 }

-const string OpenCLDevice::get_opencl_program_filename(bool single_program, const string& kernel_name)
+const string OpenCLDevice::get_opencl_program_filename(const string& kernel_name)
 {
-	if (single_program) {
-		return "kernel_split.cl";
+	if (kernel_name == "denoising") {
+		return "filter.cl";
+	}
+	else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
+		return "kernel_split_bundle.cl";
 	}
 	else {
-		if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
-			return "kernel_split_bundle.cl";
-		}
-		else {
-			return "kernel_" + kernel_name + ".cl";
-		}
+		return "kernel_" + kernel_name + ".cl";
 	}
 }

-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name)
+/* Enable features that we always want to compile to reduce recompilation events */
+void OpenCLDevice::enable_default_features(DeviceRequestedFeatures& features)
+{
+	features.use_transparent = true;
+	features.use_shadow_tricks = true;
+	features.use_principled = true;
+	features.use_denoising = true;
+
+	if (!background)
+	{
+		features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+		features.nodes_features = NODE_FEATURE_ALL;
+		features.use_hair = true;
+		features.use_subsurface = true;
+		features.use_camera_motion = false;
+		features.use_object_motion = false;
+	}
+}
+
+string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name, bool preview_kernel)
 {
 	/* first check for non-split kernel programs */
 	if (opencl_program_name == "base" || opencl_program_name == "denoising") {
@@ -91,18 +112,25 @@ string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_
 	}
 	else if (opencl_program_name == "bake") {
 		/* Note: get_build_options for bake is only requested when baking is enabled.
-		   displace and background are always requested.
-		   `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
+		 * displace and background are always requested.
+		 * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
 		DeviceRequestedFeatures features(requested_features);
+		enable_default_features(features);
 		features.use_denoising = false;
 		features.use_object_motion = false;
 		features.use_camera_motion = false;
+		features.use_hair = true;
+		features.use_subsurface = true;
+		features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+		features.nodes_features = NODE_FEATURE_ALL;
+		features.use_integrator_branched = false;
 		return features.get_build_options();
 	}
 	else if (opencl_program_name == "displace") {
 		/* As displacement does not use any nodes from the Shading group (eg BSDF).
-		   We disable all features that are related to shading. */
+		 * We disable all features that are related to shading. */
 		DeviceRequestedFeatures features(requested_features);
+		enable_default_features(features);
 		features.use_denoising = false;
 		features.use_object_motion = false;
 		features.use_camera_motion = false;
@@ -114,27 +142,33 @@ string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_
 		features.nodes_features &= ~NODE_FEATURE_VOLUME;
 		features.use_denoising = false;
 		features.use_principled = false;
+		features.use_integrator_branched = false;
 		return features.get_build_options();
 	}
 	else if (opencl_program_name == "background") {
 		/* Background uses Background shading
-		   It is save to disable shadow features, subsurface and volumetric. */
+		 * It is safe to disable shadow features, subsurface and volumetric. */
 		DeviceRequestedFeatures features(requested_features);
+		enable_default_features(features);
 		features.use_baking = false;
+		features.use_object_motion = false;
+		features.use_camera_motion = false;
 		features.use_transparent = false;
 		features.use_shadow_tricks = false;
 		features.use_denoising = false;
 		/* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
-		   Perhaps we should remove them in UI as it does not make any sense when
-		   rendering background. */
+		 * Perhaps we should remove them in UI as it does not make any sense when
+		 * rendering background. */
 		features.nodes_features &= ~NODE_FEATURE_VOLUME;
 		features.use_subsurface = false;
 		features.use_volume = false;
+		features.use_shader_raytrace = false;
+		features.use_patch_evaluation = false;
+		features.use_integrator_branched = false;
 		return features.get_build_options();
 	}

 	string build_options = "-D__SPLIT_KERNEL__ ";
-	DeviceRequestedFeatures nofeatures;
 	/* Set compute device build option. */
 	cl_device_type device_type;
 	OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
@@ -143,28 +177,34 @@ string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_
 		build_options += "-D__COMPUTE_DEVICE_GPU__ ";
 	}

+	DeviceRequestedFeatures nofeatures;
+	enable_default_features(nofeatures);
+
 	/* Add program specific optimized compile directives */
-	if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
-		build_options += nofeatures.get_build_options();
+	if (preview_kernel) {
+		DeviceRequestedFeatures preview_features;
+		preview_features.use_hair = true;
+		build_options += "-D__KERNEL_AO_PREVIEW__ ";
+		build_options += preview_features.get_build_options();
 	}
-	else if (opencl_program_name == "split_subsurface_scatter" && !requested_features.use_subsurface) {
-		/* When subsurface is off, the kernel updates indexes and does not need any
-		   Compile directives */
+	else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
 		build_options += nofeatures.get_build_options();
 	}
 	else {
 		DeviceRequestedFeatures features(requested_features);
+		enable_default_features(features);

 		/* Always turn off baking at this point. Baking is only usefull when building the bake kernel.
+		 * this also makes sure that the kernels that are built during baking can be reused
-		   when not doing any baking. */
+		 * this also makes sure that the kernels that are build during baking can be reused
+		 * when not doing any baking. */
 		features.use_baking = false;

 		/* Do not vary on shaders when program doesn't do any shading.
-		   We have bundled them in a single program. */
+		 * We have bundled them in a single program. */
 		if (opencl_program_name == "split_bundle") {
 			features.max_nodes_group = 0;
 			features.nodes_features = 0;
+			features.use_shader_raytrace = false;
 		}

 		/* No specific settings, just add the regular ones */
@@ -174,6 +214,77 @@ string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_
 	return build_options;
 }

+OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
+{
+	device = device_;
+}
+
+OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
+{
+	program_split.release();
+	program_lamp_emission.release();
+	program_do_volume.release();
+	program_indirect_background.release();
+	program_shader_eval.release();
+	program_holdout_emission_blurring_pathtermination_ao.release();
+	program_subsurface_scatter.release();
+	program_direct_lighting.release();
+	program_shadow_blocked_ao.release();
+	program_shadow_blocked_dl.release();
+}
+
+void OpenCLDevice::OpenCLSplitPrograms::load_kernels(vector<OpenCLProgram*> &programs, const DeviceRequestedFeatures& requested_features, bool is_preview)
+{
+	if (!requested_features.use_baking) {
+#define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name));
+#define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
+		const string program_name_##kernel_name = "split_"#kernel_name; \
+		program_##kernel_name = \
+			OpenCLDevice::OpenCLProgram(device, \
+			                            program_name_##kernel_name, \
+			                            "kernel_"#kernel_name".cl", \
+			                            device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \
+		program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \
+		programs.push_back(&program_##kernel_name);
+
+		/* Ordered with most complex kernels first, to reduce overall compile time. */
+		ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
+		if (requested_features.use_volume || is_preview) {
+			ADD_SPLIT_KERNEL_PROGRAM(do_volume);
+		}
+		ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
+		ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
+		ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
+		ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
+		ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
+		ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
+		ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
+
+		/* Quick kernels bundled in a single program to reduce overhead of starting
+		 * Blender processes. */
+		program_split = OpenCLDevice::OpenCLProgram(device,
+		                                            "split_bundle" ,
+		                                            "kernel_split_bundle.cl",
+		                                            device->get_build_options(requested_features, "split_bundle", is_preview));
+
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
+		ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+		programs.push_back(&program_split);
+
+#undef ADD_SPLIT_KERNEL_PROGRAM
+#undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
+	}
+}
+
 namespace {

 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -280,13 +391,14 @@ public:
 	{
 		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);

-		bool single_program = OpenCLInfo::use_single_program();
-		const string program_name = device->get_opencl_program_name(single_program, kernel_name);
+		const string program_name = device->get_opencl_program_name(kernel_name);
 		kernel->program =
 			OpenCLDevice::OpenCLProgram(device,
 			                            program_name,
-			                            device->get_opencl_program_filename(single_program, kernel_name),
-			                            device->get_build_options(requested_features, program_name));
+			                            device->get_opencl_program_filename(kernel_name),
+			                            device->get_build_options(requested_features,
+			                                                      program_name,
+			                                                      device->use_preview_kernels));

 		kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
 		kernel->program.load();
@@ -306,7 +418,8 @@ public:
 		size_buffer.zero_to_device();

 		uint threads = num_threads;
-		cl_kernel kernel_state_buffer_size = device->program_split(ustring("path_trace_state_buffer_size"));
+		OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
+		cl_kernel kernel_state_buffer_size = programs->program_split(ustring("path_trace_state_buffer_size"));
 		device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);

 		size_t global_size = 64;
@@ -356,7 +469,8 @@ public:
 		cl_int start_sample = rtile.start_sample;
 		cl_int end_sample = rtile.start_sample + rtile.num_samples;

-		cl_kernel kernel_data_init = device->program_split(ustring("path_trace_data_init"));
+		OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
+		cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));

 		cl_uint start_arg_index =
 			device->kernel_set_args(kernel_data_init,
@@ -489,6 +603,8 @@ void OpenCLDevice::opencl_assert_err(cl_int err, const char* where)

 OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
 : Device(info, stats, profiler, background),
+  kernel_programs(this),
+  preview_programs(this),
  memory_manager(this),
  texture_info(this, "__texture_info", MEM_TEXTURE)
 {
@@ -499,6 +615,7 @@ OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, b
 	null_mem = 0;
 	device_initialized = false;
 	textures_need_update = true;
+	use_preview_kernels = !background;

 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
@@ -562,11 +679,16 @@ OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, b
 	device_initialized = true;

 	split_kernel = new OpenCLSplitKernel(this);
+	if (!background) {
+		load_preview_kernels();
+	}
 }

 OpenCLDevice::~OpenCLDevice()
 {
 	task_pool.stop();
+	load_required_kernel_task_pool.stop();
+	load_kernel_task_pool.stop();

 	memory_manager.free();

@@ -582,7 +704,7 @@ OpenCLDevice::~OpenCLDevice()
 	bake_program.release();
 	displace_program.release();
 	background_program.release();
-	program_split.release();
+	denoising_program.release();

 	if(cqCommandQueue)
 		clReleaseCommandQueue(cqCommandQueue);
@@ -649,7 +771,50 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_feature
 	if(!opencl_version_check())
 		return false;

+	load_required_kernels(requested_features);
+
 	vector<OpenCLProgram*> programs;
+	kernel_programs.load_kernels(programs, requested_features, false);
+
+	if (!requested_features.use_baking && requested_features.use_denoising) {
+		denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
+		denoising_program.add_kernel(ustring("filter_divide_shadow"));
+		denoising_program.add_kernel(ustring("filter_get_feature"));
+		denoising_program.add_kernel(ustring("filter_write_feature"));
+		denoising_program.add_kernel(ustring("filter_detect_outliers"));
+		denoising_program.add_kernel(ustring("filter_combine_halves"));
+		denoising_program.add_kernel(ustring("filter_construct_transform"));
+		denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+		denoising_program.add_kernel(ustring("filter_nlm_blur"));
+		denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+		denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+		denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+		denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+		denoising_program.add_kernel(ustring("filter_finalize"));
+		programs.push_back(&denoising_program);
+	}
+
+	load_required_kernel_task_pool.wait_work();
+
+	/* Parallel compilation of Cycles kernels, this launches multiple
+	 * processes to workaround OpenCL frameworks serializing the calls
+	 * internally within a single process. */
+	foreach(OpenCLProgram *program, programs) {
+		if (!program->load()) {
+			load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+		}
+	}
+	return true;
+}
+
+void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures& requested_features)
+{
+	vector<OpenCLProgram*> programs;
+	base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
+	base_program.add_kernel(ustring("convert_to_byte"));
+	base_program.add_kernel(ustring("convert_to_half_float"));
+	base_program.add_kernel(ustring("zero_buffer"));
+	programs.push_back(&base_program);

 	if (requested_features.use_true_displacement) {
 		displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
@@ -663,133 +828,89 @@ bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_feature
 		programs.push_back(&background_program);
 	}

-	bool single_program = OpenCLInfo::use_single_program();
-
-#define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name));
-#define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \
-		const string program_name_##kernel_name = "split_"#kernel_name; \
-		program_##kernel_name = \
-			OpenCLDevice::OpenCLProgram(this, \
-			                            program_name_##kernel_name, \
-			                            "kernel_"#kernel_name".cl", \
-			                            get_build_options(requested_features, program_name_##kernel_name)); \
-		program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \
-		programs.push_back(&program_##kernel_name);
-
-	if (single_program) {
-		program_split = OpenCLDevice::OpenCLProgram(this,
-		                                            "split" ,
-		                                            "kernel_split.cl",
-		                                            get_build_options(requested_features, "split"));
-
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(state_buffer_size);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(data_init);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
-
-		programs.push_back(&program_split);
-	}
-	else {
-		/* Ordered with most complex kernels first, to reduce overall compile time. */
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter);
-		if (requested_features.use_volume) {
-			ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume);
-		}
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background);
-		ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval);
-
-		/* Quick kernels bundled in a single program to reduce overhead of starting
-			* Blender processes. */
-		program_split = OpenCLDevice::OpenCLProgram(this,
-		                                            "split_bundle" ,
-		                                            "kernel_split_bundle.cl",
-		                                            get_build_options(requested_features, "split_bundle"));
-
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(data_init);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(state_buffer_size);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
-		ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
-		programs.push_back(&program_split);
-	}
-#undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM
-#undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM
-
-	base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
-	base_program.add_kernel(ustring("convert_to_byte"));
-	base_program.add_kernel(ustring("convert_to_half_float"));
-	base_program.add_kernel(ustring("zero_buffer"));
-	programs.push_back(&base_program);
-
 	if (requested_features.use_baking) {
 		bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
 		bake_program.add_kernel(ustring("bake"));
 		programs.push_back(&bake_program);
 	}

-	denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
-	denoising_program.add_kernel(ustring("filter_divide_shadow"));
-	denoising_program.add_kernel(ustring("filter_get_feature"));
-	denoising_program.add_kernel(ustring("filter_write_feature"));
-	denoising_program.add_kernel(ustring("filter_detect_outliers"));
-	denoising_program.add_kernel(ustring("filter_combine_halves"));
-	denoising_program.add_kernel(ustring("filter_construct_transform"));
-	denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
-	denoising_program.add_kernel(ustring("filter_nlm_blur"));
-	denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
-	denoising_program.add_kernel(ustring("filter_nlm_update_output"));
-	denoising_program.add_kernel(ustring("filter_nlm_normalize"));
-	denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
-	denoising_program.add_kernel(ustring("filter_finalize"));
-	programs.push_back(&denoising_program);
-
-	/* Parallel compilation of Cycles kernels, this launches multiple
-	 * processes to workaround OpenCL frameworks serializing the calls
-	 * internally within a single process. */
-	TaskPool task_pool;
 	foreach(OpenCLProgram *program, programs) {
-		task_pool.push(function_bind(&OpenCLProgram::load, program));
-	}
-	task_pool.wait_work();
-
-	foreach(OpenCLProgram *program, programs) {
-		VLOG(2) << program->get_log();
-		if(!program->is_loaded()) {
-			program->report_error();
-			return false;
+		if (!program->load()) {
+			load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
 		}
 	}
+}

+void OpenCLDevice::load_preview_kernels()
+{
+	DeviceRequestedFeatures no_features;
+	vector<OpenCLProgram*> programs;
+	preview_programs.load_kernels(programs, no_features, true);
+
+	foreach(OpenCLProgram *program, programs) {
+		if (!program->load()) {
+			load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+		}
+	}
+}
+
+bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures& requested_features)
+{
+	if (background) {
+		load_kernel_task_pool.wait_work();
+		use_preview_kernels = false;
+	}
+	else {
+		/* We use a device setting to determine whether to load preview kernels.
+		 * Better to check on device level than per kernel, as mixing preview and
+		 * non-preview kernels does not work due to different data types. */
+		if (use_preview_kernels) {
+			use_preview_kernels = !load_kernel_task_pool.finished();
+		}
+	}
 	return split_kernel->load_kernels(requested_features);
 }

+OpenCLDevice::OpenCLSplitPrograms* OpenCLDevice::get_split_programs()
+{
+	return use_preview_kernels?&preview_programs:&kernel_programs;
+}
+
+DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
+{
+	/* Do not switch kernels for background renders.
+	 * For foreground renders that are still using the preview kernels,
+	 * check whether the optimized (feature) kernels have finished loading.
+	 *
+	 * This also works the other way around: we may be using the
+	 * optimized kernels while new ones are being compiled because
+	 * other features are now needed. */
+	if (background) {
+		/* The if-statements below would find the same result,
+		 * but as the `finished` method uses a mutex we added
+		 * this as an early exit */
+		return DEVICE_KERNEL_USING_FEATURE_KERNEL;
+	}
+
+	bool other_kernels_finished = load_kernel_task_pool.finished();
+	if (use_preview_kernels) {
+		if (other_kernels_finished) {
+			return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
+		}
+		else {
+			return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL;
+		}
+	}
+	else {
+		if (other_kernels_finished) {
+			return DEVICE_KERNEL_USING_FEATURE_KERNEL;
+		}
+		else {
+			return DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
+		}
+	}
+}
+
 void OpenCLDevice::mem_alloc(device_memory& mem)
 {
 	if(mem.name) {
@@ -891,6 +1012,7 @@ void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int el

 void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
 {
+	base_program.wait_for_availability();
 	cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));

 	size_t global_size[] = {1024, 1024};
@@ -1230,13 +1352,13 @@ void OpenCLDevice::thread_run(DeviceTask *task)

 				/* Complete kernel execution before release tile. */
 				/* This helps in multi-device render;
-					* The device that reaches the critical-section function
-					* release_tile waits (stalling other devices from entering
-					* release_tile) for all kernels to complete. If device1 (a
-					* slow-render device) reaches release_tile first then it would
-					* stall device2 (a fast-render device) from proceeding to render
-					* next tile.
-					*/
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
 				clFinish(cqCommandQueue);
 			}
 			else if(tile.task == RenderTile::DENOISE) {
@@ -1718,17 +1840,15 @@ void OpenCLDevice::shader(DeviceTask& task)
 	cl_int d_shader_w = task.shader_w;
 	cl_int d_offset = task.offset;

-	cl_kernel kernel;
-
+	OpenCLDevice::OpenCLProgram *program = &background_program;
 	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-		kernel = bake_program(ustring("bake"));
+		program = &bake_program;
 	}
 	else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
-		kernel = displace_program(ustring("displace"));
-	}
-	else {
-		kernel = background_program(ustring("background"));
+		program = &displace_program;
 	}
+	program->wait_for_availability();
+	cl_kernel kernel = (*program)();

 	cl_uint start_arg_index =
 		kernel_set_args(kernel,
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -243,6 +243,18 @@ string OpenCLCache::get_kernel_md5()
 	return self.kernel_md5;
 }

+static string get_program_source(const string& kernel_file)
+{
+	/* Kernels are compiled from many files, but OpenCL kernel caches do not
+	 * seem to recognize changes in included files. So the md5 hash of the
+	 * source, after include replacement, is appended to force a recompile. */
+	string include_line = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+	string source = path_source_replace_includes(include_line, path_get("source"));
+	const string source_md5 = util_md5_string(source);
+	source += "\n// " + source_md5 + "\n";
+	return source;
+}
+
 OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
                                               const string& program_name,
                                               const string& kernel_file,
@@ -255,6 +267,7 @@ OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
   use_stdout(use_stdout)
 {
 	loaded = false;
+	needs_compiling = true;
 	program = NULL;
 }

@@ -343,13 +356,7 @@ bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)

 bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
-	/* We compile kernels consisting of many files. unfortunately OpenCL
-	 * kernel caches do not seem to recognize changes in included files.
-	 * so we force recompile on changes by adding the md5 hash of all files.
-	 */
-	source = path_source_replace_includes(source, path_get("source"));
-	source += "\n// " + util_md5_string(source) + "\n";
+	string source = get_program_source(kernel_file);

 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -473,8 +480,7 @@ bool device_opencl_compile_kernel(const vector<string>& parameters)
 		return false;
 	}

-	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\" // " + path_files_md5_hash(path_get("kernel")) + "\n";
-	source = path_source_replace_includes(source, path_get("source"));
+	string source = get_program_source(kernel_file);
 	size_t source_len = source.size();
 	const char *source_str = source.c_str();
 	cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
@@ -548,12 +554,55 @@ bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin)
 	return path_write_binary(clbin, binary);
 }

-void OpenCLDevice::OpenCLProgram::load()
+bool OpenCLDevice::OpenCLProgram::load()
+{
+	loaded = false;
+	string device_md5 = device->device_md5_hash(kernel_build_options);
+
+	/* Load without compiling: try the kernel cache here, then a binary on disk below. */
+	thread_scoped_lock cache_locker;
+	ustring cache_key(program_name + device_md5);
+	program = device->load_cached_kernel(cache_key,
+	                                     cache_locker);
+	if (!program) {
+		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
+
+		/* The binary file name contains the md5 of the full source, so create the source first. */
+		string source = get_program_source(kernel_file);
+
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
+		basename = path_cache_get(path_join("kernels", basename));
+		string clbin = basename + ".clbin";
+
+		/* If a binary kernel already exists on disk, try to use it. */
+		if(path_exists(clbin) && load_binary(clbin)) {
+			/* Kernel loaded from binary, no compilation needed. */
+			add_log(string("Loaded program from ") + clbin + ".", true);
+
+			/* Store the program in the kernel cache under the same key. */
+			device->store_cached_kernel(program,
+			                            cache_key,
+			                            cache_locker);
+		}
+		else {
+			add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
+			cache_locker.unlock();  /* Nothing to store; needs_compiling is left untouched on this path. */
+		}
+	}
+
+	if (program) {  /* NOTE(review): assumes a failed load_binary() leaves `program` NULL — confirm. */
+		create_kernels();  /* NOTE(review): returns void, so a failure inside is not reflected in `loaded` — confirm. */
+		loaded = true;
+		needs_compiling = false;
+	}
+
+	return loaded;
+}
+
+void OpenCLDevice::OpenCLProgram::compile()
 {
 	assert(device);

-	loaded = false;
-
 	string device_md5 = device->device_md5_hash(kernel_build_options);

 	/* Try to use cached kernel. */
@@ -562,12 +611,13 @@ void OpenCLDevice::OpenCLProgram::load()
 	program = device->load_cached_kernel(cache_key,
 	                                     cache_locker);

-	if(!program) {
+	if (!program)
+	{
+
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);

 		/* need to create source to get md5 */
-		string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
-		source = path_source_replace_includes(source, path_get("source"));
+		string source = get_program_source(kernel_file);

 		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
 		basename = path_cache_get(path_join("kernels", basename));
@@ -582,49 +632,38 @@ void OpenCLDevice::OpenCLProgram::load()
 		}

 		/* If binary kernel exists already, try use it. */
-		if(path_exists(clbin) && load_binary(clbin)) {
-			/* Kernel loaded from binary, nothing to do. */
-			add_log(string("Loaded program from ") + clbin + ".", true);
+		if(compile_separate(clbin)) {
+			add_log(string("Built and loaded program from ") + clbin + ".", true);
+			loaded = true;
 		}
 		else {
-			add_log(string("Kernel file ") + clbin + " either doesn't exist or failed to be loaded by driver.", true);
-			if(!path_exists(clbin)) {
-				if(compile_separate(clbin)) {
-					add_log(string("Built and loaded program from ") + clbin + ".", true);
-					loaded = true;
-				}
-				else {
-					add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true);
+			add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true);

-					/* If does not exist or loading binary failed, compile kernel. */
-					if(!compile_kernel(debug_src)) {
-						return;
-					}
-
-					/* Save binary for reuse. */
-					if(!save_binary(clbin)) {
-						add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
-					}
-				}
+			/* If does not exist or loading binary failed, compile kernel. */
+			if(!compile_kernel(debug_src)) {
+				needs_compiling = false;
+				return;
 			}
-			else {
-				add_log(string("Kernel file ") + clbin + "exists, but failed to be loaded by driver.", true);
-				/* Fall back to compiling. */
-				if(!compile_kernel(debug_src)) {
-					return;
-				}
+
+			/* Save binary for reuse. */
+			if(!save_binary(clbin)) {
+				add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
 			}
 		}

 		/* Cache the program. */
 		device->store_cached_kernel(program,
-		                            cache_key,
-		                            cache_locker);
-	}
-	else {
-		add_log(string("Found cached OpenCL program ") + program_name + ".", true);
+									cache_key,
+									cache_locker);
 	}

+	create_kernels();
+	needs_compiling = false;
+	loaded = true;
+}
+
+void OpenCLDevice::OpenCLProgram::create_kernels()
+{
 	for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) {
 		assert(kernel->second == NULL);
 		cl_int ciErr;
@@ -635,8 +674,15 @@ void OpenCLDevice::OpenCLProgram::load()
 			return;
 		}
 	}
+}

-	loaded = true;
+bool OpenCLDevice::OpenCLProgram::wait_for_availability()
+{
+	add_log(string("Waiting for availability of ") + program_name + ".", true);
+	while (needs_compiling) {  /* Polls the flag cleared by load()/compile(). NOTE(review): compile() may run from a task pool — confirm this unsynchronized read is safe. */
+		time_sleep(0.1);
+	}
+	return loaded;  /* Whether the program ended up loaded once the wait is over. */
 }

 void OpenCLDevice::OpenCLProgram::report_error()
@@ -691,28 +737,6 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }

-bool OpenCLInfo::use_single_program()
-{
-	return DebugFlags().opencl.single_program;
-}
-
-bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
-{
-	/* keep this in sync with kernel_types.h! */
-	if(platform == "NVIDIA CUDA")
-		return true;
-	else if(platform == "Apple")
-		return true;
-	else if(platform == "AMD Accelerated Parallel Processing")
-		return true;
-	else if(platform == "Intel(R) OpenCL")
-		return true;
-	/* Make sure officially unsupported OpenCL platforms
-	 * does not set up to use advanced shading.
-	 */
-	return false;
-}
-
 bool OpenCLInfo::device_supported(const string& platform_name,
                                  const cl_device_id device_id)
 {
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -41,7 +41,6 @@ set(SRC_OPENCL_KERNELS
 	kernels/opencl/kernel_displace.cl
 	kernels/opencl/kernel_background.cl
 	kernels/opencl/kernel_state_buffer_size.cl
-	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_split_bundle.cl
 	kernels/opencl/kernel_data_init.cl
 	kernels/opencl/kernel_path_init.cl
@@ -347,11 +346,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")

 	# warn for other versions
-	if(CUDA_VERSION MATCHES "90" OR CUDA_VERSION MATCHES "91" OR CUDA_VERSION MATCHES "100")
+	if(CUDA_VERSION MATCHES "101")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 9.0, 9.1 and 10.0 are officially supported")
+			"build may succeed but only CUDA 10.1 is officially supported")
 	endif()

 	# build for each arch
@@ -401,29 +400,17 @@ if(WITH_CYCLES_CUDA_BINARIES)
 			set(cuda_flags ${cuda_flags} -D __KERNEL_DEBUG__)
 		endif()

-		# Workaround to build only sm_7x kernels with CUDA 10, until
-		# older kernels work well with this version.
-		if(DEFINED CUDA10_NVCC_EXECUTABLE  AND (${arch} MATCHES "sm_7."))
-			set(with_cubin_compiler OFF)
-			set(cuda_nvcc_executable "${CUDA10_NVCC_EXECUTABLE}")
-			set(cuda_toolkit_root_dir "${CUDA10_TOOLKIT_ROOT_DIR}")
-		else()
-			set(with_cubin_compiler ${WITH_CYCLES_CUBIN_COMPILER})
-			set(cuda_nvcc_executable "${CUDA_NVCC_EXECUTABLE}")
-			set(cuda_toolkit_root_dir "${CUDA_TOOLKIT_ROOT_DIR}")
-		endif()
-
-		if(with_cubin_compiler)
+		if(WITH_CYCLES_CUBIN_COMPILER)
 			string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)

 			# Needed to find libnvrtc-builtins.so. Can't do it from inside
 			# cycles_cubin_cc since the env variable is read before main()
 			if(APPLE)
 				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
-					-E env DYLD_LIBRARY_PATH="${cuda_toolkit_root_dir}/lib")
+					-E env DYLD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib")
 			elseif(UNIX)
 				set(CUBIN_CC_ENV ${CMAKE_COMMAND}
-					-E env LD_LIBRARY_PATH="${cuda_toolkit_root_dir}/lib64")
+					-E env LD_LIBRARY_PATH="${CUDA_TOOLKIT_ROOT_DIR}/lib64")
 			endif()

 			add_custom_command(
@@ -434,12 +421,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 						-i ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 						${cuda_flags}
 						-v
-						-cuda-toolkit-dir "${cuda_toolkit_root_dir}"
+						-cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
 				DEPENDS ${kernel_sources} cycles_cubin_cc)
 		else()
 			add_custom_command(
 				OUTPUT ${cuda_cubin}
-				COMMAND ${cuda_nvcc_executable}
+				COMMAND ${CUDA_NVCC_EXECUTABLE}
 						-arch=${arch}
 						${CUDA_NVCC_FLAGS}
 						--cubin
@@ -458,7 +445,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		if(${arch} MATCHES "sm_2.")
 			message(STATUS "CUDA binaries for ${arch} are no longer supported, skipped.")
-		elseif(${arch} MATCHES "sm_7." AND (${CUDA_VERSION} LESS 100) AND (NOT DEFINED CUDA10_NVCC_EXECUTABLE))
+		elseif(${arch} MATCHES "sm_7." AND ${CUDA_VERSION} LESS 100)
 			message(STATUS "CUDA binaries for ${arch} require CUDA 10.0+, skipped.")
 		else()
 			# Compile regular kernel
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -253,6 +253,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
 	PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);

 	if(!scene_intersect_valid(&ray)) {
+		local_isect->num_hits = 0;
 		return false;
 	}
 #ifdef __EMBREE__
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -95,14 +95,16 @@ ccl_device_inline void kernel_filter_finalize(int x, int y,
 	}

 	/* The weighted average of pixel colors (essentially, the NLM-filtered image).
-	 * In case the solution of the linear model fails due to numerical issues,
-	 * fall back to this value. */
+	 * In case the solution of the linear model fails due to numerical issues or
+	 * returns nonsensical negative values, fall back to this value. */
 	float3 mean_color = XtWY[0]/XtWX[0];

 	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);

 	float3 final_color = XtWY[0];
-	if(!isfinite3_safe(final_color)) {
+	if(!isfinite3_safe(final_color) ||
+	   (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f))
+	{
 		final_color = mean_color;
 	}

--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -87,6 +87,45 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 	}
 }

+ccl_device float2 curve_attribute_float2(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, float2 *dy)
+{
+	if(desc.element == ATTR_ELEMENT_CURVE) {
+		/* Per-curve attribute: a single value looked up by primitive index.
+		 *
+		 * Idea: no useful differentials can be derived here, but for tiled
+		 * mipmap image caching it would be useful to avoid always reading the
+		 * highest detail level. Maybe a derivative based on the hair density
+		 * could be computed somehow? */
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = make_float2(0.0f, 0.0f);
+		if(dy) *dy = make_float2(0.0f, 0.0f);
+#endif
+		return kernel_tex_fetch(__attributes_float2, desc.offset + sd->prim);
+	}
+
+	if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
+		/* Per-key attribute: blend the two keys of the segment with sd->u. */
+		float4 curve_info = kernel_tex_fetch(__curves, sd->prim);
+		int first_key = __float_as_int(curve_info.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+		int next_key = first_key + 1;
+
+		float2 first_value = kernel_tex_fetch(__attributes_float2, desc.offset + first_key);
+		float2 next_value = kernel_tex_fetch(__attributes_float2, desc.offset + next_key);
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*(next_value - first_value);
+		if(dy) *dy = make_float2(0.0f, 0.0f);
+#endif
+		return (1.0f - sd->u)*first_value + sd->u*next_value;
+	}
+
+	/* Any other element type: zero value and zero differentials. */
+#ifdef __RAY_DIFFERENTIALS__
+	if(dx) *dx = make_float2(0.0f, 0.0f);
+	if(dy) *dy = make_float2(0.0f, 0.0f);
+#endif
+	return make_float2(0.0f, 0.0f);
+}
+
 ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
 	if(desc.element == ATTR_ELEMENT_CURVE) {
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -284,6 +284,33 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
 	return val;
 }

+ccl_device float2 patch_eval_float2(KernelGlobals *kg, const ShaderData *sd, int offset,
+                                    int patch, float u, float v, int channel,
+                                    float2 *du, float2 *dv)
+{
+	int indices[PATCH_MAX_CONTROL_VERTS];
+	float weights[PATCH_MAX_CONTROL_VERTS];
+	float weights_du[PATCH_MAX_CONTROL_VERTS];
+	float weights_dv[PATCH_MAX_CONTROL_VERTS];
+	/* Fills the index and weight arrays for (u, v); returns how many entries are valid. */
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
+	                                           indices, weights, weights_du, weights_dv);
+
+	float2 val = make_float2(0.0f, 0.0f);
+	if(du) *du = make_float2(0.0f, 0.0f);
+	if(dv) *dv = make_float2(0.0f, 0.0f);
+
+	for(int i = 0; i < num_control; i++) {
+		float2 value = kernel_tex_fetch(__attributes_float2, offset + indices[i]);
+		/* Named `value` so it does not shadow the `v` parameter. */
+		val += value * weights[i];
+		if(du) *du += value * weights_du[i];
+		if(dv) *dv += value * weights_dv[i];
+	}
+
+	return val;
+}
+
 ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int offset,
                                    int patch, float u, float v, int channel,
                                    float3 *du, float3 *dv)
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -89,6 +89,37 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
 }
 #endif

+ccl_device_inline float2 primitive_attribute_float2(KernelGlobals *kg,
+                                                    const ShaderData *sd,
+                                                    const AttributeDescriptor desc,
+                                                    float2 *dx, float2 *dy)
+{
+	/* Dispatch on the primitive type; every handled type returns directly. */
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+		if(subd_triangle_patch(kg, sd) != ~0) {
+			return subd_triangle_attribute_float2(kg, sd, desc, dx, dy);
+		}
+		return triangle_attribute_float2(kg, sd, desc, dx, dy);
+	}
+#ifdef __HAIR__
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		return curve_attribute_float2(kg, sd, desc, dx, dy);
+	}
+#endif
+#ifdef __VOLUME__
+	if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+		/* Voxel elements trip the assert, then fall through to the
+		 * zero result below. */
+		kernel_assert(0);
+	}
+#endif
+
+	/* Everything else: zero value and zero differentials. */
+	if(dx) *dx = make_float2(0.0f, 0.0f);
+	if(dy) *dy = make_float2(0.0f, 0.0f);
+	return make_float2(0.0f, 0.0f);
+}
+
 ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
                                                    const ShaderData *sd,
                                                    const AttributeDescriptor desc,
@@ -119,6 +150,29 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
 	}
 }

+ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
+                                                            const ShaderData *sd,
+                                                            const AttributeDescriptor desc,
+                                                            float2 *dx, float2 *dy)
+{
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+		if(subd_triangle_patch(kg, sd) != ~0) {
+			return subd_triangle_attribute_float2(kg, sd, desc, dx, dy);
+		}
+		return triangle_attribute_float2(kg, sd, desc, dx, dy);
+	}
+#ifdef __HAIR__
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		return curve_attribute_float2(kg, sd, desc, dx, dy);
+	}
+#endif
+
+	/* No primitive type matched above: zero value and zero differentials. */
+	if(dx) *dx = make_float2(0.0f, 0.0f);
+	if(dy) *dy = make_float2(0.0f, 0.0f);
+	return make_float2(0.0f, 0.0f);
+}
+
 ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
                                                            const ShaderData *sd,
                                                            const AttributeDescriptor desc,
@@ -165,9 +219,8 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
 	if(desc.offset == ATTR_STD_NOT_FOUND)
 		return make_float3(0.0f, 0.0f, 0.0f);

-	float3 uv = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-	uv.z = 1.0f;
-	return uv;
+	float2 uv = primitive_surface_attribute_float2(kg, sd, desc, NULL, NULL);
+	return make_float3(uv.x, uv.y, 1.0f);
 }

 /* Ptex coordinates */
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -216,6 +216,128 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 	}
 }

+ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, float2 *dy)
+{
+	int patch = subd_triangle_patch(kg, sd);
+
+#ifdef __PATCH_EVAL__
+	if(desc.flags & ATTR_SUBDIVIDED) {
+		float2 uv[3];
+		subd_triangle_patch_uv(kg, sd, uv);
+		/* Differences of the uv[] entries relative to uv[2]. */
+		float2 dpdu = uv[0] - uv[2];
+		float2 dpdv = uv[1] - uv[2];
+
+		/* p is [s, t]: uv[] interpolated with sd->u and sd->v, passed on as the patch coordinates. */
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+
+		float2 a, dads, dadt;
+		/* Evaluate the attribute; dads and dadt receive the derivative outputs. */
+		a = patch_eval_float2(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx || dy) {
+			float dsdu = dpdu.x;
+			float dtdu = dpdu.y;
+			float dsdv = dpdv.x;
+			float dtdv = dpdv.y;
+			/* Chain rule: da/dx = da/ds * ds/dx + da/dt * dt/dx, and likewise for y. */
+			if(dx) {
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
+
+				float dsdx = dsdu*dudx + dsdv*dvdx;
+				float dtdx = dtdu*dudx + dtdv*dvdx;
+
+				*dx = dads*dsdx + dadt*dtdx;
+			}
+			if(dy) {
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
+
+				float dsdy = dsdu*dudy + dsdv*dvdy;
+				float dtdy = dtdu*dudy + dtdv*dvdy;
+
+				*dy = dads*dsdy + dadt*dtdy;
+			}
+		}
+#endif
+
+		return a;
+	}
+	else
+#endif  /* __PATCH_EVAL__ */
+		if(desc.element == ATTR_ELEMENT_FACE) {
+			if(dx) *dx = make_float2(0.0f, 0.0f);
+			if(dy) *dy = make_float2(0.0f, 0.0f);
+
+			return kernel_tex_fetch(__attributes_float2, desc.offset + subd_triangle_patch_face(kg, patch));
+		}
+		else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
+			float2 uv[3];
+			subd_triangle_patch_uv(kg, sd, uv);
+
+			uint4 v = subd_triangle_patch_indices(kg, patch);
+
+			float2 f0 = kernel_tex_fetch(__attributes_float2, desc.offset + v.x);
+			float2 f1 = kernel_tex_fetch(__attributes_float2, desc.offset + v.y);
+			float2 f2 = kernel_tex_fetch(__attributes_float2, desc.offset + v.z);
+			float2 f3 = kernel_tex_fetch(__attributes_float2, desc.offset + v.w);
+			/* For patches without exactly 4 corners, f1 and f3 are replaced by their averages with f0. */
+			if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+				f1 = (f1+f0)*0.5f;
+				f3 = (f3+f0)*0.5f;
+			}
+			/* Bilinear blend of f0..f3, evaluated at each of the three uv[] entries. */
+			float2 a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+			float2 b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+			float2 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+			if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+			if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
+#endif
+
+			return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
+		}
+		else if(desc.element == ATTR_ELEMENT_CORNER) {
+			float2 uv[3];
+			subd_triangle_patch_uv(kg, sd, uv);
+
+			int corners[4];
+			subd_triangle_patch_corners(kg, patch, corners);
+
+			float2 f0, f1, f2, f3;
+
+			f0 = kernel_tex_fetch(__attributes_float2, corners[0] + desc.offset);
+			f1 = kernel_tex_fetch(__attributes_float2, corners[1] + desc.offset);
+			f2 = kernel_tex_fetch(__attributes_float2, corners[2] + desc.offset);
+			f3 = kernel_tex_fetch(__attributes_float2, corners[3] + desc.offset);
+			/* For patches without exactly 4 corners, f1 and f3 are replaced by their averages with f0. */
+			if(subd_triangle_patch_num_corners(kg, patch) != 4) {
+				f1 = (f1+f0)*0.5f;
+				f3 = (f3+f0)*0.5f;
+			}
+			/* Bilinear blend of f0..f3, evaluated at each of the three uv[] entries. */
+			float2 a = mix(mix(f0, f1, uv[0].x), mix(f3, f2, uv[0].x), uv[0].y);
+			float2 b = mix(mix(f0, f1, uv[1].x), mix(f3, f2, uv[1].x), uv[1].y);
+			float2 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
+
+#ifdef __RAY_DIFFERENTIALS__
+			if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+			if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
+#endif
+
+			return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
+		}
+		else {
+			if(dx) *dx = make_float2(0.0f, 0.0f);
+			if(dy) *dy = make_float2(0.0f, 0.0f);
+
+			return make_float2(0.0f, 0.0f);
+		}
+}
+
 ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
 	int patch = subd_triangle_patch(kg, sd);
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -149,6 +149,53 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 	}
 }

+ccl_device float2 triangle_attribute_float2(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, float2 *dy)
+{
+	if(desc.element == ATTR_ELEMENT_FACE) {
+		if(dx) *dx = make_float2(0.0f, 0.0f);
+		if(dy) *dy = make_float2(0.0f, 0.0f);
+
+		return kernel_tex_fetch(__attributes_float2, desc.offset + sd->prim);
+	}
+	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+
+		float2 f0 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.x);
+		float2 f1 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.y);
+		float2 f2 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.z);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else if(desc.element == ATTR_ELEMENT_CORNER) {
+		/* Corner attributes hold three consecutive values per triangle,
+		 * one for each corner. The element type was already checked by
+		 * the enclosing branch, so the values are always fetched here. */
+		int tri = desc.offset + sd->prim*3;
+
+		float2 f0 = kernel_tex_fetch(__attributes_float2, tri + 0);
+		float2 f1 = kernel_tex_fetch(__attributes_float2, tri + 1);
+		float2 f2 = kernel_tex_fetch(__attributes_float2, tri + 2);
+
+#ifdef __RAY_DIFFERENTIALS__
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+#endif
+
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+	}
+	else {
+		if(dx) *dx = make_float2(0.0f, 0.0f);
+		if(dy) *dy = make_float2(0.0f, 0.0f);
+
+		return make_float2(0.0f, 0.0f);
+	}
+}
+
 ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
 	if(desc.element == ATTR_ELEMENT_FACE) {
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -351,7 +351,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 				out = make_float3(roughness, roughness, roughness);
 			}
 			else {
-				out = shader_emissive_eval(kg, &sd);
+				out = shader_emissive_eval(&sd);
 			}
 			break;
 		}
@@ -475,8 +475,9 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 			shader_setup_from_background(kg, &sd, &ray);

 			/* evaluate */
-			int flag = 0; /* we can't know which type of BSDF this is for */
-			out = shader_eval_background(kg, &sd, &state, flag);
+			int path_flag = 0; /* we can't know which type of BSDF this is for */
+			shader_eval_surface(kg, &sd, &state, path_flag | PATH_RAY_EMISSION);
+			out = shader_background_eval(&sd);
 			break;
 		}
 		default:
@@ -554,8 +555,9 @@ ccl_device void kernel_background_evaluate(KernelGlobals *kg,
 	shader_setup_from_background(kg, &sd, &ray);

 	/* evaluate */
-	int flag = 0; /* we can't know which type of BSDF this is for */
-	float3 color = shader_eval_background(kg, &sd, &state, flag);
+	int path_flag = 0; /* we can't know which type of BSDF this is for */
+	shader_eval_surface(kg, &sd, &state, path_flag | PATH_RAY_EMISSION);
+	float3 color = shader_background_eval(&sd);

 	/* write output */
 	output[i] += make_float4(color.x, color.y, color.z, 0.0f);
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -125,7 +125,9 @@
 #define fmodf(x, y) fmod((float)(x), (float)(y))
 #define sinhf(x) sinh(((float)(x)))

-#ifndef __CL_USE_NATIVE__
+/* Use native functions with possibly lower precision for performance,
+ * no issues found so far. */
+#if 1
 #  define sinf(x) native_sin(((float)(x)))
 #  define cosf(x) native_cos(((float)(x)))
 #  define tanf(x) native_tan(((float)(x)))
@@ -140,7 +142,7 @@
 #  define expf(x) exp(((float)(x)))
 #  define sqrtf(x) sqrt(((float)(x)))
 #  define logf(x) log(((float)(x)))
-#  define rcp(x)  recip(x))
+#  define rcp(x)  recip(x)
 #endif

 /* data lookup defines */
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -29,43 +29,36 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 	/* setup shading at emitter */
 	float3 eval;

-	int shader_flag = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).flags;
-
-#ifdef __BACKGROUND_MIS__
-	if(ls->type == LIGHT_BACKGROUND) {
-		Ray ray;
-		ray.D = ls->D;
-		ray.P = ls->P;
-		ray.t = 1.0f;
-		ray.time = time;
-		ray.dP = differential3_zero();
-		ray.dD = dI;
-
-		shader_setup_from_background(kg, emission_sd, &ray);
-
-		path_state_modify_bounce(state, true);
-		eval = shader_eval_background(kg, emission_sd, state, 0);
-		path_state_modify_bounce(state, false);
-	}
-	else
-#endif
-	if(shader_flag & SD_HAS_CONSTANT_EMISSION)
-	{
-		eval.x = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[0];
-		eval.y = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[1];
-		eval.z = kernel_tex_fetch(__shaders, (ls->shader & SHADER_MASK)).constant_emission[2];
+	if(shader_constant_emission_eval(kg, ls->shader, &eval)) {
 		if((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
 			ls->Ng = -ls->Ng;
 		}
 	}
-	else
-	{
-		shader_setup_from_sample(kg, emission_sd,
-		                         ls->P, ls->Ng, I,
-		                         ls->shader, ls->object, ls->prim,
-		                         ls->u, ls->v, t, time, false, ls->lamp);
+	else {
+		/* Setup shader data and call shader_eval_surface once, better
+		 * for GPU coherence and compile times. */
+#ifdef __BACKGROUND_MIS__
+		if(ls->type == LIGHT_BACKGROUND) {
+			Ray ray;
+			ray.D = ls->D;
+			ray.P = ls->P;
+			ray.t = 1.0f;
+			ray.time = time;
+			ray.dP = differential3_zero();
+			ray.dD = dI;

-		ls->Ng = emission_sd->Ng;
+			shader_setup_from_background(kg, emission_sd, &ray);
+		}
+		else
+#endif
+		{
+			shader_setup_from_sample(kg, emission_sd,
+			                         ls->P, ls->Ng, I,
+			                         ls->shader, ls->object, ls->prim,
+			                         ls->u, ls->v, t, time, false, ls->lamp);
+
+			ls->Ng = emission_sd->Ng;
+		}

 		/* No proper path flag, we're evaluating this for all closures. that's
 		 * weak but we'd have to do multiple evaluations otherwise. */
@@ -73,8 +66,16 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		shader_eval_surface(kg, emission_sd, state, PATH_RAY_EMISSION);
 		path_state_modify_bounce(state, false);

-		/* Evaluate emissive closure. */
-		eval = shader_emissive_eval(kg, emission_sd);
+		/* Evaluate closures. */
+#ifdef __BACKGROUND_MIS__
+		if (ls->type == LIGHT_BACKGROUND) {
+			eval = shader_background_eval(emission_sd);
+		}
+		else
+#endif
+		{
+			eval = shader_emissive_eval(emission_sd);
+		}
 	}

 	eval *= ls->eval_fac;
@@ -201,7 +202,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
 {
 	/* evaluate emissive closure */
-	float3 L = shader_emissive_eval(kg, sd);
+	float3 L = shader_emissive_eval(sd);

 #ifdef __HAIR__
 	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
@@ -294,7 +295,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 #ifdef __BACKGROUND__
 	int shader = kernel_data.background.surface_shader;

-	/* use visibility flag to skip lights */
+	/* Use visibility flag to skip lights. */
 	if(shader & SHADER_EXCLUDE_ANY) {
 		if(((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
 		   ((shader & SHADER_EXCLUDE_GLOSSY) &&
@@ -305,20 +306,27 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 			return make_float3(0.0f, 0.0f, 0.0f);
 	}

-	/* evaluate background closure */
+
+	/* Evaluate background shader. */
+	float3 L;
+	if(!shader_constant_emission_eval(kg, shader, &L)) {
 #  ifdef __SPLIT_KERNEL__
-	Ray priv_ray = *ray;
-	shader_setup_from_background(kg, emission_sd, &priv_ray);
+		Ray priv_ray = *ray;
+		shader_setup_from_background(kg, emission_sd, &priv_ray);
 #  else
-	shader_setup_from_background(kg, emission_sd, ray);
+		shader_setup_from_background(kg, emission_sd, ray);
 #  endif

-	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, emission_sd, state, state->flag);
-	path_state_modify_bounce(state, false);
+		path_state_modify_bounce(state, true);
+		shader_eval_surface(kg, emission_sd, state, state->flag | PATH_RAY_EMISSION);
+		path_state_modify_bounce(state, false);

+		L = shader_background_eval(emission_sd);
+	}
+
+	/* Background MIS weights. */
 #ifdef __BACKGROUND_MIS__
-	/* check if background light exists or if we should skip pdf */
+	/* Check if background light exists or if we should skip pdf. */
 	int res_x = kernel_data.integrator.pdf_background_res_x;

 	if(!(state->flag & PATH_RAY_MIS_SKIP) && res_x) {
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -44,7 +44,7 @@ typedef struct LightSample {
 *
 * Note: light_p is modified when sample_coord is true.
 */
-ccl_device_inline float area_light_sample(float3 P,
+ccl_device_inline float rect_light_sample(float3 P,
                                          float3 *light_p,
                                          float3 axisu, float3 axisv,
                                          float randu, float randv,
@@ -118,6 +118,60 @@ ccl_device_inline float area_light_sample(float3 P,
 		return 0.0f;
 }

+ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
+{  /* Returns ru*randu + rv*randv, using randu/randv as rewritten by to_unit_disk(). */
+	to_unit_disk(&randu, &randv);  /* updates both values in place; presumably a square-to-disk mapping -- confirm */
+	return ru*randu + rv*randv;
+}
+
+ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
+{  /* Builds two axes from v with make_orthonormals() and returns ellipse_sample() over them. */
+	float3 ru, rv;
+
+	make_orthonormals(v, &ru, &rv);  /* presumably ru and rv are unit vectors perpendicular to v -- confirm */
+
+	return ellipse_sample(ru, rv, randu, randv);
+}
+
+ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
+{  /* Returns the normalized sum of D and a disk sample built around D, scaled by radius. */
+	return normalize(D + disk_light_sample(D, randu, randv)*radius);
+}
+
+ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
+{  /* Returns a disk sample built around normalize(P - center), scaled by radius; center is not added back here. */
+	return disk_light_sample(normalize(P - center), randu, randv)*radius;
+}
+
+ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, LightSample *ls)
+{  /* Returns dot(dir, ls->Ng), zeroed at or below spot_angle and scaled by smoothstepf() within spot_smooth of it. */
+	float3 I = ls->Ng;
+
+	float attenuation = dot(dir, I);
+
+	if(attenuation <= spot_angle) {  /* spot_angle is compared with a dot product, so presumably it holds a cosine -- confirm */
+		attenuation = 0.0f;
+	}
+	else {
+		float t = attenuation - spot_angle;
+
+		if(t < spot_smooth && spot_smooth != 0.0f)  /* the != 0 test guards the division below */
+			attenuation *= smoothstepf(t/spot_smooth);
+	}
+
+	return attenuation;
+}
+
+ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+{  /* Returns t*t / dot(Ng, I), or 0 when that dot product is not positive; kg is not used in this body. */
+	float cos_pi = dot(Ng, I);
+
+	if(cos_pi <= 0.0f)  /* also avoids dividing by zero below */
+		return 0.0f;
+
+	return t*t/cos_pi;  /* callers in this diff multiply the result by the light's inverse area */
+}
+
 /* Background Light */

 #ifdef __BACKGROUND_MIS__
@@ -291,11 +345,19 @@ ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
 		const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
 		float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
 		float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+		bool is_round = (klight->area.invarea < 0.0f);

-		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL))
+		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL, is_round))
 			continue;

-		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+		if(is_round) {
+			float t;
+			float3 D = normalize_len(lightpos - P, &t);
+			portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+		}
+		else {
+			portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+		}
 	}

 	if(ignore_portal >= 0) {
@@ -345,15 +407,26 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
 			const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
 			float3 axisu = make_float3(klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
 			float3 axisv = make_float3(klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+			bool is_round = (klight->area.invarea < 0.0f);

-			*pdf = area_light_sample(P, &lightpos,
-			                         axisu, axisv,
-			                         randu, randv,
-			                         true);
+			float3 D;
+			if(is_round) {
+				lightpos += ellipse_sample(axisu*0.5f, axisv*0.5f, randu, randv);
+				float t;
+				D = normalize_len(lightpos - P, &t);
+				*pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+			}
+			else {
+				*pdf = rect_light_sample(P, &lightpos,
+				                         axisu, axisv,
+				                         randu, randv,
+				                         true);
+				D = normalize(lightpos - P);
+			}

 			*pdf /= num_possible;
 			*sampled_portal = p;
-			return normalize(lightpos - P);
+			return D;
 		}

 		portal--;
@@ -454,55 +527,6 @@ ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direct

 /* Regular Light */

-ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
-{
-	float3 ru, rv;
-
-	make_orthonormals(v, &ru, &rv);
-	to_unit_disk(&randu, &randv);
-
-	return ru*randu + rv*randv;
-}
-
-ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
-{
-	return normalize(D + disk_light_sample(D, randu, randv)*radius);
-}
-
-ccl_device float3 sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
-{
-	return disk_light_sample(normalize(P - center), randu, randv)*radius;
-}
-
-ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, LightSample *ls)
-{
-	float3 I = ls->Ng;
-
-	float attenuation = dot(dir, I);
-
-	if(attenuation <= spot_angle) {
-		attenuation = 0.0f;
-	}
-	else {
-		float t = attenuation - spot_angle;
-
-		if(t < spot_smooth && spot_smooth != 0.0f)
-			attenuation *= smoothstepf(t/spot_smooth);
-	}
-
-	return attenuation;
-}
-
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
-{
-	float cos_pi = dot(Ng, I);
-
-	if(cos_pi <= 0.0f)
-		return 0.0f;
-
-	return t*t/cos_pi;
-}
-
 ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
                                         int lamp,
                                         float randu, float randv,
@@ -597,26 +621,39 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 			float3 D = make_float3(klight->area.dir[0],
 			                       klight->area.dir[1],
 			                       klight->area.dir[2]);
+			float invarea = fabsf(klight->area.invarea);
+			bool is_round = (klight->area.invarea < 0.0f);

 			if(dot(ls->P - P, D) > 0.0f) {
 				return false;
 			}

-			float3 inplane = ls->P;
-			ls->pdf = area_light_sample(P, &ls->P,
-			                          axisu, axisv,
-			                          randu, randv,
-			                          true);
+			float3 inplane;
+
+			if(is_round) {
+				inplane = ellipse_sample(axisu*0.5f, axisv*0.5f, randu, randv);
+				ls->P += inplane;
+				ls->pdf = invarea;
+			}
+			else {
+				inplane = ls->P;
+				ls->pdf = rect_light_sample(P, &ls->P,
+				                            axisu, axisv,
+				                            randu, randv,
+				                            true);
+				inplane = ls->P - inplane;
+			}

-			inplane = ls->P - inplane;
 			ls->u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu)) + 0.5f;
 			ls->v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv)) + 0.5f;

 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);

-			float invarea = klight->area.invarea;
 			ls->eval_fac = 0.25f*invarea;
+			if(is_round) {
+				ls->pdf *= lamp_light_pdf(kg, D, -ls->D, ls->t);
+			}
 		}
 	}

@@ -727,7 +764,8 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 	}
 	else if(type == LIGHT_AREA) {
 		/* area light */
-		float invarea = klight->area.invarea;
+		float invarea = fabsf(klight->area.invarea);
+		bool is_round = (klight->area.invarea < 0.0f);
 		if(invarea == 0.0f)
 			return false;

@@ -750,14 +788,20 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 		if(!ray_quad_intersect(P, D, 0.0f, t, light_P,
 		                       axisu, axisv, Ng,
 		                       &ls->P, &ls->t,
-		                       &ls->u, &ls->v))
+		                       &ls->u, &ls->v,
+		                       is_round))
 		{
 			return false;
 		}

 		ls->D = D;
 		ls->Ng = Ng;
-		ls->pdf = area_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
+		if(is_round) {
+			ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
+		}
+		else {
+			ls->pdf = rect_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
+		}
 		ls->eval_fac = 0.25f*invarea;
 	}
 	else {
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -56,7 +56,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 	PROFILING_INIT(kg, PROFILING_SHADER_SETUP);

 #ifdef __INSTANCING__
-	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 	sd->lamp = LAMP_NONE;

@@ -299,6 +299,10 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
 		sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
 		sd->lamp = lamp;
+#else
+	}
+	else if(lamp != LAMP_NONE) {
+		sd->lamp = lamp;
 #endif
 	}

@@ -407,7 +411,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 	sd->ray_length = 0.0f;

 #ifdef __INSTANCING__
-	sd->object = PRIM_NONE;
+	sd->object = OBJECT_NONE;
 #endif
 	sd->lamp = LAMP_NONE;
 	sd->prim = PRIM_NONE;
@@ -453,7 +457,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */

 #  ifdef __INSTANCING__
-	sd->object = PRIM_NONE; /* todo: fill this for texture coordinates */
+	sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
 #  endif
 	sd->lamp = LAMP_NONE;
 	sd->prim = PRIM_NONE;
@@ -980,9 +984,40 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 }
 #endif  /* __SUBSURFACE__ */

+/* Constant emission optimization: read a shader's stored constant emission from __shaders. */
+
+ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval)
+{  /* Returns true and fills *eval when SD_HAS_CONSTANT_EMISSION is set; otherwise returns false and leaves *eval unwritten. */
+	int shader_index = shader & SHADER_MASK;  /* keep only the SHADER_MASK bits before indexing __shaders */
+	int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
+
+	if (shader_flag & SD_HAS_CONSTANT_EMISSION) {
+		*eval = make_float3(
+			kernel_tex_fetch(__shaders, shader_index).constant_emission[0],
+			kernel_tex_fetch(__shaders, shader_index).constant_emission[1],
+			kernel_tex_fetch(__shaders, shader_index).constant_emission[2]);
+
+		return true;
+	}
+
+	return false;  /* callers in this diff then set up shader data and run shader_eval_surface() instead */
+}
+
+/* Background: read back the emission stored in already-evaluated shader data. */
+
+ccl_device float3 shader_background_eval(ShaderData *sd)
+{  /* Does not evaluate the shader; callers in this diff run shader_eval_surface() on sd first. */
+	if(sd->flag & SD_EMISSION) {
+		return sd->closure_emission_background;
+	}
+	else {
+		return make_float3(0.0f, 0.0f, 0.0f);  /* SD_EMISSION not set: report zero */
+	}
+}
+
 /* Emission */

-ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_emissive_eval(ShaderData *sd)
 {
 	if(sd->flag & SD_EMISSION) {
 		return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
@@ -1030,20 +1065,32 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	sd->num_closure_left = max_closures;

 #ifdef __OSL__
-	if(kg->osl)
-		OSLShader::eval_surface(kg, sd, state, path_flag);
+	if(kg->osl) {
+		if (sd->object == OBJECT_NONE) {
+			OSLShader::eval_background(kg, sd, state, path_flag);
+		}
+		else {
+			OSLShader::eval_surface(kg, sd, state, path_flag);
+		}
+	}
 	else
 #endif
 	{
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
 #else
-		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
-		                                             sizeof(DiffuseBsdf),
-		                                             make_float3(0.8f, 0.8f, 0.8f));
-		if(bsdf != NULL) {
-			bsdf->N = sd->N;
-			sd->flag |= bsdf_diffuse_setup(bsdf);
+		if(sd->object == OBJECT_NONE) {
+			sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
+			sd->flag |= SD_EMISSION;
+		}
+		else {
+			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
+			                                             sizeof(DiffuseBsdf),
+			                                             make_float3(0.8f, 0.8f, 0.8f));
+			if(bsdf != NULL) {
+				bsdf->N = sd->N;
+				sd->flag |= bsdf_diffuse_setup(bsdf);
+			}
 		}
 #endif
 	}
@@ -1053,36 +1100,6 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
 	}
 }

-/* Background Evaluation */
-
-ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
-	ccl_addr_space PathState *state, int path_flag)
-{
-	sd->num_closure = 0;
-	sd->num_closure_left = 0;
-
-#ifdef __SVM__
-#  ifdef __OSL__
-	if(kg->osl) {
-		OSLShader::eval_background(kg, sd, state, path_flag);
-	}
-	else
-#  endif  /* __OSL__ */
-	{
-		svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
-	}
-
-	if(sd->flag & SD_EMISSION) {
-		return sd->closure_emission_background;
-	}
-	else {
-		return make_float3(0.0f, 0.0f, 0.0f);
-	}
-#else  /* __SVM__ */
-	return make_float3(0.8f, 0.8f, 0.8f);
-#endif  /* __SVM__ */
-}
-
 /* Volume */

 #ifdef __VOLUME__
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -56,6 +56,7 @@ KERNEL_TEX(uint, __patches)
 /* attributes */
 KERNEL_TEX(uint4, __attributes_map)
 KERNEL_TEX(float, __attributes_float)
+KERNEL_TEX(float2, __attributes_float2)
 KERNEL_TEX(float4, __attributes_float3)
 KERNEL_TEX(uchar4, __attributes_uchar4)

--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -83,92 +83,6 @@ CCL_NAMESPACE_BEGIN
 #  define SHADER_SORT_LOCAL_SIZE 1
 #endif

-
-/* Device capabilities */
-#ifdef __KERNEL_CPU__
-#  ifdef __KERNEL_SSE2__
-#    define __QBVH__
-#  endif
-#  define __KERNEL_SHADING__
-#  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
-#  ifdef WITH_OSL
-#    define __OSL__
-#  endif
-#  define __PRINCIPLED__
-#  define __SUBSURFACE__
-#  define __CMJ__
-#  define __VOLUME__
-#  define __VOLUME_SCATTER__
-#  define __SHADOW_RECORD_ALL__
-#  define __VOLUME_DECOUPLED__
-#  define __VOLUME_RECORD_ALL__
-#endif  /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_CUDA__
-#  define __KERNEL_SHADING__
-#  define __KERNEL_ADV_SHADING__
-#  define __VOLUME__
-#  define __VOLUME_SCATTER__
-#  define __SUBSURFACE__
-#  define __PRINCIPLED__
-#  define __SHADOW_RECORD_ALL__
-#  define __CMJ__
-#  ifndef __SPLIT_KERNEL__
-#    define __BRANCHED_PATH__
-#  endif
-#endif  /* __KERNEL_CUDA__ */
-
-#ifdef __KERNEL_OPENCL__
-
-/* keep __KERNEL_ADV_SHADING__ in sync with opencl_kernel_use_advanced_shading! */
-
-#  ifdef __KERNEL_OPENCL_NVIDIA__
-#    define __KERNEL_SHADING__
-#    define __KERNEL_ADV_SHADING__
-#    define __SUBSURFACE__
-#    define __PRINCIPLED__
-#    define __VOLUME__
-#    define __VOLUME_SCATTER__
-#    define __SHADOW_RECORD_ALL__
-#    define __CMJ__
-#    define __BRANCHED_PATH__
-#  endif  /* __KERNEL_OPENCL_NVIDIA__ */
-
-#  ifdef __KERNEL_OPENCL_APPLE__
-#    define __KERNEL_SHADING__
-#    define __KERNEL_ADV_SHADING__
-#    define __PRINCIPLED__
-#    define __CMJ__
-/* TODO(sergey): Currently experimental section is ignored here,
- * this is because megakernel in device_opencl does not support
- * custom cflags depending on the scene features.
- */
-#  endif  /* __KERNEL_OPENCL_APPLE__ */
-
-#  ifdef __KERNEL_OPENCL_AMD__
-#    define __CL_USE_NATIVE__
-#    define __KERNEL_SHADING__
-#    define __KERNEL_ADV_SHADING__
-#    define __SUBSURFACE__
-#    define __PRINCIPLED__
-#    define __VOLUME__
-#    define __VOLUME_SCATTER__
-#    define __SHADOW_RECORD_ALL__
-#    define __CMJ__
-#    define __BRANCHED_PATH__
-#  endif  /* __KERNEL_OPENCL_AMD__ */
-
-#  ifdef __KERNEL_OPENCL_INTEL_CPU__
-#    define __CL_USE_NATIVE__
-#    define __KERNEL_SHADING__
-#    define __KERNEL_ADV_SHADING__
-#    define __PRINCIPLED__
-#    define __CMJ__
-#  endif  /* __KERNEL_OPENCL_INTEL_CPU__ */
-
-#endif  /* __KERNEL_OPENCL__ */
-
 /* Kernel features */
 #define __SOBOL__
 #define __INSTANCING__
@@ -185,28 +99,55 @@ CCL_NAMESPACE_BEGIN
 #define __SHADOW_TRICKS__
 #define __DENOISING_FEATURES__
 #define __SHADER_RAYTRACE__
+#define __AO__
+#define __PASSES__
+#define __HAIR__

-#ifdef __KERNEL_SHADING__
+/* Without these we get an AO render, used by OpenCL preview kernel. */
+#ifndef __KERNEL_AO_PREVIEW__
 #  define __SVM__
 #  define __EMISSION__
 #  define __TEXTURES__
 #  define __EXTRA_NODES__
 #  define __HOLDOUT__
-#endif
-
-#ifdef __KERNEL_ADV_SHADING__
 #  define __MULTI_CLOSURE__
 #  define __TRANSPARENT_SHADOWS__
-#  define __PASSES__
 #  define __BACKGROUND_MIS__
 #  define __LAMP_MIS__
-#  define __AO__
 #  define __CAMERA_MOTION__
 #  define __OBJECT_MOTION__
 #  define __HAIR__
 #  define __BAKING__
+#  define __PRINCIPLED__
+#  define __SUBSURFACE__
+#  define __VOLUME__
+#  define __VOLUME_SCATTER__
+#  define __CMJ__
+#  define __SHADOW_RECORD_ALL__
+#  define __BRANCHED_PATH__
 #endif

+/* Device specific features */
+#ifdef __KERNEL_CPU__
+#  ifdef __KERNEL_SSE2__
+#    define __QBVH__
+#  endif
+#  ifdef WITH_OSL
+#    define __OSL__
+#  endif
+#  define __VOLUME_DECOUPLED__
+#  define __VOLUME_RECORD_ALL__
+#endif  /* __KERNEL_CPU__ */
+
+#ifdef __KERNEL_CUDA__
+#  ifdef __SPLIT_KERNEL__
+#    undef __BRANCHED_PATH__
+#  endif
+#endif  /* __KERNEL_CUDA__ */
+
+#ifdef __KERNEL_OPENCL__
+#endif  /* __KERNEL_OPENCL__ */
+
 /* Scene-based selective features compilation. */
 #ifdef __NO_CAMERA_MOTION__
 #  undef __CAMERA_MOTION__
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -66,9 +66,15 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
 {
+#ifdef __KERNEL_CUDA__
+	/* Keeping threads for the same pixel together improves performance on CUDA. */
+	uint sample_offset = global_work_index % tile->num_samples;
+	uint pixel_offset = global_work_index / tile->num_samples;
+#else /* __KERNEL_CUDA__ */
 	uint tile_pixels = tile->w * tile->h;
 	uint sample_offset = global_work_index / tile_pixels;
 	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+#endif /* __KERNEL_CUDA__ */
 	uint y_offset = pixel_offset / tile->w;
 	uint x_offset = pixel_offset - y_offset * tile->w;

--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -1,41 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"  // PRECOMPILED
-#include "kernel/split/kernel_split_common.h"  // PRECOMPILED
-
-#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
-#include "kernel/kernels/opencl/kernel_data_init.cl"
-#include "kernel/kernels/opencl/kernel_path_init.cl"
-
-#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
-#include "kernel/kernels/opencl/kernel_lamp_emission.cl"
-#include "kernel/kernels/opencl/kernel_do_volume.cl"
-#include "kernel/kernels/opencl/kernel_indirect_background.cl"
-#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
-#include "kernel/kernels/opencl/kernel_shader_setup.cl"
-#include "kernel/kernels/opencl/kernel_shader_sort.cl"
-#include "kernel/kernels/opencl/kernel_shader_eval.cl"
-#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
-#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
-#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
-#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
-#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
-#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
-#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
-#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
-#include "kernel/kernels/opencl/kernel_buffer_update.cl"
-
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -392,6 +392,44 @@ bool OSLRenderServices::get_array_attribute(OSL::ShaderGlobals *sg, bool derivat
 	return false;
 }

+static bool set_attribute_float2(float2 f[3], TypeDesc type, bool derivatives, void *val)
+{  /* Writes f[0] (plus f[1] and f[2] when derivatives is set) into val as floats; returns false if type is not handled. */
+	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
+	   type == TypeDesc::TypeNormal || type == TypeDesc::TypeColor)
+	{  /* Three-float destination: x and y are copied, the third component is zero-filled. */
+		float *fval = (float *)val;
+
+		fval[0] = f[0].x;
+		fval[1] = f[0].y;
+		fval[2] = 0.0f;
+
+		if(derivatives) {  /* val then receives 9 floats: the value, then f[1], then f[2] */
+			fval[3] = f[1].x;
+			fval[4] = f[1].y;
+			fval[5] = 0.0f;
+
+			fval[6] = f[2].x;
+			fval[7] = f[2].y;
+			fval[8] = 0.0f;
+		}
+
+		return true;
+	}
+	else if(type == TypeDesc::TypeFloat) {  /* Single-float destination: each float2 is reduced with average(). */
+		float *fval = (float *)val;
+		fval[0] = average(f[0]);
+
+		if(derivatives) {  /* val then receives 3 floats: the value, then the two reduced derivatives */
+			fval[1] = average(f[1]);
+			fval[2] = average(f[2]);
+		}
+
+		return true;
+	}
+
+	return false;  /* any other TypeDesc: nothing was written to val */
+}
+
 static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val)
 {
 	if(type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
@@ -572,6 +610,12 @@ static bool get_primitive_attribute(KernelGlobals *kg, const ShaderData *sd, con
 		                                     (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
 		return set_attribute_float3(fval, type, derivatives, val);
 	}
+	else if(attr.type == TypeFloat2) {  /* float2 attribute: fetch the value and, on request, both derivatives */
+		float2 fval[3];  /* value, dx, dy: fval[2] is written below and read by set_attribute_float2(), so 3 slots are required */
+		fval[0] = primitive_attribute_float2(kg, sd, attr.desc,
+		                                      (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+		return set_attribute_float2(fval, type, derivatives, val);
+	}
 	else if(attr.type == TypeDesc::TypeFloat) {
 		float fval[3];
 		fval[0] = primitive_attribute_float(kg, sd, attr.desc,
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -52,18 +52,27 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
 	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);

 	/* fetch and store attribute */
-	if (desc.type == NODE_ATTR_FLOAT) {
+	if(desc.type == NODE_ATTR_FLOAT) {
 		float f = primitive_attribute_float(kg, sd, desc, NULL, NULL);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, f);
 		}
 		else {
 			stack_store_float3(stack, out_offset, make_float3(f, f, f));
 		}
 	}
+	else if(desc.type == NODE_ATTR_FLOAT2) {
+		float2 f = primitive_attribute_float2(kg, sd, desc, NULL, NULL);
+		if(type == NODE_ATTR_FLOAT) {
+			stack_store_float(stack, out_offset, f.x);
+		}
+		else {
+			stack_store_float3(stack, out_offset, make_float3(f.x, f.y, 0.0f));
+		}
+	}
 	else {
 		float3 f = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, average(f));
 		}
 		else {
@@ -84,20 +93,30 @@ void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint
 	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);

 	/* fetch and store attribute */
-	if (desc.type == NODE_ATTR_FLOAT) {
+	if(desc.type == NODE_ATTR_FLOAT) {
 		float dx;
 		float f = primitive_surface_attribute_float(kg, sd, desc, &dx, NULL);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, f+dx);
 		}
 		else {
 			stack_store_float3(stack, out_offset, make_float3(f+dx, f+dx, f+dx));
 		}
 	}
+	else if(desc.type == NODE_ATTR_FLOAT2) {  /* float2 attribute: store the value offset by dx */
+		float2 dx;
+		float2 f = primitive_attribute_float2(kg, sd, desc, &dx, NULL);  /* NOTE(review): sibling branches call primitive_surface_attribute_*; confirm this variant is intended */
+		if(type == NODE_ATTR_FLOAT) {
+			stack_store_float(stack, out_offset, f.x + dx.x);  /* scalar output uses only the x component */
+		}
+		else {
+			stack_store_float3(stack, out_offset, make_float3(f.x+dx.x, f.y+dx.y, 0.0f));  /* third component is zero-filled */
+		}
+	}
 	else {
 		float3 dx;
 		float3 f = primitive_surface_attribute_float3(kg, sd, desc, &dx, NULL);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, average(f+dx));
 		}
 		else {
@@ -121,20 +140,30 @@ void svm_node_attr_bump_dy(KernelGlobals *kg,
 	AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);

 	/* fetch and store attribute */
-		if (desc.type == NODE_ATTR_FLOAT) {
+	if(desc.type == NODE_ATTR_FLOAT) {
 		float dy;
 		float f = primitive_surface_attribute_float(kg, sd, desc, NULL, &dy);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, f+dy);
 		}
 		else {
 			stack_store_float3(stack, out_offset, make_float3(f+dy, f+dy, f+dy));
 		}
 	}
+	else if(desc.type == NODE_ATTR_FLOAT2) {  /* float2 attribute: store the value offset by dy */
+		float2 dy;
+		float2 f = primitive_attribute_float2(kg, sd, desc, NULL, &dy);  /* NOTE(review): sibling branches call primitive_surface_attribute_*; confirm this variant is intended */
+		if(type == NODE_ATTR_FLOAT) {
+			stack_store_float(stack, out_offset, f.x + dy.x);  /* scalar output uses only the x component */
+		}
+		else {
+			stack_store_float3(stack, out_offset, make_float3(f.x+dy.x, f.y+dy.y, 0.0f));  /* third component is zero-filled */
+		}
+	}
 	else {
 		float3 dy;
 		float3 f = primitive_surface_attribute_float3(kg, sd, desc, NULL, &dy);
-		if (type == NODE_ATTR_FLOAT) {
+		if(type == NODE_ATTR_FLOAT) {
 			stack_store_float(stack, out_offset, average(f+dy));
 		}
 		else {
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -146,7 +146,13 @@ ccl_device_noinline float3 svm_bevel(
 			}
 #endif  /* __OBJECT_MOTION__ */

+			/* Get geometric normal. */
 			float3 hit_Ng = isect.Ng[hit];
+			int object = (isect.hits[hit].object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, isect.hits[hit].prim): isect.hits[hit].object;
+			int object_flag = kernel_tex_fetch(__object_flag, object);
+			if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+				hit_Ng = -hit_Ng;
+			}

 			/* Compute smooth normal. */
 			float3 N = hit_Ng;
@@ -168,7 +174,7 @@ ccl_device_noinline float3 svm_bevel(
 			}

 			/* Transform normals to world space. */
-			if(isect.hits[hit].object != OBJECT_NONE) {
+			if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 				object_normal_transform(kg, sd, &N);
 				object_normal_transform(kg, sd, &hit_Ng);
 			}
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -363,7 +363,15 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	float3 attribute_value;
 	const AttributeDescriptor desc = find_attribute(kg, sd, node.z);
 	if (desc.offset != ATTR_STD_NOT_FOUND) {
-		attribute_value = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
+		if(desc.type == NODE_ATTR_FLOAT2) {
+			float2 value = primitive_surface_attribute_float2(kg, sd, desc, NULL, NULL);
+			attribute_value.x = value.x;
+			attribute_value.y = value.y;
+			attribute_value.z = 0.0f;
+		}
+		else {
+			attribute_value = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
+		}
 	}


--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -141,6 +141,7 @@ typedef enum ShaderNodeType {

 typedef enum NodeAttributeType {
 	NODE_ATTR_FLOAT = 0,
+	NODE_ATTR_FLOAT2,
 	NODE_ATTR_FLOAT3,
 	NODE_ATTR_MATRIX
 } NodeAttributeType;
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -22,6 +22,7 @@ set(SRC
 	image.cpp
 	integrator.cpp
 	light.cpp
+	merge.cpp
 	mesh.cpp
 	mesh_displace.cpp
 	mesh_subdivision.cpp
@@ -55,6 +56,7 @@ set(SRC_HEADERS
 	image.h
 	integrator.h
 	light.h
+	merge.h
 	mesh.h
 	nodes.h
 	object.h
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -48,7 +48,8 @@ void Attribute::set(ustring name_, TypeDesc type_, AttributeElement element_)
 	/* string and matrix not supported! */
 	assert(type == TypeDesc::TypeFloat || type == TypeDesc::TypeColor ||
 		type == TypeDesc::TypePoint || type == TypeDesc::TypeVector ||
-		type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix);
+		type == TypeDesc::TypeNormal || type == TypeDesc::TypeMatrix ||
+		type == TypeFloat2);
 }

 void Attribute::resize(Mesh *mesh, AttributePrimitive prim, bool reserve_only)
@@ -68,6 +69,8 @@ void Attribute::resize(size_t num_elements)

 void Attribute::add(const float& f)
 {
+	assert(data_sizeof() == sizeof(float));
+
 	char *data = (char*)&f;
 	size_t size = sizeof(f);

@@ -77,6 +80,19 @@ void Attribute::add(const float& f)

 void Attribute::add(const uchar4& f)
 {
+	assert(data_sizeof() == sizeof(uchar4));
+
+	char *data = (char*)&f;
+	size_t size = sizeof(f);
+
+	for(size_t i = 0; i < size; i++)
+		buffer.push_back(data[i]);
+}
+
+void Attribute::add(const float2& f)
+{
+	assert(data_sizeof() == sizeof(float2));
+
 	char *data = (char*)&f;
 	size_t size = sizeof(f);

@@ -86,6 +102,8 @@ void Attribute::add(const uchar4& f)

 void Attribute::add(const float3& f)
 {
+	assert(data_sizeof() == sizeof(float3));
+
 	char *data = (char*)&f;
 	size_t size = sizeof(f);

@@ -95,6 +113,8 @@ void Attribute::add(const float3& f)

 void Attribute::add(const Transform& f)
 {
+	assert(data_sizeof() == sizeof(Transform));
+
 	char *data = (char*)&f;
 	size_t size = sizeof(f);

@@ -104,6 +124,8 @@ void Attribute::add(const Transform& f)

 void Attribute::add(const VoxelAttribute& f)
 {
+	assert(data_sizeof() == sizeof(VoxelAttribute));
+
 	char *data = (char*)&f;
 	size_t size = sizeof(f);

@@ -127,6 +149,8 @@ size_t Attribute::data_sizeof() const
 		return sizeof(uchar4);
 	else if(type == TypeDesc::TypeFloat)
 		return sizeof(float);
+	else if(type == TypeFloat2)
+		return sizeof(float2);
 	else if(type == TypeDesc::TypeMatrix)
 		return sizeof(Transform);
 	else
@@ -230,6 +254,9 @@ void Attribute::add_with_weight(void* dst, void* src, float weight)
 	else if(same_storage(type, TypeDesc::TypeFloat)) {
 		*((float*)dst) += *((float*)src) * weight;
 	}
+	else if(same_storage(type, TypeFloat2)) {
+		*((float2*)dst) += *((float2*)src) * weight;
+	}
 	else if(same_storage(type, TypeDesc::TypeVector)) {
 		*((float4*)dst) += *((float4*)src) * weight;
 	}
@@ -400,7 +427,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 				attr = add(name, TypeDesc::TypeNormal, ATTR_ELEMENT_FACE);
 				break;
 			case ATTR_STD_UV:
-				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CORNER);
+				attr = add(name, TypeFloat2, ATTR_ELEMENT_CORNER);
 				break;
 			case ATTR_STD_UV_TANGENT:
 				attr = add(name, TypeDesc::TypeVector, ATTR_ELEMENT_CORNER);
@@ -451,6 +478,8 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
 	else if(curve_mesh) {
 		switch(std) {
 			case ATTR_STD_UV:
+				attr = add(name, TypeFloat2, ATTR_ELEMENT_CURVE);
+				break;
 			case ATTR_STD_GENERATED:
 				attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE);
 				break;
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -66,25 +66,86 @@ public:
 	size_t element_size(Mesh *mesh, AttributePrimitive prim) const;
 	size_t buffer_size(Mesh *mesh, AttributePrimitive prim) const;

-	char *data() { return (buffer.size())? &buffer[0]: NULL; };
-	float3 *data_float3() { return (float3*)data(); }
-	float4 *data_float4() { return (float4*)data(); }
-	float *data_float() { return (float*)data(); }
-	uchar4 *data_uchar4() { return (uchar4*)data(); }
-	Transform *data_transform() { return (Transform*)data(); }
-	VoxelAttribute *data_voxel()  { return ( VoxelAttribute*)data(); }
+	char *data()
+	{
+		return (buffer.size())? &buffer[0]: NULL;
+	}
+	float2 *data_float2()
+	{
+		assert(data_sizeof() == sizeof(float2));
+		return (float2*)data();
+	}
+	float3 *data_float3()
+	{
+		assert(data_sizeof() == sizeof(float3));
+		return (float3*)data();
+	}
+	float4 *data_float4()
+	{
+		assert(data_sizeof() == sizeof(float4));
+		return (float4*)data();
+	}
+	float *data_float()
+	{
+		assert(data_sizeof() == sizeof(float));
+		return (float*)data();
+	}
+	uchar4 *data_uchar4()
+	{
+		assert(data_sizeof() == sizeof(uchar4));
+		return (uchar4*)data();
+	}
+	Transform *data_transform()
+	{
+		assert(data_sizeof() == sizeof(Transform));
+		return (Transform*)data();
+	}
+	VoxelAttribute *data_voxel()
+	{
+		assert(data_sizeof() == sizeof(VoxelAttribute));
+		return ( VoxelAttribute*)data();
+	}

-	const char *data() const { return (buffer.size())? &buffer[0]: NULL; }
-	const float3 *data_float3() const { return (const float3*)data(); }
-	const float4 *data_float4() const { return (const float4*)data(); }
-	const float *data_float() const { return (const float*)data(); }
-	const Transform *data_transform() const { return (const Transform*)data(); }
-	const VoxelAttribute *data_voxel() const { return (const VoxelAttribute*)data(); }
+	const char *data() const
+	{
+		return (buffer.size())? &buffer[0]: NULL;
+	}
+	const float2 *data_float2() const
+	{
+		assert(data_sizeof() == sizeof(float2));
+		return (const float2*)data();
+	}
+	const float3 *data_float3() const
+	{
+		assert(data_sizeof() == sizeof(float3));
+		return (const float3*)data();
+	}
+	const float4 *data_float4() const
+	{
+		assert(data_sizeof() == sizeof(float4));
+		return (const float4*)data();
+	}
+	const float *data_float() const
+	{
+		assert(data_sizeof() == sizeof(float));
+		return (const float*)data();
+	}
+	const Transform *data_transform() const
+	{
+		assert(data_sizeof() == sizeof(Transform));
+		return (const Transform*)data();
+	}
+	const VoxelAttribute *data_voxel() const
+	{
+		assert(data_sizeof() == sizeof(VoxelAttribute));
+		return (const VoxelAttribute*)data();
+	}

 	void zero_data(void* dst);
 	void add_with_weight(void* dst, void* src, float weight);

 	void add(const float& f);
+	void add(const float2& f);
 	void add(const float3& f);
 	void add(const uchar4& f);
 	void add(const Transform& f);
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -184,6 +184,7 @@ bool RenderBuffers::get_denoising_pass_rect(int type, float exposure, int sample
 	if(type == DENOISING_PASS_CLEAN) {
 		/* The clean pass isn't changed by prefiltering, so we use the original one there. */
 		offset = type + params.get_denoising_offset();
+		scale /= sample;
 	}
 	else if (type == DENOISING_PASS_PREFILTERED_COLOR && !params.denoising_prefiltered_pass) {
 		/* If we're not saving the prefiltering result, return the original noisy pass. */
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -169,6 +169,8 @@ Camera::Camera()
 	cameratoworld = transform_identity();
 	worldtoraster = projection_identity();

+	full_rastertocamera = projection_identity();
+
 	dx = make_float3(0.0f, 0.0f, 0.0f);
 	dy = make_float3(0.0f, 0.0f, 0.0f);

@@ -251,7 +253,7 @@ void Camera::update(Scene *scene)
 	ProjectionTransform screentocamera = projection_inverse(cameratoscreen);

 	rastertocamera = screentocamera * rastertoscreen;
-	ProjectionTransform full_rastertocamera = screentocamera * full_rastertoscreen;
+	full_rastertocamera = screentocamera * full_rastertoscreen;
 	cameratoraster = screentoraster * cameratoscreen;

 	cameratoworld = matrix;
@@ -627,7 +629,7 @@ float Camera::world_to_raster_size(float3 P)

 		if(offscreen_dicing_scale > 1.0f) {
 			float3 p = transform_point(&worldtocamera, P);
-			float3 v = transform_perspective(&rastertocamera, make_float3(width, height, 0.0f));
+			float3 v = transform_perspective(&full_rastertocamera, make_float3(full_width, full_height, 0.0f));

 			/* Create point clamped to frustum */
 			float3 c;
@@ -644,8 +646,8 @@ float Camera::world_to_raster_size(float3 P)
 	}
 	else if(type == CAMERA_PERSPECTIVE) {
 		/* Calculate as if point is directly ahead of the camera. */
-		float3 raster = make_float3(0.5f*width, 0.5f*height, 0.0f);
-		float3 Pcamera = transform_perspective(&rastertocamera, raster);
+		float3 raster = make_float3(0.5f*full_width, 0.5f*full_height, 0.0f);
+		float3 Pcamera = transform_perspective(&full_rastertocamera, raster);

 		/* dDdx */
 		float3 Ddiff = transform_direction(&cameratoworld, Pcamera);
@@ -728,22 +730,21 @@ float Camera::world_to_raster_size(float3 P)
 		 * point directly ahead seems to produce good enough results. */
 #if 0
 		float2 dir = direction_to_panorama(&kernel_camera, kernel_camera_motion.data(), normalize(D));
-		float3 raster = transform_perspective(&cameratoraster, make_float3(dir.x, dir.y, 0.0f));
+		float3 raster = transform_perspective(&full_cameratoraster, make_float3(dir.x, dir.y, 0.0f));

 		ray.t = 1.0f;
 		camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), raster.x, raster.y, 0.0f, 0.0f, &ray);
 		if(ray.t == 0.0f) {
 			/* No differentials, just use from directly ahead. */
-			camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray);
+			camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*full_width, 0.5f*full_height, 0.0f, 0.0f, &ray);
 		}
 #else
-		camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*width, 0.5f*height, 0.0f, 0.0f, &ray);
+		camera_sample_panorama(&kernel_camera, kernel_camera_motion.data(), 0.5f*full_width, 0.5f*full_height, 0.0f, 0.0f, &ray);
 #endif

 		differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist);

-		return max(len(ray.dP.dx) * (float(width)/float(full_width)),
-		           len(ray.dP.dy) * (float(height)/float(full_height)));
+		return max(len(ray.dP.dx),len(ray.dP.dy));
 	}

 	return res;
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -160,6 +160,8 @@ public:
 	ProjectionTransform rastertocamera;
 	ProjectionTransform cameratoraster;

+	ProjectionTransform full_rastertocamera;
+
 	float3 dx;
 	float3 dy;

--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -75,7 +75,7 @@ public:
 	array<int> curve_firstkey;
 	array<int> curve_keynum;
 	array<float> curve_length;
-	array<float3> curve_uv;
+	array<float2> curve_uv;
 	array<float3> curve_vcol;

 	array<float3> curvekey_co;
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -123,16 +123,23 @@ static void fill_mapping(vector<ChannelMapping> &map, int pos, string name, stri
 }

 static const int INPUT_NUM_CHANNELS = 15;
+static const int INPUT_DENOISING_DEPTH = 0;
+static const int INPUT_DENOISING_NORMAL = 1;
+static const int INPUT_DENOISING_SHADOWING = 4;
+static const int INPUT_DENOISING_ALBEDO = 5;
+static const int INPUT_NOISY_IMAGE = 8;
+static const int INPUT_DENOISING_VARIANCE = 11;
+static const int INPUT_DENOISING_INTENSITY = 14;
 static vector<ChannelMapping> input_channels()
 {
 	vector<ChannelMapping> map;
-	fill_mapping(map, 0, "Denoising Depth", "Z");
-	fill_mapping(map, 1, "Denoising Normal", "XYZ");
-	fill_mapping(map, 4, "Denoising Shadowing", "X");
-	fill_mapping(map, 5, "Denoising Albedo", "RGB");
-	fill_mapping(map, 8, "Noisy Image", "RGB");
-	fill_mapping(map, 11, "Denoising Variance", "RGB");
-	fill_mapping(map, 14, "Denoising Intensity", "X");
+	fill_mapping(map, INPUT_DENOISING_DEPTH, "Denoising Depth", "Z");
+	fill_mapping(map, INPUT_DENOISING_NORMAL, "Denoising Normal", "XYZ");
+	fill_mapping(map, INPUT_DENOISING_SHADOWING, "Denoising Shadowing", "X");
+	fill_mapping(map, INPUT_DENOISING_ALBEDO, "Denoising Albedo", "RGB");
+	fill_mapping(map, INPUT_NOISY_IMAGE, "Noisy Image", "RGB");
+	fill_mapping(map, INPUT_DENOISING_VARIANCE, "Denoising Variance", "RGB");
+	fill_mapping(map, INPUT_DENOISING_INTENSITY, "Denoising Intensity", "X");
 	return map;
 }

@@ -261,6 +268,7 @@ bool DenoiseTask::acquire_tile(Device *device, Device *tile_device, RenderTile &
 * a different buffer to avoid having to copy an entire horizontal slice of the image. */
 void DenoiseTask::map_neighboring_tiles(RenderTile *tiles, Device *tile_device)
 {
+	/* Fill tile information. */
 	for(int i = 0; i < 9; i++) {
 		if(i == 4) {
 			continue;
@@ -278,10 +286,30 @@ void DenoiseTask::map_neighboring_tiles(RenderTile *tiles, Device *tile_device)
 		tiles[i].stride = image.width;
 	}

+	/* Allocate output buffer. */
 	device_vector<float> *output_mem = new device_vector<float>(tile_device, "denoising_output", MEM_READ_WRITE);
 	output_mem->alloc(OUTPUT_NUM_CHANNELS*tiles[4].w*tiles[4].h);
-	output_mem->zero_to_device();

+	/* Fill output buffer with noisy image, assumed by kernel_filter_finalize
+	 * when skipping denoising of some pixels. */
+	float *result = output_mem->data();
+	float *in = &image.pixels[image.num_channels*(tiles[4].y*image.width + tiles[4].x)];
+
+	const DenoiseImageLayer& layer = image.layers[current_layer];
+	const int *input_to_image_channel = layer.input_to_image_channel.data();
+
+	for(int y = 0; y < tiles[4].h; y++) {
+		for(int x = 0; x < tiles[4].w; x++, result += OUTPUT_NUM_CHANNELS) {
+			for(int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
+				result[i] = in[image.num_channels*x + input_to_image_channel[INPUT_NOISY_IMAGE + i]];
+			}
+		}
+		in += image.num_channels * image.width;
+	}
+
+	output_mem->copy_to_device();
+
+	/* Fill output tile info. */
 	tiles[9] = tiles[4];
 	tiles[9].buffer = output_mem->device_pointer;
 	tiles[9].stride = tiles[9].w;
@@ -300,6 +328,7 @@ void DenoiseTask::unmap_neighboring_tiles(RenderTile *tiles)
 	output_pixels.erase(tiles[4].tile_index);
 	output_lock.unlock();

+	/* Copy denoised pixels from device. */
 	output_mem->copy_from_device(0, OUTPUT_NUM_CHANNELS*tiles[9].w, tiles[9].h);

 	float *result = output_mem->data();
@@ -317,6 +346,7 @@ void DenoiseTask::unmap_neighboring_tiles(RenderTile *tiles)
 		out += image.num_channels * image.width;
 	}

+	/* Free device buffer. */
 	output_mem->free();
 	delete output_mem;
 }
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -329,8 +329,17 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
 	for(size_t i = 0; i < passes.size(); i++) {
 		Pass& pass = passes[i];

-		if(pass.type == PASS_NONE)
+		if(pass.type == PASS_NONE) {
 			continue;
+		}
+
+		/* Can't do motion pass if no motion vectors are available. */
+		if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) {
+			if (scene->need_motion() != Scene::MOTION_PASS) {
+				kfilm->pass_stride += pass.components;
+				continue;
+			}
+		}

 		int pass_flag = (1 << (pass.type % 32));
 		if(pass.type <= PASS_CATEGORY_MAIN_END) {
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -1020,6 +1020,39 @@ void ImageManager::device_update_slot(Device *device,
 	}
 }

+void ImageManager::device_load_builtin(Device *device,
+                                       Scene *scene,
+                                       Progress& progress)
+{
+	/* Only builtin images are loaded here: Blender needs this to read evaluated
+	 * scene data from the depsgraph before it is freed. */
+	if(!need_update) {
+		return;
+	}
+
+	TaskPool pool;
+	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+		for(size_t slot = 0; slot < images[type].size(); slot++) {
+			/* Skip empty slots and images that need no loading or are not builtin. */
+			if(!images[type][slot] ||
+			   !images[type][slot]->need_load ||
+			   !images[type][slot]->builtin_data)
+			{
+				continue;
+			}
+			pool.push(function_bind(&ImageManager::device_load_image,
+			                        this,
+			                        device,
+			                        scene,
+			                        (ImageDataType)type,
+			                        slot,
+			                        &progress));
+		}
+	}
+
+	pool.wait_work();
+}
+
 void ImageManager::device_free_builtin(Device *device)
 {
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -96,6 +96,10 @@ public:
 	                        int flat_slot,
 	                        Progress *progress);
 	void device_free(Device *device);
+
+	void device_load_builtin(Device *device,
+	                         Scene *scene,
+	                         Progress& progress);
 	void device_free_builtin(Device *device);

 	void set_osl_texture_system(void *texture_system);
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -55,6 +55,10 @@ public:
 	float sample_clamp_indirect;
 	bool motion_blur;

+	/* Maximum number of samples, beyond which we are likely to run into
+	 * precision issues for sampling patterns. */
+	static const int MAX_SAMPLES = (1 << 24);
+
 	int aa_samples;
 	int diffuse_samples;
 	int glossy_samples;
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -118,6 +118,7 @@ NODE_DEFINE(Light)
 	SOCKET_FLOAT(sizeu, "Size U", 1.0f);
 	SOCKET_VECTOR(axisv, "Axis V", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_FLOAT(sizev, "Size V", 1.0f);
+	SOCKET_BOOLEAN(round, "Round", false);

 	SOCKET_INT(map_resolution, "Map Resolution", 0);

@@ -184,14 +185,14 @@ LightManager::~LightManager()
 bool LightManager::has_background_light(Scene *scene)
 {
 	foreach(Light *light, scene->lights) {
-		if(light->type == LIGHT_BACKGROUND) {
+		if(light->type == LIGHT_BACKGROUND && light->is_enabled) {
 			return true;
 		}
 	}
 	return false;
 }

-void LightManager::disable_ineffective_light(Device *device, Scene *scene)
+void LightManager::disable_ineffective_light(Scene *scene)
 {
 	/* Make all lights enabled by default, and perform some preliminary checks
 	 * needed for finer-tuning of settings (for example, check whether we've
@@ -210,8 +211,7 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 		 * - If we don't need it (no HDRs etc.)
 		 */
 		Shader *shader = (scene->background->shader) ? scene->background->shader : scene->default_background;
-		bool disable_mis = !(has_portal || shader->has_surface_spatial_varying) ||
-		                   !(device->info.advanced_shading);
+		bool disable_mis = !(has_portal || shader->has_surface_spatial_varying);
 		if(disable_mis) {
 			VLOG(1) << "Background MIS has been disabled.\n";
 			foreach(Light *light, scene->lights) {
@@ -757,12 +757,15 @@ void LightManager::device_update_points(Device *,
 			float3 axisu = light->axisu*(light->sizeu*light->size);
 			float3 axisv = light->axisv*(light->sizev*light->size);
 			float area = len(axisu)*len(axisv);
-			float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
+			if(light->round) {
+				area *= -M_PI_4_F;
+			}
+			float invarea = (area != 0.0f)? 1.0f/area: 1.0f;
 			float3 dir = light->dir;

 			dir = safe_normalize(dir);

-			if(light->use_mis && area > 0.0f)
+			if(light->use_mis && area != 0.0f)
 				shader_id |= SHADER_USE_MIS;

 			klights[light_index].co[0] = co.x;
@@ -830,7 +833,10 @@ void LightManager::device_update_points(Device *,
 		float3 axisu = light->axisu*(light->sizeu*light->size);
 		float3 axisv = light->axisv*(light->sizev*light->size);
 		float area = len(axisu)*len(axisv);
-		float invarea = (area > 0.0f)? 1.0f/area: 1.0f;
+		if(light->round) {
+			area *= -M_PI_4_F;
+		}
+		float invarea = (area != 0.0f)? 1.0f/area: 1.0f;
 		float3 dir = light->dir;

 		dir = safe_normalize(dir);
@@ -874,7 +880,7 @@ void LightManager::device_update(Device *device, DeviceScene *dscene, Scene *sce

 	use_light_visibility = false;

-	disable_ineffective_light(device, scene);
+	disable_ineffective_light(scene);

 	device_update_points(device, dscene, scene);
 	if(progress.get_cancel()) return;
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -51,6 +51,7 @@ public:
 	float sizeu;
 	float3 axisv;
 	float sizev;
+	bool round;

 	Transform tfm;

@@ -109,7 +110,7 @@ protected:
 	 * which doesn't contribute to the scene or which is only used for MIS
 	 * and scene doesn't need MIS.
 	 */
-	void disable_ineffective_light(Device *device, Scene *scene);
+	void disable_ineffective_light(Scene *scene);

 	void device_update_points(Device *device,
 	                          DeviceScene *dscene,
--- a/intern/cycles/render/merge.cpp
+++ b/intern/cycles/render/merge.cpp
@@ -0,0 +1,526 @@
+/*
+ * Copyright 2011-2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/merge.h"
+
+#include "util/util_array.h"
+#include "util/util_map.h"
+#include "util/util_system.h"
+#include "util/util_time.h"
+#include "util/util_unique_ptr.h"
+
+#include <OpenImageIO/imageio.h>
+#include <OpenImageIO/filesystem.h>
+
+OIIO_NAMESPACE_USING
+
+CCL_NAMESPACE_BEGIN
+
+/* Merge Image Layer */
+
+enum MergeChannelOp {
+	MERGE_CHANNEL_NOP,     /* Leave the merged channel untouched. */
+	MERGE_CHANNEL_COPY,    /* Overwrite the merged channel with the values of this image. */
+	MERGE_CHANNEL_SUM,     /* Add the values of this image to the merged channel. */
+	MERGE_CHANNEL_AVERAGE  /* Add values weighted by layer samples over total samples. */
+};
+
+struct MergeImagePass {
+	/* Full channel name, as stored in the image file. */
+    string channel_name;
+	/* Channel format in the file. */
+    TypeDesc format;
+	/* Type of operation to perform when merging. */
+	MergeChannelOp op;
+	/* Index of the channel among the channels of the input image. */
+	int offset;
+	/* Index of the channel among the channels of the merged image. */
+	int merge_offset;
+};
+
+struct MergeImageLayer {
+	/* Layer name, empty for channels that are not part of a render layer. */
+	string name;
+	/* Passes, one for each channel of the layer. */
+	vector<MergeImagePass> passes;
+	/* Sample amount that was used for rendering this layer. */
+	int samples;
+};
+
+/* Merge Image */
+
+struct MergeImage {
+	/* OIIO file handle, pixels are read through it when merging. */
+	unique_ptr<ImageInput> in;
+	/* Image file path. */
+	string filepath;
+	/* Render layers found in the channels of the file. */
+	vector<MergeImageLayer> layers;
+};
+
+/* Channel Parsing */
+
+static MergeChannelOp parse_channel_operation(const string& pass_name)
+{
+	/* Passes that are copied from a single image rather than being
+	 * combined from all of them. */
+	const bool is_copy_pass = (pass_name == "Depth" ||
+	                           pass_name == "IndexMA" ||
+	                           pass_name == "IndexOB" ||
+	                           string_startswith(pass_name, "Crypto"));
+	if(is_copy_pass) {
+		return MERGE_CHANNEL_COPY;
+	}
+
+	/* Debug passes are summed, every remaining pass is averaged. */
+	const bool is_sum_pass = (string_startswith(pass_name, "Debug BVH") ||
+	                          string_startswith(pass_name, "Debug Ray") ||
+	                          string_startswith(pass_name, "Debug Render Time"));
+
+	return is_sum_pass ? MERGE_CHANNEL_SUM : MERGE_CHANNEL_AVERAGE;
+}
+
+/* Split in at its last dot: suffix is set to the part after the dot, and in
+ * is shortened to the part before it. Returns whether a dot was found. */
+static bool split_last_dot(string &in, string &suffix)
+{
+	const size_t dot = in.rfind(".");
+	const bool found = (dot != string::npos);
+	if(found) {
+		suffix = in.substr(dot+1);
+		in = in.substr(0, dot);
+	}
+	return found;
+}
+
+/* Separate channel names as generated by Blender into their components.
+ * Multiview format: RenderLayer.Pass.View.Channel
+ * Otherwise: RenderLayer.Pass.Channel
+ * For multiview, the view is kept as part of the render layer name. */
+static bool parse_channel_name(string name,
+                               string &renderlayer,
+                               string &pass,
+                               string &channel,
+                               bool multiview_channels)
+{
+	/* Components are split off from the end of the name. The order of
+	 * evaluation matters, as each successful split shortens the name. */
+	string view;
+	const bool complete = split_last_dot(name, channel) &&
+	                      (!multiview_channels || split_last_dot(name, view)) &&
+	                      split_last_dot(name, pass);
+	if(!complete) {
+		return false;
+	}
+
+	/* What remains of the name is the render layer. */
+	renderlayer = name;
+	if(multiview_channels) {
+		renderlayer += "." + view;
+	}
+	return true;
+}
+
+/* Group the channels of an image into render layers, and determine the number
+ * of samples for each layer. Channels whose name can't be parsed end up in an
+ * unnamed layer. Returns false and sets error if a sample count is missing. */
+static bool parse_channels(const ImageSpec &in_spec,
+                           vector<MergeImageLayer>& layers,
+                           string& error)
+{
+	const ParamValue *multiview = in_spec.find_attribute("multiView");
+	const bool multiview_channels = (multiview &&
+	                                 multiview->type().basetype == TypeDesc::STRING &&
+	                                 multiview->type().arraylen >= 2);
+
+	layers.clear();
+
+	/* Loop over all the channels in the file, parse their name and sort them
+	 * by RenderLayer. */
+	map<string, MergeImageLayer> file_layers;
+	for(int i = 0; i < in_spec.nchannels; i++) {
+		MergeImagePass pass;
+		pass.channel_name = in_spec.channelnames[i];
+		pass.format = (in_spec.channelformats.size() > 0) ? in_spec.channelformats[i] : in_spec.format;
+		pass.offset = i;
+		pass.merge_offset = i;
+
+		string layername, passname, channelname;
+		if(parse_channel_name(pass.channel_name, layername, passname, channelname, multiview_channels)) {
+			/* Channel is part of a render layer. */
+			pass.op = parse_channel_operation(passname);
+		}
+		else {
+			/* Other channels are added in unnamed layer. */
+			layername = "";
+			pass.op = parse_channel_operation(pass.channel_name);
+		}
+
+		file_layers[layername].passes.push_back(pass);
+	}
+
+	/* Loop over all detected RenderLayers, and determine the number of
+	 * samples each of them was rendered with. */
+	for(auto& i: file_layers) {
+		const string& name = i.first;
+		MergeImageLayer& layer = i.second;
+		layer.name = name;
+		layer.samples = 0;
+
+		if(layer.name == "") {
+			/* Channels outside of render layers are given a single sample. */
+			layer.samples = 1;
+		}
+		else {
+			/* Read number of samples from metadata. Compare against the number
+			 * of expected conversions, sscanf() may also return EOF on failure. */
+			string sample_string = in_spec.get_string_attribute("cycles." + name + ".samples", "");
+			if(sample_string != "" && sscanf(sample_string.c_str(), "%d", &layer.samples) != 1) {
+				error = "Failed to parse samples metadata: " + sample_string;
+				return false;
+			}
+		}
+
+		if(layer.samples < 1) {
+			error = string_printf("No sample number specified in the file for layer %s or on the command line", name.c_str());
+			return false;
+		}
+		layers.push_back(layer);
+	}
+
+	return true;
+}
+
+/* Open all files and parse their render layers, verifying that every image
+ * matches the size and data layout of the first. Sets error on failure. */
+static bool open_images(const vector<string>& filepaths,
+                        vector<MergeImage>& images,
+                        string& error)
+{
+	for(const string& filepath: filepaths) {
+		MergeImage image;
+		image.filepath = filepath;
+		image.in = unique_ptr<ImageInput>(ImageInput::open(filepath));
+		if(!image.in) {
+			error = "Couldn't open file: " + filepath;
+			return false;
+		}
+
+		/* Sort the channels into render layers, there has to be at least one. */
+		const ImageSpec& spec = image.in->spec();
+		if(!parse_channels(spec, image.layers, error)) {
+			return false;
+		}
+		if(image.layers.empty()) {
+			error = "Could not find a render layer for merging";
+			return false;
+		}
+		if(spec.deep) {
+			error = "Merging deep images not supported.";
+			return false;
+		}
+
+		/* Any image after the first must agree with the first one. */
+		if(!images.empty()) {
+			const ImageSpec& base_spec = images[0].in->spec();
+			const bool matching = (base_spec.width == spec.width &&
+			                       base_spec.height == spec.height &&
+			                       base_spec.depth == spec.depth &&
+			                       base_spec.format == spec.format &&
+			                       base_spec.deep == spec.deep);
+			if(!matching) {
+				error = "Images do not have matching size and data layout.";
+				return false;
+			}
+		}
+
+		images.push_back(std::move(image));
+	}
+
+	return true;
+}
+
+/* Merge the time stored in the metadata entry with the given name. The times
+ * of all images are summed, or averaged over the images if requested, and the
+ * result is written to the same entry of spec. */
+static void merge_render_time(ImageSpec& spec,
+                              const vector<MergeImage>& images,
+                              const string& name,
+                              const bool average)
+{
+	double time = 0.0;
+
+	for(const MergeImage& image: images) {
+		/* Images lacking the entry yield an empty string. */
+		string time_str = image.in->spec().get_string_attribute(name, "");
+		time += time_human_readable_to_seconds(time_str);
+	}
+
+	if(average) {
+		time /= images.size();
+	}
+
+	spec.attribute(name, TypeDesc::STRING, time_human_readable_from_seconds(time));
+}
+
+/* Merge a time that is stored per render layer, in the metadata entry named
+ * "cycles.<layer_name>.<time_name>". The times of all images are summed, or
+ * averaged over the images if requested, and the result is written to the
+ * same entry of spec. */
+static void merge_layer_render_time(ImageSpec& spec,
+                                    const vector<MergeImage>& images,
+                                    const string& layer_name,
+                                    const string& time_name,
+                                    const bool average)
+{
+	/* Only the name of the entry differs from image wide times, the merging
+	 * itself is shared. */
+	const string name = "cycles." + layer_name + "." + time_name;
+
+	merge_render_time(spec, images, name, average);
+}
+
+/* Set up the spec of the merged image, based on the first image and with the
+ * union of the channels of all images. Assigns each pass its merge offset, and
+ * sums per merged channel the samples of the layers that contain it. */
+static void merge_channels_metadata(vector<MergeImage>& images,
+                                    ImageSpec& out_spec,
+                                    vector<int>& channel_total_samples)
+{
+	out_spec = images[0].in->spec();
+	/* Merge channels and compute offsets. Sample totals are indexed by merged
+	 * channel, so they have to start out empty just like the channel lists. */
+	out_spec.nchannels = 0;
+	out_spec.channelformats.clear();
+	out_spec.channelnames.clear();
+	channel_total_samples.clear();
+
+	for(MergeImage& image: images) {
+		for(MergeImageLayer& layer: image.layers) {
+			for(MergeImagePass& pass: layer.passes) {
+				/* Test if matching channel already exists in merged image. */
+				int merge_offset = -1;
+				for(int i = 0; i < out_spec.nchannels; i++) {
+					if(pass.channel_name == out_spec.channelnames[i]) {
+						merge_offset = i;
+						break;
+					}
+				}
+				if(merge_offset >= 0) {
+					channel_total_samples[merge_offset] += layer.samples;
+					/* First image wins for channels that can't be averaged or summed. */
+					if(pass.op == MERGE_CHANNEL_COPY) {
+						pass.op = MERGE_CHANNEL_NOP;
+					}
+				}
+				else {
+					/* Add new channel. */
+					merge_offset = out_spec.nchannels;
+					channel_total_samples.push_back(layer.samples);
+					out_spec.channelnames.push_back(pass.channel_name);
+					out_spec.channelformats.push_back(pass.format);
+					out_spec.nchannels++;
+				}
+				pass.merge_offset = merge_offset;
+			}
+		}
+	}
+
+	/* Merge metadata. */
+	merge_render_time(out_spec, images, "RenderTime", false);
+	map<string, int> layer_num_samples;
+	for(MergeImage& image: images) {
+		for(MergeImageLayer& layer: image.layers) {
+			if(layer.name != "") {
+				layer_num_samples[layer.name] += layer.samples;
+			}
+		}
+	}
+
+	for(const auto& i: layer_num_samples) {
+		string name = "cycles." + i.first + ".samples";
+		out_spec.attribute(name, TypeDesc::STRING, string_printf("%d", i.second));
+		merge_layer_render_time(out_spec, images, i.first, "total_time", false);
+		merge_layer_render_time(out_spec, images, i.first, "render_time", false);
+		merge_layer_render_time(out_spec, images, i.first, "synchronization_time", true);
+	}
+}
+
+/* Resize pixels to hold all channels of every pixel of an image with spec. */
+static void alloc_pixels(const ImageSpec& spec, array<float>& pixels)
+{
+	/* The size is computed with size_t rather than the type of the spec members. */
+	const size_t num_pixels = (size_t)spec.width * (size_t)spec.height;
+	const size_t num_channels = spec.nchannels;
+
+	pixels.resize(num_pixels * num_channels);
+}
+
+/* Merge the pixels of all images into out_pixels, laid out as described by
+ * out_spec. Each pass is combined into its merged channel using the operation
+ * assigned to it. Returns false and sets error if an image can't be read. */
+static bool merge_pixels(const vector<MergeImage>& images,
+                         const ImageSpec& out_spec,
+                         const vector<int>& channel_total_samples,
+                         array<float>& out_pixels,
+                         string& error)
+{
+	alloc_pixels(out_spec, out_pixels);
+	memset(out_pixels.data(), 0, out_pixels.size() * sizeof(float));
+
+	for(const MergeImage& image: images) {
+		/* Read all channels into buffer. Reading all channels at once is
+		 * faster than individually due to interleaved EXR channel storage. */
+		array<float> pixels;
+		alloc_pixels(image.in->spec(), pixels);
+
+		if(!image.in->read_image(TypeDesc::FLOAT, pixels.data())) {
+			error = "Failed to read image: " + image.filepath;
+			return false;
+		}
+
+		for(size_t li = 0; li < image.layers.size(); li++) {
+			const MergeImageLayer& layer = image.layers[li];
+			/* Strides step from pixel to pixel; num_pixels is the size of the whole buffer. */
+			const size_t stride = image.in->spec().nchannels;
+			const size_t out_stride = out_spec.nchannels;
+			const size_t num_pixels = pixels.size();
+			for(const MergeImagePass& pass: layer.passes) {
+				size_t offset = pass.offset;
+				size_t out_offset = pass.merge_offset;
+				switch(pass.op) {
+					case MERGE_CHANNEL_NOP:
+						break;
+					case MERGE_CHANNEL_COPY:
+						for(; offset < num_pixels; offset += stride, out_offset += out_stride) {
+							out_pixels[out_offset] = pixels[offset];
+						}
+						break;
+					case MERGE_CHANNEL_SUM:
+						for(; offset < num_pixels; offset += stride, out_offset += out_stride) {
+							out_pixels[out_offset] += pixels[offset];
+						}
+						break;
+					case MERGE_CHANNEL_AVERAGE:
+						/* Weights based on sample metadata. Per channel since not
+						 * all files are guaranteed to have the same channels. */
+						const int total_samples = channel_total_samples[out_offset];
+						const float t = (float)layer.samples / (float)total_samples;
+						for(; offset < num_pixels; offset += stride, out_offset += out_stride) {
+							out_pixels[out_offset] += t * pixels[offset];
+						}
+						break;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
+/* Write pixels as an image with the given spec to filepath. Returns false
+ * and sets error on failure. */
+static bool save_output(const string& filepath,
+                        const ImageSpec& spec,
+                        const array<float>& pixels,
+                        string& error)
+{
+	/* Write to a temporary file path, so that images can be merged in place
+	 * without risk of destroying files when something goes wrong in file saving. */
+	string extension = OIIO::Filesystem::extension(filepath);
+	string unique_name = ".merge-tmp-" + OIIO::Filesystem::unique_path();
+	string tmp_filepath = filepath + unique_name + extension;
+	unique_ptr<ImageOutput> out(ImageOutput::create(tmp_filepath));
+	if(!out) {
+		error = "Failed to open temporary file " + tmp_filepath + " for writing";
+		return false;
+	}
+
+	/* Open temporary file and write image buffers. */
+	if(!out->open(tmp_filepath, spec)) {
+		error = "Failed to open file " + tmp_filepath + " for writing: " + out->geterror();
+		return false;
+	}
+
+	bool ok = true;
+	if(!out->write_image(TypeDesc::FLOAT, pixels.data())) {
+		error = "Failed to write to file " + tmp_filepath + ": " + out->geterror();
+		ok = false;
+	}
+
+	if(!out->close()) {
+		error = "Failed to save to file " + tmp_filepath + ": " + out->geterror();
+		ok = false;
+	}
+	out.reset();
+
+	/* Move temporary file to output filepath. */
+	string rename_error;
+	if(ok && !OIIO::Filesystem::rename(tmp_filepath, filepath, rename_error)) {
+		error = "Failed to move merged image to " + filepath + ": " + rename_error;
+		ok = false;
+	}
+
+	if(!ok) {
+		OIIO::Filesystem::remove(tmp_filepath);
+	}
+
+	return ok;
+}
+
+/* Image Merger */
+
+/* Nothing to set up here, all members are default constructed. */
+ImageMerger::ImageMerger()
+{
+}
+
+/* Merge the input images and save the result to the output filepath.
+ * Returns false on failure, with the reason stored in error. */
+bool ImageMerger::run()
+{
+	if(input.empty()) {
+		error = "No input file paths specified.";
+		return false;
+	}
+	if(output.empty()) {
+		error = "No output file path specified.";
+		return false;
+	}
+
+	/* Open images and verify they have matching layout. */
+	vector<MergeImage> images;
+	if(!open_images(input, images, error)) {
+		return false;
+	}
+	/* Merge metadata and setup channels and offsets. */
+	ImageSpec out_spec;
+	vector<int> channel_total_samples;
+	merge_channels_metadata(images, out_spec, channel_total_samples);
+	/* Merge pixels. */
+	array<float> out_pixels;
+	if(!merge_pixels(images, out_spec, channel_total_samples, out_pixels, error)) {
+		return false;
+	}
+	/* We don't need input anymore at this point, and will possibly
+	 * overwrite the same file. */
+	images.clear();
+
+	/* Save output file. */
+	return save_output(output, out_spec, out_pixels, error);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/render/merge.h
+++ b/intern/cycles/render/merge.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2011-2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MERGE_H__
+#define __MERGE_H__
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Merge OpenEXR multilayer renders. */
+
+class ImageMerger {
+public:
+	ImageMerger();
+	/* Merge the images listed in `input` and save the result to `output`.
+	 * Returns false on failure, with the reason stored in `error`; an
+	 * empty `input` or `output` is reported as a failure. */
+	bool run();
+
+	/* Error message after running, in case of failure. */
+	string error;
+
+	/* List of image filepaths to merge. */
+	vector<string> input;
+	/* Output filepath. */
+	string output;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __MERGE_H__ */
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1233,6 +1233,8 @@ void MeshManager::update_osl_attributes(Device *device, Scene *scene, vector<Att
 					osl_attr.type = TypeDesc::TypeFloat;
 				else if(req.triangle_type == TypeDesc::TypeMatrix)
 					osl_attr.type = TypeDesc::TypeMatrix;
+				else if(req.triangle_type == TypeFloat2)
+					osl_attr.type = TypeFloat2;
 				else
 					osl_attr.type = TypeDesc::TypeColor;

@@ -1342,6 +1344,8 @@ void MeshManager::update_svm_attributes(Device *, DeviceScene *dscene, Scene *sc
 					attr_map[index].w = NODE_ATTR_FLOAT;
 				else if(req.triangle_type == TypeDesc::TypeMatrix)
 					attr_map[index].w = NODE_ATTR_MATRIX;
+				else if(req.triangle_type == TypeFloat2)
+					attr_map[index].w = NODE_ATTR_FLOAT2;
 				else
 					attr_map[index].w = NODE_ATTR_FLOAT3;

@@ -1359,6 +1363,8 @@ void MeshManager::update_svm_attributes(Device *, DeviceScene *dscene, Scene *sc
 					attr_map[index].w = NODE_ATTR_FLOAT;
 				else if(req.curve_type == TypeDesc::TypeMatrix)
 					attr_map[index].w = NODE_ATTR_MATRIX;
+				else if(req.curve_type == TypeFloat2)
+					attr_map[index].w = NODE_ATTR_FLOAT2;
 				else
 					attr_map[index].w = NODE_ATTR_FLOAT3;

@@ -1376,6 +1382,8 @@ void MeshManager::update_svm_attributes(Device *, DeviceScene *dscene, Scene *sc
 					attr_map[index].w = NODE_ATTR_FLOAT;
 				else if(req.subd_type == TypeDesc::TypeMatrix)
 					attr_map[index].w = NODE_ATTR_MATRIX;
+				else if(req.subd_type == TypeFloat2)
+					attr_map[index].w = NODE_ATTR_FLOAT2;
 				else
 					attr_map[index].w = NODE_ATTR_FLOAT3;

@@ -1404,6 +1412,7 @@ static void update_attribute_element_size(Mesh *mesh,
                                          Attribute *mattr,
                                          AttributePrimitive prim,
                                          size_t *attr_float_size,
+										  size_t *attr_float2_size,
                                          size_t *attr_float3_size,
                                          size_t *attr_uchar4_size)
 {
@@ -1419,6 +1428,9 @@ static void update_attribute_element_size(Mesh *mesh,
 		else if(mattr->type == TypeDesc::TypeFloat) {
 			*attr_float_size += size;
 		}
+		else if(mattr->type == TypeFloat2) {
+			*attr_float2_size += size;
+		}
 		else if(mattr->type == TypeDesc::TypeMatrix) {
 			*attr_float3_size += size * 4;
 		}
@@ -1431,6 +1443,8 @@ static void update_attribute_element_size(Mesh *mesh,
 static void update_attribute_element_offset(Mesh *mesh,
                                            device_vector<float>& attr_float,
                                            size_t& attr_float_offset,
+											device_vector<float2>& attr_float2,
+											size_t& attr_float2_offset,
                                            device_vector<float4>& attr_float3,
                                            size_t& attr_float3_offset,
                                            device_vector<uchar4>& attr_uchar4,
@@ -1477,6 +1491,16 @@ static void update_attribute_element_offset(Mesh *mesh,
 			}
 			attr_float_offset += size;
 		}
+		else if(mattr->type == TypeFloat2) {
+			float2 *data = mattr->data_float2();
+			offset = attr_float2_offset;
+
+			assert(attr_float2.size() >= offset + size);
+			for(size_t k = 0; k < size; k++) {
+				attr_float2[offset+k] = data[k];
+			}
+			attr_float2_offset += size;
+		}
 		else if(mattr->type == TypeDesc::TypeMatrix) {
 			Transform *tfm = mattr->data_transform();
 			offset = attr_float3_offset;
@@ -1561,6 +1585,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 	 * take 2x of overall attribute memory usage.
 	 */
 	size_t attr_float_size = 0;
+	size_t attr_float2_size = 0;
 	size_t attr_float3_size = 0;
 	size_t attr_uchar4_size = 0;
 	for(size_t i = 0; i < scene->meshes.size(); i++) {
@@ -1575,28 +1600,33 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 			                              triangle_mattr,
 			                              ATTR_PRIM_TRIANGLE,
 			                              &attr_float_size,
+										  &attr_float2_size,
 			                              &attr_float3_size,
 			                              &attr_uchar4_size);
 			update_attribute_element_size(mesh,
 			                              curve_mattr,
 			                              ATTR_PRIM_CURVE,
 			                              &attr_float_size,
+										  &attr_float2_size,
 			                              &attr_float3_size,
 			                              &attr_uchar4_size);
 			update_attribute_element_size(mesh,
 			                              subd_mattr,
 			                              ATTR_PRIM_SUBD,
 			                              &attr_float_size,
+										  &attr_float2_size,
 			                              &attr_float3_size,
 			                              &attr_uchar4_size);
 		}
 	}

 	dscene->attributes_float.alloc(attr_float_size);
+	dscene->attributes_float2.alloc(attr_float2_size);
 	dscene->attributes_float3.alloc(attr_float3_size);
 	dscene->attributes_uchar4.alloc(attr_uchar4_size);

 	size_t attr_float_offset = 0;
+	size_t attr_float2_offset = 0;
 	size_t attr_float3_offset = 0;
 	size_t attr_uchar4_offset = 0;

@@ -1614,6 +1644,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,

 			update_attribute_element_offset(mesh,
 			                                dscene->attributes_float, attr_float_offset,
+											dscene->attributes_float2, attr_float2_offset,
 			                                dscene->attributes_float3, attr_float3_offset,
 			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                triangle_mattr,
@@ -1623,6 +1654,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,

 			update_attribute_element_offset(mesh,
 			                                dscene->attributes_float, attr_float_offset,
+											dscene->attributes_float2, attr_float2_offset,
 			                                dscene->attributes_float3, attr_float3_offset,
 			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                curve_mattr,
@@ -1632,6 +1664,7 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,

 			update_attribute_element_offset(mesh,
 			                                dscene->attributes_float, attr_float_offset,
+											dscene->attributes_float2, attr_float2_offset,
 			                                dscene->attributes_float3, attr_float3_offset,
 			                                dscene->attributes_uchar4, attr_uchar4_offset,
 			                                subd_mattr,
@@ -1657,6 +1690,9 @@ void MeshManager::device_update_attributes(Device *device, DeviceScene *dscene,
 	if(dscene->attributes_float.size()) {
 		dscene->attributes_float.copy_to_device();
 	}
+	if(dscene->attributes_float2.size()) {
+		dscene->attributes_float2.copy_to_device();
+	}
 	if(dscene->attributes_float3.size()) {
 		dscene->attributes_float3.copy_to_device();
 	}
@@ -2132,6 +2168,9 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen

 	/* Tessellate meshes that are using subdivision */
 	if(total_tess_needed) {
+		Camera *dicing_camera = scene->dicing_camera;
+		dicing_camera->update(scene);
+
 		size_t i = 0;
 		foreach(Mesh *mesh, scene->meshes) {
 			if(mesh->need_update &&
@@ -2147,6 +2186,7 @@ void MeshManager::device_update(Device *device, DeviceScene *dscene, Scene *scen

 				progress.set_status("Updating Mesh", msg);

+				mesh->subd_params->camera = dicing_camera;
 				DiagSplit dsplit(*mesh->subd_params);
 				mesh->tessellate(&dsplit);

@@ -2289,6 +2329,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->patches.free();
 	dscene->attributes_map.free();
 	dscene->attributes_float.free();
+	dscene->attributes_float2.free();
 	dscene->attributes_float3.free();
 	dscene->attributes_uchar4.free();

--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -231,6 +231,9 @@ public:
 				if(attr.same_storage(attr.type, TypeDesc::TypeFloat)) {
 					primvar_refiner.Interpolate(i+1, (OsdValue<float>*)src, (OsdValue<float>*&)dest);
 				}
+				else if(attr.same_storage(attr.type, TypeFloat2)) {
+					primvar_refiner.Interpolate(i+1, (OsdValue<float2>*)src, (OsdValue<float2>*&)dest);
+				}
 				else {
 					primvar_refiner.Interpolate(i+1, (OsdValue<float4>*)src, (OsdValue<float4>*&)dest);
 				}
@@ -243,6 +246,10 @@ public:
 					patch_table->ComputeLocalPointValues((OsdValue<float>*)&attr.buffer[0],
 							                             (OsdValue<float>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
 				}
+				else if(attr.same_storage(attr.type, TypeFloat2)) {
+					patch_table->ComputeLocalPointValues((OsdValue<float2>*)&attr.buffer[0],
+														 (OsdValue<float2>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
+				}
 				else {
 					patch_table->ComputeLocalPointValues((OsdValue<float4>*)&attr.buffer[0],
 							                             (OsdValue<float4>*)&attr.buffer[num_refiner_verts * attr.data_sizeof()]);
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -1520,6 +1520,19 @@ void PointDensityTextureNode::attributes(Shader *shader,
 	ShaderNode::attributes(shader, attributes);
 }

+void PointDensityTextureNode::add_image()
+{
+	/* Only ask the image manager for a slot while none is assigned yet. */
+	if(slot != -1) {
+		return;
+	}
+
+	ImageMetaData image_metadata;
+	slot = image_manager->add_image(filename.string(),
+	                                builtin_data,
+	                                false,
+	                                0,
+	                                interpolation,
+	                                EXTENSION_CLIP,
+	                                true,
+	                                image_metadata);
+}
+
 void PointDensityTextureNode::compile(SVMCompiler& compiler)
 {
 	ShaderInput *vector_in = input("Vector");
@@ -1532,15 +1545,7 @@ void PointDensityTextureNode::compile(SVMCompiler& compiler)
 	image_manager = compiler.image_manager;

 	if(use_density || use_color) {
-		if(slot == -1) {
-			ImageMetaData metadata;
-			slot = image_manager->add_image(filename.string(), builtin_data,
-			                                false, 0,
-			                                interpolation,
-			                                EXTENSION_CLIP,
-			                                true,
-			                                metadata);
-		}
+		add_image();

 		if(slot != -1) {
 			compiler.stack_assign(vector_in);
@@ -1583,15 +1588,7 @@ void PointDensityTextureNode::compile(OSLCompiler& compiler)
 	image_manager = compiler.image_manager;

 	if(use_density || use_color) {
-		if(slot == -1) {
-			ImageMetaData metadata;
-			slot = image_manager->add_image(filename.string(), builtin_data,
-			                                false, 0,
-			                                interpolation,
-			                                EXTENSION_CLIP,
-			                                true,
-			                                metadata);
-		}
+		add_image();

 		if(slot != -1) {
 			compiler.parameter("filename", string_printf("@i%d", slot).c_str());
@@ -3384,6 +3381,20 @@ void GeometryNode::compile(OSLCompiler& compiler)
 	compiler.add(this, "node_geometry");
 }

+int GeometryNode::get_group()
+{
+	/* Start from the group reported by the base shader node. */
+	int base_group = ShaderNode::get_group();
+	ShaderOutput *backfacing_out = output("Backfacing");
+
+	/* Nothing is connected to Backfacing, so the base group is enough. */
+	if(backfacing_out->links.empty()) {
+		return base_group;
+	}
+
+	/* Backfacing uses NODE_LIGHT_PATH, which needs at least group level 1. */
+	return max(base_group, NODE_GROUP_LEVEL_1);
+}
+
 /* TextureCoordinate */

 NODE_DEFINE(TextureCoordinateNode)
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -268,6 +268,8 @@ public:
 	bool has_spatial_varying() { return true; }
 	bool has_object_dependency() { return true; }

+	void add_image();
+
 	ustring filename;
 	NodeTexVoxelSpace space;
 	InterpolationType interpolation;
@@ -666,6 +668,7 @@ public:
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 	bool has_attribute_dependency() { return true; }
 	bool has_spatial_varying() { return true; }
+	int get_group();

 	float3 normal_osl;
 };
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -66,6 +66,10 @@ OSLShaderManager::~OSLShaderManager()
 {
 	shading_system_free();
 	texture_system_free();
+}
+
+void OSLShaderManager::free_memory()
+{
 #ifdef OSL_HAS_BLENDER_CLEANUP_FIX
 	/* There is a problem with llvm+osl: The order global destructors across
 	 * different compilation units run cannot be guaranteed, on windows this means
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -70,6 +70,8 @@ public:
 	OSLShaderManager();
 	~OSLShaderManager();

+	static void free_memory();
+
 	void reset(Scene *scene);

 	bool use_osl() { return true; }
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -66,6 +66,7 @@ DeviceScene::DeviceScene(Device *device)
  camera_motion(device, "__camera_motion", MEM_TEXTURE),
  attributes_map(device, "__attributes_map", MEM_TEXTURE),
  attributes_float(device, "__attributes_float", MEM_TEXTURE),
+  attributes_float2(device, "__attributes_float2", MEM_TEXTURE),
  attributes_float3(device, "__attributes_float3", MEM_TEXTURE),
  attributes_uchar4(device, "__attributes_uchar4", MEM_TEXTURE),
  light_distribution(device, "__light_distribution", MEM_TEXTURE),
@@ -83,7 +84,10 @@ DeviceScene::DeviceScene(Device *device)
 }

 Scene::Scene(const SceneParams& params_, Device *device)
-: device(device), dscene(device), params(params_)
+        : name("Scene"),
+          device(device),
+          dscene(device),
+          params(params_)
 {
 	memset((void *)&dscene.data, 0, sizeof(dscene.data));

--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -98,6 +98,7 @@ public:
 	/* attributes */
 	device_vector<uint4> attributes_map;
 	device_vector<float> attributes_float;
+	device_vector<float2> attributes_float2;
 	device_vector<float4> attributes_float3;
 	device_vector<uchar4> attributes_uchar4;

@@ -196,6 +197,9 @@ public:

 class Scene {
 public:
+	/* Optional name. Is used for logging and reporting. */
+	string name;
+
 	/* data */
 	Camera *camera;
 	Camera *dicing_camera;
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -212,6 +212,11 @@ void Session::run_gpu()
 		/* advance to next tile */
 		bool no_tiles = !tile_manager.next();

+		DeviceKernelStatus kernel_state = DEVICE_KERNEL_UNKNOWN;
+		if (no_tiles) {
+			kernel_state = device->get_active_kernel_switch_state();
+		}
+
 		if(params.background) {
 			/* if no work left and in background mode, we can stop immediately */
 			if(no_tiles) {
@@ -219,6 +224,16 @@ void Session::run_gpu()
 				break;
 			}
 		}
+
+		/* Don't go into pause mode when the image was rendered with preview kernels.
+		 * When feature kernels become available the session will be reset. */
+		else if (no_tiles && kernel_state == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
+			time_sleep(0.1);
+		}
+		else if (no_tiles && kernel_state == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE) {
+			reset_gpu(tile_manager.params, params.samples);
+		}
+
 		else {
 			/* if in interactive mode, and we are either paused or done for now,
 			 * wait for pause condition notify to wake up again */
@@ -540,6 +555,11 @@ void Session::run_cpu()
 		bool no_tiles = !tile_manager.next();
 		bool need_tonemap = false;

+		DeviceKernelStatus kernel_state = DEVICE_KERNEL_UNKNOWN;
+		if (no_tiles) {
+			kernel_state = device->get_active_kernel_switch_state();
+		}
+
 		if(params.background) {
 			/* if no work left and in background mode, we can stop immediately */
 			if(no_tiles) {
@@ -547,6 +567,16 @@ void Session::run_cpu()
 				break;
 			}
 		}
+
+		/* Don't go into pause mode when preview kernels are used.
+		 * When feature kernels become available the session will be reset. */
+		else if (no_tiles && kernel_state == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
+			time_sleep(0.1);
+		}
+		else if (no_tiles && kernel_state == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE) {
+			reset_cpu(tile_manager.params, params.samples);
+		}
+
 		else {
 			/* if in interactive mode, and we are either paused or done for now,
 			 * wait for pause condition notify to wake up again */
@@ -658,11 +688,6 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	scene->shader_manager->get_requested_features(
 	        scene,
 	        &requested_features);
-	if(!params.background) {
-		/* Avoid too much re-compilations for viewport render. */
-		requested_features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
-		requested_features.nodes_features = NODE_FEATURE_ALL;
-	}

 	/* This features are not being tweaked as often as shaders,
 	 * so could be done selective magic for the viewport as well.
@@ -704,7 +729,7 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	return requested_features;
 }

-void Session::load_kernels(bool lock_scene)
+bool Session::load_kernels(bool lock_scene)
 {
 	thread_scoped_lock scene_lock;
 	if(lock_scene) {
@@ -727,7 +752,7 @@ void Session::load_kernels(bool lock_scene)
 			progress.set_error(message);
 			progress.set_status("Error", message);
 			progress.set_update();
-			return;
+			return false;
 		}

 		progress.add_skip_time(timer, false);
@@ -735,14 +760,13 @@ void Session::load_kernels(bool lock_scene)

 		kernels_loaded = true;
 		loaded_kernel_features = requested_features;
+		return true;
 	}
+	return false;
 }

 void Session::run()
 {
-	/* load kernels */
-	load_kernels();
-
 	if(params.use_profiling && (params.device.type == DEVICE_CPU)) {
 		profiler.start();
 	}
@@ -884,7 +908,7 @@ bool Session::update_scene()

 	/* update scene */
 	if(scene->need_update()) {
-		load_kernels(false);
+		bool new_kernels_needed = load_kernels(false);

 		/* Update max_closures. */
 		KernelIntegrator *kintegrator = &scene->dscene.data.integrator;
@@ -899,6 +923,21 @@ bool Session::update_scene()
 		progress.set_status("Updating Scene");
 		MEM_GUARDED_CALL(&progress, scene->device_update, device, progress);

+		DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state();
+		bool kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE ||
+		                            kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
+		if (kernel_switch_status == DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL) {
+			progress.set_kernel_status("Compiling render kernels");
+		}
+		if (new_kernels_needed || kernel_switch_needed) {
+			progress.set_kernel_status("Compiling render kernels");
+			device->wait_for_availability(loaded_kernel_features);
+			progress.set_kernel_status("");
+		}
+
+		if (kernel_switch_needed) {
+			reset(tile_manager.params, params.samples);
+		}
 		return true;
 	}
 	return false;
@@ -1092,6 +1131,12 @@ void Session::collect_statistics(RenderStats *render_stats)

 int Session::get_max_closure_count()
 {
+	if (scene->shader_manager->use_osl()) {
+		/* OSL always needs the maximum as we can't predict the
+		 * number of closures a shader might generate. */
+		return MAX_CLOSURE;
+	}
+
 	int max_closures = 0;
 	for(int i = 0; i < scene->shaders.size(); i++) {
 		int num_closures = scene->shaders[i]->graph->get_num_closures();
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -84,7 +84,7 @@ public:

 		progressive = false;
 		experimental = false;
-		samples = INT_MAX;
+		samples = 1024;
 		tile_size = make_int2(64, 64);
 		start_resolution = INT_MAX;
 		pixel_size = 1;
@@ -162,7 +162,7 @@ public:
 	void set_pause(bool pause);

 	bool update_scene();
-	void load_kernels(bool lock_scene=true);
+	bool load_kernels(bool lock_scene=true);

 	void device_free();

--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -208,6 +208,7 @@ Shader::Shader()

 	need_update = true;
 	need_update_mesh = true;
+	need_sync_object = false;
 }

 Shader::~Shader()
@@ -219,21 +220,38 @@ bool Shader::is_constant_emission(float3 *emission)
 {
 	ShaderInput *surf = graph->output()->input("Surface");

-	if(!surf->link || surf->link->parent->type != EmissionNode::node_type) {
+	if(surf->link == NULL) {
 		return false;
 	}

-	EmissionNode *node = (EmissionNode*) surf->link->parent;
+	if(surf->link->parent->type == EmissionNode::node_type) {
+		EmissionNode *node = (EmissionNode*) surf->link->parent;

-	assert(node->input("Color"));
-	assert(node->input("Strength"));
+		assert(node->input("Color"));
+		assert(node->input("Strength"));

-	if(node->input("Color")->link || node->input("Strength")->link) {
+		if(node->input("Color")->link || node->input("Strength")->link) {
+			return false;
+		}
+
+		*emission = node->color*node->strength;
+	}
+	else if(surf->link->parent->type == BackgroundNode::node_type) {
+		BackgroundNode *node = (BackgroundNode*) surf->link->parent;
+
+		assert(node->input("Color"));
+		assert(node->input("Strength"));
+
+		if(node->input("Color")->link || node->input("Strength")->link) {
+			return false;
+		}
+
+		*emission = node->color*node->strength;
+	}
+	else {
 		return false;
 	}

-	*emission = node->color*node->strength;
-
 	return true;
 }

@@ -692,6 +710,10 @@ void ShaderManager::get_requested_features(Scene *scene,
 void ShaderManager::free_memory()
 {
 	beckmann_table.free_memory();
+
+	/* Also run the OSL shader manager's static free_memory(), but only
+	 * when WITH_OSL is defined. */
+#ifdef WITH_OSL
+	OSLShaderManager::free_memory();
+#endif
 }

 float ShaderManager::linear_rgb_to_gray(float3 c)
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -99,6 +99,7 @@ public:
 	/* synchronization */
 	bool need_update;
 	bool need_update_mesh;
+	bool need_sync_object;

 	/* If the shader has only volume components, the surface is assumed to
 	 * be transparent.
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -101,5 +101,6 @@ set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} ${PLATFORM_LIN
 CYCLES_TEST(render_graph_finalize "${ALL_CYCLES_LIBRARIES};bf_intern_numaapi")
 CYCLES_TEST(util_aligned_malloc "cycles_util")
 CYCLES_TEST(util_path "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
-CYCLES_TEST(util_string "cycles_util;${BOOST_LIBRARIES}")
-CYCLES_TEST(util_task "cycles_util;${BOOST_LIBRARIES};bf_intern_numaapi")
+CYCLES_TEST(util_string "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
+CYCLES_TEST(util_task "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES};bf_intern_numaapi")
+CYCLES_TEST(util_time "cycles_util;${BOOST_LIBRARIES};${OPENIMAGEIO_LIBRARIES}")
--- a/intern/cycles/test/util_time_test.cpp
+++ b/intern/cycles/test/util_time_test.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2019 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* An empty string parses to zero seconds, and zero seconds formats as
+ * minutes, seconds and two decimals. */
+TEST(time_human_readable_to_seconds, Empty) {
+	EXPECT_EQ(time_human_readable_to_seconds(""), 0.0);
+	EXPECT_EQ(time_human_readable_from_seconds(0.0), "00:00.00");
+}
+
+/* Fractional seconds written with a leading dot, with and without a
+ * trailing zero, parse to the same value. */
+TEST(time_human_readable_to_seconds, Fraction) {
+	EXPECT_NEAR(time_human_readable_to_seconds(".1"), 0.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds(".10"), 0.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(0.1), "00:00.10");
+}
+
+/* Seconds-only input, with one and two integer digits, both in short
+ * form and in the zero-padded form produced by the formatter. */
+TEST(time_human_readable_to_seconds, Seconds) {
+	EXPECT_NEAR(time_human_readable_to_seconds("2.1"), 2.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("02.10"), 2.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(2.1), "00:02.10");
+
+	EXPECT_NEAR(time_human_readable_to_seconds("12.1"), 12.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("12.10"), 12.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(12.1), "00:12.10");
+}
+
+/* "minutes:seconds" input: each minute contributes 60 seconds
+ * (3:02.1 is 182.1 s, 34:12.1 is 2052.1 s). */
+TEST(time_human_readable_to_seconds, MinutesSeconds) {
+	EXPECT_NEAR(time_human_readable_to_seconds("3:2.1"), 182.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("03:02.10"), 182.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(182.1), "03:02.10");
+
+	EXPECT_NEAR(time_human_readable_to_seconds("34:12.1"), 2052.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("34:12.10"), 2052.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(2052.1), "34:12.10");
+}
+
+/* "hours:minutes:seconds" input: each hour contributes 3600 seconds.
+ * The formatted output for these values has a leading hours field. */
+TEST(time_human_readable_to_seconds, HoursMinutesSeconds) {
+	EXPECT_NEAR(time_human_readable_to_seconds("4:3:2.1"), 14582.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("04:03:02.10"), 14582.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(14582.1), "04:03:02.10");
+
+	EXPECT_NEAR(time_human_readable_to_seconds("56:34:12.1"), 203652.1, 1e-8f);
+	EXPECT_NEAR(time_human_readable_to_seconds("56:34:12.10"), 203652.1, 1e-8f);
+	EXPECT_EQ(time_human_readable_from_seconds(203652.1), "56:34:12.10");
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -90,8 +90,7 @@ void DebugFlags::CUDA::reset()

 DebugFlags::OpenCL::OpenCL()
  : device_type(DebugFlags::OpenCL::DEVICE_ALL),
-    debug(false),
-    single_program(false)
+    debug(false)
 {
 	reset();
 }
@@ -123,7 +122,6 @@ void DebugFlags::OpenCL::reset()
 	}
 	/* Initialize other flags from environment variables. */
 	debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
-	single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL);
 }

 DebugFlags::DebugFlags()
@@ -179,7 +177,6 @@ std::ostream& operator <<(std::ostream &os,
 	os << "OpenCL flags:\n"
 	   << "  Device type    : " << opencl_device_type << "\n"
 	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
-	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program) << "\n"
 	   << "  Memory limit   : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
 	return os;
 }
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -126,9 +126,6 @@ public:
 		/* Use debug version of the kernel. */
 		bool debug;

-		/* Use single program */
-		bool single_program;
-
 		/* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all devices. */
 		/* Artificial memory limit in bytes (0 if disabled). */
 		size_t mem_limit;
--- a/intern/cycles/util/util_ies.cpp
+++ b/intern/cycles/util/util_ies.cpp
@@ -293,7 +293,7 @@ bool IESFile::process_type_c()
 {
 	if(h_angles[0] == 90.0f) {
 		/* Some files are stored from 90° to 270°, so we just rotate them to the regular 0°-180° range here. */
-		for(int i = 0; i < v_angles.size(); i++) {
+		for(int i = 0; i < h_angles.size(); i++) {
 			h_angles[i] -= 90.0f;
 		}
 	}
--- a/intern/cycles/util/util_ies.h
+++ b/intern/cycles/util/util_ies.h
@@ -40,7 +40,7 @@ protected:
 	bool process_type_c();

 	/* The brightness distribution is stored in spherical coordinates.
-	 * The horizontal angles correspond to to theta in the regular notation
+	 * The horizontal angles correspond to theta in the regular notation
 	 * and always span the full range from 0° to 360°.
 	 * The vertical angles correspond to phi and always start at 0°. */
 	vector<float> v_angles, h_angles;
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -220,6 +220,12 @@ ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
 {
 	return a + t*(b - a);
 }
+
+/* Linearly interpolate from a towards b by factor t. */
+ccl_device_inline float2 mix(const float2& a, const float2& b, float t)
+{
+	const float2 delta = b - a;
+	return a + t*delta;
+}
+
 #endif  /* !__KERNEL_OPENCL__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/util/util_math_intersect.h
+++ b/intern/cycles/util/util_math_intersect.h
@@ -186,12 +186,17 @@ ccl_device_forceinline bool ray_triangle_intersect(
 #undef dot3
 }

+/* Tests for an intersection between a ray and a quad defined by
+ * its midpoint, normal and sides.
+ * If ellipse is true, hits outside the ellipse that's enclosed by the
+ * quad are rejected.
+ */
 ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
                                   float ray_mint, float ray_maxt,
                                   float3 quad_P,
                                   float3 quad_u, float3 quad_v, float3 quad_n,
                                   float3 *isect_P, float *isect_t,
-                                   float *isect_u, float *isect_v)
+                                   float *isect_u, float *isect_v, bool ellipse)
 {
 	/* Perform intersection test. */
 	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
@@ -200,20 +205,23 @@ ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
 	}
 	const float3 hit = ray_P + t*ray_D;
 	const float3 inplane = hit - quad_P;
-	const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
-	if(u < 0.0f || u > 1.0f) {
+	const float u = dot(inplane, quad_u) / dot(quad_u, quad_u);
+	if(u < -0.5f || u > 0.5f) {
 		return false;
 	}
-	const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
-	if(v < 0.0f || v > 1.0f) {
+	const float v = dot(inplane, quad_v) / dot(quad_v, quad_v);
+	if(v < -0.5f || v > 0.5f) {
+		return false;
+	}
+	if(ellipse && (u*u + v*v > 0.25f)) {
 		return false;
 	}
 	/* Store the result. */
 	/* TODO(sergey): Check whether we can avoid some checks here. */
 	if(isect_P != NULL) *isect_P = hit;
 	if(isect_t != NULL) *isect_t = t;
-	if(isect_u != NULL) *isect_u = u;
-	if(isect_v != NULL) *isect_v = v;
+	if(isect_u != NULL) *isect_u = u + 0.5f;
+	if(isect_v != NULL) *isect_v = v + 0.5f;
 	return true;
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Sergey Sharybin	e045fe53f1	Buildbot: Port configuration to Blender2.7 branch	2019-06-27 12:41:52 +02:00
Bastien Montagne	054dbb833e	Fix (unreported) missing remapping of proxy_from pointer. That would break proxy behavior after a library reload. The usual super-annoying loop-back pointers... At least that one is easily detectable and can be fixed in-place. Found while investigating T64764.	2019-05-23 10:19:30 +02:00
Bastien Montagne	3a702ec028	Fix (unreported) usercount of linked IDs becoming garbage after undo/redo. Not re-reading linked data-blocks in undo/redo case also means that we do not touch to their usercounts. Even worse, lib_link process in readfile will increase those (for cases where local data uses linked one). Whole data management code is now heavily relying on valid consistent refcount of all IDs, so we cannot allow that anymore. Simple solution here could have been to then not increase that one for linked IDs in `newlibadr_us()`, but unfortunately that would not be totally bullet-proof, as some local users of linked data may be added or removed by an undo step... So I cannot think of any other solution than the ugly brute force one, i.e. going over the whole Main database and recompute linked IDs users count... Should not be a big issue performance wise though, this is fairly cheap process.	2019-05-22 21:10:36 +02:00
Bastien Montagne	3600e94eba	BKE Library handling: add function to recompute usercounts of IDs. This will be needed in undo/redo case, since we do not re-read linked IDs, their usercounts become total garbage (especially in 'used by local ID' cases)...	2019-05-22 21:08:51 +02:00
Julian Eisel	d83a72ec10	Fix crash activating a fullscreened screen Steps to reproduce were: * Duplicate some area into new window (shift-click corner triangle) * Make it fullscreen * Close the window again * Activate the added screen from the menu (the one without the -nonnormal prefix) -> Crash (you may have to press "Back to Previous" first though) When activating a screen, code should check if there's a fullscreen variant of it and activate that instead. From what I can tell that's what the code tried to do, but incorrectly. Same issue as T64045, but things are a bit different for 2.7.	2019-05-04 15:12:55 +02:00
Alex Fuller	2adf4b401e	Fix for Cycles UV adaptive subdivision after float2 changes. Differential Revision: https://developer.blender.org/D4717	2019-04-25 14:05:33 +02:00
Brecht Van Lommel	c07bce5bf2	Fix T63796: Cycles OSL shader with closure not working in final render.	2019-04-23 18:53:37 +02:00
Brecht Van Lommel	8982f0cfee	Fix T62408: Cycles viewport adaptive subdivision hangs after updates Backporting fix from the master branch.	2019-04-21 03:05:38 +02:00
Sergey Sharybin	cc9528d3c8	Buildbot: Attempt to fix wrong branch in buildinfo For some reason the buildinfo header was not re-generated. The root reason is not really clear to me, so simply remove the header similar to the CMake cache.	2019-04-18 11:52:34 +02:00
Ray molenkamp	10f724cec5	Fix: Build error with msvc. Unused label spawned a warn but bmesh builds with warns as errors.	2019-04-09 10:45:39 -06:00
Robert-André Mauchin	d780409156	Fix for GCC9 new OpenMP data sharing GCC 9 started implementing the OpenMP 4.0 and later behavior. When not using default clause or when using default(shared), this makes no difference, but if using default(none), previously the choice was not specify the const qualified variables on the construct at all, or specify in firstprivate clause. In GCC 9 as well as for OpenMP 4.0 compliance, those variables need to be specified on constructs in which they are used, either in shared or in firstprivate clause. Specifying them in firstprivate clause is one way to achieve compatibility with both older GCC versions and GCC 9, another option is to drop the default(none) clause. This patch thus drops the default(none) clause. See https://gcc.gnu.org/gcc-9/porting_to.html#ompdatasharing Signed-off-by: Robert-André Mauchin <zebob.m@gmail.com>	2019-04-07 11:20:50 +02:00
Brecht Van Lommel	44b54baf96	Fix build error with Visual Studio, appears to be a compiler bug It's not clear that "if (0 &&" was introduced intentionally, but this test only seems to have been an insignificant optimization anyway. Fixes T63329	2019-04-07 02:41:46 +02:00
Sergey Sharybin	4c2b79a1d5	Constraints: Use RNA update instead of block update Allows to have more control over which tags are done for which properties. This is a part of T62960 which fixes the issue in the 2.7 series.	2019-04-02 17:44:18 +02:00
Sergey Sharybin	4e2667ddf6	Constraints: Mark proxy local as not editable This can not work reliably anyway.	2019-04-02 17:43:33 +02:00
Sergey Sharybin	e9aa0d1e48	Cleanup: Remove space at the end of description	2019-04-02 17:42:30 +02:00
Brecht Van Lommel	cefc058dd8	Build environment: update comment about required packages.	2019-03-29 00:39:19 +01:00
Brecht Van Lommel	a29446da52	Cycles: sync various master changes to blender2.7. Many of these were left out accidentally. We will only do important bugfixes in blender2.7 for Cycles from this point on.	2019-03-26 14:42:26 +01:00
Stephen Hassard	ede1ca0b3f	Change remaining FTP server URIs to HTTP(s) Most of the source tarballs are retrieved via http, but a few remain that are still downloaded via ftp. This causes some pain with corporate firewalls, so moving the last two URIs to http helps ease the build process. Reviewers: sergey Differential Revision: https://developer.blender.org/D4192	2019-03-25 11:19:14 +01:00
Bastien Montagne	d46cb486ed	Include blosc, embree and opencollada into BUILD_NOTES.txt file. For custom path selected during 'install_deps.sh' using '--source'/'--install', paths for blosc, embree and opencollada are not printed/included into BUILD_NOTES.txt file. As '/opt/lib/<package>' paths are hardcoded into CMake's Find* modules, this error is not noticeable, but for custom paths it is. This patch includes those fixes/prints for those packages. Reviewers: mont29 Reviewed By: mont29 Differential Revision: https://developer.blender.org/D4574	2019-03-25 11:04:26 +01:00
Bastien Montagne	cfe044b4ef	Fix T62175: Crash on Undo. Again nodetree broken code... as usual...	2019-03-25 10:34:56 +01:00
Germano Cavalcante	edcb5415a2	MSVC: add C4115 and C4189 warnings. This matches the warnings of the other compilers commonly used in Blender.	2019-03-22 16:15:46 -03:00
Brecht Van Lommel	9e3e92a908	Revert "Cleanup: remove legacy mesh save support" Fixes T62793. Leave this in the blender2.7 branch for those that still rely on it, but it will remain removed in master.	2019-03-22 18:16:46 +01:00
Brecht Van Lommel	8e6f765964	Fix T62844: Cycles crash with bevel and degenerate geometry.	2019-03-22 17:30:36 +01:00
Brecht Van Lommel	d7cecc2ba3	Fix cycles.merge_images not merging correctly for some channel layouts.	2019-03-20 18:39:26 +01:00
Brecht Van Lommel	b06d32e4be	Cycles: make cycles.merge_images work with incomplete layers and passes. If layers and passes are not exactly the same in all files, we make a best effort to merge them instead of failing.	2019-03-20 17:26:25 +01:00
Jacques Lucke	c4908c8e8c	Fix T62758: hair curves with UV mapped textures renders wrong. Differential Revision: https://developer.blender.org/D4562	2019-03-20 17:26:21 +01:00
Stefan Werner	63cb789551	Cycles: Made Embree ignore curve intersections with SSS.	2019-03-20 12:30:33 +01:00
Stefan Werner	5eb38df4af	Cycles: Performance optimization for Embree, resizing arrays once instead of per object.	2019-03-20 12:30:33 +01:00
Brecht Van Lommel	d555c92e3c	Cleanup: fix debug warning due to tooltip ending in dot.	2019-03-19 19:16:39 +01:00
Brecht Van Lommel	83de13f75a	Cycles: add cycles.merge_images operator for combining EXR renders. This is only available through the API, mainly intended for render farms to combine rendered multilayer EXR Files with different samples. The images are currently expected to have the exact same render layers and passes, just with different samples. Variance passes are still simply a weighted average, ideally these should be merged more intelligently. Differential Revision: https://developer.blender.org/D4554	2019-03-19 18:23:19 +01:00
Sergey Sharybin	5b7b7101c8	Cycles: Implement function to format and parse human readable time Gives value in seconds for a string which is encoded in format HH:MM:SS.hh.	2019-03-19 17:38:43 +01:00
Brecht Van Lommel	01df4818a6	Fix Cycles curve UVs wrong after recent changes.	2019-03-18 19:02:59 +01:00
Brecht Van Lommel	d76fb8ec67	Fix T62712: Cycles world light path node not working.	2019-03-18 18:46:14 +01:00
Brecht Van Lommel	dff88a92a4	Fix AMD OpenCL build error after recent changes. Always use native function since this was already the case due to __CL_USE_NATIVE__ not being defined in time, and seems to have caused no known issues.	2019-03-18 16:39:57 +01:00
Brecht Van Lommel	8162a6c51d	Cleanup: fix compiler warnings.	2019-03-18 14:56:08 +01:00
Sergey Sharybin	7c5be750a3	Cycles: Cleanup strict compiler warnings	2019-03-18 12:02:41 +01:00
Brecht Van Lommel	9d2397c710	Cleanup: remove unused buildbot code.	2019-03-17 12:01:19 +01:00
Brecht Van Lommel	0676badb80	Fix Windows 32bit buildbot trying to use CUDA, should be disabled.	2019-03-17 12:01:19 +01:00
Brecht Van Lommel	7778a1a0a1	Cycles: optimization for constant background colors. Skip shader evaluation then, as we already do for lights. Less than 1% faster in my tests, but might as well be consistent for both.	2019-03-17 12:01:19 +01:00
Brecht Van Lommel	9c7517fb63	Fix Cycles OpenCL compile waiting unnecessarily for background shader. Makes preview kernel appear quicker when background color is fixed.	2019-03-17 12:01:19 +01:00
Brecht Van Lommel	9873005ecd	Cleanup: simplify kernel features definition. No functional changes, logic here got too complex after many changes over the years.	2019-03-17 12:01:19 +01:00
Brecht Van Lommel	e17f7af0ce	Cleanup: remove Cycles advanced shading features toggle. It's effectively always enabled, only not on some unsupported OpenCL devices. For testing those it's not useful to disable these features. This is replaced by the more fine grained feature toggles that we have now.	2019-03-17 01:58:39 +01:00
Brecht Van Lommel	52a7636c29	Tests: remove unnecessary _test postfix on test names.	2019-03-15 19:11:33 +01:00
Brecht Van Lommel	65d95879f7	Cycles: upgrade to CUDA 10.1 as the one officially supported version. This version fixes various bugs, and there is no need anymore to use both 9.1 and 10.0 for different cards. There is a bug related to WITH_CYCLES_CUBIN_COMPILER and bump mapping in the regression tests, so that remains disabled same as it was for CUDA 10.0. Fix T59286: CUDA bake failing on some cards. Fix T56858: CUDA 9.2 and 10 issues.	2019-03-15 16:52:28 +01:00
Brecht Van Lommel	56a8c2a320	Fix cycles preferences.get_devices() not returning all devices as before. It only returned those for the active device type. For backwards compatibility return them all again, but still avoid enumerating them from our own code on startup or opening preferences.	2019-03-15 16:52:28 +01:00
Jeroen Bakker	2f6257fd7f	Cycles/OpenCL: Compile Kernels During Scene Update The main goals of this change is faster starting when using foreground rendering. This patch will build kernels in parallel to the update process of the scene. When these optimized kernels are not available (yet) an AO kernel will be used. These AO kernels are fast to compile (3-7 seconds) and can be reused by all scenes. When the final kernels become available we will switch to these kernels. In background mode the AO kernels will not be used. Some kernels are being used during Scene update (displace, background light). When these kernels are being used the process can halt until these become available. Reviewed By: brecht, #cycles Maniphest Tasks: T61752 Differential Revision: https://developer.blender.org/D4428	2019-03-15 16:18:21 +01:00
Jeroen Bakker	6237743111	Cycles/OpenCL: Added missing opencl programs The functions that determine the program name + filename of kernels were missing some base kernels like denoising and base. For completeness I added those kernels so the function returns the correct results.	2019-03-15 08:11:28 +01:00
Brecht Van Lommel	57b5852bc8	Fix T62120: number button editing outside of soft max range jumps.	2019-03-14 18:29:42 +01:00
Brecht Van Lommel	7b38ad7286	Fix T57138: Cycles CMJ failing with viewport samples set to 0. Can't use INT_MAX, CMJ runs into precision/overflow issues before that.	2019-03-14 17:39:00 +01:00
Stefan Werner	47da8dcbca	Cycles: Improved thread order for better CUDA performance. This patch puts threads that render the same pixel closer together, as opposed to threads that render the same sample. Thus threads within a warp are more coherent in memory access and control flow, leading to performance improvements. Example benchmarks on a Quadro RTX4000 (WDDM) on Windows 10: Koro: 4:23 -> 3:46 BMW: 1:18 -> 1:25 Barbershop Interior: 17:52 -> 14:55 Classroom: 4:37 -> 3:45 Performance differences on OpenCL/AMD were hit and miss, some scenes became faster, others lost significantly. Therefore, this is kept as CUDA only change for now.	2019-03-14 11:45:58 +01:00
Jeroen Bakker	4887baf7d6	Fix T62145: Geometry.Backface Node Renders Black The NODE_GROUP_LEVEL of the Geometry node should be bumped to 1 when Backface is connected. Backface uses `NODE_LIGHT_PATH` that is part of NODE_GROUP_LEVEL1, the rest of the geometry node is NODE_GROUP_LEVEL_0.	2019-03-14 09:20:11 +01:00
Brecht Van Lommel	6eeac735f2	Build environment: require Alsa/OSS/PulseAudio backends for OpenAL on Linux. Otherwise we can silently build an OpenAL that can't actually playback audio.	2019-03-13 18:33:26 +01:00
Brecht Van Lommel	1be16466e7	Fix T60434: crash with OSL and viewport + preview render at the same time. Don't free LLVM JIT memory until process exit, there might be multiple OSL instances using it.	2019-03-13 18:31:12 +01:00
Brecht Van Lommel	6a8d87db18	Build environment: backport master changes to blender2.7. We can keep these in sync for now.	2019-03-13 18:31:12 +01:00
Brecht Van Lommel	1f39b8b228	Build environment: require Alsa/OSS/PulseAudio backends for OpenAL on Linux. Otherwise we can silently build an OpenAL that can't actually playback audio.	2019-03-13 13:00:03 +01:00
Brecht Van Lommel	e3b1ae9a81	Fix T62481: Cycles crash rendering with UV pass after recent changes.	2019-03-12 14:11:36 +01:00
Jeroen Bakker	298dabc79b	Cycles/OpenCL: Reduce How Often Kernel Recompilations Are Needed This patch will reduce the number of times that we need to recompile kernels. It does this by (en/dis)abling features by default. So when the user needs them that the kernels are already available. Other features are enabled by default for background and foreground rendering. When in background rendering the user wants the best render performance. When in foreground rendering the user wants the least amount of recompilations. Enabling volumetrics or subdivision evaluation will still trigger a recompilation during foreground rendering. Reviewed By: #cycles, brecht Differential Revision: https://developer.blender.org/D4485	2019-03-12 14:06:45 +01:00
Brecht Van Lommel	2c8bd1d8cb	Fix T61053: crash baking to float image after file save. This is the wrong flag to check, no other code actually reads it.	2019-03-11 14:31:59 +01:00
Brecht Van Lommel	56a633fd2c	Fix T61103: Cycles bevel wrong on objects with negative scale.	2019-03-11 14:26:06 +01:00
Brecht Van Lommel	6503b4f90f	Fix T61831: Denoising Clean pass not scaled correctly with samples.	2019-03-11 14:26:06 +01:00
Sergey Sharybin	0152bf2edf	Color management: Fix/workaround broken getDefaultDisplay() This is something what is caused by OCIO library. The patch has been submitted there: https://github.com/imageworks/OpenColorIO/pull/638 For until it is refined and checked we do workaround from our side.	2019-03-11 11:27:04 +01:00
Sergey Sharybin	c603a755bd	Color management: Query default view from display Solves weird situation when default display name is queried from OCIO, but Default view being assumed to be set for it. Now view is initialized to a default view of that display.	2019-03-11 11:26:51 +01:00
Julian Eisel	0a28bb1422	Fix stashed full-screens returning to wrong layout Steps to reproduce were: * Maximize area (Shift+Spacebar in 2.7, Ctrl+Spacebar in 2.8) * Open temp file browser (Ctrl+O) * Cancel file browser (Esc) - should return to previous full-screen * Press "Return to Previous" button The previously maximized area would turn into a file-browser. Note that the issue will still happen when opening old files saved while in maximized area full-screen.	2019-03-09 16:58:13 +01:00
Jeroen Bakker	02a7e875d7	Cycles OpenCL: Remove single program Part of the cleanup of the OpenCL codebase. Single program is not effective when using OpenCL, it is slower to compile and slower during rendering (when used in for example `barbershop` or `victor`). Reviewers: brecht, #cycles Maniphest Tasks: T62267 Differential Revision: https://developer.blender.org/D4481	2019-03-08 16:31:35 +01:00
Ray molenkamp	7ecbf9b409	cmake/windows: Set LLVM_INCLUDE_DIRS variable.	2019-03-08 07:18:17 -07:00
Ray molenkamp	285577a378	build_environment/Windows: Copy llvm debug headers. llvm generates some header files at build time that differ between debug/release causing linker errors when you used the release headers for a debug build.	2019-03-08 07:18:17 -07:00
Jeroen Bakker	76442e676e	Codestyle: comments	2019-03-08 08:56:16 +01:00
Brecht Van Lommel	f08191a459	Fix Cycles build error on non-x86 processors.	2019-03-06 13:37:06 +01:00
Brecht Van Lommel	e290a0b056	Cleanup: add asserts to catch cases where wrong attribute type is used.	2019-03-05 19:05:24 +01:00
Brecht Van Lommel	25c935e65f	Fix Cycles bug rendering with multiple UV maps after recent changes.	2019-03-05 18:39:55 +01:00
Stefan Werner	304a89eccf	Cycles: Changed standalone XML parser to read UV as float2	2019-03-05 15:29:50 +01:00
Brecht Van Lommel	db7f9a70b0	Cycles: Added Float2 attribute type. Float2 are now a new type for attributes in Cycles. Before, the choices for attribute storage were float and float3, the latter padded to float4. This meant that UV maps were inflated to twice the size necessary. Reviewers: brecht, sergey Reviewed By: brecht Subscribers: #cycles Tags: #cycles Differential Revision: https://developer.blender.org/D4409	2019-03-05 14:55:21 +01:00
Jeroen Bakker	a325bc6bf3	Fix T58953: Lamp data not always set The Lamp data was not always set. When using CUDA or CPU it was, but when using OpenCL without `OBJECT_MOTION` `sd->lamp` not updated to the actual lamp. This made the TextureCoordinate output the wrong normal when used in a light shader. As the normal was incorrect it made the IES node render incorrectly. (what is the default for the IES node). By setting the lamp data when no `__OBJECT_MOTION__` compile directive is present makes sure that the normal is correctly calculated. Fix D4450 Reviewed By: Brecht van Lommel	2019-03-05 14:22:54 +01:00
Dalai Felinto	69c8248a1c	Fixup for fix for OSX build using a build folder name with spaces Bug introduced on: `1f22e3f311`. This was making regular Mac builds to fail, where they were not failing before. Tested by William Reynish.	2019-03-04 16:53:56 +01:00
Brecht Van Lommel	3c5113221d	Fix missing image editor redraw when reloading image through API.	2019-03-04 16:06:57 +01:00
Brecht Van Lommel	e53db8342a	Fix Cycles animation denoising giving black pixels for some outliers. The denoising code expects the output buffer to be filled with the noisy image, which was not the case for standalone denoising.	2019-03-04 16:06:56 +01:00
Sergey Sharybin	d3306f0272	Fix bad level calls	2019-03-04 11:58:37 +01:00