Compare commits

23 Commits

SHA1 Message Date
dbc41b30f8 Merge branch 'master' into soc-2021-uv-edge-select-support 2021-12-17 18:31:32 +05:30
99a2af76d1 Merge branch 'master' into soc-2021-uv-edge-select-support 2021-11-20 00:04:44 +05:30
139606bd37 Code cleanup and minor fixes 2021-11-19 23:54:27 +05:30
4e6f73e5df Merge branch 'master' into soc-2021-uv-edge-select-support 2021-11-03 17:16:00 +05:30
7f8f2ff1a8 Merge branch 'master' into soc-2021-uv-edge-select-support 2021-10-04 16:37:02 +05:30
52822b218d Merge branch 'master' into soc-2021-uv-edge-select-support 2021-10-02 12:40:33 +05:30
90707ca72f Merge branch 'master' into soc-2021-uv-edge-select-support 2021-09-22 18:51:28 +05:30
29deaaee00 Fix: Deselecting edges in sticky vertex mode
Fixes the bug where deselecting a UV edge in sticky vertex mode would
deselect the UV vertices as well.
2021-09-11 15:25:40 +05:30
fdc9318360 UV: Drawing selected edges
* Changes vertex count for UV edge polygon from 4 to 6 in the geometry
  shader
* The center of an edge is no longer highlighted when the edge itself is not
  selected. This new behavior provides a visual aid that helps the user
  distinguish selected from unselected UV edges
2021-09-10 14:55:02 +05:30
64d640d196 Merge branch 'master' into soc-2021-uv-edge-select-support 2021-08-27 09:18:46 +05:30
2766e00ecf Merge branch 'master' into soc-2021-uv-edge-select-support 2021-08-16 10:38:50 +05:30
6f61cd90cc Cleanup: Use utility function 2021-08-14 20:12:38 +05:30
70824abc3f Merge branch 'master' into soc-2021-uv-editor-improvements-edge-selection 2021-08-09 07:05:27 +05:30
a7f24a8307 Edge selection support for Edge ring and loop select operator
* Adds edge selection support for edge ring and edge loop select operator
* Selecting face loops in the UV editor is now done through the loop
  select operator
2021-08-09 03:00:27 +05:30
752c840f1e Cleanup and fix: UV Invert selection
* Code and comment cleanup
* Fix: Make invert selection respect sticky modes when using edge select
  mode
2021-08-07 23:39:29 +05:30
ddbbbf40f6 Fix: Check edge selection only for common vertex
Allow edge selection test only for UV edges that share a common UV
vertex. Previous logic didn't check if the UV edges shared the same
UV vertex.
2021-08-07 20:38:18 +05:30
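
To make the shared-vertex test above concrete, here is a minimal standalone C sketch. The types and the comparison limit are hypothetical stand-ins, not Blender's actual MLoopUV data: two UV edges only take part in the selection test when one of their endpoints has (nearly) identical UV coordinates, since sharing the mesh vertex alone is not enough where the UVs are split.

/* Hypothetical sketch, not Blender's API: UV edges are "connected" for the
 * selection test only when they share a UV vertex (same UV coordinates). */
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct UVEdge {
  float uv1[2], uv2[2]; /* endpoint UV coordinates */
} UVEdge;

static bool uv_equal(const float a[2], const float b[2], float limit)
{
  return fabsf(a[0] - b[0]) < limit && fabsf(a[1] - b[1]) < limit;
}

/* True if the two edges share at least one UV vertex. */
static bool uv_edges_share_vertex(const UVEdge *a, const UVEdge *b, float limit)
{
  return uv_equal(a->uv1, b->uv1, limit) || uv_equal(a->uv1, b->uv2, limit) ||
         uv_equal(a->uv2, b->uv1, limit) || uv_equal(a->uv2, b->uv2, limit);
}

int main(void)
{
  UVEdge a = {{0.0f, 0.0f}, {1.0f, 0.0f}};
  UVEdge b = {{1.0f, 0.0f}, {1.0f, 1.0f}}; /* shares (1, 0) with a */
  UVEdge c = {{5.0f, 5.0f}, {6.0f, 5.0f}}; /* shares nothing with a */
  printf("a-b share: %d, a-c share: %d\n",
         uv_edges_share_vertex(&a, &b, 1e-5f),
         uv_edges_share_vertex(&a, &c, 1e-5f));
  return 0;
}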
a4af0f530c UV: Edge selection for UV select pinned operator
Ensure edge selection for UV select pinned operator by flushing the
selection upwards (verts to edge)
2021-08-07 14:07:06 +05:30
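
A minimal standalone C sketch of the upward flush mentioned above, using hypothetical types rather than Blender's real UV data: after the pinned vertices are selected, an edge is marked selected only when both of its UV vertices are selected.

/* Hypothetical sketch: flush a UV vertex selection "upwards" to edges. */
#include <stdbool.h>
#include <stdio.h>

typedef struct UVVert { bool select; } UVVert;
typedef struct UVEdge { int v1, v2; bool select; } UVEdge;

static void uv_flush_verts_to_edges(const UVVert *verts, UVEdge *edges, int edges_num)
{
  for (int i = 0; i < edges_num; i++) {
    /* An edge is selected only if both of its endpoints are selected. */
    edges[i].select = verts[edges[i].v1].select && verts[edges[i].v2].select;
  }
}

int main(void)
{
  UVVert verts[3] = {{true}, {true}, {false}};
  UVEdge edges[2] = {{0, 1, false}, {1, 2, false}};
  uv_flush_verts_to_edges(verts, edges, 2);
  printf("edge 0: %d, edge 1: %d\n", edges[0].select, edges[1].select); /* 1, 0 */
  return 0;
}

The same predicate, evaluated for every edge after any vertex-level operator, keeps the edge flags consistent without the operator itself having to know about edges.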
a342ea1cf5 UV: Refactor UV select split operator
Refactor UV select split operator to use MLOOPUV_EDGESEL flag
2021-08-07 13:39:40 +05:30
e88563e472 Merge branch 'master' into soc-2021-uv-editor-improvements-edge-selection 2021-08-05 16:42:53 +05:30
db821af172 Fix: Invert selection and free unreleased memory
- Free unreleased memory from vertex map
- Select Invert operator now works with UV face select mode as well
2021-08-04 06:19:27 +05:30
d6e02d92e1 UV: Extend edge selection support
- Add edge selection support for operators: mouse select, box select,
  circle select, lasso select, (de)select all, invert selection and
  select more/less
- Flush selections between UV verts and edges
- Fix: prevent deselection of neighbouring edges when deselecting an
  edge in sticky location + edge select mode
2021-08-04 06:19:26 +05:30
8b3b353caf Fix: Prevent deselection of surrounding UV faces
With face + sticky loc/vertex mode, trying to deselect UV faces would
sometimes result in surrounding UV faces being deselected as well. This
commit fixes that by allowing conditional deselection of shared UV
vertices based on the selection state of surrounding UV faces.
2021-08-04 06:19:25 +05:30
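
The conditional deselection described above can be sketched in standalone C with hypothetical types (not Blender's actual face/loop data): a shared UV vertex is deselected only when no still-selected face uses it, so deselecting one face cannot strip the selection from its neighbours.

/* Hypothetical sketch of sticky-mode face deselection. */
#include <stdbool.h>
#include <stdio.h>

#define VERTS_PER_FACE 4

typedef struct UVFace {
  int verts[VERTS_PER_FACE]; /* indices into the UV vertex array */
  bool select;
} UVFace;

static bool uv_vert_used_by_selected_face(const UVFace *faces, int faces_num, int vert)
{
  for (int f = 0; f < faces_num; f++) {
    if (!faces[f].select) {
      continue;
    }
    for (int i = 0; i < VERTS_PER_FACE; i++) {
      if (faces[f].verts[i] == vert) {
        return true;
      }
    }
  }
  return false;
}

static void uv_face_deselect(UVFace *faces, int faces_num, int face, bool *vert_select)
{
  faces[face].select = false;
  for (int i = 0; i < VERTS_PER_FACE; i++) {
    const int v = faces[face].verts[i];
    /* Keep the vertex selected while any selected face still shares it. */
    if (!uv_vert_used_by_selected_face(faces, faces_num, v)) {
      vert_select[v] = false;
    }
  }
}

int main(void)
{
  bool vert_select[6] = {true, true, true, true, true, true};
  UVFace faces[2] = {{{0, 1, 2, 3}, true}, {{2, 3, 4, 5}, true}};
  uv_face_deselect(faces, 2, 0, vert_select);
  /* Vertices 2 and 3 stay selected because face 1 still uses them. */
  printf("v0=%d v1=%d v2=%d v3=%d\n",
         vert_select[0], vert_select[1], vert_select[2], vert_select[3]);
  return 0;
}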
035fa7985d UV: Edge selection support - Initial
* Add UV edge selection flag - MLOOPUV_EDGESEL
* Refactor existing UV element selection functions to use the edge
  selection flag wherever required
* Refactor existing UV element check functions to ensure proper
  selection states using the edge selection flag
* Refactor UV select all operator to use edge selection flag
* New functions for selecting vertices or edges that share the same
  location, either on the 3D mesh or in UV space.
* Add a small penalty when finding the nearest UV edge. This ensures that UV
  edge selection will select other edges sharing the same location in
  successive selection attempts.
* Expose UV edge selection flag as boolean in Python

Differential Revision: https://developer.blender.org/D12028
2021-08-04 06:19:24 +05:30
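
The selection penalty mentioned in this commit can be illustrated with a minimal standalone C sketch. The data layout and the penalty value are hypothetical, not the actual operator code: edges that are already selected get a small distance bias, so a repeated click at the same location cycles to an overlapping, still-unselected edge instead of returning the same edge again.

/* Hypothetical sketch of nearest-edge picking with a selection penalty. */
#include <math.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct UVEdge {
  float center[2]; /* simplified: distance is measured to the edge center */
  bool select;
} UVEdge;

static int uv_find_nearest_edge(const UVEdge *edges, int edges_num,
                                const float co[2], float penalty)
{
  int best = -1;
  float best_dist = INFINITY;
  for (int i = 0; i < edges_num; i++) {
    const float dx = edges[i].center[0] - co[0];
    const float dy = edges[i].center[1] - co[1];
    float dist = sqrtf(dx * dx + dy * dy);
    if (edges[i].select) {
      dist += penalty; /* bias the pick away from already-selected edges */
    }
    if (dist < best_dist) {
      best_dist = dist;
      best = i;
    }
  }
  return best;
}

int main(void)
{
  /* Two overlapping edges at the same UV location. */
  UVEdge edges[2] = {{{0.5f, 0.5f}, false}, {{0.5f, 0.5f}, false}};
  const float click[2] = {0.5f, 0.5f};
  const int first = uv_find_nearest_edge(edges, 2, click, 0.01f);
  edges[first].select = true;
  const int second = uv_find_nearest_edge(edges, 2, click, 0.01f);
  printf("first pick: %d, second pick: %d\n", first, second); /* 0, then 1 */
  return 0;
}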
1656 changed files with 46304 additions and 42082 deletions

View File

@@ -559,14 +559,12 @@ if(WIN32)
set(CPACK_INSTALL_PREFIX ${CMAKE_GENERIC_PROGRAM_FILES}/${})
endif()
# Compiler tool-chain.
if(UNIX AND NOT APPLE)
if(CMAKE_COMPILER_IS_GNUCC)
option(WITH_LINKER_GOLD "Use ld.gold linker which is usually faster than ld.bfd" ON)
mark_as_advanced(WITH_LINKER_GOLD)
option(WITH_LINKER_LLD "Use ld.lld linker which is usually faster than ld.gold" OFF)
mark_as_advanced(WITH_LINKER_LLD)
endif()
# Compiler toolchain
if(CMAKE_COMPILER_IS_GNUCC)
option(WITH_LINKER_GOLD "Use ld.gold linker which is usually faster than ld.bfd" ON)
mark_as_advanced(WITH_LINKER_GOLD)
option(WITH_LINKER_LLD "Use ld.lld linker which is usually faster than ld.gold" OFF)
mark_as_advanced(WITH_LINKER_LLD)
endif()
option(WITH_COMPILER_ASAN "Build and link against address sanitizer (only for Debug & RelWithDebInfo targets)." OFF)

View File

@@ -38,6 +38,13 @@ elseif(UNIX AND NOT APPLE)
)
endif()
if(BLENDER_PLATFORM_ARM)
set(GMP_OPTIONS
${GMP_OPTIONS}
--disable-assembly
)
endif()
ExternalProject_Add(external_gmp
URL file://${PACKAGE_DIR}/${GMP_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}

View File

@@ -474,9 +474,9 @@ set(ISPC_HASH 2e3abedbc0ea9aaec17d6562c632454d)
set(ISPC_HASH_TYPE MD5)
set(ISPC_FILE ispc-${ISPC_VERSION}.tar.gz)
set(GMP_VERSION 6.2.1)
set(GMP_VERSION 6.2.0)
set(GMP_URI https://gmplib.org/download/gmp/gmp-${GMP_VERSION}.tar.xz)
set(GMP_HASH 0b82665c4a92fd2ade7440c13fcaa42b)
set(GMP_HASH a325e3f09e6d91e62101e59f9bda3ec1)
set(GMP_HASH_TYPE MD5)
set(GMP_FILE gmp-${GMP_VERSION}.tar.xz)

View File

@@ -2083,9 +2083,9 @@ compile_OIIO() {
cmake_d="$cmake_d -D OPENEXR_VERSION=$OPENEXR_VERSION"
if [ "$_with_built_openexr" = true ]; then
cmake_d="$cmake_d -D ILMBASE_ROOT=$INST/openexr"
cmake_d="$cmake_d -D OPENEXR_ROOT=$INST/openexr"
INFO "Ilmbase_ROOT=$INST/openexr"
cmake_d="$cmake_d -D ILMBASE_HOME=$INST/openexr"
cmake_d="$cmake_d -D OPENEXR_HOME=$INST/openexr"
INFO "ILMBASE_HOME=$INST/openexr"
fi
# ptex is only needed when nicholas bishop is ready
@@ -2374,9 +2374,9 @@ compile_OSL() {
#~ cmake_d="$cmake_d -D ILMBASE_VERSION=$ILMBASE_VERSION"
if [ "$_with_built_openexr" = true ]; then
cmake_d="$cmake_d -D ILMBASE_ROOT=$INST/openexr"
cmake_d="$cmake_d -D OPENEXR_ROOT=$INST/openexr"
INFO "Ilmbase_ROOT=$INST/openexr"
INFO "ILMBASE_HOME=$INST/openexr"
cmake_d="$cmake_d -D OPENEXR_ROOT_DIR=$INST/openexr"
cmake_d="$cmake_d -D ILMBASE_ROOT_DIR=$INST/openexr"
# XXX Temp workaround... sigh, ILMBase really messed the things up by defining their custom names ON by default :(
fi
@@ -3620,8 +3620,8 @@ compile_FFmpeg() {
fi
./configure --cc="gcc -Wl,--as-needed" \
--extra-ldflags="-pthread" \
--prefix=$_inst --enable-shared \
--extra-ldflags="-pthread -static-libgcc" \
--prefix=$_inst --enable-static \
--disable-ffplay --disable-doc \
--enable-gray \
--enable-avfilter --disable-vdpau \
@@ -5721,6 +5721,76 @@ install_OTHER() {
# ----------------------------------------------------------------------------
# Printing User Info
print_info_ffmpeglink_DEB() {
dpkg -L $_packages | grep -e ".*\/lib[^\/]\+\.so" | gawk '{ printf(nlines ? "'"$_ffmpeg_list_sep"'%s" : "%s", gensub(/.*lib([^\/]+)\.so/, "\\1", "g", $0)); nlines++ }'
}
print_info_ffmpeglink_RPM() {
rpm -ql $_packages | grep -e ".*\/lib[^\/]\+\.so" | gawk '{ printf(nlines ? "'"$_ffmpeg_list_sep"'%s" : "%s", gensub(/.*lib([^\/]+)\.so/, "\\1", "g", $0)); nlines++ }'
}
print_info_ffmpeglink_ARCH() {
pacman -Ql $_packages | grep -e ".*\/lib[^\/]\+\.so$" | gawk '{ printf(nlines ? "'"$_ffmpeg_list_sep"'%s" : "%s", gensub(/.*lib([^\/]+)\.so/, "\\1", "g", $0)); nlines++ }'
}
print_info_ffmpeglink() {
# This func must only print a ';'-separated list of libs...
if [ -z "$DISTRO" ]; then
ERROR "Failed to detect distribution type"
exit 1
fi
# Create list of packages from which to get libs names...
_packages=""
if [ "$THEORA_USE" = true ]; then
_packages="$_packages $THEORA_DEV"
fi
if [ "$VORBIS_USE" = true ]; then
_packages="$_packages $VORBIS_DEV"
fi
if [ "$OGG_USE" = true ]; then
_packages="$_packages $OGG_DEV"
fi
if [ "$XVID_USE" = true ]; then
_packages="$_packages $XVID_DEV"
fi
if [ "$VPX_USE" = true ]; then
_packages="$_packages $VPX_DEV"
fi
if [ "$OPUS_USE" = true ]; then
_packages="$_packages $OPUS_DEV"
fi
if [ "$MP3LAME_USE" = true ]; then
_packages="$_packages $MP3LAME_DEV"
fi
if [ "$X264_USE" = true ]; then
_packages="$_packages $X264_DEV"
fi
if [ "$OPENJPEG_USE" = true ]; then
_packages="$_packages $OPENJPEG_DEV"
fi
if [ "$DISTRO" = "DEB" ]; then
print_info_ffmpeglink_DEB
elif [ "$DISTRO" = "RPM" ]; then
print_info_ffmpeglink_RPM
elif [ "$DISTRO" = "ARCH" ]; then
print_info_ffmpeglink_ARCH
# XXX TODO!
else
PRINT "<Could not determine additional link libraries needed for ffmpeg, replace this by valid list of libs...>"
fi
}
print_info() {
PRINT ""
PRINT ""
@@ -5731,7 +5801,7 @@ print_info() {
PRINT "If you're using CMake add this to your configuration flags:"
_buildargs="-U *SNDFILE* -U PYTHON* -U *BOOST* -U *Boost* -U *TBB*"
_buildargs="$_buildargs -U *OPENCOLORIO* -U *OPENEXR* -U *OPENIMAGEIO* -U *LLVM* -U *CLANG* -U *CYCLES*"
_buildargs="$_buildargs -U *OPENCOLORIO* -U *OPENEXR* -U *OPENIMAGEIO* -U *LLVM* -U *CYCLES*"
_buildargs="$_buildargs -U *OPENSUBDIV* -U *OPENVDB* -U *BLOSC* -U *COLLADA* -U *FFMPEG* -U *ALEMBIC* -U *USD*"
_buildargs="$_buildargs -U *EMBREE* -U *OPENIMAGEDENOISE* -U *OPENXR*"
@@ -5932,10 +6002,12 @@ print_info() {
if [ "$FFMPEG_SKIP" = false ]; then
_1="-D WITH_CODEC_FFMPEG=ON"
_2="-D FFMPEG_LIBRARIES='avformat;avcodec;avutil;avdevice;swscale;swresample;lzma;rt;`print_info_ffmpeglink`'"
PRINT " $_1"
_buildargs="$_buildargs $_1"
PRINT " $_2"
_buildargs="$_buildargs $_1 $_2"
if [ -d $INST/ffmpeg ]; then
_1="-D FFMPEG_ROOT_DIR=$INST/ffmpeg"
_1="-D FFMPEG=$INST/ffmpeg"
PRINT " $_1"
_buildargs="$_buildargs $_1"
fi

View File

@@ -197,38 +197,3 @@ index 67ec0d15f..6dc3e85a0 100644
#else
#error Unknown architecture.
#endif
diff --git a/pxr/base/arch/demangle.cpp b/pxr/base/arch/demangle.cpp
index 67ec0d15f..6dc3e85a0 100644
--- a/pxr/base/arch/demangle.cpp
+++ b/pxr/base/arch/demangle.cpp
@@ -36,6 +36,7 @@
#if (ARCH_COMPILER_GCC_MAJOR == 3 && ARCH_COMPILER_GCC_MINOR >= 1) || \
ARCH_COMPILER_GCC_MAJOR > 3 || defined(ARCH_COMPILER_CLANG)
#define _AT_LEAST_GCC_THREE_ONE_OR_CLANG
+#include <cxxabi.h>
#endif
PXR_NAMESPACE_OPEN_SCOPE
@@ -138,7 +139,6 @@
#endif
#if defined(_AT_LEAST_GCC_THREE_ONE_OR_CLANG)
-#include <cxxabi.h>
/*
* This routine doesn't work when you get to gcc3.4.
diff --git a/pxr/base/work/singularTask.h b/pxr/base/work/singularTask.h
index 67ec0d15f..6dc3e85a0 100644
--- a/pxr/base/work/singularTask.h
+++ b/pxr/base/work/singularTask.h
@@ -120,7 +120,7 @@
// case we go again to ensure the task can do whatever it
// was awakened to do. Once we successfully take the count
// to zero, we stop.
- size_t old = count;
+ std::size_t old = count;
do { _fn(); } while (
!count.compare_exchange_strong(old, 0));
});

View File

@@ -33,8 +33,6 @@ if(NOT FFMPEG_FIND_COMPONENTS)
avfilter
avformat
avutil
swscale
swresample
)
endif()
@@ -52,9 +50,9 @@ foreach(_component ${FFMPEG_FIND_COMPONENTS})
string(TOUPPER ${_component} _upper_COMPONENT)
find_library(FFMPEG_${_upper_COMPONENT}_LIBRARY
NAMES
${_component}
${_upper_COMPONENT}
HINTS
${_ffmpeg_SEARCH_DIRS}
${LIBDIR}/ffmpeg
PATH_SUFFIXES
lib64 lib
)

View File

@@ -21,7 +21,7 @@ ENDIF()
SET(_optix_SEARCH_DIRS
${OPTIX_ROOT_DIR}
"$ENV{PROGRAMDATA}/NVIDIA Corporation/OptiX SDK 7.3.0"
"$ENV{PROGRAMDATA}/NVIDIA Corporation/OptiX SDK 7.0.0"
)
FIND_PATH(OPTIX_INCLUDE_DIR

View File

@@ -114,7 +114,7 @@ def is_c_header(filename: str) -> bool:
def is_c(filename: str) -> bool:
ext = splitext(filename)[1]
return (ext in {".c", ".cpp", ".cxx", ".m", ".mm", ".rc", ".cc", ".inl", ".metal"})
return (ext in {".c", ".cpp", ".cxx", ".m", ".mm", ".rc", ".cc", ".inl"})
def is_c_any(filename: str) -> bool:

View File

@@ -488,6 +488,7 @@ function(blender_add_test_executable
include_directories(${includes})
include_directories(${includes_sys})
setup_libdirs()
BLENDER_SRC_GTEST_EX(
NAME ${name}
@@ -524,6 +525,83 @@ function(setup_heavy_lib_pool)
endif()
endfunction()
function(SETUP_LIBDIRS)
# NOTE: For all new libraries, use absolute library paths.
# This should eventually be phased out.
# APPLE platform uses full paths for linking libraries, and avoids link_directories.
if(NOT MSVC AND NOT APPLE)
link_directories(${JPEG_LIBPATH} ${PNG_LIBPATH} ${ZLIB_LIBPATH} ${FREETYPE_LIBPATH})
if(WITH_PYTHON) # AND NOT WITH_PYTHON_MODULE # WIN32 needs
link_directories(${PYTHON_LIBPATH})
endif()
if(WITH_SDL AND NOT WITH_SDL_DYNLOAD)
link_directories(${SDL_LIBPATH})
endif()
if(WITH_CODEC_FFMPEG)
link_directories(${FFMPEG_LIBPATH})
endif()
if(WITH_IMAGE_OPENEXR)
link_directories(${OPENEXR_LIBPATH})
endif()
if(WITH_IMAGE_TIFF)
link_directories(${TIFF_LIBPATH})
endif()
if(WITH_BOOST)
link_directories(${BOOST_LIBPATH})
endif()
if(WITH_OPENIMAGEIO)
link_directories(${OPENIMAGEIO_LIBPATH})
endif()
if(WITH_OPENIMAGEDENOISE)
link_directories(${OPENIMAGEDENOISE_LIBPATH})
endif()
if(WITH_OPENCOLORIO)
link_directories(${OPENCOLORIO_LIBPATH})
endif()
if(WITH_OPENVDB)
link_directories(${OPENVDB_LIBPATH})
endif()
if(WITH_OPENAL)
link_directories(${OPENAL_LIBPATH})
endif()
if(WITH_JACK AND NOT WITH_JACK_DYNLOAD)
link_directories(${JACK_LIBPATH})
endif()
if(WITH_PULSEAUDIO AND NOT WITH_PULSEAUDIO_DYNLOAD)
link_directories(${LIBPULSE_LIBPATH})
endif()
if(WITH_CODEC_SNDFILE)
link_directories(${LIBSNDFILE_LIBPATH})
endif()
if(WITH_FFTW3)
link_directories(${FFTW3_LIBPATH})
endif()
if(WITH_OPENCOLLADA)
link_directories(${OPENCOLLADA_LIBPATH})
# # Never set
# link_directories(${PCRE_LIBPATH})
# link_directories(${EXPAT_LIBPATH})
endif()
if(WITH_LLVM)
link_directories(${LLVM_LIBPATH})
endif()
if(WITH_ALEMBIC)
link_directories(${ALEMBIC_LIBPATH})
endif()
if(WITH_GMP)
link_directories(${GMP_LIBPATH})
endif()
if(WIN32 AND NOT UNIX)
link_directories(${PTHREADS_LIBPATH})
endif()
endif()
endfunction()
# Platform specific linker flags for targets.
function(setup_platform_linker_flags
target)
@@ -1214,6 +1292,29 @@ macro(openmp_delayload
endif()
endmacro()
macro(blender_precompile_headers target cpp header)
if(MSVC)
# get the name for the pch output file
get_filename_component(pchbase ${cpp} NAME_WE)
set(pchfinal "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${pchbase}.pch")
# mark the cpp as the one outputting the pch
set_property(SOURCE ${cpp} APPEND PROPERTY OBJECT_OUTPUTS "${pchfinal}")
# get all sources for the target
get_target_property(sources ${target} SOURCES)
# make all sources depend on the pch to enforce the build order
foreach(src ${sources})
set_property(SOURCE ${src} APPEND PROPERTY OBJECT_DEPENDS "${pchfinal}")
endforeach()
target_sources(${target} PRIVATE ${cpp} ${header})
set_target_properties(${target} PROPERTIES COMPILE_FLAGS "/Yu${header} /Fp${pchfinal} /FI${header}")
set_source_files_properties(${cpp} PROPERTIES COMPILE_FLAGS "/Yc${header} /Fp${pchfinal}")
endif()
endmacro()
macro(set_and_warn_dependency
_dependency _setting _val)
# when $_dependency is disabled, forces $_setting = $_val

View File

@@ -173,7 +173,6 @@ if(WITH_IMAGE_OPENEXR)
endif()
if(WITH_CODEC_FFMPEG)
set(FFMPEG_ROOT_DIR ${LIBDIR}/ffmpeg)
set(FFMPEG_FIND_COMPONENTS
avcodec avdevice avformat avutil
mp3lame ogg opus swresample swscale

View File

@@ -96,7 +96,7 @@ else()
# Detect SDK version to use.
if(NOT DEFINED OSX_SYSTEM)
execute_process(
COMMAND xcrun --sdk macosx --show-sdk-version
COMMAND xcrun --show-sdk-version
OUTPUT_VARIABLE OSX_SYSTEM
OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

View File

@@ -18,7 +18,7 @@
# All rights reserved.
# ***** END GPL LICENSE BLOCK *****
# Libraries configuration for any *nix system including Linux and Unix (excluding APPLE).
# Libraries configuration for any *nix system including Linux and Unix.
# Detect precompiled library directory
if(NOT DEFINED LIBDIR)
@@ -178,30 +178,26 @@ endif()
if(WITH_CODEC_FFMPEG)
if(EXISTS ${LIBDIR})
set(FFMPEG_ROOT_DIR ${LIBDIR}/ffmpeg)
# Override FFMPEG components to also include static library dependencies
# included with precompiled libraries, and to ensure correct link order.
set(FFMPEG_FIND_COMPONENTS
avformat avcodec avdevice avutil swresample swscale
sndfile
FLAC
mp3lame
opus
theora theoradec theoraenc
vorbis vorbisenc vorbisfile ogg
vpx
x264
xvidcore)
elseif(FFMPEG)
# Old cache variable used for root dir, convert to new standard.
set(FFMPEG_ROOT_DIR ${FFMPEG})
# For precompiled lib directory, all ffmpeg dependencies are in the same folder
file(GLOB ffmpeg_libs ${LIBDIR}/ffmpeg/lib/*.a ${LIBDIR}/sndfile/lib/*.a)
set(FFMPEG ${LIBDIR}/ffmpeg CACHE PATH "FFMPEG Directory")
set(FFMPEG_LIBRARIES ${ffmpeg_libs} ${ffmpeg_libs} CACHE STRING "FFMPEG Libraries")
else()
set(FFMPEG /usr CACHE PATH "FFMPEG Directory")
set(FFMPEG_LIBRARIES avformat avcodec avutil avdevice swscale CACHE STRING "FFMPEG Libraries")
endif()
find_package(FFmpeg)
if(NOT FFMPEG_FOUND)
set(WITH_CODEC_FFMPEG OFF)
message(STATUS "FFmpeg not found, disabling it")
mark_as_advanced(FFMPEG)
# lame, but until we have proper find module for ffmpeg
set(FFMPEG_INCLUDE_DIRS ${FFMPEG}/include)
if(EXISTS "${FFMPEG}/include/ffmpeg/")
list(APPEND FFMPEG_INCLUDE_DIRS "${FFMPEG}/include/ffmpeg")
endif()
# end lameness
mark_as_advanced(FFMPEG_LIBRARIES)
set(FFMPEG_LIBPATH ${FFMPEG}/lib)
endif()
if(WITH_FFTW3)

View File

@@ -51,6 +51,9 @@
/** \defgroup intern_mikktspace MikktSpace
* \ingroup intern */
/** \defgroup intern_numaapi NUMA (Non Uniform Memory Architecture)
* \ingroup intern */
/** \defgroup intern_rigidbody Rigid-Body C-API
* \ingroup intern */

View File

@@ -257,7 +257,7 @@ static int hipewHipInit(void) {
#endif
static int initialized = 0;
static int result = 0;
int error;
int error, driver_version;
if (initialized) {
return result;
@@ -565,6 +565,8 @@ int hipewCompilerVersion(void) {
const char *path = hipewCompilerPath();
const char *marker = "Hip compilation tools, release ";
FILE *pipe;
int major, minor;
char *versionstr;
char buf[128];
char output[65536] = "\0";
char command[65536] = "\0";

View File

@@ -25,6 +25,7 @@ add_subdirectory(ghost)
add_subdirectory(guardedalloc)
add_subdirectory(libmv)
add_subdirectory(memutil)
add_subdirectory(numaapi)
add_subdirectory(opencolorio)
add_subdirectory(opensubdiv)
add_subdirectory(mikktspace)

View File

@@ -51,6 +51,8 @@ list(APPEND LIBRARIES ${CYCLES_GL_LIBRARIES})
# Common configuration.
cycles_link_directories()
add_definitions(${GL_DEFINITIONS})
include_directories(${INC})

View File

@@ -82,7 +82,7 @@ static void session_print_status()
string status, substatus;
/* get status */
double progress = options.session->progress.get_progress();
float progress = options.session->progress.get_progress();
options.session->progress.get_status(status, substatus);
if (substatus != "")
@@ -183,7 +183,7 @@ static void display_info(Progress &progress)
progress.get_time(total_time, sample_time);
progress.get_status(status, substatus);
double progress_val = progress.get_progress();
float progress_val = progress.get_progress();
if (substatus != "")
status += ": " + substatus;

View File

@@ -60,8 +60,9 @@ def init():
path = os.path.dirname(__file__)
user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path='')))
temp_path = bpy.app.tempdir
_cycles.init(path, user_path, bpy.app.background)
_cycles.init(path, user_path, temp_path, bpy.app.background)
_parse_command_line()

View File

@@ -802,7 +802,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
name="Tile Size",
default=2048,
description="",
min=8, max=8192,
min=8, max=16384,
)
# Various fine-tuning debug flags

View File

@@ -272,300 +272,12 @@ uint BlenderDisplaySpaceShader::get_shader_program()
return shader_program_;
}
/* --------------------------------------------------------------------
* DrawTile.
*/
/* Higher level representation of a texture from the graphics library. */
class GLTexture {
public:
/* Global counter for all allocated OpenGL textures used by instances of this class. */
static inline std::atomic<int> num_used = 0;
GLTexture() = default;
~GLTexture()
{
assert(gl_id == 0);
}
GLTexture(const GLTexture &other) = delete;
GLTexture &operator=(GLTexture &other) = delete;
GLTexture(GLTexture &&other) noexcept
: gl_id(other.gl_id), width(other.width), height(other.height)
{
other.reset();
}
GLTexture &operator=(GLTexture &&other)
{
if (this == &other) {
return *this;
}
gl_id = other.gl_id;
width = other.width;
height = other.height;
other.reset();
return *this;
}
bool gl_resources_ensure()
{
if (gl_id) {
return true;
}
/* Create texture. */
glGenTextures(1, &gl_id);
if (!gl_id) {
LOG(ERROR) << "Error creating texture.";
return false;
}
/* Configure the texture. */
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, gl_id);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
/* Clamp to edge so that precision issues when zoomed out (which forces linear interpolation)
* does not cause unwanted repetition. */
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glBindTexture(GL_TEXTURE_2D, 0);
++num_used;
return true;
}
void gl_resources_destroy()
{
if (!gl_id) {
return;
}
glDeleteTextures(1, &gl_id);
reset();
--num_used;
}
/* OpenGL resource IDs of the texture.
*
* NOTE: Allocated on the render engine's context. */
uint gl_id = 0;
/* Dimensions of the texture in pixels. */
int width = 0;
int height = 0;
protected:
void reset()
{
gl_id = 0;
width = 0;
height = 0;
}
};
/* Higher level representation of a Pixel Buffer Object (PBO) from the graphics library. */
class GLPixelBufferObject {
public:
/* Global counter for all allocated OpenGL PBOs used by instances of this class. */
static inline std::atomic<int> num_used = 0;
GLPixelBufferObject() = default;
~GLPixelBufferObject()
{
assert(gl_id == 0);
}
GLPixelBufferObject(const GLPixelBufferObject &other) = delete;
GLPixelBufferObject &operator=(GLPixelBufferObject &other) = delete;
GLPixelBufferObject(GLPixelBufferObject &&other) noexcept
: gl_id(other.gl_id), width(other.width), height(other.height)
{
other.reset();
}
GLPixelBufferObject &operator=(GLPixelBufferObject &&other)
{
if (this == &other) {
return *this;
}
gl_id = other.gl_id;
width = other.width;
height = other.height;
other.reset();
return *this;
}
bool gl_resources_ensure()
{
if (gl_id) {
return true;
}
glGenBuffers(1, &gl_id);
if (!gl_id) {
LOG(ERROR) << "Error creating texture pixel buffer object.";
return false;
}
++num_used;
return true;
}
void gl_resources_destroy()
{
if (!gl_id) {
return;
}
glDeleteBuffers(1, &gl_id);
reset();
--num_used;
}
/* OpenGL resource IDs of the PBO.
*
* NOTE: Allocated on the render engine's context. */
uint gl_id = 0;
/* Dimensions of the PBO. */
int width = 0;
int height = 0;
protected:
void reset()
{
gl_id = 0;
width = 0;
height = 0;
}
};
class DrawTile {
public:
DrawTile() = default;
~DrawTile() = default;
DrawTile(const DrawTile &other) = delete;
DrawTile &operator=(const DrawTile &other) = delete;
DrawTile(DrawTile &&other) noexcept = default;
DrawTile &operator=(DrawTile &&other) = default;
bool gl_resources_ensure()
{
if (!texture.gl_resources_ensure()) {
gl_resources_destroy();
return false;
}
if (!gl_vertex_buffer) {
glGenBuffers(1, &gl_vertex_buffer);
if (!gl_vertex_buffer) {
LOG(ERROR) << "Error allocating tile VBO.";
gl_resources_destroy();
return false;
}
}
return true;
}
void gl_resources_destroy()
{
texture.gl_resources_destroy();
if (gl_vertex_buffer) {
glDeleteBuffers(1, &gl_vertex_buffer);
gl_vertex_buffer = 0;
}
}
inline bool ready_to_draw() const
{
return texture.gl_id != 0;
}
/* Texture which contains pixels of the tile. */
GLTexture texture;
/* Display parameters the texture of this tile has been updated for. */
BlenderDisplayDriver::Params params;
/* OpenGL resources needed for drawing. */
uint gl_vertex_buffer = 0;
};
class DrawTileAndPBO {
public:
bool gl_resources_ensure()
{
if (!tile.gl_resources_ensure() || !buffer_object.gl_resources_ensure()) {
gl_resources_destroy();
return false;
}
return true;
}
void gl_resources_destroy()
{
tile.gl_resources_destroy();
buffer_object.gl_resources_destroy();
}
DrawTile tile;
GLPixelBufferObject buffer_object;
};
/* --------------------------------------------------------------------
* BlenderDisplayDriver.
*/
struct BlenderDisplayDriver::Tiles {
/* Resources of a tile which is being currently rendered. */
DrawTileAndPBO current_tile;
/* All tiles which rendering is finished and which content will not be changed. */
struct {
vector<DrawTile> tiles;
void gl_resources_destroy_and_clear()
{
for (DrawTile &tile : tiles) {
tile.gl_resources_destroy();
}
tiles.clear();
}
} finished_tiles;
};
BlenderDisplayDriver::BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene)
: b_engine_(b_engine),
display_shader_(BlenderDisplayShader::create(b_engine, b_scene)),
tiles_(make_unique<Tiles>())
: b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
{
/* Create context while on the main thread. */
gl_context_create();
@@ -580,21 +292,6 @@ BlenderDisplayDriver::~BlenderDisplayDriver()
* Update procedure.
*/
void BlenderDisplayDriver::next_tile_begin()
{
if (!tiles_->current_tile.tile.ready_to_draw()) {
LOG(ERROR)
<< "Unexpectedly moving to the next tile without any data provided for current tile.";
return;
}
/* Moving to the next tile without giving render data for the current tile is not an expected
* situation. */
DCHECK(!need_clear_);
tiles_->finished_tiles.tiles.emplace_back(std::move(tiles_->current_tile.tile));
}
bool BlenderDisplayDriver::update_begin(const Params &params,
int texture_width,
int texture_height)
@@ -615,33 +312,24 @@ bool BlenderDisplayDriver::update_begin(const Params &params,
glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
}
DrawTile &current_tile = tiles_->current_tile.tile;
GLPixelBufferObject &current_tile_buffer_object = tiles_->current_tile.buffer_object;
/* Clear storage of all finished tiles when display clear is requested.
* Do it when new tile data is provided to handle the display clear flag in a single place.
* It also makes the logic reliable from the whether drawing did happen or not point of view. */
if (need_clear_) {
tiles_->finished_tiles.gl_resources_destroy_and_clear();
need_clear_ = false;
}
if (!tiles_->current_tile.gl_resources_ensure()) {
tiles_->current_tile.gl_resources_destroy();
if (!gl_texture_resources_ensure()) {
gl_context_disable();
return false;
}
/* Update texture dimensions if needed. */
if (current_tile.texture.width != texture_width ||
current_tile.texture.height != texture_height) {
if (texture_.width != texture_width || texture_.height != texture_height) {
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, current_tile.texture.gl_id);
glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
glTexImage2D(
GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0);
current_tile.texture.width = texture_width;
current_tile.texture.height = texture_height;
texture_.width = texture_width;
texture_.height = texture_height;
glBindTexture(GL_TEXTURE_2D, 0);
/* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to
* avoid undefined content. */
texture_.need_clear = true;
}
/* Update PBO dimensions if needed.
@@ -653,58 +341,29 @@ bool BlenderDisplayDriver::update_begin(const Params &params,
* sending too much data to GPU when resolution divider is not 1. */
/* TODO(sergey): Investigate whether keeping the PBO exact size of the texture makes non-interop
* mode faster. */
const int buffer_width = params.size.x;
const int buffer_height = params.size.y;
if (current_tile_buffer_object.width != buffer_width ||
current_tile_buffer_object.height != buffer_height) {
const int buffer_width = params.full_size.x;
const int buffer_height = params.full_size.y;
if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, current_tile_buffer_object.gl_id);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
current_tile_buffer_object.width = buffer_width;
current_tile_buffer_object.height = buffer_height;
texture_.buffer_width = buffer_width;
texture_.buffer_height = buffer_height;
}
/* Store an updated parameters of the current tile.
* In theory it is only needed once per update of the tile, but doing it on every update is
* the easiest and is not expensive. */
tiles_->current_tile.tile.params = params;
/* New content will be provided to the texture in one way or another, so mark this in a
* centralized place. */
texture_.need_update = true;
texture_.params = params;
return true;
}
static void update_tile_texture_pixels(const DrawTileAndPBO &tile)
{
const GLTexture &texture = tile.tile.texture;
DCHECK_NE(tile.buffer_object.gl_id, 0);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture.gl_id);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, tile.buffer_object.gl_id);
glTexSubImage2D(
GL_TEXTURE_2D, 0, 0, 0, texture.width, texture.height, GL_RGBA, GL_HALF_FLOAT, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glBindTexture(GL_TEXTURE_2D, 0);
}
void BlenderDisplayDriver::update_end()
{
/* Unpack the PBO into the texture as soon as the new content is provided.
*
* This allows to ensure that the unpacking happens while resources like graphics interop (which
* lifetime is outside of control of the display driver) are still valid, as well as allows to
* move the tile from being current to finished immediately after this call.
*
* One concern with this approach is that if the update happens more often than drawing then
* doing the unpack here occupies GPU transfer for no good reason. However, the render scheduler
* takes care of ensuring updates don't happen that often. In regular applications redraw will
* happen much more often than this update. */
update_tile_texture_pixels(tiles_->current_tile);
gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glFlush();
@@ -717,11 +376,7 @@ void BlenderDisplayDriver::update_end()
half4 *BlenderDisplayDriver::map_texture_buffer()
{
const uint pbo_gl_id = tiles_->current_tile.buffer_object.gl_id;
DCHECK_NE(pbo_gl_id, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_gl_id);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
@@ -729,6 +384,15 @@ half4 *BlenderDisplayDriver::map_texture_buffer()
LOG(ERROR) << "Error mapping BlenderDisplayDriver pixel buffer object.";
}
if (texture_.need_clear) {
const int64_t texture_width = texture_.width;
const int64_t texture_height = texture_.height;
memset(reinterpret_cast<void *>(mapped_rgba_pixels),
0,
texture_width * texture_height * sizeof(half4));
texture_.need_clear = false;
}
return mapped_rgba_pixels;
}
@@ -747,9 +411,12 @@ BlenderDisplayDriver::GraphicsInterop BlenderDisplayDriver::graphics_interop_get
{
GraphicsInterop interop_dst;
interop_dst.buffer_width = tiles_->current_tile.buffer_object.width;
interop_dst.buffer_height = tiles_->current_tile.buffer_object.height;
interop_dst.opengl_pbo_id = tiles_->current_tile.buffer_object.gl_id;
interop_dst.buffer_width = texture_.buffer_width;
interop_dst.buffer_height = texture_.buffer_height;
interop_dst.opengl_pbo_id = texture_.gl_pbo_id;
interop_dst.need_clear = texture_.need_clear;
texture_.need_clear = false;
return interop_dst;
}
@@ -770,7 +437,7 @@ void BlenderDisplayDriver::graphics_interop_deactivate()
void BlenderDisplayDriver::clear()
{
need_clear_ = true;
texture_.need_clear = true;
}
void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y)
@@ -778,155 +445,26 @@ void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y)
zoom_ = make_float2(zoom_x, zoom_y);
}
/* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
* This buffer is used to render texture in the viewport.
*
* NOTE: The buffer needs to be bound. */
static void vertex_buffer_update(const DisplayDriver::Params &params)
{
const int x = params.full_offset.x;
const int y = params.full_offset.y;
const int width = params.size.x;
const int height = params.size.y;
/* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
* rendered. */
glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
if (!vpointer) {
return;
}
vpointer[0] = 0.0f;
vpointer[1] = 0.0f;
vpointer[2] = x;
vpointer[3] = y;
vpointer[4] = 1.0f;
vpointer[5] = 0.0f;
vpointer[6] = x + width;
vpointer[7] = y;
vpointer[8] = 1.0f;
vpointer[9] = 1.0f;
vpointer[10] = x + width;
vpointer[11] = y + height;
vpointer[12] = 0.0f;
vpointer[13] = 1.0f;
vpointer[14] = x;
vpointer[15] = y + height;
glUnmapBuffer(GL_ARRAY_BUFFER);
}
static void draw_tile(const float2 &zoom,
const int texcoord_attribute,
const int position_attribute,
const DrawTile &draw_tile)
{
if (!draw_tile.ready_to_draw()) {
return;
}
const GLTexture &texture = draw_tile.texture;
DCHECK_NE(texture.gl_id, 0);
DCHECK_NE(draw_tile.gl_vertex_buffer, 0);
glBindBuffer(GL_ARRAY_BUFFER, draw_tile.gl_vertex_buffer);
/* Draw at the parameters for which the texture has been updated for. This allows to always draw
* texture during bordered-rendered camera view without flickering. The validness of the display
* parameters for a texture is guaranteed by the initial "clear" state which makes drawing to
* have an early output.
*
* Such approach can cause some extra "jelly" effect during panning, but it is not more jelly
* than overlay of selected objects. Also, it's possible to redraw texture at an intersection of
* the texture draw parameters and the latest updated draw parameters (although, complexity of
* doing it might not worth it. */
vertex_buffer_update(draw_tile.params);
glBindTexture(GL_TEXTURE_2D, texture.gl_id);
/* Trick to keep sharp rendering without jagged edges on all GPUs.
*
* The idea here is to enforce driver to use linear interpolation when the image is not zoomed
* in.
* For the render result with a resolution divider in effect we always use nearest interpolation.
*
* Use explicit MIN assignment to make sure the driver does not have an undefined behavior at
* the zoom level 1. The MAG filter is always NEAREST. */
const float zoomed_width = draw_tile.params.size.x * zoom.x;
const float zoomed_height = draw_tile.params.size.y * zoom.y;
if (texture.width != draw_tile.params.size.x || texture.height != draw_tile.params.size.y) {
/* Resolution divider is different from 1, force nearest interpolation. */
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
}
else if (zoomed_width - draw_tile.params.size.x > 0.5f ||
zoomed_height - draw_tile.params.size.y > 0.5f) {
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
}
else {
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
}
glVertexAttribPointer(
texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
glVertexAttribPointer(position_attribute,
2,
GL_FLOAT,
GL_FALSE,
4 * sizeof(float),
(const GLvoid *)(sizeof(float) * 2));
glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
}
void BlenderDisplayDriver::flush()
{
/* This is called from the render thread that also calls update_begin/end, right before ending
* the render loop. We wait for any queued PBO and render commands to be done, before destroying
* the render thread and activating the context in the main thread to destroy resources.
*
* If we don't do this, the NVIDIA driver hangs for a few seconds for when ending 3D viewport
* rendering, for unknown reasons. This was found with NVIDIA driver version 470.73 and a Quadro
* RTX 6000 on Linux. */
if (!gl_context_enable()) {
return;
}
if (gl_upload_sync_) {
glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
}
if (gl_render_sync_) {
glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
}
gl_context_disable();
}
void BlenderDisplayDriver::draw(const Params &params)
{
/* See do_update_begin() for why no locking is required here. */
const bool transparent = true; // TODO(sergey): Derive this from Film.
if (!gl_draw_resources_ensure()) {
return;
}
if (use_gl_context_) {
gl_context_mutex_.lock();
}
if (need_clear_) {
if (texture_.need_clear) {
/* Texture is requested to be cleared and was not yet cleared.
*
* Do early return which should be equivalent of drawing all-zero texture.
* Watch out for the lock though so that the clear happening during update is properly
* synchronized here. */
if (use_gl_context_) {
gl_context_mutex_.unlock();
}
gl_context_mutex_.unlock();
return;
}
@@ -939,37 +477,66 @@ void BlenderDisplayDriver::draw(const Params &params)
glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
}
glActiveTexture(GL_TEXTURE0);
display_shader_->bind(params.full_size.x, params.full_size.y);
/* NOTE: The VAO is to be allocated on the drawing context as it is not shared across contexts.
* Simplest is to allocate it on every redraw so that it is possible to destroy it from a
* correct context. */
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
/* Trick to keep sharp rendering without jagged edges on all GPUs.
*
* The idea here is to enforce driver to use linear interpolation when the image is not zoomed
* in.
* For the render result with a resolution divider in effect we always use nearest interpolation.
*
* Use explicit MIN assignment to make sure the driver does not have an undefined behavior at
* the zoom level 1. The MAG filter is always NEAREST. */
const float zoomed_width = params.size.x * zoom_.x;
const float zoomed_height = params.size.y * zoom_.y;
if (texture_.width != params.size.x || texture_.height != params.size.y) {
/* Resolution divider is different from 1, force nearest interpolation. */
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
}
else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
}
else {
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
}
glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_);
texture_update_if_needed();
vertex_buffer_update(params);
/* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? */
GLuint vertex_array_object;
glGenVertexArrays(1, &vertex_array_object);
glBindVertexArray(vertex_array_object);
display_shader_->bind(params.full_size.x, params.full_size.y);
const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location();
const int position_attribute = display_shader_->get_position_attrib_location();
glEnableVertexAttribArray(texcoord_attribute);
glEnableVertexAttribArray(position_attribute);
draw_tile(zoom_, texcoord_attribute, position_attribute, tiles_->current_tile.tile);
glVertexAttribPointer(
texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
glVertexAttribPointer(position_attribute,
2,
GL_FLOAT,
GL_FALSE,
4 * sizeof(float),
(const GLvoid *)(sizeof(float) * 2));
for (const DrawTile &tile : tiles_->finished_tiles.tiles) {
draw_tile(zoom_, texcoord_attribute, position_attribute, tile);
}
glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
display_shader_->unbind();
glBindTexture(GL_TEXTURE_2D, 0);
glBindVertexArray(0);
glBindBuffer(GL_ARRAY_BUFFER, 0);
glBindTexture(GL_TEXTURE_2D, 0);
glDeleteVertexArrays(1, &vertex_array_object);
display_shader_->unbind();
if (transparent) {
glDisable(GL_BLEND);
}
@@ -977,11 +544,6 @@ void BlenderDisplayDriver::draw(const Params &params)
gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
glFlush();
if (VLOG_IS_ON(5)) {
VLOG(5) << "Number of textures: " << GLTexture::num_used;
VLOG(5) << "Number of PBOs: " << GLPixelBufferObject::num_used;
}
if (use_gl_context_) {
gl_context_mutex_.unlock();
}
@@ -1056,16 +618,154 @@ void BlenderDisplayDriver::gl_context_dispose()
}
}
bool BlenderDisplayDriver::gl_draw_resources_ensure()
{
if (!texture_.gl_id) {
/* If there is no texture allocated, there is nothing to draw. Inform the draw call that it can
* can not continue. Note that this is not an unrecoverable error, so once the texture is known
* we will come back here and create all the GPU resources needed for draw. */
return false;
}
if (gl_draw_resource_creation_attempted_) {
return gl_draw_resources_created_;
}
gl_draw_resource_creation_attempted_ = true;
if (!vertex_buffer_) {
glGenBuffers(1, &vertex_buffer_);
if (!vertex_buffer_) {
LOG(ERROR) << "Error creating vertex buffer.";
return false;
}
}
gl_draw_resources_created_ = true;
return true;
}
void BlenderDisplayDriver::gl_resources_destroy()
{
gl_context_enable();
tiles_->current_tile.gl_resources_destroy();
tiles_->finished_tiles.gl_resources_destroy_and_clear();
if (vertex_buffer_ != 0) {
glDeleteBuffers(1, &vertex_buffer_);
}
if (texture_.gl_pbo_id) {
glDeleteBuffers(1, &texture_.gl_pbo_id);
texture_.gl_pbo_id = 0;
}
if (texture_.gl_id) {
glDeleteTextures(1, &texture_.gl_id);
texture_.gl_id = 0;
}
gl_context_disable();
gl_context_dispose();
}
bool BlenderDisplayDriver::gl_texture_resources_ensure()
{
if (texture_.creation_attempted) {
return texture_.is_created;
}
texture_.creation_attempted = true;
DCHECK(!texture_.gl_id);
DCHECK(!texture_.gl_pbo_id);
/* Create texture. */
glGenTextures(1, &texture_.gl_id);
if (!texture_.gl_id) {
LOG(ERROR) << "Error creating texture.";
return false;
}
/* Configure the texture. */
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glBindTexture(GL_TEXTURE_2D, 0);
/* Create PBO for the texture. */
glGenBuffers(1, &texture_.gl_pbo_id);
if (!texture_.gl_pbo_id) {
LOG(ERROR) << "Error creating texture pixel buffer object.";
return false;
}
/* Creation finished with a success. */
texture_.is_created = true;
return true;
}
void BlenderDisplayDriver::texture_update_if_needed()
{
if (!texture_.need_update) {
return;
}
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
glTexSubImage2D(
GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
texture_.need_update = false;
}
void BlenderDisplayDriver::vertex_buffer_update(const Params & /*params*/)
{
/* Draw at the parameters for which the texture has been updated for. This allows to always draw
* texture during bordered-rendered camera view without flickering. The validness of the display
* parameters for a texture is guaranteed by the initial "clear" state which makes drawing to
* have an early output.
*
* Such approach can cause some extra "jelly" effect during panning, but it is not more jelly
* than overlay of selected objects. Also, it's possible to redraw texture at an intersection of
* the texture draw parameters and the latest updated draw parameters (although, complexity of
* doing it might not worth it. */
const int x = texture_.params.full_offset.x;
const int y = texture_.params.full_offset.y;
const int width = texture_.params.size.x;
const int height = texture_.params.size.y;
/* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
* rendered. */
glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
if (!vpointer) {
return;
}
vpointer[0] = 0.0f;
vpointer[1] = 0.0f;
vpointer[2] = x;
vpointer[3] = y;
vpointer[4] = 1.0f;
vpointer[5] = 0.0f;
vpointer[6] = x + width;
vpointer[7] = y;
vpointer[8] = 1.0f;
vpointer[9] = 1.0f;
vpointer[10] = x + width;
vpointer[11] = y + height;
vpointer[12] = 0.0f;
vpointer[13] = 1.0f;
vpointer[14] = x;
vpointer[15] = y + height;
glUnmapBuffer(GL_ARRAY_BUFFER);
}
CCL_NAMESPACE_END

View File

@@ -26,7 +26,6 @@
#include "util/thread.h"
#include "util/unique_ptr.h"
#include "util/vector.h"
CCL_NAMESPACE_BEGIN
@@ -113,8 +112,6 @@ class BlenderDisplayDriver : public DisplayDriver {
void set_zoom(float zoom_x, float zoom_y);
protected:
virtual void next_tile_begin() override;
virtual bool update_begin(const Params &params, int texture_width, int texture_height) override;
virtual void update_end() override;
@@ -125,17 +122,33 @@ class BlenderDisplayDriver : public DisplayDriver {
virtual void draw(const Params &params) override;
virtual void flush() override;
/* Helper function which allocates new GPU context. */
void gl_context_create();
bool gl_context_enable();
void gl_context_disable();
void gl_context_dispose();
/* Make sure texture is allocated and its initial configuration is performed. */
bool gl_texture_resources_ensure();
/* Ensure all runtime GPU resources needed for drawing are allocated.
* Returns true if all resources needed for drawing are available. */
bool gl_draw_resources_ensure();
/* Destroy all GPU resources which are being used by this object. */
void gl_resources_destroy();
/* Update GPU texture dimensions and content if needed (new pixel data was provided).
*
* NOTE: The texture needs to be bound. */
void texture_update_if_needed();
/* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
* This buffer is used to render texture in the viewport.
*
* NOTE: The buffer needs to be bound. */
void vertex_buffer_update(const Params &params);
BL::RenderEngine b_engine_;
/* OpenGL context which is used the render engine doesn't have its own. */
@@ -146,14 +159,50 @@ class BlenderDisplayDriver : public DisplayDriver {
/* Mutex used to guard the `gl_context_`. */
thread_mutex gl_context_mutex_;
/* Content of the display is to be filled with zeroes. */
std::atomic<bool> need_clear_ = true;
/* Texture which contains pixels of the render result. */
struct {
/* Indicates whether texture creation was attempted and succeeded.
* Used to avoid multiple attempts of texture creation on GPU issues or GPU context
* misconfiguration. */
bool creation_attempted = false;
bool is_created = false;
/* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write
* pixels to it.
*
* NOTE: Allocated on the engine's context. */
uint gl_id = 0;
uint gl_pbo_id = 0;
/* Is true when new data was written to the PBO, meaning, the texture might need to be resized
* and new data is to be uploaded to the GPU. */
bool need_update = false;
/* Content of the texture is to be filled with zeroes. */
std::atomic<bool> need_clear = true;
/* Dimensions of the texture in pixels. */
int width = 0;
int height = 0;
/* Dimensions of the underlying PBO. */
int buffer_width = 0;
int buffer_height = 0;
/* Display parameters the texture has been updated for. */
Params params;
} texture_;
unique_ptr<BlenderDisplayShader> display_shader_;
/* Opaque storage for an internal state and data for tiles. */
struct Tiles;
unique_ptr<Tiles> tiles_;
/* Special track of whether GPU resources were attempted to be created, to avoid attempts of
* their re-creation on failure on every redraw. */
bool gl_draw_resource_creation_attempted_ = false;
bool gl_draw_resources_created_ = false;
/* Vertex buffer which hold vertices of a triangle fan which is textures with the texture
* holding the render result. */
uint vertex_buffer_ = 0;
void *gl_render_sync_ = nullptr;
void *gl_upload_sync_ = nullptr;

View File

@@ -1086,6 +1086,40 @@ static void create_subd_mesh(Scene *scene,
/* Sync */
/* Check whether some of "built-in" motion-related attributes are needed to be exported (includes
* things like velocity from cache modifier, fluid simulation).
*
* NOTE: This code is run prior to object motion blur initialization. so can not access properties
* set by `sync_object_motion_init()`. */
static bool mesh_need_motion_attribute(BObjectInfo &b_ob_info, Scene *scene)
{
const Scene::MotionType need_motion = scene->need_motion();
if (need_motion == Scene::MOTION_NONE) {
/* Simple case: neither motion pass nor motion blur is needed, no need in the motion related
* attributes. */
return false;
}
if (need_motion == Scene::MOTION_BLUR) {
/* A bit tricky and implicit case:
* - Motion blur is enabled in the scene, which implies specific number of time steps for
* objects.
* - If the object has motion blur disabled on it, it will have 0 time steps.
* - Motion attribute expects non-zero time steps.
*
* Avoid adding motion attributes if the motion blur will enforce 0 motion steps. */
PointerRNA cobject = RNA_pointer_get(&b_ob_info.real_object.ptr, "cycles");
const bool use_motion = get_boolean(cobject, "use_motion_blur");
if (!use_motion) {
return false;
}
}
/* Motion pass which implies 3 motion steps, or motion blur which is not disabled on object
* level. */
return true;
}
void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, Mesh *mesh)
{
/* make a copy of the shaders as the caller in the main thread still need them for syncing the
@@ -1110,7 +1144,7 @@ void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, M
if (b_mesh) {
/* Motion blur attribute is relative to seconds, we need it relative to frames. */
const bool need_motion = object_need_motion_attribute(b_ob_info, scene);
const bool need_motion = mesh_need_motion_attribute(b_ob_info, scene);
const float motion_scale = (need_motion) ?
scene->motion_shutter_time() /
(b_scene.render().fps() / b_scene.render().fps_base()) :

View File

@@ -120,7 +120,7 @@ void BlenderOutputDriver::write_render_tile(const Tile &tile)
b_pass.rect(&pixels[0]);
}
b_engine_.end_result(b_rr, false, false, true);
b_engine_.end_result(b_rr, true, false, true);
}
CCL_NAMESPACE_END

View File

@@ -37,52 +37,12 @@ static void fill_generic_attribute(BL::PointCloud &b_pointcloud,
}
}
static void attr_create_motion(PointCloud *pointcloud,
BL::Attribute &b_attribute,
const float motion_scale)
{
if (!(b_attribute.domain() == BL::Attribute::domain_POINT) &&
(b_attribute.data_type() == BL::Attribute::data_type_FLOAT_VECTOR)) {
return;
}
BL::FloatVectorAttribute b_vector_attribute(b_attribute);
const int num_points = pointcloud->get_points().size();
/* Find or add attribute */
float3 *P = &pointcloud->get_points()[0];
Attribute *attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (!attr_mP) {
attr_mP = pointcloud->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
}
/* Only export previous and next frame, we don't have any in between data. */
float motion_times[2] = {-1.0f, 1.0f};
for (int step = 0; step < 2; step++) {
const float relative_time = motion_times[step] * 0.5f * motion_scale;
float3 *mP = attr_mP->data_float3() + step * num_points;
for (int i = 0; i < num_points; i++) {
mP[i] = P[i] + get_float3(b_vector_attribute.data[i].vector()) * relative_time;
}
}
}
static void copy_attributes(PointCloud *pointcloud,
BL::PointCloud b_pointcloud,
const bool need_motion,
const float motion_scale)
static void copy_attributes(PointCloud *pointcloud, BL::PointCloud b_pointcloud)
{
AttributeSet &attributes = pointcloud->attributes;
static const ustring u_velocity("velocity");
for (BL::Attribute &b_attribute : b_pointcloud.attributes) {
const ustring name{b_attribute.name().c_str()};
if (need_motion && name == u_velocity) {
attr_create_motion(pointcloud, b_attribute, motion_scale);
}
if (attributes.find(name)) {
continue;
}
@@ -151,11 +111,7 @@ static void copy_attributes(PointCloud *pointcloud,
}
}
static void export_pointcloud(Scene *scene,
PointCloud *pointcloud,
BL::PointCloud b_pointcloud,
const bool need_motion,
const float motion_scale)
static void export_pointcloud(Scene *scene, PointCloud *pointcloud, BL::PointCloud b_pointcloud)
{
/* TODO: optimize so we can straight memcpy arrays from Blender? */
@@ -185,7 +141,7 @@ static void export_pointcloud(Scene *scene,
}
/* Export attributes */
copy_attributes(pointcloud, b_pointcloud, need_motion, motion_scale);
copy_attributes(pointcloud, b_pointcloud);
}
static void export_pointcloud_motion(PointCloud *pointcloud,
@@ -237,7 +193,7 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
}
/* Export attributes */
copy_attributes(pointcloud, b_pointcloud, false, 0.0f);
copy_attributes(pointcloud, b_pointcloud);
}
void BlenderSync::sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info)
@@ -251,13 +207,7 @@ void BlenderSync::sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info
/* TODO: add option to filter out points in the view layer. */
BL::PointCloud b_pointcloud(b_ob_info.object_data);
/* Motion blur attribute is relative to seconds, we need it relative to frames. */
const bool need_motion = object_need_motion_attribute(b_ob_info, scene);
const float motion_scale = (need_motion) ?
scene->motion_shutter_time() /
(b_scene.render().fps() / b_scene.render().fps_base()) :
0.0f;
export_pointcloud(scene, &new_pointcloud, b_pointcloud, need_motion, motion_scale);
export_pointcloud(scene, &new_pointcloud, b_pointcloud);
/* update original sockets */
for (const SocketType &socket : new_pointcloud.type->inputs) {

View File

@@ -138,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
static PyObject *init_func(PyObject * /*self*/, PyObject *args)
{
PyObject *path, *user_path;
PyObject *path, *user_path, *temp_path;
int headless;
if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) {
return nullptr;
}
PyObject *path_coerce = nullptr, *user_path_coerce = nullptr;
PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr;
path_init(PyC_UnicodeAsByte(path, &path_coerce),
PyC_UnicodeAsByte(user_path, &user_path_coerce));
PyC_UnicodeAsByte(user_path, &user_path_coerce),
PyC_UnicodeAsByte(temp_path, &temp_path_coerce));
Py_XDECREF(path_coerce);
Py_XDECREF(user_path_coerce);
Py_XDECREF(temp_path_coerce);
BlenderSession::headless = headless;
@@ -733,20 +735,27 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat
static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
{
#if 1
(void)args;
(void)keywords;
#else
static const char *keyword_list[] = {
"preferences", "scene", "view_layer", "input", "output", NULL};
"preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL};
PyObject *pypreferences, *pyscene, *pyviewlayer;
PyObject *pyinput, *pyoutput = NULL;
int tile_size = 0, samples = 0;
if (!PyArg_ParseTupleAndKeywords(args,
keywords,
"OOOO|O",
"OOOO|Oii",
(char **)keyword_list,
&pypreferences,
&pyscene,
&pyviewlayer,
&pyinput,
&pyoutput)) {
&pyoutput,
&tile_size,
&samples)) {
return NULL;
}
@@ -768,10 +777,14 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
&RNA_ViewLayer,
PyLong_AsVoidPtr(pyviewlayer),
&viewlayerptr);
BL::ViewLayer b_view_layer(viewlayerptr);
PointerRNA cviewlayer = RNA_pointer_get(&viewlayerptr, "cycles");
DenoiseParams params = BlenderSync::get_denoise_params(b_scene, b_view_layer, true);
params.use = true;
DenoiseParams params;
params.radius = get_int(cviewlayer, "denoising_radius");
params.strength = get_float(cviewlayer, "denoising_strength");
params.feature_strength = get_float(cviewlayer, "denoising_feature_strength");
params.relative_pca = get_boolean(cviewlayer, "denoising_relative_pca");
params.neighbor_frames = get_int(cviewlayer, "denoising_neighbor_frames");
/* Parse file paths list. */
vector<string> input, output;
@@ -799,15 +812,24 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
}
/* Create denoiser. */
DenoiserPipeline denoiser(device, params);
DenoiserPipeline denoiser(device);
denoiser.params = params;
denoiser.input = input;
denoiser.output = output;
if (tile_size > 0) {
denoiser.tile_size = make_int2(tile_size, tile_size);
}
if (samples > 0) {
denoiser.samples_override = samples;
}
/* Run denoiser. */
if (!denoiser.run()) {
PyErr_SetString(PyExc_ValueError, denoiser.error.c_str());
return NULL;
}
#endif
Py_RETURN_NONE;
}

View File

@@ -502,15 +502,10 @@ void BlenderSession::render_frame_finish()
path_remove(filename);
}
/* Clear output driver. */
/* Clear driver. */
session->set_output_driver(nullptr);
session->full_buffer_written_cb = function_null;
/* The display driver holds OpenGL resources which belong to an OpenGL context held by the render
* engine on Blender side. Force destruction of those resources. */
display_driver_ = nullptr;
session->set_display_driver(nullptr);
/* All the files are handled.
* Clear the list so that this session can be re-used by Persistent Data. */
full_buffer_files_.clear();

View File

@@ -776,7 +776,7 @@ static ShaderNode *add_node(Scene *scene,
}
else {
ustring filename = ustring(
image_user_file_path(b_image_user, b_image, b_scene.frame_current()));
image_user_file_path(b_image_user, b_image, b_scene.frame_current(), true));
image->set_filename(filename);
}
}
@@ -813,7 +813,7 @@ static ShaderNode *add_node(Scene *scene,
}
else {
env->set_filename(
ustring(image_user_file_path(b_image_user, b_image, b_scene.frame_current())));
ustring(image_user_file_path(b_image_user, b_image, b_scene.frame_current(), false)));
}
}
node = env;

View File

@@ -832,14 +832,6 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
if (background && !b_engine.is_preview()) {
/* Viewport and preview renders do not require temp directory and do request session
* parameters more often than the background render.
* Optimize RNA-C++ usage and memory allocation a bit by saving string access which we know is
* not needed for viewport render. */
params.temp_dir = b_engine.temporary_directory();
}
/* feature set */
params.experimental = (get_enum(cscene, "feature_set") != 0);

View File

@@ -105,11 +105,11 @@ class BlenderSync {
static BufferParams get_buffer_params(
BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height);
private:
static DenoiseParams get_denoise_params(BL::Scene &b_scene,
BL::ViewLayer &b_view_layer,
bool background);
private:
/* sync */
void sync_lights(BL::Depsgraph &b_depsgraph, bool update_all);
void sync_materials(BL::Depsgraph &b_depsgraph, bool update_all);

View File

@@ -18,7 +18,6 @@
#define __BLENDER_UTIL_H__
#include "scene/mesh.h"
#include "scene/scene.h"
#include "util/algorithm.h"
#include "util/array.h"
@@ -34,7 +33,7 @@
extern "C" {
void BKE_image_user_frame_calc(void *ima, void *iuser, int cfra);
void BKE_image_user_file_path_ex(void *iuser, void *ima, char *path, bool resolve_udim);
void BKE_image_user_file_path(void *iuser, void *ima, char *path);
unsigned char *BKE_image_get_pixels_for_frame(void *image, int frame, int tile);
float *BKE_image_get_float_pixels_for_frame(void *image, int frame, int tile);
}
@@ -291,14 +290,25 @@ static inline int render_resolution_y(BL::RenderSettings &b_render)
return b_render.resolution_y() * b_render.resolution_percentage() / 100;
}
static inline string image_user_file_path(BL::ImageUser &iuser, BL::Image &ima, int cfra)
static inline string image_user_file_path(BL::ImageUser &iuser,
BL::Image &ima,
int cfra,
bool load_tiled)
{
char filepath[1024];
iuser.tile(0);
BKE_image_user_frame_calc(ima.ptr.data, iuser.ptr.data, cfra);
BKE_image_user_file_path_ex(iuser.ptr.data, ima.ptr.data, filepath, false);
BKE_image_user_file_path(iuser.ptr.data, ima.ptr.data, filepath);
return string(filepath);
string filepath_str = string(filepath);
if (load_tiled && ima.source() == BL::Image::source_TILED) {
string udim;
if (!ima.tiles.empty()) {
udim = to_string(ima.tiles[0].number());
}
string_replace(filepath_str, udim, "<UDIM>");
}
return filepath_str;
}
static inline int image_user_frame_number(BL::ImageUser &iuser, BL::Image &ima, int cfra)
@@ -671,40 +681,6 @@ static inline uint object_ray_visibility(BL::Object &b_ob)
return flag;
}
/* Check whether some of "built-in" motion-related attributes are needed to be exported (includes
* things like velocity from cache modifier, fluid simulation).
*
* NOTE: This code is run prior to object motion blur initialization. so can not access properties
* set by `sync_object_motion_init()`. */
static inline bool object_need_motion_attribute(BObjectInfo &b_ob_info, Scene *scene)
{
const Scene::MotionType need_motion = scene->need_motion();
if (need_motion == Scene::MOTION_NONE) {
/* Simple case: neither motion pass nor motion blur is needed, no need in the motion related
* attributes. */
return false;
}
if (need_motion == Scene::MOTION_BLUR) {
/* A bit tricky and implicit case:
* - Motion blur is enabled in the scene, which implies specific number of time steps for
* objects.
* - If the object has motion blur disabled on it, it will have 0 time steps.
* - Motion attribute expects non-zero time steps.
*
* Avoid adding motion attributes if the motion blur will enforce 0 motion steps. */
PointerRNA cobject = RNA_pointer_get(&b_ob_info.real_object.ptr, "cycles");
const bool use_motion = get_boolean(cobject, "use_motion_blur");
if (!use_motion) {
return false;
}
}
/* Motion pass which implies 3 motion steps, or motion blur which is not disabled on object
* level. */
return true;
}
class EdgeMap {
public:
EdgeMap()
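One side of the image_user_file_path() hunk above rewrites a tiled image path so the first tile number becomes the <UDIM> token. A minimal standalone sketch of that substitution follows, using std::string instead of Cycles' string_replace() and a made-up file name; it is an illustration of the idea, not the actual helper.

#include <string>

/* Replace the first occurrence of the tile number with the <UDIM> token. */
static std::string make_udim_template(std::string filepath, int first_tile)
{
  const std::string tile = std::to_string(first_tile);
  const std::string::size_type pos = filepath.find(tile);
  if (pos != std::string::npos) {
    filepath.replace(pos, tile.size(), "<UDIM>");
  }
  return filepath;
}

/* make_udim_template("//textures/wood.1001.png", 1001)
 * -> "//textures/wood.<UDIM>.png" */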

View File

@@ -656,24 +656,24 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range,
for (int i = 0; i < size; i++) {
const BVHReference &ref = references[range.start() + i];
if (ref.prim_type() & PRIMITIVE_CURVE) {
if (ref.prim_type() & PRIMITIVE_MOTION) {
if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
num_motion_curves++;
}
else {
num_curves++;
}
}
else if (ref.prim_type() & PRIMITIVE_TRIANGLE) {
if (ref.prim_type() & PRIMITIVE_MOTION) {
else if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
num_motion_triangles++;
}
else {
num_triangles++;
}
}
else if (ref.prim_type() & PRIMITIVE_POINT) {
if (ref.prim_type() & PRIMITIVE_MOTION) {
else if (ref.prim_type() & PRIMITIVE_ALL_POINT) {
if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
num_motion_points++;
}
else {
@@ -973,7 +973,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
for (int i = 0; i < range.size(); i++) {
const BVHReference &ref = references[range.start() + i];
if (ref.prim_index() != -1) {
uint32_t type_index = PRIMITIVE_INDEX(ref.prim_type() & PRIMITIVE_ALL);
uint32_t type_index = bitscan((uint32_t)(ref.prim_type() & PRIMITIVE_ALL));
p_ref[type_index].push_back(ref);
p_type[type_index].push_back(ref.prim_type());
p_index[type_index].push_back(ref.prim_index());
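The create_leaf_node hunk above derives a per-type bucket index from the primitive type bits; assuming each primitive type occupies a single bit, a bit scan of the masked type yields that index. The sketch below uses made-up flag values and a portable stand-in for bitscan().

#include <cstdint>

/* Hypothetical single-bit type flags, one bit per primitive type. */
enum : uint32_t {
  EXAMPLE_TRIANGLE = 1u << 0,
  EXAMPLE_CURVE = 1u << 1,
  EXAMPLE_POINT = 1u << 2,
  EXAMPLE_ALL = (1u << 3) - 1,
};

/* Index of the lowest set bit; precondition: x has at least one bit set. */
static uint32_t lowest_bit_index(uint32_t x)
{
  uint32_t i = 0;
  while ((x & 1u) == 0u) {
    x >>= 1u;
    ++i;
  }
  return i;
}

/* lowest_bit_index(EXAMPLE_CURVE & EXAMPLE_ALL) == 1, so curve references
 * land in the second per-type bucket (p_ref/p_type/p_index arrays above). */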

View File

@@ -387,7 +387,7 @@ void BVH2::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility
}
else {
/* Primitives. */
if (pack.prim_type[prim] & PRIMITIVE_CURVE) {
if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
/* Curves. */
const Hair *hair = static_cast<const Hair *>(ob->get_geometry());
int prim_offset = (params.top_level) ? hair->prim_offset : 0;
@@ -410,7 +410,7 @@ void BVH2::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility
}
}
}
else if (pack.prim_type[prim] & PRIMITIVE_POINT) {
else if (pack.prim_type[prim] & PRIMITIVE_ALL_POINT) {
/* Points. */
const PointCloud *pointcloud = static_cast<const PointCloud *>(ob->get_geometry());
int prim_offset = (params.top_level) ? pointcloud->prim_offset : 0;
@@ -590,7 +590,13 @@ void BVH2::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
float2 *bvh_prim_time = bvh->pack.prim_time.size() ? &bvh->pack.prim_time[0] : NULL;
for (size_t i = 0; i < bvh_prim_index_size; i++) {
pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
}
else {
pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
}
pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
pack_prim_object[pack_prim_index_offset] = 0; // unused for instances

View File

@@ -91,7 +91,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
++ctx->num_hits;
/* Always use baked shadow transparency for curves. */
if (current_isect.type & PRIMITIVE_CURVE) {
if (current_isect.type & PRIMITIVE_ALL_CURVE) {
ctx->throughput *= intersection_curve_shadow_transparency(
kg, current_isect.object, current_isect.prim, current_isect.u);

View File

@@ -535,15 +535,15 @@ void BVHSpatialSplit::split_reference(const BVHBuild &builder,
/* loop over vertices/edges. */
const Object *ob = builder.objects[ref.prim_object()];
if (ref.prim_type() & PRIMITIVE_TRIANGLE) {
if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
Mesh *mesh = static_cast<Mesh *>(ob->get_geometry());
split_triangle_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
}
else if (ref.prim_type() & PRIMITIVE_CURVE) {
else if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
Hair *hair = static_cast<Hair *>(ob->get_geometry());
split_curve_reference(ref, hair, dim, pos, left_bounds, right_bounds);
}
else if (ref.prim_type() & PRIMITIVE_POINT) {
else if (ref.prim_type() & PRIMITIVE_ALL_POINT) {
PointCloud *pointcloud = static_cast<PointCloud *>(ob->get_geometry());
split_point_reference(ref, pointcloud, dim, pos, left_bounds, right_bounds);
}

View File

@@ -69,7 +69,7 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali
const int packed_type = ref.prim_type();
const int type = (packed_type & PRIMITIVE_ALL);
/* No motion blur curves here, we can't fit them to aligned boxes well. */
if ((type & PRIMITIVE_CURVE) && !(type & PRIMITIVE_MOTION)) {
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
const int curve_index = ref.prim_index();
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
const Hair *hair = static_cast<const Hair *>(object->get_geometry());
@@ -95,7 +95,7 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim,
const int packed_type = prim.prim_type();
const int type = (packed_type & PRIMITIVE_ALL);
/* No motion blur curves here, we can't fit them to aligned boxes well. */
if ((type & PRIMITIVE_CURVE) && !(type & PRIMITIVE_MOTION)) {
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
const int curve_index = prim.prim_index();
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
const Hair *hair = static_cast<const Hair *>(object->get_geometry());

View File

@@ -84,6 +84,39 @@ macro(cycles_add_library target library_deps)
cycles_set_solution_folder(${target})
endmacro()
# Cycles library dependencies common to all executables
function(cycles_link_directories)
if(APPLE)
# APPLE platform uses full paths for linking libraries, and avoids link_directories.
return()
endif()
if(WITH_OPENCOLORIO)
link_directories(${OPENCOLORIO_LIBPATH})
endif()
if(WITH_OPENVDB)
link_directories(${OPENVDB_LIBPATH} ${BLOSC_LIBPATH})
endif()
if(WITH_OPENSUBDIV)
link_directories(${OPENSUBDIV_LIBPATH})
endif()
if(WITH_OPENIMAGEDENOISE)
link_directories(${OPENIMAGEDENOISE_LIBPATH})
endif()
link_directories(
${OPENIMAGEIO_LIBPATH}
${BOOST_LIBPATH}
${PNG_LIBPATH}
${JPEG_LIBPATH}
${ZLIB_LIBPATH}
${TIFF_LIBPATH}
${OPENEXR_LIBPATH}
${OPENJPEG_LIBPATH}
)
endfunction()
macro(cycles_target_link_libraries target)
if(WITH_CYCLES_LOGGING)
target_link_libraries(${target} ${GLOG_LIBRARIES} ${GFLAGS_LIBRARIES})
@@ -135,6 +168,12 @@ macro(cycles_target_link_libraries target)
target_link_libraries(${target} extern_hipew)
endif()
if(CYCLES_STANDALONE_REPOSITORY)
target_link_libraries(${target} extern_numaapi)
else()
target_link_libraries(${target} bf_intern_numaapi)
endif()
if(UNIX AND NOT APPLE)
if(CYCLES_STANDALONE_REPOSITORY)
target_link_libraries(${target} extern_libc_compat)

View File

@@ -72,7 +72,7 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
<< " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::max_concurrency();
info.cpu_threads = TaskScheduler::num_threads();
}
#ifdef WITH_OSL

View File

@@ -45,10 +45,8 @@ void CUDADeviceGraphicsInterop::set_display_interop(
need_clear_ = display_interop.need_clear;
if (!display_interop.need_recreate) {
if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
return;
}
if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
return;
}
CUDAContextScope scope(device_);

View File

@@ -76,8 +76,6 @@ NODE_DEFINE(DenoiseParams)
SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
SOCKET_BOOLEAN(temporally_stable, "Temporally Stable", false);
SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
return type;

View File

@@ -72,9 +72,6 @@ class DenoiseParams : public Node {
bool use_pass_albedo = true;
bool use_pass_normal = true;
/* Configure the denoiser to use motion vectors, previous image and a temporally stable model. */
bool temporally_stable = false;
DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
static const NodeEnum *get_type_enum();
@@ -86,8 +83,7 @@ class DenoiseParams : public Node {
{
return !(use == other.use && type == other.type && start_sample == other.start_sample &&
use_pass_albedo == other.use_pass_albedo &&
use_pass_normal == other.use_pass_normal &&
temporally_stable == other.temporally_stable && prefilter == other.prefilter);
use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
}
};

View File

@@ -37,7 +37,6 @@
#include "util/math.h"
#include "util/string.h"
#include "util/system.h"
#include "util/task.h"
#include "util/time.h"
#include "util/types.h"
#include "util/vector.h"
@@ -334,7 +333,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
/* Ensure CPU device does not slow down GPU. */
if (device.type == DEVICE_CPU && subdevices.size() > 1) {
if (background) {
int orig_cpu_threads = (threads) ? threads : TaskScheduler::max_concurrency();
int orig_cpu_threads = (threads) ? threads : system_cpu_thread_count();
int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0);
VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads
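A worked example of the CPU thread-count clamp above, with made-up numbers; the helper is a standalone illustration, not the multi-device code itself.

#include <algorithm>
#include <cstddef>

/* e.g. 16 CPU threads and 3 sub-devices (2 GPUs + CPU) gives
 * max(16 - 2, 0) = 14 render threads left for the CPU device. */
static int reduced_cpu_threads(int orig_cpu_threads, std::size_t num_subdevices)
{
  return std::max(orig_cpu_threads - static_cast<int>(num_subdevices - 1), 0);
}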

View File

@@ -566,19 +566,6 @@ class OptiXDevice::DenoiseContext {
}
}
if (denoise_params.temporally_stable) {
prev_output.device_pointer = render_buffers->buffer.device_pointer;
prev_output.offset = buffer_params.get_pass_offset(PASS_DENOISING_PREVIOUS);
prev_output.stride = buffer_params.stride;
prev_output.pass_stride = buffer_params.pass_stride;
num_input_passes += 1;
use_pass_flow = true;
pass_motion = buffer_params.get_pass_offset(PASS_MOTION);
}
use_guiding_passes = (num_input_passes - 1) > 0;
if (use_guiding_passes) {
@@ -587,7 +574,6 @@ class OptiXDevice::DenoiseContext {
guiding_params.pass_albedo = pass_denoising_albedo;
guiding_params.pass_normal = pass_denoising_normal;
guiding_params.pass_flow = pass_motion;
guiding_params.stride = buffer_params.stride;
guiding_params.pass_stride = buffer_params.pass_stride;
@@ -602,10 +588,6 @@ class OptiXDevice::DenoiseContext {
guiding_params.pass_normal = guiding_params.pass_stride;
guiding_params.pass_stride += 3;
}
if (use_pass_flow) {
guiding_params.pass_flow = guiding_params.pass_stride;
guiding_params.pass_stride += 2;
}
guiding_params.stride = buffer_params.width;
@@ -623,16 +605,6 @@ class OptiXDevice::DenoiseContext {
RenderBuffers *render_buffers = nullptr;
const BufferParams &buffer_params;
/* Previous output. */
struct {
device_ptr device_pointer = 0;
int offset = PASS_UNUSED;
int stride = -1;
int pass_stride = -1;
} prev_output;
/* Device-side storage of the guiding passes. */
device_only_memory<float> guiding_buffer;
@@ -642,7 +614,6 @@ class OptiXDevice::DenoiseContext {
/* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
int pass_albedo = PASS_UNUSED;
int pass_normal = PASS_UNUSED;
int pass_flow = PASS_UNUSED;
int stride = -1;
int pass_stride = -1;
@@ -653,7 +624,6 @@ class OptiXDevice::DenoiseContext {
bool use_guiding_passes = false;
bool use_pass_albedo = false;
bool use_pass_normal = false;
bool use_pass_flow = false;
int num_samples = 0;
@@ -662,7 +632,6 @@ class OptiXDevice::DenoiseContext {
/* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
int pass_denoising_albedo = PASS_UNUSED;
int pass_denoising_normal = PASS_UNUSED;
int pass_motion = PASS_UNUSED;
/* For passes which don't need albedo channel for denoising we replace the actual albedo with
* the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
@@ -733,7 +702,6 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
&context.guiding_params.pass_stride,
&context.guiding_params.pass_albedo,
&context.guiding_params.pass_normal,
&context.guiding_params.pass_flow,
&context.render_buffers->buffer.device_pointer,
&buffer_params.offset,
&buffer_params.stride,
@@ -741,7 +709,6 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
&context.pass_sample_count,
&context.pass_denoising_albedo,
&context.pass_denoising_normal,
&context.pass_motion,
&buffer_params.full_x,
&buffer_params.full_y,
&buffer_params.width,
@@ -914,8 +881,7 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
{
const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
(denoiser_.use_pass_albedo != context.use_pass_albedo) ||
(denoiser_.use_pass_normal != context.use_pass_normal) ||
(denoiser_.use_pass_flow != context.use_pass_flow);
(denoiser_.use_pass_normal != context.use_pass_normal);
if (!recreate_denoiser) {
return true;
}
@@ -929,14 +895,8 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
OptixDenoiserOptions denoiser_options = {};
denoiser_options.guideAlbedo = context.use_pass_albedo;
denoiser_options.guideNormal = context.use_pass_normal;
OptixDenoiserModelKind model = OPTIX_DENOISER_MODEL_KIND_HDR;
if (context.use_pass_flow) {
model = OPTIX_DENOISER_MODEL_KIND_TEMPORAL;
}
const OptixResult result = optixDenoiserCreate(
this->context, model, &denoiser_options, &denoiser_.optix_denoiser);
this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
if (result != OPTIX_SUCCESS) {
set_error("Failed to create OptiX denoiser");
@@ -946,7 +906,6 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
/* OptiX denoiser handle was created with the requested number of input passes. */
denoiser_.use_pass_albedo = context.use_pass_albedo;
denoiser_.use_pass_normal = context.use_pass_normal;
denoiser_.use_pass_flow = context.use_pass_flow;
/* OptiX denoiser has been created, but it needs configuration. */
denoiser_.is_configured = false;
@@ -1006,10 +965,8 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
OptixImage2D color_layer = {0};
OptixImage2D albedo_layer = {0};
OptixImage2D normal_layer = {0};
OptixImage2D flow_layer = {0};
OptixImage2D output_layer = {0};
OptixImage2D prev_output_layer = {0};
/* Color pass. */
{
@@ -1025,19 +982,6 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
}
/* Previous output. */
if (context.prev_output.offset != PASS_UNUSED) {
const int64_t pass_stride_in_bytes = context.prev_output.pass_stride * sizeof(float);
prev_output_layer.data = context.prev_output.device_pointer +
context.prev_output.offset * sizeof(float);
prev_output_layer.width = width;
prev_output_layer.height = height;
prev_output_layer.rowStrideInBytes = pass_stride_in_bytes * context.prev_output.stride;
prev_output_layer.pixelStrideInBytes = pass_stride_in_bytes;
prev_output_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
}
/* Optional albedo and color passes. */
if (context.num_input_passes > 1) {
const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
@@ -1061,32 +1005,21 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
}
if (context.use_pass_flow) {
flow_layer.data = d_guiding_buffer + context.guiding_params.pass_flow * sizeof(float);
flow_layer.width = width;
flow_layer.height = height;
flow_layer.rowStrideInBytes = row_stride_in_bytes;
flow_layer.pixelStrideInBytes = pixel_stride_in_bytes;
flow_layer.format = OPTIX_PIXEL_FORMAT_FLOAT2;
}
}
/* Denoise in-place of the noisy input in the render buffers. */
output_layer = color_layer;
OptixDenoiserGuideLayer guide_layers = {};
guide_layers.albedo = albedo_layer;
guide_layers.normal = normal_layer;
guide_layers.flow = flow_layer;
/* Finally run denoising. */
OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
OptixDenoiserLayer image_layers = {};
image_layers.input = color_layer;
image_layers.previousOutput = prev_output_layer;
image_layers.output = output_layer;
/* Finally run denoising. */
OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
OptixDenoiserGuideLayer guide_layers = {};
guide_layers.albedo = albedo_layer;
guide_layers.normal = normal_layer;
optix_assert(optixUtilDenoiserInvokeTiled(denoiser_.optix_denoiser,
denoiser_.queue.stream(),

View File

@@ -104,7 +104,6 @@ class OptiXDevice : public CUDADevice {
bool use_pass_albedo = false;
bool use_pass_normal = false;
bool use_pass_flow = false;
};
Denoiser denoiser_;

View File

@@ -19,7 +19,6 @@
#include "device/kernel.h"
#include "device/graphics_interop.h"
#include "util/debug.h"
#include "util/log.h"
#include "util/map.h"
#include "util/string.h"
@@ -43,7 +42,7 @@ struct DeviceKernelArguments {
KERNEL_FILM_CONVERT,
};
static const int MAX_ARGS = 18;
static const int MAX_ARGS = 16;
Type types[MAX_ARGS];
void *values[MAX_ARGS];
size_t sizes[MAX_ARGS];
@@ -86,8 +85,6 @@ struct DeviceKernelArguments {
}
void add(const Type type, const void *value, size_t size)
{
assert(count < MAX_ARGS);
types[count] = type;
values[count] = (void *)value;
sizes[count] = size;
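The hunk above touches a fixed-capacity kernel-argument container; a minimal sketch of that pattern, with a made-up capacity and the debug-build bounds assert, is shown below for orientation.

#include <cassert>
#include <cstddef>

struct ExampleArgs {
  static const int MAX_ARGS = 4; /* made-up capacity for illustration */
  void *values[MAX_ARGS];
  std::size_t sizes[MAX_ARGS];
  int count = 0;

  void add(void *value, std::size_t size)
  {
    /* Overflowing the fixed slots is a programming error, caught in debug. */
    assert(count < MAX_ARGS);
    values[count] = value;
    sizes[count] = size;
    count++;
  }
};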

View File

@@ -115,9 +115,7 @@ bool PathTrace::ready_to_reset()
return false;
}
void PathTrace::reset(const BufferParams &full_params,
const BufferParams &big_tile_params,
const bool reset_rendering)
void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
{
if (big_tile_params_.modified(big_tile_params)) {
big_tile_params_ = big_tile_params;
@@ -130,7 +128,7 @@ void PathTrace::reset(const BufferParams &full_params,
* It is requires to inform about reset whenever it happens, so that the redraw state tracking is
* properly updated. */
if (display_) {
display_->reset(big_tile_params, reset_rendering);
display_->reset(full_params);
}
render_state_.has_denoised_result = false;
@@ -596,15 +594,6 @@ void PathTrace::draw()
did_draw_after_reset_ |= display_->draw();
}
void PathTrace::flush_display()
{
if (!display_) {
return;
}
display_->flush();
}
void PathTrace::update_display(const RenderWork &render_work)
{
if (!render_work.display.update) {
@@ -633,8 +622,9 @@ void PathTrace::update_display(const RenderWork &render_work)
if (display_) {
VLOG(3) << "Perform copy to GPUDisplay work.";
const int texture_width = render_state_.effective_big_tile_params.window_width;
const int texture_height = render_state_.effective_big_tile_params.window_height;
const int resolution_divider = render_work.resolution_divider;
const int texture_width = max(1, full_params_.width / resolution_divider);
const int texture_height = max(1, full_params_.height / resolution_divider);
if (!display_->update_begin(texture_width, texture_height)) {
LOG(ERROR) << "Error beginning GPUDisplay update.";
return;
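A worked example of the display texture sizing in the update_display hunk above, as a standalone helper with illustrative numbers.

#include <algorithm>

/* e.g. a 1920-wide buffer with resolution_divider 4 gives max(1, 1920 / 4)
 * = 480; the max() keeps the texture at least one pixel wide. */
static int display_texture_extent(int full_extent, int resolution_divider)
{
  return std::max(1, full_extent / resolution_divider);
}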

View File

@@ -72,9 +72,7 @@ class PathTrace {
* render result. */
bool ready_to_reset();
void reset(const BufferParams &full_params,
const BufferParams &big_tile_params,
bool reset_rendering);
void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
void device_free();
@@ -114,9 +112,6 @@ class PathTrace {
/* Perform drawing of the current state of the DisplayDriver. */
void draw();
/* Flush outstanding display commands before ending the render loop. */
void flush_display();
/* Cancel rendering process as soon as possible, without waiting for full tile to be sampled.
* Used in cases like reset of render session.
*

View File

@@ -26,20 +26,15 @@ PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(m
{
}
void PathTraceDisplay::reset(const BufferParams &buffer_params, const bool reset_rendering)
void PathTraceDisplay::reset(const BufferParams &buffer_params)
{
thread_scoped_lock lock(mutex_);
params_.full_offset = make_int2(buffer_params.full_x + buffer_params.window_x,
buffer_params.full_y + buffer_params.window_y);
params_.full_offset = make_int2(buffer_params.full_x, buffer_params.full_y);
params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
params_.size = make_int2(buffer_params.window_width, buffer_params.window_height);
params_.size = make_int2(buffer_params.width, buffer_params.height);
texture_state_.is_outdated = true;
if (!reset_rendering) {
driver_->next_tile_begin();
}
}
void PathTraceDisplay::mark_texture_updated()
@@ -253,9 +248,4 @@ bool PathTraceDisplay::draw()
return !is_outdated;
}
void PathTraceDisplay::flush()
{
driver_->flush();
}
CCL_NAMESPACE_END

View File

@@ -38,17 +38,14 @@ class BufferParams;
class PathTraceDisplay {
public:
explicit PathTraceDisplay(unique_ptr<DisplayDriver> driver);
PathTraceDisplay(unique_ptr<DisplayDriver> driver);
virtual ~PathTraceDisplay() = default;
/* Reset the display for the new state of render session. Is called whenever session is reset,
* which happens on changes like viewport navigation or viewport dimension change.
*
* This call will configure parameters for a changed buffer and reset the texture state.
*
* When the `reset_rendering` a complete display reset happens. When it is false reset happens
* for a new state of the buffer parameters which is assumed to correspond to the next tile. */
void reset(const BufferParams &buffer_params, bool reset_rendering);
* This call will configure parameters for a changed buffer and reset the texture state. */
void reset(const BufferParams &buffer_params);
/* --------------------------------------------------------------------
* Update procedure.
@@ -154,9 +151,6 @@ class PathTraceDisplay {
* Returns true if this call did draw an updated state of the texture. */
bool draw();
/* Flush outstanding display commands before ending the render loop. */
void flush();
private:
/* Display driver implemented by the host application. */
unique_ptr<DisplayDriver> driver_;

View File

@@ -194,10 +194,10 @@ PassAccessor::Destination PathTraceWork::get_display_destination_template(
PassAccessor::Destination destination(film_->get_display_pass());
const int2 display_texture_size = display->get_texture_size();
const int texture_x = effective_buffer_params_.full_x - effective_big_tile_params_.full_x +
effective_buffer_params_.window_x - effective_big_tile_params_.window_x;
const int texture_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y +
effective_buffer_params_.window_y - effective_big_tile_params_.window_y;
const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x +
effective_buffer_params_.window_x;
const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y +
effective_buffer_params_.window_y;
destination.offset = texture_y * display_texture_size.x + texture_x;
destination.stride = display_texture_size.x;

View File

@@ -875,10 +875,8 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
const int final_width = buffers_->params.window_width;
const int final_height = buffers_->params.window_height;
const int texture_x = full_x - effective_big_tile_params_.full_x +
effective_buffer_params_.window_x - effective_big_tile_params_.window_x;
const int texture_y = full_y - effective_big_tile_params_.full_y +
effective_buffer_params_.window_y - effective_big_tile_params_.window_y;
const int texture_x = full_x - effective_full_params_.full_x + effective_buffer_params_.window_x;
const int texture_y = full_y - effective_full_params_.full_y + effective_buffer_params_.window_y;
/* Re-allocate display memory if needed, and make sure the device pointer is allocated.
*

View File

@@ -406,6 +406,9 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
any_scheduled = true;
}
/* Force update. */
any_scheduled = true;
if (any_scheduled) {
render_work->display.update = true;
}

View File

@@ -283,7 +283,7 @@ class RenderScheduler {
/* Check whether timing report about the given work need to reset accumulated average time. */
bool work_report_reset_average(const RenderWork &render_work);
/* Check whether render time limit has been reached (or exceeded), and if so store related
/* CHeck whether render time limit has been reached (or exceeded), and if so store related
* information in the state so that rendering is considered finished, and is possible to report
* average render time information. */
void check_time_limit_reached();

View File

@@ -174,7 +174,7 @@ ccl_device_inline
case PRIMITIVE_MOTION_CURVE_THICK:
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
if (ray->time < prim_time.x || ray->time > prim_time.y) {
hit = false;
@@ -203,7 +203,7 @@ ccl_device_inline
#if BVH_FEATURE(BVH_POINTCLOUD)
case PRIMITIVE_POINT:
case PRIMITIVE_MOTION_POINT: {
if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
if (ray->time < prim_time.x || ray->time > prim_time.y) {
hit = false;
@@ -255,7 +255,7 @@ ccl_device_inline
bool record_intersection = true;
/* Always use baked shadow transparency for curves. */
if (isect.type & PRIMITIVE_CURVE) {
if (isect.type & PRIMITIVE_ALL_CURVE) {
*throughput *= intersection_curve_shadow_transparency(
kg, isect.object, isect.prim, isect.u);

View File

@@ -166,7 +166,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
for (; prim_addr < prim_addr2; prim_addr++) {
if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
if (ray->time < prim_time.x || ray->time > prim_time.y) {
continue;
@@ -193,7 +193,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
case PRIMITIVE_POINT:
case PRIMITIVE_MOTION_POINT: {
for (; prim_addr < prim_addr2; prim_addr++) {
if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
if (ray->time < prim_time.x || ray->time > prim_time.y) {
continue;

View File

@@ -118,16 +118,16 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals kg,
{
int shader = 0;
if (type & PRIMITIVE_TRIANGLE) {
if (type & PRIMITIVE_ALL_TRIANGLE) {
shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __POINTCLOUD__
else if (type & PRIMITIVE_POINT) {
else if (type & PRIMITIVE_ALL_POINT) {
shader = kernel_tex_fetch(__points_shader, prim);
}
#endif
#ifdef __HAIR__
else if (type & PRIMITIVE_CURVE) {
else if (type & PRIMITIVE_ALL_CURVE) {
shader = kernel_tex_fetch(__curves, prim).shader_id;
}
#endif
@@ -141,16 +141,16 @@ ccl_device_forceinline int intersection_get_shader_from_isect_prim(KernelGlobals
{
int shader = 0;
if (isect_type & PRIMITIVE_TRIANGLE) {
if (isect_type & PRIMITIVE_ALL_TRIANGLE) {
shader = kernel_tex_fetch(__tri_shader, prim);
}
#ifdef __POINTCLOUD__
else if (isect_type & PRIMITIVE_POINT) {
else if (isect_type & PRIMITIVE_ALL_POINT) {
shader = kernel_tex_fetch(__points_shader, prim);
}
#endif
#ifdef __HAIR__
else if (isect_type & PRIMITIVE_CURVE) {
else if (isect_type & PRIMITIVE_ALL_CURVE) {
shader = kernel_tex_fetch(__curves, prim).shader_id;
}
#endif

View File

@@ -124,7 +124,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
/* For curves use the smooth normal, particularly for ribbons the geometric
* normal gives too much darkening otherwise. */
int label;
const float3 Ng = (sd->type & PRIMITIVE_CURVE) ? sc->N : sd->Ng;
const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sc->N : sd->Ng;
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:

View File

@@ -213,7 +213,9 @@ ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,
/* TODO: we convert this value to a cosine later and discard the sign, so
* we could probably save some operations. */
float h = (sd->type & PRIMITIVE_CURVE_RIBBON) ? -sd->v : dot(cross(sd->Ng, X), Z);
float h = (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) ?
-sd->v :
dot(cross(sd->Ng, X), Z);
kernel_assert(fabsf(h) < 1.0f + 1e-4f);
kernel_assert(isfinite3_safe(Y));

View File

@@ -243,10 +243,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
}
#ifdef __KERNEL_METAL__
constant int __dummy_constant [[function_constant(0)]];
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
ccl_global const int *path_index_array,
@@ -257,16 +253,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
#ifdef __KERNEL_METAL__
KernelGlobals kg = NULL;
/* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
* Dummy offset should not affect result, but somehow fixes bug! */
kg += __dummy_constant;
ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
#else
ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
#endif
}
}
@@ -769,7 +756,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
int guiding_pass_stride,
int guiding_pass_albedo,
int guiding_pass_normal,
int guiding_pass_flow,
ccl_global const float *render_buffer,
int render_offset,
int render_stride,
@@ -777,7 +763,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
int render_pass_sample_count,
int render_pass_denoising_albedo,
int render_pass_denoising_normal,
int render_pass_motion,
int full_x,
int full_y,
int width,
@@ -829,17 +814,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
normal_out[1] = normal_in[1] * pixel_scale;
normal_out[2] = normal_in[2] * pixel_scale;
}
/* Flow pass. */
if (guiding_pass_flow != PASS_UNUSED) {
kernel_assert(render_pass_motion != PASS_UNUSED);
ccl_global const float *motion_in = buffer + render_pass_motion;
ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
flow_out[0] = -motion_in[0] * pixel_scale;
flow_out[1] = -motion_in[1] * pixel_scale;
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -925,6 +899,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
else {
/* Assigning to zero since this is a default alpha value for 3-component passes, and it
* is an opaque pixel for 4 component passes. */
denoised_pixel[3] = 0;
}
}

View File

@@ -98,12 +98,8 @@ using namespace metal::raytracing;
#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
/* Generate a struct containing the entry-point parameters and a "run"
* method which can access them implicitly via this-> */
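The FN*/GET_LAST_ARG definitions above rely on a variadic argument-counting trick; the standalone sketch below, capped at three arguments and using made-up macro names, shows the mechanism.

/* Count up to three macro arguments: the literal count list is shifted so the
 * value aligned with the number of __VA_ARGS__ entries lands in the fixed
 * position picked by EXAMPLE_GET_LAST. */
#define EXAMPLE_GET_LAST(p1, p2, p3, p4, ...) p4
#define EXAMPLE_COUNT(...) EXAMPLE_GET_LAST(__VA_ARGS__, 3, 2, 1, 0)

/* EXAMPLE_COUNT(a) expands to 1, EXAMPLE_COUNT(a, b) to 2,
 * EXAMPLE_COUNT(a, b, c) to 3. */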

View File

@@ -211,7 +211,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
}
/* Always use baked shadow transparency for curves. */
if (type & PRIMITIVE_CURVE) {
if (type & PRIMITIVE_ALL_CURVE) {
float throughput = payload.throughput;
throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
payload.throughput = throughput;
@@ -476,7 +476,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
result.continue_search = true;
result.distance = ray_tmax;
if (segment.type & PRIMITIVE_CURVE_RIBBON) {
if (segment.type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
# if defined(__METALRT_MOTION__)
payload.time,
@@ -507,7 +507,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
result.continue_search = true;
result.distance = ray_tmax;
if (segment.type & PRIMITIVE_CURVE_RIBBON) {
if (segment.type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
# if defined(__METALRT_MOTION__)
payload.time,

View File

@@ -194,7 +194,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
type = kernel_tex_fetch(__objects, object).primitive_type;
}
# ifdef __HAIR__
else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
else if (optixGetHitKind() & PRIMITIVE_ALL_CURVE) {
u = __uint_as_float(optixGetAttribute_0());
v = __uint_as_float(optixGetAttribute_1());
@@ -234,7 +234,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
}
/* Always use baked shadow transparency for curves. */
if (type & PRIMITIVE_CURVE) {
if (type & PRIMITIVE_ALL_CURVE) {
float throughput = __uint_as_float(optixGetPayload_1());
throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
optixSetPayload_1(__float_as_uint(throughput));
@@ -320,7 +320,7 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
{
#ifdef __HAIR__
# if OPTIX_ABI_VERSION < 55
if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
if (optixGetHitKind() & PRIMITIVE_ALL_CURVE) {
/* Filter out curve endcaps. */
const float u = __uint_as_float(optixGetAttribute_0());
if (u == 0.0f || u == 1.0f) {
@@ -359,7 +359,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
optixSetPayload_3(prim);
optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
}
else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
else if (optixGetHitKind() & PRIMITIVE_ALL_CURVE) {
const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
optixSetPayload_2(optixGetAttribute_1());
@@ -406,7 +406,6 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type)
isect.t *= len;
if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), /* Attribute_0 */
@@ -419,7 +418,7 @@ extern "C" __global__ void __intersection__curve_ribbon()
const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
const int prim = segment.prim;
const int type = segment.type;
if (type & PRIMITIVE_CURVE_RIBBON) {
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
optix_intersection_curve(prim, type);
}
}
@@ -461,7 +460,6 @@ extern "C" __global__ void __intersection__point()
}
if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
}
}

View File

@@ -92,14 +92,6 @@ ccl_device_forceinline void kernel_write_denoising_features_surface(
else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
closure_albedo *= bsdf_principled_hair_albedo(sc);
}
else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
/* BSSRDF already accounts for weight, retro-reflection would double up. */
ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
sc;
if (bsdf->components == PRINCIPLED_DIFFUSE_RETRO_REFLECTION) {
continue;
}
}
if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
diffuse_albedo += closure_albedo;

View File

@@ -36,7 +36,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const S
ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
{
if ((sd->type & PRIMITIVE_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
}
else {

View File

@@ -205,14 +205,14 @@ ccl_device float curve_thickness(KernelGlobals kg, ccl_private const ShaderData
{
float r = 0.0f;
if (sd->type & PRIMITIVE_CURVE) {
if (sd->type & PRIMITIVE_ALL_CURVE) {
KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
int k1 = k0 + 1;
float4 P_curve[2];
if (!(sd->type & PRIMITIVE_MOTION)) {
if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
}
@@ -249,7 +249,7 @@ ccl_device float3 curve_tangent_normal(KernelGlobals kg, ccl_private const Shade
{
float3 tgN = make_float3(0.0f, 0.0f, 0.0f);
if (sd->type & PRIMITIVE_CURVE) {
if (sd->type & PRIMITIVE_ALL_CURVE) {
tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu, -sd->I) / len_squared(sd->dPdu)));
tgN = normalize(tgN);

View File

@@ -635,7 +635,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
float time,
int type)
{
const bool is_motion = (type & PRIMITIVE_MOTION);
const bool is_motion = (type & PRIMITIVE_ALL_MOTION);
KernelCurve kcurve = kernel_tex_fetch(__curves, prim);
@@ -655,7 +655,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
motion_curve_keys(kg, object, prim, time, ka, k0, k1, kb, curve);
}
if (type & PRIMITIVE_CURVE_RIBBON) {
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* todo: adaptive number of subdivisions could help performance here. */
const int subdivisions = kernel_data.bvh.curve_subdivisions;
if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
@@ -704,7 +704,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
float4 P_curve[4];
if (!(sd->type & PRIMITIVE_MOTION)) {
if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
@@ -719,7 +719,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
const float3 dPdu = float4_to_float3(dPdu4);
if (sd->type & PRIMITIVE_CURVE_RIBBON) {
if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* Rounded smooth normals for ribbons, to approximate thick curve shape. */
const float3 tangent = normalize(dPdu);
const float3 bitangent = normalize(cross(tangent, -D));
@@ -727,6 +727,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
const float cosine = safe_sqrtf(1.0f - sine * sine);
sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
sd->Ng = -D;
# if 0
/* This approximates the position and geometric normal of a thick curve too,
* but gives too many issues with wrong self intersections. */
@@ -742,27 +744,25 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
/* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
* radius). In this case use the view direction to approximate the normal. */
const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
const float3 N = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
const float3 Ng = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
sd->N = N;
sd->N = Ng;
sd->Ng = Ng;
sd->v = 0.0f;
}
# ifdef __DPDU__
/* dPdu/dPdv */
sd->dPdu = dPdu;
sd->dPdv = cross(dPdu, sd->Ng);
# endif
/* Convert to world space. */
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform_auto(kg, sd, &P);
object_normal_transform_auto(kg, sd, &sd->N);
object_dir_transform_auto(kg, sd, &sd->dPdu);
const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
sd->P = P;
sd->Ng = (sd->type & PRIMITIVE_CURVE_RIBBON) ? sd->I : sd->N;
sd->dPdv = cross(sd->dPdu, sd->Ng);
sd->shader = kernel_tex_fetch(__curves, sd->prim).shader_id;
}

View File

@@ -116,52 +116,6 @@ ccl_device_inline void motion_triangle_vertices(
verts[2] = (1.0f - t) * verts[2] + t * next_verts[2];
}
ccl_device_inline void motion_triangle_vertices_and_normals(
KernelGlobals kg, int object, int prim, float time, float3 verts[3], float3 normals[3])
{
/* get motion info */
int numsteps, numverts;
object_motion_info(kg, object, &numsteps, &numverts, NULL);
/* Figure out which steps we need to fetch and their interpolation factor. */
int maxstep = numsteps * 2;
int step = min((int)(time * maxstep), maxstep - 1);
float t = time * maxstep - step;
/* Find attribute. */
int offset = intersection_find_attribute(kg, object, ATTR_STD_MOTION_VERTEX_POSITION);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
/* Fetch vertex coordinates. */
float3 next_verts[3];
uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
/* Interpolate between steps. */
verts[0] = (1.0f - t) * verts[0] + t * next_verts[0];
verts[1] = (1.0f - t) * verts[1] + t * next_verts[1];
verts[2] = (1.0f - t) * verts[2] + t * next_verts[2];
/* Compute smooth normal. */
/* Find attribute. */
offset = intersection_find_attribute(kg, object, ATTR_STD_MOTION_VERTEX_NORMAL);
kernel_assert(offset != ATTR_STD_NOT_FOUND);
/* Fetch vertex coordinates. */
float3 next_normals[3];
motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
motion_triangle_normals_for_step(
kg, tri_vindex, offset, numverts, numsteps, step + 1, next_normals);
/* Interpolate between steps. */
normals[0] = (1.0f - t) * normals[0] + t * next_normals[0];
normals[1] = (1.0f - t) * normals[1] + t * next_normals[1];
normals[2] = (1.0f - t) * normals[2] + t * next_normals[2];
}
ccl_device_inline float3 motion_triangle_smooth_normal(
KernelGlobals kg, float3 Ng, int object, int prim, float u, float v, float time)
{

View File

@@ -46,11 +46,8 @@ ccl_device float point_attribute_float(KernelGlobals kg,
}
}
ccl_device float2 point_attribute_float2(KernelGlobals kg,
ccl_private const ShaderData *sd,
const AttributeDescriptor desc,
ccl_private float2 *dx,
ccl_private float2 *dy)
ccl_device float2 point_attribute_float2(
KernelGlobals kg, const ShaderData *sd, const AttributeDescriptor desc, float2 *dx, float2 *dy)
{
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -67,11 +64,8 @@ ccl_device float2 point_attribute_float2(KernelGlobals kg,
}
}
ccl_device float3 point_attribute_float3(KernelGlobals kg,
ccl_private const ShaderData *sd,
const AttributeDescriptor desc,
ccl_private float3 *dx,
ccl_private float3 *dy)
ccl_device float3 point_attribute_float3(
KernelGlobals kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
{
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -88,11 +82,8 @@ ccl_device float3 point_attribute_float3(KernelGlobals kg,
}
}
ccl_device float4 point_attribute_float4(KernelGlobals kg,
ccl_private const ShaderData *sd,
const AttributeDescriptor desc,
ccl_private float4 *dx,
ccl_private float4 *dy)
ccl_device float4 point_attribute_float4(
KernelGlobals kg, const ShaderData *sd, const AttributeDescriptor desc, float4 *dx, float4 *dy)
{
# ifdef __RAY_DIFFERENTIALS__
if (dx)
@@ -113,7 +104,7 @@ ccl_device float4 point_attribute_float4(KernelGlobals kg,
ccl_device float point_radius(KernelGlobals kg, ccl_private const ShaderData *sd)
{
if (sd->type & PRIMITIVE_POINT) {
if (sd->type & PRIMITIVE_ALL_POINT) {
return kernel_tex_fetch(__points, sd->prim).w;
}

View File

@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __POINTCLOUD__
ccl_device_forceinline bool point_intersect_test(
const float4 point, const float3 P, const float3 dir, const float tmax, ccl_private float *t)
const float4 point, const float3 P, const float3 dir, const float tmax, float *t)
{
const float3 center = float4_to_float3(point);
const float radius = point.w;
@@ -75,8 +75,8 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg,
const float time,
const int type)
{
const float4 point = (type & PRIMITIVE_MOTION) ? motion_point(kg, object, prim, time) :
kernel_tex_fetch(__points, prim);
const float4 point = (type & PRIMITIVE_ALL_MOTION) ? motion_point(kg, object, prim, time) :
kernel_tex_fetch(__points, prim);
if (!point_intersect_test(point, P, dir, tmax, &isect->t)) {
return false;
@@ -93,7 +93,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg,
ccl_device_inline void point_shader_setup(KernelGlobals kg,
ccl_private ShaderData *sd,
ccl_private const Intersection *isect,
ccl_private const Ray *ray)
const Ray *ray)
{
sd->shader = kernel_tex_fetch(__points_shader, isect->prim);
sd->P = ray->P + ray->D * isect->t;
@@ -104,12 +104,17 @@ ccl_device_inline void point_shader_setup(KernelGlobals kg,
sd->v = isect->v;
# endif
/* Compute point center for normal. */
float3 center = float4_to_float3((isect->type & PRIMITIVE_MOTION) ?
/* Computer point center for normal. */
float3 center = float4_to_float3((isect->type & PRIMITIVE_ALL_MOTION) ?
motion_point(kg, sd->object, sd->prim, sd->time) :
kernel_tex_fetch(__points, sd->prim));
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform_auto(kg, sd, &center);
const Transform tfm = object_get_transform(kg, sd);
# ifndef __KERNEL_OPTIX__
center = transform_point(&tfm, center);
# endif
}
/* Normal */

View File

@@ -37,19 +37,19 @@ ccl_device_inline float primitive_surface_attribute_float(KernelGlobals kg,
ccl_private float *dx,
ccl_private float *dy)
{
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
else if (sd->type & PRIMITIVE_CURVE) {
else if (sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float(kg, sd, desc, dx, dy);
}
#endif
#ifdef __POINTCLOUD__
else if (sd->type & PRIMITIVE_POINT) {
else if (sd->type & PRIMITIVE_ALL_POINT) {
return point_attribute_float(kg, sd, desc, dx, dy);
}
#endif
@@ -68,19 +68,19 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals kg,
ccl_private float2 *dx,
ccl_private float2 *dy)
{
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float2(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float2(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
else if (sd->type & PRIMITIVE_CURVE) {
else if (sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float2(kg, sd, desc, dx, dy);
}
#endif
#ifdef __POINTCLOUD__
else if (sd->type & PRIMITIVE_POINT) {
else if (sd->type & PRIMITIVE_ALL_POINT) {
return point_attribute_float2(kg, sd, desc, dx, dy);
}
#endif
@@ -99,19 +99,19 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals kg,
ccl_private float3 *dx,
ccl_private float3 *dy)
{
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float3(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
else if (sd->type & PRIMITIVE_CURVE) {
else if (sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float3(kg, sd, desc, dx, dy);
}
#endif
#ifdef __POINTCLOUD__
else if (sd->type & PRIMITIVE_POINT) {
else if (sd->type & PRIMITIVE_ALL_POINT) {
return point_attribute_float3(kg, sd, desc, dx, dy);
}
#endif
@@ -130,19 +130,19 @@ ccl_device_forceinline float4 primitive_surface_attribute_float4(KernelGlobals k
ccl_private float4 *dx,
ccl_private float4 *dy)
{
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
return triangle_attribute_float4(kg, sd, desc, dx, dy);
else
return subd_triangle_attribute_float4(kg, sd, desc, dx, dy);
}
#ifdef __HAIR__
else if (sd->type & PRIMITIVE_CURVE) {
else if (sd->type & PRIMITIVE_ALL_CURVE) {
return curve_attribute_float4(kg, sd, desc, dx, dy);
}
#endif
#ifdef __POINTCLOUD__
else if (sd->type & PRIMITIVE_POINT) {
else if (sd->type & PRIMITIVE_ALL_POINT) {
return point_attribute_float4(kg, sd, desc, dx, dy);
}
#endif
@@ -246,7 +246,7 @@ ccl_device bool primitive_ptex(KernelGlobals kg,
ccl_device float3 primitive_tangent(KernelGlobals kg, ccl_private ShaderData *sd)
{
#if defined(__HAIR__) || defined(__POINTCLOUD__)
if (sd->type & (PRIMITIVE_CURVE | PRIMITIVE_POINT))
if (sd->type & (PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_POINT))
# ifdef __DPDU__
return normalize(sd->dPdu);
# else
@@ -282,16 +282,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
float3 center;
#if defined(__HAIR__) || defined(__POINTCLOUD__)
bool is_curve_or_point = sd->type & (PRIMITIVE_CURVE | PRIMITIVE_POINT);
bool is_curve_or_point = sd->type & (PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_POINT);
if (is_curve_or_point) {
center = make_float3(0.0f, 0.0f, 0.0f);
if (sd->type & PRIMITIVE_CURVE) {
if (sd->type & PRIMITIVE_ALL_CURVE) {
# if defined(__HAIR__)
center = curve_motion_center_location(kg, sd);
# endif
}
else if (sd->type & PRIMITIVE_POINT) {
else if (sd->type & PRIMITIVE_ALL_POINT) {
# if defined(__POINTCLOUD__)
center = point_motion_center_location(kg, sd);
# endif
@@ -331,7 +331,7 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
}
else
#endif
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
/* Triangle */
if (subd_triangle_patch(kg, sd) == ~0) {
motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);

View File

@@ -69,57 +69,55 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
sd->I = -ray->D;
#ifdef __HAIR__
if (sd->type & PRIMITIVE_CURVE) {
if (sd->type & PRIMITIVE_ALL_CURVE) {
/* curve */
curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
}
else
#endif
#ifdef __POINTCLOUD__
if (sd->type & PRIMITIVE_POINT) {
if (sd->type & PRIMITIVE_ALL_POINT) {
/* point */
point_shader_setup(kg, sd, isect, ray);
}
else
#endif
{
if (sd->type == PRIMITIVE_TRIANGLE) {
/* static triangle */
float3 Ng = triangle_normal(kg, sd);
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
if (sd->type & PRIMITIVE_TRIANGLE) {
/* static triangle */
float3 Ng = triangle_normal(kg, sd);
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
/* vectors */
sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
sd->Ng = Ng;
sd->N = Ng;
/* vectors */
sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
sd->Ng = Ng;
sd->N = Ng;
/* smooth normal */
if (sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
/* smooth normal */
if (sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
/* dPdu/dPdv */
triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
#endif
}
else {
/* motion triangle */
motion_triangle_shader_setup(
kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
}
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
/* instance transform */
object_normal_transform_auto(kg, sd, &sd->N);
object_normal_transform_auto(kg, sd, &sd->Ng);
#ifdef __DPDU__
object_dir_transform_auto(kg, sd, &sd->dPdu);
object_dir_transform_auto(kg, sd, &sd->dPdv);
#endif
}
}
else {
/* motion triangle */
motion_triangle_shader_setup(
kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
}
sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
/* instance transform */
object_normal_transform_auto(kg, sd, &sd->N);
object_normal_transform_auto(kg, sd, &sd->Ng);
#ifdef __DPDU__
object_dir_transform_auto(kg, sd, &sd->dPdu);
object_dir_transform_auto(kg, sd, &sd->dPdv);
#endif
}
/* backfacing test */
bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
@@ -203,7 +201,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals kg,
object_dir_transform_auto(kg, sd, &sd->I);
}
if (sd->type == PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if (sd->shader & SHADER_SMOOTH_NORMAL) {
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);


@@ -82,7 +82,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
# ifdef __HAIR__
if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
(sd->type & PRIMITIVE_TRIANGLE))
(sd->type & PRIMITIVE_ALL_TRIANGLE))
# else
if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
# endif


@@ -141,23 +141,14 @@ ccl_device_inline float3 shadow_ray_smooth_surface_offset(
KernelGlobals kg, ccl_private const ShaderData *ccl_restrict sd, float3 Ng)
{
float3 V[3], N[3];
if (sd->type == PRIMITIVE_MOTION_TRIANGLE) {
motion_triangle_vertices_and_normals(kg, sd->object, sd->prim, sd->time, V, N);
}
else {
kernel_assert(sd->type == PRIMITIVE_TRIANGLE);
triangle_vertices_and_normals(kg, sd->prim, V, N);
}
triangle_vertices_and_normals(kg, sd->prim, V, N);
const float u = sd->u, v = sd->v;
const float w = 1 - u - v;
float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
}
object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
/* Parabolic approximation */
float a = dot(N[2] - N[0], V[0] - V[2]);
@@ -200,7 +191,7 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
float3 Ng = (transmit ? -sd->Ng : sd->Ng);
float3 P = ray_offset(sd->P, Ng);
if ((sd->type & PRIMITIVE_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
const float offset_cutoff =
kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
/* Do ray offset (heavy stuff) only for close to be terminated triangles:


@@ -960,15 +960,13 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
return set_attribute_int(3, type, derivatives, val);
}
else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices) &&
sd->type & PRIMITIVE_TRIANGLE) {
sd->type & PRIMITIVE_ALL_TRIANGLE) {
float3 P[3];
if (sd->type & PRIMITIVE_MOTION) {
motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P);
}
else {
if (sd->type & PRIMITIVE_TRIANGLE)
triangle_vertices(kg, sd->prim, P);
}
else
motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P);
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform(kg, sd, &P[0]);
@@ -988,7 +986,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
}
/* Hair Attributes */
else if (name == u_is_curve) {
float f = (sd->type & PRIMITIVE_CURVE) != 0;
float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
return set_attribute_float(f, type, derivatives, val);
}
else if (name == u_curve_thickness) {
@@ -1001,7 +999,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
}
/* point attributes */
else if (name == u_is_point) {
float f = (sd->type & PRIMITIVE_POINT) != 0;
float f = (sd->type & PRIMITIVE_ALL_POINT) != 0;
return set_attribute_float(f, type, derivatives, val);
}
else if (name == u_point_radius) {
@@ -1009,7 +1007,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
return set_attribute_float(f, type, derivatives, val);
}
else if (name == u_normal_map_normal) {
if (sd->type & PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
return set_attribute_float3(f, type, derivatives, val);
}


@@ -85,4 +85,6 @@ shader node_normal_map(normal NormalIn = N,
if (Strength != 1.0)
Normal = normalize(NormalIn + (Normal - NormalIn) * max(Strength, 0.0));
Normal = ensure_valid_reflection(Ng, I, Normal);
}


@@ -206,12 +206,12 @@ ccl_device float3 svm_bevel(
for (int hit = 0; hit < num_eval_hits; hit++) {
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
if (sd->type == PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_TRIANGLE) {
hit_P = triangle_refine_local(
kg, sd, ray.P, ray.D, ray.t, isect.hits[hit].object, isect.hits[hit].prim);
}
# ifdef __OBJECT_MOTION__
else if (sd->type == PRIMITIVE_MOTION_TRIANGLE) {
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(kg, sd->object, isect.hits[hit].prim, sd->time, verts);
hit_P = motion_triangle_refine_local(
@@ -236,11 +236,11 @@ ccl_device float3 svm_bevel(
float u = isect.hits[hit].u;
float v = isect.hits[hit].v;
if (sd->type == PRIMITIVE_TRIANGLE) {
if (sd->type & PRIMITIVE_TRIANGLE) {
N = triangle_smooth_normal(kg, N, prim, u, v);
}
# ifdef __OBJECT_MOTION__
else if (sd->type == PRIMITIVE_MOTION_TRIANGLE) {
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
N = motion_triangle_smooth_normal(kg, N, sd->object, prim, u, v, sd->time);
}
# endif /* __OBJECT_MOTION__ */


@@ -107,7 +107,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
}
float3 N = stack_valid(data_node.x) ? stack_load_float3(stack, data_node.x) : sd->N;
if (!(sd->type & PRIMITIVE_CURVE)) {
if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
N = ensure_valid_reflection(sd->Ng, sd->I, N);
}
@@ -191,7 +191,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ?
stack_load_float3(stack, data_cn_ssr.x) :
sd->N;
if (!(sd->type & PRIMITIVE_CURVE)) {
if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
clearcoat_normal = ensure_valid_reflection(sd->Ng, sd->I, clearcoat_normal);
}
float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
@@ -902,7 +902,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
if (stack_valid(data_node.y)) {
bsdf->T = normalize(stack_load_float3(stack, data_node.y));
}
else if (!(sd->type & PRIMITIVE_CURVE)) {
else if (!(sd->type & PRIMITIVE_ALL_CURVE)) {
bsdf->T = normalize(sd->dPdv);
bsdf->offset = 0.0f;
}


@@ -227,7 +227,7 @@ ccl_device_noinline void svm_node_hair_info(KernelGlobals kg,
switch (type) {
case NODE_INFO_CURVE_IS_STRAND: {
data = (sd->type & PRIMITIVE_CURVE) != 0;
data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
stack_store_float(stack, out_offset, data);
break;
}


@@ -25,7 +25,7 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, float scale, int n, float dis
/*
* Prevent NaNs due to input p
* Sin and Cosine are periodic about [0 2*PI) so the following
* will yield a more accurate result. As it stops the input values
* will yeild a more accurate result. As it stops the input values
* going out of range for floats which caused a NaN. The
* calculation of (px + py + pz)*5 can cause an Inf when one or more
* values are very large the cos or sin of this results in a NaN


@@ -291,7 +291,7 @@ ccl_device_noinline void svm_node_normal_map(KernelGlobals kg,
if (space == NODE_NORMAL_MAP_TANGENT) {
/* tangent space */
if (sd->object == OBJECT_NONE || (sd->type & PRIMITIVE_TRIANGLE) == 0) {
if (sd->object == OBJECT_NONE || (sd->type & PRIMITIVE_ALL_TRIANGLE) == 0) {
/* Fallback to unperturbed normal. */
stack_store_float3(stack, normal_offset, sd->N);
return;


@@ -124,7 +124,7 @@ typedef enum ShaderNodeType {
NODE_AOV_VALUE,
NODE_FLOAT_CURVE,
/* NOTE: for best OpenCL performance, item definition in the enum must
* match the switch case order in `svm.h`. */
* match the switch case order in svm.h. */
} ShaderNodeType;
typedef enum NodeAttributeOutputType {


@@ -43,7 +43,7 @@ ccl_device_inline float wireframe(KernelGlobals kg,
ccl_private float3 *P)
{
#if defined(__HAIR__) || defined(__POINTCLOUD__)
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_TRIANGLE)
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
#else
if (sd->prim != PRIM_NONE)
#endif
@@ -54,12 +54,10 @@ ccl_device_inline float wireframe(KernelGlobals kg,
/* Triangles */
int np = 3;
if (sd->type & PRIMITIVE_MOTION) {
motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
}
else {
if (sd->type & PRIMITIVE_TRIANGLE)
triangle_vertices(kg, sd->prim, Co);
}
else
motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
object_position_transform(kg, sd, &Co[0]);


@@ -202,7 +202,7 @@ enum SamplingPattern {
/* These flags values correspond to `raytypes` in `osl.cpp`, so keep them in sync! */
enum PathRayFlag : uint32_t {
enum PathRayFlag {
/* --------------------------------------------------------------------
* Ray visibility.
*
@@ -388,7 +388,6 @@ typedef enum PassType {
PASS_DENOISING_NORMAL,
PASS_DENOISING_ALBEDO,
PASS_DENOISING_DEPTH,
PASS_DENOISING_PREVIOUS,
/* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by
* any other object. The pass accessor will divide the combined pass by the shadow catcher. The
@@ -538,34 +537,31 @@ typedef struct Intersection {
typedef enum PrimitiveType {
PRIMITIVE_NONE = 0,
PRIMITIVE_TRIANGLE = (1 << 0),
PRIMITIVE_CURVE_THICK = (1 << 1),
PRIMITIVE_CURVE_RIBBON = (1 << 2),
PRIMITIVE_POINT = (1 << 3),
PRIMITIVE_VOLUME = (1 << 4),
PRIMITIVE_LAMP = (1 << 5),
PRIMITIVE_MOTION_TRIANGLE = (1 << 1),
PRIMITIVE_CURVE_THICK = (1 << 2),
PRIMITIVE_MOTION_CURVE_THICK = (1 << 3),
PRIMITIVE_CURVE_RIBBON = (1 << 4),
PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
PRIMITIVE_POINT = (1 << 6),
PRIMITIVE_MOTION_POINT = (1 << 7),
PRIMITIVE_VOLUME = (1 << 8),
PRIMITIVE_LAMP = (1 << 9),
PRIMITIVE_MOTION = (1 << 6),
PRIMITIVE_MOTION_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION),
PRIMITIVE_MOTION_CURVE_THICK = (PRIMITIVE_CURVE_THICK | PRIMITIVE_MOTION),
PRIMITIVE_MOTION_CURVE_RIBBON = (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION),
PRIMITIVE_MOTION_POINT = (PRIMITIVE_POINT | PRIMITIVE_MOTION),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE_THICK | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON),
PRIMITIVE_ALL_POINT = (PRIMITIVE_POINT | PRIMITIVE_MOTION_POINT),
PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_MOTION_CURVE_RIBBON | PRIMITIVE_MOTION_POINT),
PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME |
PRIMITIVE_LAMP | PRIMITIVE_ALL_POINT),
PRIMITIVE_CURVE = (PRIMITIVE_CURVE_THICK | PRIMITIVE_CURVE_RIBBON),
PRIMITIVE_ALL = (PRIMITIVE_TRIANGLE | PRIMITIVE_CURVE | PRIMITIVE_POINT | PRIMITIVE_VOLUME |
PRIMITIVE_LAMP | PRIMITIVE_MOTION),
PRIMITIVE_NUM_SHAPES = 6,
PRIMITIVE_NUM_BITS = PRIMITIVE_NUM_SHAPES + 1, /* All shapes + motion bit. */
PRIMITIVE_NUM = PRIMITIVE_NUM_SHAPES * 2, /* With and without motion. */
PRIMITIVE_NUM = 10,
} PrimitiveType;
/* Convert type to index in range 0..PRIMITIVE_NUM-1. */
#define PRIMITIVE_INDEX(type) (bitscan((uint32_t)(type)) * 2 + (((type)&PRIMITIVE_MOTION) ? 1 : 0))
/* Pack segment into type value to save space. */
#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_BITS) | (type))
#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_BITS)
#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type))
#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM)
typedef enum CurveShapeType {
CURVE_RIBBON = 0,
@@ -1560,7 +1556,7 @@ enum {
/* Kernel Features */
enum KernelFeatureFlag : uint32_t {
enum KernelFeatureFlag : unsigned int {
/* Shader nodes. */
KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),


@@ -1002,10 +1002,10 @@ void GeometryManager::device_update_attributes(Device *device,
/* After mesh attributes and patch tables have been copied to device memory,
* we need to update offsets in the objects. */
scene->object_manager->device_update_geom_offsets(device, dscene, scene);
scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
}
void GeometryManager::geom_calc_offset(Scene *scene, BVHLayout bvh_layout)
void GeometryManager::mesh_calc_offset(Scene *scene, BVHLayout bvh_layout)
{
size_t vert_size = 0;
size_t tri_size = 0;
@@ -1370,22 +1370,22 @@ enum {
DEVICE_MESH_DATA_MODIFIED = (1 << 1),
DEVICE_POINT_DATA_MODIFIED = (1 << 2),
ATTR_FLOAT_MODIFIED = (1 << 3),
ATTR_FLOAT2_MODIFIED = (1 << 4),
ATTR_FLOAT3_MODIFIED = (1 << 5),
ATTR_FLOAT4_MODIFIED = (1 << 6),
ATTR_UCHAR4_MODIFIED = (1 << 7),
ATTR_FLOAT_MODIFIED = (1 << 2),
ATTR_FLOAT2_MODIFIED = (1 << 3),
ATTR_FLOAT3_MODIFIED = (1 << 4),
ATTR_FLOAT4_MODIFIED = (1 << 5),
ATTR_UCHAR4_MODIFIED = (1 << 6),
CURVE_DATA_NEED_REALLOC = (1 << 8),
MESH_DATA_NEED_REALLOC = (1 << 9),
POINT_DATA_NEED_REALLOC = (1 << 10),
CURVE_DATA_NEED_REALLOC = (1 << 7),
MESH_DATA_NEED_REALLOC = (1 << 8),
POINT_DATA_NEED_REALLOC = (1 << 9),
ATTR_FLOAT_NEEDS_REALLOC = (1 << 11),
ATTR_FLOAT2_NEEDS_REALLOC = (1 << 12),
ATTR_FLOAT3_NEEDS_REALLOC = (1 << 13),
ATTR_FLOAT4_NEEDS_REALLOC = (1 << 14),
ATTR_FLOAT_NEEDS_REALLOC = (1 << 10),
ATTR_FLOAT2_NEEDS_REALLOC = (1 << 11),
ATTR_FLOAT3_NEEDS_REALLOC = (1 << 12),
ATTR_FLOAT4_NEEDS_REALLOC = (1 << 13),
ATTR_UCHAR4_NEEDS_REALLOC = (1 << 15),
ATTR_UCHAR4_NEEDS_REALLOC = (1 << 14),
ATTRS_NEED_REALLOC = (ATTR_FLOAT_NEEDS_REALLOC | ATTR_FLOAT2_NEEDS_REALLOC |
ATTR_FLOAT3_NEEDS_REALLOC | ATTR_FLOAT4_NEEDS_REALLOC |
@@ -1922,7 +1922,7 @@ void GeometryManager::device_update(Device *device,
const BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
device->get_bvh_layout_mask());
geom_calc_offset(scene, bvh_layout);
mesh_calc_offset(scene, bvh_layout);
if (true_displacement_used || curve_shadow_transparency_used) {
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {


@@ -242,7 +242,7 @@ class GeometryManager {
vector<AttributeRequestSet> &object_attributes);
/* Compute verts/triangles/curves offsets in global arrays. */
void geom_calc_offset(Scene *scene, BVHLayout bvh_layout);
void mesh_calc_offset(Scene *scene, BVHLayout bvh_layout);
void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);


@@ -381,15 +381,8 @@ ImageHandle ImageManager::add_image(const string &filename,
foreach (int tile, tiles) {
string tile_filename = filename;
/* Since we don't have information about the exact tile format used in this code location,
* just attempt all replacement patterns that Blender supports. */
if (tile != 0) {
string_replace(tile_filename, "<UDIM>", string_printf("%04d", tile));
int u = ((tile - 1001) % 10);
int v = ((tile - 1001) / 10);
string_replace(tile_filename, "<UVTILE>", string_printf("u%d_v%d", u + 1, v + 1));
}
const int slot = add_image_slot(new OIIOImageLoader(tile_filename), params, false);
handle.tile_slots.push_back(slot);


@@ -821,7 +821,7 @@ void ObjectManager::device_update_flags(
dscene->object_volume_step.clear_modified();
}
void ObjectManager::device_update_geom_offsets(Device *, DeviceScene *dscene, Scene *scene)
void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene)
{
if (dscene->objects.size() == 0) {
return;


@@ -162,7 +162,7 @@ class ObjectManager {
Scene *scene,
Progress &progress,
bool bounds_valid = true);
void device_update_geom_offsets(Device *device, DeviceScene *dscene, Scene *scene);
void device_update_mesh_offsets(Device *device, DeviceScene *dscene, Scene *scene);
void device_free(Device *device, DeviceScene *dscene, bool force_free);


@@ -101,7 +101,6 @@ const NodeEnum *Pass::get_type_enum()
pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL);
pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
pass_type_enum.insert("denoising_depth", PASS_DENOISING_DEPTH);
pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS);
pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
@@ -300,10 +299,6 @@ PassInfo Pass::get_info(const PassType type, const bool include_albedo)
case PASS_DENOISING_DEPTH:
pass_info.num_components = 1;
break;
case PASS_DENOISING_PREVIOUS:
pass_info.num_components = 3;
pass_info.use_exposure = true;
break;
case PASS_SHADOW_CATCHER:
pass_info.num_components = 3;


@@ -5901,7 +5901,7 @@ VectorMapRangeNode::VectorMapRangeNode() : ShaderNode(get_node_type())
{
}
void VectorMapRangeNode::expand(ShaderGraph * /*graph*/)
void VectorMapRangeNode::expand(ShaderGraph *graph)
{
}


@@ -16,17 +16,62 @@
#include "session/denoising.h"
#include "util/map.h"
#include "util/system.h"
#include "util/task.h"
#include "util/time.h"
#if 0
#include <OpenImageIO/filesystem.h>
# include "kernel/filter/filter_defines.h"
# include "util/util_foreach.h"
# include "util/util_map.h"
# include "util/util_system.h"
# include "util/util_task.h"
# include "util/util_time.h"
# include <OpenImageIO/filesystem.h>
CCL_NAMESPACE_BEGIN
/* Utility Functions */
static void print_progress(int num, int total, int frame, int num_frames)
{
const char *label = "Denoise Frame ";
int cols = system_console_width();
cols -= strlen(label);
int len = 1;
for (int x = total; x > 9; x /= 10) {
len++;
}
int bars = cols - 2 * len - 6;
printf("\r%s", label);
if (num_frames > 1) {
int frame_len = 1;
for (int x = num_frames - 1; x > 9; x /= 10) {
frame_len++;
}
bars -= frame_len + 2;
printf("%*d ", frame_len, frame);
}
int v = int(float(num) * bars / total);
printf("[");
for (int i = 0; i < v; i++) {
printf("=");
}
if (v < bars) {
printf(">");
}
for (int i = v + 1; i < bars; i++) {
printf(" ");
}
printf(string_printf("] %%%dd / %d", len, total).c_str(), num);
fflush(stdout);
}
/* Splits in at its last dot, setting suffix to the part after the dot and in to the part before
* it. Returns whether a dot was found. */
static bool split_last_dot(string &in, string &suffix)
@@ -80,18 +125,24 @@ static void fill_mapping(vector<ChannelMapping> &map, int pos, string name, stri
}
}
static const int INPUT_NUM_CHANNELS = 13;
static const int INPUT_NOISY_IMAGE = 0;
static const int INPUT_DENOISING_NORMAL = 3;
static const int INPUT_DENOISING_ALBEDO = 6;
static const int INPUT_MOTION = 9;
static const int INPUT_NUM_CHANNELS = 15;
static const int INPUT_DENOISING_DEPTH = 0;
static const int INPUT_DENOISING_NORMAL = 1;
static const int INPUT_DENOISING_SHADOWING = 4;
static const int INPUT_DENOISING_ALBEDO = 5;
static const int INPUT_NOISY_IMAGE = 8;
static const int INPUT_DENOISING_VARIANCE = 11;
static const int INPUT_DENOISING_INTENSITY = 14;
static vector<ChannelMapping> input_channels()
{
vector<ChannelMapping> map;
fill_mapping(map, INPUT_NOISY_IMAGE, "Combined", "RGB");
fill_mapping(map, INPUT_DENOISING_DEPTH, "Denoising Depth", "Z");
fill_mapping(map, INPUT_DENOISING_NORMAL, "Denoising Normal", "XYZ");
fill_mapping(map, INPUT_DENOISING_SHADOWING, "Denoising Shadowing", "X");
fill_mapping(map, INPUT_DENOISING_ALBEDO, "Denoising Albedo", "RGB");
fill_mapping(map, INPUT_MOTION, "Vector", "XYZW");
fill_mapping(map, INPUT_NOISY_IMAGE, "Noisy Image", "RGB");
fill_mapping(map, INPUT_DENOISING_VARIANCE, "Denoising Variance", "RGB");
fill_mapping(map, INPUT_DENOISING_INTENSITY, "Denoising Intensity", "X");
return map;
}
@@ -111,7 +162,7 @@ bool DenoiseImageLayer::detect_denoising_channels()
input_to_image_channel.clear();
input_to_image_channel.resize(INPUT_NUM_CHANNELS, -1);
for (const ChannelMapping &mapping : input_channels()) {
foreach (const ChannelMapping &mapping, input_channels()) {
vector<string>::iterator i = find(channels.begin(), channels.end(), mapping.name);
if (i == channels.end()) {
return false;
@@ -126,7 +177,7 @@ bool DenoiseImageLayer::detect_denoising_channels()
output_to_image_channel.clear();
output_to_image_channel.resize(OUTPUT_NUM_CHANNELS, -1);
for (const ChannelMapping &mapping : output_channels()) {
foreach (const ChannelMapping &mapping, output_channels()) {
vector<string>::iterator i = find(channels.begin(), channels.end(), mapping.name);
if (i == channels.end()) {
return false;
@@ -148,16 +199,18 @@ bool DenoiseImageLayer::detect_denoising_channels()
return true;
}
bool DenoiseImageLayer::match_channels(const std::vector<string> &channelnames,
bool DenoiseImageLayer::match_channels(int neighbor,
const std::vector<string> &channelnames,
const std::vector<string> &neighbor_channelnames)
{
vector<int> &mapping = previous_output_to_image_channel;
neighbor_input_to_image_channel.resize(neighbor + 1);
vector<int> &mapping = neighbor_input_to_image_channel[neighbor];
assert(mapping.size() == 0);
mapping.resize(output_to_image_channel.size(), -1);
mapping.resize(input_to_image_channel.size(), -1);
for (int i = 0; i < output_to_image_channel.size(); i++) {
const string &channel = channelnames[output_to_image_channel[i]];
for (int i = 0; i < input_to_image_channel.size(); i++) {
const string &channel = channelnames[input_to_image_channel[i]];
std::vector<string>::const_iterator frame_channel = find(
neighbor_channelnames.begin(), neighbor_channelnames.end(), channel);
@@ -173,9 +226,19 @@ bool DenoiseImageLayer::match_channels(const std::vector<string> &channelnames,
/* Denoise Task */
DenoiseTask::DenoiseTask(Device *device, DenoiserPipeline *denoiser, int frame)
: denoiser(denoiser), device(device), frame(frame), current_layer(0), buffers(device)
DenoiseTask::DenoiseTask(Device *device,
DenoiserPipeline *denoiser,
int frame,
const vector<int> &neighbor_frames)
: denoiser(denoiser),
device(device),
frame(frame),
neighbor_frames(neighbor_frames),
current_layer(0),
input_pixels(device, "filter input buffer", MEM_READ_ONLY),
num_tiles(0)
{
image.samples = denoiser->samples_override;
}
DenoiseTask::~DenoiseTask()
@@ -183,39 +246,284 @@ DenoiseTask::~DenoiseTask()
free();
}
/* Device callbacks */
bool DenoiseTask::acquire_tile(Device *device, Device *tile_device, RenderTile &tile)
{
thread_scoped_lock tile_lock(tiles_mutex);
if (tiles.empty()) {
return false;
}
tile = tiles.front();
tiles.pop_front();
device->map_tile(tile_device, tile);
print_progress(num_tiles - tiles.size(), num_tiles, frame, denoiser->num_frames);
return true;
}
/* Mapping tiles is required for regular rendering since each tile has its separate memory
* which may be allocated on a different device.
* For standalone denoising, there is a single memory that is present on all devices, so the only
* thing that needs to be done here is to specify the surrounding tile geometry.
*
* However, since there is only one large memory, the denoised result has to be written to
* a different buffer to avoid having to copy an entire horizontal slice of the image. */
void DenoiseTask::map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
{
RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
RenderTile &target_tile = neighbors.target;
/* Fill tile information. */
for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
if (i == RenderTileNeighbors::CENTER) {
continue;
}
RenderTile &tile = neighbors.tiles[i];
int dx = (i % 3) - 1;
int dy = (i / 3) - 1;
tile.x = clamp(center_tile.x + dx * denoiser->tile_size.x, 0, image.width);
tile.w = clamp(center_tile.x + (dx + 1) * denoiser->tile_size.x, 0, image.width) - tile.x;
tile.y = clamp(center_tile.y + dy * denoiser->tile_size.y, 0, image.height);
tile.h = clamp(center_tile.y + (dy + 1) * denoiser->tile_size.y, 0, image.height) - tile.y;
tile.buffer = center_tile.buffer;
tile.offset = center_tile.offset;
tile.stride = image.width;
}
/* Allocate output buffer. */
device_vector<float> *output_mem = new device_vector<float>(
tile_device, "denoising_output", MEM_READ_WRITE);
output_mem->alloc(OUTPUT_NUM_CHANNELS * center_tile.w * center_tile.h);
/* Fill output buffer with noisy image, assumed by kernel_filter_finalize
* when skipping denoising of some pixels. */
float *result = output_mem->data();
float *in = &image.pixels[image.num_channels * (center_tile.y * image.width + center_tile.x)];
const DenoiseImageLayer &layer = image.layers[current_layer];
const int *input_to_image_channel = layer.input_to_image_channel.data();
for (int y = 0; y < center_tile.h; y++) {
for (int x = 0; x < center_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
result[i] = in[image.num_channels * x + input_to_image_channel[INPUT_NOISY_IMAGE + i]];
}
}
in += image.num_channels * image.width;
}
output_mem->copy_to_device();
/* Fill output tile info. */
target_tile = center_tile;
target_tile.buffer = output_mem->device_pointer;
target_tile.stride = target_tile.w;
target_tile.offset -= target_tile.x + target_tile.y * target_tile.stride;
thread_scoped_lock output_lock(output_mutex);
assert(output_pixels.count(center_tile.tile_index) == 0);
output_pixels[target_tile.tile_index] = output_mem;
}
void DenoiseTask::unmap_neighboring_tiles(RenderTileNeighbors &neighbors)
{
RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
RenderTile &target_tile = neighbors.target;
thread_scoped_lock output_lock(output_mutex);
assert(output_pixels.count(center_tile.tile_index) == 1);
device_vector<float> *output_mem = output_pixels[target_tile.tile_index];
output_pixels.erase(center_tile.tile_index);
output_lock.unlock();
/* Copy denoised pixels from device. */
output_mem->copy_from_device(0, OUTPUT_NUM_CHANNELS * target_tile.w, target_tile.h);
float *result = output_mem->data();
float *out = &image.pixels[image.num_channels * (target_tile.y * image.width + target_tile.x)];
const DenoiseImageLayer &layer = image.layers[current_layer];
const int *output_to_image_channel = layer.output_to_image_channel.data();
for (int y = 0; y < target_tile.h; y++) {
for (int x = 0; x < target_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
out[image.num_channels * x + output_to_image_channel[i]] = result[i];
}
}
out += image.num_channels * image.width;
}
/* Free device buffer. */
output_mem->free();
delete output_mem;
}
void DenoiseTask::release_tile()
{
}
bool DenoiseTask::get_cancel()
{
return false;
}
void DenoiseTask::create_task(DeviceTask &task)
{
/* Callback functions. */
task.acquire_tile = function_bind(&DenoiseTask::acquire_tile, this, device, _1, _2);
task.map_neighbor_tiles = function_bind(&DenoiseTask::map_neighboring_tiles, this, _1, _2);
task.unmap_neighbor_tiles = function_bind(&DenoiseTask::unmap_neighboring_tiles, this, _1);
task.release_tile = function_bind(&DenoiseTask::release_tile, this);
task.get_cancel = function_bind(&DenoiseTask::get_cancel, this);
/* Denoising parameters. */
task.denoising = denoiser->params;
task.denoising.type = DENOISER_NLM;
task.denoising.use = true;
task.denoising_from_render = false;
task.denoising_frames.resize(neighbor_frames.size());
for (int i = 0; i < neighbor_frames.size(); i++) {
task.denoising_frames[i] = neighbor_frames[i] - frame;
}
/* Buffer parameters. */
task.pass_stride = INPUT_NUM_CHANNELS;
task.target_pass_stride = OUTPUT_NUM_CHANNELS;
task.pass_denoising_data = 0;
task.pass_denoising_clean = -1;
task.frame_stride = image.width * image.height * INPUT_NUM_CHANNELS;
/* Create tiles. */
thread_scoped_lock tile_lock(tiles_mutex);
thread_scoped_lock output_lock(output_mutex);
tiles.clear();
assert(output_pixels.empty());
output_pixels.clear();
int tiles_x = divide_up(image.width, denoiser->tile_size.x);
int tiles_y = divide_up(image.height, denoiser->tile_size.y);
for (int ty = 0; ty < tiles_y; ty++) {
for (int tx = 0; tx < tiles_x; tx++) {
RenderTile tile;
tile.x = tx * denoiser->tile_size.x;
tile.y = ty * denoiser->tile_size.y;
tile.w = min(image.width - tile.x, denoiser->tile_size.x);
tile.h = min(image.height - tile.y, denoiser->tile_size.y);
tile.start_sample = 0;
tile.num_samples = image.layers[current_layer].samples;
tile.sample = 0;
tile.offset = 0;
tile.stride = image.width;
tile.tile_index = ty * tiles_x + tx;
tile.task = RenderTile::DENOISE;
tile.buffers = NULL;
tile.buffer = input_pixels.device_pointer;
tiles.push_back(tile);
}
}
num_tiles = tiles.size();
}
/* Denoiser Operations */
bool DenoiseTask::load_input_pixels(int layer)
{
int w = image.width;
int h = image.height;
int num_pixels = image.width * image.height;
int frame_stride = num_pixels * INPUT_NUM_CHANNELS;
/* Load center image */
DenoiseImageLayer &image_layer = image.layers[layer];
float *buffer_data = buffers.buffer.data();
image.read_pixels(image_layer, buffers.params, buffer_data);
float *buffer_data = input_pixels.data();
image.read_pixels(image_layer, buffer_data);
buffer_data += frame_stride;
/* Load previous image */
if (frame > 0 && !image.read_previous_pixels(image_layer, buffers.params, buffer_data)) {
error = "Failed to read neighbor frame pixels";
return false;
/* Load neighbor images */
for (int i = 0; i < image.in_neighbors.size(); i++) {
if (!image.read_neighbor_pixels(i, image_layer, buffer_data)) {
error = "Failed to read neighbor frame pixels";
return false;
}
buffer_data += frame_stride;
}
/* Preprocess */
buffer_data = input_pixels.data();
for (int neighbor = 0; neighbor < image.in_neighbors.size() + 1; neighbor++) {
/* Clamp */
if (denoiser->params.clamp_input) {
for (int i = 0; i < num_pixels * INPUT_NUM_CHANNELS; i++) {
buffer_data[i] = clamp(buffer_data[i], -1e8f, 1e8f);
}
}
/* Box blur */
int r = 5 * denoiser->params.radius;
float *data = buffer_data + 14;
array<float> temp(num_pixels);
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int n = 0;
float sum = 0.0f;
for (int dx = max(x - r, 0); dx < min(x + r + 1, w); dx++, n++) {
sum += data[INPUT_NUM_CHANNELS * (y * w + dx)];
}
temp[y * w + x] = sum / n;
}
}
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int n = 0;
float sum = 0.0f;
for (int dy = max(y - r, 0); dy < min(y + r + 1, h); dy++, n++) {
sum += temp[dy * w + x];
}
data[INPUT_NUM_CHANNELS * (y * w + x)] = sum / n;
}
}
/* Highlight compression */
data = buffer_data + 8;
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++) {
int idx = INPUT_NUM_CHANNELS * (y * w + x);
float3 color = make_float3(data[idx], data[idx + 1], data[idx + 2]);
color = color_highlight_compress(color, NULL);
data[idx] = color.x;
data[idx + 1] = color.y;
data[idx + 2] = color.z;
}
}
buffer_data += frame_stride;
}
/* Copy to device */
buffers.buffer.copy_to_device();
input_pixels.copy_to_device();
return true;
}
/* Task stages */
static void add_pass(vector<Pass *> &passes, PassType type, PassMode mode = PassMode::NOISY)
{
Pass *pass = new Pass();
pass->set_type(type);
pass->set_mode(mode);
passes.push_back(pass);
}
bool DenoiseTask::load()
{
string center_filepath = denoiser->input[frame];
@@ -223,8 +531,7 @@ bool DenoiseTask::load()
return false;
}
/* Use previous frame output as input for subsequent frames. */
if (frame > 0 && !image.load_previous(denoiser->output[frame - 1], error)) {
if (!image.load_neighbors(denoiser->input, neighbor_frames, error)) {
return false;
}
@@ -233,35 +540,10 @@ bool DenoiseTask::load()
return false;
}
/* Enable temporal denoising for frames after the first (which will use the output from the
* previous frames). */
DenoiseParams params = denoiser->denoiser->get_params();
params.temporally_stable = frame > 0;
denoiser->denoiser->set_params(params);
/* Allocate device buffer. */
vector<Pass *> passes;
add_pass(passes, PassType::PASS_COMBINED);
add_pass(passes, PassType::PASS_DENOISING_ALBEDO);
add_pass(passes, PassType::PASS_DENOISING_NORMAL);
add_pass(passes, PassType::PASS_MOTION);
add_pass(passes, PassType::PASS_DENOISING_PREVIOUS);
add_pass(passes, PassType::PASS_COMBINED, PassMode::DENOISED);
BufferParams buffer_params;
buffer_params.width = image.width;
buffer_params.height = image.height;
buffer_params.full_x = 0;
buffer_params.full_y = 0;
buffer_params.full_width = image.width;
buffer_params.full_height = image.height;
buffer_params.update_passes(passes);
for (Pass *pass : passes) {
delete pass;
}
buffers.reset(buffer_params);
int num_frames = image.in_neighbors.size() + 1;
input_pixels.alloc(image.width * INPUT_NUM_CHANNELS, image.height * num_frames);
input_pixels.zero_to_device();
/* Read pixels for first layer. */
current_layer = 0;
@@ -283,26 +565,10 @@ bool DenoiseTask::exec()
}
/* Run task on device. */
denoiser->denoiser->denoise_buffer(buffers.params, &buffers, 1, true);
/* Copy denoised pixels from device. */
buffers.buffer.copy_from_device();
float *result = buffers.buffer.data(), *out = image.pixels.data();
const DenoiseImageLayer &layer = image.layers[current_layer];
const int *output_to_image_channel = layer.output_to_image_channel.data();
for (int y = 0; y < image.height; y++) {
for (int x = 0; x < image.width; x++, result += buffers.params.pass_stride) {
for (int j = 0; j < OUTPUT_NUM_CHANNELS; j++) {
int offset = buffers.params.get_pass_offset(PASS_COMBINED, PassMode::DENOISED);
int image_channel = output_to_image_channel[j];
out[image.num_channels * x + image_channel] = result[offset + j];
}
}
out += image.num_channels * image.width;
}
DeviceTask task(DeviceTask::RENDER);
create_task(task);
device->task_add(task);
device->task_wait();
printf("\n");
}
@@ -320,7 +586,8 @@ bool DenoiseTask::save()
void DenoiseTask::free()
{
image.free();
buffers.buffer.free();
input_pixels.free();
assert(output_pixels.empty());
}
/* Denoise Image Storage */
@@ -340,7 +607,7 @@ DenoiseImage::~DenoiseImage()
void DenoiseImage::close_input()
{
in_previous.reset();
in_neighbors.clear();
}
void DenoiseImage::free()
@@ -410,61 +677,39 @@ bool DenoiseImage::parse_channels(const ImageSpec &in_spec, string &error)
return true;
}
void DenoiseImage::read_pixels(const DenoiseImageLayer &layer,
const BufferParams &params,
float *input_pixels)
void DenoiseImage::read_pixels(const DenoiseImageLayer &layer, float *input_pixels)
{
/* Pixels from center file have already been loaded into pixels.
* We copy a subset into the device input buffer with channels reshuffled. */
const int *input_to_image_channel = layer.input_to_image_channel.data();
for (int i = 0; i < width * height; i++) {
for (int j = 0; j < 3; ++j) {
int offset = params.get_pass_offset(PASS_COMBINED);
int image_channel = input_to_image_channel[INPUT_NOISY_IMAGE + j];
input_pixels[i * params.pass_stride + offset + j] =
pixels[((size_t)i) * num_channels + image_channel];
}
for (int j = 0; j < 3; ++j) {
int offset = params.get_pass_offset(PASS_DENOISING_NORMAL);
int image_channel = input_to_image_channel[INPUT_DENOISING_NORMAL + j];
input_pixels[i * params.pass_stride + offset + j] =
pixels[((size_t)i) * num_channels + image_channel];
}
for (int j = 0; j < 3; ++j) {
int offset = params.get_pass_offset(PASS_DENOISING_ALBEDO);
int image_channel = input_to_image_channel[INPUT_DENOISING_ALBEDO + j];
input_pixels[i * params.pass_stride + offset + j] =
pixels[((size_t)i) * num_channels + image_channel];
}
for (int j = 0; j < 4; ++j) {
int offset = params.get_pass_offset(PASS_MOTION);
int image_channel = input_to_image_channel[INPUT_MOTION + j];
input_pixels[i * params.pass_stride + offset + j] =
for (int j = 0; j < INPUT_NUM_CHANNELS; j++) {
int image_channel = input_to_image_channel[j];
input_pixels[i * INPUT_NUM_CHANNELS + j] =
pixels[((size_t)i) * num_channels + image_channel];
}
}
}
bool DenoiseImage::read_previous_pixels(const DenoiseImageLayer &layer,
const BufferParams &params,
bool DenoiseImage::read_neighbor_pixels(int neighbor,
const DenoiseImageLayer &layer,
float *input_pixels)
{
/* Load pixels from neighboring frames, and copy them into device buffer
* with channels reshuffled. */
size_t num_pixels = (size_t)width * (size_t)height;
array<float> neighbor_pixels(num_pixels * num_channels);
if (!in_previous->read_image(TypeDesc::FLOAT, neighbor_pixels.data())) {
if (!in_neighbors[neighbor]->read_image(TypeDesc::FLOAT, neighbor_pixels.data())) {
return false;
}
const int *output_to_image_channel = layer.previous_output_to_image_channel.data();
const int *input_to_image_channel = layer.neighbor_input_to_image_channel[neighbor].data();
for (int i = 0; i < width * height; i++) {
for (int j = 0; j < 3; ++j) {
int offset = params.get_pass_offset(PASS_DENOISING_PREVIOUS);
int image_channel = output_to_image_channel[j];
input_pixels[i * params.pass_stride + offset + j] =
for (int j = 0; j < INPUT_NUM_CHANNELS; j++) {
int image_channel = input_to_image_channel[j];
input_pixels[i * INPUT_NUM_CHANNELS + j] =
neighbor_pixels[((size_t)i) * num_channels + image_channel];
}
}
@@ -494,8 +739,8 @@ bool DenoiseImage::load(const string &in_filepath, string &error)
return false;
}
if (layers.empty()) {
error = "Could not find a render layer containing denoising data and motion vector passes";
if (layers.size() == 0) {
error = "Could not find a render layer containing denoising info";
return false;
}
@@ -512,33 +757,45 @@ bool DenoiseImage::load(const string &in_filepath, string &error)
return true;
}
bool DenoiseImage::load_previous(const string &filepath, string &error)
bool DenoiseImage::load_neighbors(const vector<string> &filepaths,
const vector<int> &frames,
string &error)
{
if (!Filesystem::is_regular(filepath)) {
error = "Couldn't find neighbor frame: " + filepath;
if (frames.size() > DENOISE_MAX_FRAMES - 1) {
error = string_printf("Maximum number of neighbors (%d) exceeded\n", DENOISE_MAX_FRAMES - 1);
return false;
}
unique_ptr<ImageInput> in_neighbor(ImageInput::open(filepath));
if (!in_neighbor) {
error = "Couldn't open neighbor frame: " + filepath;
return false;
}
for (int neighbor = 0; neighbor < frames.size(); neighbor++) {
int frame = frames[neighbor];
const string &filepath = filepaths[frame];
const ImageSpec &neighbor_spec = in_neighbor->spec();
if (neighbor_spec.width != width || neighbor_spec.height != height) {
error = "Neighbor frame has different dimensions: " + filepath;
return false;
}
for (DenoiseImageLayer &layer : layers) {
if (!layer.match_channels(in_spec.channelnames, neighbor_spec.channelnames)) {
error = "Neighbor frame misses denoising data passes: " + filepath;
if (!Filesystem::is_regular(filepath)) {
error = "Couldn't find neighbor frame: " + filepath;
return false;
}
}
in_previous = std::move(in_neighbor);
unique_ptr<ImageInput> in_neighbor(ImageInput::open(filepath));
if (!in_neighbor) {
error = "Couldn't open neighbor frame: " + filepath;
return false;
}
const ImageSpec &neighbor_spec = in_neighbor->spec();
if (neighbor_spec.width != width || neighbor_spec.height != height) {
error = "Neighbor frame has different dimensions: " + filepath;
return false;
}
foreach (DenoiseImageLayer &layer, layers) {
if (!layer.match_channels(neighbor, in_spec.channelnames, neighbor_spec.channelnames)) {
error = "Neighbor frame misses denoising data passes: " + filepath;
return false;
}
}
in_neighbors.push_back(std::move(in_neighbor));
}
return true;
}
@@ -607,22 +864,24 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error)
/* File pattern handling and outer loop over frames */
DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info, const DenoiseParams &params)
DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info)
{
samples_override = 0;
tile_size = make_int2(64, 64);
num_frames = 0;
/* Initialize task scheduler. */
TaskScheduler::init();
/* Initialize device. */
device = Device::create(device_info, stats, profiler);
device->load_kernels(KERNEL_FEATURE_DENOISING);
device = Device::create(device_info, stats, profiler, true);
denoiser = Denoiser::create(device, params);
denoiser->load_kernels(nullptr);
device->load_kernels(KERNEL_FEATURE_DENOISING);
}
DenoiserPipeline::~DenoiserPipeline()
{
denoiser.reset();
delete device;
TaskScheduler::exit();
}
@@ -631,7 +890,7 @@ bool DenoiserPipeline::run()
{
assert(input.size() == output.size());
int num_frames = output.size();
num_frames = output.size();
for (int frame = 0; frame < num_frames; frame++) {
/* Skip empty output paths. */
@@ -639,8 +898,16 @@ bool DenoiserPipeline::run()
continue;
}
/* Determine neighbor frame numbers that should be used for filtering. */
vector<int> neighbor_frames;
for (int f = frame - params.neighbor_frames; f <= frame + params.neighbor_frames; f++) {
if (f >= 0 && f < num_frames && f != frame) {
neighbor_frames.push_back(f);
}
}
/* Execute task. */
DenoiseTask task(device, this, frame);
DenoiseTask task(device, this, frame, neighbor_frames);
if (!task.load()) {
error = task.error;
return false;
@@ -663,3 +930,5 @@ bool DenoiserPipeline::run()
}
CCL_NAMESPACE_END
#endif


@@ -17,17 +17,20 @@
#ifndef __DENOISING_H__
#define __DENOISING_H__
#if 0
/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or
* parameters. Currently it is an annoying mixture of terms used interchangeably. */
#include "device/device.h"
#include "integrator/denoiser.h"
# include "device/device.h"
#include "util/string.h"
#include "util/unique_ptr.h"
#include "util/vector.h"
# include "render/buffers.h"
#include <OpenImageIO/imageio.h>
# include "util/util_string.h"
# include "util/util_unique_ptr.h"
# include "util/util_vector.h"
# include <OpenImageIO/imageio.h>
OIIO_NAMESPACE_USING
@@ -37,7 +40,7 @@ CCL_NAMESPACE_BEGIN
class DenoiserPipeline {
public:
DenoiserPipeline(DeviceInfo &device_info, const DenoiseParams &params);
DenoiserPipeline(DeviceInfo &device_info);
~DenoiserPipeline();
bool run();
@@ -52,13 +55,22 @@ class DenoiserPipeline {
* taking into account all input frames. */
vector<string> output;
/* Sample number override, takes precedence over values from input frames. */
int samples_override;
/* Tile size for processing on device. */
int2 tile_size;
/* Equivalent to the settings in the regular denoiser. */
DenoiseParams params;
protected:
friend class DenoiseTask;
Stats stats;
Profiler profiler;
Device *device;
std::unique_ptr<Denoiser> denoiser;
int num_frames;
};
/* Denoise Image Layer */
@@ -76,13 +88,13 @@ struct DenoiseImageLayer {
/* Device input channel will be copied from image channel input_to_image_channel[i]. */
vector<int> input_to_image_channel;
/* input_to_image_channel of the secondary frames, if any are used. */
vector<vector<int>> neighbor_input_to_image_channel;
/* Write i-th channel of the processing output to output_to_image_channel[i]-th channel of the
* file. */
vector<int> output_to_image_channel;
/* output_to_image_channel of the previous frame, if used. */
vector<int> previous_output_to_image_channel;
/* Detect whether this layer contains a full set of channels and set up the offsets accordingly.
*/
bool detect_denoising_channels();
@@ -90,7 +102,8 @@ struct DenoiseImageLayer {
/* Map the channels of a secondary frame to the channels that are required for processing,
* fill neighbor_input_to_image_channel if all are present or return false if a channel are
* missing. */
bool match_channels(const std::vector<string> &channelnames,
bool match_channels(int neighbor,
const std::vector<string> &channelnames,
const std::vector<string> &neighbor_channelnames);
};
@@ -112,7 +125,7 @@ class DenoiseImage {
/* Image file handles */
ImageSpec in_spec;
unique_ptr<ImageInput> in_previous;
vector<unique_ptr<ImageInput>> in_neighbors;
/* Render layers */
vector<DenoiseImageLayer> layers;
@@ -124,16 +137,12 @@ class DenoiseImage {
bool load(const string &in_filepath, string &error);
/* Load neighboring frames. */
bool load_previous(const string &in_filepath, string &error);
bool load_neighbors(const vector<string> &filepaths, const vector<int> &frames, string &error);
/* Load subset of pixels from file buffer into input buffer, as needed for denoising
* on the device. Channels are reshuffled following the provided mapping. */
void read_pixels(const DenoiseImageLayer &layer,
const BufferParams &params,
float *input_pixels);
bool read_previous_pixels(const DenoiseImageLayer &layer,
const BufferParams &params,
float *input_pixels);
void read_pixels(const DenoiseImageLayer &layer, float *input_pixels);
bool read_neighbor_pixels(int neighbor, const DenoiseImageLayer &layer, float *input_pixels);
bool save_output(const string &out_filepath, string &error);
@@ -150,7 +159,10 @@ class DenoiseImage {
class DenoiseTask {
public:
DenoiseTask(Device *device, DenoiserPipeline *denoiser, int frame);
DenoiseTask(Device *device,
DenoiserPipeline *denoiser,
int frame,
const vector<int> &neighbor_frames);
~DenoiseTask();
/* Task stages */
@@ -168,17 +180,37 @@ class DenoiseTask {
/* Frame number to be denoised */
int frame;
vector<int> neighbor_frames;
/* Image file data */
DenoiseImage image;
int current_layer;
RenderBuffers buffers;
/* Device input buffer */
device_vector<float> input_pixels;
/* Tiles */
thread_mutex tiles_mutex;
list<RenderTile> tiles;
int num_tiles;
thread_mutex output_mutex;
map<int, device_vector<float> *> output_pixels;
/* Task handling */
bool load_input_pixels(int layer);
void create_task(DeviceTask &task);
/* Device task callbacks */
bool acquire_tile(Device *device, Device *tile_device, RenderTile &tile);
void map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
void unmap_neighboring_tiles(RenderTileNeighbors &neighbors);
void release_tile();
bool get_cancel();
};
CCL_NAMESPACE_END
#endif
#endif /* __DENOISING_H__ */


@@ -54,8 +54,6 @@ class DisplayDriver {
}
};
virtual void next_tile_begin() = 0;
/* Update the render from the rendering thread.
*
* Cycles periodically updates the render to be displayed. For multithreaded updates with
@@ -82,9 +80,6 @@ class DisplayDriver {
virtual bool update_begin(const Params &params, int width, int height) = 0;
virtual void update_end() = 0;
/* Optionally flush outstanding display commands before ending the render loop. */
virtual void flush(){};
virtual half4 *map_texture_buffer() = 0;
virtual void unmap_texture_buffer() = 0;
@@ -102,17 +97,6 @@ class DisplayDriver {
/* Clear the entire buffer before doing partial write to it. */
bool need_clear = false;
/* Enforce re-creation of the graphics interop object.
*
* When this field is true then the graphics interop will be re-created no matter what the
* rest of the configuration is.
* When this field is false the graphics interop will be re-created if the PBO or buffer size
* did change.
*
* This allows to ensure graphics interop is re-created when there is a possibility that an
* underlying PBO was re-allocated but did not change its ID. */
bool need_recreate = false;
};
virtual GraphicsInterop graphics_interop_get()


@@ -192,8 +192,6 @@ void Session::run_main_render_loop()
break;
}
}
path_trace_->flush_display();
}
void Session::run()
@@ -305,7 +303,7 @@ RenderWork Session::run_update_for_next_iteration()
tile_params.update_offset_stride();
path_trace_->reset(buffer_params_, tile_params, did_reset);
path_trace_->reset(buffer_params_, tile_params);
}
const int resolution = render_work.resolution_divider;
@@ -386,8 +384,7 @@ int2 Session::get_effective_tile_size() const
const int tile_size = tile_manager_.compute_render_tile_size(params.tile_size);
const int64_t actual_tile_area = static_cast<int64_t>(tile_size) * tile_size;
if (actual_tile_area >= image_area && image_width <= TileManager::MAX_TILE_SIZE &&
image_height <= TileManager::MAX_TILE_SIZE) {
if (actual_tile_area >= image_area) {
return make_int2(image_width, image_height);
}
@@ -426,11 +423,6 @@ void Session::do_delayed_reset()
buffer_params_.update_passes(scene->passes);
tile_manager_.update(buffer_params_, scene);
/* Update temp directory on reset.
* This potentially allows to finish the existing rendering with a previously configure temporary
* direcotry in the host software and switch to a new temp directory when new render starts. */
tile_manager_.set_temp_dir(params.temp_dir);
/* Progress. */
progress.reset_sample();
progress.set_total_pixel_samples(static_cast<uint64_t>(buffer_params_.width) *


@@ -69,9 +69,6 @@ class SessionParams {
ShadingSystem shadingsystem;
/* Session-specific temporary directory to store in-progress EXR files in. */
string temp_dir;
SessionParams()
{
headless = false;


@@ -23,7 +23,6 @@
#include "scene/film.h"
#include "scene/integrator.h"
#include "scene/scene.h"
#include "session/session.h"
#include "util/algorithm.h"
#include "util/foreach.h"
#include "util/log.h"
@@ -342,10 +341,8 @@ int TileManager::compute_render_tile_size(const int suggested_tile_size) const
/* Must be a multiple of IMAGE_TILE_SIZE so that we can write render tiles into the image file
* aligned on image tile boundaries. We can't set IMAGE_TILE_SIZE equal to the render tile size
* because too big tile size leads to integer overflow inside OpenEXR. */
const int computed_tile_size = (suggested_tile_size <= IMAGE_TILE_SIZE) ?
suggested_tile_size :
align_up(suggested_tile_size, IMAGE_TILE_SIZE);
return min(computed_tile_size, MAX_TILE_SIZE);
return (suggested_tile_size <= IMAGE_TILE_SIZE) ? suggested_tile_size :
align_up(suggested_tile_size, IMAGE_TILE_SIZE);
}
void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
@@ -395,11 +392,6 @@ void TileManager::update(const BufferParams &params, const Scene *scene)
}
}
void TileManager::set_temp_dir(const string &temp_dir)
{
temp_dir_ = temp_dir;
}
bool TileManager::done()
{
return tile_state_.next_tile_index == tile_state_.num_tiles;
@@ -458,8 +450,7 @@ const int2 TileManager::get_size() const
bool TileManager::open_tile_output()
{
write_state_.filename = path_join(temp_dir_,
"cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
to_string(write_state_.tile_file_index) + ".exr");
write_state_.tile_out = ImageOutput::create(write_state_.filename);


@@ -71,8 +71,6 @@ class TileManager {
* Will store all parameters needed for buffers access outside of the scene graph. */
void update(const BufferParams &params, const Scene *scene);
void set_temp_dir(const string &temp_dir);
inline int get_num_tiles() const
{
return tile_state_.num_tiles;
@@ -124,12 +122,6 @@ class TileManager {
/* Tile size in the image file. */
static const int IMAGE_TILE_SIZE = 128;
/* Maximum supported tile size.
* Needs to be safe from allocation on a GPU point of view: the display driver needs to be able
* to allocate texture with the side size of this value.
* Use conservative value which is safe for most of OpenGL drivers and GPUs. */
static const int MAX_TILE_SIZE = 8192;
protected:
/* Get tile configuration for its index.
* The tile index must be within [0, state_.tile_state_). */
@@ -138,8 +130,6 @@ class TileManager {
bool open_tile_output();
bool close_tile_output();
string temp_dir_;
/* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or
* several sessions. */
string tile_file_unique_part_;


@@ -38,6 +38,8 @@ set(ALL_CYCLES_LIBRARIES
)
include_directories(${INC})
cycles_link_directories()
set(SRC
integrator_adaptive_sampling_test.cpp
integrator_render_scheduler_test.cpp
@@ -52,21 +54,17 @@ set(SRC
util_transform_test.cpp
)
# Disable AVX tests on macOS. Rosetta has problems running them, and other
# platforms should be enough to verify AVX operations are implemented correctly.
if(NOT APPLE)
if(CXX_HAS_AVX)
list(APPEND SRC
util_avxf_avx_test.cpp
)
set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
util_avxf_avx2_test.cpp
)
set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
util_avxf_avx_test.cpp
)
set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
util_avxf_avx2_test.cpp
)
set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
if(WITH_GTESTS)


@@ -32,13 +32,9 @@ static bool validate_cpu_capabilities()
#endif
}
#define INIT_AVX_TEST \
#define VALIDATECPU \
if (!validate_cpu_capabilities()) \
return; \
\
const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
return;
#define compare_vector_scalar(a, b) \
for (size_t index = 0; index < a.size; index++) \
@@ -53,18 +49,21 @@ static bool validate_cpu_capabilities()
EXPECT_NEAR(a[index], b[index], abserror);
#define basic_test_vv(a, b, op) \
INIT_AVX_TEST \
VALIDATECPU \
avxf c = a op b; \
for (size_t i = 0; i < a.size; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
/* vector op float tests */
#define basic_test_vf(a, b, op) \
INIT_AVX_TEST \
VALIDATECPU \
avxf c = a op b; \
for (size_t i = 0; i < a.size; i++) \
EXPECT_FLOAT_EQ(c[i], a[i] op b);
static const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
static const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
static const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
static const float float_b = 1.5f;
TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
@@ -79,7 +78,7 @@ TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TES
TEST(TEST_CATEGORY_NAME, avxf_ctor)
{
INIT_AVX_TEST
VALIDATECPU
compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
static_cast<float>(index));
compare_vector_scalar(avxf(1.0f), 1.0f);
@@ -92,28 +91,28 @@ TEST(TEST_CATEGORY_NAME, avxf_ctor)
TEST(TEST_CATEGORY_NAME, avxf_sqrt)
{
INIT_AVX_TEST
VALIDATECPU
compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
}
TEST(TEST_CATEGORY_NAME, avxf_min_max)
{
INIT_AVX_TEST
VALIDATECPU
compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
}
TEST(TEST_CATEGORY_NAME, avxf_set_sign)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_msub)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = msub(avxf_a, avxf_b, avxf_c);
avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
(avxf_a[6] * avxf_b[6]) - avxf_c[6],
@@ -128,7 +127,7 @@ TEST(TEST_CATEGORY_NAME, avxf_msub)
TEST(TEST_CATEGORY_NAME, avxf_madd)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = madd(avxf_a, avxf_b, avxf_c);
avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
(avxf_a[6] * avxf_b[6]) + avxf_c[6],
@@ -143,7 +142,7 @@ TEST(TEST_CATEGORY_NAME, avxf_madd)
TEST(TEST_CATEGORY_NAME, avxf_nmadd)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = nmadd(avxf_a, avxf_b, avxf_c);
avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
avxf_c[6] - (avxf_a[6] * avxf_b[6]),
@@ -158,7 +157,7 @@ TEST(TEST_CATEGORY_NAME, avxf_nmadd)
TEST(TEST_CATEGORY_NAME, avxf_compare)
{
INIT_AVX_TEST
VALIDATECPU
avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
avxb res = a <= b;
@@ -177,28 +176,28 @@ TEST(TEST_CATEGORY_NAME, avxf_compare)
TEST(TEST_CATEGORY_NAME, avxf_permute)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
}
TEST(TEST_CATEGORY_NAME, avxf_blend)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_shuffle)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
}
TEST(TEST_CATEGORY_NAME, avxf_cross)
{
INIT_AVX_TEST
VALIDATECPU
avxf res = cross(avxf_b, avxf_c);
compare_vector_vector_near(res,
avxf(0.0f,
@@ -214,7 +213,7 @@ TEST(TEST_CATEGORY_NAME, avxf_cross)
TEST(TEST_CATEGORY_NAME, avxf_dot3)
{
INIT_AVX_TEST
VALIDATECPU
float den, den2;
dot3(avxf_a, avxf_b, den, den2);
EXPECT_FLOAT_EQ(den, 14.9f);


@@ -53,6 +53,16 @@ if(WITH_CYCLES_STANDALONE)
endif()
endif()
if(CYCLES_STANDALONE_REPOSITORY)
list(APPEND INC_SYS
../../third_party/numaapi/include
)
else()
list(APPEND INC_SYS
../../numaapi/include
)
endif()
set(SRC_HEADERS
algorithm.h
aligned_malloc.h


@@ -66,6 +66,7 @@ typedef struct stat path_stat_t;
static string cached_path = "";
static string cached_user_path = "";
static string cached_temp_path = "";
static string cached_xdg_cache_path = "";
namespace {
@@ -335,10 +336,11 @@ static string path_xdg_cache_get()
}
#endif
void path_init(const string &path, const string &user_path)
void path_init(const string &path, const string &user_path, const string &temp_path)
{
cached_path = path;
cached_user_path = user_path;
cached_temp_path = temp_path;
#ifdef _MSC_VER
// workaround for https://svn.boost.org/trac/boost/ticket/6320
@@ -382,6 +384,15 @@ string path_cache_get(const string &sub)
#endif
}
string path_temp_get(const string &sub)
{
if (cached_temp_path == "") {
cached_temp_path = Filesystem::temp_directory_path();
}
return path_join(cached_temp_path, sub);
}
#if defined(__linux__) || defined(__APPLE__)
string path_xdg_home_get(const string &sub = "");
#endif

Some files were not shown because too many files have changed in this diff.