test using compiled function in math node

initial ir optimization
fix function generation
2022-01-02 20:46:01 +01:00 · 2022-01-02 18:23:04 +01:00 · 2022-01-02 16:00:33 +01:00 · 2022-01-02 16:49:23 +01:00 · 2022-01-02 14:59:02 +01:00 · 2021-12-29 21:44:40 +01:00
2329 changed files with 72404 additions and 43550 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -433,30 +433,40 @@ mark_as_advanced(WITH_CYCLES_DEBUG_NAN)
 mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)

 # NVIDIA CUDA & OptiX
-option(WITH_CYCLES_DEVICE_CUDA       "Enable Cycles NVIDIA CUDA compute support" ON)
-option(WITH_CYCLES_DEVICE_OPTIX      "Enable Cycles NVIDIA OptiX support" ON)
-mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
+if(NOT APPLE)
+  option(WITH_CYCLES_DEVICE_CUDA       "Enable Cycles NVIDIA CUDA compute support" ON)
+  option(WITH_CYCLES_DEVICE_OPTIX      "Enable Cycles NVIDIA OptiX support" ON)
+  mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)

-option(WITH_CYCLES_CUDA_BINARIES     "Build Cycles NVIDIA CUDA binaries" OFF)
-set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 sm_86 compute_75 CACHE STRING "CUDA architectures to build binaries for")
-option(WITH_CYCLES_CUBIN_COMPILER    "Build cubins with nvrtc based compiler instead of nvcc" OFF)
-option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF)
-option(WITH_CUDA_DYNLOAD             "Dynamically load CUDA libraries at runtime (for developers, makes cuda-gdb work)" ON)
-mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
-mark_as_advanced(WITH_CYCLES_CUBIN_COMPILER)
-mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
-mark_as_advanced(WITH_CUDA_DYNLOAD)
+  option(WITH_CYCLES_CUDA_BINARIES     "Build Cycles NVIDIA CUDA binaries" OFF)
+  set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 sm_86 compute_75 CACHE STRING "CUDA architectures to build binaries for")
+  option(WITH_CYCLES_CUBIN_COMPILER    "Build cubins with nvrtc based compiler instead of nvcc" OFF)
+  option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF)
+  option(WITH_CUDA_DYNLOAD             "Dynamically load CUDA libraries at runtime (for developers, makes cuda-gdb work)" ON)
+  mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
+  mark_as_advanced(WITH_CYCLES_CUBIN_COMPILER)
+  mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
+  mark_as_advanced(WITH_CUDA_DYNLOAD)
+endif()

 # AMD HIP
-if(WIN32)
-  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
-else()
-  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" OFF)
+if(NOT APPLE)
+  if(WIN32)
+    option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
+  else()
+    option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" OFF)
+  endif()
+
+  option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
+  set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "AMD HIP architectures to build binaries for")
+  mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
+  mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
+endif()
+
+# Apple Metal
+if(APPLE)
+  option(WITH_CYCLES_DEVICE_METAL       "Enable Cycles Apple Metal compute support" ON)
 endif()
-option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
-set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 CACHE STRING "AMD HIP architectures to build binaries for")
-mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
-mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)

 # Draw Manager
 option(WITH_DRAW_DEBUG "Add extra debug capabilities to Draw Manager" OFF)
@@ -846,7 +856,7 @@ if(WITH_AUDASPACE)
 endif()

 # Auto-enable CUDA dynload if toolkit is not found.
-if(NOT WITH_CUDA_DYNLOAD)
+if(WITH_CYCLES AND WITH_CYCLES_DEVICE_CUDA AND NOT WITH_CUDA_DYNLOAD)
  find_package(CUDA)
  if(NOT CUDA_FOUND)
    message(STATUS "CUDA toolkit not found, using dynamic runtime loading of libraries (WITH_CUDA_DYNLOAD) instead")
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -2083,9 +2083,9 @@ compile_OIIO() {
    cmake_d="$cmake_d -D OPENEXR_VERSION=$OPENEXR_VERSION"

    if [ "$_with_built_openexr" = true ]; then
-      cmake_d="$cmake_d -D ILMBASE_HOME=$INST/openexr"
-      cmake_d="$cmake_d -D OPENEXR_HOME=$INST/openexr"
-      INFO "ILMBASE_HOME=$INST/openexr"
+      cmake_d="$cmake_d -D ILMBASE_ROOT=$INST/openexr"
+      cmake_d="$cmake_d -D OPENEXR_ROOT=$INST/openexr"
+      INFO "ILMBASE_ROOT=$INST/openexr"
    fi

    # ptex is only needed when nicholas bishop is ready
@@ -2374,9 +2374,9 @@ compile_OSL() {
    #~ cmake_d="$cmake_d -D ILMBASE_VERSION=$ILMBASE_VERSION"

    if [ "$_with_built_openexr" = true ]; then
-      INFO "ILMBASE_HOME=$INST/openexr"
-      cmake_d="$cmake_d -D OPENEXR_ROOT_DIR=$INST/openexr"
-      cmake_d="$cmake_d -D ILMBASE_ROOT_DIR=$INST/openexr"
+      cmake_d="$cmake_d -D ILMBASE_ROOT=$INST/openexr"
+      cmake_d="$cmake_d -D OPENEXR_ROOT=$INST/openexr"
+      INFO "ILMBASE_ROOT=$INST/openexr"
      # XXX Temp workaround... sigh, ILMBase really messed the things up by defining their custom names ON by default :(
    fi

--- a/build_files/build_environment/patches/usd.diff
+++ b/build_files/build_environment/patches/usd.diff
@@ -197,3 +197,38 @@ index 67ec0d15f..6dc3e85a0 100644
 #else
 #error Unknown architecture.
 #endif
+
+diff --git a/pxr/base/arch/demangle.cpp b/pxr/base/arch/demangle.cpp
+index 67ec0d15f..6dc3e85a0 100644
+--- a/pxr/base/arch/demangle.cpp
+++ b/pxr/base/arch/demangle.cpp
+@@ -36,6 +36,7 @@
+ #if (ARCH_COMPILER_GCC_MAJOR == 3 && ARCH_COMPILER_GCC_MINOR >= 1) || \
+     ARCH_COMPILER_GCC_MAJOR > 3 || defined(ARCH_COMPILER_CLANG)
+ #define _AT_LEAST_GCC_THREE_ONE_OR_CLANG
+#include <cxxabi.h>
+ #endif
+ 
+ PXR_NAMESPACE_OPEN_SCOPE
+@@ -138,7 +139,6 @@
+ #endif
+ 
+ #if defined(_AT_LEAST_GCC_THREE_ONE_OR_CLANG)
+-#include <cxxabi.h>
+ 
+ /*
+  * This routine doesn't work when you get to gcc3.4.
+
+diff --git a/pxr/base/work/singularTask.h b/pxr/base/work/singularTask.h
+index 67ec0d15f..6dc3e85a0 100644
+--- a/pxr/base/work/singularTask.h
+++ b/pxr/base/work/singularTask.h
+@@ -120,7 +120,7 @@
+                     // case we go again to ensure the task can do whatever it
+                     // was awakened to do.  Once we successfully take the count
+                     // to zero, we stop.
+-                    size_t old = count;
+                    std::size_t old = count;
+                     do { _fn(); } while (
+                         !count.compare_exchange_strong(old, 0));
+                 });
--- a/build_files/cmake/config/blender_lite.cmake
+++ b/build_files/cmake/config/blender_lite.cmake
@@ -19,9 +19,6 @@ set(WITH_CODEC_SNDFILE       OFF CACHE BOOL "" FORCE)
 set(WITH_COMPOSITOR          OFF CACHE BOOL "" FORCE)
 set(WITH_COREAUDIO           OFF CACHE BOOL "" FORCE)
 set(WITH_CYCLES              OFF CACHE BOOL "" FORCE)
-set(WITH_CYCLES_DEVICE_OPTIX OFF CACHE BOOL "" FORCE)
-set(WITH_CYCLES_EMBREE       OFF CACHE BOOL "" FORCE)
-set(WITH_CYCLES_OSL          OFF CACHE BOOL "" FORCE)
 set(WITH_DRACO               OFF CACHE BOOL "" FORCE)
 set(WITH_FFTW3               OFF CACHE BOOL "" FORCE)
 set(WITH_FREESTYLE           OFF CACHE BOOL "" FORCE)
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -61,6 +61,7 @@ set(WITH_MEM_JEMALLOC          ON  CACHE BOOL "" FORCE)
 # platform dependent options
 if(APPLE)
  set(WITH_COREAUDIO           ON  CACHE BOOL "" FORCE)
+  set(WITH_CYCLES_DEVICE_METAL ON  CACHE BOOL "" FORCE)
 endif()
 if(NOT WIN32)
  set(WITH_JACK                ON  CACHE BOOL "" FORCE)
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -257,9 +257,6 @@ if(WITH_BOOST)
  if(WITH_INTERNATIONAL)
    list(APPEND _boost_FIND_COMPONENTS locale)
  endif()
-  if(WITH_CYCLES_NETWORK)
-    list(APPEND _boost_FIND_COMPONENTS serialization)
-  endif()
  if(WITH_OPENVDB)
    list(APPEND _boost_FIND_COMPONENTS iostreams)
  endif()
@@ -339,7 +336,7 @@ if(WITH_LLVM)

 endif()

-if(WITH_CYCLES_OSL)
+if(WITH_CYCLES AND WITH_CYCLES_OSL)
  set(CYCLES_OSL ${LIBDIR}/osl)

  find_library(OSL_LIB_EXEC NAMES oslexec PATHS ${CYCLES_OSL}/lib)
@@ -359,7 +356,7 @@ if(WITH_CYCLES_OSL)
  endif()
 endif()

-if(WITH_CYCLES_EMBREE)
+if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
  find_package(Embree 3.8.0 REQUIRED)
  # Increase stack size for Embree, only works for executables.
  if(NOT WITH_PYTHON_MODULE)
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -241,7 +241,7 @@ if(WITH_INPUT_NDOF)
  endif()
 endif()

-if(WITH_CYCLES_OSL)
+if(WITH_CYCLES AND WITH_CYCLES_OSL)
  set(CYCLES_OSL ${LIBDIR}/osl CACHE PATH "Path to OpenShadingLanguage installation")
  if(EXISTS ${CYCLES_OSL} AND NOT OSL_ROOT)
    set(OSL_ROOT ${CYCLES_OSL})
@@ -314,7 +314,7 @@ if(WITH_BOOST)
    endif()
    set(Boost_USE_MULTITHREADED ON)
    set(__boost_packages filesystem regex thread date_time)
-    if(WITH_CYCLES_OSL)
+    if(WITH_CYCLES AND WITH_CYCLES_OSL)
      if(NOT (${OSL_LIBRARY_VERSION_MAJOR} EQUAL "1" AND ${OSL_LIBRARY_VERSION_MINOR} LESS "6"))
        list(APPEND __boost_packages wave)
      else()
@@ -323,9 +323,6 @@ if(WITH_BOOST)
    if(WITH_INTERNATIONAL)
      list(APPEND __boost_packages locale)
    endif()
-    if(WITH_CYCLES_NETWORK)
-      list(APPEND __boost_packages serialization)
-    endif()
    if(WITH_OPENVDB)
      list(APPEND __boost_packages iostreams)
    endif()
@@ -403,7 +400,7 @@ if(WITH_OPENCOLORIO)
  endif()
 endif()

-if(WITH_CYCLES_EMBREE)
+if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
  find_package(Embree 3.8.0 REQUIRED)
 endif()

--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -477,7 +477,7 @@ if(WITH_PYTHON)
 endif()

 if(WITH_BOOST)
-  if(WITH_CYCLES_OSL)
+  if(WITH_CYCLES AND WITH_CYCLES_OSL)
    set(boost_extra_libs wave)
  endif()
  if(WITH_INTERNATIONAL)
@@ -520,7 +520,7 @@ if(WITH_BOOST)
      debug ${BOOST_LIBPATH}/libboost_thread-${BOOST_DEBUG_POSTFIX}
      debug ${BOOST_LIBPATH}/libboost_chrono-${BOOST_DEBUG_POSTFIX}
    )
-    if(WITH_CYCLES_OSL)
+    if(WITH_CYCLES AND WITH_CYCLES_OSL)
      set(BOOST_LIBRARIES ${BOOST_LIBRARIES}
        optimized ${BOOST_LIBPATH}/libboost_wave-${BOOST_POSTFIX}
        debug ${BOOST_LIBPATH}/libboost_wave-${BOOST_DEBUG_POSTFIX})
@@ -708,7 +708,7 @@ if(WITH_CODEC_SNDFILE)
  set(LIBSNDFILE_LIBRARIES ${LIBSNDFILE_LIBPATH}/libsndfile-1.lib)
 endif()

-if(WITH_CYCLES_OSL)
+if(WITH_CYCLES AND WITH_CYCLES_OSL)
  set(CYCLES_OSL ${LIBDIR}/osl CACHE PATH "Path to OpenShadingLanguage installation")
  set(OSL_SHADER_DIR ${CYCLES_OSL}/shaders)
  # Shaders have moved around a bit between OSL versions, check multiple locations
@@ -741,7 +741,7 @@ if(WITH_CYCLES_OSL)
  endif()
 endif()

-if(WITH_CYCLES_EMBREE)
+if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
  windows_find_package(Embree)
  if(NOT EMBREE_FOUND)
    set(EMBREE_INCLUDE_DIRS ${LIBDIR}/embree/include)
--- a/doc/doxygen/doxygen.intern.h
+++ b/doc/doxygen/doxygen.intern.h
@@ -6,91 +6,90 @@
 *  as part of the normal development process.
 */

-/** \defgroup MEM Guarded memory (de)allocation
- *  \ingroup intern
+/* TODO: other modules.
+ * - `libmv`
+ * - `cycles`
+ * - `opencolorio`
+ * - `opensubdiv`
+ * - `openvdb`
+ * - `quadriflow`
 */

-/** \defgroup clog C-Logging (CLOG)
- *  \ingroup intern
- */
+/** \defgroup intern_atomic Atomic Operations
+ *  \ingroup intern */

-/** \defgroup ctr container
- *  \ingroup intern
- */
+/** \defgroup intern_clog C-Logging (CLOG)
+ *  \ingroup intern */

-/** \defgroup iksolver iksolver
- *  \ingroup intern
- */
+/** \defgroup intern_eigen Eigen
+ *  \ingroup intern */

-/** \defgroup itasc itasc
- *  \ingroup intern
- */
+/** \defgroup intern_glew-mx GLEW with Multiple Rendering Contexts
+ *  \ingroup intern */

-/** \defgroup memutil memutil
- *  \ingroup intern
- */
+/** \defgroup intern_iksolver Inverse Kinematics (Solver)
+ *  \ingroup intern */

-/** \defgroup mikktspace mikktspace
- *  \ingroup intern
- */
+/** \defgroup intern_itasc Inverse Kinematics (ITASC)
+ *  \ingroup intern */

-/** \defgroup moto moto
- *  \ingroup intern
- */
+/** \defgroup intern_libc_compat libc Compatibility For Linux
+ *  \ingroup intern */

-/** \defgroup eigen eigen
- *  \ingroup intern
- */
+/** \defgroup intern_locale Locale
+ *  \ingroup intern */

-/** \defgroup smoke smoke
- *  \ingroup intern
- */
+/** \defgroup intern_mantaflow Manta-Flow Fluid Simulation
+ *  \ingroup intern */

-/** \defgroup string string
- *  \ingroup intern
- */
+/** \defgroup intern_mem Guarded Memory (de)allocation
+ *  \ingroup intern */
+
+/** \defgroup intern_memutil Memory Utilities (memutil)
+ *  \ingroup intern */
+
+/** \defgroup intern_mikktspace MikktSpace
+ *  \ingroup intern */
+
+/** \defgroup intern_numaapi NUMA (Non Uniform Memory Architecture)
+ *  \ingroup intern */
+
+/** \defgroup intern_rigidbody Rigid-Body C-API
+ *  \ingroup intern */
+
+/** \defgroup intern_sky_model Sky Model
+ *  \ingroup intern */
+
+/** \defgroup intern_utf_conv UTF-8/16 Conversion (utfconv)
+ *  \ingroup intern */

 /** \defgroup audaspace Audaspace
 *  \ingroup intern undoc
- *  \todo add to doxygen
- */
+ *  \todo add to doxygen */
 /** \defgroup audcoreaudio Audaspace CoreAudio
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audfx Audaspace FX
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audopenal Audaspace OpenAL
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audpulseaudio Audaspace PulseAudio
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audwasapi Audaspace WASAPI
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audpython Audaspace Python
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audsdl Audaspace SDL
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audsrc Audaspace SRC
- *
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audffmpeg Audaspace FFMpeg
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audfftw Audaspace FFTW
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audjack Audaspace Jack
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */
 /** \defgroup audsndfile Audaspace sndfile
- *  \ingroup audaspace
- */
+ *  \ingroup audaspace */

 /** \defgroup GHOST GHOST API
 * \ingroup intern GUI
--- a/doc/doxygen/doxygen.source.h
+++ b/doc/doxygen/doxygen.source.h
@@ -5,7 +5,8 @@
 /** \defgroup bmesh BMesh
 *  \ingroup blender
 */
-/** \defgroup compositor Compositing */
+/** \defgroup compositor Compositing
+ *  \ingroup blender */

 /** \defgroup python Python
 *  \ingroup blender
@@ -78,7 +79,8 @@
 *  \ingroup blender
 */

-/** \defgroup data DNA, RNA and .blend access*/
+/** \defgroup data DNA, RNA and .blend access
+ *  \ingroup blender */

 /** \defgroup gpu GPU
 *  \ingroup blender
@@ -101,11 +103,12 @@
 *   merged in docs.
 */

-/** \defgroup gui GUI */
+/**
+ * \defgroup gui GUI
+ * \ingroup blender */

 /** \defgroup wm Window Manager
- *  \ingroup blender gui
- */
+ *  \ingroup gui */

 /* ================================ */

@@ -279,7 +282,8 @@
 *  \ingroup gui
 */

-/** \defgroup externformats External Formats */
+/** \defgroup externformats External Formats
+ *  \ingroup blender */

 /** \defgroup collada COLLADA
 *  \ingroup externformats
@@ -308,4 +312,7 @@
 /* ================================ */

 /** \defgroup undoc Undocumented
- *  \brief Modules and libraries that are still undocumented, or lacking proper integration into the doxygen system, are marked in this group. */
+ *
+ * \brief Modules and libraries that are still undocumented,
+ * or lacking proper integration into the doxygen system, are marked in this group.
+ */
--- a/doc/manpage/blender.1.py
+++ b/doc/manpage/blender.1.py
@@ -61,7 +61,7 @@ def blender_extract_info(blender_bin: str) -> Dict[str, str]:
        stdout=subprocess.PIPE,
    ).stdout.decode(encoding="utf-8")

-    blender_version_ouput = subprocess.run(
+    blender_version_output = subprocess.run(
        [blender_bin, "--version"],
        env=blender_env,
        check=True,
@@ -73,7 +73,7 @@ def blender_extract_info(blender_bin: str) -> Dict[str, str]:
    # check for each lines prefix to ensure these aren't included.
    blender_version = ""
    blender_date = ""
-    for l in blender_version_ouput.split("\n"):
+    for l in blender_version_output.split("\n"):
        if l.startswith("Blender "):
            # Remove 'Blender' prefix.
            blender_version = l.split(" ", 1)[1].strip()
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1125,7 +1125,7 @@ context_type_map = {
    "soft_body": ("SoftBodyModifier", False),
    "speaker": ("Speaker", False),
    "texture": ("Texture", False),
-    "texture_slot": ("MaterialTextureSlot", False),
+    "texture_slot": ("TextureSlot", False),
    "texture_user": ("ID", False),
    "texture_user_property": ("Property", False),
    "ui_list": ("UIList", False),
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -45,7 +45,7 @@
 */

 /** \file
- * \ingroup Atomic
+ * \ingroup intern_atomic
 *
 * \brief Provides wrapper around system-specific atomic primitives,
 * and some extensions (faked-atomic operations over float numbers).
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -44,6 +44,10 @@
 * The Original Code is: adapted from jemalloc.
 */

+/** \file
+ * \ingroup intern_atomic
+ */
+
 #ifndef __ATOMIC_OPS_EXT_H__
 #define __ATOMIC_OPS_EXT_H__

--- a/intern/atomic/intern/atomic_ops_msvc.h
+++ b/intern/atomic/intern/atomic_ops_msvc.h
@@ -26,6 +26,10 @@
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+/** \file
+ * \ingroup intern_atomic
+ */
+
 #ifndef __ATOMIC_OPS_MSVC_H__
 #define __ATOMIC_OPS_MSVC_H__

--- a/intern/atomic/intern/atomic_ops_unix.h
+++ b/intern/atomic/intern/atomic_ops_unix.h
@@ -44,6 +44,10 @@
 * The Original Code is: adapted from jemalloc.
 */

+/** \file
+ * \ingroup intern_atomic
+ */
+
 #ifndef __ATOMIC_OPS_UNIX_H__
 #define __ATOMIC_OPS_UNIX_H__

--- a/intern/atomic/intern/atomic_ops_utils.h
+++ b/intern/atomic/intern/atomic_ops_utils.h
@@ -44,6 +44,10 @@
 * The Original Code is: adapted from jemalloc.
 */

+/** \file
+ * \ingroup intern_atomic
+ */
+
 #ifndef __ATOMIC_OPS_UTILS_H__
 #define __ATOMIC_OPS_UTILS_H__

--- a/intern/clog/CLG_log.h
+++ b/intern/clog/CLG_log.h
@@ -14,11 +14,8 @@
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

-#ifndef __CLG_LOG_H__
-#define __CLG_LOG_H__
-
 /** \file
- * \ingroup clog
+ * \ingroup intern_clog
 *
 * C Logging Library (clog)
 * ========================
@@ -68,6 +65,9 @@
 * - 4+: May be used for more details than 3, should be avoided but not prevented.
 */

+#ifndef __CLG_LOG_H__
+#define __CLG_LOG_H__
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
--- a/intern/clog/clog.c
+++ b/intern/clog/clog.c
@@ -15,7 +15,7 @@
 */

 /** \file
- * \ingroup clog
+ * \ingroup intern_clog
 */

 #include <assert.h>
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -40,6 +40,7 @@ set(SRC
  object_cull.cpp
  output_driver.cpp
  particles.cpp
+  pointcloud.cpp
  curves.cpp
  logging.cpp
  python.cpp
@@ -87,6 +88,7 @@ endif()

 set(ADDON_FILES
  addon/__init__.py
+  addon/camera.py
  addon/engine.py
  addon/operators.py
  addon/osl.py
@@ -101,6 +103,11 @@ add_definitions(${GL_DEFINITIONS})
 if(WITH_CYCLES_DEVICE_HIP)
  add_definitions(-DWITH_HIP)
 endif()
+
+if(WITH_CYCLES_DEVICE_METAL)
+  add_definitions(-DWITH_METAL)
+endif()
+
 if(WITH_MOD_FLUID)
  add_definitions(-DWITH_FLUID)
 endif()
--- a/intern/cycles/blender/addon/camera.py
+++ b/intern/cycles/blender/addon/camera.py
@@ -0,0 +1,84 @@
+#
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# <pep8 compliant>
+
+# Fit to match default projective camera with focal_length 50 and sensor_width 36.
+default_fisheye_polynomial = [-1.1735143712967577e-05,
+                              -0.019988736953434998,
+                              -3.3525322965709175e-06,
+                              3.099275275886036e-06,
+                              -2.6064646454854524e-08]
+
+# Utilities to generate lens polynomials to match built-in camera types, only here
+# for reference at the moment, not used by the code.
+def create_grid(sensor_height, sensor_width):
+    import numpy as np
+    if sensor_height is None:
+        sensor_height = sensor_width / (16 / 9)  # Default aspect ratio 16:9
+    uu, vv = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
+    uu = (uu - 0.5) * sensor_width
+    vv = (vv - 0.5) * sensor_height
+    rr = np.sqrt(uu ** 2 + vv ** 2)
+    return rr
+
+
+def fisheye_lens_polynomial_from_projective(focal_length=50, sensor_width=36, sensor_height=None):
+    import numpy as np
+    rr = create_grid(sensor_height, sensor_width)
+    polynomial = np.polyfit(rr.flat, (-np.arctan(rr / focal_length)).flat, 4)
+    return list(reversed(polynomial))
+
+
+def fisheye_lens_polynomial_from_projective_fov(fov, sensor_width=36, sensor_height=None):
+    import numpy as np
+    f = sensor_width / 2 / np.tan(fov / 2)
+    return fisheye_lens_polynomial_from_projective(f, sensor_width, sensor_height)
+
+
+def fisheye_lens_polynomial_from_equisolid(lens=10.5, sensor_width=36, sensor_height=None):
+    import numpy as np
+    rr = create_grid(sensor_height, sensor_width)
+    x = rr.reshape(-1)
+    x = np.stack([x**i for i in [1, 2, 3, 4]])
+    y = (-2 * np.arcsin(rr / (2 * lens))).reshape(-1)
+    polynomial = np.linalg.lstsq(x.T, y.T, rcond=None)[0]
+    return [0] + list(polynomial)
+
+
+def fisheye_lens_polynomial_from_equidistant(fov=180, sensor_width=36, sensor_height=None):
+    import numpy as np
+    return [0, -np.radians(fov) / sensor_width, 0, 0, 0]
+
+
+def fisheye_lens_polynomial_from_distorted_projective_polynomial(k1, k2, k3, focal_length=50, sensor_width=36, sensor_height=None):
+    import numpy as np
+    rr = create_grid(sensor_height, sensor_width)
+    r2 = (rr / focal_length) ** 2
+    r4 = r2 * r2
+    r6 = r4 * r2
+    r_coeff = 1 + k1 * r2 + k2 * r4 + k3 * r6
+    polynomial = np.polyfit(rr.flat, (-np.arctan(rr / focal_length * r_coeff)).flat, 4)
+    return list(reversed(polynomial))
+
+def fisheye_lens_polynomial_from_distorted_projective_divisions(k1, k2, focal_length=50, sensor_width=36, sensor_height=None):
+    import numpy as np
+    rr = create_grid(sensor_height, sensor_width)
+    r2 = (rr / focal_length) ** 2
+    r4 = r2 * r2
+    r_coeff = 1 + k1 * r2 + k2 * r4
+    polynomial = np.polyfit(rr.flat, (-np.arctan(rr / focal_length / r_coeff)).flat, 4)
+    return list(reversed(polynomial))
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def _configure_argument_parser():
                        action='store_true')
    parser.add_argument("--cycles-device",
                        help="Set the device to use for Cycles, overriding user preferences and the scene setting."
-                             "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
+                             " Valid options are 'CPU', 'CUDA', 'OPTIX', 'HIP' or 'METAL'. "
                             "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
                        default=None)
    return parser
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -33,6 +33,7 @@ from math import pi
 # enums

 from . import engine
+from . import camera

 enum_devices = (
    ('CPU', "CPU", "Use CPU for rendering"),
@@ -72,6 +73,8 @@ enum_panorama_types = (
    ('FISHEYE_EQUISOLID', "Fisheye Equisolid",
                          "Similar to most fisheye modern lens, takes sensor dimensions into consideration"),
    ('MIRRORBALL', "Mirror Ball", "Uses the mirror ball mapping"),
+    ('FISHEYE_LENS_POLYNOMIAL', "Fisheye Lens Polynomial",
+     "Defines the lens projection as polynomial to allow real world camera lenses to be mimicked."),
 )

 enum_curve_shape = (
@@ -111,7 +114,8 @@ enum_device_type = (
    ('CPU', "CPU", "CPU", 0),
    ('CUDA', "CUDA", "CUDA", 1),
    ('OPTIX', "OptiX", "OptiX", 3),
-    ("HIP", "HIP", "HIP", 4)
+    ('HIP', "HIP", "HIP", 4),
+    ('METAL', "Metal", "Metal", 5)
 )

 enum_texture_limit = (
@@ -890,6 +894,32 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):
        default=pi,
    )

+    fisheye_polynomial_k0: FloatProperty(
+        name="Fisheye Polynomial K0",
+        description="Coefficient K0 of the lens polynomial",
+        default=camera.default_fisheye_polynomial[0], precision=6, step=0.1, subtype='ANGLE',
+    )
+    fisheye_polynomial_k1: FloatProperty(
+        name="Fisheye Polynomial K1",
+        description="Coefficient K1 of the lens polynomial",
+        default=camera.default_fisheye_polynomial[1], precision=6, step=0.1, subtype='ANGLE',
+    )
+    fisheye_polynomial_k2: FloatProperty(
+        name="Fisheye Polynomial K2",
+        description="Coefficient K2 of the lens polynomial",
+        default=camera.default_fisheye_polynomial[2], precision=6, step=0.1, subtype='ANGLE',
+    )
+    fisheye_polynomial_k3: FloatProperty(
+        name="Fisheye Polynomial K3",
+        description="Coefficient K3 of the lens polynomial",
+        default=camera.default_fisheye_polynomial[3], precision=6, step=0.1, subtype='ANGLE',
+    )
+    fisheye_polynomial_k4: FloatProperty(
+        name="Fisheye Polynomial K4",
+        description="Coefficient K4 of the lens polynomial",
+        default=camera.default_fisheye_polynomial[4], precision=6, step=0.1, subtype='ANGLE',
+    )
+
    @classmethod
    def register(cls):
        bpy.types.Camera.cycles = PointerProperty(
@@ -1312,8 +1342,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):

    def get_device_types(self, context):
        import _cycles
-        has_cuda, has_optix, has_hip = _cycles.get_device_types()
-
+        has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types()
        list = [('NONE', "None", "Don't use compute device", 0)]
        if has_cuda:
            list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
@@ -1321,6 +1350,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
        if has_hip:
            list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
+        if has_metal:
+            list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5))

        return list

@@ -1346,7 +1377,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):

    def update_device_entries(self, device_list):
        for device in device_list:
-            if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
+            if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}:
                continue
            # Try to find existing Device entry
            entry = self.find_existing_device_entry(device)
@@ -1390,7 +1421,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        import _cycles
        # Ensure `self.devices` is not re-allocated when the second call to
        # get_devices_for_type is made, freeing items from the first list.
-        for device_type in ('CUDA', 'OPTIX', 'HIP'):
+        for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'):
            self.update_device_entries(_cycles.available_devices(device_type))

    # Deprecated: use refresh_devices instead.
@@ -1442,6 +1473,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                col.label(text="Requires discrete AMD GPU with RDNA architecture", icon='BLANK1')
                if sys.platform[:3] == "win":
                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
+            elif device_type == 'METAL':
+                col.label(text="Requires Apple Silicon and macOS 12.0 or newer", icon='BLANK1')
            return

        for device in devices:
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -97,6 +97,11 @@ def use_cpu(context):
    return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')


+def use_metal(context):
+    cscene = context.scene.cycles
+
+    return (get_device_type(context) == 'METAL' and cscene.device == 'GPU')
+
 def use_cuda(context):
    cscene = context.scene.cycles

@@ -1015,7 +1020,7 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
    def poll(cls, context):
        ob = context.object
        if CyclesButtonsPanel.poll(context) and ob:
-            if ob.type in {'MESH', 'CURVE', 'CURVE', 'SURFACE', 'FONT', 'META', 'CAMERA'}:
+            if ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'CAMERA', 'HAIR', 'POINTCLOUD'}:
                return True
            if ob.instance_type == 'COLLECTION' and ob.instance_collection:
                return True
--- a/intern/cycles/blender/camera.cpp
+++ b/intern/cycles/blender/camera.cpp
@@ -69,6 +69,12 @@ struct BlenderCamera {
  float pole_merge_angle_from;
  float pole_merge_angle_to;

+  float fisheye_polynomial_k0;
+  float fisheye_polynomial_k1;
+  float fisheye_polynomial_k2;
+  float fisheye_polynomial_k3;
+  float fisheye_polynomial_k4;
+
  enum { AUTO, HORIZONTAL, VERTICAL } sensor_fit;
  float sensor_width;
  float sensor_height;
@@ -200,6 +206,12 @@ static void blender_camera_from_object(BlenderCamera *bcam,
    bcam->longitude_min = RNA_float_get(&ccamera, "longitude_min");
    bcam->longitude_max = RNA_float_get(&ccamera, "longitude_max");

+    bcam->fisheye_polynomial_k0 = RNA_float_get(&ccamera, "fisheye_polynomial_k0");
+    bcam->fisheye_polynomial_k1 = RNA_float_get(&ccamera, "fisheye_polynomial_k1");
+    bcam->fisheye_polynomial_k2 = RNA_float_get(&ccamera, "fisheye_polynomial_k2");
+    bcam->fisheye_polynomial_k3 = RNA_float_get(&ccamera, "fisheye_polynomial_k3");
+    bcam->fisheye_polynomial_k4 = RNA_float_get(&ccamera, "fisheye_polynomial_k4");
+
    bcam->interocular_distance = b_camera.stereo().interocular_distance();
    if (b_camera.stereo().convergence_mode() == BL::CameraStereoData::convergence_mode_PARALLEL) {
      bcam->convergence_distance = FLT_MAX;
@@ -422,7 +434,8 @@ static void blender_camera_sync(Camera *cam,
  cam->set_full_height(height);

  /* panorama sensor */
-  if (bcam->type == CAMERA_PANORAMA && bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID) {
+  if (bcam->type == CAMERA_PANORAMA && (bcam->panorama_type == PANORAMA_FISHEYE_EQUISOLID ||
+                                        bcam->panorama_type == PANORAMA_FISHEYE_LENS_POLYNOMIAL)) {
    float fit_xratio = (float)bcam->render_width * bcam->pixelaspect.x;
    float fit_yratio = (float)bcam->render_height * bcam->pixelaspect.y;
    bool horizontal_fit;
@@ -465,6 +478,12 @@ static void blender_camera_sync(Camera *cam,
  cam->set_latitude_min(bcam->latitude_min);
  cam->set_latitude_max(bcam->latitude_max);

+  cam->set_fisheye_polynomial_k0(bcam->fisheye_polynomial_k0);
+  cam->set_fisheye_polynomial_k1(bcam->fisheye_polynomial_k1);
+  cam->set_fisheye_polynomial_k2(bcam->fisheye_polynomial_k2);
+  cam->set_fisheye_polynomial_k3(bcam->fisheye_polynomial_k3);
+  cam->set_fisheye_polynomial_k4(bcam->fisheye_polynomial_k4);
+
  cam->set_longitude_min(bcam->longitude_min);
  cam->set_longitude_max(bcam->longitude_max);

--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -27,6 +27,7 @@ enum ComputeDevice {
  COMPUTE_DEVICE_CUDA = 1,
  COMPUTE_DEVICE_OPTIX = 3,
  COMPUTE_DEVICE_HIP = 4,
+  COMPUTE_DEVICE_METAL = 5,

  COMPUTE_DEVICE_NUM
 };
@@ -85,6 +86,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
      else if (compute_device == COMPUTE_DEVICE_HIP) {
        mask |= DEVICE_MASK_HIP;
      }
+      else if (compute_device == COMPUTE_DEVICE_METAL) {
+        mask |= DEVICE_MASK_METAL;
+      }
      vector<DeviceInfo> devices = Device::available_devices(mask);

      /* Match device preferences and available devices. */
--- a/intern/cycles/blender/geometry.cpp
+++ b/intern/cycles/blender/geometry.cpp
@@ -19,6 +19,7 @@
 #include "scene/hair.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/pointcloud.h"
 #include "scene/volume.h"

 #include "blender/sync.h"
@@ -39,6 +40,10 @@ static Geometry::Type determine_geom_type(BObjectInfo &b_ob_info, bool use_parti
    return Geometry::HAIR;
  }

+  if (b_ob_info.object_data.is_a(&RNA_PointCloud)) {
+    return Geometry::POINTCLOUD;
+  }
+
  if (b_ob_info.object_data.is_a(&RNA_Volume) ||
      (b_ob_info.object_data == b_ob_info.real_object.data() &&
       object_fluid_gas_domain_find(b_ob_info.real_object))) {
@@ -111,6 +116,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
    else if (geom_type == Geometry::VOLUME) {
      geom = scene->create_node<Volume>();
    }
+    else if (geom_type == Geometry::POINTCLOUD) {
+      geom = scene->create_node<PointCloud>();
+    }
    else {
      geom = scene->create_node<Mesh>();
    }
@@ -170,6 +178,10 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
      Volume *volume = static_cast<Volume *>(geom);
      sync_volume(b_ob_info, volume);
    }
+    else if (geom_type == Geometry::POINTCLOUD) {
+      PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+      sync_pointcloud(pointcloud, b_ob_info);
+    }
    else {
      Mesh *mesh = static_cast<Mesh *>(geom);
      sync_mesh(b_depsgraph, b_ob_info, mesh);
@@ -231,6 +243,10 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
             object_fluid_gas_domain_find(b_ob_info.real_object)) {
      /* No volume motion blur support yet. */
    }
+    else if (b_ob_info.object_data.is_a(&RNA_PointCloud)) {
+      PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+      sync_pointcloud_motion(pointcloud, b_ob_info, motion_step);
+    }
    else {
      Mesh *mesh = static_cast<Mesh *>(geom);
      sync_mesh_motion(b_depsgraph, b_ob_info, mesh, motion_step);
--- a/intern/cycles/blender/object.cpp
+++ b/intern/cycles/blender/object.cpp
@@ -72,7 +72,8 @@ bool BlenderSync::object_is_geometry(BObjectInfo &b_ob_info)

  BL::Object::type_enum type = b_ob_info.iter_object.type();

-  if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR) {
+  if (type == BL::Object::type_VOLUME || type == BL::Object::type_HAIR ||
+      type == BL::Object::type_POINTCLOUD) {
    /* Will be exported attached to mesh. */
    return true;
  }
@@ -206,7 +207,7 @@ Object *BlenderSync::sync_object(BL::Depsgraph &b_depsgraph,
    return NULL;
  }

-  /* only interested in object that we can create meshes from */
+  /* Only interested in objects that we can create geometry from. */
  if (!object_is_geometry(b_ob_info)) {
    return NULL;
  }
--- a/intern/cycles/blender/output_driver.cpp
+++ b/intern/cycles/blender/output_driver.cpp
@@ -66,7 +66,7 @@ bool BlenderOutputDriver::read_render_tile(const Tile &tile)

 bool BlenderOutputDriver::update_render_tile(const Tile &tile)
 {
-  /* Use final write for preview renders, otherwise render result wouldn't be be updated
+  /* Use final write for preview renders, otherwise render result wouldn't be updated
   * quickly on Blender side. For all other cases we use the display driver. */
  if (b_engine_.is_preview()) {
    write_render_tile(tile);
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scene/pointcloud.h"
+#include "scene/attribute.h"
+#include "scene/scene.h"
+
+#include "blender/sync.h"
+#include "blender/util.h"
+
+#include "util/foreach.h"
+#include "util/hash.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<typename TypeInCycles, typename GetValueAtIndex>
+static void fill_generic_attribute(BL::PointCloud &b_pointcloud,
+                                   TypeInCycles *data,
+                                   const GetValueAtIndex &get_value_at_index)
+{
+  /* Store get_value_at_index(k) in data[k] for every point of the Blender point cloud. */
+  for (int index = 0, count = b_pointcloud.points.length(); index < count; ++index) {
+    data[index] = get_value_at_index(index);
+  }
+}
+
+static void copy_attributes(PointCloud *pointcloud, BL::PointCloud b_pointcloud)
+{
+  /* Mirror the generic attributes of the Blender point cloud as Cycles attributes holding one
+   * value per point. Data types without a case below are skipped. */
+  AttributeSet &cycles_attributes = pointcloud->attributes;
+  for (BL::Attribute &b_attribute : b_pointcloud.attributes) {
+    const ustring name{b_attribute.name().c_str()};
+
+    /* An attribute with this name already exists on the Cycles side, leave it untouched. */
+    if (cycles_attributes.find(name)) {
+      continue;
+    }
+
+    /* Every generic attribute is created on the vertex element. */
+    const AttributeElement element = ATTR_ELEMENT_VERTEX;
+    switch (b_attribute.data_type()) {
+      case BL::Attribute::data_type_FLOAT: {
+        BL::FloatAttribute b_float_attribute{b_attribute};
+        float *data = cycles_attributes.add(name, TypeFloat, element)->data_float();
+        fill_generic_attribute(
+            b_pointcloud, data, [&](int i) { return b_float_attribute.data[i].value(); });
+        break;
+      }
+      case BL::Attribute::data_type_BOOLEAN: {
+        /* Booleans are stored as float values. */
+        BL::BoolAttribute b_bool_attribute{b_attribute};
+        float *data = cycles_attributes.add(name, TypeFloat, element)->data_float();
+        fill_generic_attribute(
+            b_pointcloud, data, [&](int i) { return (float)b_bool_attribute.data[i].value(); });
+        break;
+      }
+      case BL::Attribute::data_type_INT: {
+        /* Integers are stored as float values. */
+        BL::IntAttribute b_int_attribute{b_attribute};
+        float *data = cycles_attributes.add(name, TypeFloat, element)->data_float();
+        fill_generic_attribute(
+            b_pointcloud, data, [&](int i) { return (float)b_int_attribute.data[i].value(); });
+        break;
+      }
+      case BL::Attribute::data_type_FLOAT_VECTOR: {
+        BL::FloatVectorAttribute b_vector_attribute{b_attribute};
+        float3 *data = cycles_attributes.add(name, TypeVector, element)->data_float3();
+        fill_generic_attribute(b_pointcloud, data, [&](int i) {
+          BL::Array<float, 3> xyz = b_vector_attribute.data[i].vector();
+          return make_float3(xyz[0], xyz[1], xyz[2]);
+        });
+        break;
+      }
+      case BL::Attribute::data_type_FLOAT_COLOR: {
+        /* All four color components are kept. */
+        BL::FloatColorAttribute b_color_attribute{b_attribute};
+        float4 *data = cycles_attributes.add(name, TypeRGBA, element)->data_float4();
+        fill_generic_attribute(b_pointcloud, data, [&](int i) {
+          BL::Array<float, 4> rgba = b_color_attribute.data[i].color();
+          return make_float4(rgba[0], rgba[1], rgba[2], rgba[3]);
+        });
+        break;
+      }
+      case BL::Attribute::data_type_FLOAT2: {
+        BL::Float2Attribute b_float2_attribute{b_attribute};
+        float2 *data = cycles_attributes.add(name, TypeFloat2, element)->data_float2();
+        fill_generic_attribute(b_pointcloud, data, [&](int i) {
+          BL::Array<float, 2> xy = b_float2_attribute.data[i].vector();
+          return make_float2(xy[0], xy[1]);
+        });
+        break;
+      }
+      default:
+        /* Any other data type is not supported, no attribute is created for it. */
+        break;
+    }
+  }
+}
+
+static void export_pointcloud(Scene *scene, PointCloud *pointcloud, BL::PointCloud b_pointcloud)
+{
+  /* TODO: optimize so we can straight memcpy arrays from Blender? */
+
+  /* The random attribute is only created when the scene requests it for this geometry. */
+  const bool use_random = pointcloud->need_attribute(scene, ATTR_STD_POINT_RANDOM);
+  Attribute *attr_random = use_random ? pointcloud->attributes.add(ATTR_STD_POINT_RANDOM) : NULL;
+
+  /* Make room for every Blender point up front. */
+  const int num_points = b_pointcloud.points.length();
+  pointcloud->reserve(num_points);
+
+  /* Copy position and radius of each point, in Blender's iteration order. */
+  BL::PointCloud::points_iterator b_point_iter;
+  b_pointcloud.points.begin(b_point_iter);
+  while (b_point_iter != b_pointcloud.points.end()) {
+    BL::Point b_point = *b_point_iter;
+    const float3 co = get_float3(b_point.co());
+    const float radius = b_point.radius();
+    pointcloud->add_point(co, radius);
+
+    /* One random value per point, hashed from the Blender point index. */
+    if (attr_random) {
+      attr_random->add(hash_uint2_to_float(b_point.index(), 0));
+    }
+
+    ++b_point_iter;
+  }
+
+  /* Generic attributes are copied last, once all points have been added. */
+  copy_attributes(pointcloud, b_pointcloud);
+}
+
+static void export_pointcloud_motion(PointCloud *pointcloud,
+                                     BL::PointCloud b_pointcloud,
+                                     int motion_step)
+{
+  /* Find or add the motion position attribute; it holds the positions of all motion steps. */
+  Attribute *attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  bool new_attribute = false;
+
+  if (!attr_mP) {
+    attr_mP = pointcloud->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+    new_attribute = true;
+  }
+
+  /* Export motion points into the part of the attribute that belongs to `motion_step`. */
+  const int num_points = pointcloud->num_points();
+  float3 *mP = attr_mP->data_float3() + motion_step * num_points;
+  bool have_motion = false;
+  int num_motion_points = 0;
+  const array<float3> &pointcloud_points = pointcloud->get_points();
+
+  BL::PointCloud::points_iterator b_point_iter;
+  for (b_pointcloud.points.begin(b_point_iter); b_point_iter != b_pointcloud.points.end();
+       ++b_point_iter) {
+    BL::Point b_point = *b_point_iter;
+    /* Blender points beyond the Cycles point count are not exported. */
+    if (num_motion_points < num_points) {
+      float3 P = get_float3(b_point.co());
+      P.w = b_point.radius(); /* Radius goes into the fourth component. */
+      mP[num_motion_points] = P;
+      have_motion = have_motion || (P != pointcloud_points[num_motion_points]);
+      num_motion_points++;
+    }
+  }
+
+  /* For a new attribute, keep it only if all points were exported and any of them moved. */
+  if (new_attribute) {
+    if (num_motion_points != num_points || !have_motion) {
+      pointcloud->attributes.remove(ATTR_STD_MOTION_VERTEX_POSITION);
+    }
+    else if (motion_step > 0) {
+      /* Motion, fill up previous steps that we might have skipped because
+       * they had no motion, but we need them anyway now. */
+      for (int step = 0; step < motion_step; step++) {
+        pointcloud->copy_center_to_motion_step(step);
+      }
+    }
+  }
+
+  /* Export generic attributes that do not exist on the Cycles point cloud yet. */
+  copy_attributes(pointcloud, b_pointcloud);
+}
+
+/* Export the Blender point cloud of `b_ob_info` into `pointcloud` (must not be null). */
+void BlenderSync::sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info)
+{
+  const size_t old_numpoints = pointcloud->num_points();
+
+  array<Node *> used_shaders = pointcloud->get_used_shaders();
+  PointCloud new_pointcloud;
+  new_pointcloud.set_used_shaders(used_shaders);
+
+  /* TODO: add option to filter out points in the view layer. */
+  BL::PointCloud b_pointcloud(b_ob_info.object_data);
+  export_pointcloud(scene, &new_pointcloud, b_pointcloud);
+
+  /* Update the original sockets from the freshly exported temporary point cloud. */
+  for (const SocketType &socket : new_pointcloud.type->inputs) {
+    /* Those sockets are updated in sync_object, so do not modify them. */
+    if (socket.name == "use_motion_blur" || socket.name == "motion_steps" ||
+        socket.name == "used_shaders") {
+      continue;
+    }
+    pointcloud->set_value(socket, new_pointcloud, socket);
+  }
+
+  pointcloud->attributes.clear();
+  for (Attribute &attr : new_pointcloud.attributes.attributes) {
+    pointcloud->attributes.attributes.push_back(std::move(attr));
+  }
+
+  /* Tag update, requesting a rebuild only when the number of points changed. */
+  const bool rebuild = (old_numpoints != pointcloud->num_points());
+  pointcloud->tag_update(scene, rebuild);
+}
+
+/* Export the point positions of one motion step, or copy the center positions into it. */
+void BlenderSync::sync_pointcloud_motion(PointCloud *pointcloud,
+                                         BObjectInfo &b_ob_info,
+                                         int motion_step)
+{
+  /* Nothing was exported for this point cloud, so there is nothing to do. */
+  if (pointcloud->num_points() == 0) {
+    return;
+  }
+
+  if (!ccl::BKE_object_is_deform_modified(b_ob_info, b_scene, preview)) {
+    /* No deformation on this frame, copy coordinates if other frames did have it. */
+    pointcloud->copy_center_to_motion_step(motion_step);
+    return;
+  }
+
+  /* Deformation on this frame: export the deformed coordinates of the point cloud object. */
+  BL::PointCloud b_pointcloud(b_ob_info.object_data);
+  export_pointcloud_motion(pointcloud, b_pointcloud, motion_step);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -906,16 +906,18 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
   vector<DeviceType> device_types = Device::available_types();
-  bool has_cuda = false, has_optix = false, has_hip = false;
+  bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false;
   foreach (DeviceType device_type, device_types) {
     has_cuda |= (device_type == DEVICE_CUDA);
     has_optix |= (device_type == DEVICE_OPTIX);
     has_hip |= (device_type == DEVICE_HIP);
+    has_metal |= (device_type == DEVICE_METAL);
   }
-  PyObject *list = PyTuple_New(3);
+  PyObject *list = PyTuple_New(4); /* Tuple order: CUDA, OptiX, HIP, Metal. */
   PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
   PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
   PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
+  PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
   return list;
 }

@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
  else if (override == "HIP") {
    BlenderSession::device_override = DEVICE_MASK_HIP;
  }
+  else if (override == "METAL") {
+    BlenderSession::device_override = DEVICE_MASK_METAL;
+  }
  else {
    printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
    Py_RETURN_FALSE;
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -396,6 +396,13 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
    /* set the current view */
    b_engine.active_view_set(b_rview_name.c_str());

+    /* Force update in this case, since the camera transform on each frame changes
+     * in different views. This could be optimized by somehow storing the animated
+     * camera transforms separate from the fixed stereo transform. */
+    if ((scene->need_motion() != Scene::MOTION_NONE) && view_index > 0) {
+      sync->tag_update();
+    }
+
    /* update scene */
    BL::Object b_camera_override(b_engine.camera_override());
    sync->sync_camera(b_render, b_camera_override, width, height, b_rview_name.c_str());
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -378,10 +378,19 @@ static ShaderNode *add_node(Scene *scene,
  }
  else if (b_node.is_a(&RNA_ShaderNodeMapRange)) {
    BL::ShaderNodeMapRange b_map_range_node(b_node);
-    MapRangeNode *map_range_node = graph->create_node<MapRangeNode>();
-    map_range_node->set_clamp(b_map_range_node.clamp());
-    map_range_node->set_range_type((NodeMapRangeType)b_map_range_node.interpolation_type());
-    node = map_range_node;
+    if (b_map_range_node.data_type() == BL::ShaderNodeMapRange::data_type_FLOAT_VECTOR) {
+      VectorMapRangeNode *vector_map_range_node = graph->create_node<VectorMapRangeNode>();
+      vector_map_range_node->set_use_clamp(b_map_range_node.clamp());
+      vector_map_range_node->set_range_type(
+          (NodeMapRangeType)b_map_range_node.interpolation_type());
+      node = vector_map_range_node;
+    }
+    else {
+      MapRangeNode *map_range_node = graph->create_node<MapRangeNode>();
+      map_range_node->set_clamp(b_map_range_node.clamp());
+      map_range_node->set_range_type((NodeMapRangeType)b_map_range_node.interpolation_type());
+      node = map_range_node;
+    }
  }
  else if (b_node.is_a(&RNA_ShaderNodeClamp)) {
    BL::ShaderNodeClamp b_clamp_node(b_node);
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -95,6 +95,11 @@ void BlenderSync::reset(BL::BlendData &b_data, BL::Scene &b_scene)
  this->b_scene = b_scene;
 }

+void BlenderSync::tag_update()
+{
+  has_updates_ = true; /* Force an update, see the stereo case in BlenderSession::render(). */
+}
+
 /* Sync */

 void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d)
--- a/intern/cycles/blender/sync.h
+++ b/intern/cycles/blender/sync.h
@@ -66,6 +66,8 @@ class BlenderSync {

  void reset(BL::BlendData &b_data, BL::Scene &b_scene);

+  void tag_update();
+
  /* sync */
  void sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d);
  void sync_data(BL::RenderSettings &b_render,
@@ -167,12 +169,16 @@ class BlenderSync {
      Hair *hair, BL::Mesh &b_mesh, BObjectInfo &b_ob_info, bool motion, int motion_step = 0);
  bool object_has_particle_hair(BL::Object b_ob);

+  /* Point Cloud */
+  void sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info);
+  void sync_pointcloud_motion(PointCloud *pointcloud, BObjectInfo &b_ob_info, int motion_step = 0);
+
  /* Camera */
  void sync_camera_motion(
      BL::RenderSettings &b_render, BL::Object &b_ob, int width, int height, float motion_time);

  /* Geometry */
-  Geometry *sync_geometry(BL::Depsgraph &b_depsgrpah,
+  Geometry *sync_geometry(BL::Depsgraph &b_depsgraph,
                          BObjectInfo &b_ob_info,
                          bool object_updated,
                          bool use_particle_hair,
@@ -267,7 +273,6 @@ class BlenderSync {

  Progress &progress;

- protected:
  /* Indicates that `sync_recalc()` detected changes in the scene.
   * If this flag is false then the data is considered to be up-to-date and will not be
   * synchronized at all. */
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -33,6 +33,17 @@ set(SRC
  unaligned.cpp
 )

+set(SRC_METAL
+  metal.mm
+)
+
+if(WITH_CYCLES_DEVICE_METAL)  # Metal sources are only built when the Metal device is enabled.
+  list(APPEND SRC
+    ${SRC_METAL}
+  )
+  add_definitions(-DWITH_METAL)  # Enables the BVH_LAYOUT_METAL case in bvh.cpp.
+endif()
+
 set(SRC_HEADERS
  bvh.h
  bvh2.h
@@ -46,6 +57,7 @@ set(SRC_HEADERS
  sort.h
  split.h
  unaligned.h
+  metal.h
 )

 set(LIB
--- a/intern/cycles/bvh/build.cpp
+++ b/intern/cycles/bvh/build.cpp
@@ -26,6 +26,7 @@
 #include "scene/hair.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/pointcloud.h"
 #include "scene/scene.h"

 #include "util/algorithm.h"
@@ -113,9 +114,9 @@ void BVHBuild::add_reference_triangles(BoundBox &root,
    else {
      /* Motion triangles, trace optimized case:  we split triangle
       * primitives into separate nodes for each of the time steps.
-       * This way we minimize overlap of neighbor curve primitives.
+       * This way we minimize overlap of neighbor triangle primitives.
       */
-      const int num_bvh_steps = params.num_motion_curve_steps * 2 + 1;
+      const int num_bvh_steps = params.num_motion_triangle_steps * 2 + 1;
      const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
      const size_t num_verts = mesh->verts.size();
      const size_t num_steps = mesh->motion_steps;
@@ -269,6 +270,101 @@ void BVHBuild::add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair
  }
 }

+/* Add BVH references for all points of `pointcloud`: one per point, or one per point and BVH
+ * time step in the trace optimized motion case. `i` is the object index stored in each
+ * reference; `root` grows by the reference bounds and `center` by their centers. */
+void BVHBuild::add_reference_points(BoundBox &root,
+                                    BoundBox &center,
+                                    PointCloud *pointcloud,
+                                    int i)
+{
+  const size_t num_points = pointcloud->num_points();
+  if (num_points == 0) {
+    /* Nothing to add; also avoids addressing element 0 of the empty arrays below. */
+    return;
+  }
+
+  const Attribute *point_attr_mP = NULL;
+  if (pointcloud->has_motion_blur()) {
+    point_attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  }
+
+  const float3 *points_data = &pointcloud->points[0];
+  const float *radius_data = &pointcloud->radius[0];
+  const float3 *motion_data = (point_attr_mP) ? point_attr_mP->data_float3() : NULL;
+  const size_t num_steps = pointcloud->get_motion_steps();
+
+  if (point_attr_mP == NULL) {
+    /* Really simple logic for static points. */
+    for (uint j = 0; j < num_points; j++) {
+      const PointCloud::Point point = pointcloud->get_point(j);
+      BoundBox bounds = BoundBox::empty;
+      point.bounds_grow(points_data, radius_data, bounds);
+      if (bounds.valid()) {
+        references.push_back(BVHReference(bounds, j, i, PRIMITIVE_POINT));
+        root.grow(bounds);
+        center.grow(bounds.center2());
+      }
+    }
+  }
+  else if (params.num_motion_point_steps == 0 || params.use_spatial_split) {
+    /* Simple case of motion points: single node for the whole shutter time.
+     * Lowest memory usage but less optimal rendering. */
+    /* TODO(sergey): Support motion steps for spatially split BVH. */
+    for (uint j = 0; j < num_points; j++) {
+      const PointCloud::Point point = pointcloud->get_point(j);
+      BoundBox bounds = BoundBox::empty;
+      point.bounds_grow(points_data, radius_data, bounds);
+      /* `step + 1 < num_steps` rather than `step < num_steps - 1`: no unsigned wrap-around
+       * if the motion step count would ever be zero. */
+      for (size_t step = 0; step + 1 < num_steps; step++) {
+        point.bounds_grow(motion_data + step * num_points, radius_data, bounds);
+      }
+      if (bounds.valid()) {
+        references.push_back(BVHReference(bounds, j, i, PRIMITIVE_MOTION_POINT));
+        root.grow(bounds);
+        center.grow(bounds.center2());
+      }
+    }
+  }
+  else {
+    /* Motion points, trace optimized case: we split point primitives into separate nodes for
+     * each of the time steps. This way we minimize overlap of neighbor point primitives. */
+    const int num_bvh_steps = params.num_motion_point_steps * 2 + 1;
+    const float num_bvh_steps_inv_1 = 1.0f / (num_bvh_steps - 1);
+
+    for (uint j = 0; j < num_points; j++) {
+      const PointCloud::Point point = pointcloud->get_point(j);
+
+      /* Calculate bounding box of the previous time step. Will be reused later to avoid
+       * duplicated work on calculating BVH time step boundbox. */
+      float4 prev_key = point.motion_key(
+          points_data, radius_data, motion_data, num_points, num_steps, 0.0f, j);
+      BoundBox prev_bounds = BoundBox::empty;
+      point.bounds_grow(prev_key, prev_bounds);
+      /* Create all primitive time steps. */
+      for (int bvh_step = 1; bvh_step < num_bvh_steps; ++bvh_step) {
+        const float curr_time = (float)(bvh_step)*num_bvh_steps_inv_1;
+        float4 curr_key = point.motion_key(
+            points_data, radius_data, motion_data, num_points, num_steps, curr_time, j);
+        BoundBox curr_bounds = BoundBox::empty;
+        point.bounds_grow(curr_key, curr_bounds);
+        BoundBox bounds = prev_bounds;
+        bounds.grow(curr_bounds);
+        if (bounds.valid()) {
+          const float prev_time = (float)(bvh_step - 1) * num_bvh_steps_inv_1;
+          references.push_back(
+              BVHReference(bounds, j, i, PRIMITIVE_MOTION_POINT, prev_time, curr_time));
+          root.grow(bounds);
+          center.grow(bounds.center2());
+        }
+        /* Current time boundbox becomes previous one for the next time step. */
+        prev_bounds = curr_bounds;
+      }
+    }
+  }
+}
+
 void BVHBuild::add_reference_geometry(BoundBox &root,
                                      BoundBox &center,
                                      Geometry *geom,
@@ -282,6 +378,10 @@ void BVHBuild::add_reference_geometry(BoundBox &root,
    Hair *hair = static_cast<Hair *>(geom);
    add_reference_curves(root, center, hair, object_index);
  }
+  else if (geom->geometry_type == Geometry::POINTCLOUD) {
+    PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+    add_reference_points(root, center, pointcloud, object_index);
+  }
 }

 void BVHBuild::add_reference_object(BoundBox &root, BoundBox &center, Object *ob, int i)
@@ -311,6 +411,10 @@ static size_t count_primitives(Geometry *geom)
    Hair *hair = static_cast<Hair *>(geom);
    return count_curve_segments(hair);
  }
+  else if (geom->geometry_type == Geometry::POINTCLOUD) {
+    PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+    return pointcloud->num_points();
+  }

  return 0;
 }
@@ -328,8 +432,9 @@ void BVHBuild::add_references(BVHRange &root)
      if (!ob->get_geometry()->is_instanced()) {
        num_alloc_references += count_primitives(ob->get_geometry());
      }
-      else
+      else {
        num_alloc_references++;
+      }
    }
    else {
      num_alloc_references += count_primitives(ob->get_geometry());
@@ -394,7 +499,7 @@ BVHNode *BVHBuild::run()
  spatial_min_overlap = root.bounds().safe_area() * params.spatial_split_alpha;
  spatial_free_index = 0;

-  need_prim_time = params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0;
+  need_prim_time = params.use_motion_steps();

  /* init progress updates */
  double build_start_time;
@@ -535,7 +640,8 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range,
                                          const vector<BVHReference> &references) const
 {
  size_t size = range.size();
-  size_t max_leaf_size = max(params.max_triangle_leaf_size, params.max_curve_leaf_size);
+  size_t max_leaf_size = max(max(params.max_triangle_leaf_size, params.max_curve_leaf_size),
+                             params.max_point_leaf_size);

  if (size > max_leaf_size)
    return false;
@@ -544,32 +650,44 @@ bool BVHBuild::range_within_max_leaf_size(const BVHRange &range,
  size_t num_motion_triangles = 0;
  size_t num_curves = 0;
  size_t num_motion_curves = 0;
+  size_t num_points = 0;
+  size_t num_motion_points = 0;

  for (int i = 0; i < size; i++) {
    const BVHReference &ref = references[range.start() + i];

-    if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
-      if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+    if (ref.prim_type() & PRIMITIVE_CURVE) {
+      if (ref.prim_type() & PRIMITIVE_MOTION) {
        num_motion_curves++;
      }
      else {
        num_curves++;
      }
    }
-    else if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
-      if (ref.prim_type() & PRIMITIVE_ALL_MOTION) {
+    else if (ref.prim_type() & PRIMITIVE_TRIANGLE) {
+      if (ref.prim_type() & PRIMITIVE_MOTION) {
        num_motion_triangles++;
      }
      else {
        num_triangles++;
      }
    }
+    else if (ref.prim_type() & PRIMITIVE_POINT) {
+      if (ref.prim_type() & PRIMITIVE_MOTION) {
+        num_motion_points++;
+      }
+      else {
+        num_points++;
+      }
+    }
  }

  return (num_triangles <= params.max_triangle_leaf_size) &&
         (num_motion_triangles <= params.max_motion_triangle_leaf_size) &&
         (num_curves <= params.max_curve_leaf_size) &&
-         (num_motion_curves <= params.max_motion_curve_leaf_size);
+         (num_motion_curves <= params.max_motion_curve_leaf_size) &&
+         (num_points <= params.max_point_leaf_size) &&
+         (num_motion_points <= params.max_motion_point_leaf_size);
 }

 /* multithreaded binning builder */
@@ -855,7 +973,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
  for (int i = 0; i < range.size(); i++) {
    const BVHReference &ref = references[range.start() + i];
    if (ref.prim_index() != -1) {
-      uint32_t type_index = bitscan((uint32_t)(ref.prim_type() & PRIMITIVE_ALL));
+      uint32_t type_index = PRIMITIVE_INDEX(ref.prim_type() & PRIMITIVE_ALL);
      p_ref[type_index].push_back(ref);
      p_type[type_index].push_back(ref.prim_type());
      p_index[type_index].push_back(ref.prim_index());
--- a/intern/cycles/bvh/build.h
+++ b/intern/cycles/bvh/build.h
@@ -39,6 +39,7 @@ class Geometry;
 class Hair;
 class Mesh;
 class Object;
+class PointCloud;
 class Progress;

 /* BVH Builder */
@@ -68,6 +69,7 @@ class BVHBuild {
  /* Adding references. */
  void add_reference_triangles(BoundBox &root, BoundBox &center, Mesh *mesh, int i);
  void add_reference_curves(BoundBox &root, BoundBox &center, Hair *hair, int i);
+  void add_reference_points(BoundBox &root, BoundBox &center, PointCloud *pointcloud, int i);
  void add_reference_geometry(BoundBox &root, BoundBox &center, Geometry *geom, int i);
  void add_reference_object(BoundBox &root, BoundBox &center, Object *ob, int i);
  void add_references(BVHRange &root);
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -19,6 +19,7 @@

 #include "bvh/bvh2.h"
 #include "bvh/embree.h"
+#include "bvh/metal.h"
 #include "bvh/multi.h"
 #include "bvh/optix.h"

@@ -43,6 +44,7 @@ const char *bvh_layout_name(BVHLayout layout)
    case BVH_LAYOUT_METAL:
      return "METAL";
    case BVH_LAYOUT_MULTI_OPTIX:
+    case BVH_LAYOUT_MULTI_METAL:
    case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
    case BVH_LAYOUT_MULTI_METAL_EMBREE:
      return "MULTI";
@@ -105,13 +107,19 @@ BVH *BVH::create(const BVHParams &params,
 #else
      (void)device;
      break;
+#endif
+    case BVH_LAYOUT_METAL:
+#ifdef WITH_METAL
+      return bvh_metal_create(params, geometry, objects, device);
+#else
+      (void)device;
+      break;
 #endif
    case BVH_LAYOUT_MULTI_OPTIX:
+    case BVH_LAYOUT_MULTI_METAL:
    case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
    case BVH_LAYOUT_MULTI_METAL_EMBREE:
      return new BVHMulti(params, geometry, objects);
-    case BVH_LAYOUT_METAL:
-      /* host-side changes for BVH_LAYOUT_METAL are imminent */
    case BVH_LAYOUT_NONE:
    case BVH_LAYOUT_ALL:
      break;
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -20,6 +20,7 @@
 #include "scene/hair.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/pointcloud.h"

 #include "bvh/build.h"
 #include "bvh/node.h"
@@ -386,7 +387,7 @@ void BVH2::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility
    }
    else {
      /* Primitives. */
-      if (pack.prim_type[prim] & PRIMITIVE_ALL_CURVE) {
+      if (pack.prim_type[prim] & PRIMITIVE_CURVE) {
        /* Curves. */
        const Hair *hair = static_cast<const Hair *>(ob->get_geometry());
        int prim_offset = (params.top_level) ? hair->prim_offset : 0;
@@ -409,6 +410,30 @@ void BVH2::refit_primitives(int start, int end, BoundBox &bbox, uint &visibility
          }
        }
      }
+      else if (pack.prim_type[prim] & PRIMITIVE_POINT) {
+        /* Points. */
+        const PointCloud *pointcloud = static_cast<const PointCloud *>(ob->get_geometry());
+        int prim_offset = (params.top_level) ? pointcloud->prim_offset : 0;
+        const float3 *points = &pointcloud->points[0];
+        const float *radius = &pointcloud->radius[0];
+        PointCloud::Point point = pointcloud->get_point(pidx - prim_offset);
+
+        point.bounds_grow(points, radius, bbox);
+
+        /* Motion points. */
+        if (pointcloud->get_use_motion_blur()) {
+          Attribute *attr = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+          if (attr) {
+            size_t pointcloud_size = pointcloud->points.size();
+            size_t steps = pointcloud->get_motion_steps() - 1;
+            float3 *point_steps = attr->data_float3();
+
+            for (size_t i = 0; i < steps; i++)
+              point.bounds_grow(point_steps + i * pointcloud_size, radius, bbox);
+          }
+        }
+      }
      else {
        /* Triangles. */
        const Mesh *mesh = static_cast<const Mesh *>(ob->get_geometry());
@@ -505,7 +530,8 @@ void BVH2::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
  pack.leaf_nodes.resize(leaf_nodes_size);
  pack.object_node.resize(objects.size());

-  if (params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) {
+  if (params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0 ||
+      params.num_motion_point_steps > 0) {
    pack.prim_time.resize(prim_index_size);
  }

@@ -564,13 +590,7 @@ void BVH2::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
      float2 *bvh_prim_time = bvh->pack.prim_time.size() ? &bvh->pack.prim_time[0] : NULL;

      for (size_t i = 0; i < bvh_prim_index_size; i++) {
-        if (bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
-        }
-        else {
-          pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
-        }
-
+        pack_prim_index[pack_prim_index_offset] = bvh_prim_index[i] + geom_prim_offset;
        pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
        pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
        pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -45,6 +45,7 @@
 #  include "scene/hair.h"
 #  include "scene/mesh.h"
 #  include "scene/object.h"
+#  include "scene/pointcloud.h"

 #  include "util/foreach.h"
 #  include "util/log.h"
@@ -90,7 +91,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
      ++ctx->num_hits;

      /* Always use baked shadow transparency for curves. */
-      if (current_isect.type & PRIMITIVE_ALL_CURVE) {
+      if (current_isect.type & PRIMITIVE_CURVE) {
        ctx->throughput *= intersection_curve_shadow_transparency(
            kg, current_isect.object, current_isect.prim, current_isect.u);

@@ -245,7 +246,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
  }
 }

-static void rtc_filter_func_thick_curve(const RTCFilterFunctionNArguments *args)
+static void rtc_filter_func_backface_cull(const RTCFilterFunctionNArguments *args)
 {
  const RTCRay *ray = (RTCRay *)args->ray;
  RTCHit *hit = (RTCHit *)args->hit;
@@ -258,7 +259,7 @@ static void rtc_filter_func_thick_curve(const RTCFilterFunctionNArguments *args)
  }
 }

-static void rtc_filter_occluded_func_thick_curve(const RTCFilterFunctionNArguments *args)
+static void rtc_filter_occluded_func_backface_cull(const RTCFilterFunctionNArguments *args)
 {
  const RTCRay *ray = (RTCRay *)args->ray;
  RTCHit *hit = (RTCHit *)args->hit;
@@ -410,6 +411,12 @@ void BVHEmbree::add_object(Object *ob, int i)
      add_curves(ob, hair, i);
    }
  }
+  else if (geom->geometry_type == Geometry::POINTCLOUD) {
+    PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+    if (pointcloud->num_points() > 0) {
+      add_points(ob, pointcloud, i);
+    }
+  }
 }

 void BVHEmbree::add_instance(Object *ob, int i)
@@ -624,6 +631,89 @@ void BVHEmbree::set_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair, c
  }
 }

+void BVHEmbree::set_point_vertex_buffer(RTCGeometry geom_id,
+                                        const PointCloud *pointcloud,
+                                        const bool update)
+{
+  /* Motion steps only apply when the motion position attribute is present. */
+  const Attribute *motion_attr = NULL;
+  if (pointcloud->has_motion_blur()) {
+    motion_attr = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+  }
+  const size_t num_motion_steps = (motion_attr) ? pointcloud->get_motion_steps() : 1;
+  const size_t num_points = pointcloud->num_points();
+  const float *radii = pointcloud->get_radius().data();
+
+  /* Fill one Embree vertex buffer per time step as (x, y, z, radius). The center
+   * step uses the regular positions; the motion attribute is indexed without it. */
+  const int t_mid = (num_motion_steps - 1) / 2;
+  for (int t = 0; t < num_motion_steps; ++t) {
+    const float3 *verts;
+    if (t != t_mid && motion_attr != NULL) {
+      const int attr_step = (t > t_mid) ? (t - 1) : t;
+      verts = &motion_attr->data_float3()[attr_step * num_points];
+    }
+    else {
+      verts = pointcloud->get_points().data();
+    }
+
+    /* An update fetches the existing buffer, otherwise a new one is created. */
+    float4 *rtc_verts;
+    if (update) {
+      rtc_verts = (float4 *)rtcGetGeometryBufferData(geom_id, RTC_BUFFER_TYPE_VERTEX, t);
+    }
+    else {
+      rtc_verts = (float4 *)rtcSetNewGeometryBuffer(
+          geom_id, RTC_BUFFER_TYPE_VERTEX, t, RTC_FORMAT_FLOAT4, sizeof(float) * 4, num_points);
+    }
+
+    assert(rtc_verts);
+    if (rtc_verts) {
+      for (size_t j = 0; j < num_points; ++j) {
+        float4 packed = float3_to_float4(verts[j]);
+        packed.w = radii[j];
+        rtc_verts[j] = packed;
+      }
+    }
+
+    if (update) {
+      rtcUpdateGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t);
+    }
+  }
+}
+
+void BVHEmbree::add_points(const Object *ob, const PointCloud *pointcloud, int i)
+{
+  /* Use multiple time steps only when the motion position attribute exists. */
+  size_t num_motion_steps = 1;
+  if (pointcloud->has_motion_blur() &&
+      pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION) != NULL) {
+    num_motion_steps = pointcloud->get_motion_steps();
+  }
+
+  /* Each point cloud becomes one Embree sphere-point geometry. */
+  RTCGeometry geom_id = rtcNewGeometry(rtc_device, RTC_GEOMETRY_TYPE_SPHERE_POINT);
+
+  rtcSetGeometryBuildQuality(geom_id, build_quality);
+  rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);
+
+  /* Initial build, so the vertex buffers are created rather than updated. */
+  set_point_vertex_buffer(geom_id, pointcloud, false);
+
+  /* The primitive offset is stored directly in the user data pointer value. */
+  const size_t prim_offset = pointcloud->prim_offset;
+  rtcSetGeometryUserData(geom_id, (void *)prim_offset);
+
+  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_backface_cull);
+  rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_backface_cull);
+  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());
+
+  /* Commit, attach under ID `i * 2`, then release the local handle. */
+  rtcCommitGeometry(geom_id);
+  rtcAttachGeometryByID(scene, geom_id, i * 2);
+  rtcReleaseGeometry(geom_id);
+}
+
 void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
 {
  size_t prim_offset = hair->curve_segment_offset;
@@ -678,8 +768,8 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)
    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
  }
  else {
-    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_thick_curve);
-    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_thick_curve);
+    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_backface_cull);
+    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_backface_cull);
  }
  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());

@@ -716,6 +806,14 @@ void BVHEmbree::refit(Progress &progress)
          rtcCommitGeometry(geom);
        }
      }
+      else if (geom->geometry_type == Geometry::POINTCLOUD) {
+        PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+        if (pointcloud->num_points() > 0) {
+          RTCGeometry geom = rtcGetGeometry(scene, geom_id);
+          set_point_vertex_buffer(geom, pointcloud, true);
+          rtcCommitGeometry(geom);
+        }
+      }
    }
    geom_id += 2;
  }
--- a/intern/cycles/bvh/embree.h
+++ b/intern/cycles/bvh/embree.h
@@ -33,6 +33,7 @@ CCL_NAMESPACE_BEGIN

 class Hair;
 class Mesh;
+class PointCloud;

 class BVHEmbree : public BVH {
 public:
@@ -51,11 +52,15 @@ class BVHEmbree : public BVH {
  void add_object(Object *ob, int i);
  void add_instance(Object *ob, int i);
  void add_curves(const Object *ob, const Hair *hair, int i);
+  void add_points(const Object *ob, const PointCloud *pointcloud, int i);
  void add_triangles(const Object *ob, const Mesh *mesh, int i);

 private:
  void set_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh, const bool update);
  void set_curve_vertex_buffer(RTCGeometry geom_id, const Hair *hair, const bool update);
+  void set_point_vertex_buffer(RTCGeometry geom_id,
+                               const PointCloud *pointcloud,
+                               const bool update);

  RTCDevice rtc_device;
  enum RTCBuildQuality build_quality;
--- a/intern/cycles/bvh/metal.h
+++ b/intern/cycles/bvh/metal.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BVH_METAL_H__
+#define __BVH_METAL_H__
+
+#ifdef WITH_METAL
+
+#  include "bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+                      const vector<Geometry *> &geometry,
+                      const vector<Object *> &objects,
+                      Device *device);
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
+
+#endif /* __BVH_METAL_H__ */
--- a/intern/cycles/bvh/metal.mm
+++ b/intern/cycles/bvh/metal.mm
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+BVH *bvh_metal_create(const BVHParams &params,
+                      const vector<Geometry *> &geometry,
+                      const vector<Object *> &objects,
+                      Device *device)
+{
+  return new BVHMetal(params, geometry, objects, device);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/bvh/params.h
+++ b/intern/cycles/bvh/params.h
@@ -83,6 +83,8 @@ class BVHParams {
  int max_motion_triangle_leaf_size;
  int max_curve_leaf_size;
  int max_motion_curve_leaf_size;
+  int max_point_leaf_size;
+  int max_motion_point_leaf_size;

  /* object or mesh level bvh */
  bool top_level;
@@ -98,13 +100,13 @@ class BVHParams {
  /* Split time range to this number of steps and create leaf node for each
   * of this time steps.
   *
-   * Speeds up rendering of motion curve primitives in the cost of higher
-   * memory usage.
+   * Speeds up rendering of motion primitives at the cost of higher memory usage.
   */
-  int num_motion_curve_steps;

-  /* Same as above, but for triangle primitives. */
+  /* Time step counts described above, one per primitive type. */
  int num_motion_triangle_steps;
+  int num_motion_curve_steps;
+  int num_motion_point_steps;

  /* Same as in SceneParams. */
  int bvh_type;
@@ -132,6 +134,8 @@ class BVHParams {
    max_motion_triangle_leaf_size = 8;
    max_curve_leaf_size = 1;
    max_motion_curve_leaf_size = 4;
+    max_point_leaf_size = 8;
+    max_motion_point_leaf_size = 8;

    top_level = false;
    bvh_layout = BVH_LAYOUT_BVH2;
@@ -139,6 +143,7 @@ class BVHParams {

    num_motion_curve_steps = 0;
    num_motion_triangle_steps = 0;
+    num_motion_point_steps = 0;

    bvh_type = 0;

@@ -166,6 +171,13 @@ class BVHParams {
    return (size <= min_leaf_size || level >= MAX_DEPTH);
  }

+  /* True when a motion step count above zero is set for any primitive type. */
+  bool use_motion_steps() const
+  {
+    return num_motion_curve_steps > 0 || num_motion_triangle_steps > 0 ||
+           num_motion_point_steps > 0;
+  }
+
  /* Gets best matching BVH.
   *
   * If the requested layout is supported by the device, it will be used.
--- a/intern/cycles/bvh/split.cpp
+++ b/intern/cycles/bvh/split.cpp
@@ -23,6 +23,7 @@
 #include "scene/hair.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/pointcloud.h"

 #include "util/algorithm.h"

@@ -426,6 +427,32 @@ void BVHSpatialSplit::split_curve_primitive(const Hair *hair,
  }
 }

+void BVHSpatialSplit::split_point_primitive(const PointCloud *pointcloud,
+                                            const Transform *tfm,
+                                            int prim_index,
+                                            int dim,
+                                            float pos,
+                                            BoundBox &left_bounds,
+                                            BoundBox &right_bounds)
+{
+  /* Points are not actually split: only the center is classified against the
+   * plane, on the assumption that points are small enough for that to be fine. */
+  const float3 local_point = pointcloud->get_points()[prim_index];
+  const float3 transformed = (tfm != NULL) ? transform_point(tfm, local_point) : local_point;
+
+  float3 point = get_unaligned_point(transformed);
+  const float coord = point[dim];
+
+  /* A center lying exactly on the plane grows both sides. */
+  if (coord <= pos) {
+    left_bounds.grow(point);
+  }
+
+  if (coord >= pos) {
+    right_bounds.grow(point);
+  }
+}
+
 void BVHSpatialSplit::split_triangle_reference(const BVHReference &ref,
                                               const Mesh *mesh,
                                               int dim,
@@ -453,6 +480,16 @@ void BVHSpatialSplit::split_curve_reference(const BVHReference &ref,
                        right_bounds);
 }

+void BVHSpatialSplit::split_point_reference(const BVHReference &ref,
+                                            const PointCloud *pointcloud,
+                                            int dim,
+                                            float pos,
+                                            BoundBox &left_bounds,
+                                            BoundBox &right_bounds)
+{
+  split_point_primitive(pointcloud, NULL, ref.prim_index(), dim, pos, left_bounds, right_bounds);
+}
+
 void BVHSpatialSplit::split_object_reference(
    const Object *object, int dim, float pos, BoundBox &left_bounds, BoundBox &right_bounds)
 {
@@ -475,6 +512,13 @@ void BVHSpatialSplit::split_object_reference(
      }
    }
  }
+  else if (geom->geometry_type == Geometry::POINTCLOUD) {
+    PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+    for (int point_idx = 0; point_idx < pointcloud->num_points(); ++point_idx) {
+      split_point_primitive(
+          pointcloud, &object->get_tfm(), point_idx, dim, pos, left_bounds, right_bounds);
+    }
+  }
 }

 void BVHSpatialSplit::split_reference(const BVHBuild &builder,
@@ -491,14 +535,18 @@ void BVHSpatialSplit::split_reference(const BVHBuild &builder,
  /* loop over vertices/edges. */
  const Object *ob = builder.objects[ref.prim_object()];

-  if (ref.prim_type() & PRIMITIVE_ALL_TRIANGLE) {
+  if (ref.prim_type() & PRIMITIVE_TRIANGLE) {
    Mesh *mesh = static_cast<Mesh *>(ob->get_geometry());
    split_triangle_reference(ref, mesh, dim, pos, left_bounds, right_bounds);
  }
-  else if (ref.prim_type() & PRIMITIVE_ALL_CURVE) {
+  else if (ref.prim_type() & PRIMITIVE_CURVE) {
    Hair *hair = static_cast<Hair *>(ob->get_geometry());
    split_curve_reference(ref, hair, dim, pos, left_bounds, right_bounds);
  }
+  else if (ref.prim_type() & PRIMITIVE_POINT) {
+    PointCloud *pointcloud = static_cast<PointCloud *>(ob->get_geometry());
+    split_point_reference(ref, pointcloud, dim, pos, left_bounds, right_bounds);
+  }
  else {
    split_object_reference(ob, dim, pos, left_bounds, right_bounds);
  }
--- a/intern/cycles/bvh/split.h
+++ b/intern/cycles/bvh/split.h
@@ -26,6 +26,7 @@ CCL_NAMESPACE_BEGIN
 class BVHBuild;
 class Hair;
 class Mesh;
+class PointCloud;
 struct Transform;

 /* Object Split */
@@ -123,6 +124,13 @@ class BVHSpatialSplit {
                             float pos,
                             BoundBox &left_bounds,
                             BoundBox &right_bounds);
+  void split_point_primitive(const PointCloud *pointcloud,
+                             const Transform *tfm,
+                             int prim_index,
+                             int dim,
+                             float pos,
+                             BoundBox &left_bounds,
+                             BoundBox &right_bounds);

  /* Lower-level functions which calculates boundaries of left and right nodes
   * needed for spatial split.
@@ -141,6 +149,12 @@ class BVHSpatialSplit {
                             float pos,
                             BoundBox &left_bounds,
                             BoundBox &right_bounds);
+  void split_point_reference(const BVHReference &ref,
+                             const PointCloud *pointcloud,
+                             int dim,
+                             float pos,
+                             BoundBox &left_bounds,
+                             BoundBox &right_bounds);
  void split_object_reference(
      const Object *object, int dim, float pos, BoundBox &left_bounds, BoundBox &right_bounds);

--- a/intern/cycles/bvh/unaligned.cpp
+++ b/intern/cycles/bvh/unaligned.cpp
@@ -69,7 +69,7 @@ bool BVHUnaligned::compute_aligned_space(const BVHReference &ref, Transform *ali
  const int packed_type = ref.prim_type();
  const int type = (packed_type & PRIMITIVE_ALL);
  /* No motion blur curves here, we can't fit them to aligned boxes well. */
-  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
+  if ((type & PRIMITIVE_CURVE) && !(type & PRIMITIVE_MOTION)) {
    const int curve_index = ref.prim_index();
    const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
    const Hair *hair = static_cast<const Hair *>(object->get_geometry());
@@ -95,7 +95,7 @@ BoundBox BVHUnaligned::compute_aligned_prim_boundbox(const BVHReference &prim,
  const int packed_type = prim.prim_type();
  const int type = (packed_type & PRIMITIVE_ALL);
  /* No motion blur curves here, we can't fit them to aligned boxes well. */
-  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_CURVE_THICK)) {
+  if ((type & PRIMITIVE_CURVE) && !(type & PRIMITIVE_MOTION)) {
    const int curve_index = prim.prim_index();
    const int segment = PRIMITIVE_UNPACK_SEGMENT(packed_type);
    const Hair *hair = static_cast<const Hair *>(object->get_geometry());
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -551,4 +551,23 @@ if(NOT WITH_HIP_DYNLOAD)
  set(WITH_HIP_DYNLOAD ON)
 endif()

+###########################################################################
+# Metal
+###########################################################################
+
+if(WITH_CYCLES_DEVICE_METAL)
+  find_library(METAL_LIBRARY Metal)
+
+  # MTLFunctionStitching.h was added in the 12.0 SDK, so it serves as the version check.
+  if(NOT METAL_LIBRARY)
+    message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
+    set(WITH_CYCLES_DEVICE_METAL OFF)
+  elseif(NOT EXISTS "${METAL_LIBRARY}/Headers/MTLFunctionStitching.h")
+    message(STATUS "Metal version too old, must be SDK 12.0 or newer, disabling WITH_CYCLES_DEVICE_METAL")
+    set(WITH_CYCLES_DEVICE_METAL OFF)
+  else()
+    message(STATUS "Found Metal: ${METAL_LIBRARY}")
+  endif()
+endif()
+
 unset(_cycles_lib_dir)
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -43,7 +43,7 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
  add_definitions(-DWITH_HIP_DYNLOAD)
 endif()

-set(SRC
+set(SRC_BASE
  device.cpp
  denoise.cpp
  graphics_interop.cpp
@@ -104,6 +104,21 @@ set(SRC_MULTI
  multi/device.h
 )

+set(SRC_METAL
+  metal/bvh.mm
+  metal/bvh.h
+  metal/device.mm
+  metal/device.h
+  metal/device_impl.mm
+  metal/device_impl.h
+  metal/kernel.mm
+  metal/kernel.h
+  metal/queue.mm
+  metal/queue.h
+  metal/util.mm
+  metal/util.h
+)
+
 set(SRC_OPTIX
  optix/device.cpp
  optix/device.h
@@ -123,6 +138,17 @@ set(SRC_HEADERS
  queue.h
 )

+set(SRC
+  ${SRC_BASE}
+  ${SRC_CPU}
+  ${SRC_CUDA}
+  ${SRC_HIP}
+  ${SRC_DUMMY}
+  ${SRC_MULTI}
+  ${SRC_OPTIX}
+  ${SRC_HEADERS}
+)
+
 set(LIB
  cycles_kernel
  cycles_util
@@ -158,6 +184,15 @@ endif()
 if(WITH_CYCLES_DEVICE_OPTIX)
  add_definitions(-DWITH_OPTIX)
 endif()
+if(WITH_CYCLES_DEVICE_METAL)
+  # Metal backend: define WITH_METAL, build the Metal sources and link
+  # against ${METAL_LIBRARY}.
+  add_definitions(-DWITH_METAL)
+  list(APPEND SRC ${SRC_METAL})
+  list(APPEND LIB
+    ${METAL_LIBRARY}
+  )
+endif()

 if(WITH_OPENIMAGEDENOISE)
  list(APPEND LIB
@@ -168,20 +203,14 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

-cycles_add_library(cycles_device "${LIB}"
-  ${SRC}
-  ${SRC_CPU}
-  ${SRC_CUDA}
-  ${SRC_HIP}
-  ${SRC_DUMMY}
-  ${SRC_MULTI}
-  ${SRC_OPTIX}
-  ${SRC_HEADERS}
-)
+cycles_add_library(cycles_device "${LIB}" ${SRC})

 source_group("cpu" FILES ${SRC_CPU})
 source_group("cuda" FILES ${SRC_CUDA})
 source_group("dummy" FILES ${SRC_DUMMY})
 source_group("multi" FILES ${SRC_MULTI})
+source_group("metal" FILES ${SRC_METAL})
 source_group("optix" FILES ${SRC_OPTIX})
-source_group("common" FILES ${SRC} ${SRC_HEADERS})
+# SRC now aggregates every backend list, and the last source_group() listing a
+# file wins, so only the base sources and shared headers belong in "common".
+source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS})
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -129,8 +129,7 @@ void CPUDevice::mem_alloc(device_memory &mem)
              << string_human_readable_size(mem.memory_size()) << ")";
    }

-    if (mem.type == MEM_DEVICE_ONLY) {
-      assert(!mem.host_pointer);
+    if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
      size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
      void *data = util_aligned_malloc(mem.memory_size(), alignment);
      mem.device_pointer = (device_ptr)data;
@@ -189,7 +188,7 @@ void CPUDevice::mem_free(device_memory &mem)
    tex_free((device_texture &)mem);
  }
  else if (mem.device_pointer) {
-    if (mem.type == MEM_DEVICE_ONLY) {
+    if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
      util_aligned_free((void *)mem.device_pointer);
    }
    mem.device_pointer = 0;
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -678,7 +678,7 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_

  void *shared_pointer = 0;

-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
    if (mem.shared_pointer) {
      /* Another device already allocated host memory. */
      mem_alloc_result = CUDA_SUCCESS;
@@ -701,8 +701,14 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
  }

  if (mem_alloc_result != CUDA_SUCCESS) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
  }

  if (mem.name) {
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -27,6 +27,7 @@
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
 #include "device/hip/device.h"
+#include "device/metal/device.h"
 #include "device/multi/device.h"
 #include "device/optix/device.h"

@@ -49,6 +50,7 @@ vector<DeviceInfo> Device::cuda_devices;
 vector<DeviceInfo> Device::optix_devices;
 vector<DeviceInfo> Device::cpu_devices;
 vector<DeviceInfo> Device::hip_devices;
+vector<DeviceInfo> Device::metal_devices;
 uint Device::devices_initialized_mask = 0;

 /* Device */
@@ -105,6 +107,12 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      break;
 #endif

+#ifdef WITH_METAL
+    case DEVICE_METAL:
+      if (device_metal_init())
+        device = device_metal_create(info, stats, profiler);
+      break;
+#endif
    default:
      break;
  }
@@ -128,6 +136,8 @@ DeviceType Device::type_from_string(const char *name)
    return DEVICE_MULTI;
  else if (strcmp(name, "HIP") == 0)
    return DEVICE_HIP;
+  else if (strcmp(name, "METAL") == 0)
+    return DEVICE_METAL;

  return DEVICE_NONE;
 }
@@ -144,6 +154,8 @@ string Device::string_from_type(DeviceType type)
    return "MULTI";
  else if (type == DEVICE_HIP)
    return "HIP";
+  else if (type == DEVICE_METAL)
+    return "METAL";

  return "";
 }
@@ -161,7 +173,9 @@ vector<DeviceType> Device::available_types()
 #ifdef WITH_HIP
  types.push_back(DEVICE_HIP);
 #endif
-
+#ifdef WITH_METAL
+  types.push_back(DEVICE_METAL);
+#endif
  return types;
 }

@@ -227,6 +241,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
    }
  }

+#ifdef WITH_METAL
+  if (mask & DEVICE_MASK_METAL) {
+    if (!(devices_initialized_mask & DEVICE_MASK_METAL)) {
+      if (device_metal_init()) {
+        device_metal_info(metal_devices);
+      }
+      devices_initialized_mask |= DEVICE_MASK_METAL;
+    }
+    foreach (DeviceInfo &info, metal_devices) {
+      devices.push_back(info);
+    }
+  }
+#endif
+
  return devices;
 }

@@ -266,6 +294,15 @@ string Device::device_capabilities(uint mask)
  }
 #endif

+#ifdef WITH_METAL
+  if (mask & DEVICE_MASK_METAL) {
+    if (device_metal_init()) {
+      capabilities += "\nMetal device capabilities:\n";
+      capabilities += device_metal_capabilities();
+    }
+  }
+#endif
+
  return capabilities;
 }

@@ -354,6 +391,7 @@ void Device::free_memory()
  optix_devices.free_memory();
  hip_devices.free_memory();
  cpu_devices.free_memory();
+  metal_devices.free_memory();
 }

 unique_ptr<DeviceQueue> Device::gpu_queue_create()
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -52,6 +52,7 @@ enum DeviceType {
  DEVICE_MULTI,
  DEVICE_OPTIX,
  DEVICE_HIP,
+  DEVICE_METAL,
  DEVICE_DUMMY,
 };

@@ -60,6 +61,7 @@ enum DeviceTypeMask {
  DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
  DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
  DEVICE_MASK_HIP = (1 << DEVICE_HIP),
+  DEVICE_MASK_METAL = (1 << DEVICE_METAL),
  DEVICE_MASK_ALL = ~0
 };

@@ -281,6 +283,7 @@ class Device {
  static vector<DeviceInfo> optix_devices;
  static vector<DeviceInfo> cpu_devices;
  static vector<DeviceInfo> hip_devices;
+  static vector<DeviceInfo> metal_devices;
  static uint devices_initialized_mask;
 };

--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -263,6 +263,7 @@ class device_memory {
  friend class CUDADevice;
  friend class OptiXDevice;
  friend class HIPDevice;
+  friend class MetalDevice;

  /* Only create through subclasses. */
  device_memory(Device *device, const char *name, MemoryType type);
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+#  include "bvh/bvh.h"
+#  include "bvh/params.h"
+#  include "device/memory.h"
+
+#  include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class BVHMetal : public BVH {
+ public:
+  API_AVAILABLE(macos(11.0))
+  id<MTLAccelerationStructure> accel_struct = nil;
+  bool accel_struct_building = false;
+
+  API_AVAILABLE(macos(11.0))
+  vector<id<MTLAccelerationStructure>> blas_array;
+
+  bool motion_blur = false;
+
+  Stats &stats;
+
+  bool build(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+
+  BVHMetal(const BVHParams &params,
+           const vector<Geometry *> &geometry,
+           const vector<Object *> &objects,
+           Device *device);
+  virtual ~BVHMetal();
+
+  bool build_BLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+  bool build_BLAS_mesh(Progress &progress,
+                       id<MTLDevice> device,
+                       id<MTLCommandQueue> queue,
+                       Geometry *const geom,
+                       bool refit);
+  bool build_BLAS_hair(Progress &progress,
+                       id<MTLDevice> device,
+                       id<MTLCommandQueue> queue,
+                       Geometry *const geom,
+                       bool refit);
+  bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -0,0 +1,813 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "scene/hair.h"
+#  include "scene/mesh.h"
+#  include "scene/object.h"
+
+#  include "util/progress.h"
+
+#  include "device/metal/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Report build progress: formats the printf-style arguments with string_printf() and passes the
+ * result to `progress.set_substatus()`. A variable named `progress` must be in scope at the call
+ * site.
+ *
+ * Wrapped in do/while(0) so that `BVH_status(...);` expands to exactly one statement and stays
+ * correct when used as the body of an unbraced if/else. */
+#  define BVH_status(...) \
+    do { \
+      string str = string_printf(__VA_ARGS__); \
+      progress.set_substatus(str); \
+    } while (0)
+
+/* Construct a Metal BVH. The parameters, geometry and object lists are forwarded to the BVH base
+ * class; the device's stats object is kept by reference for acceleration structure memory
+ * accounting. No Metal resources are created here. */
+BVHMetal::BVHMetal(const BVHParams &params_,
+                   const vector<Geometry *> &geometry_,
+                   const vector<Object *> &objects_,
+                   Device *device)
+    : BVH(params_, geometry_, objects_),
+      stats(device->stats)
+{
+}
+
+/* Release the acceleration structure held by this BVH, if there is one, and remove its
+ * allocation from the memory statistics. */
+BVHMetal::~BVHMetal()
+{
+  if (@available(macos 12.0, *)) {
+    if (accel_struct == nil) {
+      /* Nothing was ever built (or it has already been freed). */
+      return;
+    }
+    stats.mem_free([accel_struct allocatedSize]);
+    [accel_struct release];
+  }
+}
+
+/* Build (or refit) the bottom-level acceleration structure for a triangle mesh.
+ *
+ * Vertex and index data are copied into temporary Metal buffers and the build is encoded and
+ * committed on `queue` without waiting for it: `accel_struct_building` is set before the commit
+ * and cleared from a command buffer completion handler once `accel_struct` has been assigned.
+ * For static BVHs (BVH_TYPE_STATIC) the result is additionally compacted by a second command
+ * buffer before it is stored.
+ *
+ * When refitting, `accel_struct` is used as the refit source and is released once the refit has
+ * completed and the new structure has taken its place.
+ *
+ * Returns false if the mesh has no triangles or the macOS 12.0 API is unavailable, true once the
+ * build has been submitted. */
+bool BVHMetal::build_BLAS_mesh(Progress &progress,
+                               id<MTLDevice> device,
+                               id<MTLCommandQueue> queue,
+                               Geometry *const geom,
+                               bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    /* Build BLAS for triangle primitives */
+    Mesh *const mesh = static_cast<Mesh *const>(geom);
+    if (mesh->num_triangles() == 0) {
+      return false;
+    }
+
+    /*------------------------------------------------*/
+    BVH_status(
+        "Building mesh BLAS | %7d tris | %s", (int)mesh->num_triangles(), geom->name.c_str());
+    /*------------------------------------------------*/
+
+    const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+    const array<float3> &verts = mesh->get_verts();
+    const array<int> &tris = mesh->get_triangles();
+    const size_t num_verts = verts.size();
+    const size_t num_indices = tris.size();
+
+    /* Motion keys are only used when motion blur is enabled on both the BVH and the mesh and the
+     * motion attribute actually exists. */
+    size_t num_motion_steps = 1;
+    Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+    if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+      num_motion_steps = mesh->get_motion_steps();
+    }
+
+    MTLResourceOptions storage_mode;
+    if (device.hasUnifiedMemory) {
+      storage_mode = MTLResourceStorageModeShared;
+    }
+    else {
+      storage_mode = MTLResourceStorageModeManaged;
+    }
+
+    /* Upload the mesh data to the GPU */
+    id<MTLBuffer> posBuf = nil;
+    id<MTLBuffer> indexBuf = [device newBufferWithBytes:tris.data()
+                                                 length:num_indices * sizeof(tris.data()[0])
+                                                options:storage_mode];
+
+    if (num_motion_steps == 1) {
+      posBuf = [device newBufferWithBytes:verts.data()
+                                   length:num_verts * sizeof(verts.data()[0])
+                                  options:storage_mode];
+    }
+    else {
+      /* One contiguous buffer holding the vertex positions of every motion step back to back. */
+      posBuf = [device newBufferWithLength:num_verts * num_motion_steps * sizeof(verts.data()[0])
+                                   options:storage_mode];
+      float3 *dest_data = (float3 *)[posBuf contents];
+      size_t center_step = (num_motion_steps - 1) / 2;
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        const float3 *step_verts = verts.data();
+
+        /* The center step for motion vertices is not stored in the attribute. */
+        if (step != center_step) {
+          step_verts = motion_keys->data_float3() +
+                       (step > center_step ? step - 1 : step) * num_verts;
+        }
+        memcpy(dest_data + num_verts * step, step_verts, num_verts * sizeof(float3));
+      }
+      /* Managed buffers need to be told about CPU-side writes. */
+      if (storage_mode == MTLResourceStorageModeManaged) {
+        [posBuf didModifyRange:NSMakeRange(0, posBuf.length)];
+      }
+    }
+
+    /* Create an acceleration structure. */
+    MTLAccelerationStructureGeometryDescriptor *geomDesc;
+    if (num_motion_steps > 1) {
+      /* One keyframe per motion step, each pointing at its slice of `posBuf`. */
+      std::vector<MTLMotionKeyframeData *> vertex_ptrs;
+      vertex_ptrs.reserve(num_motion_steps);
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
+        k.buffer = posBuf;
+        k.offset = num_verts * step * sizeof(float3);
+        vertex_ptrs.push_back(k);
+      }
+
+      MTLAccelerationStructureMotionTriangleGeometryDescriptor *geomDescMotion =
+          [MTLAccelerationStructureMotionTriangleGeometryDescriptor descriptor];
+      geomDescMotion.vertexBuffers = [NSArray arrayWithObjects:vertex_ptrs.data()
+                                                         count:vertex_ptrs.size()];
+      geomDescMotion.vertexStride = sizeof(verts.data()[0]);
+      geomDescMotion.indexBuffer = indexBuf;
+      geomDescMotion.indexBufferOffset = 0;
+      geomDescMotion.indexType = MTLIndexTypeUInt32;
+      geomDescMotion.triangleCount = num_indices / 3;
+      geomDescMotion.intersectionFunctionTableOffset = 0;
+
+      geomDesc = geomDescMotion;
+    }
+    else {
+      MTLAccelerationStructureTriangleGeometryDescriptor *geomDescNoMotion =
+          [MTLAccelerationStructureTriangleGeometryDescriptor descriptor];
+      geomDescNoMotion.vertexBuffer = posBuf;
+      geomDescNoMotion.vertexBufferOffset = 0;
+      geomDescNoMotion.vertexStride = sizeof(verts.data()[0]);
+      geomDescNoMotion.indexBuffer = indexBuf;
+      geomDescNoMotion.indexBufferOffset = 0;
+      geomDescNoMotion.indexType = MTLIndexTypeUInt32;
+      geomDescNoMotion.triangleCount = num_indices / 3;
+      geomDescNoMotion.intersectionFunctionTableOffset = 0;
+
+      geomDesc = geomDescNoMotion;
+    }
+
+    /* Force a single any-hit call, so shadow record-all behavior works correctly */
+    /* (Match optix behavior: unsigned int build_flags =
+     * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+    geomDesc.allowDuplicateIntersectionFunctionInvocation = false;
+
+    MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
+        [MTLPrimitiveAccelerationStructureDescriptor descriptor];
+    accelDesc.geometryDescriptors = @[ geomDesc ];
+    if (num_motion_steps > 1) {
+      accelDesc.motionStartTime = 0.0f;
+      accelDesc.motionEndTime = 1.0f;
+      accelDesc.motionStartBorderMode = MTLMotionBorderModeClamp;
+      accelDesc.motionEndBorderMode = MTLMotionBorderModeClamp;
+      accelDesc.motionKeyframeCount = num_motion_steps;
+    }
+
+    /* Non-static BVHs are built for fast (re)builds and must allow refitting. */
+    if (!use_fast_trace_bvh) {
+      accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+                          MTLAccelerationStructureUsagePreferFastBuild);
+    }
+
+    MTLAccelerationStructureSizes accelSizes = [device
+        accelerationStructureSizesWithDescriptor:accelDesc];
+    id<MTLAccelerationStructure> accel_uncompressed = [device
+        newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+    id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+                                                   options:MTLResourceStorageModePrivate];
+    /* Receives the compacted size (64-bit) written by the GPU for static BVHs. */
+    id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+    id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+    id<MTLAccelerationStructureCommandEncoder> accelEnc =
+        [accelCommands accelerationStructureCommandEncoder];
+    if (refit) {
+      [accelEnc refitAccelerationStructure:accel_struct
+                                descriptor:accelDesc
+                               destination:accel_uncompressed
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    else {
+      [accelEnc buildAccelerationStructure:accel_uncompressed
+                                descriptor:accelDesc
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    if (use_fast_trace_bvh) {
+      [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
+                                               toBuffer:sizeBuf
+                                                 offset:0
+                                           sizeDataType:MTLDataTypeULong];
+    }
+    [accelEnc endEncoding];
+    [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+      /* free temp resources */
+      [scratchBuf release];
+      [indexBuf release];
+      [posBuf release];
+
+      if (use_fast_trace_bvh) {
+        /* Compact the accel structure */
+        uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
+
+        dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+          id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+          id<MTLAccelerationStructureCommandEncoder> accelEnc =
+              [accelCommands accelerationStructureCommandEncoder];
+          id<MTLAccelerationStructure> accel = [device
+              newAccelerationStructureWithSize:compressed_size];
+          [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
+                                toAccelerationStructure:accel];
+          [accelEnc endEncoding];
+          [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+            uint64_t allocated_size = [accel allocatedSize];
+            stats.mem_alloc(allocated_size);
+            accel_struct = accel;
+            [accel_uncompressed release];
+            accel_struct_building = false;
+          }];
+          [accelCommands commit];
+        });
+      }
+      else {
+        /* After a refit, `accel_struct` still refers to the refit source. The refit has
+         * completed by the time this handler runs, so release the old structure and undo its
+         * memory accounting instead of leaking it when it is replaced below. */
+        if (accel_struct) {
+          stats.mem_free(accel_struct.allocatedSize);
+          [accel_struct release];
+        }
+
+        /* set our acceleration structure to the uncompressed structure */
+        accel_struct = accel_uncompressed;
+
+        uint64_t allocated_size = [accel_struct allocatedSize];
+        stats.mem_alloc(allocated_size);
+        accel_struct_building = false;
+      }
+      [sizeBuf release];
+    }];
+
+    /* Mark as in-flight before committing; the completion handler clears the flag. */
+    accel_struct_building = true;
+    [accelCommands commit];
+
+    return true;
+  }
+  return false;
+}
+
+/* Build (or refit) the bottom-level acceleration structure for hair curves.
+ *
+ * Every curve segment is represented by one axis-aligned bounding box per motion step. The boxes
+ * are written into a temporary Metal buffer and the build is encoded and committed on `queue`
+ * without waiting for it: `accel_struct_building` is set before the commit and cleared from a
+ * command buffer completion handler once `accel_struct` has been assigned. For static BVHs
+ * (BVH_TYPE_STATIC) the result is additionally compacted by a second command buffer.
+ *
+ * When refitting, `accel_struct` is used as the refit source and is released once the refit has
+ * completed and the new structure has taken its place.
+ *
+ * Returns false if there are no curves or the macOS 12.0 API is unavailable, true once the build
+ * has been submitted. */
+bool BVHMetal::build_BLAS_hair(Progress &progress,
+                               id<MTLDevice> device,
+                               id<MTLCommandQueue> queue,
+                               Geometry *const geom,
+                               bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    /* Build BLAS for hair curves */
+    Hair *hair = static_cast<Hair *>(geom);
+    if (hair->num_curves() == 0) {
+      return false;
+    }
+
+    /*------------------------------------------------*/
+    BVH_status(
+        "Building hair BLAS | %7d curves | %s", (int)hair->num_curves(), geom->name.c_str());
+    /*------------------------------------------------*/
+
+    const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+    const size_t num_segments = hair->num_segments();
+
+    /* Motion keys are only used when motion blur is enabled on both the BVH and the hair and the
+     * motion attribute actually exists. */
+    size_t num_motion_steps = 1;
+    Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+    if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+      num_motion_steps = hair->get_motion_steps();
+    }
+
+    const size_t num_aabbs = num_segments * num_motion_steps;
+
+    MTLResourceOptions storage_mode;
+    if (device.hasUnifiedMemory) {
+      storage_mode = MTLResourceStorageModeShared;
+    }
+    else {
+      storage_mode = MTLResourceStorageModeManaged;
+    }
+
+    /* Allocate a GPU buffer for the AABB data and populate it */
+    id<MTLBuffer> aabbBuf = [device
+        newBufferWithLength:num_aabbs * sizeof(MTLAxisAlignedBoundingBox)
+                    options:storage_mode];
+    MTLAxisAlignedBoundingBox *aabb_data = (MTLAxisAlignedBoundingBox *)[aabbBuf contents];
+
+    /* Get AABBs for each motion step */
+    size_t center_step = (num_motion_steps - 1) / 2;
+    for (size_t step = 0; step < num_motion_steps; ++step) {
+      /* The center step for motion vertices is not stored in the attribute */
+      const float3 *keys = hair->get_curve_keys().data();
+      if (step != center_step) {
+        size_t attr_offset = (step > center_step) ? step - 1 : step;
+        /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4) */
+        keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+      }
+
+      /* `i` is the running segment index across all curves of this step. */
+      for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+        const Hair::Curve curve = hair->get_curve(j);
+
+        for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+          BoundBox bounds = BoundBox::empty;
+          curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+          /* Boxes are laid out step-major: all segments of step 0, then step 1, ... */
+          const size_t index = step * num_segments + i;
+          aabb_data[index].min = (MTLPackedFloat3 &)bounds.min;
+          aabb_data[index].max = (MTLPackedFloat3 &)bounds.max;
+        }
+      }
+    }
+
+    /* Managed buffers need to be told about CPU-side writes. */
+    if (storage_mode == MTLResourceStorageModeManaged) {
+      [aabbBuf didModifyRange:NSMakeRange(0, aabbBuf.length)];
+    }
+
+#  if 0
+    for (size_t i=0; i<num_aabbs && i < 400; i++) {
+      MTLAxisAlignedBoundingBox& bb = aabb_data[i];
+      printf("  %d:   %.1f,%.1f,%.1f -- %.1f,%.1f,%.1f\n", int(i), bb.min.x, bb.min.y, bb.min.z, bb.max.x, bb.max.y, bb.max.z);
+    }
+#  endif
+
+    MTLAccelerationStructureGeometryDescriptor *geomDesc;
+    if (motion_blur) {
+      /* One keyframe per motion step, each pointing at its slice of `aabbBuf`. */
+      std::vector<MTLMotionKeyframeData *> aabb_ptrs;
+      aabb_ptrs.reserve(num_motion_steps);
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
+        k.buffer = aabbBuf;
+        k.offset = step * num_segments * sizeof(MTLAxisAlignedBoundingBox);
+        aabb_ptrs.push_back(k);
+      }
+
+      MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor *geomDescMotion =
+          [MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor descriptor];
+      geomDescMotion.boundingBoxBuffers = [NSArray arrayWithObjects:aabb_ptrs.data()
+                                                              count:aabb_ptrs.size()];
+      geomDescMotion.boundingBoxCount = num_segments;
+      geomDescMotion.boundingBoxStride = sizeof(aabb_data[0]);
+      geomDescMotion.intersectionFunctionTableOffset = 1;
+
+      /* Force a single any-hit call, so shadow record-all behavior works correctly */
+      /* (Match optix behavior: unsigned int build_flags =
+       * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+      geomDescMotion.allowDuplicateIntersectionFunctionInvocation = false;
+      geomDescMotion.opaque = true;
+      geomDesc = geomDescMotion;
+    }
+    else {
+      MTLAccelerationStructureBoundingBoxGeometryDescriptor *geomDescNoMotion =
+          [MTLAccelerationStructureBoundingBoxGeometryDescriptor descriptor];
+      geomDescNoMotion.boundingBoxBuffer = aabbBuf;
+      geomDescNoMotion.boundingBoxBufferOffset = 0;
+      geomDescNoMotion.boundingBoxCount = int(num_aabbs);
+      geomDescNoMotion.boundingBoxStride = sizeof(aabb_data[0]);
+      geomDescNoMotion.intersectionFunctionTableOffset = 1;
+
+      /* Force a single any-hit call, so shadow record-all behavior works correctly */
+      /* (Match optix behavior: unsigned int build_flags =
+       * OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
+      geomDescNoMotion.allowDuplicateIntersectionFunctionInvocation = false;
+      geomDescNoMotion.opaque = true;
+      geomDesc = geomDescNoMotion;
+    }
+
+    MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
+        [MTLPrimitiveAccelerationStructureDescriptor descriptor];
+    accelDesc.geometryDescriptors = @[ geomDesc ];
+
+    if (motion_blur) {
+      accelDesc.motionStartTime = 0.0f;
+      accelDesc.motionEndTime = 1.0f;
+      accelDesc.motionStartBorderMode = MTLMotionBorderModeVanish;
+      accelDesc.motionEndBorderMode = MTLMotionBorderModeVanish;
+      accelDesc.motionKeyframeCount = num_motion_steps;
+    }
+
+    /* Non-static BVHs are built for fast (re)builds and must allow refitting. */
+    if (!use_fast_trace_bvh) {
+      accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+                          MTLAccelerationStructureUsagePreferFastBuild);
+    }
+
+    MTLAccelerationStructureSizes accelSizes = [device
+        accelerationStructureSizesWithDescriptor:accelDesc];
+    id<MTLAccelerationStructure> accel_uncompressed = [device
+        newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+    id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+                                                   options:MTLResourceStorageModePrivate];
+    /* Receives the compacted size (64-bit) written by the GPU for static BVHs. */
+    id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
+    id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+    id<MTLAccelerationStructureCommandEncoder> accelEnc =
+        [accelCommands accelerationStructureCommandEncoder];
+    if (refit) {
+      [accelEnc refitAccelerationStructure:accel_struct
+                                descriptor:accelDesc
+                               destination:accel_uncompressed
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    else {
+      [accelEnc buildAccelerationStructure:accel_uncompressed
+                                descriptor:accelDesc
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    if (use_fast_trace_bvh) {
+      [accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
+                                               toBuffer:sizeBuf
+                                                 offset:0
+                                           sizeDataType:MTLDataTypeULong];
+    }
+    [accelEnc endEncoding];
+    [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+      /* free temp resources */
+      [scratchBuf release];
+      [aabbBuf release];
+
+      if (use_fast_trace_bvh) {
+        /* Compact the accel structure */
+        uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
+
+        dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+          id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+          id<MTLAccelerationStructureCommandEncoder> accelEnc =
+              [accelCommands accelerationStructureCommandEncoder];
+          id<MTLAccelerationStructure> accel = [device
+              newAccelerationStructureWithSize:compressed_size];
+          [accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
+                                toAccelerationStructure:accel];
+          [accelEnc endEncoding];
+          [accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+            uint64_t allocated_size = [accel allocatedSize];
+            stats.mem_alloc(allocated_size);
+            accel_struct = accel;
+            [accel_uncompressed release];
+            accel_struct_building = false;
+          }];
+          [accelCommands commit];
+        });
+      }
+      else {
+        /* After a refit, `accel_struct` still refers to the refit source. The refit has
+         * completed by the time this handler runs, so release the old structure and undo its
+         * memory accounting instead of leaking it when it is replaced below. */
+        if (accel_struct) {
+          stats.mem_free(accel_struct.allocatedSize);
+          [accel_struct release];
+        }
+
+        /* set our acceleration structure to the uncompressed structure */
+        accel_struct = accel_uncompressed;
+
+        uint64_t allocated_size = [accel_struct allocatedSize];
+        stats.mem_alloc(allocated_size);
+        accel_struct_building = false;
+      }
+      [sizeBuf release];
+    }];
+
+    /* Mark as in-flight before committing; the completion handler clears the flag. */
+    accel_struct_building = true;
+    [accelCommands commit];
+    return true;
+  }
+  return false;
+}
+
+/* Build the bottom-level acceleration structure for the single geometry held by this BVH.
+ *
+ * Meshes and volumes go through the triangle path, hair through the bounding-box path. Any
+ * other geometry type, or a system without the macOS 12.0 API, yields false. */
+bool BVHMetal::build_BLAS(Progress &progress,
+                          id<MTLDevice> device,
+                          id<MTLCommandQueue> queue,
+                          bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    assert(objects.size() == 1 && geometry.size() == 1);
+
+    /* Build bottom level acceleration structures (BLAS) */
+    Geometry *const geom = geometry[0];
+    const auto type = geom->geometry_type;
+
+    if (type == Geometry::VOLUME || type == Geometry::MESH) {
+      return build_BLAS_mesh(progress, device, queue, geom, refit);
+    }
+    if (type == Geometry::HAIR) {
+      return build_BLAS_hair(progress, device, queue, geom, refit);
+    }
+    /* Unsupported geometry types fall through to the common failure return. */
+  }
+  return false;
+}
+
+/* Build (or refit) the top-level acceleration structure that instances the BLAS of every
+ * traceable object.
+ *
+ * Steps:
+ *  1. Wait until no referenced BLAS has `accel_struct_building` set.
+ *  2. Count instances and motion transforms, then fill the instance descriptor buffer (and the
+ *     motion transform buffer when `motion_blur` is set).
+ *  3. Encode the build/refit, commit it and wait for completion.
+ *  4. Store the result in `accel_struct` and cache the referenced BLAS in `blas_array`.
+ *
+ * When refitting, `accel_struct` is used as the refit source and is released once it has been
+ * replaced by the new structure.
+ *
+ * Returns true on success, false if the macOS 12.0 API is unavailable. */
+bool BVHMetal::build_TLAS(Progress &progress,
+                          id<MTLDevice> device,
+                          id<MTLCommandQueue> queue,
+                          bool refit)
+{
+  if (@available(macos 12.0, *)) {
+
+    /* we need to sync here and ensure that all BLAS have completed async generation by both GCD
+     * and Metal */
+    {
+      __block bool complete_bvh = false;
+      while (!complete_bvh) {
+        dispatch_sync(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+          complete_bvh = true;
+          for (Object *ob : objects) {
+            /* Skip non-traceable objects */
+            if (!ob->is_traceable())
+              continue;
+
+            Geometry const *geom = ob->get_geometry();
+            BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
+            if (blas->accel_struct_building) {
+              complete_bvh = false;
+
+              /* We're likely waiting on a command buffer that's in flight to complete.
+               * Queue up a command buffer and wait for it to complete before checking the BLAS
+               * again.
+               */
+              id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
+              [command_buffer commit];
+              [command_buffer waitUntilCompleted];
+              break;
+            }
+          }
+        });
+      }
+    }
+
+    /* Every traceable object contributes one instance and at least one motion transform. */
+    uint32_t num_instances = 0;
+    uint32_t num_motion_transforms = 0;
+    for (Object *ob : objects) {
+      /* Skip non-traceable objects */
+      if (!ob->is_traceable())
+        continue;
+      num_instances++;
+
+      if (ob->use_motion()) {
+        num_motion_transforms += max(1, ob->get_motion().size());
+      }
+      else {
+        num_motion_transforms++;
+      }
+    }
+
+    /*------------------------------------------------*/
+    BVH_status("Building TLAS      | %7d instances", (int)num_instances);
+    /*------------------------------------------------*/
+
+    const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
+
+    NSMutableArray *all_blas = [NSMutableArray array];
+    unordered_map<BVHMetal const *, int> instance_mapping;
+
+    /* Lambda function to build/retrieve the BLAS index mapping */
+    auto get_blas_index = [&](BVHMetal const *blas) {
+      auto it = instance_mapping.find(blas);
+      if (it != instance_mapping.end()) {
+        return it->second;
+      }
+      else {
+        /* First time this BLAS is seen: append it and remember its index. */
+        int blas_index = (int)[all_blas count];
+        instance_mapping[blas] = blas_index;
+        if (@available(macos 12.0, *)) {
+          [all_blas addObject:blas->accel_struct];
+        }
+        return blas_index;
+      }
+    };
+
+    MTLResourceOptions storage_mode;
+    if (device.hasUnifiedMemory) {
+      storage_mode = MTLResourceStorageModeShared;
+    }
+    else {
+      storage_mode = MTLResourceStorageModeManaged;
+    }
+
+    /* The instance descriptor layout depends on whether motion is enabled. */
+    size_t instance_size;
+    if (motion_blur) {
+      instance_size = sizeof(MTLAccelerationStructureMotionInstanceDescriptor);
+    }
+    else {
+      instance_size = sizeof(MTLAccelerationStructureUserIDInstanceDescriptor);
+    }
+
+    /* Allocate a GPU buffer for the instance data and populate it */
+    id<MTLBuffer> instanceBuf = [device newBufferWithLength:num_instances * instance_size
+                                                    options:storage_mode];
+    id<MTLBuffer> motion_transforms_buf = nil;
+    MTLPackedFloat4x3 *motion_transforms = nullptr;
+    if (motion_blur && num_motion_transforms) {
+      motion_transforms_buf = [device
+          newBufferWithLength:num_motion_transforms * sizeof(MTLPackedFloat4x3)
+                      options:storage_mode];
+      motion_transforms = (MTLPackedFloat4x3 *)motion_transforms_buf.contents;
+    }
+
+    uint32_t instance_index = 0;
+    uint32_t motion_transform_index = 0;
+    for (Object *ob : objects) {
+      /* Skip non-traceable objects */
+      if (!ob->is_traceable())
+        continue;
+
+      Geometry const *geom = ob->get_geometry();
+
+      BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
+      uint32_t accel_struct_index = get_blas_index(blas);
+
+      /* Add some of the object visibility bits to the mask.
+       * __prim_visibility contains the combined visibility bits of all instances, so is not
+       * reliable if they differ between instances.
+       *
+       * METAL_WIP: OptiX visibility mask can only contain 8 bits, so have to trade-off here
+       * and select just a few important ones.
+       */
+      uint32_t mask = ob->visibility_for_tracing() & 0xFF;
+
+      /* Have to have at least one bit in the mask, or else instance would always be culled. */
+      if (0 == mask) {
+        mask = 0xFF;
+      }
+
+      /* Set user instance ID to object index */
+      int object_index = ob->get_device_index();
+      uint32_t user_id = uint32_t(object_index);
+
+      /* Bake into the appropriate descriptor */
+      if (motion_blur) {
+        MTLAccelerationStructureMotionInstanceDescriptor *instances =
+            (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
+        MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+
+        desc.accelerationStructureIndex = accel_struct_index;
+        desc.userID = user_id;
+        desc.mask = mask;
+        desc.motionStartTime = 0.0f;
+        desc.motionEndTime = 1.0f;
+        desc.motionTransformsStartIndex = motion_transform_index;
+        desc.motionStartBorderMode = MTLMotionBorderModeVanish;
+        desc.motionEndBorderMode = MTLMotionBorderModeVanish;
+        desc.intersectionFunctionTableOffset = 0;
+
+        int key_count = ob->get_motion().size();
+        if (key_count) {
+          desc.motionTransformsCount = key_count;
+
+          Transform *keys = ob->get_motion().data();
+          for (int key = 0; key < key_count; key++) {
+            float *t = (float *)&motion_transforms[motion_transform_index++];
+            /* Transpose transform */
+            auto src = (float const *)&keys[key];
+            for (int i = 0; i < 12; i++) {
+              t[i] = src[(i / 3) + 4 * (i % 3)];
+            }
+          }
+        }
+        else {
+          desc.motionTransformsCount = 1;
+
+          float *t = (float *)&motion_transforms[motion_transform_index++];
+          if (ob->get_geometry()->is_instanced()) {
+            /* Transpose transform */
+            auto src = (float const *)&ob->get_tfm();
+            for (int i = 0; i < 12; i++) {
+              t[i] = src[(i / 3) + 4 * (i % 3)];
+            }
+          }
+          else {
+            /* Clear transform to identity matrix */
+            t[0] = t[4] = t[8] = 1.0f;
+          }
+        }
+      }
+      else {
+        MTLAccelerationStructureUserIDInstanceDescriptor *instances =
+            (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
+        MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+
+        desc.accelerationStructureIndex = accel_struct_index;
+        desc.userID = user_id;
+        desc.mask = mask;
+        desc.intersectionFunctionTableOffset = 0;
+
+        float *t = (float *)&desc.transformationMatrix;
+        if (ob->get_geometry()->is_instanced()) {
+          /* Transpose transform */
+          auto src = (float const *)&ob->get_tfm();
+          for (int i = 0; i < 12; i++) {
+            t[i] = src[(i / 3) + 4 * (i % 3)];
+          }
+        }
+        else {
+          /* Clear transform to identity matrix */
+          t[0] = t[4] = t[8] = 1.0f;
+        }
+      }
+    }
+
+    /* Managed buffers need to be told about CPU-side writes. */
+    if (storage_mode == MTLResourceStorageModeManaged) {
+      [instanceBuf didModifyRange:NSMakeRange(0, instanceBuf.length)];
+      if (motion_transforms_buf) {
+        [motion_transforms_buf didModifyRange:NSMakeRange(0, motion_transforms_buf.length)];
+        assert(num_motion_transforms == motion_transform_index);
+      }
+    }
+
+    MTLInstanceAccelerationStructureDescriptor *accelDesc =
+        [MTLInstanceAccelerationStructureDescriptor descriptor];
+    accelDesc.instanceCount = num_instances;
+    accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeUserID;
+    accelDesc.instanceDescriptorBuffer = instanceBuf;
+    accelDesc.instanceDescriptorBufferOffset = 0;
+    accelDesc.instanceDescriptorStride = instance_size;
+    accelDesc.instancedAccelerationStructures = all_blas;
+
+    if (motion_blur) {
+      accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeMotion;
+      accelDesc.motionTransformBuffer = motion_transforms_buf;
+      accelDesc.motionTransformCount = num_motion_transforms;
+    }
+
+    /* Non-static BVHs are built for fast (re)builds and must allow refitting. */
+    if (!use_fast_trace_bvh) {
+      accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
+                          MTLAccelerationStructureUsagePreferFastBuild);
+    }
+
+    MTLAccelerationStructureSizes accelSizes = [device
+        accelerationStructureSizesWithDescriptor:accelDesc];
+    id<MTLAccelerationStructure> accel = [device
+        newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
+    id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
+                                                   options:MTLResourceStorageModePrivate];
+    id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
+    id<MTLAccelerationStructureCommandEncoder> accelEnc =
+        [accelCommands accelerationStructureCommandEncoder];
+    if (refit) {
+      [accelEnc refitAccelerationStructure:accel_struct
+                                descriptor:accelDesc
+                               destination:accel
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    else {
+      [accelEnc buildAccelerationStructure:accel
+                                descriptor:accelDesc
+                             scratchBuffer:scratchBuf
+                       scratchBufferOffset:0];
+    }
+    [accelEnc endEncoding];
+    [accelCommands commit];
+    /* The TLAS build is synchronous, unlike the BLAS builds. */
+    [accelCommands waitUntilCompleted];
+
+    if (motion_transforms_buf) {
+      [motion_transforms_buf release];
+    }
+    [instanceBuf release];
+    [scratchBuf release];
+
+    /* After a refit, `accel_struct` still refers to the refit source. The command buffer has
+     * completed, so release the old structure and undo its memory accounting instead of leaking
+     * it when it is replaced below. */
+    if (accel_struct) {
+      stats.mem_free(accel_struct.allocatedSize);
+      [accel_struct release];
+    }
+
+    uint64_t allocated_size = [accel allocatedSize];
+    stats.mem_alloc(allocated_size);
+
+    /* Cache top and bottom-level acceleration structs */
+    accel_struct = accel;
+    blas_array.clear();
+    blas_array.reserve(all_blas.count);
+    for (id<MTLAccelerationStructure> blas in all_blas) {
+      blas_array.push_back(blas);
+    }
+
+    return true;
+  }
+  return false;
+}
+
+/* Build or refit this BVH.
+ *
+ * Dispatches to build_TLAS() when `params.top_level` is set and to build_BLAS() otherwise, and
+ * returns that call's result.
+ *
+ * A refit is only carried out when it can work: the BVH must not be static (the builders only
+ * add MTLAccelerationStructureUsageRefit for non-static BVHs) and an existing acceleration
+ * structure must be available to refit from. Otherwise the request is downgraded to a full
+ * rebuild, so the builders are never asked to refit from a nil source. On a full rebuild the
+ * previous acceleration structure is released and removed from the memory statistics. */
+bool BVHMetal::build(Progress &progress,
+                     id<MTLDevice> device,
+                     id<MTLCommandQueue> queue,
+                     bool refit)
+{
+  if (@available(macos 12.0, *)) {
+    if (refit) {
+      if (params.bvh_type == BVH_TYPE_STATIC) {
+        /* Static BVHs are always rebuilt from scratch. */
+        refit = false;
+      }
+      else if (!accel_struct) {
+        /* Nothing to refit from: flag it in debug builds, recover with a full build. */
+        assert(!"BVHMetal: refit requested without an existing acceleration structure");
+        refit = false;
+      }
+    }
+
+    if (!refit && accel_struct) {
+      stats.mem_free(accel_struct.allocatedSize);
+      [accel_struct release];
+      accel_struct = nil;
+    }
+  }
+
+  if (!params.top_level) {
+    return build_BLAS(progress, device, queue, refit);
+  }
+  else {
+    return build_TLAS(progress, device, queue, refit);
+  }
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/device.h
+++ b/intern/cycles/device/metal/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/string.h"
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+/* Initialize the Metal backend. Returns false when Cycles is built without WITH_METAL. */
+bool device_metal_init();
+
+/* Create a Metal device for `info`. Returns nullptr when built without WITH_METAL. */
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+/* Append one DeviceInfo per usable Metal device to `devices` (no-op without WITH_METAL). */
+void device_metal_info(vector<DeviceInfo> &devices);
+
+/* Human-readable summary of the available Metal devices (empty without WITH_METAL). */
+string device_metal_capabilities();
+
+CCL_NAMESPACE_END
--- a/intern/cycles/device/metal/device.mm
+++ b/intern/cycles/device/metal/device.mm
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "device/metal/device.h"
+#  include "device/metal/device_impl.h"
+
+#endif
+
+#include "util/debug.h"
+#include "util/set.h"
+#include "util/system.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef WITH_METAL
+
+/* Create a new MetalDevice for `info`; the caller receives a raw owning pointer. */
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+  return new MetalDevice(info, stats, profiler);
+}
+
+/* Metal needs no explicit runtime initialization here, so this always reports success. */
+bool device_metal_init()
+{
+  return true;
+}
+
+/* Store the number of Metal devices reported by the system in `*num_devices`.
+ *
+ * Always returns 0, so callers must read the count from the out-parameter and must not treat
+ * the return value as a success flag. */
+static int device_metal_get_num_devices_safe(uint32_t *num_devices)
+{
+  /* MTLCopyAllDevices() follows the "Copy" ownership rule: the array is returned retained and
+   * must be released by the caller, otherwise it leaks on every query. */
+  NSArray<id<MTLDevice>> *all_devices = MTLCopyAllDevices();
+  *num_devices = (uint32_t)all_devices.count;
+  [all_devices release];
+  return 0;
+}
+
+/* Append a DeviceInfo entry to `devices` for every usable Metal device.
+ *
+ * Each entry gets a persistent identifier of the form "METAL_<device name>". When several
+ * devices share a name, later ones get an "_ID_<index>" suffix built from their own device
+ * index so that every identifier stays distinct. */
+void device_metal_info(vector<DeviceInfo> &devices)
+{
+  uint32_t num_devices = 0;
+  device_metal_get_num_devices_safe(&num_devices);
+  if (num_devices == 0) {
+    return;
+  }
+
+  vector<MetalPlatformDevice> usable_devices;
+  MetalInfo::get_usable_devices(&usable_devices);
+  /* Devices are numbered consecutively across platforms. */
+  set<string> unique_ids;
+  int device_index = 0;
+  for (MetalPlatformDevice &device : usable_devices) {
+    /* Compute unique ID for persistent user preferences. */
+    const string &device_name = device.device_name;
+    string id = string("METAL_") + device_name;
+
+    /* Hardware ID might not be unique, add device number in that case. The suffix must be the
+     * per-device index: the total device count is the same for every device, so using it would
+     * give a third identical GPU the same ID as the second. */
+    if (unique_ids.find(id) != unique_ids.end()) {
+      id += string_printf("_ID_%d", device_index);
+    }
+    unique_ids.insert(id);
+
+    /* Create DeviceInfo. */
+    DeviceInfo info;
+    info.type = DEVICE_METAL;
+    info.description = string_remove_trademark(string(device_name));
+
+    /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and
+     * GPU */
+    if (info.description == system_cpu_brand_string()) {
+      info.description += " (GPU)";
+    }
+
+    info.num = device_index;
+    /* We don't know if it's used for display, but assume it is. */
+    info.display_device = true;
+    info.denoisers = DENOISER_NONE;
+    info.id = id;
+
+    devices.push_back(info);
+    device_index++;
+  }
+}
+
+/* Build a human-readable report listing the number of Metal devices and each device name.
+ * Returns "No Metal devices found\n" when the system reports none. */
+string device_metal_capabilities()
+{
+  /* Query the devices once, outside of any assert(): a call placed inside assert() disappears
+   * in NDEBUG builds, which would leave the count at zero. The array is returned retained
+   * ("Copy" rule) and is released on every exit path below. */
+  NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
+  const uint32_t num_devices = (uint32_t)allDevices.count;
+  if (num_devices == 0) {
+    [allDevices release];
+    return "No Metal devices found\n";
+  }
+
+  string result = "";
+  result += string_printf("Number of devices: %u\n", num_devices);
+
+  for (id<MTLDevice> device in allDevices) {
+    result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]);
+  }
+  [allDevices release];
+
+  return result;
+}
+
+#else
+
+/* Stub used when Cycles is built without Metal support: no device can be created. */
+Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+  return nullptr;
+}
+
+/* Stub used when Cycles is built without Metal support: initialization always fails. */
+bool device_metal_init()
+{
+  return false;
+}
+
+/* Stub used when Cycles is built without Metal support: leaves `devices` untouched. */
+void device_metal_info(vector<DeviceInfo> &devices)
+{
+}
+
+/* Stub used when Cycles is built without Metal support: there is nothing to report. */
+string device_metal_capabilities()
+{
+  return "";
+}
+
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+#  include "bvh/bvh.h"
+#  include "device/device.h"
+#  include "device/metal/bvh.h"
+#  include "device/metal/device.h"
+#  include "device/metal/kernel.h"
+#  include "device/metal/queue.h"
+#  include "device/metal/util.h"
+
+#  include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Cycles Device implementation backed by a Metal GPU.
+ *
+ * Holds the Metal device, its compiled libraries and kernels, the argument encoders, and the
+ * bookkeeping that maps Cycles `device_memory` objects to Metal buffers and textures. All
+ * members are public; the implementations live outside this header. */
+class MetalDevice : public Device {
+ public:
+  id<MTLDevice> mtlDevice = nil;
+  /* One compiled library per pipeline state object type (see PSO_* enum). */
+  id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
+  id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
+      nil; /* encoder used for fetching device pointers from MTLBuffers */
+  id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
+  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
+      nil; /* encoder used for fetching device pointers from MTLBuffers */
+  /* Kernel source text each library was compiled from, indexed like `mtlLibrary`. */
+  string source_used_for_compile[PSO_NUM];
+
+  KernelParamsMetal launch_params = {0};
+
+  /* MetalRT members ----------------------------------*/
+  BVHMetal *bvhMetalRT = nullptr;
+  bool motion_blur = false;
+  id<MTLArgumentEncoder> mtlASArgEncoder =
+      nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
+  /*---------------------------------------------------*/
+
+  string device_name;
+  MetalGPUVendor device_vendor;
+
+  /* NOTE(review): the members below have no default initializer in this header; presumably
+   * the constructor or load_kernels() assigns them - confirm. */
+  uint kernel_features;
+  MTLResourceOptions default_storage_mode;
+  int max_threads_per_threadgroup;
+
+  int mtlDevId = 0;
+  bool first_error = true;
+
+  /* Per-allocation record tying a Cycles `device_memory` to its Metal resource. */
+  struct MetalMem {
+    device_memory *mem = nullptr;
+    int pointer_index = -1;
+    id<MTLBuffer> mtlBuffer = nil;
+    id<MTLTexture> mtlTexture = nil;
+    uint64_t offset = 0;
+    uint64_t size = 0;
+    void *hostPtr = nullptr;
+    bool use_UMA = false; /* If true, UMA memory in shared_pointer is being used. */
+  };
+  typedef map<device_memory *, unique_ptr<MetalMem>> MetalMemMap;
+  MetalMemMap metal_mem_map;
+  /* Resources queued for later release; see flush_delayed_free_list(). */
+  std::vector<id<MTLResource>> delayed_free_list;
+  std::recursive_mutex metal_mem_map_mutex;
+
+  /* Bindless Textures */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
+  id<MTLBuffer> texture_bindings_2d = nil;
+  id<MTLBuffer> texture_bindings_3d = nil;
+  std::vector<id<MTLTexture>> texture_slot_map;
+
+  MetalDeviceKernels kernels;
+  bool use_metalrt = false;
+  bool use_function_specialisation = false;
+
+  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+  void set_error(const string &error) override;
+
+  MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+  virtual ~MetalDevice();
+
+  bool support_device(const uint /*kernel_features*/);
+
+  bool check_peer_access(Device *peer_device) override;
+
+  bool use_adaptive_compilation();
+
+  string get_source(const uint kernel_features);
+
+  string compile_kernel(const uint kernel_features, const char *name);
+
+  virtual bool load_kernels(const uint kernel_features) override;
+
+  void reserve_local_memory(const uint kernel_features);
+
+  void init_host_memory();
+
+  void load_texture_info();
+
+  virtual bool should_use_graphics_interop() override;
+
+  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+  /* ------------------------------------------------------------------ */
+  /* low-level memory management */
+
+  MetalMem *generic_alloc(device_memory &mem);
+
+  void generic_copy_to(device_memory &mem);
+
+  void generic_free(device_memory &mem);
+
+  void mem_alloc(device_memory &mem) override;
+
+  void mem_copy_to(device_memory &mem) override;
+
+  /* Convenience overload: forwards to the five-argument version with -1 for every size_t
+   * argument (i.e. SIZE_MAX after conversion). NOTE(review): presumably a "copy everything"
+   * sentinel - confirm against the implementation. */
+  void mem_copy_from(device_memory &mem)
+  {
+    mem_copy_from(mem, -1, -1, -1, -1);
+  }
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+  void mem_zero(device_memory &mem) override;
+
+  void mem_free(device_memory &mem) override;
+
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+  virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+  void global_alloc(device_memory &mem);
+
+  void global_free(device_memory &mem);
+
+  void tex_alloc(device_texture &mem);
+
+  void tex_alloc_as_buffer(device_texture &mem);
+
+  void tex_free(device_texture &mem);
+
+  void flush_delayed_free_list();
+};
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+#  include "device/kernel.h"
+#  include <Metal/Metal.h>
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+/* Indices of the MetalRT intersection functions. Used as the second index of
+ * `MetalDeviceKernels::rt_intersection_funcs`; METALRT_FUNC_NUM is the number of entries. */
+enum {
+  METALRT_FUNC_DEFAULT_TRI,
+  METALRT_FUNC_DEFAULT_BOX,
+  METALRT_FUNC_SHADOW_TRI,
+  METALRT_FUNC_SHADOW_BOX,
+  METALRT_FUNC_LOCAL_TRI,
+  METALRT_FUNC_LOCAL_BOX,
+  METALRT_FUNC_CURVE_RIBBON,
+  METALRT_FUNC_CURVE_RIBBON_SHADOW,
+  METALRT_FUNC_CURVE_ALL,
+  METALRT_FUNC_CURVE_ALL_SHADOW,
+  METALRT_FUNC_NUM
+};
+
+/* Indices of the per-pipeline intersection function tables
+ * (see `MetalKernelPipeline::intersection_func_table`). */
+enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+
+/* Pipeline State Object types */
+enum {
+  /* A kernel that can be used with all scenes, supporting all features.
+   * It is slow to compile, but only needs to be compiled once and is then
+   * cached for future render sessions. This allows a render to get underway
+   * on the GPU quickly.
+   */
+  PSO_GENERIC,
+
+  /* A kernel that is relatively quick to compile, but is specialized for the
+   * scene being rendered. It contains only the functionality the scene needs
+   * and has setting values baked in as constants, which means it must be
+   * recompiled whenever a dependent setting is changed. The render performance
+   * of this kernel is significantly faster though, and justifies the extra
+   * compile time.
+   */
+  /* METAL_WIP: This isn't used and will require more changes to enable. */
+  PSO_SPECIALISED,
+
+  PSO_NUM
+};
+
+const char *kernel_type_as_string(int kernel_type);
+
+/* The Metal objects that make up one compiled kernel variant: the entry-point function, the
+ * compute pipeline state built from it, and (on macOS 11+) the MetalRT intersection function
+ * tables created for that pipeline. */
+struct MetalKernelPipeline {
+  /* Release every Metal object owned by this pipeline and reset the handles to nil.
+   *
+   * Safe to call repeatedly: each handle is cleared after being released, so a second call
+   * never releases the same object twice. The `loaded` flag is left untouched. */
+  void release()
+  {
+    if (pipeline) {
+      [pipeline release];
+      pipeline = nil;
+    }
+    /* Release the tables whether or not a pipeline was present, and always clear the handles
+     * so that no dangling pointers are left behind. */
+    if (@available(macOS 11.0, *)) {
+      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+        if (intersection_func_table[i]) {
+          [intersection_func_table[i] release];
+          intersection_func_table[i] = nil;
+        }
+      }
+    }
+    if (function) {
+      [function release];
+      function = nil;
+    }
+  }
+
+  /* Set once the pipeline has been created successfully (see MetalDeviceKernel::mark_loaded). */
+  bool loaded = false;
+  id<MTLFunction> function = nil;
+  id<MTLComputePipelineState> pipeline = nil;
+
+  API_AVAILABLE(macos(11.0))
+  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
+};
+
+/* Parameter bundle describing one kernel to load: which PSO slot and kernel it belongs to, the
+ * entry-point name, the threadgroup size, optional function constants, and the MetalRT
+ * functions to link and to place into the intersection function tables. */
+struct MetalKernelLoadDesc {
+  int pso_index = 0;
+  const char *function_name = nullptr;
+  int kernel_index = 0;
+  int threads_per_threadgroup = 0;
+  MTLFunctionConstantValues *constant_values = nullptr;
+  NSArray *linked_functions = nullptr;
+
+  /* Function lists for the three intersection tables, addressable by METALRT_TABLE_* index. */
+  struct IntersectorFunctions {
+    NSArray *defaults;
+    NSArray *shadow;
+    NSArray *local;
+
+    /* Map a table index to its function list; any index other than DEFAULT or SHADOW yields
+     * the local list. */
+    NSArray *operator[](int index) const
+    {
+      switch (index) {
+        case METALRT_TABLE_DEFAULT:
+          return defaults;
+        case METALRT_TABLE_SHADOW:
+          return shadow;
+        default:
+          return local;
+      }
+    }
+  } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associated occupancy information.
+ * Holds one MetalKernelPipeline per PSO type and the threadgroup size derived at load time. */
+class MetalDeviceKernel {
+ public:
+  /* Releases the pipelines of every PSO slot. */
+  ~MetalDeviceKernel();
+
+  /* Create the function and compute pipeline described by `desc` for this kernel. */
+  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+
+  /* Flag the pipeline in slot `pso_index` as ready for use. */
+  void mark_loaded(int pso_index)
+  {
+    pso[pso_index].loaded = true;
+  }
+
+  int get_num_threads_per_block() const
+  {
+    return num_threads_per_block;
+  }
+  /* Returns the specialised pipeline when it is loaded, otherwise the generic one. */
+  const MetalKernelPipeline &get_pso() const;
+
+  /* Seconds spent loading the PSO_GENERIC pipeline (written by load()). */
+  double load_duration = 0.0;
+
+ private:
+  MetalKernelPipeline pso[PSO_NUM];
+
+  int num_threads_per_block = 0;
+};
+
+/* Cache of Metal kernels for each DeviceKernel. */
+class MetalDeviceKernels {
+ public:
+  /* Load all kernels for the given PSO type (`kernel_type` is a PSO_* value). */
+  bool load(MetalDevice *device, int kernel_type);
+  /* True when the active pipeline of `kernel` has an entry-point function. */
+  bool available(DeviceKernel kernel) const;
+  const MetalDeviceKernel &get(DeviceKernel kernel) const;
+
+  /* One entry per DeviceKernel, indexed by the DeviceKernel value. */
+  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+
+  /* MetalRT intersection functions, indexed by [PSO_*][METALRT_FUNC_*]. */
+  id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
+
+  /* Hash of the inputs the kernels of each PSO type were last loaded with; lets load() skip
+   * pipeline builds when nothing changed. */
+  string loaded_md5[PSO_NUM];
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -0,0 +1,525 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "device/metal/kernel.h"
+#  include "device/metal/device_impl.h"
+#  include "util/md5.h"
+#  include "util/path.h"
+#  include "util/tbb.h"
+#  include "util/time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* limit to 2 MTLCompiler instances */
+int max_mtlcompiler_threads = 2;
+
+/* Return the printable name of a PSO_* pipeline type.
+ * Unknown values trip an assert in debug builds and yield an empty string otherwise. */
+const char *kernel_type_as_string(int kernel_type)
+{
+  if (kernel_type == PSO_GENERIC) {
+    return "PSO_GENERIC";
+  }
+  if (kernel_type == PSO_SPECIALISED) {
+    return "PSO_SPECIALISED";
+  }
+
+  /* Not a known pipeline type. */
+  assert(0);
+  return "";
+}
+
+/* Release the Metal objects of every pipeline slot owned by this kernel. */
+MetalDeviceKernel::~MetalDeviceKernel()
+{
+  for (MetalKernelPipeline &pipeline : pso) {
+    pipeline.release();
+  }
+}
+
+/* Create the entry-point function and compute pipeline state for one kernel.
+ *
+ * Unless the CYCLES_METAL_DISABLE_BINARY_ARCHIVES environment variable is set, the pipeline is
+ * loaded from (or compiled into) a Metal binary archive stored in the "kernels" cache
+ * directory. The archive file name is derived from the function name, `md5` combined with the
+ * OS version string, and the PSO index.
+ *
+ * PSO_SPECIALISED pipelines are created asynchronously on a global dispatch queue; every other
+ * PSO type blocks until the pipeline is ready.
+ *
+ * \param device: Device providing the Metal device, the compiled libraries and error reporting.
+ * \param desc_in: Description of the kernel to load.
+ * \param md5: Hash of the kernel inputs, used to key the binary archive.
+ * \return false if the function lookup or a synchronous pipeline creation failed (the error is
+ * also reported through `device->set_error`). Returns true for the megakernel (skipped) and for
+ * asynchronous loads, whose outcome is not known yet when this function returns. */
+bool MetalDeviceKernel::load(MetalDevice *device,
+                             MetalKernelLoadDesc const &desc_in,
+                             MD5Hash const &md5)
+{
+  __block MetalKernelLoadDesc const desc(desc_in);
+  if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+    /* skip megakernel */
+    return true;
+  }
+
+  bool use_binary_archive = true;
+  if (getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+    use_binary_archive = false;
+  }
+
+  id<MTLBinaryArchive> archive = nil;
+  string metalbin_path;
+  if (use_binary_archive) {
+    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
+    MD5Hash local_md5(md5);
+    local_md5.append(osVersion);
+    string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
+                           to_string(desc.pso_index) + ".bin";
+    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
+    path_create_directories(metalbin_path);
+
+    if (path_exists(metalbin_path)) {
+      if (@available(macOS 11.0, *)) {
+        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
+        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        [archiveDesc release];
+      }
+    }
+  }
+
+  NSString *entryPoint = [@(desc.function_name) copy];
+
+  NSError *error = NULL;
+  if (@available(macOS 11.0, *)) {
+    MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+    func_desc.name = entryPoint;
+    if (desc.constant_values) {
+      func_desc.constantValues = desc.constant_values;
+    }
+    pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
+        newFunctionWithDescriptor:func_desc
+                            error:&error];
+  }
+  [entryPoint release];
+
+  if (pso[desc.pso_index].function == nil) {
+    /* `error` stays NULL when the lookup above was skipped or failed without details, in which
+     * case the description is nil; never build a std::string from a null pointer. */
+    NSString *err = [error localizedDescription];
+    string errors = err ? [err UTF8String] : "nil";
+
+    device->set_error(
+        string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
+    return false;
+  }
+
+  /* `label` is a copy property, so an autoreleased string is enough; an extra `copy` here
+   * would never be released. */
+  pso[desc.pso_index].function.label = @(desc.function_name);
+
+  __block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
+      [[MTLComputePipelineDescriptor alloc] init];
+
+  computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
+  computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
+  computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
+
+  if (@available(macos 10.14, *)) {
+    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
+  }
+  computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
+
+  computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
+  if (@available(macOS 11.0, *)) {
+    /* Attach the additional functions to an MTLLinkedFunctions object */
+    if (desc.linked_functions) {
+      /* The descriptor copies the object on assignment, so our own reference is dropped
+       * right afterwards instead of being leaked. */
+      MTLLinkedFunctions *linked_functions = [[MTLLinkedFunctions alloc] init];
+      linked_functions.functions = desc.linked_functions;
+      computePipelineStateDescriptor.linkedFunctions = linked_functions;
+      [linked_functions release];
+    }
+
+    computePipelineStateDescriptor.maxCallStackDepth = 1;
+  }
+
+  /* Create a new Compute pipeline state object */
+  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
+
+  bool creating_new_archive = false;
+  if (@available(macOS 11.0, *)) {
+    if (use_binary_archive) {
+      if (!archive) {
+        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+        archiveDesc.url = nil;
+        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        [archiveDesc release];
+        creating_new_archive = true;
+
+        double starttime = time_dt();
+
+        if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+                                                          error:&error]) {
+          NSString *errStr = [error localizedDescription];
+          metal_printf("Failed to add PSO to archive:\n%s\n",
+                       errStr ? [errStr UTF8String] : "nil");
+        }
+        else {
+          double duration = time_dt() - starttime;
+          metal_printf("%2d | %-55s | %7.2fs\n",
+                       desc.kernel_index,
+                       device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                       duration);
+
+          if (desc.pso_index == PSO_GENERIC) {
+            this->load_duration = duration;
+          }
+        }
+      }
+      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
+      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
+    }
+  }
+
+  double starttime = time_dt();
+
+  /* Set by the completion handler once the pipeline exists; lets the synchronous path report
+   * failure to the caller. */
+  __block bool pipeline_created = false;
+
+  MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
+      id<MTLComputePipelineState> computePipelineState,
+      MTLComputePipelineReflection *reflection,
+      NSError *error) {
+    bool recreate_archive = false;
+    if (computePipelineState == nil && archive && !creating_new_archive) {
+
+      /* NOTE(review): this stops debug builds before the recovery below can run; it looks
+       * like a deliberate development trap - confirm it is still wanted. */
+      assert(0);
+
+      NSString *errStr = [error localizedDescription];
+      metal_printf(
+          "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
+          "(error: %s)\n",
+          device_kernel_as_string((DeviceKernel)desc.kernel_index),
+          errStr ? [errStr UTF8String] : "nil");
+      computePipelineState = [device->mtlDevice
+          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                        options:MTLPipelineOptionNone
+                                     reflection:nullptr
+                                          error:&error];
+      recreate_archive = true;
+    }
+
+    double duration = time_dt() - starttime;
+
+    if (computePipelineState == nil) {
+      NSString *errStr = [error localizedDescription];
+      device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
+                                      device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
+                        (errStr ? [errStr UTF8String] : "nil"));
+      metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
+                   desc.kernel_index,
+                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                   duration);
+
+      /* Nothing uses the descriptor after a failure; release it here as on the success path. */
+      [computePipelineStateDescriptor release];
+      computePipelineStateDescriptor = nil;
+      return;
+    }
+
+    pso[desc.pso_index].pipeline = computePipelineState;
+    num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                       computePipelineState.threadExecutionWidth);
+    num_threads_per_block = std::max(num_threads_per_block,
+                                     (int)computePipelineState.threadExecutionWidth);
+
+    if (!use_binary_archive) {
+      metal_printf("%2d | %-55s | %7.2fs\n",
+                   desc.kernel_index,
+                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
+                   duration);
+
+      if (desc.pso_index == PSO_GENERIC) {
+        this->load_duration = duration;
+      }
+    }
+
+    if (@available(macOS 11.0, *)) {
+      if (creating_new_archive || recreate_archive) {
+        if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
+                               error:&error]) {
+          metal_printf("Failed to save binary archive, error:\n%s\n",
+                       [[error localizedDescription] UTF8String]);
+        }
+      }
+    }
+
+    [computePipelineStateDescriptor release];
+    computePipelineStateDescriptor = nil;
+
+    if (device->use_metalrt && desc.linked_functions) {
+      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+        if (@available(macOS 11.0, *)) {
+          MTLIntersectionFunctionTableDescriptor *ift_desc =
+              [[MTLIntersectionFunctionTableDescriptor alloc] init];
+          ift_desc.functionCount = desc.intersector_functions[table].count;
+
+          pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
+              newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+          /* Finally write the function handles into this pipeline's table */
+          for (int i = 0; i < 2; i++) {
+            id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
+                functionHandleWithFunction:desc.intersector_functions[table][i]];
+            [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
+          }
+        }
+      }
+    }
+
+    pipeline_created = true;
+    mark_loaded(desc.pso_index);
+  };
+
+  if (desc.pso_index == PSO_SPECIALISED) {
+    /* Asynchronous load */
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+      /* Initialized so the handler never sees an indeterminate pointer. */
+      NSError *error = nil;
+      id<MTLComputePipelineState> pipeline = [device->mtlDevice
+          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                        options:pipelineOptions
+                                     reflection:nullptr
+                                          error:&error];
+      completionHandler(pipeline, nullptr, error);
+    });
+
+    /* The outcome is not known yet; failures are reported through `device->set_error`. */
+    return true;
+  }
+
+  /* Block on load to ensure we continue with a valid kernel function */
+  id<MTLComputePipelineState> pipeline = [device->mtlDevice
+      newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                    options:pipelineOptions
+                                 reflection:nullptr
+                                      error:&error];
+  completionHandler(pipeline, nullptr, error);
+
+  return pipeline_created;
+}
+
+/* Pick the pipeline to launch with: the specialised one once it has finished loading,
+ * otherwise the generic one (which is asserted to be loaded). */
+const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
+{
+  const MetalKernelPipeline &specialised = pso[PSO_SPECIALISED];
+  if (!specialised.loaded) {
+    const MetalKernelPipeline &generic = pso[PSO_GENERIC];
+    assert(generic.loaded);
+    return generic;
+  }
+  return specialised;
+}
+
+/* Load every kernel for one pipeline state object type.
+ *
+ * When MetalRT is enabled, the intersection functions are first looked up in the device
+ * library for `kernel_type` and grouped into the default / shadow / local tables. The kernel
+ * pipelines are then built in parallel, limited to `max_mtlcompiler_threads` workers. If the
+ * hash of the compile source matches the hash recorded by the previous successful load, the
+ * pipeline builds are skipped.
+ *
+ * \param device: Device that owns the compiled libraries and receives error reports.
+ * \param kernel_type: PSO_GENERIC or PSO_SPECIALISED.
+ * \return true when the kernels are loaded (or were already up to date), false on error. */
+bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
+{
+  bool any_error = false;
+
+  MD5Hash md5;
+
+  /* Build the function constant table */
+  MTLFunctionConstantValues *constant_values = nullptr;
+  if (kernel_type == PSO_SPECIALISED) {
+    constant_values = [MTLFunctionConstantValues new];
+
+#  define KERNEL_FILM(_type, name) \
+    [constant_values setConstantValue:&data.film.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_film_##name]; \
+    md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
+
+#  define KERNEL_BACKGROUND(_type, name) \
+    [constant_values setConstantValue:&data.background.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_background_##name]; \
+    md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
+
+#  define KERNEL_INTEGRATOR(_type, name) \
+    [constant_values setConstantValue:&data.integrator.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_integrator_##name]; \
+    md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
+
+#  define KERNEL_BVH(_type, name) \
+    [constant_values setConstantValue:&data.bvh.name \
+                                 type:get_MTLDataType_##_type() \
+                              atIndex:KernelData_bvh_##name]; \
+    md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
+
+    /* METAL_WIP: populate constant_values based on KernelData */
+    assert(0);
+    /*
+        const KernelData &data = device->launch_params.data;
+    #    include "kernel/types/background.h"
+    #    include "kernel/types/bvh.h"
+    #    include "kernel/types/film.h"
+    #    include "kernel/types/integrator.h"
+    */
+  }
+
+  if (device->use_metalrt) {
+    if (@available(macOS 11.0, *)) {
+      /* create the id<MTLFunction> for each intersection function */
+      const char *function_names[] = {
+          "__anyhit__cycles_metalrt_visibility_test_tri",
+          "__anyhit__cycles_metalrt_visibility_test_box",
+          "__anyhit__cycles_metalrt_shadow_all_hit_tri",
+          "__anyhit__cycles_metalrt_shadow_all_hit_box",
+          "__anyhit__cycles_metalrt_local_hit_tri",
+          "__anyhit__cycles_metalrt_local_hit_box",
+          "__intersection__curve_ribbon",
+          "__intersection__curve_ribbon_shadow",
+          "__intersection__curve_all",
+          "__intersection__curve_all_shadow",
+      };
+      /* The name table must stay in sync with the METALRT_FUNC_* enum; check it at compile
+       * time so a mismatch cannot slip through in release builds. */
+      static_assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM,
+                    "function_names must have one entry per METALRT_FUNC_* value");
+
+      MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+      if (kernel_type == PSO_SPECIALISED) {
+        desc.constantValues = constant_values;
+      }
+      for (int i = 0; i < METALRT_FUNC_NUM; i++) {
+        const char *function_name = function_names[i];
+        /* `name` and `label` are copy properties: assigning an autoreleased string is enough,
+         * whereas an explicit `copy` here would never be released. */
+        desc.name = @(function_name);
+
+        NSError *error = NULL;
+        rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
+            newFunctionWithDescriptor:desc
+                                error:&error];
+
+        if (rt_intersection_funcs[kernel_type][i] == nil) {
+          NSString *err = [error localizedDescription];
+          string errors = [err UTF8String];
+
+          device->set_error(string_printf(
+              "Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
+          any_error = true;
+          break;
+        }
+
+        rt_intersection_funcs[kernel_type][i].label = @(function_name);
+      }
+    }
+  }
+  md5.append(device->source_used_for_compile[kernel_type]);
+
+  /* Skip the pipeline builds when the source is unchanged since the last successful load. */
+  string hash = md5.get_hex();
+  if (loaded_md5[kernel_type] == hash) {
+    return true;
+  }
+
+  if (!any_error) {
+    NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+    NSArray *function_list = nil;
+
+    if (device->use_metalrt) {
+      id<MTLFunction> box_intersect_default = nil;
+      id<MTLFunction> box_intersect_shadow = nil;
+      if (device->kernel_features & KERNEL_FEATURE_HAIR) {
+        /* Add curve intersection programs. */
+        if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+          /* Slower programs for thick hair since that also slows down ribbons.
+           * Ideally this should not be needed. */
+          box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
+          box_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
+        }
+        else {
+          box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
+          box_intersect_shadow =
+              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
+        }
+      }
+      /* Each table holds a triangle function followed by a box function; with hair enabled
+       * the curve functions take the box slot of the default and shadow tables. */
+      table_functions[METALRT_TABLE_DEFAULT] = [NSArray
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
+                           box_intersect_default ?
+                               box_intersect_default :
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
+                           nil];
+      table_functions[METALRT_TABLE_SHADOW] = [NSArray
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
+                           box_intersect_shadow ?
+                               box_intersect_shadow :
+                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
+                           nil];
+      table_functions[METALRT_TABLE_LOCAL] = [NSArray
+          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
+                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+                           nil];
+
+      NSMutableSet *unique_functions = [NSMutableSet
+          setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
+      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
+      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+
+      /* Sort by label so the linked function list has a stable order. */
+      function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
+          sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
+            return [f1.label compare:f2.label];
+          }];
+
+      unique_functions = nil;
+    }
+
+    metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
+                 kernel_type_as_string(kernel_type));
+
+    tbb::task_arena local_arena(max_mtlcompiler_threads);
+    local_arena.execute([&]() {
+      tbb::parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
+        /* skip megakernel */
+        if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+          return;
+        }
+
+        /* Only specialize kernels where it can make an impact. */
+        if (kernel_type == PSO_SPECIALISED) {
+          if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+              i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+            return;
+          }
+        }
+
+        MetalDeviceKernel &kernel = kernels_[i];
+
+        const std::string function_name = std::string("cycles_metal_") +
+                                          device_kernel_as_string((DeviceKernel)i);
+        int threads_per_threadgroup = device->max_threads_per_threadgroup;
+        if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
+          /* Always use 512 for the sorting kernels */
+          threads_per_threadgroup = 512;
+        }
+
+        NSArray *kernel_function_list = nil;
+
+        /* Only these kernels get the intersection functions linked in. */
+        if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+            i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+          kernel_function_list = function_list;
+        }
+
+        MetalKernelLoadDesc desc;
+        desc.pso_index = kernel_type;
+        desc.kernel_index = i;
+        desc.linked_functions = kernel_function_list;
+        desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
+        desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
+        desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
+        desc.constant_values = constant_values;
+        desc.threads_per_threadgroup = threads_per_threadgroup;
+        desc.function_name = function_name.c_str();
+
+        bool success = kernel.load(device, desc, md5);
+
+        /* NOTE(review): `any_error` is a plain bool updated from parallel workers without
+         * synchronization - consider an atomic flag. */
+        any_error |= !success;
+      });
+    });
+  }
+
+  bool loaded = !any_error;
+  if (loaded) {
+    loaded_md5[kernel_type] = hash;
+  }
+  return loaded;
+}
+
+/* Look up the cached kernel object for `kernel`. */
+const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
+{
+  const int kernel_index = static_cast<int>(kernel);
+  return kernels_[kernel_index];
+}
+
+/* A kernel counts as available when its active pipeline has an entry-point function. */
+bool MetalDeviceKernels::available(DeviceKernel kernel) const
+{
+  const MetalKernelPipeline &active_pso = kernels_[(int)kernel].get_pso();
+  return active_pso.function != nil;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL*/
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+#  include "device/kernel.h"
+#  include "device/memory.h"
+#  include "device/queue.h"
+
+#  include "device/metal/util.h"
+#  include "kernel/device/metal/globals.h"
+
+#  define metal_printf VLOG(4) << string_printf
+
+CCL_NAMESPACE_BEGIN
+
+class MetalDevice;
+
+/* Metal implementation of DeviceQueue.
+ *
+ * Work is encoded into a single in-flight MTLCommandBuffer through either a compute encoder or
+ * a blit encoder (requesting one closes the other) and is committed and waited on in
+ * synchronize(). */
+class MetalDeviceQueue : public DeviceQueue {
+ public:
+  MetalDeviceQueue(MetalDevice *device);
+  ~MetalDeviceQueue();
+
+  /* Queue sizing hints; the implementation scales fixed constants by GPU vendor. */
+  virtual int num_concurrent_states(const size_t) const override;
+  virtual int num_concurrent_busy_states() const override;
+
+  /* Loads texture info on the device and then synchronizes. */
+  virtual void init_execution() override;
+
+  /* Encode a launch of `kernel` for `work_size` threads. Returns false on device error. */
+  virtual bool enqueue(DeviceKernel kernel,
+                       const int work_size,
+                       DeviceKernelArguments const &args) override;
+
+  /* Commit the pending command buffer (if any) and block until it has completed. */
+  virtual bool synchronize() override;
+
+  virtual void zero_to_device(device_memory &mem) override;
+  virtual void copy_to_device(device_memory &mem) override;
+  virtual void copy_from_device(device_memory &mem) override;
+
+  virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ protected:
+  /* Declare to the open compute encoder every resource tracked by the device. */
+  void prepare_resources(DeviceKernel kernel);
+
+  /* Return the open encoder of the requested kind, creating the command buffer and/or encoder
+   * on demand. */
+  id<MTLComputeCommandEncoder> get_compute_encoder(DeviceKernel kernel);
+  id<MTLBlitCommandEncoder> get_blit_encoder();
+
+  MetalDevice *metal_device;
+  /* Pool of temporary buffers tied to the lifetime of a command buffer. */
+  MetalBufferPool temp_buffer_pool;
+
+  API_AVAILABLE(macos(11.0), ios(14.0))
+  MTLCommandBufferDescriptor *command_buffer_desc = nullptr;
+  id<MTLDevice> mtlDevice = nil;
+  id<MTLCommandQueue> mtlCommandQueue = nil;
+  id<MTLCommandBuffer> mtlCommandBuffer = nil;
+  id<MTLComputeCommandEncoder> mtlComputeEncoder = nil;
+  id<MTLBlitCommandEncoder> mtlBlitEncoder = nil;
+  API_AVAILABLE(macos(10.14), ios(14.0))
+  id<MTLSharedEvent> shared_event = nil;
+  API_AVAILABLE(macos(10.14), ios(14.0))
+  MTLSharedEventListener *shared_event_listener = nil;
+
+  /* Both dispatch objects default to nil so they hold a defined value even on code paths
+   * where the constructor does not create them. */
+  dispatch_queue_t event_queue = nil;
+  dispatch_semaphore_t wait_semaphore = nil;
+
+  /* Host copies that must be performed once the pending command buffer has completed. */
+  struct CopyBack {
+    void *host_pointer;
+    void *gpu_mem;
+    uint64_t size;
+  };
+  std::vector<CopyBack> copy_back_mem;
+
+  /* Next value to signal on `shared_event`. Defaulted here because the constructor only
+   * assigns it when the shared-event API is available, while synchronize() always reads it. */
+  uint64_t shared_event_id = 1;
+  uint64_t command_buffers_submitted = 0;
+  uint64_t command_buffers_completed = 0;
+  Stats &stats;
+
+  /* End encoding on the respective encoder and forget it. */
+  void close_compute_encoder();
+  void close_blit_encoder();
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -0,0 +1,610 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "device/metal/queue.h"
+
+#  include "device/metal/device_impl.h"
+#  include "device/metal/kernel.h"
+
+#  include "util/path.h"
+#  include "util/string.h"
+#  include "util/time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* MetalDeviceQueue */
+
+/* Create the Metal objects owned by this queue: the command queue, and where the OS supports
+ * them a command-buffer descriptor with extended error reporting and a shared event with its
+ * listener. */
+MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
+    : DeviceQueue(device), metal_device(device), stats(device->stats)
+{
+  if (@available(macos 11.0, *)) {
+    /* Request per-encoder execution status in command buffer errors. */
+    command_buffer_desc = [[MTLCommandBufferDescriptor alloc] init];
+    [command_buffer_desc setErrorOptions:MTLCommandBufferErrorOptionEncoderExecutionStatus];
+  }
+
+  mtlDevice = device->mtlDevice;
+  mtlCommandQueue = [mtlDevice newCommandQueue];
+
+  if (@available(macos 10.14, *)) {
+    /* First value that will be signaled on the shared event. */
+    shared_event_id = 1;
+    shared_event = [mtlDevice newSharedEvent];
+
+    /* The listener delivers shared event notifications on this dispatch queue. */
+    event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
+    shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue];
+  }
+
+  /* Starts at zero so that synchronize() blocks until the notification block signals it. */
+  wait_semaphore = dispatch_semaphore_create(0);
+}
+
+/* Release the Metal and dispatch objects created by the constructor. The queue is expected to
+ * be idle: no pending command buffer may remain. */
+MetalDeviceQueue::~MetalDeviceQueue()
+{
+  /* Tidying up here isn't really practical - we should expect and require the work
+   * queue to be empty here. */
+  assert(mtlCommandBuffer == nil);
+  assert(command_buffers_submitted == command_buffers_completed);
+
+  if (@available(macos 10.14, *)) {
+    [shared_event_listener release];
+    [shared_event release];
+
+    /* The dispatch queue is created in the matching availability branch of the constructor
+     * and is owned by this object under manual reference counting. */
+    if (event_queue) {
+      dispatch_release(event_queue);
+      event_queue = nil;
+    }
+  }
+
+  /* The semaphore is created unconditionally by the constructor. */
+  if (wait_semaphore) {
+    dispatch_release(wait_semaphore);
+    wait_semaphore = nil;
+  }
+
+  if (@available(macos 11.0, *)) {
+    [command_buffer_desc release];
+  }
+  if (mtlCommandQueue) {
+    [mtlCommandQueue release];
+    mtlCommandQueue = nil;
+  }
+}
+
+/* Number of path states to run concurrently: a fixed base count scaled by GPU vendor
+ * (x2 for AMD, x4 for Apple, x1 otherwise). The state size argument is ignored. */
+int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+{
+  /* METAL_WIP */
+  /* TODO: compute automatically. */
+  /* TODO: must have at least num_threads_per_block. */
+  const int base_states = 1048576;
+
+  int vendor_scale = 1;
+  if (metal_device->device_vendor == METAL_GPU_AMD) {
+    vendor_scale = 2;
+  }
+  else if (metal_device->device_vendor == METAL_GPU_APPLE) {
+    vendor_scale = 4;
+  }
+
+  return base_states * vendor_scale;
+}
+
+/* Number of busy states: a fixed base count scaled by GPU vendor (x2 for AMD, x4 for Apple,
+ * x1 otherwise). */
+int MetalDeviceQueue::num_concurrent_busy_states() const
+{
+  /* METAL_WIP */
+  /* TODO: compute automatically. */
+  const int base_states = 65536;
+
+  int vendor_scale = 1;
+  if (metal_device->device_vendor == METAL_GPU_AMD) {
+    vendor_scale = 2;
+  }
+  else if (metal_device->device_vendor == METAL_GPU_APPLE) {
+    vendor_scale = 4;
+  }
+
+  return base_states * vendor_scale;
+}
+
+/* Prepare the queue for a task: load texture info on the device, then flush and wait for any
+ * work that is already pending. */
+void MetalDeviceQueue::init_execution()
+{
+  /* Texture info must be loaded before the pending copies are flushed below. */
+  metal_device->load_texture_info();
+
+  /* The return value is not checked here, matching the void interface. */
+  synchronize();
+}
+
+/* Encode one launch of `kernel` covering `work_size` threads into the current command buffer.
+ *
+ * All launch data is packed into one temporary MTLBuffer made of three sections: the plain
+ * "enqueue" arguments at offset 0, a KernelParamsMetal block at `globals_offsets`, and the
+ * ancillary argument-encoder data at `metal_offsets`. The same buffer is then bound at buffer
+ * indices 0, 1 and 2 using those offsets.
+ *
+ * Nothing is submitted here; the command buffer is committed in synchronize().
+ * Returns false if the device reports an error on entry or on exit. */
+bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
+                               const int work_size,
+                               DeviceKernelArguments const &args)
+{
+  if (metal_device->have_error()) {
+    return false;
+  }
+
+  VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
+          << work_size;
+
+  const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
+  const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
+
+  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
+
+  /* Determine size requirement for argument buffer. */
+  /* Each argument is placed at an offset rounded up to a multiple of its own size. */
+  size_t arg_buffer_length = 0;
+  for (size_t i = 0; i < args.count; i++) {
+    size_t size_in_bytes = args.sizes[i];
+    arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
+  }
+  /* 256 is the Metal offset alignment for constant address space bindings */
+  arg_buffer_length = round_up(arg_buffer_length, 256);
+
+  /* Globals placed after "vanilla" arguments. */
+  size_t globals_offsets = arg_buffer_length;
+  arg_buffer_length += sizeof(KernelParamsMetal);
+  arg_buffer_length = round_up(arg_buffer_length, 256);
+
+  /* Metal ancillary bindless pointers. */
+  size_t metal_offsets = arg_buffer_length;
+  arg_buffer_length += metal_device->mtlAncillaryArgEncoder.encodedLength;
+  arg_buffer_length = round_up(arg_buffer_length, metal_device->mtlAncillaryArgEncoder.alignment);
+
+  /* Temporary buffer used to prepare arg_buffer */
+  /* Stack-allocated and zero-filled, so padding and pointer slots start out as zero. */
+  uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
+  memset(init_arg_buffer, 0, arg_buffer_length);
+
+  /* Prepare the non-pointer "enqueue" arguments */
+  /* Pointer arguments are skipped here (their slots still advance the offset); they are
+   * written through the argument encoder further below. */
+  size_t bytes_written = 0;
+  for (size_t i = 0; i < args.count; i++) {
+    size_t size_in_bytes = args.sizes[i];
+    bytes_written = round_up(bytes_written, size_in_bytes);
+    if (args.types[i] != DeviceKernelArguments::POINTER) {
+      memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
+    }
+    bytes_written += size_in_bytes;
+  }
+
+  /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
+  /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
+  size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) +
+                                        sizeof(IntegratorStateGPU);
+  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
+  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
+         (uint8_t *)&metal_device->launch_params + plain_old_launch_data_offset,
+         plain_old_launch_data_size);
+
+  /* Allocate an argument buffer. */
+  /* Managed storage by default; shared storage when the device reports unified memory. */
+  MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
+  if (@available(macOS 11.0, *)) {
+    if ([mtlDevice hasUnifiedMemory]) {
+      arg_buffer_options = MTLResourceStorageModeShared;
+    }
+  }
+
+  /* The pool associates the buffer with the current command buffer and initializes it from
+   * `init_arg_buffer`. */
+  id<MTLBuffer> arg_buffer = temp_buffer_pool.get_buffer(
+      mtlDevice, mtlCommandBuffer, arg_buffer_length, arg_buffer_options, init_arg_buffer, stats);
+
+  /* Encode the pointer "enqueue" arguments */
+  /* Offsets are recomputed with the same rounding as in the non-pointer pass above. */
+  bytes_written = 0;
+  for (size_t i = 0; i < args.count; i++) {
+    size_t size_in_bytes = args.sizes[i];
+    bytes_written = round_up(bytes_written, size_in_bytes);
+    if (args.types[i] == DeviceKernelArguments::POINTER) {
+      [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
+                                                             offset:bytes_written];
+      /* The argument value is read as a pointer to a MetalMem record. */
+      if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
+        [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
+                                        usage:MTLResourceUsageRead | MTLResourceUsageWrite];
+        [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
+      }
+      else {
+        /* A nil buffer is only encoded on macOS 12+; on older systems the slot keeps the zero
+         * written by the memset above. */
+        if (@available(macos 12.0, *)) {
+          [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
+        }
+      }
+    }
+    bytes_written += size_in_bytes;
+  }
+
+  /* Encode KernelParamsMetal buffers */
+  [metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
+
+  /* this relies on IntegratorStateGPU layout being contiguous device_ptrs  */
+  /* launch_params is walked as an array of device_ptr-sized slots, from its start up to the
+   * end of __integrator_state; each slot is read as a pointer to a MetalMem record. */
+  const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
+                                   sizeof(IntegratorStateGPU);
+  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
+    int pointer_index = offset / sizeof(device_ptr);
+    MetalDevice::MetalMem *mmem = *(
+        MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
+    /* NOTE(review): a record with only mtlTexture set takes this branch and encodes its
+     * (nil) mtlBuffer - confirm this is intended. */
+    if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
+      [metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
+                                                     offset:0
+                                                    atIndex:pointer_index];
+    }
+    else {
+      if (@available(macos 12.0, *)) {
+        [metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index];
+      }
+    }
+  }
+  /* From here on `bytes_written` tracks the extent of the buffer modified by the CPU. */
+  bytes_written = globals_offsets + sizeof(KernelParamsMetal);
+
+  /* Encode ancillaries */
+  /* Indices used below: 0 and 1 are the 2D/3D texture binding buffers, 2 is the acceleration
+   * structure, 3 + table are the intersection function tables. */
+  [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
+  [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
+                                           offset:0
+                                          atIndex:0];
+  [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_3d
+                                           offset:0
+                                          atIndex:1];
+  if (@available(macos 12.0, *)) {
+    if (metal_device->use_metalrt) {
+      if (metal_device->bvhMetalRT) {
+        id<MTLAccelerationStructure> accel_struct = metal_device->bvhMetalRT->accel_struct;
+        [metal_device->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
+      }
+
+      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+        if (metal_kernel_pso.intersection_func_table[table]) {
+          /* The function table itself is given the globals section of the argument buffer. */
+          [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
+                                                              offset:globals_offsets
+                                                             atIndex:1];
+          [metal_device->mtlAncillaryArgEncoder
+              setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
+                                   atIndex:3 + table];
+          [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
+                                          usage:MTLResourceUsageRead];
+        }
+        else {
+          [metal_device->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
+                                                                     atIndex:3 + table];
+        }
+      }
+    }
+    bytes_written = metal_offsets + metal_device->mtlAncillaryArgEncoder.encodedLength;
+  }
+
+  /* Managed buffers need the CPU-written range to be flagged explicitly. */
+  if (arg_buffer.storageMode == MTLStorageModeManaged) {
+    [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
+  }
+
+  /* Bind the three sections of the argument buffer. */
+  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
+  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
+  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
+
+  if (metal_device->use_metalrt) {
+    if (@available(macos 12.0, *)) {
+
+      /* Only the kernels listed below keep the BVH; for all others it is cleared so that no
+       * acceleration structure resources are declared. */
+      auto bvhMetalRT = metal_device->bvhMetalRT;
+      switch (kernel) {
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+          break;
+        default:
+          bvhMetalRT = nil;
+          break;
+      }
+
+      if (bvhMetalRT) {
+        /* Mark all Accelerations resources as used */
+        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
+                                         count:bvhMetalRT->blas_array.size()
+                                         usage:MTLResourceUsageRead];
+      }
+    }
+  }
+
+  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
+
+  /* Compute kernel launch parameters. */
+  const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
+
+  int shared_mem_bytes = 0;
+
+  /* Only the path-array kernels below get threadgroup memory. */
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+      /* See parallel_active_index.h for why this amount of shared memory is needed.
+       * Rounded up to 16 bytes for Metal */
+      shared_mem_bytes = round_up((num_threads_per_block + 1) * sizeof(int), 16);
+      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
+      break;
+
+    default:
+      break;
+  }
+
+  /* One-dimensional dispatch: enough threadgroups of `num_threads_per_block` threads to cover
+   * `work_size`. */
+  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
+      divide_up(work_size, num_threads_per_block), 1, 1);
+  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
+  [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
+                           threadsPerThreadgroup:size_threads_per_threadgroup];
+
+  /* A completion handler is added on every enqueue call; it only reports failures and names
+   * the kernel encoded by this call. */
+  [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
+    NSString *kernel_name = metal_kernel_pso.function.label;
+
+    /* Enhanced command buffer errors are only available in 11.0+ */
+    if (@available(macos 11.0, *)) {
+      if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
+        printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+        NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
+            valueForKey:MTLCommandBufferEncoderInfoErrorKey];
+        if (encoderInfos != nil) {
+          for (id<MTLCommandBufferEncoderInfo> encoderInfo : encoderInfos) {
+            NSLog(@"%@", encoderInfo);
+          }
+        }
+        id<MTLLogContainer> logs = command_buffer.logs;
+        for (id<MTLFunctionLog> log in logs) {
+          NSLog(@"%@", log);
+        }
+      }
+      else if (command_buffer.error) {
+        printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+      }
+    }
+  }];
+
+  return !(metal_device->have_error());
+}
+
+/* Close any open encoder, commit the pending command buffer and block until the GPU has
+ * signaled its completion. Afterwards the deferred host copies are performed, temporary
+ * buffers are returned to the pool and the device's delayed free list is flushed.
+ *
+ * Returns false if the device reports an error on entry or on exit. */
+bool MetalDeviceQueue::synchronize()
+{
+  if (metal_device->have_error()) {
+    return false;
+  }
+
+  if (mtlComputeEncoder) {
+    close_compute_encoder();
+  }
+  close_blit_encoder();
+
+  if (mtlCommandBuffer) {
+    /* Value to signal for this submission; the member is advanced for the next one. */
+    const uint64_t signal_value = shared_event_id++;
+
+    /* NOTE(review): when this availability check fails the command buffer is never committed
+     * and is simply dropped below - confirm that configuration is unsupported. */
+    if (@available(macos 10.14, *)) {
+      __block dispatch_semaphore_t block_sema = wait_semaphore;
+      [shared_event notifyListener:shared_event_listener
+                           atValue:signal_value
+                             block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
+                               dispatch_semaphore_signal(block_sema);
+                             }];
+
+      /* The event is signaled by the command buffer itself, after the work encoded before
+       * this point. */
+      [mtlCommandBuffer encodeSignalEvent:shared_event value:signal_value];
+      [mtlCommandBuffer commit];
+      dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
+    }
+
+    /* Perform the host copies that copy_from_device() deferred until completion. */
+    for (const CopyBack &mmem : copy_back_mem) {
+      memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
+    }
+    copy_back_mem.clear();
+
+    temp_buffer_pool.process_command_buffer_completion(mtlCommandBuffer);
+    metal_device->flush_delayed_free_list();
+
+    /* Drop our reference only after the last use of the pointer above. */
+    [mtlCommandBuffer release];
+    mtlCommandBuffer = nil;
+  }
+
+  return !(metal_device->have_error());
+}
+
+/* Zero the device memory backing `mem`, allocating it first if needed. Buffers are cleared
+ * with a blit fill encoded on this queue; anything else is delegated to the device. */
+void MetalDeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  /* Nothing to clear. */
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Lazily create the device allocation. */
+  if (mem.device_pointer == 0) {
+    metal_device->mem_alloc(mem);
+  }
+  assert(mem.device_pointer != 0);
+
+  std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+  MetalDevice::MetalMem &entry = *metal_device->metal_mem_map.at(&mem);
+
+  if (!entry.mtlBuffer) {
+    /* No buffer behind this allocation: let the device handle it. */
+    metal_device->mem_zero(mem);
+    return;
+  }
+
+  id<MTLBlitCommandEncoder> blit = get_blit_encoder();
+  const NSRange fill_range = NSMakeRange(entry.offset, entry.size);
+  [blit fillBuffer:entry.mtlBuffer range:fill_range value:0];
+}
+
+/* Upload the host contents of `mem` to its device allocation, allocating it first if needed.
+ * For allocations tracked in the device's memory map, the data is staged in a temporary shared
+ * buffer and copied with a blit encoded on this queue; otherwise the device performs the copy. */
+void MetalDeviceQueue::copy_to_device(device_memory &mem)
+{
+  /* Nothing to upload. */
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Lazily create the device allocation. */
+  if (mem.device_pointer == 0) {
+    metal_device->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+  auto found = metal_device->metal_mem_map.find(&mem);
+  if (found == metal_device->metal_mem_map.end()) {
+    /* Untracked memory: fall back to the device implementation. */
+    metal_device->mem_copy_to(mem);
+    return;
+  }
+
+  /* Host and shared pointers are the same memory, so no copy is needed. */
+  if (mem.host_pointer == mem.shared_pointer) {
+    return;
+  }
+
+  MetalDevice::MetalMem &entry = *found->second;
+
+  /* The blit encoder is requested first: it creates the command buffer on demand, which the
+   * staging buffer is then associated with. */
+  id<MTLBlitCommandEncoder> blit = get_blit_encoder();
+
+  id<MTLBuffer> staging = temp_buffer_pool.get_buffer(
+      mtlDevice, mtlCommandBuffer, entry.size, MTLResourceStorageModeShared, mem.host_pointer, stats);
+
+  [blit copyFromBuffer:staging
+          sourceOffset:0
+              toBuffer:entry.mtlBuffer
+     destinationOffset:entry.offset
+                  size:entry.size];
+}
+
+/* Read the device contents of `mem` back into its host pointer. When a command buffer is
+ * pending, the host copy is deferred until synchronize(); otherwise it happens immediately. */
+void MetalDeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  /* Nothing to read back. */
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+  MetalDevice::MetalMem &entry = *metal_device->metal_mem_map.at(&mem);
+
+  if (!entry.mtlBuffer) {
+    /* No buffer behind this allocation: let the device handle it. */
+    metal_device->mem_copy_from(mem);
+    return;
+  }
+
+  const size_t size = mem.memory_size();
+
+  if (!mem.device_pointer) {
+    /* No device allocation: hand back zeros. */
+    memset((char *)mem.host_pointer, 0, size);
+    return;
+  }
+
+  /* Managed buffers need an explicit synchronization blit before the CPU reads them. */
+  if ([entry.mtlBuffer storageMode] == MTLStorageModeManaged) {
+    id<MTLBlitCommandEncoder> blit = get_blit_encoder();
+    [blit synchronizeResource:entry.mtlBuffer];
+  }
+
+  /* The buffer's host pointer already is the destination: nothing left to copy. */
+  if (mem.host_pointer == entry.hostPtr) {
+    return;
+  }
+
+  if (mtlCommandBuffer) {
+    /* Work is still pending; copy once it has completed. */
+    copy_back_mem.push_back({mem.host_pointer, entry.hostPtr, size});
+  }
+  else {
+    memcpy((uchar *)mem.host_pointer, (uchar *)entry.hostPtr, size);
+  }
+}
+
+/* Forward the availability query for `kernel` to the device's kernel collection. */
+bool MetalDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  const bool is_available = metal_device->kernels.available(kernel);
+  return is_available;
+}
+
+/* Declare to the current compute encoder every buffer and texture tracked in the device's
+ * memory map, plus the two texture binding buffers. The `kernel` argument is not used. */
+void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
+{
+  std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
+
+  for (auto &tracked : metal_device->metal_mem_map) {
+    device_memory *mem = tracked.first;
+
+    /* Global, read-only and texture memory is declared read-only; everything else is also
+     * declared writable. */
+    const bool read_only = (mem->type == MEM_GLOBAL || mem->type == MEM_READ_ONLY ||
+                            mem->type == MEM_TEXTURE);
+    MTLResourceUsage usage = MTLResourceUsageRead;
+    if (!read_only) {
+      usage |= MTLResourceUsageWrite;
+    }
+
+    /* METAL_WIP - use array version (i.e. useResources) */
+    if (tracked.second->mtlBuffer) {
+      [mtlComputeEncoder useResource:tracked.second->mtlBuffer usage:usage];
+    }
+    else if (tracked.second->mtlTexture) {
+      [mtlComputeEncoder useResource:tracked.second->mtlTexture
+                               usage:usage | MTLResourceUsageSample];
+    }
+  }
+
+  /* Ancillary buffers holding the texture bindings. */
+  [mtlComputeEncoder useResource:metal_device->texture_bindings_2d usage:MTLResourceUsageRead];
+  [mtlComputeEncoder useResource:metal_device->texture_bindings_3d usage:MTLResourceUsageRead];
+}
+
+/* Return a compute encoder suitable for `kernel`, with all tracked resources declared on it.
+ *
+ * Kernels below DEVICE_KERNEL_INTEGRATOR_NUM use a concurrent dispatch encoder, all others a
+ * serial one. An open encoder is reused when its dispatch type matches; otherwise it is closed
+ * and a new one is created (creating the command buffer on demand and closing any open blit
+ * encoder). When the availability check fails, the current encoder member is returned
+ * unchanged. */
+id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
+{
+  bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
+
+  if (@available(macos 10.14, *)) {
+    /* Computed once so that the comparison below does not depend on `==` binding tighter
+     * than `?:` and on the numeric values of the enum. */
+    const MTLDispatchType wanted_dispatch_type = concurrent ? MTLDispatchTypeConcurrent :
+                                                              MTLDispatchTypeSerial;
+
+    if (mtlComputeEncoder) {
+      if (mtlComputeEncoder.dispatchType == wanted_dispatch_type) {
+        /* declare usage of MTLBuffers etc */
+        prepare_resources(kernel);
+
+        return mtlComputeEncoder;
+      }
+      close_compute_encoder();
+    }
+
+    close_blit_encoder();
+
+    if (!mtlCommandBuffer) {
+      mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+      /* Retained here; released again in synchronize(). */
+      [mtlCommandBuffer retain];
+    }
+
+    mtlComputeEncoder = [mtlCommandBuffer
+        computeCommandEncoderWithDispatchType:wanted_dispatch_type];
+
+    /* declare usage of MTLBuffers etc */
+    prepare_resources(kernel);
+  }
+
+  return mtlComputeEncoder;
+}
+
+/* Return the open blit encoder, creating it (and the command buffer) on demand. Any open
+ * compute encoder is closed first. */
+id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
+{
+  if (!mtlBlitEncoder) {
+    if (mtlComputeEncoder) {
+      close_compute_encoder();
+    }
+
+    if (!mtlCommandBuffer) {
+      mtlCommandBuffer = [mtlCommandQueue commandBuffer];
+      /* Retained here; released again in synchronize(). */
+      [mtlCommandBuffer retain];
+    }
+
+    mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
+  }
+
+  return mtlBlitEncoder;
+}
+
+/* End encoding on the compute encoder and forget it. No nil check is made: sending
+ * endEncoding to a nil encoder does nothing. */
+void MetalDeviceQueue::close_compute_encoder()
+{
+  id<MTLComputeCommandEncoder> finished_encoder = mtlComputeEncoder;
+  mtlComputeEncoder = nil;
+  [finished_encoder endEncoding];
+}
+
+/* End encoding on the blit encoder, if one is open, and forget it. */
+void MetalDeviceQueue::close_blit_encoder()
+{
+  if (!mtlBlitEncoder) {
+    return;
+  }
+
+  [mtlBlitEncoder endEncoding];
+  mtlBlitEncoder = nil;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_METAL
+
+#  include <Metal/Metal.h>
+#  include <string>
+
+#  include "device/metal/device.h"
+#  include "device/metal/kernel.h"
+#  include "device/queue.h"
+
+#  include "util/thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* GPU vendors distinguished by the Metal backend. Enumerators are numbered consecutively
+ * from zero. */
+enum MetalGPUVendor {
+  METAL_GPU_UNKNOWN = 0, /* Device name matched none of the vendors below. */
+  METAL_GPU_APPLE,       /* 1 */
+  METAL_GPU_AMD,         /* 2 */
+  METAL_GPU_INTEL,       /* 3 */
+};
+
+/* Retains a named MTLDevice for device enumeration.
+ *
+ * Every instance owns one retain on `device_id`, released again by the destructor. Copy
+ * construction and copy assignment are defined explicitly so that each copy takes its own
+ * retain; with the implicit member-wise copy, storing instances by value in a container would
+ * release the device more often than it was retained. */
+struct MetalPlatformDevice {
+  MetalPlatformDevice(id<MTLDevice> device, const string &device_name)
+      : device_id(device), device_name(device_name)
+  {
+    [device_id retain];
+  }
+  MetalPlatformDevice(const MetalPlatformDevice &other)
+      : device_id(other.device_id), device_name(other.device_name)
+  {
+    [device_id retain];
+  }
+  MetalPlatformDevice &operator=(const MetalPlatformDevice &other)
+  {
+    if (this != &other) {
+      /* Retain the new device before releasing the old one, in case both are the same. */
+      [other.device_id retain];
+      [device_id release];
+      device_id = other.device_id;
+      device_name = other.device_name;
+    }
+    return *this;
+  }
+  ~MetalPlatformDevice()
+  {
+    [device_id release];
+  }
+  id<MTLDevice> device_id;
+  string device_name;
+};
+
+/* Static helpers for enumerating Metal devices and querying basic facts about them. */
+struct MetalInfo {
+  /* Device filtering and classification. */
+  static bool device_version_check(id<MTLDevice> device);
+  static void get_usable_devices(vector<MetalPlatformDevice> *usable_devices);
+  static MetalGPUVendor get_vendor_from_device_name(const string &device_name);
+
+  /* Device count, either through an out-parameter with a success flag or returned directly. */
+  static bool get_num_devices(uint32_t *num_devices);
+  static uint32_t get_num_devices();
+
+  /* Device name, either through an out-parameter with a success flag or returned directly. */
+  static bool get_device_name(id<MTLDevice> device, string *device_name);
+  static string get_device_name(id<MTLDevice> device);
+};
+
+/* Pool of temporary MTLBuffers. A buffer handed out by get_buffer() is tied to a command
+ * buffer and becomes reusable once process_command_buffer_completion() is called for that
+ * command buffer. */
+class MetalBufferPool {
+ public:
+  MetalBufferPool() = default;
+  ~MetalBufferPool();
+
+  /* Return a buffer of exactly `length` bytes with the requested options, reusing an idle
+   * one when possible. If `pointer` is non-null the buffer is initialized from it. */
+  id<MTLBuffer> get_buffer(id<MTLDevice> device,
+                           id<MTLCommandBuffer> command_buffer,
+                           NSUInteger length,
+                           MTLResourceOptions options,
+                           const void *pointer,
+                           Stats &stats);
+
+  /* Move every buffer associated with `command_buffer` back to the idle list. */
+  void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
+
+ private:
+  /* A pooled buffer together with the command buffer it is currently associated with. */
+  struct MetalBufferListEntry {
+    MetalBufferListEntry(id<MTLBuffer> pooled_buffer, id<MTLCommandBuffer> owning_command_buffer)
+        : buffer(pooled_buffer), command_buffer(owning_command_buffer)
+    {
+    }
+
+    MetalBufferListEntry() = delete;
+
+    id<MTLBuffer> buffer;
+    id<MTLCommandBuffer> command_buffer;
+  };
+
+  std::vector<MetalBufferListEntry> buffer_free_list;
+  std::vector<MetalBufferListEntry> buffer_in_use_list;
+  /* Guards both lists. */
+  thread_mutex buffer_mutex;
+  /* Sum of the allocated sizes of the buffers created by this pool. */
+  size_t total_temp_mem_size = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -0,0 +1,218 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_METAL
+
+#  include "device/metal/util.h"
+#  include "device/metal/device_impl.h"
+#  include "util/md5.h"
+#  include "util/path.h"
+#  include "util/string.h"
+#  include "util/time.h"
+
+#  include <pwd.h>
+#  include <sys/shm.h>
+#  include <time.h>
+
+CCL_NAMESPACE_BEGIN
+
+/* Classify a device by looking for a vendor substring in its name. The substrings are tried
+ * in the order Intel, AMD, Apple; the first match wins. */
+MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+{
+  static const struct {
+    const char *needle;
+    MetalGPUVendor vendor;
+  } known_vendors[] = {
+      {"Intel", METAL_GPU_INTEL},
+      {"AMD", METAL_GPU_AMD},
+      {"Apple", METAL_GPU_APPLE},
+  };
+
+  for (const auto &candidate : known_vendors) {
+    if (device_name.find(candidate.needle) != string::npos) {
+      return candidate.vendor;
+    }
+  }
+  return METAL_GPU_UNKNOWN;
+}
+
+/* Return true when `device` can be used by Cycles: this requires macOS 12.0 or newer and a
+ * device whose name identifies it as an Apple GPU. */
+bool MetalInfo::device_version_check(id<MTLDevice> device)
+{
+  /* Metal Cycles doesn't work correctly on macOS versions older than 12.0 */
+  if (@available(macos 12.0, *)) {
+    const string device_name = [device.name UTF8String];
+
+    /* Only Apple GPUs are accepted at present. */
+    const bool is_apple_gpu = (get_vendor_from_device_name(device_name) == METAL_GPU_APPLE);
+    return is_apple_gpu;
+  }
+
+  return false;
+}
+
+/* Fill `usable_devices` with every Metal device that passes device_version_check().
+ *
+ * The vector is cleared first. When the environment variable CYCLES_METAL_FORCE_INTEL is set
+ * to a non-zero integer, devices whose name does not contain "Intel" are skipped. Decisions
+ * are logged at verbosity 2, on the first call only. */
+void MetalInfo::get_usable_devices(vector<MetalPlatformDevice> *usable_devices)
+{
+  static bool first_time = true;
+#  define FIRST_VLOG(severity) \
+    if (first_time) \
+    VLOG(severity)
+
+  usable_devices->clear();
+
+  /* The environment is read once; the flag does not change between devices. */
+  static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
+  const bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
+
+  /* MTLCopyAllDevices() hands ownership of the array to the caller. */
+  NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
+
+  /* Reserve up front and construct elements in place, so that no element is copied here and
+   * every stored entry keeps exactly the retain taken by its constructor. */
+  usable_devices->reserve(allDevices.count);
+
+  for (id<MTLDevice> device in allDevices) {
+    string device_name;
+    if (!get_device_name(device, &device_name)) {
+      FIRST_VLOG(2) << "Failed to get device name, ignoring.";
+      continue;
+    }
+
+    if (forceIntel && device_name.find("Intel") == string::npos) {
+      FIRST_VLOG(2) << "CYCLES_METAL_FORCE_INTEL causing non-Intel device " << device_name
+                    << " to be ignored.";
+      continue;
+    }
+
+    if (!device_version_check(device)) {
+      FIRST_VLOG(2) << "Ignoring device " << device_name
+                    << " due to unsupported macOS version or GPU vendor.";
+      continue;
+    }
+    FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+    usable_devices->emplace_back(device, device_name);
+  }
+
+  /* The stored entries hold their own retains, so the array can be released now. */
+  [allDevices release];
+
+  first_time = false;
+#  undef FIRST_VLOG
+}
+
+/* Store the number of Metal devices in the system in `*num_devices`. Always returns true. */
+bool MetalInfo::get_num_devices(uint32_t *num_devices)
+{
+  /* MTLCopyAllDevices() hands ownership of the array to the caller, so release it once the
+   * count has been read. */
+  NSArray<id<MTLDevice>> *all_devices = MTLCopyAllDevices();
+  *num_devices = (uint32_t)all_devices.count;
+  [all_devices release];
+  return true;
+}
+
+/* Convenience overload: return the number of Metal devices, or 0 if the query fails. */
+uint32_t MetalInfo::get_num_devices()
+{
+  uint32_t device_count;
+  const bool query_ok = get_num_devices(&device_count);
+  return query_ok ? device_count : 0;
+}
+
+/* Store the name of `device` in `*platform_name`.
+ *
+ * Returns false, leaving `*platform_name` untouched, when no name is available (for example
+ * when `device` is nil); a null C string must not be assigned to a string. */
+bool MetalInfo::get_device_name(id<MTLDevice> device, string *platform_name)
+{
+  const char *utf8_name = [device.name UTF8String];
+  if (utf8_name == nullptr) {
+    return false;
+  }
+
+  *platform_name = utf8_name;
+  return true;
+}
+
+/* Convenience overload: return the name of `device`, or an empty string if the query fails. */
+string MetalInfo::get_device_name(id<MTLDevice> device)
+{
+  string name;
+  const bool query_ok = get_device_name(device, &name);
+  return query_ok ? name : string("");
+}
+
+/* Hand out a buffer of `length` bytes associated with `command_buffer`.
+ *
+ * An idle pooled buffer is reused when its length, storage mode and CPU cache mode all match
+ * the request exactly; otherwise a new buffer is created on `device` and its allocated size is
+ * reported to `stats`. If `pointer` is non-null, `length` bytes are copied from it into the
+ * buffer. The buffer is recorded as in use until process_command_buffer_completion() is called
+ * for `command_buffer`. */
+id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
+                                          id<MTLCommandBuffer> command_buffer,
+                                          NSUInteger length,
+                                          MTLResourceOptions options,
+                                          const void *pointer,
+                                          Stats &stats)
+{
+  /* Decode the requested storage and cache modes from the option bits. */
+  const MTLStorageMode wanted_storage = static_cast<MTLStorageMode>(
+      (options & MTLResourceStorageModeMask) >> MTLResourceStorageModeShift);
+  const MTLCPUCacheMode wanted_cache = static_cast<MTLCPUCacheMode>(
+      (options & MTLResourceCPUCacheModeMask) >> MTLResourceCPUCacheModeShift);
+
+  buffer_mutex.lock();
+
+  /* First try to recycle an idle buffer. */
+  for (auto it = buffer_free_list.begin(); it != buffer_free_list.end(); it++) {
+    id<MTLBuffer> candidate = it->buffer;
+
+    const bool reusable = (candidate.length == length &&
+                           wanted_storage == candidate.storageMode &&
+                           wanted_cache == candidate.cpuCacheMode);
+    if (!reusable) {
+      continue;
+    }
+
+    buffer_free_list.erase(it);
+    buffer_in_use_list.push_back(MetalBufferListEntry(candidate, command_buffer));
+    buffer_mutex.unlock();
+
+    /* Fill the recycled buffer outside the lock. */
+    if (pointer) {
+      memcpy(candidate.contents, pointer, length);
+      if (candidate.storageMode == MTLStorageModeManaged) {
+        [candidate didModifyRange:NSMakeRange(0, length)];
+      }
+    }
+
+    return candidate;
+  }
+
+  /* Nothing suitable is idle: create a new buffer, initialized from `pointer` if given. */
+  id<MTLBuffer> fresh;
+  if (pointer) {
+    fresh = [device newBufferWithBytes:pointer length:length options:options];
+  }
+  else {
+    fresh = [device newBufferWithLength:length options:options];
+  }
+
+  stats.mem_alloc(fresh.allocatedSize);
+  total_temp_mem_size += fresh.allocatedSize;
+
+  buffer_in_use_list.push_back(MetalBufferListEntry(fresh, command_buffer));
+  buffer_mutex.unlock();
+
+  return fresh;
+}
+
+/* Move every in-use buffer associated with `command_buffer` to the idle list, clearing its
+ * command buffer association. Entries for other command buffers are left untouched. */
+void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
+{
+  assert(command_buffer);
+  thread_scoped_lock lock(buffer_mutex);
+
+  auto cursor = buffer_in_use_list.begin();
+  while (cursor != buffer_in_use_list.end()) {
+    if (cursor->command_buffer != command_buffer) {
+      cursor++;
+      continue;
+    }
+
+    /* Copy the entry before erasing it, then hand it over to the idle list. */
+    MetalBufferListEntry recycled = *cursor;
+    cursor = buffer_in_use_list.erase(cursor);
+    recycled.command_buffer = nil;
+    buffer_free_list.push_back(recycled);
+  }
+}
+
+/* Release every idle buffer held by the pool. Buffers still recorded as in use are not
+ * touched here. */
+MetalBufferPool::~MetalBufferPool()
+{
+  thread_scoped_lock lock(buffer_mutex);
+
+  /* Release in a single pass and clear the list afterwards, instead of erasing the front
+   * element of the vector once per buffer. */
+  for (const MetalBufferListEntry &idle_entry : buffer_free_list) {
+    id<MTLBuffer> buffer = idle_entry.buffer;
+    total_temp_mem_size -= buffer.allocatedSize;
+    [buffer release];
+  }
+  buffer_free_list.clear();
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_METAL */
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -124,6 +124,11 @@ class MultiDevice : public Device {
      return BVH_LAYOUT_MULTI_OPTIX;
    }

+    /* With multiple Metal devices, every device needs its own acceleration structure */
+    if (bvh_layout_mask == BVH_LAYOUT_METAL) {
+      return BVH_LAYOUT_MULTI_METAL;
+    }
+
    /* When devices do not share a common BVH layout, fall back to creating one for each */
    const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
    if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
@@ -155,6 +160,7 @@ class MultiDevice : public Device {
    }

    assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+           bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL ||
           bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE ||
           bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE);

@@ -179,9 +185,14 @@ class MultiDevice : public Device {
        BVHParams params = bvh->params;
        if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
          params.bvh_layout = BVH_LAYOUT_OPTIX;
+        else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL)
+          params.bvh_layout = BVH_LAYOUT_METAL;
        else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
          params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
                                                                      BVH_LAYOUT_EMBREE;
+        else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
+          params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
+                                                                      BVH_LAYOUT_EMBREE;

        /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
         * (since they are put into the top level directly, see bvh_embree.cpp) */
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -28,6 +28,7 @@
 #  include "scene/mesh.h"
 #  include "scene/object.h"
 #  include "scene/pass.h"
+#  include "scene/pointcloud.h"
 #  include "scene/scene.h"

 #  include "util/debug.h"
@@ -46,14 +47,14 @@
 CCL_NAMESPACE_BEGIN

 OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
-    : device(device), queue(device), state(device, "__denoiser_state")
+    : device(device), queue(device), state(device, "__denoiser_state", true)
 {
 }

 OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    : CUDADevice(info, stats, profiler),
      sbt_data(this, "__sbt", MEM_READ_ONLY),
-      launch_params(this, "__params"),
+      launch_params(this, "__params", false),
      denoiser_(this)
 {
  /* Make the CUDA context current. */
@@ -242,6 +243,9 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
    else
      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
  }
+  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+    pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+  }

  /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
   * This is necessary since objects may be reported to have motion if the Vector pass is
@@ -372,6 +376,18 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
    }
  }

+  /* Pointclouds */
+  if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+    group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
+    group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+    group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module;
+    group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
+    group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS];
+    group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+    group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module;
+    group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
+  }
+
  if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
    /* Add hit group for local intersections. */
    group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -419,6 +435,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                       stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+  trace_css = std::max(
+      trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH);
+  trace_css = std::max(
+      trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH);

  OptixPipelineLinkOptions link_options = {};
  link_options.maxTraceDepth = 1;
@@ -444,6 +464,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
    }
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
+      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    }
    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);

@@ -483,6 +507,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
    }
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
+      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    }

    optix_assert(optixPipelineCreate(context,
                                     &pipeline_options,
@@ -523,7 +551,7 @@ class OptiXDevice::DenoiseContext {
      : denoise_params(task.params),
        render_buffers(task.render_buffers),
        buffer_params(task.buffer_params),
-        guiding_buffer(device, "denoiser guiding passes buffer"),
+        guiding_buffer(device, "denoiser guiding passes buffer", true),
        num_samples(task.num_samples)
  {
    num_input_passes = 1;
@@ -538,9 +566,9 @@ class OptiXDevice::DenoiseContext {
      }
    }

-    const int num_guiding_passes = num_input_passes - 1;
+    use_guiding_passes = (num_input_passes - 1) > 0;

-    if (num_guiding_passes) {
+    if (use_guiding_passes) {
      if (task.allow_inplace_modification) {
        guiding_params.device_pointer = render_buffers->buffer.device_pointer;

@@ -593,6 +621,7 @@ class OptiXDevice::DenoiseContext {

  /* Number of input passes. Including the color and extra auxiliary passes. */
  int num_input_passes = 0;
+  bool use_guiding_passes = false;
  bool use_pass_albedo = false;
  bool use_pass_normal = false;

@@ -724,7 +753,7 @@ void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
      return;
    }
  }
-  else if (!context.albedo_replaced_with_fake) {
+  else if (context.use_guiding_passes && !context.albedo_replaced_with_fake) {
    context.albedo_replaced_with_fake = true;
    if (!denoise_filter_guiding_set_fake_albedo(context)) {
      LOG(ERROR) << "Error replacing real albedo with the fake one.";
@@ -1015,6 +1044,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
                                  const OptixBuildInput &build_input,
                                  uint16_t num_motion_steps)
 {
+  /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
+   * from running out of memory (since both original and compacted acceleration structure memory
+   * may be allocated at the same time for the duration of this function). The builds would
+   * otherwise happen on the same CUDA stream anyway. */
+  static thread_mutex mutex;
+  thread_scoped_lock lock(mutex);
+
  const CUDAContextScope scope(this);

  const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
@@ -1040,13 +1076,14 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
  optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));

  /* Allocate required output buffers. */
-  device_only_memory<char> temp_mem(this, "optix temp as build mem");
+  device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
  temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
  if (!temp_mem.device_pointer) {
    /* Make sure temporary memory allocation succeeded. */
    return false;
  }

+  /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
  device_only_memory<char> &out_data = *bvh->as_data;
  if (operation == OPTIX_BUILD_OPERATION_BUILD) {
    assert(out_data.device == this);
@@ -1095,12 +1132,13 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,

    /* There is no point compacting if the size does not change. */
    if (compacted_size < sizes.outputSizeInBytes) {
-      device_only_memory<char> compacted_data(this, "optix compacted as");
+      device_only_memory<char> compacted_data(this, "optix compacted as", false);
      compacted_data.alloc_to_device(compacted_size);
-      if (!compacted_data.device_pointer)
+      if (!compacted_data.device_pointer) {
        /* Do not compact if memory allocation for compacted acceleration structure fails.
         * Can just use the uncompacted one then, so succeed here regardless. */
        return !have_error();
+      }

      optix_assert(optixAccelCompact(
          context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
@@ -1111,6 +1149,8 @@ bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,

      std::swap(out_data.device_size, compacted_data.device_size);
      std::swap(out_data.device_pointer, compacted_data.device_pointer);
+      /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
+       */
    }
  }

@@ -1208,7 +1248,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
              const float4 pw = make_float4(
                  curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);

-              /* Convert Catmull-Rom data to Bezier spline. */
+              /* Convert Catmull-Rom data to B-spline. */
              static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
              static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
              static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
@@ -1362,6 +1402,86 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
      build_input.triangleArray.numSbtRecords = 1;
      build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset;

+      if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+        progress.set_error("Failed to build OptiX acceleration structure");
+      }
+    }
+    else if (geom->geometry_type == Geometry::POINTCLOUD) {
+      /* Build BLAS for points primitives. */
+      PointCloud *const pointcloud = static_cast<PointCloud *const>(geom);
+      const size_t num_points = pointcloud->num_points();
+      if (num_points == 0) {
+        return;
+      }
+
+      size_t num_motion_steps = 1;
+      Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+      if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+        num_motion_steps = pointcloud->get_motion_steps();
+      }
+
+      device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+      aabb_data.alloc(num_points * num_motion_steps);
+
+      /* Get AABBs for each motion step. */
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        /* The center step for motion vertices is not stored in the attribute. */
+        const float3 *points = pointcloud->get_points().data();
+        const float *radius = pointcloud->get_radius().data();
+        size_t center_step = (num_motion_steps - 1) / 2;
+        if (step != center_step) {
+          size_t attr_offset = (step > center_step) ? step - 1 : step;
+          /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+          points = motion_points->data_float3() + attr_offset * num_points;
+        }
+
+        for (size_t i = 0; i < num_points; ++i) {
+          const PointCloud::Point point = pointcloud->get_point(i);
+          BoundBox bounds = BoundBox::empty;
+          point.bounds_grow(points, radius, bounds);
+
+          const size_t index = step * num_points + i;
+          aabb_data[index].minX = bounds.min.x;
+          aabb_data[index].minY = bounds.min.y;
+          aabb_data[index].minZ = bounds.min.z;
+          aabb_data[index].maxX = bounds.max.x;
+          aabb_data[index].maxY = bounds.max.y;
+          aabb_data[index].maxZ = bounds.max.z;
+        }
+      }
+
+      /* Upload AABB data to GPU. */
+      aabb_data.copy_to_device();
+
+      vector<device_ptr> aabb_ptrs;
+      aabb_ptrs.reserve(num_motion_steps);
+      for (size_t step = 0; step < num_motion_steps; ++step) {
+        aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb));
+      }
+
+      /* Disable visibility test any-hit program, since it is already checked during
+       * intersection. Those trace calls that require anyhit can force it with a ray flag.
+       * For those, force a single any-hit call, so shadow record-all behavior works correctly. */
+      unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT |
+                                 OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+      OptixBuildInput build_input = {};
+      build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+#  if OPTIX_ABI_VERSION < 23
+      build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+      build_input.aabbArray.numPrimitives = num_points;
+      build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
+      build_input.aabbArray.flags = &build_flags;
+      build_input.aabbArray.numSbtRecords = 1;
+      build_input.aabbArray.primitiveIndexOffset = pointcloud->prim_offset;
+#  else
+      build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+      build_input.customPrimitiveArray.numPrimitives = num_points;
+      build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+      build_input.customPrimitiveArray.flags = &build_flags;
+      build_input.customPrimitiveArray.numSbtRecords = 1;
+      build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset;
+#  endif
+
      if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
        progress.set_error("Failed to build OptiX acceleration structure");
      }
@@ -1449,12 +1569,22 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
        }
      }
+      else if (ob->get_geometry()->geometry_type == Geometry::POINTCLOUD) {
+        /* Use the hit group that has an intersection program for point clouds. */
+        instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD;
+
+        /* Also skip point clouds in local trace calls. */
+        instance.visibilityMask |= 4;
+      }
+
 #  if OPTIX_ABI_VERSION < 55
      /* Cannot disable any-hit program for thick curves, since it needs to filter out end-caps. */
      else
 #  endif
      {
-        /* Can disable __anyhit__kernel_optix_visibility_test by default.
+        /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
+         * since it needs to filter out end-caps there).
+
         * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
         * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
         */
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -44,6 +44,8 @@ enum {
  PG_HITV, /* __VOLUME__ hit group. */
  PG_HITD_MOTION,
  PG_HITS_MOTION,
+  PG_HITD_POINTCLOUD,
+  PG_HITS_POINTCLOUD,
  PG_CALL_SVM_AO,
  PG_CALL_SVM_BEVEL,
  NUM_PROGRAM_GROUPS
@@ -52,9 +54,9 @@ enum {
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
 static const int NUM_MIS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
-static const int NUM_HIT_PROGRAM_GROUPS = 6;
+static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
-static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;

 /* List of OptiX pipelines. */
 enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -1093,6 +1093,8 @@ static const char *device_type_for_description(const DeviceType type)
      return "Dummy";
    case DEVICE_MULTI:
      return "Multi";
+    case DEVICE_METAL:
+      return "Metal";
  }

  return "UNKNOWN";
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -112,7 +112,7 @@ int RenderScheduler::get_rendered_sample() const
 {
  DCHECK_GT(get_num_rendered_samples(), 0);

-  return start_sample_ + get_num_rendered_samples() - 1;
+  return start_sample_ + get_num_rendered_samples() - 1 - sample_offset_;
 }

 int RenderScheduler::get_num_rendered_samples() const
@@ -877,7 +877,8 @@ int RenderScheduler::get_num_samples_to_path_trace() const
   * is to ensure that the final render is pixel-matched regardless of how many samples per second
   * compute device can do. */

-  return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+  return adaptive_sampling_.align_samples(path_trace_start_sample - sample_offset_,
+                                          num_samples_to_render);
 }

 int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -179,11 +179,14 @@ set(SRC_KERNEL_GEOM_HEADERS
  geom/curve.h
  geom/curve_intersect.h
  geom/motion_curve.h
+  geom/motion_point.h
  geom/motion_triangle.h
  geom/motion_triangle_intersect.h
  geom/motion_triangle_shader.h
  geom/object.h
  geom/patch.h
+  geom/point.h
+  geom/point_intersect.h
  geom/primitive.h
  geom/shader_data.h
  geom/subd_triangle.h
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -49,24 +49,24 @@ CCL_NAMESPACE_BEGIN
 #  include "kernel/bvh/nodes.h"

 #  define BVH_FUNCTION_NAME bvh_intersect
-#  define BVH_FUNCTION_FEATURES 0
+#  define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
 #  include "kernel/bvh/traversal.h"

 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_hair
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_POINTCLOUD
 #    include "kernel/bvh/traversal.h"
 #  endif

 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_motion
-#    define BVH_FUNCTION_FEATURES BVH_MOTION
+#    define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_POINTCLOUD
 #    include "kernel/bvh/traversal.h"
 #  endif

 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_hair_motion
-#    define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
+#    define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION | BVH_POINTCLOUD
 #    include "kernel/bvh/traversal.h"
 #  endif

@@ -102,26 +102,27 @@ CCL_NAMESPACE_BEGIN

 #  if defined(__SHADOW_RECORD_ALL__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all
-#    define BVH_FUNCTION_FEATURES 0
+#    define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
 #    include "kernel/bvh/shadow_all.h"

 #    if defined(__HAIR__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-#      define BVH_FUNCTION_FEATURES BVH_HAIR
+#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_POINTCLOUD
 #      include "kernel/bvh/shadow_all.h"
 #    endif

 #    if defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_POINTCLOUD
 #      include "kernel/bvh/shadow_all.h"
 #    endif

 #    if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION
+#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION | BVH_POINTCLOUD
 #      include "kernel/bvh/shadow_all.h"
 #    endif
+
 #  endif /* __SHADOW_RECORD_ALL__ */

 /* Record all intersections - Volume BVH traversal. */
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -28,6 +28,7 @@
 * without new features slowing things down.
 *
 * BVH_HAIR: hair curve rendering
+ * BVH_POINTCLOUD: point cloud rendering
 * BVH_MOTION: motion blur rendering
 */

@@ -173,7 +174,7 @@ ccl_device_inline
              case PRIMITIVE_MOTION_CURVE_THICK:
              case PRIMITIVE_CURVE_RIBBON:
              case PRIMITIVE_MOTION_CURVE_RIBBON: {
-                if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
+                if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
                  if (ray->time < prim_time.x || ray->time > prim_time.y) {
                    hit = false;
@@ -199,6 +200,34 @@ ccl_device_inline
                break;
              }
 #endif
+#if BVH_FEATURE(BVH_POINTCLOUD)
+              case PRIMITIVE_POINT:
+              case PRIMITIVE_MOTION_POINT: {
+                if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
+                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  if (ray->time < prim_time.x || ray->time > prim_time.y) {
+                    hit = false;
+                    break;
+                  }
+                }
+
+                const int point_object = (object == OBJECT_NONE) ?
+                                             kernel_tex_fetch(__prim_object, prim_addr) :
+                                             object;
+                const int point_prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int point_type = kernel_tex_fetch(__prim_type, prim_addr);
+                hit = point_intersect(kg,
+                                      &isect,
+                                      P,
+                                      dir,
+                                      t_max_current,
+                                      point_object,
+                                      point_prim,
+                                      ray->time,
+                                      point_type);
+                break;
+              }
+#endif /* BVH_FEATURE(BVH_POINTCLOUD) */
              default: {
                hit = false;
                break;
@@ -226,7 +255,7 @@ ccl_device_inline
              bool record_intersection = true;

              /* Always use baked shadow transparency for curves. */
-              if (isect.type & PRIMITIVE_ALL_CURVE) {
+              if (isect.type & PRIMITIVE_CURVE) {
                *throughput *= intersection_curve_shadow_transparency(
                    kg, isect.object, isect.prim, isect.u);

--- a/intern/cycles/kernel/bvh/traversal.h
+++ b/intern/cycles/kernel/bvh/traversal.h
@@ -28,6 +28,7 @@
 * without new features slowing things down.
 *
 * BVH_HAIR: hair curve rendering
+ * BVH_POINTCLOUD: point cloud rendering
 * BVH_MOTION: motion blur rendering
 */

@@ -165,7 +166,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
            case PRIMITIVE_CURVE_RIBBON:
            case PRIMITIVE_MOTION_CURVE_RIBBON: {
              for (; prim_addr < prim_addr2; prim_addr++) {
-                if ((type & PRIMITIVE_ALL_MOTION) && kernel_data.bvh.use_bvh_steps) {
+                if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
                  if (ray->time < prim_time.x || ray->time > prim_time.y) {
                    continue;
@@ -188,6 +189,33 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
              break;
            }
 #endif /* BVH_FEATURE(BVH_HAIR) */
+#if BVH_FEATURE(BVH_POINTCLOUD)
+            case PRIMITIVE_POINT:
+            case PRIMITIVE_MOTION_POINT: {
+              for (; prim_addr < prim_addr2; prim_addr++) {
+                if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
+                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  if (ray->time < prim_time.x || ray->time > prim_time.y) {
+                    continue;
+                  }
+                }
+
+                const int point_object = (object == OBJECT_NONE) ?
+                                             kernel_tex_fetch(__prim_object, prim_addr) :
+                                             object;
+                const int point_prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int point_type = kernel_tex_fetch(__prim_type, prim_addr);
+                const bool hit = point_intersect(
+                    kg, isect, P, dir, isect->t, point_object, point_prim, ray->time, point_type);
+                if (hit) {
+                  /* shadow ray early termination */
+                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
+                    return true;
+                }
+              }
+              break;
+            }
+#endif /* BVH_FEATURE(BVH_POINTCLOUD) */
          }
        }
        else {
--- a/intern/cycles/kernel/bvh/types.h
+++ b/intern/cycles/kernel/bvh/types.h
@@ -34,6 +34,7 @@ CCL_NAMESPACE_BEGIN

 #define BVH_MOTION 1
 #define BVH_HAIR 2
+#define BVH_POINTCLOUD 4

 #define BVH_NAME_JOIN(x, y) x##_##y
 #define BVH_NAME_EVAL(x, y) BVH_NAME_JOIN(x, y)
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -118,14 +118,16 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals kg,
 {
  int shader = 0;

-#ifdef __HAIR__
-  if (type & PRIMITIVE_ALL_TRIANGLE)
-#endif
-  {
+  if (type & PRIMITIVE_TRIANGLE) {
    shader = kernel_tex_fetch(__tri_shader, prim);
  }
+#ifdef __POINTCLOUD__
+  else if (type & PRIMITIVE_POINT) {
+    shader = kernel_tex_fetch(__points_shader, prim);
+  }
+#endif
 #ifdef __HAIR__
-  else {
+  else if (type & PRIMITIVE_CURVE) {
    shader = kernel_tex_fetch(__curves, prim).shader_id;
  }
 #endif
@@ -139,14 +141,16 @@ ccl_device_forceinline int intersection_get_shader_from_isect_prim(KernelGlobals
 {
  int shader = 0;

-#ifdef __HAIR__
-  if (isect_type & PRIMITIVE_ALL_TRIANGLE)
-#endif
-  {
+  if (isect_type & PRIMITIVE_TRIANGLE) {
    shader = kernel_tex_fetch(__tri_shader, prim);
  }
+#ifdef __POINTCLOUD__
+  else if (isect_type & PRIMITIVE_POINT) {
+    shader = kernel_tex_fetch(__points_shader, prim);
+  }
+#endif
 #ifdef __HAIR__
-  else {
+  else if (isect_type & PRIMITIVE_CURVE) {
    shader = kernel_tex_fetch(__curves, prim).shader_id;
  }
 #endif
--- a/intern/cycles/kernel/camera/projection.h
+++ b/intern/cycles/kernel/camera/projection.h
@@ -146,6 +146,49 @@ fisheye_equisolid_to_direction(float u, float v, float lens, float fov, float wi
  return make_float3(cosf(theta), -cosf(phi) * sinf(theta), sinf(phi) * sinf(theta));
 }

+ccl_device_inline float3 fisheye_lens_polynomial_to_direction(
+    float u, float v, float coeff0, float4 coeffs, float fov, float width, float height)
+{
+  /* Offset from the image center, scaled by the given width and height. */
+  u = (u - 0.5f) * width;
+  v = (v - 0.5f) * height;
+
+  /* The angle is the negated polynomial coeff0 + dot(coeffs, (r, r^2, r^3, r^4)). */
+  const float r = sqrtf(u * u + v * v);
+  const float r_sq = r * r;
+  const float theta = -(coeff0 + dot(coeffs, make_float4(r, r_sq, r_sq * r, r_sq * r_sq)));
+  if (fabsf(theta) > 0.5f * fov) {
+    /* Outside of the field of view: return a zero direction. */
+    return zero_float3();
+  }
+
+  /* The sign of v selects the sign of phi. */
+  const float angle = safe_acosf((r != 0.0f) ? u / r : 0.0f);
+  const float phi = (v < 0.0f) ? -angle : angle;
+  return make_float3(cosf(theta), -cosf(phi) * sinf(theta), sinf(phi) * sinf(theta));
+}
+
+ccl_device float2 direction_to_fisheye_lens_polynomial(
+    float3 dir, float coeff0, float4 coeffs, float width, float height)
+{
+  /* Find r with coeff0 + dot(coeffs, (r, r^2, r^3, r^4)) == theta using Newton's method.
+   * Dividing the residual by coeffs.x alone would yield r == 0 once the residual vanishes. */
+  const float theta = -safe_acosf(dir.x);
+  const float4 diff_coeffs = make_float4(1.0f, 2.0f, 3.0f, 4.0f) * coeffs;
+  float r = (theta - coeff0) / coeffs.x; /* Initial guess: solution of the linear part. */
+  for (int i = 0; i < 20; i++) {
+    const float r2 = r * r;
+    const float residual = theta - (coeff0 + dot(coeffs, make_float4(r, r2, r2 * r, r2 * r2)));
+    r += residual / dot(diff_coeffs, make_float4(1.0f, r, r2, r2 * r)); /* r - F / F'. */
+  }
+
+  /* Convert the polar position (r, phi) to image coordinates centered on (0.5, 0.5). */
+  const float phi = atan2f(dir.z, dir.y);
+  const float u = r * cosf(phi) / width + 0.5f;
+  const float v = r * sinf(phi) / height + 0.5f;
+  return make_float2(u, v);
+}
+
 /* Mirror Ball <-> Cartesion direction */

 ccl_device float3 mirrorball_to_direction(float u, float v)
@@ -191,6 +234,14 @@ ccl_device_inline float3 panorama_to_direction(ccl_constant KernelCamera *cam, f
      return mirrorball_to_direction(u, v);
    case PANORAMA_FISHEYE_EQUIDISTANT:
      return fisheye_to_direction(u, v, cam->fisheye_fov);
+    case PANORAMA_FISHEYE_LENS_POLYNOMIAL:
+      return fisheye_lens_polynomial_to_direction(u,
+                                                  v,
+                                                  cam->fisheye_lens_polynomial_bias,
+                                                  cam->fisheye_lens_polynomial_coefficients,
+                                                  cam->fisheye_fov,
+                                                  cam->sensorwidth,
+                                                  cam->sensorheight);
    case PANORAMA_FISHEYE_EQUISOLID:
    default:
      return fisheye_equisolid_to_direction(
@@ -207,6 +258,12 @@ ccl_device_inline float2 direction_to_panorama(ccl_constant KernelCamera *cam, f
      return direction_to_mirrorball(dir);
    case PANORAMA_FISHEYE_EQUIDISTANT:
      return direction_to_fisheye(dir, cam->fisheye_fov);
+    case PANORAMA_FISHEYE_LENS_POLYNOMIAL:
+      return direction_to_fisheye_lens_polynomial(dir,
+                                                  cam->fisheye_lens_polynomial_bias,
+                                                  cam->fisheye_lens_polynomial_coefficients,
+                                                  cam->sensorwidth,
+                                                  cam->sensorheight);
    case PANORAMA_FISHEYE_EQUISOLID:
    default:
      return direction_to_fisheye_equisolid(
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -124,7 +124,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
  /* For curves use the smooth normal, particularly for ribbons the geometric
   * normal gives too much darkening otherwise. */
  int label;
-  const float3 Ng = (sd->type & PRIMITIVE_ALL_CURVE) ? sc->N : sd->Ng;
+  const float3 Ng = (sd->type & PRIMITIVE_CURVE) ? sc->N : sd->Ng;

  switch (sc->type) {
    case CLOSURE_BSDF_DIFFUSE_ID:
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -213,9 +213,7 @@ ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,

  /* TODO: we convert this value to a cosine later and discard the sign, so
   * we could probably save some operations. */
-  float h = (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) ?
-                -sd->v :
-                dot(cross(sd->Ng, X), Z);
+  float h = (sd->type & PRIMITIVE_CURVE_RIBBON) ? -sd->v : dot(cross(sd->Ng, X), Z);

  kernel_assert(fabsf(h) < 1.0f + 1e-4f);
  kernel_assert(isfinite3_safe(Y));
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -19,7 +19,6 @@
 #include "kernel/device/gpu/parallel_active_index.h"
 #include "kernel/device/gpu/parallel_prefix_sum.h"
 #include "kernel/device/gpu/parallel_sorted_index.h"
-#include "kernel/device/gpu/work_stealing.h"

 #include "kernel/sample/lcg.h"

@@ -30,6 +29,8 @@
 #  include "kernel/device/metal/context_begin.h"
 #endif

+#include "kernel/device/gpu/work_stealing.h"
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -96,7 +97,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
-  get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
@@ -127,7 +128,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  const int state = tile->path_index_offset + tile_work_index;

  uint x, y, sample;
-  get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+  ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));

  ccl_gpu_kernel_call(
      integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -117,7 +117,7 @@ struct kernel_gpu_##name \
           uint simd_group_index, \
           uint num_simd_groups) ccl_global const; \
 }; \
-kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
                                constant MetalAncillaries *_metal_ancillaries, \
                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -126,7 +126,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,

 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                       ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
                                       uint instance_id [[user_instance_id]],
                                       uint primitive_id [[primitive_id]],
@@ -139,7 +139,7 @@ __anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params

 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
 {
  /* unused function */
  BoundingBoxIntersectionResult result;
@@ -211,7 +211,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
  }
  
  /* Always use baked shadow transparency for curves. */
-  if (type & PRIMITIVE_ALL_CURVE) {
+  if (type & PRIMITIVE_CURVE) {
    float throughput = payload.throughput;
    throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u);
    payload.throughput = throughput;
@@ -274,7 +274,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,

 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
                                            unsigned int object [[user_instance_id]],
                                            unsigned int primitive_id [[primitive_id]],
@@ -292,7 +292,7 @@ __anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_p

 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
 {
  /* unused function */
  BoundingBoxIntersectionResult result;
@@ -345,7 +345,7 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa

 [[intersection(triangle, triangle_data, METALRT_TAGS)]]
 TriangleIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
                                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
                                             unsigned int object [[user_instance_id]],
                                             unsigned int primitive_id [[primitive_id]])
@@ -362,7 +362,7 @@ __anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_

 [[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
 BoundingBoxIntersectionResult
-__anyhit__kernel_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
+__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
 {
  /* Unused function */
  BoundingBoxIntersectionResult result;
@@ -476,7 +476,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
  result.continue_search = true;
  result.distance = ray_tmax;

-  if (segment.type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+  if (segment.type & PRIMITIVE_CURVE_RIBBON) {
    metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
 #  if defined(__METALRT_MOTION__)
                               payload.time,
@@ -507,7 +507,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
  result.continue_search = true;
  result.distance = ray_tmax;

-  if (segment.type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+  if (segment.type & PRIMITIVE_CURVE_RIBBON) {
    metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
 #  if defined(__METALRT_MOTION__)
                               payload.time,
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -97,9 +97,9 @@ extern "C" __global__ void __miss__kernel_optix_miss()

 extern "C" __global__ void __anyhit__kernel_optix_local_hit()
 {
-#ifdef __HAIR__
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
  if (!optixIsTriangleHit()) {
-    /* Ignore curves. */
+    /* Ignore curves and points. */
    return optixIgnoreIntersection();
  }
 #endif
@@ -194,7 +194,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
    type = kernel_tex_fetch(__objects, object).primitive_type;
  }
 #  ifdef __HAIR__
-  else {
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
    u = __uint_as_float(optixGetAttribute_0());
    v = __uint_as_float(optixGetAttribute_1());

@@ -210,6 +210,11 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #    endif
  }
 #  endif
+  else {
+    type = kernel_tex_fetch(__objects, object).primitive_type;
+    u = 0.0f;
+    v = 0.0f;
+  }

 #  ifndef __TRANSPARENT_SHADOWS__
  /* No transparent shadows support compiled in, make opaque. */
@@ -229,7 +234,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
  }

  /* Always use baked shadow transparency for curves. */
-  if (type & PRIMITIVE_ALL_CURVE) {
+  if (type & PRIMITIVE_CURVE) {
    float throughput = __uint_as_float(optixGetPayload_1());
    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
    optixSetPayload_1(__float_as_uint(throughput));
@@ -291,7 +296,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()

 extern "C" __global__ void __anyhit__kernel_optix_volume_test()
 {
-#ifdef __HAIR__
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
  if (!optixIsTriangleHit()) {
-    /* Ignore curves. */
+    /* Ignore curves and points. */
    return optixIgnoreIntersection();
@@ -315,7 +320,7 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
 {
 #ifdef __HAIR__
 #  if OPTIX_ABI_VERSION < 55
-  if (!optixIsTriangleHit()) {
+  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
    /* Filter out curve endcaps. */
    const float u = __uint_as_float(optixGetAttribute_0());
    if (u == 0.0f || u == 1.0f) {
@@ -354,13 +359,19 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
    optixSetPayload_3(prim);
    optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
  }
-  else {
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
    optixSetPayload_2(optixGetAttribute_1());
    optixSetPayload_3(segment.prim);
    optixSetPayload_5(segment.type);
  }
+  else {
+    optixSetPayload_1(0);
+    optixSetPayload_2(0);
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
+  }
 }

 #ifdef __HAIR__
@@ -395,6 +406,7 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type)
    isect.t *= len;

  if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
    optixReportIntersection(isect.t / len,
                            type & PRIMITIVE_ALL,
                            __float_as_int(isect.u),  /* Attribute_0 */
@@ -407,8 +419,50 @@ extern "C" __global__ void __intersection__curve_ribbon()
  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
  const int prim = segment.prim;
  const int type = segment.type;
-  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+  if (type & PRIMITIVE_CURVE_RIBBON) {
    optix_intersection_curve(prim, type);
  }
 }
+
+#endif
+
+#ifdef __POINTCLOUD__
+extern "C" __global__ void __intersection__point()
+{
+  const int prim = optixGetPrimitiveIndex();
+  const int object = get_object_id();
+  const int type = kernel_tex_fetch(__objects, object).primitive_type;
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  float3 P = optixGetObjectRayOrigin();
+  float3 dir = optixGetObjectRayDirection();
+
+  /* The object-space ray direction is not normalized by default, while the point intersection
+   * routine expects a normalized one. The original length is kept in `len` so that distances
+   * can be converted between the two parameterizations below. */
+  float len;
+  dir = normalize_len(dir, &len);
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+  /* Transform maximum distance into object space. A value of FLT_MAX is kept as is rather
+   * than being scaled. */
+  if (isect.t != FLT_MAX) {
+    isect.t *= len;
+  }
+
+  if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
+  }
+}
 #endif
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -36,7 +36,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const S

 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
-  if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
+  if ((sd->type & PRIMITIVE_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
    return ATTR_PRIM_SUBD;
  }
  else {
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -205,14 +205,14 @@ ccl_device float curve_thickness(KernelGlobals kg, ccl_private const ShaderData
 {
  float r = 0.0f;

-  if (sd->type & PRIMITIVE_ALL_CURVE) {
+  if (sd->type & PRIMITIVE_CURVE) {
    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

    float4 P_curve[2];

-    if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
+    if (!(sd->type & PRIMITIVE_MOTION)) {
      P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
      P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
    }
@@ -249,7 +249,7 @@ ccl_device float3 curve_tangent_normal(KernelGlobals kg, ccl_private const Shade
 {
  float3 tgN = make_float3(0.0f, 0.0f, 0.0f);

-  if (sd->type & PRIMITIVE_ALL_CURVE) {
+  if (sd->type & PRIMITIVE_CURVE) {

    tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu, -sd->I) / len_squared(sd->dPdu)));
    tgN = normalize(tgN);
--- a/intern/cycles/kernel/geom/curve_intersect.h
+++ b/intern/cycles/kernel/geom/curve_intersect.h
@@ -635,7 +635,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
                                            float time,
                                            int type)
 {
-  const bool is_motion = (type & PRIMITIVE_ALL_MOTION);
+  const bool is_motion = (type & PRIMITIVE_MOTION);

  KernelCurve kcurve = kernel_tex_fetch(__curves, prim);

@@ -655,7 +655,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
    motion_curve_keys(kg, object, prim, time, ka, k0, k1, kb, curve);
  }

-  if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+  if (type & PRIMITIVE_CURVE_RIBBON) {
    /* todo: adaptive number of subdivisions could help performance here. */
    const int subdivisions = kernel_data.bvh.curve_subdivisions;
    if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
@@ -704,7 +704,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,

  float4 P_curve[4];

-  if (!(sd->type & PRIMITIVE_ALL_MOTION)) {
+  if (!(sd->type & PRIMITIVE_MOTION)) {
    P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
    P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
    P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
@@ -719,7 +719,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
  const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
  const float3 dPdu = float4_to_float3(dPdu4);

-  if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
+  if (sd->type & PRIMITIVE_CURVE_RIBBON) {
    /* Rounded smooth normals for ribbons, to approximate thick curve shape. */
    const float3 tangent = normalize(dPdu);
    const float3 bitangent = normalize(cross(tangent, -D));
@@ -727,8 +727,6 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
    const float cosine = safe_sqrtf(1.0f - sine * sine);

    sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
-    sd->Ng = -D;
-
 #  if 0
    /* This approximates the position and geometric normal of a thick curve too,
     * but gives too many issues with wrong self intersections. */
@@ -744,25 +742,27 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
    /* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
     * radius). In this case use the view direction to approximate the normal. */
    const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
-    const float3 Ng = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
+    const float3 N = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;

-    sd->N = Ng;
-    sd->Ng = Ng;
+    sd->N = N;
    sd->v = 0.0f;
  }

 #  ifdef __DPDU__
  /* dPdu/dPdv */
  sd->dPdu = dPdu;
-  sd->dPdv = cross(dPdu, sd->Ng);
 #  endif

+  /* Convert to world space. */
  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    const Transform tfm = object_get_transform(kg, sd);
-    P = transform_point(&tfm, P);
+    object_position_transform_auto(kg, sd, &P);
+    object_normal_transform_auto(kg, sd, &sd->N);
+    object_dir_transform_auto(kg, sd, &sd->dPdu);
  }

  sd->P = P;
+  sd->Ng = (sd->type & PRIMITIVE_CURVE_RIBBON) ? sd->I : sd->N;
+  sd->dPdv = cross(sd->dPdu, sd->Ng);
  sd->shader = kernel_tex_fetch(__curves, sd->prim).shader_id;
 }

--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -29,6 +29,9 @@
 #include "kernel/geom/motion_triangle_intersect.h"
 #include "kernel/geom/motion_triangle_shader.h"
 #include "kernel/geom/motion_curve.h"
+#include "kernel/geom/motion_point.h"
+#include "kernel/geom/point.h"
+#include "kernel/geom/point_intersect.h"
 #include "kernel/geom/curve.h"
 #include "kernel/geom/curve_intersect.h"
 #include "kernel/geom/volume.h"
--- a/intern/cycles/kernel/geom/motion_point.h
+++ b/intern/cycles/kernel/geom/motion_point.h
@@ -0,0 +1,74 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Motion Point Primitive
+ *
+ * These are stored as regular points, plus extra positions and radii at times
+ * other than the frame center. Computing the point at a given ray time is
+ * a matter of interpolation of the two steps between which the ray time lies.
+ *
+ * The extra points are stored as ATTR_STD_MOTION_VERTEX_POSITION.
+ */
+
+#ifdef __POINTCLOUD__
+
+/* Fetch the stored point data of primitive `prim` for a single motion step.
+ *
+ * `offset` is the start of the motion position attribute, `numkeys` the stride between
+ * consecutive steps in that attribute and `numsteps` the index of the center step. */
+ccl_device_inline float4
+motion_point_for_step(KernelGlobals kg, int offset, int numkeys, int numsteps, int step, int prim)
+{
+  /* The center step is the regular point, which lives in the point array. */
+  if (step == numsteps) {
+    return kernel_tex_fetch(__points, prim);
+  }
+
+  /* The attribute array does not contain the center step, so every step after it
+   * is stored one slot earlier. */
+  const int attr_step = (step > numsteps) ? step - 1 : step;
+
+  return kernel_tex_fetch(__attributes_float4, offset + attr_step * numkeys + prim);
+}
+
+/* Return the point of primitive `prim` at the given ray time, obtained by linearly
+ * interpolating the two motion steps between which `time` lies. */
+ccl_device_inline float4 motion_point(KernelGlobals kg, int object, int prim, float time)
+{
+  /* Query the motion step and key counts of the object. */
+  int numsteps, numkeys;
+  object_motion_info(kg, object, &numsteps, NULL, &numkeys);
+
+  /* Select the step to start from, limited so that `step + 1` is at most `maxstep`,
+   * and the interpolation weight towards the following step. */
+  const int maxstep = numsteps * 2;
+  const int step = min((int)(time * maxstep), maxstep - 1);
+  const float t = time * maxstep - step;
+
+  /* Motion positions are read through this attribute, which must exist. */
+  const int offset = intersection_find_attribute(kg, object, ATTR_STD_MOTION_VERTEX_POSITION);
+  kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+  /* Fetch both ends of the interval. */
+  const float4 point_begin = motion_point_for_step(kg, offset, numkeys, numsteps, step, prim);
+  const float4 point_end = motion_point_for_step(kg, offset, numkeys, numsteps, step + 1, prim);
+
+  /* Blend between the two steps. */
+  return (1.0f - t) * point_begin + t * point_end;
+}
+
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/geom/point.h
+++ b/intern/cycles/kernel/geom/point.h
@@ -0,0 +1,133 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Point Primitive
+ *
+ * Point primitive for rendering point clouds.
+ */
+
+#ifdef __POINTCLOUD__
+
+/* Reading attributes on various point elements */
+
+/* Read a float attribute for the point primitive referenced by `sd`.
+ *
+ * With ray differentials enabled, `dx` and `dy` are set to zero when they are non-null.
+ * Only ATTR_ELEMENT_VERTEX attributes are read, any other element returns zero. */
+ccl_device float point_attribute_float(KernelGlobals kg,
+                                       ccl_private const ShaderData *sd,
+                                       const AttributeDescriptor desc,
+                                       ccl_private float *dx,
+                                       ccl_private float *dy)
+{
+#  ifdef __RAY_DIFFERENTIALS__
+  if (dx) {
+    *dx = 0.0f;
+  }
+  if (dy) {
+    *dy = 0.0f;
+  }
+#  endif
+
+  if (desc.element != ATTR_ELEMENT_VERTEX) {
+    return 0.0f;
+  }
+
+  /* One value per point, indexed by the primitive. */
+  return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
+}
+
+/* Read a float2 attribute for the point primitive referenced by `sd`.
+ *
+ * With ray differentials enabled, `dx` and `dy` are set to zero when they are non-null.
+ * Only ATTR_ELEMENT_VERTEX attributes are read, any other element returns zero. */
+ccl_device float2 point_attribute_float2(KernelGlobals kg,
+                                         ccl_private const ShaderData *sd,
+                                         const AttributeDescriptor desc,
+                                         ccl_private float2 *dx,
+                                         ccl_private float2 *dy)
+{
+#  ifdef __RAY_DIFFERENTIALS__
+  if (dx) {
+    *dx = make_float2(0.0f, 0.0f);
+  }
+  if (dy) {
+    *dy = make_float2(0.0f, 0.0f);
+  }
+#  endif
+
+  if (desc.element != ATTR_ELEMENT_VERTEX) {
+    return make_float2(0.0f, 0.0f);
+  }
+
+  /* One value per point, indexed by the primitive. */
+  return kernel_tex_fetch(__attributes_float2, desc.offset + sd->prim);
+}
+
+/* Read a float3 attribute for the point primitive referenced by `sd`.
+ *
+ * With ray differentials enabled, `dx` and `dy` are set to zero when they are non-null.
+ * Only ATTR_ELEMENT_VERTEX attributes are read, any other element returns zero. */
+ccl_device float3 point_attribute_float3(KernelGlobals kg,
+                                         ccl_private const ShaderData *sd,
+                                         const AttributeDescriptor desc,
+                                         ccl_private float3 *dx,
+                                         ccl_private float3 *dy)
+{
+#  ifdef __RAY_DIFFERENTIALS__
+  if (dx) {
+    *dx = make_float3(0.0f, 0.0f, 0.0f);
+  }
+  if (dy) {
+    *dy = make_float3(0.0f, 0.0f, 0.0f);
+  }
+#  endif
+
+  if (desc.element != ATTR_ELEMENT_VERTEX) {
+    return make_float3(0.0f, 0.0f, 0.0f);
+  }
+
+  /* NOTE(review): the value is fetched from the float4 attribute table and truncated to
+   * three components; confirm this matches how float3 attributes are stored. */
+  return float4_to_float3(kernel_tex_fetch(__attributes_float4, desc.offset + sd->prim));
+}
+
+/* Read a float4 attribute for the point primitive referenced by `sd`.
+ *
+ * With ray differentials enabled, `dx` and `dy` are set to zero when they are non-null.
+ * Only ATTR_ELEMENT_VERTEX attributes are read, any other element returns zero. */
+ccl_device float4 point_attribute_float4(KernelGlobals kg,
+                                         ccl_private const ShaderData *sd,
+                                         const AttributeDescriptor desc,
+                                         ccl_private float4 *dx,
+                                         ccl_private float4 *dy)
+{
+#  ifdef __RAY_DIFFERENTIALS__
+  if (dx) {
+    *dx = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  }
+  if (dy) {
+    *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  }
+#  endif
+
+  if (desc.element != ATTR_ELEMENT_VERTEX) {
+    return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+  }
+
+  /* One value per point, indexed by the primitive. */
+  return kernel_tex_fetch(__attributes_float4, desc.offset + sd->prim);
+}
+
+/* Radius of the point being shaded, read from the w component of its entry in the point
+ * array. Returns zero when the shader data is not flagged as a point primitive. */
+
+ccl_device float point_radius(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (!(sd->type & PRIMITIVE_POINT)) {
+    return 0.0f;
+  }
+
+  return kernel_tex_fetch(__points, sd->prim).w;
+}
+
+/* Point location for the motion pass: the position stored in the regular point array,
+ * with the radius in the w component dropped. No interpolation is done here. */
+
+ccl_device float3 point_motion_center_location(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  const float4 point = kernel_tex_fetch(__points, sd->prim);
+  return float4_to_float3(point);
+}
+
+#endif /* __POINTCLOUD__ */
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/geom/point_intersect.h
+++ b/intern/cycles/kernel/geom/point_intersect.h
@@ -0,0 +1,128 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Based on Embree code, copyright 2009-2020 Intel Corporation.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Point primitive intersection functions. */
+
+#ifdef __POINTCLOUD__
+
+/* Intersect a ray with a single point, treated as a sphere.
+ *
+ * `point` holds the center in xyz and the radius in w. The ray starts at `P` and travels
+ * along `dir`. When the front side of the sphere is hit at a distance within [0, tmax],
+ * that distance is written to `t` and true is returned. Otherwise `t` is left untouched
+ * and false is returned. */
+ccl_device_forceinline bool point_intersect_test(
+    const float4 point, const float3 P, const float3 dir, const float tmax, ccl_private float *t)
+{
+  const float3 center = float4_to_float3(point);
+  const float radius = point.w;
+
+  /* Reciprocal of the squared direction length. */
+  const float inv_dir_len2 = 1.0f / dot(dir, dir);
+
+  /* Ray parameter of the position on the ray closest to the center. */
+  const float3 to_center = center - P;
+  const float t_closest = dot(to_center, dir) * inv_dir_len2;
+
+  /* Compare the squared distance between center and ray against the squared radius.
+   * Written as a negated `<=` so that a NaN on either side also counts as a miss. */
+  const float3 offset = to_center - t_closest * dir;
+  const float dist2 = dot(offset, offset);
+  const float radius2 = radius * radius;
+  if (!(dist2 <= radius2)) {
+    return false;
+  }
+
+  /* Half the span of the ray inside the sphere, in ray parameter units. */
+  const float t_half = sqrt((radius2 - dist2) * inv_dir_len2);
+  const float t_front = t_closest - t_half;
+  const bool valid_front = (0.0f <= t_front) & (t_front <= tmax);
+
+  /* Always back-face culling for now. */
+#  if 0
+  const float t_back = t_closest + t_half;
+  const bool valid_back = (0.0f <= t_back) & (t_back <= tmax);
+
+  /* check if there is a first hit */
+  const bool valid_first = valid_front | valid_back;
+  if (!valid_first) {
+    return false;
+  }
+
+  *t = (valid_front) ? t_front : t_back;
+  return true;
+#  else
+  if (valid_front) {
+    *t = t_front;
+    return true;
+  }
+  return false;
+#  endif
+}
+
+/* Intersect a ray with point primitive `prim` of `object`.
+ *
+ * For primitives with the PRIMITIVE_MOTION bit set, the point is evaluated at `time`;
+ * otherwise it is read directly from the point array. On a hit, `isect` receives the
+ * distance, primitive, object and type, with u and v set to zero, and true is returned. */
+ccl_device_forceinline bool point_intersect(KernelGlobals kg,
+                                            ccl_private Intersection *isect,
+                                            const float3 P,
+                                            const float3 dir,
+                                            const float tmax,
+                                            const int object,
+                                            const int prim,
+                                            const float time,
+                                            const int type)
+{
+  const float4 point = (type & PRIMITIVE_MOTION) ? motion_point(kg, object, prim, time) :
+                                                   kernel_tex_fetch(__points, prim);
+
+  if (point_intersect_test(point, P, dir, tmax, &isect->t)) {
+    /* The distance was already written by the test, fill in the remaining fields. */
+    isect->prim = prim;
+    isect->object = object;
+    isect->type = type;
+    isect->u = 0.0f;
+    isect->v = 0.0f;
+    return true;
+  }
+
+  return false;
+}
+
+/* Fill in shader data for a ray that hit a point primitive.
+ *
+ * Sets the shader, hit position, texture coordinates (when enabled), normals and zeroed
+ * dPdu/dPdv (when enabled).
+ * NOTE(review): the point center is looked up through sd->object, sd->prim and sd->time,
+ * so this presumably expects the caller to have filled those in already; confirm. */
+ccl_device_inline void point_shader_setup(KernelGlobals kg,
+                                          ccl_private ShaderData *sd,
+                                          ccl_private const Intersection *isect,
+                                          ccl_private const Ray *ray)
+{
+  sd->shader = kernel_tex_fetch(__points_shader, isect->prim);
+  sd->P = ray->P + ray->D * isect->t;
+
+#  ifdef __UV__
+  /* Texture coordinates are taken over from the intersection as is. */
+  sd->u = isect->u;
+  sd->v = isect->v;
+#  endif
+
+  /* Point center, evaluated at the shading time for motion primitives. */
+  const bool is_motion = (isect->type & PRIMITIVE_MOTION);
+  const float4 point = is_motion ? motion_point(kg, sd->object, sd->prim, sd->time) :
+                                   kernel_tex_fetch(__points, sd->prim);
+  float3 center = float4_to_float3(point);
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    object_position_transform_auto(kg, sd, &center);
+  }
+
+  /* The normal points from the center towards the hit position, and is used as both the
+   * geometric and the shading normal. */
+  const float3 normal = normalize(sd->P - center);
+  sd->Ng = normal;
+  sd->N = normal;
+
+#  ifdef __DPDU__
+  /* No surface parameterization is computed for points. */
+  const float3 zero = make_float3(0.0f, 0.0f, 0.0f);
+  sd->dPdu = zero;
+  sd->dPdv = zero;
+#  endif
+}
+
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -37,16 +37,21 @@ ccl_device_inline float primitive_surface_attribute_float(KernelGlobals kg,
                                                          ccl_private float *dx,
                                                          ccl_private float *dy)
 {
-  if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+  if (sd->type & PRIMITIVE_TRIANGLE) {
    if (subd_triangle_patch(kg, sd) == ~0)
      return triangle_attribute_float(kg, sd, desc, dx, dy);
    else
      return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
  }
 #ifdef __HAIR__
-  else if (sd->type & PRIMITIVE_ALL_CURVE) {
+  else if (sd->type & PRIMITIVE_CURVE) {
    return curve_attribute_float(kg, sd, desc, dx, dy);
  }
+#endif
+#ifdef __POINTCLOUD__
+  else if (sd->type & PRIMITIVE_POINT) {
+    return point_attribute_float(kg, sd, desc, dx, dy);
+  }
 #endif
  else {
    if (dx)
@@ -63,16 +68,21 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals kg,
                                                            ccl_private float2 *dx,
                                                            ccl_private float2 *dy)
 {
-  if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+  if (sd->type & PRIMITIVE_TRIANGLE) {
    if (subd_triangle_patch(kg, sd) == ~0)
      return triangle_attribute_float2(kg, sd, desc, dx, dy);
    else
      return subd_triangle_attribute_float2(kg, sd, desc, dx, dy);
  }
 #ifdef __HAIR__
-  else if (sd->type & PRIMITIVE_ALL_CURVE) {
+  else if (sd->type & PRIMITIVE_CURVE) {
    return curve_attribute_float2(kg, sd, desc, dx, dy);
  }
+#endif
+#ifdef __POINTCLOUD__
+  else if (sd->type & PRIMITIVE_POINT) {
+    return point_attribute_float2(kg, sd, desc, dx, dy);
+  }
 #endif
  else {
    if (dx)
@@ -89,16 +99,21 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals kg,
                                                            ccl_private float3 *dx,
                                                            ccl_private float3 *dy)
 {
-  if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+  if (sd->type & PRIMITIVE_TRIANGLE) {
    if (subd_triangle_patch(kg, sd) == ~0)
      return triangle_attribute_float3(kg, sd, desc, dx, dy);
    else
      return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
  }
 #ifdef __HAIR__
-  else if (sd->type & PRIMITIVE_ALL_CURVE) {
+  else if (sd->type & PRIMITIVE_CURVE) {
    return curve_attribute_float3(kg, sd, desc, dx, dy);
  }
+#endif
+#ifdef __POINTCLOUD__
+  else if (sd->type & PRIMITIVE_POINT) {
+    return point_attribute_float3(kg, sd, desc, dx, dy);
+  }
 #endif
  else {
    if (dx)
@@ -115,16 +130,21 @@ ccl_device_forceinline float4 primitive_surface_attribute_float4(KernelGlobals k
                                                                 ccl_private float4 *dx,
                                                                 ccl_private float4 *dy)
 {
-  if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+  if (sd->type & PRIMITIVE_TRIANGLE) {
    if (subd_triangle_patch(kg, sd) == ~0)
      return triangle_attribute_float4(kg, sd, desc, dx, dy);
    else
      return subd_triangle_attribute_float4(kg, sd, desc, dx, dy);
  }
 #ifdef __HAIR__
-  else if (sd->type & PRIMITIVE_ALL_CURVE) {
+  else if (sd->type & PRIMITIVE_CURVE) {
    return curve_attribute_float4(kg, sd, desc, dx, dy);
  }
+#endif
+#ifdef __POINTCLOUD__
+  else if (sd->type & PRIMITIVE_POINT) {
+    return point_attribute_float4(kg, sd, desc, dx, dy);
+  }
 #endif
  else {
    if (dx)
@@ -225,8 +245,8 @@ ccl_device bool primitive_ptex(KernelGlobals kg,

 ccl_device float3 primitive_tangent(KernelGlobals kg, ccl_private ShaderData *sd)
 {
-#ifdef __HAIR__
-  if (sd->type & PRIMITIVE_ALL_CURVE)
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (sd->type & (PRIMITIVE_CURVE | PRIMITIVE_POINT))
 #  ifdef __DPDU__
    return normalize(sd->dPdu);
 #  else
@@ -261,10 +281,21 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
  /* center position */
  float3 center;

-#ifdef __HAIR__
-  bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
-  if (is_curve_primitive) {
-    center = curve_motion_center_location(kg, sd);
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  bool is_curve_or_point = sd->type & (PRIMITIVE_CURVE | PRIMITIVE_POINT);
+  if (is_curve_or_point) {
+    center = make_float3(0.0f, 0.0f, 0.0f);
+
+    if (sd->type & PRIMITIVE_CURVE) {
+#  if defined(__HAIR__)
+      center = curve_motion_center_location(kg, sd);
+#  endif
+    }
+    else if (sd->type & PRIMITIVE_POINT) {
+#  if defined(__POINTCLOUD__)
+      center = point_motion_center_location(kg, sd);
+#  endif
+    }

    if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
      object_position_transform(kg, sd, &center);
@@ -272,7 +303,9 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
  }
  else
 #endif
+  {
    center = sd->P;
+  }

  float3 motion_pre = center, motion_post = center;

@@ -284,8 +317,8 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
    int numverts, numkeys;
    object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);

-#ifdef __HAIR__
-    if (is_curve_primitive) {
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+    if (is_curve_or_point) {
      motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
      desc.offset += numkeys;
      motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
@@ -298,7 +331,7 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
    }
    else
 #endif
-        if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+        if (sd->type & PRIMITIVE_TRIANGLE) {
      /* Triangle */
      if (subd_triangle_patch(kg, sd) == ~0) {
        motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@@ -69,49 +69,58 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
  sd->I = -ray->D;

 #ifdef __HAIR__
-  if (sd->type & PRIMITIVE_ALL_CURVE) {
+  if (sd->type & PRIMITIVE_CURVE) {
    /* curve */
    curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
  }
  else
 #endif
-      if (sd->type & PRIMITIVE_TRIANGLE) {
-    /* static triangle */
-    float3 Ng = triangle_normal(kg, sd);
-    sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+#ifdef __POINTCLOUD__
+      if (sd->type & PRIMITIVE_POINT) {
+    /* point */
+    point_shader_setup(kg, sd, isect, ray);
+  }
+  else
+#endif
+  {
+    if (sd->type == PRIMITIVE_TRIANGLE) {
+      /* static triangle */
+      float3 Ng = triangle_normal(kg, sd);
+      sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);

-    /* vectors */
-    sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
-    sd->Ng = Ng;
-    sd->N = Ng;
+      /* vectors */
+      sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+      sd->Ng = Ng;
+      sd->N = Ng;

-    /* smooth normal */
-    if (sd->shader & SHADER_SMOOTH_NORMAL)
-      sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+      /* smooth normal */
+      if (sd->shader & SHADER_SMOOTH_NORMAL)
+        sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);

 #ifdef __DPDU__
-    /* dPdu/dPdv */
-    triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+      /* dPdu/dPdv */
+      triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
-  }
-  else {
-    /* motion triangle */
-    motion_triangle_shader_setup(
-        kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
-  }
+    }
+    else {
+      /* motion triangle */
+      motion_triangle_shader_setup(
+          kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
+    }

-  sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
-  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-    /* instance transform */
-    object_normal_transform_auto(kg, sd, &sd->N);
-    object_normal_transform_auto(kg, sd, &sd->Ng);
+    if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+      /* instance transform */
+      object_normal_transform_auto(kg, sd, &sd->N);
+      object_normal_transform_auto(kg, sd, &sd->Ng);
 #ifdef __DPDU__
-    object_dir_transform_auto(kg, sd, &sd->dPdu);
-    object_dir_transform_auto(kg, sd, &sd->dPdv);
+      object_dir_transform_auto(kg, sd, &sd->dPdu);
+      object_dir_transform_auto(kg, sd, &sd->dPdv);
 #endif
+    }
  }

+  sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+
  /* backfacing test */
  bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);

@@ -194,7 +203,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals kg,
    object_dir_transform_auto(kg, sd, &sd->I);
  }

-  if (sd->type & PRIMITIVE_TRIANGLE) {
+  if (sd->type == PRIMITIVE_TRIANGLE) {
    /* smooth normal */
    if (sd->shader & SHADER_SMOOTH_NORMAL) {
      sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -70,6 +70,9 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);

+  INTEGRATOR_STATE_WRITE(state, isect, object) = OBJECT_NONE;
+  INTEGRATOR_STATE_WRITE(state, isect, prim) = PRIM_NONE;
+
  if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
    INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, 0, object) = OBJECT_NONE;
    INTEGRATOR_STATE_ARRAY_WRITE(
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -82,7 +82,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,

 #  ifdef __HAIR__
  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
-      (sd->type & PRIMITIVE_ALL_TRIANGLE))
+      (sd->type & PRIMITIVE_TRIANGLE))
 #  else
  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #  endif
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -191,7 +191,7 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
  float3 Ng = (transmit ? -sd->Ng : sd->Ng);
  float3 P = ray_offset(sd->P, Ng);

-  if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
+  if ((sd->type & PRIMITIVE_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
    const float offset_cutoff =
        kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
    /* Do ray offset (heavy stuff) only for close to be terminated triangles:
@@ -200,6 +200,9 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
    if (offset_cutoff > 0.0f) {
      float NgL = dot(Ng, L);
      float offset_amount = 0.0f;
+      if (NL < 0) {
+        NL = -NL;
+      }
      if (NL < offset_cutoff) {
        offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
      }
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -28,6 +28,7 @@
 #include "scene/colorspace.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/pointcloud.h"
 #include "scene/scene.h"

 #include "kernel/osl/closures.h"
@@ -113,6 +114,8 @@ ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness");
 ustring OSLRenderServices::u_curve_length("geom:curve_length");
 ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 ustring OSLRenderServices::u_curve_random("geom:curve_random");
+ustring OSLRenderServices::u_is_point("geom:is_point");
+ustring OSLRenderServices::u_point_radius("geom:point_radius");
 ustring OSLRenderServices::u_normal_map_normal("geom:normal_map_normal");
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
 ustring OSLRenderServices::u_path_ray_depth("path:ray_depth");
@@ -957,13 +960,15 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
    return set_attribute_int(3, type, derivatives, val);
  }
  else if ((name == u_geom_trianglevertices || name == u_geom_polyvertices) &&
-           sd->type & PRIMITIVE_ALL_TRIANGLE) {
+           sd->type & PRIMITIVE_TRIANGLE) {
    float3 P[3];

-    if (sd->type & PRIMITIVE_TRIANGLE)
-      triangle_vertices(kg, sd->prim, P);
-    else
+    if (sd->type & PRIMITIVE_MOTION) {
      motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, P);
+    }
+    else {
+      triangle_vertices(kg, sd->prim, P);
+    }

    if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
      object_position_transform(kg, sd, &P[0]);
@@ -983,7 +988,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
  }
  /* Hair Attributes */
  else if (name == u_is_curve) {
-    float f = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
+    float f = (sd->type & PRIMITIVE_CURVE) != 0;
    return set_attribute_float(f, type, derivatives, val);
  }
  else if (name == u_curve_thickness) {
@@ -994,8 +999,17 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
    float3 f = curve_tangent_normal(kg, sd);
    return set_attribute_float3(f, type, derivatives, val);
  }
+  /* point attributes */
+  else if (name == u_is_point) {
+    float f = (sd->type & PRIMITIVE_POINT) != 0;
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == u_point_radius) {
+    float f = point_radius(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
  else if (name == u_normal_map_normal) {
-    if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+    if (sd->type & PRIMITIVE_TRIANGLE) {
      float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
      return set_attribute_float3(f, type, derivatives, val);
    }
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -297,6 +297,8 @@ class OSLRenderServices : public OSL::RendererServices {
  static ustring u_curve_length;
  static ustring u_curve_tangent_normal;
  static ustring u_curve_random;
+  static ustring u_is_point;
+  static ustring u_point_radius;
  static ustring u_normal_map_normal;
  static ustring u_path_ray_length;
  static ustring u_path_ray_depth;
--- a/intern/cycles/kernel/osl/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
@@ -92,6 +92,7 @@ set(SRC_OSL
  node_value.osl
  node_vector_curves.osl
  node_vector_math.osl
+  node_vector_map_range.osl
  node_vector_rotate.osl
  node_vector_transform.osl
  node_velvet_bsdf.osl
--- a/Show More
+++ b/Show More