Fix missing new line

Make mixed vec4 type constructor works with different vector types
Add constructor tests
2022-01-12 12:10:14 +01:00 · 2022-01-11 21:08:49 +01:00 · 2022-01-11 21:06:11 +01:00 · 2022-01-11 21:02:09 +01:00 · 2022-01-11 19:20:11 +01:00 · 2022-01-11 17:11:04 +01:00
1054 changed files with 11469 additions and 13689 deletions
--- a/build_files/cmake/Modules/FindOptiX.cmake
+++ b/build_files/cmake/Modules/FindOptiX.cmake
@@ -21,7 +21,7 @@ ENDIF()

 SET(_optix_SEARCH_DIRS
  ${OPTIX_ROOT_DIR}
-  "$ENV{PROGRAMDATA}/NVIDIA Corporation/OptiX SDK 7.0.0"
+  "$ENV{PROGRAMDATA}/NVIDIA Corporation/OptiX SDK 7.3.0"
 )

 FIND_PATH(OPTIX_INCLUDE_DIR
--- a/build_files/cmake/cmake_consistency_check.py
+++ b/build_files/cmake/cmake_consistency_check.py
@@ -114,7 +114,7 @@ def is_c_header(filename: str) -> bool:

 def is_c(filename: str) -> bool:
    ext = splitext(filename)[1]
-    return (ext in {".c", ".cpp", ".cxx", ".m", ".mm", ".rc", ".cc", ".inl"})
+    return (ext in {".c", ".cpp", ".cxx", ".m", ".mm", ".rc", ".cc", ".inl", ".metal"})


 def is_c_any(filename: str) -> bool:
--- a/build_files/cmake/platform/platform_apple_xcode.cmake
+++ b/build_files/cmake/platform/platform_apple_xcode.cmake
@@ -96,7 +96,7 @@ else()
    # Detect SDK version to use.
    if(NOT DEFINED OSX_SYSTEM)
      execute_process(
-          COMMAND xcrun --show-sdk-version
+          COMMAND xcrun --sdk macosx --show-sdk-version
          OUTPUT_VARIABLE OSX_SYSTEM
          OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()
--- a/doc/doxygen/doxygen.intern.h
+++ b/doc/doxygen/doxygen.intern.h
@@ -51,9 +51,6 @@
 /** \defgroup intern_mikktspace MikktSpace
 *  \ingroup intern */

-/** \defgroup intern_numaapi NUMA (Non Uniform Memory Architecture)
- *  \ingroup intern */
-
 /** \defgroup intern_rigidbody Rigid-Body C-API
 *  \ingroup intern */

--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -25,7 +25,6 @@ add_subdirectory(ghost)
 add_subdirectory(guardedalloc)
 add_subdirectory(libmv)
 add_subdirectory(memutil)
-add_subdirectory(numaapi)
 add_subdirectory(opencolorio)
 add_subdirectory(opensubdiv)
 add_subdirectory(mikktspace)
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -82,7 +82,7 @@ static void session_print_status()
  string status, substatus;

  /* get status */
-  float progress = options.session->progress.get_progress();
+  double progress = options.session->progress.get_progress();
  options.session->progress.get_status(status, substatus);

  if (substatus != "")
@@ -183,7 +183,7 @@ static void display_info(Progress &progress)

  progress.get_time(total_time, sample_time);
  progress.get_status(status, substatus);
-  float progress_val = progress.get_progress();
+  double progress_val = progress.get_progress();

  if (substatus != "")
    status += ": " + substatus;
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -60,9 +60,8 @@ def init():

    path = os.path.dirname(__file__)
    user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path='')))
-    temp_path = bpy.app.tempdir

-    _cycles.init(path, user_path, temp_path, bpy.app.background)
+    _cycles.init(path, user_path, bpy.app.background)
    _parse_command_line()


--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -802,7 +802,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        name="Tile Size",
        default=2048,
        description="",
-        min=8, max=16384,
+        min=8, max=8192,
    )

    # Various fine-tuning debug flags
--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
@@ -272,12 +272,300 @@ uint BlenderDisplaySpaceShader::get_shader_program()
  return shader_program_;
 }

+/* --------------------------------------------------------------------
+ * DrawTile.
+ */
+
+/* Higher level representation of a texture from the graphics library. */
+class GLTexture {
+ public:
+  /* Global counter for all allocated OpenGL textures used by instances of this class. */
+  static inline std::atomic<int> num_used = 0;
+
+  GLTexture() = default;
+
+  ~GLTexture()
+  {
+    assert(gl_id == 0);
+  }
+
+  GLTexture(const GLTexture &other) = delete;
+  GLTexture &operator=(GLTexture &other) = delete;
+
+  GLTexture(GLTexture &&other) noexcept
+      : gl_id(other.gl_id), width(other.width), height(other.height)
+  {
+    other.reset();
+  }
+
+  GLTexture &operator=(GLTexture &&other)
+  {
+    if (this == &other) {
+      return *this;
+    }
+
+    gl_id = other.gl_id;
+    width = other.width;
+    height = other.height;
+
+    other.reset();
+
+    return *this;
+  }
+
+  bool gl_resources_ensure()
+  {
+    if (gl_id) {
+      return true;
+    }
+
+    /* Create texture. */
+    glGenTextures(1, &gl_id);
+    if (!gl_id) {
+      LOG(ERROR) << "Error creating texture.";
+      return false;
+    }
+
+    /* Configure the texture. */
+    glActiveTexture(GL_TEXTURE0);
+    glBindTexture(GL_TEXTURE_2D, gl_id);
+
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+    /* Clamp to edge so that precision issues when zoomed out (which forces linear interpolation)
+     * does not cause unwanted repetition. */
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+    glBindTexture(GL_TEXTURE_2D, 0);
+
+    ++num_used;
+
+    return true;
+  }
+
+  void gl_resources_destroy()
+  {
+    if (!gl_id) {
+      return;
+    }
+
+    glDeleteTextures(1, &gl_id);
+
+    reset();
+
+    --num_used;
+  }
+
+  /* OpenGL resource IDs of the texture.
+   *
+   * NOTE: Allocated on the render engine's context. */
+  uint gl_id = 0;
+
+  /* Dimensions of the texture in pixels. */
+  int width = 0;
+  int height = 0;
+
+ protected:
+  void reset()
+  {
+    gl_id = 0;
+    width = 0;
+    height = 0;
+  }
+};
+
+/* Higher level representation of a Pixel Buffer Object (PBO) from the graphics library. */
+class GLPixelBufferObject {
+ public:
+  /* Global counter for all allocated OpenGL PBOs used by instances of this class. */
+  static inline std::atomic<int> num_used = 0;
+
+  GLPixelBufferObject() = default;
+
+  ~GLPixelBufferObject()
+  {
+    assert(gl_id == 0);
+  }
+
+  GLPixelBufferObject(const GLPixelBufferObject &other) = delete;
+  GLPixelBufferObject &operator=(GLPixelBufferObject &other) = delete;
+
+  GLPixelBufferObject(GLPixelBufferObject &&other) noexcept
+      : gl_id(other.gl_id), width(other.width), height(other.height)
+  {
+    other.reset();
+  }
+
+  GLPixelBufferObject &operator=(GLPixelBufferObject &&other)
+  {
+    if (this == &other) {
+      return *this;
+    }
+
+    gl_id = other.gl_id;
+    width = other.width;
+    height = other.height;
+
+    other.reset();
+
+    return *this;
+  }
+
+  bool gl_resources_ensure()
+  {
+    if (gl_id) {
+      return true;
+    }
+
+    glGenBuffers(1, &gl_id);
+    if (!gl_id) {
+      LOG(ERROR) << "Error creating texture pixel buffer object.";
+      return false;
+    }
+
+    ++num_used;
+
+    return true;
+  }
+
+  void gl_resources_destroy()
+  {
+    if (!gl_id) {
+      return;
+    }
+
+    glDeleteBuffers(1, &gl_id);
+
+    reset();
+
+    --num_used;
+  }
+
+  /* OpenGL resource IDs of the PBO.
+   *
+   * NOTE: Allocated on the render engine's context. */
+  uint gl_id = 0;
+
+  /* Dimensions of the PBO. */
+  int width = 0;
+  int height = 0;
+
+ protected:
+  void reset()
+  {
+    gl_id = 0;
+    width = 0;
+    height = 0;
+  }
+};
+
+class DrawTile {
+ public:
+  DrawTile() = default;
+  ~DrawTile() = default;
+
+  DrawTile(const DrawTile &other) = delete;
+  DrawTile &operator=(const DrawTile &other) = delete;
+
+  DrawTile(DrawTile &&other) noexcept = default;
+
+  DrawTile &operator=(DrawTile &&other) = default;
+
+  bool gl_resources_ensure()
+  {
+    if (!texture.gl_resources_ensure()) {
+      gl_resources_destroy();
+      return false;
+    }
+
+    if (!gl_vertex_buffer) {
+      glGenBuffers(1, &gl_vertex_buffer);
+      if (!gl_vertex_buffer) {
+        LOG(ERROR) << "Error allocating tile VBO.";
+        gl_resources_destroy();
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  void gl_resources_destroy()
+  {
+    texture.gl_resources_destroy();
+
+    if (gl_vertex_buffer) {
+      glDeleteBuffers(1, &gl_vertex_buffer);
+      gl_vertex_buffer = 0;
+    }
+  }
+
+  inline bool ready_to_draw() const
+  {
+    return texture.gl_id != 0;
+  }
+
+  /* Texture which contains pixels of the tile. */
+  GLTexture texture;
+
+  /* Display parameters the texture of this tile has been updated for. */
+  BlenderDisplayDriver::Params params;
+
+  /* OpenGL resources needed for drawing. */
+  uint gl_vertex_buffer = 0;
+};
+
+class DrawTileAndPBO {
+ public:
+  bool gl_resources_ensure()
+  {
+    if (!tile.gl_resources_ensure() || !buffer_object.gl_resources_ensure()) {
+      gl_resources_destroy();
+      return false;
+    }
+
+    return true;
+  }
+
+  void gl_resources_destroy()
+  {
+    tile.gl_resources_destroy();
+    buffer_object.gl_resources_destroy();
+  }
+
+  DrawTile tile;
+  GLPixelBufferObject buffer_object;
+};
+
 /* --------------------------------------------------------------------
 * BlenderDisplayDriver.
 */

+struct BlenderDisplayDriver::Tiles {
+  /* Resources of a tile which is being currently rendered. */
+  DrawTileAndPBO current_tile;
+
+  /* All tiles which rendering is finished and which content will not be changed. */
+  struct {
+    vector<DrawTile> tiles;
+
+    void gl_resources_destroy_and_clear()
+    {
+      for (DrawTile &tile : tiles) {
+        tile.gl_resources_destroy();
+      }
+
+      tiles.clear();
+    }
+  } finished_tiles;
+};
+
 BlenderDisplayDriver::BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene)
-    : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
+    : b_engine_(b_engine),
+      display_shader_(BlenderDisplayShader::create(b_engine, b_scene)),
+      tiles_(make_unique<Tiles>())
 {
  /* Create context while on the main thread. */
  gl_context_create();
@@ -292,6 +580,21 @@ BlenderDisplayDriver::~BlenderDisplayDriver()
 * Update procedure.
 */

+void BlenderDisplayDriver::next_tile_begin()
+{
+  if (!tiles_->current_tile.tile.ready_to_draw()) {
+    LOG(ERROR)
+        << "Unexpectedly moving to the next tile without any data provided for current tile.";
+    return;
+  }
+
+  /* Moving to the next tile without giving render data for the current tile is not an expected
+   * situation. */
+  DCHECK(!need_clear_);
+
+  tiles_->finished_tiles.tiles.emplace_back(std::move(tiles_->current_tile.tile));
+}
+
 bool BlenderDisplayDriver::update_begin(const Params &params,
                                        int texture_width,
                                        int texture_height)
@@ -312,24 +615,33 @@ bool BlenderDisplayDriver::update_begin(const Params &params,
    glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
  }

-  if (!gl_texture_resources_ensure()) {
+  DrawTile &current_tile = tiles_->current_tile.tile;
+  GLPixelBufferObject &current_tile_buffer_object = tiles_->current_tile.buffer_object;
+
+  /* Clear storage of all finished tiles when display clear is requested.
+   * Do it when new tile data is provided to handle the display clear flag in a single place.
+   * It also makes the logic reliable from the whether drawing did happen or not point of view. */
+  if (need_clear_) {
+    tiles_->finished_tiles.gl_resources_destroy_and_clear();
+    need_clear_ = false;
+  }
+
+  if (!tiles_->current_tile.gl_resources_ensure()) {
+    tiles_->current_tile.gl_resources_destroy();
    gl_context_disable();
    return false;
  }

  /* Update texture dimensions if needed. */
-  if (texture_.width != texture_width || texture_.height != texture_height) {
+  if (current_tile.texture.width != texture_width ||
+      current_tile.texture.height != texture_height) {
    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+    glBindTexture(GL_TEXTURE_2D, current_tile.texture.gl_id);
    glTexImage2D(
        GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0);
-    texture_.width = texture_width;
-    texture_.height = texture_height;
+    current_tile.texture.width = texture_width;
+    current_tile.texture.height = texture_height;
    glBindTexture(GL_TEXTURE_2D, 0);
-
-    /* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to
-     * avoid undefined content. */
-    texture_.need_clear = true;
  }

  /* Update PBO dimensions if needed.
@@ -341,29 +653,58 @@ bool BlenderDisplayDriver::update_begin(const Params &params,
   * sending too much data to GPU when resolution divider is not 1. */
  /* TODO(sergey): Investigate whether keeping the PBO exact size of the texture makes non-interop
   * mode faster. */
-  const int buffer_width = params.full_size.x;
-  const int buffer_height = params.full_size.y;
-  if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
+  const int buffer_width = params.size.x;
+  const int buffer_height = params.size.y;
+  if (current_tile_buffer_object.width != buffer_width ||
+      current_tile_buffer_object.height != buffer_height) {
    const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, current_tile_buffer_object.gl_id);
    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

-    texture_.buffer_width = buffer_width;
-    texture_.buffer_height = buffer_height;
+    current_tile_buffer_object.width = buffer_width;
+    current_tile_buffer_object.height = buffer_height;
  }

-  /* New content will be provided to the texture in one way or another, so mark this in a
-   * centralized place. */
-  texture_.need_update = true;
-
-  texture_.params = params;
+  /* Store an updated parameters of the current tile.
+   * In theory it is only needed once per update of the tile, but doing it on every update is
+   * the easiest and is not expensive. */
+  tiles_->current_tile.tile.params = params;

  return true;
 }

+static void update_tile_texture_pixels(const DrawTileAndPBO &tile)
+{
+  const GLTexture &texture = tile.tile.texture;
+
+  DCHECK_NE(tile.buffer_object.gl_id, 0);
+
+  glActiveTexture(GL_TEXTURE0);
+  glBindTexture(GL_TEXTURE_2D, texture.gl_id);
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, tile.buffer_object.gl_id);
+
+  glTexSubImage2D(
+      GL_TEXTURE_2D, 0, 0, 0, texture.width, texture.height, GL_RGBA, GL_HALF_FLOAT, 0);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  glBindTexture(GL_TEXTURE_2D, 0);
+}
+
 void BlenderDisplayDriver::update_end()
 {
+  /* Unpack the PBO into the texture as soon as the new content is provided.
+   *
+   * This allows to ensure that the unpacking happens while resources like graphics interop (which
+   * lifetime is outside of control of the display driver) are still valid, as well as allows to
+   * move the tile from being current to finished immediately after this call.
+   *
+   * One concern with this approach is that if the update happens more often than drawing then
+   * doing the unpack here occupies GPU transfer for no good reason. However, the render scheduler
+   * takes care of ensuring updates don't happen that often. In regular applications redraw will
+   * happen much more often than this update. */
+  update_tile_texture_pixels(tiles_->current_tile);
+
  gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  glFlush();

@@ -376,7 +717,11 @@ void BlenderDisplayDriver::update_end()

 half4 *BlenderDisplayDriver::map_texture_buffer()
 {
-  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+  const uint pbo_gl_id = tiles_->current_tile.buffer_object.gl_id;
+
+  DCHECK_NE(pbo_gl_id, 0);
+
+  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_gl_id);

  half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
      glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
@@ -384,15 +729,6 @@ half4 *BlenderDisplayDriver::map_texture_buffer()
    LOG(ERROR) << "Error mapping BlenderDisplayDriver pixel buffer object.";
  }

-  if (texture_.need_clear) {
-    const int64_t texture_width = texture_.width;
-    const int64_t texture_height = texture_.height;
-    memset(reinterpret_cast<void *>(mapped_rgba_pixels),
-           0,
-           texture_width * texture_height * sizeof(half4));
-    texture_.need_clear = false;
-  }
-
  return mapped_rgba_pixels;
 }

@@ -411,12 +747,9 @@ BlenderDisplayDriver::GraphicsInterop BlenderDisplayDriver::graphics_interop_get
 {
  GraphicsInterop interop_dst;

-  interop_dst.buffer_width = texture_.buffer_width;
-  interop_dst.buffer_height = texture_.buffer_height;
-  interop_dst.opengl_pbo_id = texture_.gl_pbo_id;
-
-  interop_dst.need_clear = texture_.need_clear;
-  texture_.need_clear = false;
+  interop_dst.buffer_width = tiles_->current_tile.buffer_object.width;
+  interop_dst.buffer_height = tiles_->current_tile.buffer_object.height;
+  interop_dst.opengl_pbo_id = tiles_->current_tile.buffer_object.gl_id;

  return interop_dst;
 }
@@ -437,7 +770,7 @@ void BlenderDisplayDriver::graphics_interop_deactivate()

 void BlenderDisplayDriver::clear()
 {
-  texture_.need_clear = true;
+  need_clear_ = true;
 }

 void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y)
@@ -445,26 +778,155 @@ void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y)
  zoom_ = make_float2(zoom_x, zoom_y);
 }

+/* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
+ * This buffer is used to render texture in the viewport.
+ *
+ * NOTE: The buffer needs to be bound. */
+static void vertex_buffer_update(const DisplayDriver::Params &params)
+{
+  const int x = params.full_offset.x;
+  const int y = params.full_offset.y;
+
+  const int width = params.size.x;
+  const int height = params.size.y;
+
+  /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
+   * rendered. */
+  glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+  float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+  if (!vpointer) {
+    return;
+  }
+
+  vpointer[0] = 0.0f;
+  vpointer[1] = 0.0f;
+  vpointer[2] = x;
+  vpointer[3] = y;
+
+  vpointer[4] = 1.0f;
+  vpointer[5] = 0.0f;
+  vpointer[6] = x + width;
+  vpointer[7] = y;
+
+  vpointer[8] = 1.0f;
+  vpointer[9] = 1.0f;
+  vpointer[10] = x + width;
+  vpointer[11] = y + height;
+
+  vpointer[12] = 0.0f;
+  vpointer[13] = 1.0f;
+  vpointer[14] = x;
+  vpointer[15] = y + height;
+
+  glUnmapBuffer(GL_ARRAY_BUFFER);
+}
+
+static void draw_tile(const float2 &zoom,
+                      const int texcoord_attribute,
+                      const int position_attribute,
+                      const DrawTile &draw_tile)
+{
+  if (!draw_tile.ready_to_draw()) {
+    return;
+  }
+
+  const GLTexture &texture = draw_tile.texture;
+
+  DCHECK_NE(texture.gl_id, 0);
+  DCHECK_NE(draw_tile.gl_vertex_buffer, 0);
+
+  glBindBuffer(GL_ARRAY_BUFFER, draw_tile.gl_vertex_buffer);
+
+  /* Draw at the parameters for which the texture has been updated for. This allows to always draw
+   * texture during bordered-rendered camera view without flickering. The validness of the display
+   * parameters for a texture is guaranteed by the initial "clear" state which makes drawing to
+   * have an early output.
+   *
+   * Such approach can cause some extra "jelly" effect during panning, but it is not more jelly
+   * than overlay of selected objects. Also, it's possible to redraw texture at an intersection of
+   * the texture draw parameters and the latest updated draw parameters (although, complexity of
+   * doing it might not worth it. */
+  vertex_buffer_update(draw_tile.params);
+
+  glBindTexture(GL_TEXTURE_2D, texture.gl_id);
+
+  /* Trick to keep sharp rendering without jagged edges on all GPUs.
+   *
+   * The idea here is to enforce driver to use linear interpolation when the image is not zoomed
+   * in.
+   * For the render result with a resolution divider in effect we always use nearest interpolation.
+   *
+   * Use explicit MIN assignment to make sure the driver does not have an undefined behavior at
+   * the zoom level 1. The MAG filter is always NEAREST. */
+  const float zoomed_width = draw_tile.params.size.x * zoom.x;
+  const float zoomed_height = draw_tile.params.size.y * zoom.y;
+  if (texture.width != draw_tile.params.size.x || texture.height != draw_tile.params.size.y) {
+    /* Resolution divider is different from 1, force nearest interpolation. */
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  }
+  else if (zoomed_width - draw_tile.params.size.x > 0.5f ||
+           zoomed_height - draw_tile.params.size.y > 0.5f) {
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+  }
+  else {
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+  }
+
+  glVertexAttribPointer(
+      texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+  glVertexAttribPointer(position_attribute,
+                        2,
+                        GL_FLOAT,
+                        GL_FALSE,
+                        4 * sizeof(float),
+                        (const GLvoid *)(sizeof(float) * 2));
+
+  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+}
+
+void BlenderDisplayDriver::flush()
+{
+  /* This is called from the render thread that also calls update_begin/end, right before ending
+   * the render loop. We wait for any queued PBO and render commands to be done, before destroying
+   * the render thread and activating the context in the main thread to destroy resources.
+   *
+   * If we don't do this, the NVIDIA driver hangs for a few seconds for when ending 3D viewport
+   * rendering, for unknown reasons. This was found with NVIDIA driver version 470.73 and a Quadro
+   * RTX 6000 on Linux. */
+  if (!gl_context_enable()) {
+    return;
+  }
+
+  if (gl_upload_sync_) {
+    glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
+  }
+
+  if (gl_render_sync_) {
+    glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
+  }
+
+  gl_context_disable();
+}
+
 void BlenderDisplayDriver::draw(const Params &params)
 {
  /* See do_update_begin() for why no locking is required here. */
  const bool transparent = true;  // TODO(sergey): Derive this from Film.

-  if (!gl_draw_resources_ensure()) {
-    return;
-  }
-
  if (use_gl_context_) {
    gl_context_mutex_.lock();
  }

-  if (texture_.need_clear) {
+  if (need_clear_) {
    /* Texture is requested to be cleared and was not yet cleared.
     *
     * Do early return which should be equivalent of drawing all-zero texture.
     * Watch out for the lock though so that the clear happening during update is properly
     * synchronized here. */
-    gl_context_mutex_.unlock();
+    if (use_gl_context_) {
+      gl_context_mutex_.unlock();
+    }
    return;
  }

@@ -477,66 +939,37 @@ void BlenderDisplayDriver::draw(const Params &params)
    glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
  }

-  display_shader_->bind(params.full_size.x, params.full_size.y);
-
  glActiveTexture(GL_TEXTURE0);
-  glBindTexture(GL_TEXTURE_2D, texture_.gl_id);

-  /* Trick to keep sharp rendering without jagged edges on all GPUs.
-   *
-   * The idea here is to enforce driver to use linear interpolation when the image is not zoomed
-   * in.
-   * For the render result with a resolution divider in effect we always use nearest interpolation.
-   *
-   * Use explicit MIN assignment to make sure the driver does not have an undefined behavior at
-   * the zoom level 1. The MAG filter is always NEAREST. */
-  const float zoomed_width = params.size.x * zoom_.x;
-  const float zoomed_height = params.size.y * zoom_.y;
-  if (texture_.width != params.size.x || texture_.height != params.size.y) {
-    /* Resolution divider is different from 1, force nearest interpolation. */
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-  }
-  else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-  }
-  else {
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-  }
-
-  glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_);
-
-  texture_update_if_needed();
-  vertex_buffer_update(params);
-
-  /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? */
+  /* NOTE: The VAO is to be allocated on the drawing context as it is not shared across contexts.
+   * Simplest is to allocate it on every redraw so that it is possible to destroy it from a
+   * correct context. */
  GLuint vertex_array_object;
  glGenVertexArrays(1, &vertex_array_object);
  glBindVertexArray(vertex_array_object);

+  display_shader_->bind(params.full_size.x, params.full_size.y);
+
  const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location();
  const int position_attribute = display_shader_->get_position_attrib_location();

  glEnableVertexAttribArray(texcoord_attribute);
  glEnableVertexAttribArray(position_attribute);

-  glVertexAttribPointer(
-      texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
-  glVertexAttribPointer(position_attribute,
-                        2,
-                        GL_FLOAT,
-                        GL_FALSE,
-                        4 * sizeof(float),
-                        (const GLvoid *)(sizeof(float) * 2));
+  draw_tile(zoom_, texcoord_attribute, position_attribute, tiles_->current_tile.tile);

-  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-  glBindBuffer(GL_ARRAY_BUFFER, 0);
-  glBindTexture(GL_TEXTURE_2D, 0);
-
-  glDeleteVertexArrays(1, &vertex_array_object);
+  for (const DrawTile &tile : tiles_->finished_tiles.tiles) {
+    draw_tile(zoom_, texcoord_attribute, position_attribute, tile);
+  }

  display_shader_->unbind();

+  glBindTexture(GL_TEXTURE_2D, 0);
+  glBindVertexArray(0);
+  glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+  glDeleteVertexArrays(1, &vertex_array_object);
+
  if (transparent) {
    glDisable(GL_BLEND);
  }
@@ -544,6 +977,11 @@ void BlenderDisplayDriver::draw(const Params &params)
  gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  glFlush();

+  if (VLOG_IS_ON(5)) {
+    VLOG(5) << "Number of textures: " << GLTexture::num_used;
+    VLOG(5) << "Number of PBOs: " << GLPixelBufferObject::num_used;
+  }
+
  if (use_gl_context_) {
    gl_context_mutex_.unlock();
  }
@@ -618,154 +1056,16 @@ void BlenderDisplayDriver::gl_context_dispose()
  }
 }

-bool BlenderDisplayDriver::gl_draw_resources_ensure()
-{
-  if (!texture_.gl_id) {
-    /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it can
-     * can not continue. Note that this is not an unrecoverable error, so once the texture is known
-     * we will come back here and create all the GPU resources needed for draw. */
-    return false;
-  }
-
-  if (gl_draw_resource_creation_attempted_) {
-    return gl_draw_resources_created_;
-  }
-  gl_draw_resource_creation_attempted_ = true;
-
-  if (!vertex_buffer_) {
-    glGenBuffers(1, &vertex_buffer_);
-    if (!vertex_buffer_) {
-      LOG(ERROR) << "Error creating vertex buffer.";
-      return false;
-    }
-  }
-
-  gl_draw_resources_created_ = true;
-
-  return true;
-}
-
 void BlenderDisplayDriver::gl_resources_destroy()
 {
  gl_context_enable();

-  if (vertex_buffer_ != 0) {
-    glDeleteBuffers(1, &vertex_buffer_);
-  }
-
-  if (texture_.gl_pbo_id) {
-    glDeleteBuffers(1, &texture_.gl_pbo_id);
-    texture_.gl_pbo_id = 0;
-  }
-
-  if (texture_.gl_id) {
-    glDeleteTextures(1, &texture_.gl_id);
-    texture_.gl_id = 0;
-  }
+  tiles_->current_tile.gl_resources_destroy();
+  tiles_->finished_tiles.gl_resources_destroy_and_clear();

  gl_context_disable();

  gl_context_dispose();
 }

-bool BlenderDisplayDriver::gl_texture_resources_ensure()
-{
-  if (texture_.creation_attempted) {
-    return texture_.is_created;
-  }
-  texture_.creation_attempted = true;
-
-  DCHECK(!texture_.gl_id);
-  DCHECK(!texture_.gl_pbo_id);
-
-  /* Create texture. */
-  glGenTextures(1, &texture_.gl_id);
-  if (!texture_.gl_id) {
-    LOG(ERROR) << "Error creating texture.";
-    return false;
-  }
-
-  /* Configure the texture. */
-  glActiveTexture(GL_TEXTURE0);
-  glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-  glBindTexture(GL_TEXTURE_2D, 0);
-
-  /* Create PBO for the texture. */
-  glGenBuffers(1, &texture_.gl_pbo_id);
-  if (!texture_.gl_pbo_id) {
-    LOG(ERROR) << "Error creating texture pixel buffer object.";
-    return false;
-  }
-
-  /* Creation finished with a success. */
-  texture_.is_created = true;
-
-  return true;
-}
-
-void BlenderDisplayDriver::texture_update_if_needed()
-{
-  if (!texture_.need_update) {
-    return;
-  }
-
-  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
-  glTexSubImage2D(
-      GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0);
-  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-  texture_.need_update = false;
-}
-
-void BlenderDisplayDriver::vertex_buffer_update(const Params & /*params*/)
-{
-  /* Draw at the parameters for which the texture has been updated for. This allows to always draw
-   * texture during bordered-rendered camera view without flickering. The validness of the display
-   * parameters for a texture is guaranteed by the initial "clear" state which makes drawing to
-   * have an early output.
-   *
-   * Such approach can cause some extra "jelly" effect during panning, but it is not more jelly
-   * than overlay of selected objects. Also, it's possible to redraw texture at an intersection of
-   * the texture draw parameters and the latest updated draw parameters (although, complexity of
-   * doing it might not worth it. */
-  const int x = texture_.params.full_offset.x;
-  const int y = texture_.params.full_offset.y;
-
-  const int width = texture_.params.size.x;
-  const int height = texture_.params.size.y;
-
-  /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
-   * rendered. */
-  glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
-  float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
-  if (!vpointer) {
-    return;
-  }
-
-  vpointer[0] = 0.0f;
-  vpointer[1] = 0.0f;
-  vpointer[2] = x;
-  vpointer[3] = y;
-
-  vpointer[4] = 1.0f;
-  vpointer[5] = 0.0f;
-  vpointer[6] = x + width;
-  vpointer[7] = y;
-
-  vpointer[8] = 1.0f;
-  vpointer[9] = 1.0f;
-  vpointer[10] = x + width;
-  vpointer[11] = y + height;
-
-  vpointer[12] = 0.0f;
-  vpointer[13] = 1.0f;
-  vpointer[14] = x;
-  vpointer[15] = y + height;
-
-  glUnmapBuffer(GL_ARRAY_BUFFER);
-}
-
 CCL_NAMESPACE_END
--- a/intern/cycles/blender/display_driver.h
+++ b/intern/cycles/blender/display_driver.h
@@ -26,6 +26,7 @@

 #include "util/thread.h"
 #include "util/unique_ptr.h"
+#include "util/vector.h"

 CCL_NAMESPACE_BEGIN

@@ -112,6 +113,8 @@ class BlenderDisplayDriver : public DisplayDriver {
  void set_zoom(float zoom_x, float zoom_y);

 protected:
+  virtual void next_tile_begin() override;
+
  virtual bool update_begin(const Params &params, int texture_width, int texture_height) override;
  virtual void update_end() override;

@@ -122,33 +125,17 @@ class BlenderDisplayDriver : public DisplayDriver {

  virtual void draw(const Params &params) override;

+  virtual void flush() override;
+
  /* Helper function which allocates new GPU context. */
  void gl_context_create();
  bool gl_context_enable();
  void gl_context_disable();
  void gl_context_dispose();

-  /* Make sure texture is allocated and its initial configuration is performed. */
-  bool gl_texture_resources_ensure();
-
-  /* Ensure all runtime GPU resources needed for drawing are allocated.
-   * Returns true if all resources needed for drawing are available. */
-  bool gl_draw_resources_ensure();
-
  /* Destroy all GPU resources which are being used by this object. */
  void gl_resources_destroy();

-  /* Update GPU texture dimensions and content if needed (new pixel data was provided).
-   *
-   * NOTE: The texture needs to be bound. */
-  void texture_update_if_needed();
-
-  /* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
-   * This buffer is used to render texture in the viewport.
-   *
-   * NOTE: The buffer needs to be bound. */
-  void vertex_buffer_update(const Params &params);
-
  BL::RenderEngine b_engine_;

  /* OpenGL context which is used the render engine doesn't have its own. */
@@ -159,50 +146,14 @@ class BlenderDisplayDriver : public DisplayDriver {
  /* Mutex used to guard the `gl_context_`. */
  thread_mutex gl_context_mutex_;

-  /* Texture which contains pixels of the render result. */
-  struct {
-    /* Indicates whether texture creation was attempted and succeeded.
-     * Used to avoid multiple attempts of texture creation on GPU issues or GPU context
-     * misconfiguration. */
-    bool creation_attempted = false;
-    bool is_created = false;
-
-    /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write
-     * pixels to it.
-     *
-     * NOTE: Allocated on the engine's context. */
-    uint gl_id = 0;
-    uint gl_pbo_id = 0;
-
-    /* Is true when new data was written to the PBO, meaning, the texture might need to be resized
-     * and new data is to be uploaded to the GPU. */
-    bool need_update = false;
-
-    /* Content of the texture is to be filled with zeroes. */
-    std::atomic<bool> need_clear = true;
-
-    /* Dimensions of the texture in pixels. */
-    int width = 0;
-    int height = 0;
-
-    /* Dimensions of the underlying PBO. */
-    int buffer_width = 0;
-    int buffer_height = 0;
-
-    /* Display parameters the texture has been updated for. */
-    Params params;
-  } texture_;
+  /* Content of the display is to be filled with zeroes. */
+  std::atomic<bool> need_clear_ = true;

  unique_ptr<BlenderDisplayShader> display_shader_;

-  /* Special track of whether GPU resources were attempted to be created, to avoid attempts of
-   * their re-creation on failure on every redraw. */
-  bool gl_draw_resource_creation_attempted_ = false;
-  bool gl_draw_resources_created_ = false;
-
-  /* Vertex buffer which hold vertices of a triangle fan which is textures with the texture
-   * holding the render result. */
-  uint vertex_buffer_ = 0;
+  /* Opaque storage for an internal state and data for tiles. */
+  struct Tiles;
+  unique_ptr<Tiles> tiles_;

  void *gl_render_sync_ = nullptr;
  void *gl_upload_sync_ = nullptr;
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -1086,40 +1086,6 @@ static void create_subd_mesh(Scene *scene,

 /* Sync */

-/* Check whether some of "built-in" motion-related attributes are needed to be exported (includes
- * things like velocity from cache modifier, fluid simulation).
- *
- * NOTE: This code is run prior to object motion blur initialization. so can not access properties
- * set by `sync_object_motion_init()`. */
-static bool mesh_need_motion_attribute(BObjectInfo &b_ob_info, Scene *scene)
-{
-  const Scene::MotionType need_motion = scene->need_motion();
-  if (need_motion == Scene::MOTION_NONE) {
-    /* Simple case: neither motion pass nor motion blur is needed, no need in the motion related
-     * attributes. */
-    return false;
-  }
-
-  if (need_motion == Scene::MOTION_BLUR) {
-    /* A bit tricky and implicit case:
-     * - Motion blur is enabled in the scene, which implies specific number of time steps for
-     *   objects.
-     * - If the object has motion blur disabled on it, it will have 0 time steps.
-     * - Motion attribute expects non-zero time steps.
-     *
-     * Avoid adding motion attributes if the motion blur will enforce 0 motion steps. */
-    PointerRNA cobject = RNA_pointer_get(&b_ob_info.real_object.ptr, "cycles");
-    const bool use_motion = get_boolean(cobject, "use_motion_blur");
-    if (!use_motion) {
-      return false;
-    }
-  }
-
-  /* Motion pass which implies 3 motion steps, or motion blur which is not disabled on object
-   * level. */
-  return true;
-}
-
 void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, Mesh *mesh)
 {
  /* make a copy of the shaders as the caller in the main thread still need them for syncing the
@@ -1144,7 +1110,7 @@ void BlenderSync::sync_mesh(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, M

    if (b_mesh) {
      /* Motion blur attribute is relative to seconds, we need it relative to frames. */
-      const bool need_motion = mesh_need_motion_attribute(b_ob_info, scene);
+      const bool need_motion = object_need_motion_attribute(b_ob_info, scene);
      const float motion_scale = (need_motion) ?
                                     scene->motion_shutter_time() /
                                         (b_scene.render().fps() / b_scene.render().fps_base()) :
--- a/intern/cycles/blender/output_driver.cpp
+++ b/intern/cycles/blender/output_driver.cpp
@@ -120,7 +120,7 @@ void BlenderOutputDriver::write_render_tile(const Tile &tile)
    b_pass.rect(&pixels[0]);
  }

-  b_engine_.end_result(b_rr, true, false, true);
+  b_engine_.end_result(b_rr, false, false, true);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -37,12 +37,52 @@ static void fill_generic_attribute(BL::PointCloud &b_pointcloud,
  }
 }

-static void copy_attributes(PointCloud *pointcloud, BL::PointCloud b_pointcloud)
+static void attr_create_motion(PointCloud *pointcloud,
+                               BL::Attribute &b_attribute,
+                               const float motion_scale)
+{
+  if (!(b_attribute.domain() == BL::Attribute::domain_POINT) &&
+      (b_attribute.data_type() == BL::Attribute::data_type_FLOAT_VECTOR)) {
+    return;
+  }
+
+  BL::FloatVectorAttribute b_vector_attribute(b_attribute);
+  const int num_points = pointcloud->get_points().size();
+
+  /* Find or add attribute */
+  float3 *P = &pointcloud->get_points()[0];
+  Attribute *attr_mP = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+
+  if (!attr_mP) {
+    attr_mP = pointcloud->attributes.add(ATTR_STD_MOTION_VERTEX_POSITION);
+  }
+
+  /* Only export previous and next frame, we don't have any in between data. */
+  float motion_times[2] = {-1.0f, 1.0f};
+  for (int step = 0; step < 2; step++) {
+    const float relative_time = motion_times[step] * 0.5f * motion_scale;
+    float3 *mP = attr_mP->data_float3() + step * num_points;
+
+    for (int i = 0; i < num_points; i++) {
+      mP[i] = P[i] + get_float3(b_vector_attribute.data[i].vector()) * relative_time;
+    }
+  }
+}
+
+static void copy_attributes(PointCloud *pointcloud,
+                            BL::PointCloud b_pointcloud,
+                            const bool need_motion,
+                            const float motion_scale)
 {
  AttributeSet &attributes = pointcloud->attributes;
+  static const ustring u_velocity("velocity");
  for (BL::Attribute &b_attribute : b_pointcloud.attributes) {
    const ustring name{b_attribute.name().c_str()};

+    if (need_motion && name == u_velocity) {
+      attr_create_motion(pointcloud, b_attribute, motion_scale);
+    }
+
    if (attributes.find(name)) {
      continue;
    }
@@ -111,7 +151,11 @@ static void copy_attributes(PointCloud *pointcloud, BL::PointCloud b_pointcloud)
  }
 }

-static void export_pointcloud(Scene *scene, PointCloud *pointcloud, BL::PointCloud b_pointcloud)
+static void export_pointcloud(Scene *scene,
+                              PointCloud *pointcloud,
+                              BL::PointCloud b_pointcloud,
+                              const bool need_motion,
+                              const float motion_scale)
 {
  /* TODO: optimize so we can straight memcpy arrays from Blender? */

@@ -141,7 +185,7 @@ static void export_pointcloud(Scene *scene, PointCloud *pointcloud, BL::PointClo
  }

  /* Export attributes */
-  copy_attributes(pointcloud, b_pointcloud);
+  copy_attributes(pointcloud, b_pointcloud, need_motion, motion_scale);
 }

 static void export_pointcloud_motion(PointCloud *pointcloud,
@@ -193,7 +237,7 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
  }

  /* Export attributes */
-  copy_attributes(pointcloud, b_pointcloud);
+  copy_attributes(pointcloud, b_pointcloud, false, 0.0f);
 }

 void BlenderSync::sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info)
@@ -207,7 +251,13 @@ void BlenderSync::sync_pointcloud(PointCloud *pointcloud, BObjectInfo &b_ob_info

  /* TODO: add option to filter out points in the view layer. */
  BL::PointCloud b_pointcloud(b_ob_info.object_data);
-  export_pointcloud(scene, &new_pointcloud, b_pointcloud);
+  /* Motion blur attribute is relative to seconds, we need it relative to frames. */
+  const bool need_motion = object_need_motion_attribute(b_ob_info, scene);
+  const float motion_scale = (need_motion) ?
+                                 scene->motion_shutter_time() /
+                                     (b_scene.render().fps() / b_scene.render().fps_base()) :
+                                 0.0f;
+  export_pointcloud(scene, &new_pointcloud, b_pointcloud, need_motion, motion_scale);

  /* update original sockets */
  for (const SocketType &socket : new_pointcloud.type->inputs) {
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -138,20 +138,18 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)

 static PyObject *init_func(PyObject * /*self*/, PyObject *args)
 {
-  PyObject *path, *user_path, *temp_path;
+  PyObject *path, *user_path;
  int headless;

-  if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) {
+  if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
    return nullptr;
  }

-  PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr;
+  PyObject *path_coerce = nullptr, *user_path_coerce = nullptr;
  path_init(PyC_UnicodeAsByte(path, &path_coerce),
-            PyC_UnicodeAsByte(user_path, &user_path_coerce),
-            PyC_UnicodeAsByte(temp_path, &temp_path_coerce));
+            PyC_UnicodeAsByte(user_path, &user_path_coerce));
  Py_XDECREF(path_coerce);
  Py_XDECREF(user_path_coerce);
-  Py_XDECREF(temp_path_coerce);

  BlenderSession::headless = headless;

@@ -735,27 +733,20 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat

 static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
 {
-#if 1
-  (void)args;
-  (void)keywords;
-#else
  static const char *keyword_list[] = {
-      "preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL};
+      "preferences", "scene", "view_layer", "input", "output", NULL};
  PyObject *pypreferences, *pyscene, *pyviewlayer;
  PyObject *pyinput, *pyoutput = NULL;
-  int tile_size = 0, samples = 0;

  if (!PyArg_ParseTupleAndKeywords(args,
                                   keywords,
-                                   "OOOO|Oii",
+                                   "OOOO|O",
                                   (char **)keyword_list,
                                   &pypreferences,
                                   &pyscene,
                                   &pyviewlayer,
                                   &pyinput,
-                                   &pyoutput,
-                                   &tile_size,
-                                   &samples)) {
+                                   &pyoutput)) {
    return NULL;
  }

@@ -777,14 +768,10 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
                     &RNA_ViewLayer,
                     PyLong_AsVoidPtr(pyviewlayer),
                     &viewlayerptr);
-  PointerRNA cviewlayer = RNA_pointer_get(&viewlayerptr, "cycles");
+  BL::ViewLayer b_view_layer(viewlayerptr);

-  DenoiseParams params;
-  params.radius = get_int(cviewlayer, "denoising_radius");
-  params.strength = get_float(cviewlayer, "denoising_strength");
-  params.feature_strength = get_float(cviewlayer, "denoising_feature_strength");
-  params.relative_pca = get_boolean(cviewlayer, "denoising_relative_pca");
-  params.neighbor_frames = get_int(cviewlayer, "denoising_neighbor_frames");
+  DenoiseParams params = BlenderSync::get_denoise_params(b_scene, b_view_layer, true);
+  params.use = true;

  /* Parse file paths list. */
  vector<string> input, output;
@@ -812,24 +799,15 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
  }

  /* Create denoiser. */
-  DenoiserPipeline denoiser(device);
-  denoiser.params = params;
+  DenoiserPipeline denoiser(device, params);
  denoiser.input = input;
  denoiser.output = output;

-  if (tile_size > 0) {
-    denoiser.tile_size = make_int2(tile_size, tile_size);
-  }
-  if (samples > 0) {
-    denoiser.samples_override = samples;
-  }
-
  /* Run denoiser. */
  if (!denoiser.run()) {
    PyErr_SetString(PyExc_ValueError, denoiser.error.c_str());
    return NULL;
  }
-#endif

  Py_RETURN_NONE;
 }
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -502,10 +502,15 @@ void BlenderSession::render_frame_finish()
    path_remove(filename);
  }

-  /* Clear driver. */
+  /* Clear output driver. */
  session->set_output_driver(nullptr);
  session->full_buffer_written_cb = function_null;

+  /* The display driver holds OpenGL resources which belong to an OpenGL context held by the render
+   * engine on Blender side. Force destruction of those resources. */
+  display_driver_ = nullptr;
+  session->set_display_driver(nullptr);
+
  /* All the files are handled.
   * Clear the list so that this session can be re-used by Persistent Data. */
  full_buffer_files_.clear();
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -832,6 +832,14 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
  SessionParams params;
  PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");

+  if (background && !b_engine.is_preview()) {
+    /* Viewport and preview renders do not require temp directory and do request session
+     * parameters more often than the background render.
+     * Optimize RNA-C++ usage and memory allocation a bit by saving string access which we know is
+     * not needed for viewport render. */
+    params.temp_dir = b_engine.temporary_directory();
+  }
+
  /* feature set */
  params.experimental = (get_enum(cscene, "feature_set") != 0);

--- a/intern/cycles/blender/sync.h
+++ b/intern/cycles/blender/sync.h
@@ -105,11 +105,11 @@ class BlenderSync {
  static BufferParams get_buffer_params(
      BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height);

- private:
  static DenoiseParams get_denoise_params(BL::Scene &b_scene,
                                          BL::ViewLayer &b_view_layer,
                                          bool background);

+ private:
  /* sync */
  void sync_lights(BL::Depsgraph &b_depsgraph, bool update_all);
  void sync_materials(BL::Depsgraph &b_depsgraph, bool update_all);
--- a/intern/cycles/blender/util.h
+++ b/intern/cycles/blender/util.h
@@ -18,6 +18,7 @@
 #define __BLENDER_UTIL_H__

 #include "scene/mesh.h"
+#include "scene/scene.h"

 #include "util/algorithm.h"
 #include "util/array.h"
@@ -670,6 +671,40 @@ static inline uint object_ray_visibility(BL::Object &b_ob)
  return flag;
 }

+/* Check whether some of "built-in" motion-related attributes are needed to be exported (includes
+ * things like velocity from cache modifier, fluid simulation).
+ *
+ * NOTE: This code is run prior to object motion blur initialization. so can not access properties
+ * set by `sync_object_motion_init()`. */
+static inline bool object_need_motion_attribute(BObjectInfo &b_ob_info, Scene *scene)
+{
+  const Scene::MotionType need_motion = scene->need_motion();
+  if (need_motion == Scene::MOTION_NONE) {
+    /* Simple case: neither motion pass nor motion blur is needed, no need in the motion related
+     * attributes. */
+    return false;
+  }
+
+  if (need_motion == Scene::MOTION_BLUR) {
+    /* A bit tricky and implicit case:
+     * - Motion blur is enabled in the scene, which implies specific number of time steps for
+     *   objects.
+     * - If the object has motion blur disabled on it, it will have 0 time steps.
+     * - Motion attribute expects non-zero time steps.
+     *
+     * Avoid adding motion attributes if the motion blur will enforce 0 motion steps. */
+    PointerRNA cobject = RNA_pointer_get(&b_ob_info.real_object.ptr, "cycles");
+    const bool use_motion = get_boolean(cobject, "use_motion_blur");
+    if (!use_motion) {
+      return false;
+    }
+  }
+
+  /* Motion pass which implies 3 motion steps, or motion blur which is not disabled on object
+   * level. */
+  return true;
+}
+
 class EdgeMap {
 public:
  EdgeMap()
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -168,12 +168,6 @@ macro(cycles_target_link_libraries target)
    target_link_libraries(${target} extern_hipew)
  endif()

-  if(CYCLES_STANDALONE_REPOSITORY)
-    target_link_libraries(${target} extern_numaapi)
-  else()
-    target_link_libraries(${target} bf_intern_numaapi)
-  endif()
-
  if(UNIX AND NOT APPLE)
    if(CYCLES_STANDALONE_REPOSITORY)
      target_link_libraries(${target} extern_libc_compat)
--- a/intern/cycles/device/cuda/graphics_interop.cpp
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -45,8 +45,10 @@ void CUDADeviceGraphicsInterop::set_display_interop(

  need_clear_ = display_interop.need_clear;

-  if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
-    return;
+  if (!display_interop.need_recreate) {
+    if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+      return;
+    }
  }

  CUDAContextScope scope(device_);
--- a/intern/cycles/device/denoise.cpp
+++ b/intern/cycles/device/denoise.cpp
@@ -76,6 +76,8 @@ NODE_DEFINE(DenoiseParams)
  SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
  SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);

+  SOCKET_BOOLEAN(temporally_stable, "Temporally Stable", false);
+
  SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);

  return type;
--- a/intern/cycles/device/denoise.h
+++ b/intern/cycles/device/denoise.h
@@ -72,6 +72,9 @@ class DenoiseParams : public Node {
  bool use_pass_albedo = true;
  bool use_pass_normal = true;

+  /* Configure the denoiser to use motion vectors, previous image and a temporally stable model. */
+  bool temporally_stable = false;
+
  DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;

  static const NodeEnum *get_type_enum();
@@ -83,7 +86,8 @@ class DenoiseParams : public Node {
  {
    return !(use == other.use && type == other.type && start_sample == other.start_sample &&
             use_pass_albedo == other.use_pass_albedo &&
-             use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+             use_pass_normal == other.use_pass_normal &&
+             temporally_stable == other.temporally_stable && prefilter == other.prefilter);
  }
 };

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -37,6 +37,7 @@
 #include "util/math.h"
 #include "util/string.h"
 #include "util/system.h"
+#include "util/task.h"
 #include "util/time.h"
 #include "util/types.h"
 #include "util/vector.h"
@@ -333,7 +334,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
    /* Ensure CPU device does not slow down GPU. */
    if (device.type == DEVICE_CPU && subdevices.size() > 1) {
      if (background) {
-        int orig_cpu_threads = (threads) ? threads : system_cpu_thread_count();
+        int orig_cpu_threads = (threads) ? threads : TaskScheduler::num_threads();
        int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0);

        VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -566,6 +566,19 @@ class OptiXDevice::DenoiseContext {
      }
    }

+    if (denoise_params.temporally_stable) {
+      prev_output.device_pointer = render_buffers->buffer.device_pointer;
+
+      prev_output.offset = buffer_params.get_pass_offset(PASS_DENOISING_PREVIOUS);
+
+      prev_output.stride = buffer_params.stride;
+      prev_output.pass_stride = buffer_params.pass_stride;
+
+      num_input_passes += 1;
+      use_pass_flow = true;
+      pass_motion = buffer_params.get_pass_offset(PASS_MOTION);
+    }
+
    use_guiding_passes = (num_input_passes - 1) > 0;

    if (use_guiding_passes) {
@@ -574,6 +587,7 @@ class OptiXDevice::DenoiseContext {

        guiding_params.pass_albedo = pass_denoising_albedo;
        guiding_params.pass_normal = pass_denoising_normal;
+        guiding_params.pass_flow = pass_motion;

        guiding_params.stride = buffer_params.stride;
        guiding_params.pass_stride = buffer_params.pass_stride;
@@ -588,6 +602,10 @@ class OptiXDevice::DenoiseContext {
          guiding_params.pass_normal = guiding_params.pass_stride;
          guiding_params.pass_stride += 3;
        }
+        if (use_pass_flow) {
+          guiding_params.pass_flow = guiding_params.pass_stride;
+          guiding_params.pass_stride += 2;
+        }

        guiding_params.stride = buffer_params.width;

@@ -605,6 +623,16 @@ class OptiXDevice::DenoiseContext {
  RenderBuffers *render_buffers = nullptr;
  const BufferParams &buffer_params;

+  /* Previous output. */
+  struct {
+    device_ptr device_pointer = 0;
+
+    int offset = PASS_UNUSED;
+
+    int stride = -1;
+    int pass_stride = -1;
+  } prev_output;
+
  /* Device-side storage of the guiding passes. */
  device_only_memory<float> guiding_buffer;

@@ -614,6 +642,7 @@ class OptiXDevice::DenoiseContext {
    /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
    int pass_albedo = PASS_UNUSED;
    int pass_normal = PASS_UNUSED;
+    int pass_flow = PASS_UNUSED;

    int stride = -1;
    int pass_stride = -1;
@@ -624,6 +653,7 @@ class OptiXDevice::DenoiseContext {
  bool use_guiding_passes = false;
  bool use_pass_albedo = false;
  bool use_pass_normal = false;
+  bool use_pass_flow = false;

  int num_samples = 0;

@@ -632,6 +662,7 @@ class OptiXDevice::DenoiseContext {
  /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
  int pass_denoising_albedo = PASS_UNUSED;
  int pass_denoising_normal = PASS_UNUSED;
+  int pass_motion = PASS_UNUSED;

  /* For passes which don't need albedo channel for denoising we replace the actual albedo with
   * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
@@ -702,6 +733,7 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
                             &context.guiding_params.pass_stride,
                             &context.guiding_params.pass_albedo,
                             &context.guiding_params.pass_normal,
+                             &context.guiding_params.pass_flow,
                             &context.render_buffers->buffer.device_pointer,
                             &buffer_params.offset,
                             &buffer_params.stride,
@@ -709,6 +741,7 @@ bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
                             &context.pass_sample_count,
                             &context.pass_denoising_albedo,
                             &context.pass_denoising_normal,
+                             &context.pass_motion,
                             &buffer_params.full_x,
                             &buffer_params.full_y,
                             &buffer_params.width,
@@ -881,7 +914,8 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
 {
  const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
                                 (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
-                                 (denoiser_.use_pass_normal != context.use_pass_normal);
+                                 (denoiser_.use_pass_normal != context.use_pass_normal) ||
+                                 (denoiser_.use_pass_flow != context.use_pass_flow);
  if (!recreate_denoiser) {
    return true;
  }
@@ -895,8 +929,14 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
  OptixDenoiserOptions denoiser_options = {};
  denoiser_options.guideAlbedo = context.use_pass_albedo;
  denoiser_options.guideNormal = context.use_pass_normal;
+
+  OptixDenoiserModelKind model = OPTIX_DENOISER_MODEL_KIND_HDR;
+  if (context.use_pass_flow) {
+    model = OPTIX_DENOISER_MODEL_KIND_TEMPORAL;
+  }
+
  const OptixResult result = optixDenoiserCreate(
-      this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+      this->context, model, &denoiser_options, &denoiser_.optix_denoiser);

  if (result != OPTIX_SUCCESS) {
    set_error("Failed to create OptiX denoiser");
@@ -906,6 +946,7 @@ bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
  /* OptiX denoiser handle was created with the requested number of input passes. */
  denoiser_.use_pass_albedo = context.use_pass_albedo;
  denoiser_.use_pass_normal = context.use_pass_normal;
+  denoiser_.use_pass_flow = context.use_pass_flow;

  /* OptiX denoiser has been created, but it needs configuration. */
  denoiser_.is_configured = false;
@@ -965,8 +1006,10 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
  OptixImage2D color_layer = {0};
  OptixImage2D albedo_layer = {0};
  OptixImage2D normal_layer = {0};
+  OptixImage2D flow_layer = {0};

  OptixImage2D output_layer = {0};
+  OptixImage2D prev_output_layer = {0};

  /* Color pass. */
  {
@@ -982,6 +1025,19 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
    color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
  }

+  /* Previous output. */
+  if (context.prev_output.offset != PASS_UNUSED) {
+    const int64_t pass_stride_in_bytes = context.prev_output.pass_stride * sizeof(float);
+
+    prev_output_layer.data = context.prev_output.device_pointer +
+                             context.prev_output.offset * sizeof(float);
+    prev_output_layer.width = width;
+    prev_output_layer.height = height;
+    prev_output_layer.rowStrideInBytes = pass_stride_in_bytes * context.prev_output.stride;
+    prev_output_layer.pixelStrideInBytes = pass_stride_in_bytes;
+    prev_output_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+  }
+
  /* Optional albedo and color passes. */
  if (context.num_input_passes > 1) {
    const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
@@ -1005,21 +1061,32 @@ bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
      normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
      normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
    }
+
+    if (context.use_pass_flow) {
+      flow_layer.data = d_guiding_buffer + context.guiding_params.pass_flow * sizeof(float);
+      flow_layer.width = width;
+      flow_layer.height = height;
+      flow_layer.rowStrideInBytes = row_stride_in_bytes;
+      flow_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      flow_layer.format = OPTIX_PIXEL_FORMAT_FLOAT2;
+    }
  }

  /* Denoise in-place of the noisy input in the render buffers. */
  output_layer = color_layer;

-  /* Finally run denoising. */
-  OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
-
-  OptixDenoiserLayer image_layers = {};
-  image_layers.input = color_layer;
-  image_layers.output = output_layer;
-
  OptixDenoiserGuideLayer guide_layers = {};
  guide_layers.albedo = albedo_layer;
  guide_layers.normal = normal_layer;
+  guide_layers.flow = flow_layer;
+
+  OptixDenoiserLayer image_layers = {};
+  image_layers.input = color_layer;
+  image_layers.previousOutput = prev_output_layer;
+  image_layers.output = output_layer;
+
+  /* Finally run denoising. */
+  OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */

  optix_assert(optixUtilDenoiserInvokeTiled(denoiser_.optix_denoiser,
                                            denoiser_.queue.stream(),
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -104,6 +104,7 @@ class OptiXDevice : public CUDADevice {

    bool use_pass_albedo = false;
    bool use_pass_normal = false;
+    bool use_pass_flow = false;
  };
  Denoiser denoiser_;

--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -19,6 +19,7 @@
 #include "device/kernel.h"

 #include "device/graphics_interop.h"
+#include "util/debug.h"
 #include "util/log.h"
 #include "util/map.h"
 #include "util/string.h"
@@ -42,7 +43,7 @@ struct DeviceKernelArguments {
    KERNEL_FILM_CONVERT,
  };

-  static const int MAX_ARGS = 16;
+  static const int MAX_ARGS = 18;
  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
@@ -85,6 +86,8 @@ struct DeviceKernelArguments {
  }
  void add(const Type type, const void *value, size_t size)
  {
+    assert(count < MAX_ARGS);
+
    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -115,7 +115,9 @@ bool PathTrace::ready_to_reset()
  return false;
 }

-void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+void PathTrace::reset(const BufferParams &full_params,
+                      const BufferParams &big_tile_params,
+                      const bool reset_rendering)
 {
  if (big_tile_params_.modified(big_tile_params)) {
    big_tile_params_ = big_tile_params;
@@ -128,7 +130,7 @@ void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_t
   * It is requires to inform about reset whenever it happens, so that the redraw state tracking is
   * properly updated. */
  if (display_) {
-    display_->reset(full_params);
+    display_->reset(big_tile_params, reset_rendering);
  }

  render_state_.has_denoised_result = false;
@@ -594,6 +596,15 @@ void PathTrace::draw()
  did_draw_after_reset_ |= display_->draw();
 }

+void PathTrace::flush_display()
+{
+  if (!display_) {
+    return;
+  }
+
+  display_->flush();
+}
+
 void PathTrace::update_display(const RenderWork &render_work)
 {
  if (!render_work.display.update) {
@@ -622,9 +633,8 @@ void PathTrace::update_display(const RenderWork &render_work)
  if (display_) {
    VLOG(3) << "Perform copy to GPUDisplay work.";

-    const int resolution_divider = render_work.resolution_divider;
-    const int texture_width = max(1, full_params_.width / resolution_divider);
-    const int texture_height = max(1, full_params_.height / resolution_divider);
+    const int texture_width = render_state_.effective_big_tile_params.window_width;
+    const int texture_height = render_state_.effective_big_tile_params.window_height;
    if (!display_->update_begin(texture_width, texture_height)) {
      LOG(ERROR) << "Error beginning GPUDisplay update.";
      return;
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -72,7 +72,9 @@ class PathTrace {
   * render result. */
  bool ready_to_reset();

-  void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+  void reset(const BufferParams &full_params,
+             const BufferParams &big_tile_params,
+             bool reset_rendering);

  void device_free();

@@ -112,6 +114,9 @@ class PathTrace {
  /* Perform drawing of the current state of the DisplayDriver. */
  void draw();

+  /* Flush outstanding display commands before ending the render loop. */
+  void flush_display();
+
  /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled.
   * Used in cases like reset of render session.
   *
--- a/intern/cycles/integrator/path_trace_display.cpp
+++ b/intern/cycles/integrator/path_trace_display.cpp
@@ -26,15 +26,20 @@ PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(m
 {
 }

-void PathTraceDisplay::reset(const BufferParams &buffer_params)
+void PathTraceDisplay::reset(const BufferParams &buffer_params, const bool reset_rendering)
 {
  thread_scoped_lock lock(mutex_);

-  params_.full_offset = make_int2(buffer_params.full_x, buffer_params.full_y);
+  params_.full_offset = make_int2(buffer_params.full_x + buffer_params.window_x,
+                                  buffer_params.full_y + buffer_params.window_y);
  params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
-  params_.size = make_int2(buffer_params.width, buffer_params.height);
+  params_.size = make_int2(buffer_params.window_width, buffer_params.window_height);

  texture_state_.is_outdated = true;
+
+  if (!reset_rendering) {
+    driver_->next_tile_begin();
+  }
 }

 void PathTraceDisplay::mark_texture_updated()
@@ -248,4 +253,9 @@ bool PathTraceDisplay::draw()
  return !is_outdated;
 }

+void PathTraceDisplay::flush()
+{
+  driver_->flush();
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/integrator/path_trace_display.h
+++ b/intern/cycles/integrator/path_trace_display.h
@@ -38,14 +38,17 @@ class BufferParams;

 class PathTraceDisplay {
 public:
-  PathTraceDisplay(unique_ptr<DisplayDriver> driver);
+  explicit PathTraceDisplay(unique_ptr<DisplayDriver> driver);
  virtual ~PathTraceDisplay() = default;

  /* Reset the display for the new state of render session. Is called whenever session is reset,
   * which happens on changes like viewport navigation or viewport dimension change.
   *
-   * This call will configure parameters for a changed buffer and reset the texture state. */
-  void reset(const BufferParams &buffer_params);
+   * This call will configure parameters for a changed buffer and reset the texture state.
+   *
+   * When the `reset_rendering` a complete display reset happens. When it is false reset happens
+   * for a new state of the buffer parameters which is assumed to correspond to the next tile. */
+  void reset(const BufferParams &buffer_params, bool reset_rendering);

  /* --------------------------------------------------------------------
   * Update procedure.
@@ -151,6 +154,9 @@ class PathTraceDisplay {
   * Returns true if this call did draw an updated state of the texture. */
  bool draw();

+  /* Flush outstanding display commands before ending the render loop. */
+  void flush();
+
 private:
  /* Display driver implemented by the host application. */
  unique_ptr<DisplayDriver> driver_;
--- a/intern/cycles/integrator/path_trace_work.cpp
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -194,10 +194,10 @@ PassAccessor::Destination PathTraceWork::get_display_destination_template(
  PassAccessor::Destination destination(film_->get_display_pass());

  const int2 display_texture_size = display->get_texture_size();
-  const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x +
-                        effective_buffer_params_.window_x;
-  const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y +
-                        effective_buffer_params_.window_y;
+  const int texture_x = effective_buffer_params_.full_x - effective_big_tile_params_.full_x +
+                        effective_buffer_params_.window_x - effective_big_tile_params_.window_x;
+  const int texture_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y +
+                        effective_buffer_params_.window_y - effective_big_tile_params_.window_y;

  destination.offset = texture_y * display_texture_size.x + texture_x;
  destination.stride = display_texture_size.x;
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -875,8 +875,10 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
  const int final_width = buffers_->params.window_width;
  const int final_height = buffers_->params.window_height;

-  const int texture_x = full_x - effective_full_params_.full_x + effective_buffer_params_.window_x;
-  const int texture_y = full_y - effective_full_params_.full_y + effective_buffer_params_.window_y;
+  const int texture_x = full_x - effective_big_tile_params_.full_x +
+                        effective_buffer_params_.window_x - effective_big_tile_params_.window_x;
+  const int texture_y = full_y - effective_big_tile_params_.full_y +
+                        effective_buffer_params_.window_y - effective_big_tile_params_.window_y;

  /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
   *
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -406,9 +406,6 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
    any_scheduled = true;
  }

-  /* Force update. */
-  any_scheduled = true;
-
  if (any_scheduled) {
    render_work->display.update = true;
  }
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -283,7 +283,7 @@ class RenderScheduler {
  /* Check whether timing report about the given work need to reset accumulated average time. */
  bool work_report_reset_average(const RenderWork &render_work);

-  /* CHeck whether render time limit has been reached (or exceeded), and if so store related
+  /* Check whether render time limit has been reached (or exceeded), and if so store related
   * information in the state so that rendering is considered finished, and is possible to report
   * average render time information. */
  void check_time_limit_reached();
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -756,6 +756,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int guiding_pass_normal,
+                             int guiding_pass_flow,
                             ccl_global const float *render_buffer,
                             int render_offset,
                             int render_stride,
@@ -763,6 +764,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
                             int render_pass_sample_count,
                             int render_pass_denoising_albedo,
                             int render_pass_denoising_normal,
+                             int render_pass_motion,
                             int full_x,
                             int full_y,
                             int width,
@@ -814,6 +816,17 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    normal_out[1] = normal_in[1] * pixel_scale;
    normal_out[2] = normal_in[2] * pixel_scale;
  }
+
+  /* Flow pass. */
+  if (guiding_pass_flow != PASS_UNUSED) {
+    kernel_assert(render_pass_motion != PASS_UNUSED);
+
+    ccl_global const float *motion_in = buffer + render_pass_motion;
+    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
+
+    flow_out[0] = -motion_in[0] * pixel_scale;
+    flow_out[1] = -motion_in[1] * pixel_scale;
+  }
 }

 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -899,7 +912,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  else {
    /* Assigning to zero since this is a default alpha value for 3-component passes, and it
     * is an opaque pixel for 4 component passes. */
-
    denoised_pixel[3] = 0;
  }
 }
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -98,8 +98,12 @@ using namespace metal::raytracing;
 #define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
 #define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
 #define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
-#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
-#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
+#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
+#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
+#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)

 /* Generate a struct containing the entry-point parameters and a "run"
 * method which can access them implicitly via this-> */
--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -116,6 +116,52 @@ ccl_device_inline void motion_triangle_vertices(
  verts[2] = (1.0f - t) * verts[2] + t * next_verts[2];
 }

+ccl_device_inline void motion_triangle_vertices_and_normals(
+    KernelGlobals kg, int object, int prim, float time, float3 verts[3], float3 normals[3])
+{
+  /* get motion info */
+  int numsteps, numverts;
+  object_motion_info(kg, object, &numsteps, &numverts, NULL);
+
+  /* Figure out which steps we need to fetch and their interpolation factor. */
+  int maxstep = numsteps * 2;
+  int step = min((int)(time * maxstep), maxstep - 1);
+  float t = time * maxstep - step;
+
+  /* Find attribute. */
+  int offset = intersection_find_attribute(kg, object, ATTR_STD_MOTION_VERTEX_POSITION);
+  kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+  /* Fetch vertex coordinates. */
+  float3 next_verts[3];
+  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
+
+  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
+  motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
+
+  /* Interpolate between steps. */
+  verts[0] = (1.0f - t) * verts[0] + t * next_verts[0];
+  verts[1] = (1.0f - t) * verts[1] + t * next_verts[1];
+  verts[2] = (1.0f - t) * verts[2] + t * next_verts[2];
+
+  /* Compute smooth normal. */
+
+  /* Find attribute. */
+  offset = intersection_find_attribute(kg, object, ATTR_STD_MOTION_VERTEX_NORMAL);
+  kernel_assert(offset != ATTR_STD_NOT_FOUND);
+
+  /* Fetch vertex coordinates. */
+  float3 next_normals[3];
+  motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
+  motion_triangle_normals_for_step(
+      kg, tri_vindex, offset, numverts, numsteps, step + 1, next_normals);
+
+  /* Interpolate between steps. */
+  normals[0] = (1.0f - t) * normals[0] + t * next_normals[0];
+  normals[1] = (1.0f - t) * normals[1] + t * next_normals[1];
+  normals[2] = (1.0f - t) * normals[2] + t * next_normals[2];
+}
+
 ccl_device_inline float3 motion_triangle_smooth_normal(
    KernelGlobals kg, float3 Ng, int object, int prim, float u, float v, float time)
 {
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -141,14 +141,23 @@ ccl_device_inline float3 shadow_ray_smooth_surface_offset(
    KernelGlobals kg, ccl_private const ShaderData *ccl_restrict sd, float3 Ng)
 {
  float3 V[3], N[3];
-  triangle_vertices_and_normals(kg, sd->prim, V, N);
+
+  if (sd->type == PRIMITIVE_MOTION_TRIANGLE) {
+    motion_triangle_vertices_and_normals(kg, sd->object, sd->prim, sd->time, V, N);
+  }
+  else {
+    kernel_assert(sd->type == PRIMITIVE_TRIANGLE);
+    triangle_vertices_and_normals(kg, sd->prim, V, N);
+  }

  const float u = sd->u, v = sd->v;
  const float w = 1 - u - v;
  float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
  float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */

-  object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+  }

  /* Parabolic approximation */
  float a = dot(N[2] - N[0], V[0] - V[2]);
--- a/intern/cycles/kernel/svm/magic.h
+++ b/intern/cycles/kernel/svm/magic.h
@@ -25,7 +25,7 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, float scale, int n, float dis
  /*
   * Prevent NaNs due to input p
   * Sin and Cosine are periodic about [0 2*PI) so the following
-   * will yeild a more accurate result. As it stops the input values
+   * will yield a more accurate result. As it stops the input values
   * going out of range for floats which caused a NaN. The
   * calculation of (px + py + pz)*5 can cause an Inf when one or more
   * values are very large the cos or sin of this results in a NaN
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -124,7 +124,7 @@ typedef enum ShaderNodeType {
  NODE_AOV_VALUE,
  NODE_FLOAT_CURVE,
  /* NOTE: for best OpenCL performance, item definition in the enum must
-   * match the switch case order in svm.h. */
+   * match the switch case order in `svm.h`. */
 } ShaderNodeType;

 typedef enum NodeAttributeOutputType {
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -388,6 +388,7 @@ typedef enum PassType {
  PASS_DENOISING_NORMAL,
  PASS_DENOISING_ALBEDO,
  PASS_DENOISING_DEPTH,
+  PASS_DENOISING_PREVIOUS,

  /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by
   * any other object. The pass accessor will divide the combined pass by the shadow catcher. The
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -1002,10 +1002,10 @@ void GeometryManager::device_update_attributes(Device *device,

  /* After mesh attributes and patch tables have been copied to device memory,
   * we need to update offsets in the objects. */
-  scene->object_manager->device_update_mesh_offsets(device, dscene, scene);
+  scene->object_manager->device_update_geom_offsets(device, dscene, scene);
 }

-void GeometryManager::mesh_calc_offset(Scene *scene, BVHLayout bvh_layout)
+void GeometryManager::geom_calc_offset(Scene *scene, BVHLayout bvh_layout)
 {
  size_t vert_size = 0;
  size_t tri_size = 0;
@@ -1922,7 +1922,7 @@ void GeometryManager::device_update(Device *device,

  const BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
                                                          device->get_bvh_layout_mask());
-  mesh_calc_offset(scene, bvh_layout);
+  geom_calc_offset(scene, bvh_layout);
  if (true_displacement_used || curve_shadow_transparency_used) {
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
--- a/intern/cycles/scene/geometry.h
+++ b/intern/cycles/scene/geometry.h
@@ -242,7 +242,7 @@ class GeometryManager {
                             vector<AttributeRequestSet> &object_attributes);

  /* Compute verts/triangles/curves offsets in global arrays. */
-  void mesh_calc_offset(Scene *scene, BVHLayout bvh_layout);
+  void geom_calc_offset(Scene *scene, BVHLayout bvh_layout);

  void device_update_object(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);

--- a/intern/cycles/scene/object.cpp
+++ b/intern/cycles/scene/object.cpp
@@ -821,7 +821,7 @@ void ObjectManager::device_update_flags(
  dscene->object_volume_step.clear_modified();
 }

-void ObjectManager::device_update_mesh_offsets(Device *, DeviceScene *dscene, Scene *scene)
+void ObjectManager::device_update_geom_offsets(Device *, DeviceScene *dscene, Scene *scene)
 {
  if (dscene->objects.size() == 0) {
    return;
--- a/intern/cycles/scene/object.h
+++ b/intern/cycles/scene/object.h
@@ -162,7 +162,7 @@ class ObjectManager {
                           Scene *scene,
                           Progress &progress,
                           bool bounds_valid = true);
-  void device_update_mesh_offsets(Device *device, DeviceScene *dscene, Scene *scene);
+  void device_update_geom_offsets(Device *device, DeviceScene *dscene, Scene *scene);

  void device_free(Device *device, DeviceScene *dscene, bool force_free);

--- a/intern/cycles/scene/pass.cpp
+++ b/intern/cycles/scene/pass.cpp
@@ -101,6 +101,7 @@ const NodeEnum *Pass::get_type_enum()
    pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL);
    pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
    pass_type_enum.insert("denoising_depth", PASS_DENOISING_DEPTH);
+    pass_type_enum.insert("denoising_previous", PASS_DENOISING_PREVIOUS);

    pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
    pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
@@ -299,6 +300,10 @@ PassInfo Pass::get_info(const PassType type, const bool include_albedo)
    case PASS_DENOISING_DEPTH:
      pass_info.num_components = 1;
      break;
+    case PASS_DENOISING_PREVIOUS:
+      pass_info.num_components = 3;
+      pass_info.use_exposure = true;
+      break;

    case PASS_SHADOW_CATCHER:
      pass_info.num_components = 3;
--- a/intern/cycles/session/denoising.cpp
+++ b/intern/cycles/session/denoising.cpp
@@ -16,62 +16,17 @@

 #include "session/denoising.h"

-#if 0
+#include "util/map.h"
+#include "util/system.h"
+#include "util/task.h"
+#include "util/time.h"

-#  include "kernel/filter/filter_defines.h"
-
-#  include "util/util_foreach.h"
-#  include "util/util_map.h"
-#  include "util/util_system.h"
-#  include "util/util_task.h"
-#  include "util/util_time.h"
-
-#  include <OpenImageIO/filesystem.h>
+#include <OpenImageIO/filesystem.h>

 CCL_NAMESPACE_BEGIN

 /* Utility Functions */

-static void print_progress(int num, int total, int frame, int num_frames)
-{
-  const char *label = "Denoise Frame ";
-  int cols = system_console_width();
-
-  cols -= strlen(label);
-
-  int len = 1;
-  for (int x = total; x > 9; x /= 10) {
-    len++;
-  }
-
-  int bars = cols - 2 * len - 6;
-
-  printf("\r%s", label);
-
-  if (num_frames > 1) {
-    int frame_len = 1;
-    for (int x = num_frames - 1; x > 9; x /= 10) {
-      frame_len++;
-    }
-    bars -= frame_len + 2;
-    printf("%*d ", frame_len, frame);
-  }
-
-  int v = int(float(num) * bars / total);
-  printf("[");
-  for (int i = 0; i < v; i++) {
-    printf("=");
-  }
-  if (v < bars) {
-    printf(">");
-  }
-  for (int i = v + 1; i < bars; i++) {
-    printf(" ");
-  }
-  printf(string_printf("] %%%dd / %d", len, total).c_str(), num);
-  fflush(stdout);
-}
-
 /* Splits in at its last dot, setting suffix to the part after the dot and in to the part before
 * it. Returns whether a dot was found. */
 static bool split_last_dot(string &in, string &suffix)
@@ -125,24 +80,18 @@ static void fill_mapping(vector<ChannelMapping> &map, int pos, string name, stri
  }
 }

-static const int INPUT_NUM_CHANNELS = 15;
-static const int INPUT_DENOISING_DEPTH = 0;
-static const int INPUT_DENOISING_NORMAL = 1;
-static const int INPUT_DENOISING_SHADOWING = 4;
-static const int INPUT_DENOISING_ALBEDO = 5;
-static const int INPUT_NOISY_IMAGE = 8;
-static const int INPUT_DENOISING_VARIANCE = 11;
-static const int INPUT_DENOISING_INTENSITY = 14;
+static const int INPUT_NUM_CHANNELS = 13;
+static const int INPUT_NOISY_IMAGE = 0;
+static const int INPUT_DENOISING_NORMAL = 3;
+static const int INPUT_DENOISING_ALBEDO = 6;
+static const int INPUT_MOTION = 9;
 static vector<ChannelMapping> input_channels()
 {
  vector<ChannelMapping> map;
-  fill_mapping(map, INPUT_DENOISING_DEPTH, "Denoising Depth", "Z");
+  fill_mapping(map, INPUT_NOISY_IMAGE, "Combined", "RGB");
  fill_mapping(map, INPUT_DENOISING_NORMAL, "Denoising Normal", "XYZ");
-  fill_mapping(map, INPUT_DENOISING_SHADOWING, "Denoising Shadowing", "X");
  fill_mapping(map, INPUT_DENOISING_ALBEDO, "Denoising Albedo", "RGB");
-  fill_mapping(map, INPUT_NOISY_IMAGE, "Noisy Image", "RGB");
-  fill_mapping(map, INPUT_DENOISING_VARIANCE, "Denoising Variance", "RGB");
-  fill_mapping(map, INPUT_DENOISING_INTENSITY, "Denoising Intensity", "X");
+  fill_mapping(map, INPUT_MOTION, "Vector", "XYZW");
  return map;
 }

@@ -162,7 +111,7 @@ bool DenoiseImageLayer::detect_denoising_channels()
  input_to_image_channel.clear();
  input_to_image_channel.resize(INPUT_NUM_CHANNELS, -1);

-  foreach (const ChannelMapping &mapping, input_channels()) {
+  for (const ChannelMapping &mapping : input_channels()) {
    vector<string>::iterator i = find(channels.begin(), channels.end(), mapping.name);
    if (i == channels.end()) {
      return false;
@@ -177,7 +126,7 @@ bool DenoiseImageLayer::detect_denoising_channels()
  output_to_image_channel.clear();
  output_to_image_channel.resize(OUTPUT_NUM_CHANNELS, -1);

-  foreach (const ChannelMapping &mapping, output_channels()) {
+  for (const ChannelMapping &mapping : output_channels()) {
    vector<string>::iterator i = find(channels.begin(), channels.end(), mapping.name);
    if (i == channels.end()) {
      return false;
@@ -199,18 +148,16 @@ bool DenoiseImageLayer::detect_denoising_channels()
  return true;
 }

-bool DenoiseImageLayer::match_channels(int neighbor,
-                                       const std::vector<string> &channelnames,
+bool DenoiseImageLayer::match_channels(const std::vector<string> &channelnames,
                                       const std::vector<string> &neighbor_channelnames)
 {
-  neighbor_input_to_image_channel.resize(neighbor + 1);
-  vector<int> &mapping = neighbor_input_to_image_channel[neighbor];
+  vector<int> &mapping = previous_output_to_image_channel;

  assert(mapping.size() == 0);
-  mapping.resize(input_to_image_channel.size(), -1);
+  mapping.resize(output_to_image_channel.size(), -1);

-  for (int i = 0; i < input_to_image_channel.size(); i++) {
-    const string &channel = channelnames[input_to_image_channel[i]];
+  for (int i = 0; i < output_to_image_channel.size(); i++) {
+    const string &channel = channelnames[output_to_image_channel[i]];
    std::vector<string>::const_iterator frame_channel = find(
        neighbor_channelnames.begin(), neighbor_channelnames.end(), channel);

@@ -226,19 +173,9 @@ bool DenoiseImageLayer::match_channels(int neighbor,

 /* Denoise Task */

-DenoiseTask::DenoiseTask(Device *device,
-                         DenoiserPipeline *denoiser,
-                         int frame,
-                         const vector<int> &neighbor_frames)
-    : denoiser(denoiser),
-      device(device),
-      frame(frame),
-      neighbor_frames(neighbor_frames),
-      current_layer(0),
-      input_pixels(device, "filter input buffer", MEM_READ_ONLY),
-      num_tiles(0)
+DenoiseTask::DenoiseTask(Device *device, DenoiserPipeline *denoiser, int frame)
+    : denoiser(denoiser), device(device), frame(frame), current_layer(0), buffers(device)
 {
-  image.samples = denoiser->samples_override;
 }

 DenoiseTask::~DenoiseTask()
@@ -246,284 +183,39 @@ DenoiseTask::~DenoiseTask()
  free();
 }

-/* Device callbacks */
-
-bool DenoiseTask::acquire_tile(Device *device, Device *tile_device, RenderTile &tile)
-{
-  thread_scoped_lock tile_lock(tiles_mutex);
-
-  if (tiles.empty()) {
-    return false;
-  }
-
-  tile = tiles.front();
-  tiles.pop_front();
-
-  device->map_tile(tile_device, tile);
-
-  print_progress(num_tiles - tiles.size(), num_tiles, frame, denoiser->num_frames);
-
-  return true;
-}
-
-/* Mapping tiles is required for regular rendering since each tile has its separate memory
- * which may be allocated on a different device.
- * For standalone denoising, there is a single memory that is present on all devices, so the only
- * thing that needs to be done here is to specify the surrounding tile geometry.
- *
- * However, since there is only one large memory, the denoised result has to be written to
- * a different buffer to avoid having to copy an entire horizontal slice of the image. */
-void DenoiseTask::map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
-  RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
-  RenderTile &target_tile = neighbors.target;
-
-  /* Fill tile information. */
-  for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
-    if (i == RenderTileNeighbors::CENTER) {
-      continue;
-    }
-
-    RenderTile &tile = neighbors.tiles[i];
-    int dx = (i % 3) - 1;
-    int dy = (i / 3) - 1;
-    tile.x = clamp(center_tile.x + dx * denoiser->tile_size.x, 0, image.width);
-    tile.w = clamp(center_tile.x + (dx + 1) * denoiser->tile_size.x, 0, image.width) - tile.x;
-    tile.y = clamp(center_tile.y + dy * denoiser->tile_size.y, 0, image.height);
-    tile.h = clamp(center_tile.y + (dy + 1) * denoiser->tile_size.y, 0, image.height) - tile.y;
-
-    tile.buffer = center_tile.buffer;
-    tile.offset = center_tile.offset;
-    tile.stride = image.width;
-  }
-
-  /* Allocate output buffer. */
-  device_vector<float> *output_mem = new device_vector<float>(
-      tile_device, "denoising_output", MEM_READ_WRITE);
-  output_mem->alloc(OUTPUT_NUM_CHANNELS * center_tile.w * center_tile.h);
-
-  /* Fill output buffer with noisy image, assumed by kernel_filter_finalize
-   * when skipping denoising of some pixels. */
-  float *result = output_mem->data();
-  float *in = &image.pixels[image.num_channels * (center_tile.y * image.width + center_tile.x)];
-
-  const DenoiseImageLayer &layer = image.layers[current_layer];
-  const int *input_to_image_channel = layer.input_to_image_channel.data();
-
-  for (int y = 0; y < center_tile.h; y++) {
-    for (int x = 0; x < center_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
-      for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
-        result[i] = in[image.num_channels * x + input_to_image_channel[INPUT_NOISY_IMAGE + i]];
-      }
-    }
-    in += image.num_channels * image.width;
-  }
-
-  output_mem->copy_to_device();
-
-  /* Fill output tile info. */
-  target_tile = center_tile;
-  target_tile.buffer = output_mem->device_pointer;
-  target_tile.stride = target_tile.w;
-  target_tile.offset -= target_tile.x + target_tile.y * target_tile.stride;
-
-  thread_scoped_lock output_lock(output_mutex);
-  assert(output_pixels.count(center_tile.tile_index) == 0);
-  output_pixels[target_tile.tile_index] = output_mem;
-}
-
-void DenoiseTask::unmap_neighboring_tiles(RenderTileNeighbors &neighbors)
-{
-  RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
-  RenderTile &target_tile = neighbors.target;
-
-  thread_scoped_lock output_lock(output_mutex);
-  assert(output_pixels.count(center_tile.tile_index) == 1);
-  device_vector<float> *output_mem = output_pixels[target_tile.tile_index];
-  output_pixels.erase(center_tile.tile_index);
-  output_lock.unlock();
-
-  /* Copy denoised pixels from device. */
-  output_mem->copy_from_device(0, OUTPUT_NUM_CHANNELS * target_tile.w, target_tile.h);
-
-  float *result = output_mem->data();
-  float *out = &image.pixels[image.num_channels * (target_tile.y * image.width + target_tile.x)];
-
-  const DenoiseImageLayer &layer = image.layers[current_layer];
-  const int *output_to_image_channel = layer.output_to_image_channel.data();
-
-  for (int y = 0; y < target_tile.h; y++) {
-    for (int x = 0; x < target_tile.w; x++, result += OUTPUT_NUM_CHANNELS) {
-      for (int i = 0; i < OUTPUT_NUM_CHANNELS; i++) {
-        out[image.num_channels * x + output_to_image_channel[i]] = result[i];
-      }
-    }
-    out += image.num_channels * image.width;
-  }
-
-  /* Free device buffer. */
-  output_mem->free();
-  delete output_mem;
-}
-
-void DenoiseTask::release_tile()
-{
-}
-
-bool DenoiseTask::get_cancel()
-{
-  return false;
-}
-
-void DenoiseTask::create_task(DeviceTask &task)
-{
-  /* Callback functions. */
-  task.acquire_tile = function_bind(&DenoiseTask::acquire_tile, this, device, _1, _2);
-  task.map_neighbor_tiles = function_bind(&DenoiseTask::map_neighboring_tiles, this, _1, _2);
-  task.unmap_neighbor_tiles = function_bind(&DenoiseTask::unmap_neighboring_tiles, this, _1);
-  task.release_tile = function_bind(&DenoiseTask::release_tile, this);
-  task.get_cancel = function_bind(&DenoiseTask::get_cancel, this);
-
-  /* Denoising parameters. */
-  task.denoising = denoiser->params;
-  task.denoising.type = DENOISER_NLM;
-  task.denoising.use = true;
-  task.denoising_from_render = false;
-
-  task.denoising_frames.resize(neighbor_frames.size());
-  for (int i = 0; i < neighbor_frames.size(); i++) {
-    task.denoising_frames[i] = neighbor_frames[i] - frame;
-  }
-
-  /* Buffer parameters. */
-  task.pass_stride = INPUT_NUM_CHANNELS;
-  task.target_pass_stride = OUTPUT_NUM_CHANNELS;
-  task.pass_denoising_data = 0;
-  task.pass_denoising_clean = -1;
-  task.frame_stride = image.width * image.height * INPUT_NUM_CHANNELS;
-
-  /* Create tiles. */
-  thread_scoped_lock tile_lock(tiles_mutex);
-  thread_scoped_lock output_lock(output_mutex);
-
-  tiles.clear();
-  assert(output_pixels.empty());
-  output_pixels.clear();
-
-  int tiles_x = divide_up(image.width, denoiser->tile_size.x);
-  int tiles_y = divide_up(image.height, denoiser->tile_size.y);
-
-  for (int ty = 0; ty < tiles_y; ty++) {
-    for (int tx = 0; tx < tiles_x; tx++) {
-      RenderTile tile;
-      tile.x = tx * denoiser->tile_size.x;
-      tile.y = ty * denoiser->tile_size.y;
-      tile.w = min(image.width - tile.x, denoiser->tile_size.x);
-      tile.h = min(image.height - tile.y, denoiser->tile_size.y);
-      tile.start_sample = 0;
-      tile.num_samples = image.layers[current_layer].samples;
-      tile.sample = 0;
-      tile.offset = 0;
-      tile.stride = image.width;
-      tile.tile_index = ty * tiles_x + tx;
-      tile.task = RenderTile::DENOISE;
-      tile.buffers = NULL;
-      tile.buffer = input_pixels.device_pointer;
-      tiles.push_back(tile);
-    }
-  }
-
-  num_tiles = tiles.size();
-}
-
 /* Denoiser Operations */

 bool DenoiseTask::load_input_pixels(int layer)
 {
-  int w = image.width;
-  int h = image.height;
-  int num_pixels = image.width * image.height;
-  int frame_stride = num_pixels * INPUT_NUM_CHANNELS;
-
  /* Load center image */
  DenoiseImageLayer &image_layer = image.layers[layer];

-  float *buffer_data = input_pixels.data();
-  image.read_pixels(image_layer, buffer_data);
-  buffer_data += frame_stride;
+  float *buffer_data = buffers.buffer.data();
+  image.read_pixels(image_layer, buffers.params, buffer_data);

-  /* Load neighbor images */
-  for (int i = 0; i < image.in_neighbors.size(); i++) {
-    if (!image.read_neighbor_pixels(i, image_layer, buffer_data)) {
-      error = "Failed to read neighbor frame pixels";
-      return false;
-    }
-    buffer_data += frame_stride;
-  }
-
-  /* Preprocess */
-  buffer_data = input_pixels.data();
-  for (int neighbor = 0; neighbor < image.in_neighbors.size() + 1; neighbor++) {
-    /* Clamp */
-    if (denoiser->params.clamp_input) {
-      for (int i = 0; i < num_pixels * INPUT_NUM_CHANNELS; i++) {
-        buffer_data[i] = clamp(buffer_data[i], -1e8f, 1e8f);
-      }
-    }
-
-    /* Box blur */
-    int r = 5 * denoiser->params.radius;
-    float *data = buffer_data + 14;
-    array<float> temp(num_pixels);
-
-    for (int y = 0; y < h; y++) {
-      for (int x = 0; x < w; x++) {
-        int n = 0;
-        float sum = 0.0f;
-        for (int dx = max(x - r, 0); dx < min(x + r + 1, w); dx++, n++) {
-          sum += data[INPUT_NUM_CHANNELS * (y * w + dx)];
-        }
-        temp[y * w + x] = sum / n;
-      }
-    }
-
-    for (int y = 0; y < h; y++) {
-      for (int x = 0; x < w; x++) {
-        int n = 0;
-        float sum = 0.0f;
-
-        for (int dy = max(y - r, 0); dy < min(y + r + 1, h); dy++, n++) {
-          sum += temp[dy * w + x];
-        }
-
-        data[INPUT_NUM_CHANNELS * (y * w + x)] = sum / n;
-      }
-    }
-
-    /* Highlight compression */
-    data = buffer_data + 8;
-    for (int y = 0; y < h; y++) {
-      for (int x = 0; x < w; x++) {
-        int idx = INPUT_NUM_CHANNELS * (y * w + x);
-        float3 color = make_float3(data[idx], data[idx + 1], data[idx + 2]);
-        color = color_highlight_compress(color, NULL);
-        data[idx] = color.x;
-        data[idx + 1] = color.y;
-        data[idx + 2] = color.z;
-      }
-    }
-
-    buffer_data += frame_stride;
+  /* Load previous image */
+  if (frame > 0 && !image.read_previous_pixels(image_layer, buffers.params, buffer_data)) {
+    error = "Failed to read neighbor frame pixels";
+    return false;
  }

  /* Copy to device */
-  input_pixels.copy_to_device();
+  buffers.buffer.copy_to_device();

  return true;
 }

 /* Task stages */

+static void add_pass(vector<Pass *> &passes, PassType type, PassMode mode = PassMode::NOISY)
+{
+  Pass *pass = new Pass();
+  pass->set_type(type);
+  pass->set_mode(mode);
+
+  passes.push_back(pass);
+}
+
 bool DenoiseTask::load()
 {
  string center_filepath = denoiser->input[frame];
@@ -531,7 +223,8 @@ bool DenoiseTask::load()
    return false;
  }

-  if (!image.load_neighbors(denoiser->input, neighbor_frames, error)) {
+  /* Use previous frame output as input for subsequent frames. */
+  if (frame > 0 && !image.load_previous(denoiser->output[frame - 1], error)) {
    return false;
  }

@@ -540,10 +233,35 @@ bool DenoiseTask::load()
    return false;
  }

+  /* Enable temporal denoising for frames after the first (which will use the output from the
+   * previous frames). */
+  DenoiseParams params = denoiser->denoiser->get_params();
+  params.temporally_stable = frame > 0;
+  denoiser->denoiser->set_params(params);
+
  /* Allocate device buffer. */
-  int num_frames = image.in_neighbors.size() + 1;
-  input_pixels.alloc(image.width * INPUT_NUM_CHANNELS, image.height * num_frames);
-  input_pixels.zero_to_device();
+  vector<Pass *> passes;
+  add_pass(passes, PassType::PASS_COMBINED);
+  add_pass(passes, PassType::PASS_DENOISING_ALBEDO);
+  add_pass(passes, PassType::PASS_DENOISING_NORMAL);
+  add_pass(passes, PassType::PASS_MOTION);
+  add_pass(passes, PassType::PASS_DENOISING_PREVIOUS);
+  add_pass(passes, PassType::PASS_COMBINED, PassMode::DENOISED);
+
+  BufferParams buffer_params;
+  buffer_params.width = image.width;
+  buffer_params.height = image.height;
+  buffer_params.full_x = 0;
+  buffer_params.full_y = 0;
+  buffer_params.full_width = image.width;
+  buffer_params.full_height = image.height;
+  buffer_params.update_passes(passes);
+
+  for (Pass *pass : passes) {
+    delete pass;
+  }
+
+  buffers.reset(buffer_params);

  /* Read pixels for first layer. */
  current_layer = 0;
@@ -565,10 +283,26 @@ bool DenoiseTask::exec()
    }

    /* Run task on device. */
-    DeviceTask task(DeviceTask::RENDER);
-    create_task(task);
-    device->task_add(task);
-    device->task_wait();
+    denoiser->denoiser->denoise_buffer(buffers.params, &buffers, 1, true);
+
+    /* Copy denoised pixels from device. */
+    buffers.buffer.copy_from_device();
+
+    float *result = buffers.buffer.data(), *out = image.pixels.data();
+
+    const DenoiseImageLayer &layer = image.layers[current_layer];
+    const int *output_to_image_channel = layer.output_to_image_channel.data();
+
+    for (int y = 0; y < image.height; y++) {
+      for (int x = 0; x < image.width; x++, result += buffers.params.pass_stride) {
+        for (int j = 0; j < OUTPUT_NUM_CHANNELS; j++) {
+          int offset = buffers.params.get_pass_offset(PASS_COMBINED, PassMode::DENOISED);
+          int image_channel = output_to_image_channel[j];
+          out[image.num_channels * x + image_channel] = result[offset + j];
+        }
+      }
+      out += image.num_channels * image.width;
+    }

    printf("\n");
  }
@@ -586,8 +320,7 @@ bool DenoiseTask::save()
 void DenoiseTask::free()
 {
  image.free();
-  input_pixels.free();
-  assert(output_pixels.empty());
+  buffers.buffer.free();
 }

 /* Denoise Image Storage */
@@ -607,7 +340,7 @@ DenoiseImage::~DenoiseImage()

 void DenoiseImage::close_input()
 {
-  in_neighbors.clear();
+  in_previous.reset();
 }

 void DenoiseImage::free()
@@ -677,39 +410,61 @@ bool DenoiseImage::parse_channels(const ImageSpec &in_spec, string &error)
  return true;
 }

-void DenoiseImage::read_pixels(const DenoiseImageLayer &layer, float *input_pixels)
+void DenoiseImage::read_pixels(const DenoiseImageLayer &layer,
+                               const BufferParams &params,
+                               float *input_pixels)
 {
  /* Pixels from center file have already been loaded into pixels.
   * We copy a subset into the device input buffer with channels reshuffled. */
  const int *input_to_image_channel = layer.input_to_image_channel.data();

  for (int i = 0; i < width * height; i++) {
-    for (int j = 0; j < INPUT_NUM_CHANNELS; j++) {
-      int image_channel = input_to_image_channel[j];
-      input_pixels[i * INPUT_NUM_CHANNELS + j] =
+    for (int j = 0; j < 3; ++j) {
+      int offset = params.get_pass_offset(PASS_COMBINED);
+      int image_channel = input_to_image_channel[INPUT_NOISY_IMAGE + j];
+      input_pixels[i * params.pass_stride + offset + j] =
+          pixels[((size_t)i) * num_channels + image_channel];
+    }
+    for (int j = 0; j < 3; ++j) {
+      int offset = params.get_pass_offset(PASS_DENOISING_NORMAL);
+      int image_channel = input_to_image_channel[INPUT_DENOISING_NORMAL + j];
+      input_pixels[i * params.pass_stride + offset + j] =
+          pixels[((size_t)i) * num_channels + image_channel];
+    }
+    for (int j = 0; j < 3; ++j) {
+      int offset = params.get_pass_offset(PASS_DENOISING_ALBEDO);
+      int image_channel = input_to_image_channel[INPUT_DENOISING_ALBEDO + j];
+      input_pixels[i * params.pass_stride + offset + j] =
+          pixels[((size_t)i) * num_channels + image_channel];
+    }
+    for (int j = 0; j < 4; ++j) {
+      int offset = params.get_pass_offset(PASS_MOTION);
+      int image_channel = input_to_image_channel[INPUT_MOTION + j];
+      input_pixels[i * params.pass_stride + offset + j] =
          pixels[((size_t)i) * num_channels + image_channel];
    }
  }
 }

-bool DenoiseImage::read_neighbor_pixels(int neighbor,
-                                        const DenoiseImageLayer &layer,
+bool DenoiseImage::read_previous_pixels(const DenoiseImageLayer &layer,
+                                        const BufferParams &params,
                                        float *input_pixels)
 {
  /* Load pixels from neighboring frames, and copy them into device buffer
   * with channels reshuffled. */
  size_t num_pixels = (size_t)width * (size_t)height;
  array<float> neighbor_pixels(num_pixels * num_channels);
-  if (!in_neighbors[neighbor]->read_image(TypeDesc::FLOAT, neighbor_pixels.data())) {
+  if (!in_previous->read_image(TypeDesc::FLOAT, neighbor_pixels.data())) {
    return false;
  }

-  const int *input_to_image_channel = layer.neighbor_input_to_image_channel[neighbor].data();
+  const int *output_to_image_channel = layer.previous_output_to_image_channel.data();

  for (int i = 0; i < width * height; i++) {
-    for (int j = 0; j < INPUT_NUM_CHANNELS; j++) {
-      int image_channel = input_to_image_channel[j];
-      input_pixels[i * INPUT_NUM_CHANNELS + j] =
+    for (int j = 0; j < 3; ++j) {
+      int offset = params.get_pass_offset(PASS_DENOISING_PREVIOUS);
+      int image_channel = output_to_image_channel[j];
+      input_pixels[i * params.pass_stride + offset + j] =
          neighbor_pixels[((size_t)i) * num_channels + image_channel];
    }
  }
@@ -739,8 +494,8 @@ bool DenoiseImage::load(const string &in_filepath, string &error)
    return false;
  }

-  if (layers.size() == 0) {
-    error = "Could not find a render layer containing denoising info";
+  if (layers.empty()) {
+    error = "Could not find a render layer containing denoising data and motion vector passes";
    return false;
  }

@@ -757,46 +512,34 @@ bool DenoiseImage::load(const string &in_filepath, string &error)
  return true;
 }

-bool DenoiseImage::load_neighbors(const vector<string> &filepaths,
-                                  const vector<int> &frames,
-                                  string &error)
+bool DenoiseImage::load_previous(const string &filepath, string &error)
 {
-  if (frames.size() > DENOISE_MAX_FRAMES - 1) {
-    error = string_printf("Maximum number of neighbors (%d) exceeded\n", DENOISE_MAX_FRAMES - 1);
+  if (!Filesystem::is_regular(filepath)) {
+    error = "Couldn't find neighbor frame: " + filepath;
    return false;
  }

-  for (int neighbor = 0; neighbor < frames.size(); neighbor++) {
-    int frame = frames[neighbor];
-    const string &filepath = filepaths[frame];
-
-    if (!Filesystem::is_regular(filepath)) {
-      error = "Couldn't find neighbor frame: " + filepath;
-      return false;
-    }
-
-    unique_ptr<ImageInput> in_neighbor(ImageInput::open(filepath));
-    if (!in_neighbor) {
-      error = "Couldn't open neighbor frame: " + filepath;
-      return false;
-    }
-
-    const ImageSpec &neighbor_spec = in_neighbor->spec();
-    if (neighbor_spec.width != width || neighbor_spec.height != height) {
-      error = "Neighbor frame has different dimensions: " + filepath;
-      return false;
-    }
-
-    foreach (DenoiseImageLayer &layer, layers) {
-      if (!layer.match_channels(neighbor, in_spec.channelnames, neighbor_spec.channelnames)) {
-        error = "Neighbor frame misses denoising data passes: " + filepath;
-        return false;
-      }
-    }
-
-    in_neighbors.push_back(std::move(in_neighbor));
+  unique_ptr<ImageInput> in_neighbor(ImageInput::open(filepath));
+  if (!in_neighbor) {
+    error = "Couldn't open neighbor frame: " + filepath;
+    return false;
  }

+  const ImageSpec &neighbor_spec = in_neighbor->spec();
+  if (neighbor_spec.width != width || neighbor_spec.height != height) {
+    error = "Neighbor frame has different dimensions: " + filepath;
+    return false;
+  }
+
+  for (DenoiseImageLayer &layer : layers) {
+    if (!layer.match_channels(in_spec.channelnames, neighbor_spec.channelnames)) {
+      error = "Neighbor frame misses denoising data passes: " + filepath;
+      return false;
+    }
+  }
+
+  in_previous = std::move(in_neighbor);
+
  return true;
 }

@@ -864,24 +607,22 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error)

 /* File pattern handling and outer loop over frames */

-DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info)
+DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info, const DenoiseParams &params)
 {
-  samples_override = 0;
-  tile_size = make_int2(64, 64);
-
-  num_frames = 0;
-
  /* Initialize task scheduler. */
  TaskScheduler::init();

  /* Initialize device. */
-  device = Device::create(device_info, stats, profiler, true);
-
+  device = Device::create(device_info, stats, profiler);
  device->load_kernels(KERNEL_FEATURE_DENOISING);
+
+  denoiser = Denoiser::create(device, params);
+  denoiser->load_kernels(nullptr);
 }

 DenoiserPipeline::~DenoiserPipeline()
 {
+  denoiser.reset();
  delete device;
  TaskScheduler::exit();
 }
@@ -890,7 +631,7 @@ bool DenoiserPipeline::run()
 {
  assert(input.size() == output.size());

-  num_frames = output.size();
+  int num_frames = output.size();

  for (int frame = 0; frame < num_frames; frame++) {
    /* Skip empty output paths. */
@@ -898,16 +639,8 @@ bool DenoiserPipeline::run()
      continue;
    }

-    /* Determine neighbor frame numbers that should be used for filtering. */
-    vector<int> neighbor_frames;
-    for (int f = frame - params.neighbor_frames; f <= frame + params.neighbor_frames; f++) {
-      if (f >= 0 && f < num_frames && f != frame) {
-        neighbor_frames.push_back(f);
-      }
-    }
-
    /* Execute task. */
-    DenoiseTask task(device, this, frame, neighbor_frames);
+    DenoiseTask task(device, this, frame);
    if (!task.load()) {
      error = task.error;
      return false;
@@ -930,5 +663,3 @@ bool DenoiserPipeline::run()
 }

 CCL_NAMESPACE_END
-
-#endif
--- a/intern/cycles/session/denoising.h
+++ b/intern/cycles/session/denoising.h
@@ -17,20 +17,17 @@
 #ifndef __DENOISING_H__
 #define __DENOISING_H__

-#if 0
-
 /* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or
 * parameters. Currently it is an annoying mixture of terms used interchangeably. */

-#  include "device/device.h"
+#include "device/device.h"
+#include "integrator/denoiser.h"

-#  include "render/buffers.h"
+#include "util/string.h"
+#include "util/unique_ptr.h"
+#include "util/vector.h"

-#  include "util/util_string.h"
-#  include "util/util_unique_ptr.h"
-#  include "util/util_vector.h"
-
-#  include <OpenImageIO/imageio.h>
+#include <OpenImageIO/imageio.h>

 OIIO_NAMESPACE_USING

@@ -40,7 +37,7 @@ CCL_NAMESPACE_BEGIN

 class DenoiserPipeline {
 public:
-  DenoiserPipeline(DeviceInfo &device_info);
+  DenoiserPipeline(DeviceInfo &device_info, const DenoiseParams &params);
  ~DenoiserPipeline();

  bool run();
@@ -55,22 +52,13 @@ class DenoiserPipeline {
   * taking into account all input frames. */
  vector<string> output;

-  /* Sample number override, takes precedence over values from input frames. */
-  int samples_override;
-  /* Tile size for processing on device. */
-  int2 tile_size;
-
-  /* Equivalent to the settings in the regular denoiser. */
-  DenoiseParams params;
-
 protected:
  friend class DenoiseTask;

  Stats stats;
  Profiler profiler;
  Device *device;
-
-  int num_frames;
+  std::unique_ptr<Denoiser> denoiser;
 };

 /* Denoise Image Layer */
@@ -88,13 +76,13 @@ struct DenoiseImageLayer {
  /* Device input channel will be copied from image channel input_to_image_channel[i]. */
  vector<int> input_to_image_channel;

-  /* input_to_image_channel of the secondary frames, if any are used. */
-  vector<vector<int>> neighbor_input_to_image_channel;
-
  /* Write i-th channel of the processing output to output_to_image_channel[i]-th channel of the
   * file. */
  vector<int> output_to_image_channel;

+  /* output_to_image_channel of the previous frame, if used. */
+  vector<int> previous_output_to_image_channel;
+
  /* Detect whether this layer contains a full set of channels and set up the offsets accordingly.
   */
  bool detect_denoising_channels();
@@ -102,8 +90,7 @@ struct DenoiseImageLayer {
  /* Map the channels of a secondary frame to the channels that are required for processing,
   * fill neighbor_input_to_image_channel if all are present or return false if a channel are
   * missing. */
-  bool match_channels(int neighbor,
-                      const std::vector<string> &channelnames,
+  bool match_channels(const std::vector<string> &channelnames,
                      const std::vector<string> &neighbor_channelnames);
 };

@@ -125,7 +112,7 @@ class DenoiseImage {

  /* Image file handles */
  ImageSpec in_spec;
-  vector<unique_ptr<ImageInput>> in_neighbors;
+  unique_ptr<ImageInput> in_previous;

  /* Render layers */
  vector<DenoiseImageLayer> layers;
@@ -137,12 +124,16 @@ class DenoiseImage {
  bool load(const string &in_filepath, string &error);

  /* Load neighboring frames. */
-  bool load_neighbors(const vector<string> &filepaths, const vector<int> &frames, string &error);
+  bool load_previous(const string &in_filepath, string &error);

  /* Load subset of pixels from file buffer into input buffer, as needed for denoising
   * on the device. Channels are reshuffled following the provided mapping. */
-  void read_pixels(const DenoiseImageLayer &layer, float *input_pixels);
-  bool read_neighbor_pixels(int neighbor, const DenoiseImageLayer &layer, float *input_pixels);
+  void read_pixels(const DenoiseImageLayer &layer,
+                   const BufferParams &params,
+                   float *input_pixels);
+  bool read_previous_pixels(const DenoiseImageLayer &layer,
+                            const BufferParams &params,
+                            float *input_pixels);

  bool save_output(const string &out_filepath, string &error);

@@ -159,10 +150,7 @@ class DenoiseImage {

 class DenoiseTask {
 public:
-  DenoiseTask(Device *device,
-              DenoiserPipeline *denoiser,
-              int frame,
-              const vector<int> &neighbor_frames);
+  DenoiseTask(Device *device, DenoiserPipeline *denoiser, int frame);
  ~DenoiseTask();

  /* Task stages */
@@ -180,37 +168,17 @@ class DenoiseTask {

  /* Frame number to be denoised */
  int frame;
-  vector<int> neighbor_frames;

  /* Image file data */
  DenoiseImage image;
  int current_layer;

-  /* Device input buffer */
-  device_vector<float> input_pixels;
-
-  /* Tiles */
-  thread_mutex tiles_mutex;
-  list<RenderTile> tiles;
-  int num_tiles;
-
-  thread_mutex output_mutex;
-  map<int, device_vector<float> *> output_pixels;
+  RenderBuffers buffers;

  /* Task handling */
  bool load_input_pixels(int layer);
-  void create_task(DeviceTask &task);
-
-  /* Device task callbacks */
-  bool acquire_tile(Device *device, Device *tile_device, RenderTile &tile);
-  void map_neighboring_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
-  void unmap_neighboring_tiles(RenderTileNeighbors &neighbors);
-  void release_tile();
-  bool get_cancel();
 };

 CCL_NAMESPACE_END

-#endif
-
 #endif /* __DENOISING_H__ */
--- a/intern/cycles/session/display_driver.h
+++ b/intern/cycles/session/display_driver.h
@@ -54,6 +54,8 @@ class DisplayDriver {
    }
  };

+  virtual void next_tile_begin() = 0;
+
  /* Update the render from the rendering thread.
   *
   * Cycles periodically updates the render to be displayed. For multithreaded updates with
@@ -80,6 +82,9 @@ class DisplayDriver {
  virtual bool update_begin(const Params &params, int width, int height) = 0;
  virtual void update_end() = 0;

+  /* Optionally flush outstanding display commands before ending the render loop. */
+  virtual void flush(){};
+
  virtual half4 *map_texture_buffer() = 0;
  virtual void unmap_texture_buffer() = 0;

@@ -97,6 +102,17 @@ class DisplayDriver {

    /* Clear the entire buffer before doing partial write to it. */
    bool need_clear = false;
+
+    /* Enforce re-creation of the graphics interop object.
+     *
+     * When this field is true then the graphics interop will be re-created no matter what the
+     * rest of the configuration is.
+     * When this field is false the graphics interop will be re-created if the PBO or buffer size
+     * did change.
+     *
+     * This allows to ensure graphics interop is re-created when there is a possibility that an
+     * underlying PBO was re-allocated but did not change its ID. */
+    bool need_recreate = false;
  };

  virtual GraphicsInterop graphics_interop_get()
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -192,6 +192,8 @@ void Session::run_main_render_loop()
      break;
    }
  }
+
+  path_trace_->flush_display();
 }

 void Session::run()
@@ -303,7 +305,7 @@ RenderWork Session::run_update_for_next_iteration()

      tile_params.update_offset_stride();

-      path_trace_->reset(buffer_params_, tile_params);
+      path_trace_->reset(buffer_params_, tile_params, did_reset);
    }

    const int resolution = render_work.resolution_divider;
@@ -384,7 +386,8 @@ int2 Session::get_effective_tile_size() const
  const int tile_size = tile_manager_.compute_render_tile_size(params.tile_size);
  const int64_t actual_tile_area = static_cast<int64_t>(tile_size) * tile_size;

-  if (actual_tile_area >= image_area) {
+  if (actual_tile_area >= image_area && image_width <= TileManager::MAX_TILE_SIZE &&
+      image_height <= TileManager::MAX_TILE_SIZE) {
    return make_int2(image_width, image_height);
  }

@@ -423,6 +426,11 @@ void Session::do_delayed_reset()
  buffer_params_.update_passes(scene->passes);
  tile_manager_.update(buffer_params_, scene);

+  /* Update temp directory on reset.
+   * This potentially allows to finish the existing rendering with a previously configure temporary
+   * direcotry in the host software and switch to a new temp directory when new render starts. */
+  tile_manager_.set_temp_dir(params.temp_dir);
+
  /* Progress. */
  progress.reset_sample();
  progress.set_total_pixel_samples(static_cast<uint64_t>(buffer_params_.width) *
--- a/intern/cycles/session/session.h
+++ b/intern/cycles/session/session.h
@@ -69,6 +69,9 @@ class SessionParams {

  ShadingSystem shadingsystem;

+  /* Session-specific temporary directory to store in-progress EXR files in. */
+  string temp_dir;
+
  SessionParams()
  {
    headless = false;
--- a/intern/cycles/session/tile.cpp
+++ b/intern/cycles/session/tile.cpp
@@ -23,6 +23,7 @@
 #include "scene/film.h"
 #include "scene/integrator.h"
 #include "scene/scene.h"
+#include "session/session.h"
 #include "util/algorithm.h"
 #include "util/foreach.h"
 #include "util/log.h"
@@ -341,8 +342,10 @@ int TileManager::compute_render_tile_size(const int suggested_tile_size) const
  /* Must be a multiple of IMAGE_TILE_SIZE so that we can write render tiles into the image file
   * aligned on image tile boundaries. We can't set IMAGE_TILE_SIZE equal to the render tile size
   * because too big tile size leads to integer overflow inside OpenEXR. */
-  return (suggested_tile_size <= IMAGE_TILE_SIZE) ? suggested_tile_size :
-                                                    align_up(suggested_tile_size, IMAGE_TILE_SIZE);
+  const int computed_tile_size = (suggested_tile_size <= IMAGE_TILE_SIZE) ?
+                                     suggested_tile_size :
+                                     align_up(suggested_tile_size, IMAGE_TILE_SIZE);
+  return min(computed_tile_size, MAX_TILE_SIZE);
 }

 void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
@@ -392,6 +395,11 @@ void TileManager::update(const BufferParams &params, const Scene *scene)
  }
 }

+void TileManager::set_temp_dir(const string &temp_dir)
+{
+  temp_dir_ = temp_dir;
+}
+
 bool TileManager::done()
 {
  return tile_state_.next_tile_index == tile_state_.num_tiles;
@@ -450,7 +458,8 @@ const int2 TileManager::get_size() const

 bool TileManager::open_tile_output()
 {
-  write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
+  write_state_.filename = path_join(temp_dir_,
+                                    "cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
                                        to_string(write_state_.tile_file_index) + ".exr");

  write_state_.tile_out = ImageOutput::create(write_state_.filename);
--- a/intern/cycles/session/tile.h
+++ b/intern/cycles/session/tile.h
@@ -71,6 +71,8 @@ class TileManager {
   * Will store all parameters needed for buffers access outside of the scene graph. */
  void update(const BufferParams &params, const Scene *scene);

+  void set_temp_dir(const string &temp_dir);
+
  inline int get_num_tiles() const
  {
    return tile_state_.num_tiles;
@@ -122,6 +124,12 @@ class TileManager {
  /* Tile size in the image file. */
  static const int IMAGE_TILE_SIZE = 128;

+  /* Maximum supported tile size.
+   * Needs to be safe from allocation on a GPU point of view: the display driver needs to be able
+   * to allocate texture with the side size of this value.
+   * Use conservative value which is safe for most of OpenGL drivers and GPUs. */
+  static const int MAX_TILE_SIZE = 8192;
+
 protected:
  /* Get tile configuration for its index.
   * The tile index must be within [0, state_.tile_state_). */
@@ -130,6 +138,8 @@ class TileManager {
  bool open_tile_output();
  bool close_tile_output();

+  string temp_dir_;
+
  /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or
   * several sessions. */
  string tile_file_unique_part_;
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -54,17 +54,21 @@ set(SRC
  util_transform_test.cpp
 )

-if(CXX_HAS_AVX)
-  list(APPEND SRC
-    util_avxf_avx_test.cpp
-  )
-  set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
-endif()
-if(CXX_HAS_AVX2)
-  list(APPEND SRC
-    util_avxf_avx2_test.cpp
-  )
-  set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+# Disable AVX tests on macOS. Rosetta has problems running them, and other
+# platforms should be enough to verify AVX operations are implemented correctly.
+if(NOT APPLE)
+  if(CXX_HAS_AVX)
+    list(APPEND SRC
+      util_avxf_avx_test.cpp
+    )
+    set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+  endif()
+  if(CXX_HAS_AVX2)
+    list(APPEND SRC
+      util_avxf_avx2_test.cpp
+    )
+    set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+  endif()
 endif()

 if(WITH_GTESTS)
--- a/intern/cycles/test/util_avxf_test.h
+++ b/intern/cycles/test/util_avxf_test.h
@@ -32,9 +32,13 @@ static bool validate_cpu_capabilities()
 #endif
 }

-#define VALIDATECPU \
+#define INIT_AVX_TEST \
  if (!validate_cpu_capabilities()) \
-    return;
+    return; \
+\
+  const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
+  const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
+  const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);

 #define compare_vector_scalar(a, b) \
  for (size_t index = 0; index < a.size; index++) \
@@ -49,21 +53,18 @@ static bool validate_cpu_capabilities()
    EXPECT_NEAR(a[index], b[index], abserror);

 #define basic_test_vv(a, b, op) \
-  VALIDATECPU \
+  INIT_AVX_TEST \
  avxf c = a op b; \
  for (size_t i = 0; i < a.size; i++) \
    EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);

 /* vector op float tests */
 #define basic_test_vf(a, b, op) \
-  VALIDATECPU \
+  INIT_AVX_TEST \
  avxf c = a op b; \
  for (size_t i = 0; i < a.size; i++) \
    EXPECT_FLOAT_EQ(c[i], a[i] op b);

-static const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f);
-static const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
-static const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
 static const float float_b = 1.5f;

 TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
@@ -78,7 +79,7 @@ TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TES

 TEST(TEST_CATEGORY_NAME, avxf_ctor)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
                        static_cast<float>(index));
  compare_vector_scalar(avxf(1.0f), 1.0f);
@@ -91,28 +92,28 @@ TEST(TEST_CATEGORY_NAME, avxf_ctor)

 TEST(TEST_CATEGORY_NAME, avxf_sqrt)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
                        avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
 }

 TEST(TEST_CATEGORY_NAME, avxf_min_max)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
  compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
 }

 TEST(TEST_CATEGORY_NAME, avxf_set_sign)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
  compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
 }

 TEST(TEST_CATEGORY_NAME, avxf_msub)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = msub(avxf_a, avxf_b, avxf_c);
  avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
                  (avxf_a[6] * avxf_b[6]) - avxf_c[6],
@@ -127,7 +128,7 @@ TEST(TEST_CATEGORY_NAME, avxf_msub)

 TEST(TEST_CATEGORY_NAME, avxf_madd)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = madd(avxf_a, avxf_b, avxf_c);
  avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
                  (avxf_a[6] * avxf_b[6]) + avxf_c[6],
@@ -142,7 +143,7 @@ TEST(TEST_CATEGORY_NAME, avxf_madd)

 TEST(TEST_CATEGORY_NAME, avxf_nmadd)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = nmadd(avxf_a, avxf_b, avxf_c);
  avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
                  avxf_c[6] - (avxf_a[6] * avxf_b[6]),
@@ -157,7 +158,7 @@ TEST(TEST_CATEGORY_NAME, avxf_nmadd)

 TEST(TEST_CATEGORY_NAME, avxf_compare)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
  avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
  avxb res = a <= b;
@@ -176,28 +177,28 @@ TEST(TEST_CATEGORY_NAME, avxf_compare)

 TEST(TEST_CATEGORY_NAME, avxf_permute)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
  compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
 }

 TEST(TEST_CATEGORY_NAME, avxf_blend)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
  compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
 }

 TEST(TEST_CATEGORY_NAME, avxf_shuffle)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
  compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
 }

 TEST(TEST_CATEGORY_NAME, avxf_cross)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  avxf res = cross(avxf_b, avxf_c);
  compare_vector_vector_near(res,
                             avxf(0.0f,
@@ -213,7 +214,7 @@ TEST(TEST_CATEGORY_NAME, avxf_cross)

 TEST(TEST_CATEGORY_NAME, avxf_dot3)
 {
-  VALIDATECPU
+  INIT_AVX_TEST
  float den, den2;
  dot3(avxf_a, avxf_b, den, den2);
  EXPECT_FLOAT_EQ(den, 14.9f);
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -53,16 +53,6 @@ if(WITH_CYCLES_STANDALONE)
  endif()
 endif()

-if(CYCLES_STANDALONE_REPOSITORY)
-  list(APPEND INC_SYS
-    ../../third_party/numaapi/include
-  )
-else()
-  list(APPEND INC_SYS
-    ../../numaapi/include
-  )
-endif()
-
 set(SRC_HEADERS
  algorithm.h
  aligned_malloc.h
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -66,7 +66,6 @@ typedef struct stat path_stat_t;

 static string cached_path = "";
 static string cached_user_path = "";
-static string cached_temp_path = "";
 static string cached_xdg_cache_path = "";

 namespace {
@@ -336,11 +335,10 @@ static string path_xdg_cache_get()
 }
 #endif

-void path_init(const string &path, const string &user_path, const string &temp_path)
+void path_init(const string &path, const string &user_path)
 {
  cached_path = path;
  cached_user_path = user_path;
-  cached_temp_path = temp_path;

 #ifdef _MSC_VER
  // workaround for https://svn.boost.org/trac/boost/ticket/6320
@@ -384,15 +382,6 @@ string path_cache_get(const string &sub)
 #endif
 }

-string path_temp_get(const string &sub)
-{
-  if (cached_temp_path == "") {
-    cached_temp_path = Filesystem::temp_directory_path();
-  }
-
-  return path_join(cached_temp_path, sub);
-}
-
 #if defined(__linux__) || defined(__APPLE__)
 string path_xdg_home_get(const string &sub = "");
 #endif
--- a/intern/cycles/util/path.h
+++ b/intern/cycles/util/path.h
@@ -32,10 +32,9 @@
 CCL_NAMESPACE_BEGIN

 /* program paths */
-void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = "");
+void path_init(const string &path = "", const string &user_path = "");
 string path_get(const string &sub = "");
 string path_user_get(const string &sub = "");
-string path_temp_get(const string &sub = "");
 string path_cache_get(const string &sub = "");

 /* path string manipulation */
--- a/intern/cycles/util/system.cpp
+++ b/intern/cycles/util/system.cpp
@@ -20,9 +20,8 @@
 #include "util/string.h"
 #include "util/types.h"

-#include <numaapi.h>
-
 #include <OpenImageIO/sysutil.h>
+
 OIIO_NAMESPACE_USING

 #ifdef _WIN32
@@ -41,83 +40,6 @@ OIIO_NAMESPACE_USING

 CCL_NAMESPACE_BEGIN

-bool system_cpu_ensure_initialized()
-{
-  static bool is_initialized = false;
-  static bool result = false;
-  if (is_initialized) {
-    return result;
-  }
-  is_initialized = true;
-  const NUMAAPI_Result numa_result = numaAPI_Initialize();
-  result = (numa_result == NUMAAPI_SUCCESS);
-  return result;
-}
-
-/* Fallback solution, which doesn't use NUMA/CPU groups. */
-static int system_cpu_thread_count_fallback()
-{
-#ifdef _WIN32
-  SYSTEM_INFO info;
-  GetSystemInfo(&info);
-  return info.dwNumberOfProcessors;
-#elif defined(__APPLE__)
-  int count;
-  size_t len = sizeof(count);
-  int mib[2] = {CTL_HW, HW_NCPU};
-  sysctl(mib, 2, &count, &len, NULL, 0);
-  return count;
-#else
-  return sysconf(_SC_NPROCESSORS_ONLN);
-#endif
-}
-
-int system_cpu_thread_count()
-{
-  const int num_nodes = system_cpu_num_numa_nodes();
-  int num_threads = 0;
-  for (int node = 0; node < num_nodes; ++node) {
-    if (!system_cpu_is_numa_node_available(node)) {
-      continue;
-    }
-    num_threads += system_cpu_num_numa_node_processors(node);
-  }
-  return num_threads;
-}
-
-int system_cpu_num_numa_nodes()
-{
-  if (!system_cpu_ensure_initialized()) {
-    /* Fallback to a single node with all the threads. */
-    return 1;
-  }
-  return numaAPI_GetNumNodes();
-}
-
-bool system_cpu_is_numa_node_available(int node)
-{
-  if (!system_cpu_ensure_initialized()) {
-    return true;
-  }
-  return numaAPI_IsNodeAvailable(node);
-}
-
-int system_cpu_num_numa_node_processors(int node)
-{
-  if (!system_cpu_ensure_initialized()) {
-    return system_cpu_thread_count_fallback();
-  }
-  return numaAPI_GetNumNodeProcessors(node);
-}
-
-bool system_cpu_run_thread_on_node(int node)
-{
-  if (!system_cpu_ensure_initialized()) {
-    return true;
-  }
-  return numaAPI_RunThreadOnNode(node);
-}
-
 int system_console_width()
 {
  int columns = 0;
@@ -137,14 +59,6 @@ int system_console_width()
  return (columns > 0) ? columns : 80;
 }

-int system_cpu_num_active_group_processors()
-{
-  if (!system_cpu_ensure_initialized()) {
-    return system_cpu_thread_count_fallback();
-  }
-  return numaAPI_GetNumCurrentNodesProcessors();
-}
-
 /* Equivalent of Windows __cpuid for x86 processors on other platforms. */
 #if (!defined(_WIN32) || defined(FREE_WINDOWS)) && (defined(__x86_64__) || defined(__i386__))
 static void __cpuid(int data[4], int selector)
--- a/intern/cycles/util/system.h
+++ b/intern/cycles/util/system.h
@@ -22,36 +22,9 @@

 CCL_NAMESPACE_BEGIN

-/* Make sure CPU groups / NUMA API is initialized. */
-bool system_cpu_ensure_initialized();
-
-/* Get total number of threads in all NUMA nodes / CPU groups. */
-int system_cpu_thread_count();
-
 /* Get width in characters of the current console output. */
 int system_console_width();

-/* Get number of available nodes.
- *
- * This is in fact an index of last node plus one and it's not guaranteed
- * that all nodes up to this one are available. */
-int system_cpu_num_numa_nodes();
-
-/* Returns truth if the given node is available for compute. */
-bool system_cpu_is_numa_node_available(int node);
-
-/* Get number of available processors on a given node. */
-int system_cpu_num_numa_node_processors(int node);
-
-/* Runs the current thread and its children on a specific node.
- *
- * Returns truth if affinity has successfully changed. */
-bool system_cpu_run_thread_on_node(int node);
-
-/* Number of processors within the current CPU group (or within active thread
- * thread affinity). */
-int system_cpu_num_active_group_processors();
-
 string system_cpu_brand_string();
 int system_cpu_bits();
 bool system_cpu_support_sse2();
--- a/intern/cycles/util/task.cpp
+++ b/intern/cycles/util/task.cpp
@@ -89,7 +89,7 @@ void TaskScheduler::init(int num_threads)
    active_num_threads = num_threads;
  }
  else {
-    active_num_threads = system_cpu_thread_count();
+    active_num_threads = tbb::this_task_arena::max_concurrency();
  }
 }

--- a/intern/cycles/util/thread.cpp
+++ b/intern/cycles/util/thread.cpp
@@ -21,7 +21,7 @@

 CCL_NAMESPACE_BEGIN

-thread::thread(function<void()> run_cb, int node) : run_cb_(run_cb), joined_(false), node_(node)
+thread::thread(function<void()> run_cb) : run_cb_(run_cb), joined_(false)
 {
 #ifdef __APPLE__
  /* Set the stack size to 2MB to match Linux. The default 512KB on macOS is
@@ -46,9 +46,6 @@ thread::~thread()
 void *thread::run(void *arg)
 {
  thread *self = (thread *)(arg);
-  if (self->node_ != -1) {
-    system_cpu_run_thread_on_node(self->node_);
-  }
  self->run_cb_();
  return NULL;
 }
--- a/intern/cycles/util/thread.h
+++ b/intern/cycles/util/thread.h
@@ -46,9 +46,7 @@ typedef std::condition_variable thread_condition_variable;

 class thread {
 public:
-  /* NOTE: Node index of -1 means that affinity will be inherited from the
-   * parent thread and no override on top of that will happen. */
-  thread(function<void()> run_cb, int node = -1);
+  thread(function<void()> run_cb);
  ~thread();

  static void *run(void *arg);
@@ -62,7 +60,6 @@ class thread {
  std::thread std_thread;
 #endif
  bool joined_;
-  int node_;
 };

 using thread_spin_lock = tbb::spin_mutex;
--- a/intern/ghost/intern/GHOST_ImeWin32.cpp
+++ b/intern/ghost/intern/GHOST_ImeWin32.cpp
@@ -106,7 +106,7 @@ bool GHOST_ImeWin32::IsImeKeyEvent(char ascii)
    if (IsLanguage(IMELANG_JAPANESE) && (ascii >= ' ' && ascii <= '~')) {
      return true;
    }
-    else if (IsLanguage(IMELANG_CHINESE) && ascii && strchr("!\"$'(),.:;<>?[\\]^_`", ascii)) {
+    else if (IsLanguage(IMELANG_CHINESE) && ascii && strchr("!\"$'(),.:;<>?[\\]^_`/", ascii)) {
      return true;
    }
  }
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -1552,8 +1552,8 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
           * button is press for menu. To prevent this we must return preventing DefWindowProc.
           *
           * Note that the four low-order bits of the wParam parameter are used internally by the
-           * OS. To obtain the correct result when testing the value of wParam, an application
-           * must combine the value 0xFFF0 with the wParam value by using the bitwise AND operator.
+           * OS. To obtain the correct result when testing the value of wParam, an application must
+           * combine the value 0xFFF0 with the wParam value by using the bit-wise AND operator.
           */
          switch (wParam & 0xFFF0) {
            case SC_KEYMENU:
--- a/intern/ghost/intern/GHOST_Window.h
+++ b/intern/ghost/intern/GHOST_Window.h
@@ -41,8 +41,8 @@ class GHOST_Window : public GHOST_IWindow {
   * Constructor.
   * Creates a new window and opens it.
   * To check if the window was created properly, use the getValid() method.
-   * \param width: The width the window.
-   * \param height: The height the window.
+   * \param width: The width of the window.
+   * \param height: The height of the window.
   * \param state: The state the window is initially opened with.
   * \param wantStereoVisual: Stereo visual for quad buffered stereo.
   * \param exclusive: Use to show the window ontop and ignore others (used full-screen).
--- a/intern/ghost/intern/GHOST_Wintab.cpp
+++ b/intern/ghost/intern/GHOST_Wintab.cpp
@@ -298,14 +298,12 @@ GHOST_TabletData GHOST_Wintab::getLastTabletData()
 void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)
 {
  const int numPackets = m_fpPacketsGet(m_context.get(), m_pkts.size(), m_pkts.data());
-  outWintabInfo.resize(numPackets);
-  size_t outExtent = 0;
+  outWintabInfo.reserve(numPackets);

  for (int i = 0; i < numPackets; i++) {
    PACKET pkt = m_pkts[i];
-    GHOST_WintabInfoWin32 &out = outWintabInfo[i + outExtent];
+    GHOST_WintabInfoWin32 out;

-    out.tabletData = GHOST_TABLET_DATA_NONE;
    /* % 3 for multiple devices ("DualTrack"). */
    switch (pkt.pkCursor % 3) {
      case 0:
@@ -328,12 +326,7 @@ void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)
    }

    if ((m_maxAzimuth > 0) && (m_maxAltitude > 0)) {
-      ORIENTATION ort = pkt.pkOrientation;
-      float vecLen;
-      float altRad, azmRad; /* In radians. */
-
-      /*
-       * From the wintab spec:
+      /* From the wintab spec:
       * orAzimuth: Specifies the clockwise rotation of the cursor about the z axis through a
       * full circular range.
       * orAltitude: Specifies the angle with the x-y plane through a signed, semicircular range.
@@ -346,12 +339,14 @@ void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)
       * value.
       */

+      ORIENTATION ort = pkt.pkOrientation;
+
      /* Convert raw fixed point data to radians. */
-      altRad = (float)((fabs((float)ort.orAltitude) / (float)m_maxAltitude) * M_PI / 2.0);
-      azmRad = (float)(((float)ort.orAzimuth / (float)m_maxAzimuth) * M_PI * 2.0);
+      float altRad = (float)((fabs((float)ort.orAltitude) / (float)m_maxAltitude) * M_PI / 2.0);
+      float azmRad = (float)(((float)ort.orAzimuth / (float)m_maxAzimuth) * M_PI * 2.0);

      /* Find length of the stylus' projected vector on the XY plane. */
-      vecLen = cos(altRad);
+      float vecLen = cos(altRad);

      /* From there calculate X and Y components based on azimuth. */
      out.tabletData.Xtilt = sin(azmRad) * vecLen;
@@ -362,13 +357,8 @@ void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)

    /* Some Wintab libraries don't handle relative button input, so we track button presses
     * manually. */
-    out.button = GHOST_kButtonMaskNone;
-    out.type = GHOST_kEventCursorMove;
-
    DWORD buttonsChanged = m_buttons ^ pkt.pkButtons;
    WORD buttonIndex = 0;
-    GHOST_WintabInfoWin32 buttonRef = out;
-    int buttons = 0;

    while (buttonsChanged) {
      if (buttonsChanged & 1) {
@@ -376,23 +366,14 @@ void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)
        GHOST_TButtonMask button = mapWintabToGhostButton(pkt.pkCursor, buttonIndex);

        if (button != GHOST_kButtonMaskNone) {
-          /* Extend output if multiple buttons are pressed. We don't extend input until we confirm
-           * a Wintab buttons maps to a system button. */
-          if (buttons > 0) {
-            outWintabInfo.resize(outWintabInfo.size() + 1);
-            outExtent++;
-            GHOST_WintabInfoWin32 &out = outWintabInfo[i + outExtent];
-            out = buttonRef;
+          /* If this is not the first button found, push info for the prior Wintab button. */
+          if (out.button != GHOST_kButtonMaskNone) {
+            outWintabInfo.push_back(out);
          }
-          buttons++;

          out.button = button;
-          if (buttonsChanged & pkt.pkButtons) {
-            out.type = GHOST_kEventButtonDown;
-          }
-          else {
-            out.type = GHOST_kEventButtonUp;
-          }
+          out.type = buttonsChanged & pkt.pkButtons ? GHOST_kEventButtonDown :
+                                                      GHOST_kEventButtonUp;
        }

        m_buttons ^= 1 << buttonIndex;
@@ -401,6 +382,8 @@ void GHOST_Wintab::getInput(std::vector<GHOST_WintabInfoWin32> &outWintabInfo)
      buttonsChanged >>= 1;
      buttonIndex++;
    }
+
+    outWintabInfo.push_back(out);
  }

  if (!outWintabInfo.empty()) {
--- a/intern/ghost/intern/GHOST_Wintab.h
+++ b/intern/ghost/intern/GHOST_Wintab.h
@@ -56,11 +56,12 @@ typedef std::unique_ptr<std::remove_pointer_t<HMODULE>, decltype(&::FreeLibrary)
 typedef std::unique_ptr<std::remove_pointer_t<HCTX>, GHOST_WIN32_WTClose> unique_hctx;

 struct GHOST_WintabInfoWin32 {
-  int32_t x, y;
-  GHOST_TEventType type;
-  GHOST_TButtonMask button;
-  uint64_t time;
-  GHOST_TabletData tabletData;
+  int32_t x = 0;
+  int32_t y = 0;
+  GHOST_TEventType type = GHOST_kEventCursorMove;
+  GHOST_TButtonMask button = GHOST_kButtonMaskNone;
+  uint64_t time = 0;
+  GHOST_TabletData tabletData = GHOST_TABLET_DATA_NONE;
 };

 class GHOST_Wintab {
--- a/intern/iksolver/intern/IK_QJacobian.cpp
+++ b/intern/iksolver/intern/IK_QJacobian.cpp
@@ -196,12 +196,12 @@ void IK_QJacobian::InvertSDLS()
  // Compute the dampeds least squeares pseudo inverse of J.
  //
  // Since J is usually not invertible (most of the times it's not even
-  // square), the psuedo inverse is used. This gives us a least squares
+  // square), the pseudo inverse is used. This gives us a least squares
  // solution.
  //
  // This is fine when the J*Jt is of full rank. When J*Jt is near to
  // singular the least squares inverse tries to minimize |J(dtheta) - dX)|
-  // and doesn't try to minimize  dTheta. This results in eratic changes in
+  // and doesn't try to minimize  dTheta. This results in erratic changes in
  // angle. The damped least squares minimizes |dtheta| to try and reduce this
  // erratic behavior.
  //
@@ -323,7 +323,7 @@ void IK_QJacobian::InvertDLS()
  // least squares solution. This is fine when the m_jjt is
  // of full rank. When m_jjt is near to singular the least squares
  // inverse tries to minimize |J(dtheta) - dX)| and doesn't
-  // try to minimize  dTheta. This results in eratic changes in angle.
+  // try to minimize  dTheta. This results in erratic changes in angle.
  // Damped least squares minimizes |dtheta| to try and reduce this
  // erratic behavior.

--- a/intern/libmv/libmv/autotrack/autotrack.cc
+++ b/intern/libmv/libmv/autotrack/autotrack.cc
@@ -178,7 +178,7 @@ bool AutoTrack::TrackMarker(Marker* tracked_marker,
    return false;
  }

-  // Store original position befoer tracking, so we can claculate offset later.
+  // Store original position before tracking, so we can claculate offset later.
  Vec2f original_center = tracked_marker->center;

  // Do the tracking!
--- a/intern/libmv/libmv/build/build_config.h
+++ b/intern/libmv/libmv/build/build_config.h
@@ -239,7 +239,7 @@
 // Check what is the latest C++ specification the compiler supports.
 //
 // NOTE: Use explicit definition here to avoid expansion-to-defined warning from
-// being geenrated. While this will most likely a false-positive warning in this
+// being generated. While this will most likely a false-positive warning in this
 // particular case, that warning might be helpful to catch errors elsewhere.

 // C++11 check.
--- a/intern/libmv/libmv/image/tuple.h
+++ b/intern/libmv/libmv/image/tuple.h
@@ -25,7 +25,7 @@

 namespace libmv {

-// A vector of elements with fixed lenght and deep copy semantics.
+// A vector of elements with fixed length and deep copy semantics.
 template <typename T, int N>
 class Tuple {
 public:
--- a/intern/libmv/libmv/multiview/panography.h
+++ b/intern/libmv/libmv/multiview/panography.h
@@ -38,7 +38,7 @@ namespace libmv {
 // The 2-point algorithm solves for the rotation of the camera with a single
 // focal length (4 degrees of freedom).
 //
-// Compute from 1 to 3 possible focal lenght for 2 point correspondences.
+// Compute from 1 to 3 possible focal length for 2 point correspondences.
 // Suppose that the cameras share the same optical center and focal lengths:
 //
 //   Image 1 => H*x = x'  =>  Image 2
--- a/intern/libmv/libmv/numeric/numeric.h
+++ b/intern/libmv/libmv/numeric/numeric.h
@@ -261,7 +261,7 @@ Mat3 RotationRodrigues(const Vec3& axis);
 // positive z-axis, and y is oriented close to up.
 Mat3 LookAt(Vec3 center);

-// Return a diagonal matrix from a vector containg the diagonal values.
+// Return a diagonal matrix from a vector containing the diagonal values.
 template <typename TVec>
 inline Mat Diag(const TVec& x) {
  return x.asDiagonal();
--- a/intern/libmv/libmv/numeric/poly.h
+++ b/intern/libmv/libmv/numeric/poly.h
@@ -50,7 +50,7 @@ int SolveCubicPolynomial(Real a, Real b, Real c, Real* x0, Real* x1, Real* x2) {
  Real CQ3 = 2916 * q * q * q;

  if (R == 0 && Q == 0) {
-    // Tripple root in one place.
+    // Triple root in one place.
    *x0 = *x1 = *x2 = -a / 3;
    return 3;

--- a/intern/memutil/MEM_RefCounted.h
+++ b/intern/memutil/MEM_RefCounted.h
@@ -41,7 +41,7 @@
 class MEM_RefCounted {
 public:
  /**
-   * Constructs a a shared object.
+   * Constructs a shared object.
   */
  MEM_RefCounted() : m_refCount(1)
  {
--- a/intern/numaapi/AUTHORS
+++ b/intern/numaapi/AUTHORS
@@ -1 +0,0 @@
-Sergey Sharybin <sergey.vfx@gmail.com>
--- a/intern/numaapi/CMakeLists.txt
+++ b/intern/numaapi/CMakeLists.txt
@@ -1,42 +0,0 @@
-# ***** BEGIN GPL LICENSE BLOCK *****
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-#
-# ***** END GPL LICENSE BLOCK *****
-
-set(INC
-  include
-)
-
-set(INC_SYS
-
-)
-
-set(SRC
-  source/numaapi.c
-  source/numaapi_linux.c
-  source/numaapi_stub.c
-  source/numaapi_win32.c
-
-  include/numaapi.h
-  source/build_config.h
-)
-
-set(LIB
-)
-
-add_definitions(-DWITH_DYNLOAD)
-
-blender_add_lib(bf_intern_numaapi "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
--- a/intern/numaapi/LICENSE
+++ b/intern/numaapi/LICENSE
@@ -1,19 +0,0 @@
-Copyright (c) 2016 libnumaapi authors.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to
-deal in the Software without restriction, including without limitation the
-rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-sell copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-IN THE SOFTWARE.
--- a/intern/numaapi/README
+++ b/intern/numaapi/README
@@ -1,7 +0,0 @@
-LibNumaAPI is aimed to provide one common cross-platform API for all
-possible platforms, so cross-platform applications might not worry
-about implementation details.
-
-LICENSE
-
-LibNumaAPI library is released under the MIT license.
--- a/intern/numaapi/README.blender
+++ b/intern/numaapi/README.blender
@@ -1,5 +0,0 @@
-Project: LibNumaAPI
-URL: https://github.com/Nazg-Gul/libNumaAPI
-License: MIT
-Upstream version: 1c1ae7bc78e
-Local modifications: None
--- a/intern/numaapi/include/numaapi.h
+++ b/intern/numaapi/include/numaapi.h
@@ -1,122 +0,0 @@
-// Copyright (c) 2016, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#ifndef __LIBNUMAAPI_H__
-#define __LIBNUMAAPI_H__
-
-#include <stdbool.h>
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define NUMAAPI_VERSION_MAJOR 1
-#define NUMAAPI_VERSION_MINOR 0
-
-typedef enum NUMAAPI_Result {
-  NUMAAPI_SUCCESS       = 0,
-  // NUMA is not available on this platform.
-  NUMAAPI_NOT_AVAILABLE = 1,
-  // Generic error, no real details are available,
-  NUMAAPI_ERROR         = 2,
-  // Error installing atexit() handlers.
-  NUMAAPI_ERROR_ATEXIT  = 3,
-} NUMAAPI_Result;
-
-////////////////////////////////////////////////////////////////////////////////
-// Initialization.
-
-// Initialize NUMA API.
-//
-// This is first call which should be called before any other NUMA functions
-// can be used.
-NUMAAPI_Result numaAPI_Initialize(void);
-
-// Get string representation of NUMAPIResult.
-const char* numaAPI_ResultAsString(NUMAAPI_Result result);
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology query.
-
-// Get number of available nodes.
-//
-// This is in fact an index of last node plus one and it's not guaranteed
-// that all nodes up to this one are available.
-int numaAPI_GetNumNodes(void);
-
-// Returns truth if the given node is available for compute.
-bool numaAPI_IsNodeAvailable(int node);
-
-// Get number of available processors on a given node.
-int numaAPI_GetNumNodeProcessors(int node);
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology helpers.
-//
-// Those are a bit higher level queries, but is still rather platform-specific
-// and generally useful.
-
-// Get number of processors within the NUMA nodes on which current thread is
-// set affinity on.
-int numaAPI_GetNumCurrentNodesProcessors(void);
-
-////////////////////////////////////////////////////////////////////////////////
-// Affinities.
-
-// Runs the current process and its children on a specific node.
-//
-// Returns truth if affinity has successfully changed.
-//
-// NOTE: This function can not change active CPU group. Mainly designed to deal
-// with Threadripper 2 topology, to make it possible to gain maximum performance
-// for the main application thread.
-bool numaAPI_RunProcessOnNode(int node);
-
-// Runs the current thread and its children on a specific node.
-//
-// Returns truth if affinity has successfully changed.
-bool numaAPI_RunThreadOnNode(int node);
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory management.
-
-// Allocate memory on a given node,
-void* numaAPI_AllocateOnNode(size_t size, int node);
-
-// Allocate memory in the local memory, closest to the current node.
-void* numaAPI_AllocateLocal(size_t size);
-
-// Frees size bytes of memory starting at start.
-//
-// TODO(sergey): Consider making it regular free() semantic.
-void numaAPI_Free(void* start, size_t size);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __LIBNUMAAPI_H__
--- a/intern/numaapi/source/build_config.h
+++ b/intern/numaapi/source/build_config.h
@@ -1,443 +0,0 @@
-// Copyright (c) 2018, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#ifndef __BUILD_CONFIG_H__
-#define __BUILD_CONFIG_H__
-
-#include <limits.h>
-#include <stdint.h>
-
-// Initially is based on Chromium's build_config.h, with tweaks and extensions
-// needed for this project.
-//
-// NOTE: All commonly used symbols (which are checked on a "top" level, from
-// outside of any platform-specific ifdef block) are to be explicitly defined
-// to 0 when they are not "active". This is extra lines of code in this file,
-// but is not being edited that often. Such approach helps catching cases when
-// one attempted to access build configuration variable without including the
-// header by simply using -Wundef compiler attribute.
-//
-// NOTE: Not having things explicitly defined to 0 is harmless (in terms it
-// follows same rules as Google projects) and will simply cause compiler to
-// become more noisy, which is simple to correct.
-
-////////////////////////////////////////////////////////////////////////////////
-// A set of macros to use for platform detection.
-
-#if defined(__native_client__)
-// __native_client__ must be first, so that other OS_ defines are not set.
-#  define OS_NACL 1
-// OS_NACL comes in two sandboxing technology flavors, SFI or Non-SFI.
-// PNaCl toolchain defines __native_client_nonsfi__ macro in Non-SFI build
-// mode, while it does not in SFI build mode.
-#  if defined(__native_client_nonsfi__)
-#    define OS_NACL_NONSFI
-#  else
-#    define OS_NACL_SFI
-#  endif
-#elif defined(_AIX)
-#  define OS_AIX 1
-#elif defined(ANDROID)
-#  define OS_ANDROID 1
-#elif defined(__APPLE__)
-// Only include TargetConditions after testing ANDROID as some android builds
-// on mac don't have this header available and it's not needed unless the target
-// is really mac/ios.
-#  include <TargetConditionals.h>
-#  define OS_MACOSX 1
-#  if defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
-#    define OS_IOS 1
-#  endif  // defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE
-#elif defined(__HAIKU__)
-#  define OS_HAIKU 1
-#elif defined(__hpux)
-#  define OS_HPUX 1
-#elif defined(__linux__)
-#  define OS_LINUX 1
-// Include a system header to pull in features.h for glibc/uclibc macros.
-#  include <unistd.h>
-#  if defined(__GLIBC__) && !defined(__UCLIBC__)
-// We really are using glibc, not uClibc pretending to be glibc.
-#    define LIBC_GLIBC 1
-#  endif
-#elif defined(__sgi)
-#  define OS_IRIX 1
-#elif defined(_WIN32)
-#  define OS_WIN 1
-#elif defined(__Fuchsia__)
-#  define OS_FUCHSIA 1
-#elif defined(__FreeBSD__)
-#  define OS_FREEBSD 1
-#elif defined(__NetBSD__)
-#  define OS_NETBSD 1
-#elif defined(__OpenBSD__)
-#  define OS_OPENBSD 1
-#elif defined(__sun)
-#  define OS_SOLARIS 1
-#elif defined(__QNXNTO__)
-#  define OS_QNX 1
-#elif defined(__asmjs__) || defined(__wasm__)
-#  define OS_ASMJS 1
-#else
-#  error Please add support for your platform in build_config.h
-#endif
-
-#if !defined(OS_AIX)
-#  define OS_AIX 0
-#endif
-#if !defined(OS_ASMJS)
-#  define OS_ASMJS 0
-#endif
-#if !defined(OS_NACL)
-#  define OS_NACL 0
-#endif
-#if !defined(OS_NACL_NONSFI)
-#  define OS_NACL_NONSFI 0
-#endif
-#if !defined(OS_NACL_SFI)
-#  define OS_NACL_SFI 0
-#endif
-#if !defined(OS_ANDROID)
-#  define OS_ANDROID 0
-#endif
-#if !defined(OS_MACOSX)
-#  define OS_MACOSX 0
-#endif
-#if !defined(OS_IOS)
-#  define OS_IOS 0
-#endif
-#if !defined(OS_HAIKU)
-#  define OS_HAIKU 0
-#endif
-#if !defined(OS_HPUX)
-#  define OS_HPUX 0
-#endif
-#if !defined(OS_IRIX)
-#  define OS_IRIX 0
-#endif
-#if !defined(OS_LINUX)
-#  define OS_LINUX 0
-#endif
-#if !defined(LIBC_GLIBC)
-#  define LIBC_GLIBC 0
-#endif
-#if !defined(OS_WIN)
-#  define OS_WIN 0
-#endif
-#if !defined(OS_FUCHSIA)
-#  define OS_FUCHSIA 0
-#endif
-#if !defined(OS_FREEBSD)
-#  define OS_FREEBSD 0
-#endif
-#if !defined(OS_NETBSD)
-#  define OS_NETBSD 0
-#endif
-#if !defined(OS_OPENBSD)
-#  define OS_OPENBSD 0
-#endif
-#if !defined(OS_SOLARIS)
-#  define OS_SOLARIS 0
-#endif
-#if !defined(OS_QNX)
-#  define OS_QNX 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// *BSD OS family detection.
-//
-// For access to standard BSD features, use OS_BSD instead of a
-// more specific macro.
-#if OS_FREEBSD || OS_OPENBSD || OS_NETBSD
-#  define OS_BSD 1
-#else
-#  define OS_BSD 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// POSIX system detection.
-//
-// For access to standard POSIXish features use OS_POSIX instead of a
-// more specific macro.
-#if OS_AIX || OS_ANDROID || OS_ASMJS || OS_FREEBSD || OS_LINUX || OS_MACOSX || \
-    OS_NACL || OS_NETBSD || OS_OPENBSD || OS_QNX || OS_SOLARIS
-#  define OS_POSIX 1
-#else
-#  define OS_POSIX 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Compiler detection, including its capabilities.
-
-#if defined(__clang__)
-#  define COMPILER_CLANG 1
-#elif defined(__GNUC__)
-#  define COMPILER_GCC 1
-#  define COMPILER_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#elif defined(_MSC_VER)
-#  define COMPILER_MSVC 1
-#  define COMPILER_MSVC_VERSION (_MSC_VER)
-#elif defined(__MINGW32__)
-#  define COMPILER_MINGW32 1
-#elif defined(__MINGW64__)
-#  define COMPILER_MINGW64 1
-#else
-#  error Please add support for your compiler in build_config.h
-#endif
-
-#if !defined(COMPILER_CLANG)
-#  define COMPILER_CLANG 0
-#endif
-#if !defined(COMPILER_GCC)
-#  define COMPILER_GCC 0
-#endif
-#if !defined(COMPILER_MSVC)
-#  define COMPILER_MSVC 0
-#endif
-#if !defined(COMPILER_MINGW32)
-#  define COMPILER_MINGW32 0
-#endif
-#if !defined(COMPILER_MINGW64)
-#  define COMPILER_MINGW64 0
-#endif
-
-// Compiler is any of MinGW family.
-#if COMPILER_MINGW32 || COMPILER_MINGW64
-#  define COMPILER_MINGW 1
-#else
-#  define COMPILER_MINGW 0
-#endif
-
-// Check what is the latest C++ specification the compiler supports.
-//
-// NOTE: Use explicit definition here to avoid expansion-to-defined warning from
-// being geenrated. While this will most likely a false-positive warning in this
-// particular case, that warning might be helpful to catch errors elsewhere.
-
-// C++11 check.
-#if ((defined(__cplusplus) && (__cplusplus > 199711L)) ||                      \
-     (defined(_MSC_VER) && (_MSC_VER >= 1800)))
-#  define COMPILER_SUPPORTS_CXX11 1
-#else
-#  define COMPILER_SUPPORTS_CXX11 0
-#endif
-// C++14 check.
-#if (defined(__cplusplus) && (__cplusplus > 201311L))
-#  define COMPILER_SUPPORTS_CXX14 1
-#else
-#  define COMPILER_SUPPORTS_CXX14 0
-#endif
-// C++17 check.
-#if (defined(__cplusplus) && (__cplusplus > 201611L))
-#  define COMPILER_SUPPORTS_CXX17 1
-#else
-#  define COMPILER_SUPPORTS_CXX17 0
-#endif
-// C++20 check.
-#if (defined(__cplusplus) && (__cplusplus > 201911L))
-#  define COMPILER_SUPPORTS_CXX20 1
-#else
-#  define COMPILER_SUPPORTS_CXX20 0
-#endif
-
-// COMPILER_USE_ADDRESS_SANITIZER is defined when program is detected that
-// compilation happened wit haddress sanitizer enabled. This allows to give
-// tips to sanitizer, or maybe work around some known issues with third party
-// libraries.
-#if !defined(COMPILER_USE_ADDRESS_SANITIZER)
-#  if defined(__has_feature)
-#    define COMPILER_USE_ADDRESS_SANITIZER 1
-#  elif defined(__SANITIZE_ADDRESS__)
-#    define COMPILER_USE_ADDRESS_SANITIZER 1
-#  endif
-#endif
-
-#if !defined(COMPILER_USE_ADDRESS_SANITIZER)
-#  define COMPILER_USE_ADDRESS_SANITIZER 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Processor architecture detection.
-//
-// For more info on what's defined, see:
-//
-//   http://msdn.microsoft.com/en-us/library/b0084kay.aspx
-//   http://www.agner.org/optimize/calling_conventions.pdf
-//
-//   or with gcc, run: "echo | gcc -E -dM -"
-#if defined(_M_X64) || defined(__x86_64__)
-#  define ARCH_CPU_X86_FAMILY 1
-#  define ARCH_CPU_X86_64 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(_M_IX86) || defined(__i386__)
-#  define ARCH_CPU_X86_FAMILY 1
-#  define ARCH_CPU_X86 1
-#  define ARCH_CPU_32_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__s390x__)
-#  define ARCH_CPU_S390_FAMILY 1
-#  define ARCH_CPU_S390X 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_BIG_ENDIAN 1
-#elif defined(__s390__)
-#  define ARCH_CPU_S390_FAMILY 1
-#  define ARCH_CPU_S390 1
-#  define ARCH_CPU_31_BITS 1
-#  define ARCH_CPU_BIG_ENDIAN 1
-#elif (defined(__PPC64__) || defined(__PPC__)) && defined(__BIG_ENDIAN__)
-#  define ARCH_CPU_PPC64_FAMILY 1
-#  define ARCH_CPU_PPC64 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_BIG_ENDIAN 1
-#elif defined(__PPC64__)
-#  define ARCH_CPU_PPC64_FAMILY 1
-#  define ARCH_CPU_PPC64 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__ARMEL__)
-#  define ARCH_CPU_ARM_FAMILY 1
-#  define ARCH_CPU_ARMEL 1
-#  define ARCH_CPU_32_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__aarch64__) || defined(_M_ARM64)
-#  define ARCH_CPU_ARM_FAMILY 1
-#  define ARCH_CPU_ARM64 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__riscv) && __riscv_xlen == 32
-#  define ARCH_CPU_RISCV_FAMILY 1
-#  define ARCH_CPU_RISCV32 1
-#  define ARCH_CPU_64_BITS 0
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__riscv) && __riscv_xlen == 64
-#  define ARCH_CPU_RISCV_FAMILY 1
-#  define ARCH_CPU_RISCV64 1
-#  define ARCH_CPU_64_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__pnacl__) || defined(__asmjs__) || defined(__wasm__)
-#  define ARCH_CPU_32_BITS 1
-#  define ARCH_CPU_LITTLE_ENDIAN 1
-#elif defined(__MIPSEL__)
-#  if defined(__LP64__)
-#    define ARCH_CPU_MIPS_FAMILY 1
-#    define ARCH_CPU_MIPS64EL 1
-#    define ARCH_CPU_64_BITS 1
-#    define ARCH_CPU_LITTLE_ENDIAN 1
-#  else
-#    define ARCH_CPU_MIPS_FAMILY 1
-#    define ARCH_CPU_MIPSEL 1
-#    define ARCH_CPU_32_BITS 1
-#    define ARCH_CPU_LITTLE_ENDIAN 1
-#  endif
-#elif defined(__MIPSEB__)
-#  if defined(__LP64__)
-#    define ARCH_CPU_MIPS_FAMILY 1
-#    define ARCH_CPU_MIPS64 1
-#    define ARCH_CPU_64_BITS 1
-#    define ARCH_CPU_BIG_ENDIAN 1
-#  else
-#    define ARCH_CPU_MIPS_FAMILY 1
-#    define ARCH_CPU_MIPS 1
-#    define ARCH_CPU_32_BITS 1
-#    define ARCH_CPU_BIG_ENDIAN 1
-#  endif
-#else
-#  error Please add support for your architecture in build_config.h
-#endif
-
-#if !defined(ARCH_CPU_LITTLE_ENDIAN)
-#  define ARCH_CPU_LITTLE_ENDIAN 0
-#endif
-#if !defined(ARCH_CPU_BIG_ENDIAN)
-#  define ARCH_CPU_BIG_ENDIAN 0
-#endif
-
-#if !defined(ARCH_CPU_32_BITS)
-#  define ARCH_CPU_32_BITS 0
-#endif
-#if !defined(ARCH_CPU_64_BITS)
-#  define ARCH_CPU_64_BITS 0
-#endif
-
-#if !defined(ARCH_CPU_X86_FAMILY)
-#  define ARCH_CPU_X86_FAMILY 0
-#endif
-#if !defined(ARCH_CPU_ARM_FAMILY)
-#  define ARCH_CPU_ARM_FAMILY 0
-#endif
-#if !defined(ARCH_CPU_MIPS_FAMILY)
-#  define ARCH_CPU_MIPS_FAMILY 0
-#endif
-#if !defined(ARCH_CPU_PPC64_FAMILY)
-#  define ARCH_CPU_PPC64_FAMILY 0
-#endif
-#if !defined(ARCH_CPU_RISCV_FAMILY)
-#  define ARCH_CPU_RISCV_FAMILY 0
-#endif
-#if !defined(ARCH_CPU_S390_FAMILY)
-#  define ARCH_CPU_S390_FAMILY 0
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Sizes of platform-dependent types.
-
-#if defined(__SIZEOF_POINTER__)
-#  define PLATFORM_SIZEOF_PTR __SIZEOF_POINTER__
-#elif defined(UINTPTR_MAX)
-#  if (UINTPTR_MAX == 0xffffffff)
-#    define PLATFORM_SIZEOF_PTR 4
-#  elif (UINTPTR_MAX == 0xffffffffffffffff)  // NOLINT
-#    define PLATFORM_SIZEOF_PTR 8
-#  endif
-#elif defined(__WORDSIZE)
-#  if (__WORDSIZE == 32)
-#    define PLATFORM_SIZEOF_PTR 4
-#  else if (__WORDSIZE == 64)
-#    define PLATFORM_SIZEOF_PTR 8
-#  endif
-#endif
-#if !defined(PLATFORM_SIZEOF_PTR)
-#  error Cannot find pointer size.
-#endif
-
-#if (UINT_MAX == 0xffffffff)
-#  define PLATFORM_SIZEOF_INT 4
-#elif (UINT_MAX == 0xffffffffffffffff)  // NOLINT
-#  define PLATFORM_SIZEOF_INT 8
-#else
-#  error Cannot find "int" size.
-#endif
-
-#if (USHRT_MAX == 0xffffffff)
-#  define PLATFORM_SIZEOF_SHORT 4
-#elif (USHRT_MAX == 0xffff)  // NOLINT
-#  define PLATFORM_SIZEOF_SHORT 2
-#else
-#  error Cannot find "short" size.
-#endif
-
-#endif  // __BUILD_CONFIG_H__
--- a/intern/numaapi/source/numaapi.c
+++ b/intern/numaapi/source/numaapi.c
@@ -1,40 +0,0 @@
-// Copyright (c) 2018, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#include "numaapi.h"
-
-#include <assert.h>
-
-const char* numaAPI_ResultAsString(NUMAAPI_Result result) {
-  switch (result) {
-    case NUMAAPI_SUCCESS: return "SUCCESS";
-    case NUMAAPI_NOT_AVAILABLE: return "NOT_AVAILABLE";
-    case NUMAAPI_ERROR: return "ERROR";
-    case NUMAAPI_ERROR_ATEXIT: return "ERROR_AT_EXIT";
-  }
-  assert(!"Unknown result was passed to numapi_ResultAsString().");
-  return "UNKNOWN";
-}
--- a/intern/numaapi/source/numaapi_linux.c
+++ b/intern/numaapi/source/numaapi_linux.c
@@ -1,298 +0,0 @@
-// Copyright (c) 2016, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#include "build_config.h"
-
-#if OS_LINUX
-
-#include "numaapi.h"
-
-#include <stdlib.h>
-
-#ifndef WITH_DYNLOAD
-#  include <numa.h>
-#else
-#  include <dlfcn.h>
-#endif
-
-#ifdef WITH_DYNLOAD
-
-// Descriptor numa library.
-static void* numa_lib;
-
-// Types of all symbols which are read from the library.
-struct bitmask;
-typedef int tnuma_available(void);
-typedef int tnuma_max_node(void);
-typedef int tnuma_node_to_cpus(int node, struct bitmask* mask);
-typedef long tnuma_node_size(int node, long* freep);
-typedef int tnuma_run_on_node(int node);
-typedef void* tnuma_alloc_onnode(size_t size, int node);
-typedef void* tnuma_alloc_local(size_t size);
-typedef void tnuma_free(void* start, size_t size);
-typedef struct bitmask* tnuma_bitmask_clearall(struct bitmask *bitmask);
-typedef int tnuma_bitmask_isbitset(const struct bitmask *bitmask,
-                                   unsigned int n);
-typedef struct bitmask* tnuma_bitmask_setbit(struct bitmask *bitmask,
-                                             unsigned int n);
-typedef unsigned int tnuma_bitmask_nbytes(struct bitmask *bitmask);
-typedef void tnuma_bitmask_free(struct bitmask *bitmask);
-typedef struct bitmask* tnuma_allocate_cpumask(void);
-typedef struct bitmask* tnuma_allocate_nodemask(void);
-typedef void tnuma_free_cpumask(struct bitmask* bitmask);
-typedef void tnuma_free_nodemask(struct bitmask* bitmask);
-typedef int tnuma_run_on_node_mask(struct bitmask *nodemask);
-typedef int tnuma_run_on_node_mask_all(struct bitmask *nodemask);
-typedef struct bitmask *tnuma_get_run_node_mask(void);
-typedef void tnuma_set_interleave_mask(struct bitmask *nodemask);
-typedef void tnuma_set_localalloc(void);
-
-// Actual symbols.
-static tnuma_available* numa_available;
-static tnuma_max_node* numa_max_node;
-static tnuma_node_to_cpus* numa_node_to_cpus;
-static tnuma_node_size* numa_node_size;
-static tnuma_run_on_node* numa_run_on_node;
-static tnuma_alloc_onnode* numa_alloc_onnode;
-static tnuma_alloc_local* numa_alloc_local;
-static tnuma_free* numa_free;
-static tnuma_bitmask_clearall* numa_bitmask_clearall;
-static tnuma_bitmask_isbitset* numa_bitmask_isbitset;
-static tnuma_bitmask_setbit* numa_bitmask_setbit;
-static tnuma_bitmask_nbytes* numa_bitmask_nbytes;
-static tnuma_bitmask_free* numa_bitmask_free;
-static tnuma_allocate_cpumask* numa_allocate_cpumask;
-static tnuma_allocate_nodemask* numa_allocate_nodemask;
-static tnuma_free_nodemask* numa_free_nodemask;
-static tnuma_free_cpumask* numa_free_cpumask;
-static tnuma_run_on_node_mask* numa_run_on_node_mask;
-static tnuma_run_on_node_mask_all* numa_run_on_node_mask_all;
-static tnuma_get_run_node_mask* numa_get_run_node_mask;
-static tnuma_set_interleave_mask* numa_set_interleave_mask;
-static tnuma_set_localalloc* numa_set_localalloc;
-
-static void* findLibrary(const char** paths) {
-  int i = 0;
-  while (paths[i] != NULL) {
-      void* lib = dlopen(paths[i], RTLD_LAZY);
-      if (lib != NULL) {
-        return lib;
-      }
-      ++i;
-  }
-  return NULL;
-}
-
-static void numaExit(void) {
-  if (numa_lib == NULL) {
-    return;
-  }
-  dlclose(numa_lib);
-  numa_lib = NULL;
-}
-
-static NUMAAPI_Result loadNumaSymbols(void) {
-  // Prevent multiple initializations.
-  static bool initialized = false;
-  static NUMAAPI_Result result = NUMAAPI_NOT_AVAILABLE;
-  if (initialized) {
-    return result;
-  }
-  initialized = true;
-  // Find appropriate .so library.
-  const char* numa_paths[] = {
-      "libnuma.so.1",
-      "libnuma.so",
-      NULL};
-  // Register de-initialization.
-  const int error = atexit(numaExit);
-  if (error) {
-    result = NUMAAPI_ERROR_ATEXIT;
-    return result;
-  }
-  // Load library.
-  numa_lib = findLibrary(numa_paths);
-  if (numa_lib == NULL) {
-    result = NUMAAPI_NOT_AVAILABLE;
-    return result;
-  }
-  // Load symbols.
-
-#define _LIBRARY_FIND(lib, name)          \
-  do {                                    \
-    name = (t##name *)dlsym(lib, #name);  \
-  } while (0)
-#define NUMA_LIBRARY_FIND(name) _LIBRARY_FIND(numa_lib, name)
-
-  NUMA_LIBRARY_FIND(numa_available);
-  NUMA_LIBRARY_FIND(numa_max_node);
-  NUMA_LIBRARY_FIND(numa_node_to_cpus);
-  NUMA_LIBRARY_FIND(numa_node_size);
-  NUMA_LIBRARY_FIND(numa_run_on_node);
-  NUMA_LIBRARY_FIND(numa_alloc_onnode);
-  NUMA_LIBRARY_FIND(numa_alloc_local);
-  NUMA_LIBRARY_FIND(numa_free);
-  NUMA_LIBRARY_FIND(numa_bitmask_clearall);
-  NUMA_LIBRARY_FIND(numa_bitmask_isbitset);
-  NUMA_LIBRARY_FIND(numa_bitmask_setbit);
-  NUMA_LIBRARY_FIND(numa_bitmask_nbytes);
-  NUMA_LIBRARY_FIND(numa_bitmask_free);
-  NUMA_LIBRARY_FIND(numa_allocate_cpumask);
-  NUMA_LIBRARY_FIND(numa_allocate_nodemask);
-  NUMA_LIBRARY_FIND(numa_free_cpumask);
-  NUMA_LIBRARY_FIND(numa_free_nodemask);
-  NUMA_LIBRARY_FIND(numa_run_on_node_mask);
-  NUMA_LIBRARY_FIND(numa_run_on_node_mask_all);
-  NUMA_LIBRARY_FIND(numa_get_run_node_mask);
-  NUMA_LIBRARY_FIND(numa_set_interleave_mask);
-  NUMA_LIBRARY_FIND(numa_set_localalloc);
-
-#undef NUMA_LIBRARY_FIND
-#undef _LIBRARY_FIND
-
-  result = NUMAAPI_SUCCESS;
-  return result;
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Initialization.
-
-NUMAAPI_Result numaAPI_Initialize(void) {
-#ifdef WITH_DYNLOAD
-  NUMAAPI_Result result = loadNumaSymbols();
-  if (result != NUMAAPI_SUCCESS) {
-    return result;
-  }
-#endif
-  if (numa_available() < 0) {
-    return NUMAAPI_NOT_AVAILABLE;
-  }
-  return NUMAAPI_SUCCESS;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology query.
-
-int numaAPI_GetNumNodes(void) {
-  return numa_max_node() + 1;
-}
-
-bool numaAPI_IsNodeAvailable(int node) {
-  return numaAPI_GetNumNodeProcessors(node) > 0;
-}
-
-int numaAPI_GetNumNodeProcessors(int node) {
-  struct bitmask* cpu_mask = numa_allocate_cpumask();
-  numa_node_to_cpus(node, cpu_mask);
-  const unsigned int num_bytes = numa_bitmask_nbytes(cpu_mask);
-  const unsigned int num_bits = num_bytes * 8;
-  // TODO(sergey): There might be faster way calculating number of set bits.
-  int num_processors = 0;
-  for (unsigned int bit = 0; bit < num_bits; ++bit) {
-    if (numa_bitmask_isbitset(cpu_mask, bit)) {
-      ++num_processors;
-    }
-  }
-#ifdef WITH_DYNLOAD
-  if (numa_free_cpumask != NULL) {
-    numa_free_cpumask(cpu_mask);
-  } else {
-    numa_bitmask_free(cpu_mask);
-  }
-#else
-  numa_free_cpumask(cpu_mask);
-#endif
-  return num_processors;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology helpers.
-
-int numaAPI_GetNumCurrentNodesProcessors(void) {
-  struct bitmask* node_mask = numa_get_run_node_mask();
-  const unsigned int num_bytes = numa_bitmask_nbytes(node_mask);
-  const unsigned int num_bits = num_bytes * 8;
-  int num_processors = 0;
-  for (unsigned int bit = 0; bit < num_bits; ++bit) {
-    if (numa_bitmask_isbitset(node_mask, bit)) {
-      num_processors += numaAPI_GetNumNodeProcessors(bit);
-    }
-  }
-  numa_bitmask_free(node_mask);
-  return num_processors;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Affinities.
-
-bool numaAPI_RunProcessOnNode(int node) {
-  numaAPI_RunThreadOnNode(node);
-  return true;
-}
-
-bool numaAPI_RunThreadOnNode(int node) {
-  // Construct bit mask from node index.
-  struct bitmask* node_mask = numa_allocate_nodemask();
-  numa_bitmask_clearall(node_mask);
-  numa_bitmask_setbit(node_mask, node);
-  numa_run_on_node_mask_all(node_mask);
-  // TODO(sergey): The following commands are based on x265 code, we might want
-  // to make those optional, or require to call those explicitly.
-  //
-  // Current assumption is that this is similar to SetThreadGroupAffinity().
-  if (numa_node_size(node, NULL) > 0) {
-    numa_set_interleave_mask(node_mask);
-    numa_set_localalloc();
-  }
-#ifdef WITH_DYNLOAD
-  if (numa_free_nodemask != NULL) {
-    numa_free_nodemask(node_mask);
-  } else {
-    numa_bitmask_free(node_mask);
-  }
-#else
-  numa_free_nodemask(node_mask);
-#endif
-  return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory management.
-
-void* numaAPI_AllocateOnNode(size_t size, int node) {
-  return numa_alloc_onnode(size, node);
-}
-
-void* numaAPI_AllocateLocal(size_t size) {
-  return numa_alloc_local(size);
-}
-
-void numaAPI_Free(void* start, size_t size) {
-  numa_free(start, size);
-}
-
-#endif  // OS_LINUX
--- a/intern/numaapi/source/numaapi_stub.c
+++ b/intern/numaapi/source/numaapi_stub.c
@@ -1,98 +0,0 @@
-// Copyright (c) 2016, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#include "numaapi.h"
-
-#include "build_config.h"
-
-// Stub implementation for platforms which doesn't have NUMA support.
-
-#if !OS_LINUX && !OS_WIN
-
-////////////////////////////////////////////////////////////////////////////////
-// Initialization.
-
-NUMAAPI_Result numaAPI_Initialize(void) {
-  return NUMAAPI_NOT_AVAILABLE;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology query.
-
-int numaAPI_GetNumNodes(void) {
-  return 0;
-}
-
-bool numaAPI_IsNodeAvailable(int node) {
-  (void) node;  // Ignored.
-  return false;
-}
-
-int numaAPI_GetNumNodeProcessors(int node) {
-  (void) node;  // Ignored.
-  return 0;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology helpers.
-
-int numaAPI_GetNumCurrentNodesProcessors(void) {
-  return 0;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Affinities.
-
-bool numaAPI_RunProcessOnNode(int node) {
-  (void) node;  // Ignored.
-  return false;
-}
-
-bool numaAPI_RunThreadOnNode(int node) {
-  (void) node;  // Ignored.
-  return false;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory management.
-
-void* numaAPI_AllocateOnNode(size_t size, int node) {
-  (void) size;  // Ignored.
-  (void) node;  // Ignored.
-  return 0;
-}
-
-void* numaAPI_AllocateLocal(size_t size) {
-  (void) size;  // Ignored.
-  return NULL;
-}
-
-void numaAPI_Free(void* start, size_t size) {
-  (void) start;  // Ignored.
-  (void) size;  // Ignored.
-}
-
-#endif  // !OS_LINUX && !OS_WIN
--- a/intern/numaapi/source/numaapi_win32.c
+++ b/intern/numaapi/source/numaapi_win32.c
@@ -1,296 +0,0 @@
-// Copyright (c) 2016, libnumaapi authors
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal in the Software without restriction, including without limitation the
-// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// sell copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-// IN THE SOFTWARE.
-//
-// Author: Sergey Sharybin <sergey.vfx@gmail.com>
-
-/** \file
- * \ingroup intern_numaapi
- */
-
-#include "build_config.h"
-
-#if OS_WIN
-
-#include "numaapi.h"
-
-#ifndef NOGDI
-#  define NOGDI
-#endif
-#ifndef NOMINMAX
-#  define NOMINMAX
-#endif
-#ifndef WIN32_LEAN_AND_MEAN
-#  define WIN32_LEAN_AND_MEAN
-#endif
-#ifndef NOCOMM
-#  define NOCOMM
-#endif
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <windows.h>
-
-#if ARCH_CPU_64_BITS
-#  include <VersionHelpers.h>
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// Initialization.
-
-// Kernel library, from where the symbols come.
-static HMODULE kernel_lib;
-
-// Types of all symbols which are read from the library.
-
-// NUMA function types.
-typedef BOOL t_GetNumaHighestNodeNumber(PULONG highest_node_number);
-typedef BOOL t_GetNumaNodeProcessorMask(UCHAR node, ULONGLONG* processor_mask);
-typedef BOOL t_GetNumaNodeProcessorMaskEx(USHORT node,
-                                          GROUP_AFFINITY* processor_mask);
-typedef BOOL t_GetNumaProcessorNode(UCHAR processor, UCHAR* node_number);
-typedef void* t_VirtualAllocExNuma(HANDLE process_handle,
-                                   LPVOID address,
-                                   SIZE_T size,
-                                   DWORD  allocation_type,
-                                   DWORD  protect,
-                                   DWORD  preferred);
-typedef BOOL t_VirtualFree(void* address, SIZE_T size, DWORD free_type);
-// Threading function types.
-typedef BOOL t_SetProcessAffinityMask(HANDLE process_handle,
-                                      DWORD_PTR process_affinity_mask);
-typedef BOOL t_SetThreadGroupAffinity(HANDLE thread_handle,
-                                      const GROUP_AFFINITY* group_affinity,
-                                      GROUP_AFFINITY* PreviousGroupAffinity);
-typedef BOOL t_GetThreadGroupAffinity(HANDLE thread_handle,
-                                      GROUP_AFFINITY* group_affinity);
-typedef DWORD t_GetCurrentProcessorNumber(void);
-typedef void t_GetCurrentProcessorNumberEx(PROCESSOR_NUMBER* proc_number);
-typedef DWORD t_GetActiveProcessorCount(WORD group_number);
-
-
-// NUMA symbols.
-static t_GetNumaHighestNodeNumber* _GetNumaHighestNodeNumber;
-static t_GetNumaNodeProcessorMask* _GetNumaNodeProcessorMask;
-static t_GetNumaNodeProcessorMaskEx* _GetNumaNodeProcessorMaskEx;
-static t_GetNumaProcessorNode* _GetNumaProcessorNode;
-static t_VirtualAllocExNuma* _VirtualAllocExNuma;
-static t_VirtualFree* _VirtualFree;
-// Threading symbols.
-static t_SetProcessAffinityMask* _SetProcessAffinityMask;
-static t_SetThreadGroupAffinity* _SetThreadGroupAffinity;
-static t_GetThreadGroupAffinity* _GetThreadGroupAffinity;
-static t_GetCurrentProcessorNumber* _GetCurrentProcessorNumber;
-static t_GetCurrentProcessorNumberEx* _GetCurrentProcessorNumberEx;
-static t_GetActiveProcessorCount* _GetActiveProcessorCount;
-
-static void numaExit(void) {
-  // TODO(sergey): Consider closing library here.
-}
-
-static NUMAAPI_Result loadNumaSymbols(void) {
-  // Prevent multiple initializations.
-  static bool initialized = false;
-  static NUMAAPI_Result result = NUMAAPI_NOT_AVAILABLE;
-  if (initialized) {
-    return result;
-  }
-  initialized = true;
-  // Register de-initialization.
-  const int error = atexit(numaExit);
-  if (error) {
-    result = NUMAAPI_ERROR_ATEXIT;
-    return result;
-  }
-  // Load library.
-  kernel_lib = LoadLibraryA("Kernel32.dll");
-  // Load symbols.
-
-#define _LIBRARY_FIND(lib, name)                   \
-  do {                                             \
-    _##name = (t_##name *)GetProcAddress(lib, #name);  \
-  } while (0)
-#define KERNEL_LIBRARY_FIND(name) _LIBRARY_FIND(kernel_lib, name)
-
-  // NUMA.
-  KERNEL_LIBRARY_FIND(GetNumaHighestNodeNumber);
-  KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMask);
-  KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMaskEx);
-  KERNEL_LIBRARY_FIND(GetNumaProcessorNode);
-  KERNEL_LIBRARY_FIND(VirtualAllocExNuma);
-  KERNEL_LIBRARY_FIND(VirtualFree);
-  // Threading.
-  KERNEL_LIBRARY_FIND(SetProcessAffinityMask);
-  KERNEL_LIBRARY_FIND(SetThreadGroupAffinity);
-  KERNEL_LIBRARY_FIND(GetThreadGroupAffinity);
-  KERNEL_LIBRARY_FIND(GetCurrentProcessorNumber);
-  KERNEL_LIBRARY_FIND(GetCurrentProcessorNumberEx);
-  KERNEL_LIBRARY_FIND(GetActiveProcessorCount);
-
-#undef KERNEL_LIBRARY_FIND
-#undef _LIBRARY_FIND
-
-  result = NUMAAPI_SUCCESS;
-  return result;
-}
-
-NUMAAPI_Result numaAPI_Initialize(void) {
-#if !ARCH_CPU_64_BITS
-  // No NUMA on 32 bit platforms.
-  return NUMAAPI_NOT_AVAILABLE;
-#else
-  if (!IsWindows7OrGreater()) {
-    // Require Windows 7 or higher.
-    NUMAAPI_NOT_AVAILABLE;
-  }
-  loadNumaSymbols();
-  return NUMAAPI_SUCCESS;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Internal helpers.
-
-static int countNumSetBits(ULONGLONG mask) {
-  // TODO(sergey): There might be faster way calculating number of set bits.
-  // NOTE: mask must be unsigned, there is undefined behavior for signed ints.
-  int num_bits = 0;
-  while (mask != 0) {
-    num_bits += (mask & 1);
-    mask = (mask >> 1);
-  }
-  return num_bits;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology query.
-
-int numaAPI_GetNumNodes(void) {
-  ULONG highest_node_number;
-  if (!_GetNumaHighestNodeNumber(&highest_node_number)) {
-    return 0;
-  }
-  // TODO(sergey): Resolve the type narrowing.
-  // NOTE: This is not necessarily a total amount of nodes in the system.
-  return (int)highest_node_number + 1;
-}
-
-bool numaAPI_IsNodeAvailable(int node) {
-  // Trick to detect whether the node is usable or not: check whether
-  // there are any processors associated with it.
-  //
-  // This is needed because numaApiGetNumNodes() is not guaranteed to
-  // give total amount of nodes and some nodes might be unavailable.
-  GROUP_AFFINITY processor_mask = { 0 };
-  if (!_GetNumaNodeProcessorMaskEx(node, &processor_mask)) {
-    return false;
-  }
-  if (processor_mask.Mask == 0) {
-    return false;
-  }
-  return true;
-}
-
-int numaAPI_GetNumNodeProcessors(int node) {
-  GROUP_AFFINITY processor_mask = { 0 };
-  if (!_GetNumaNodeProcessorMaskEx(node, &processor_mask)) {
-    return 0;
-  }
-  return countNumSetBits(processor_mask.Mask);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Topology helpers.
-
-int numaAPI_GetNumCurrentNodesProcessors(void) {
-  HANDLE thread_handle = GetCurrentThread();
-  GROUP_AFFINITY group_affinity;
-  // TODO(sergey): Needs implementation.
-  if (!_GetThreadGroupAffinity(thread_handle, &group_affinity)) {
-    return 0;
-  }
-  // First, count number of possible bits in the affinity mask.
-  const int num_processors = countNumSetBits(group_affinity.Mask);
-  // Then check that it's not exceeding number of processors in tjhe group.
-  const int num_group_processors =
-      _GetActiveProcessorCount(group_affinity.Group);
-  if (num_group_processors < num_processors) {
-    return num_group_processors;
-  }
-  return num_processors;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Affinities.
-
-bool numaAPI_RunProcessOnNode(int node) {
-  // TODO(sergey): Make sure requested node is within active CPU group.
-  // Change affinity of the proces to make it to run on a given node.
-  HANDLE process_handle = GetCurrentProcess();
-  GROUP_AFFINITY processor_mask = { 0 };
-  if (_GetNumaNodeProcessorMaskEx(node, &processor_mask) == 0) {
-    return false;
-  }
-  // TODO: Affinity should respect processor group.
-  if (_SetProcessAffinityMask(process_handle, processor_mask.Mask) == 0) {
-    return false;
-  }
-  return true;
-}
-
-bool numaAPI_RunThreadOnNode(int node) {
-  HANDLE thread_handle = GetCurrentThread();
-  GROUP_AFFINITY group_affinity = { 0 };
-  if (_GetNumaNodeProcessorMaskEx(node, &group_affinity) == 0) {
-    return false;
-  }
-  if (_SetThreadGroupAffinity(thread_handle, &group_affinity, NULL) == 0) {
-    return false;
-  }
-  return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Memory management.
-
-void* numaAPI_AllocateOnNode(size_t size, int node) {
-  return _VirtualAllocExNuma(GetCurrentProcess(),
-                             NULL,
-                             size,
-                             MEM_RESERVE | MEM_COMMIT,
-                             PAGE_READWRITE,
-                             node);
-}
-
-void* numaAPI_AllocateLocal(size_t size) {
-  UCHAR current_processor = (UCHAR)_GetCurrentProcessorNumber();
-  UCHAR node;
-  if (!_GetNumaProcessorNode(current_processor, &node)) {
-    return NULL;
-  }
-  return numaAPI_AllocateOnNode(size, node);
-}
-
-void numaAPI_Free(void* start, size_t size) {
-  if (!_VirtualFree(start, size, MEM_RELEASE)) {
-    // TODO(sergey): Throw an error!
-  }
-}
-
-#endif  // OS_WIN
--- a/intern/opencolorio/gpu_shader_display_transform.glsl
+++ b/intern/opencolorio/gpu_shader_display_transform.glsl
@@ -119,7 +119,7 @@ vec4 curvemapping_evaluate_premulRGBF(vec4 col)

 /* Using a triangle distribution which gives a more final uniform noise.
 * See Banding in Games:A Noisy Rant(revision 5) Mikkel Gjøl, Playdead (slide 27) */
-/* GPUs are rounding before writting to framebuffer so we center the distribution around 0.0. */
+/* GPUs are rounding before writing to framebuffer so we center the distribution around 0.0. */
 /* Return triangle noise in [-1..1[ range */
 float dither_random_value(vec2 co)
 {
--- a/intern/opensubdiv/internal/evaluator/patch_map.cc
+++ b/intern/opensubdiv/internal/evaluator/patch_map.cc
@@ -23,6 +23,7 @@
 // Modifications copyright 2021 Blender Foundation. All rights reserved.

 #include "internal/evaluator/patch_map.h"
+#include <algorithm>

 using OpenSubdiv::Far::ConstPatchParamArray;
 using OpenSubdiv::Far::Index;
--- a/intern/opensubdiv/internal/topology/mesh_topology.cc
+++ b/intern/opensubdiv/internal/topology/mesh_topology.cc
@@ -183,7 +183,7 @@ void MeshTopology::setNumFaces(int num_faces)
  num_faces_ = num_faces;

  // NOTE: Extra element to store fake face past the last real one to make it
-  // possible to calculate number of verticies in the last face.
+  // possible to calculate number of vertices in the last face.
  faces_first_vertex_index_.resize(num_faces + 1, 0);
 }

--- a/intern/opensubdiv/internal/topology/mesh_topology.h
+++ b/intern/opensubdiv/internal/topology/mesh_topology.h
@@ -111,7 +111,7 @@ class MeshTopology {
  // Pipeline related.

  // This function is to be called when number of vertices, edges, faces, and
-  // face-verticies are known.
+  // face-vertices are known.
  //
  // Usually is called from the end of topology refiner factory's
  // resizeComponentTopology().
@@ -162,7 +162,7 @@ class MeshTopology {

  int num_faces_;

-  // Continuous array of all verticies of all faces:
+  // Continuous array of all vertices of all faces:
  //  [vertex indices of face 0][vertex indices of face 1] .. [vertex indices of face n].
  vector<int> face_vertex_indices_;

--- a/intern/opensubdiv/opensubdiv_converter_capi.h
+++ b/intern/opensubdiv/opensubdiv_converter_capi.h
@@ -135,7 +135,7 @@ typedef struct OpenSubdiv_Converter {
  // specified in precalcUVLayer().
  int (*getNumUVCoordinates)(const struct OpenSubdiv_Converter *converter);
  // For the given face index and its corner (known as loop in Blender)
-  // get corrsponding UV coordinate index.
+  // get corresponding UV coordinate index.
  int (*getFaceCornerUVIndex)(const struct OpenSubdiv_Converter *converter,
                              const int face_index,
                              const int corner_index);
--- a/intern/utfconv/utfconv.c
+++ b/intern/utfconv/utfconv.c
@@ -56,7 +56,7 @@ size_t count_utf_8_from_16(const wchar_t *string16)
          }
          else {
            if (u < 0xE000) {
-              /*illigal*/;
+              /*illegal*/;
            }
            else {
              count += 3;
--- a/release/scripts/modules/keyingsets_utils.py
+++ b/release/scripts/modules/keyingsets_utils.py
@@ -238,11 +238,15 @@ def RKS_GEN_custom_props(_ksi, _context, ks, data):
            continue

        prop_path = '["%s"]' % bpy.utils.escape_identifier(cprop_name)
+
        try:
            rna_property = data.path_resolve(prop_path, False)
        except ValueError:
-            # This happens when a custom property is set to None. In that case it cannot
-            # be converted to an FCurve-compatible value, so we can't keyframe it anyway.
+            # Can technically happen, but there is no known case.
+            continue
+        if rna_property is None:
+            # In this case the property cannot be converted to an 
+            # FCurve-compatible value, so we can't keyframe it anyways.
            continue
        if rna_property.rna_type not in prop_type_compat:
            continue
--- a/release/scripts/startup/bl_ui/properties_animviz.py
+++ b/release/scripts/startup/bl_ui/properties_animviz.py
@@ -70,14 +70,10 @@ class MotionPathButtonsPanel:

            col = layout.column(align=True)

-            row = col.row(align=True)
            if bones:
-                row.operator("pose.paths_update", text="Update Paths", icon='BONE_DATA')
-                row.operator("pose.paths_clear", text="", icon='X')
+                col.operator("pose.paths_update", text="Update Paths", icon='BONE_DATA')
            else:
-                row.operator("object.paths_update", text="Update Paths", icon='OBJECT_DATA')
-                row.operator("object.paths_clear", text="", icon='X')
-            col.operator("object.paths_update_visible", text="Update All Paths", icon='WORLD')
+                col.operator("object.paths_update", text="Update Paths", icon='OBJECT_DATA')
        else:
            col = layout.column(align=True)
            col.label(text="Nothing to show yet...", icon='ERROR')
@@ -86,7 +82,13 @@ class MotionPathButtonsPanel:
                col.operator("pose.paths_calculate", text="Calculate...", icon='BONE_DATA')
            else:
                col.operator("object.paths_calculate", text="Calculate...", icon='OBJECT_DATA')
-            col.operator("object.paths_update_visible", text="Update All Paths", icon='WORLD')
+
+        row = col.row(align=True)
+        row.operator("object.paths_update_visible", text="Update All Paths", icon='WORLD')
+        if bones:
+            row.operator("pose.paths_clear", text="", icon='X')
+        else:
+            row.operator("object.paths_clear", text="", icon='X')


 class MotionPathButtonsPanel_display:
--- a/release/scripts/startup/bl_ui/space_dopesheet.py
+++ b/release/scripts/startup/bl_ui/space_dopesheet.py
@@ -382,7 +382,7 @@ class DOPESHEET_MT_view(Menu):
 class DOPESHEET_MT_view_pie(Menu):
    bl_label = "View"

-    def draw(self, context):
+    def draw(self, _context):
        layout = self.layout

        pie = layout.menu_pie()
@@ -544,7 +544,7 @@ class DopesheetActionPanelBase:
    bl_label = "Action"

    @classmethod
-    def draw_generic_panel(cls, context, layout, action):
+    def draw_generic_panel(cls, _context, layout, action):
        layout.label(text=action.name, icon='ACTION')

        layout.prop(action, "use_frame_range")
--- a/release/scripts/startup/bl_ui/space_filebrowser.py
+++ b/release/scripts/startup/bl_ui/space_filebrowser.py
@@ -847,10 +847,10 @@ classes = (
    ASSETBROWSER_MT_context_menu,
 )

-def asset_path_str_get(self):
+def asset_path_str_get(_self):
    asset_file_handle = bpy.context.asset_file_handle
    if asset_file_handle is None:
-        return None
+        return ""

    if asset_file_handle.local_id:
        return "Current File"
--- a/release/scripts/startup/bl_ui/space_graph.py
+++ b/release/scripts/startup/bl_ui/space_graph.py
@@ -344,7 +344,7 @@ class GRAPH_MT_slider(Menu):

    def draw(self, _context):
        layout = self.layout
-        
+
        layout.operator("graph.breakdown", text="Breakdown")
        layout.operator("graph.blend_to_neighbor", text="Blend To Neighbor")

@@ -352,7 +352,7 @@ class GRAPH_MT_slider(Menu):
 class GRAPH_MT_view_pie(Menu):
    bl_label = "View"

-    def draw(self, context):
+    def draw(self, _context):
        layout = self.layout

        pie = layout.menu_pie()
--- a/release/scripts/startup/bl_ui/space_nla.py
+++ b/release/scripts/startup/bl_ui/space_nla.py
@@ -266,7 +266,7 @@ class NLA_MT_snap_pie(Menu):
 class NLA_MT_view_pie(Menu):
    bl_label = "View"

-    def draw(self, context):
+    def draw(self, _context):
        layout = self.layout

        pie = layout.menu_pie()
--- a/release/scripts/startup/bl_ui/space_node.py
+++ b/release/scripts/startup/bl_ui/space_node.py
@@ -370,7 +370,7 @@ class NODE_MT_node(Menu):
 class NODE_MT_view_pie(Menu):
    bl_label = "View"

-    def draw(self, context):
+    def draw(self, _context):
        layout = self.layout

        pie = layout.menu_pie()
--- a/Show More
+++ b/Show More