blender-archive/intern/cycles/device/metal/device_impl.mm
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2021-2022 Blender Foundation */
#ifdef WITH_METAL
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
# include "scene/scene.h"
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/time.h"
# include <crt_externs.h>
CCL_NAMESPACE_BEGIN
class MetalDevice;
thread_mutex MetalDevice::existing_devices_mutex;
std::map<int, MetalDevice *> MetalDevice::active_device_ids;
/* Thread-safe device access for async work. Calling code must pass an appropriately scoped lock
 * on existing_devices_mutex to safeguard against destruction of the returned instance. */
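/* Minimal usage sketch (mirroring the pattern used in compile_and_load further below), where
 * 'some_id' is a placeholder for a previously captured device_id:
 *
 *   thread_scoped_lock lock(existing_devices_mutex);
 *   if (MetalDevice *instance = get_device_by_ID(some_id, lock)) {
 *     // 'instance' may only be used while 'lock' remains held.
 *   }
 */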
MetalDevice *MetalDevice::get_device_by_ID(int ID, thread_scoped_lock &existing_devices_mutex_lock)
{
auto it = active_device_ids.find(ID);
if (it != active_device_ids.end()) {
return it->second;
}
return nullptr;
}
bool MetalDevice::is_device_cancelled(int ID)
{
thread_scoped_lock lock(existing_devices_mutex);
return get_device_by_ID(ID, lock) == nullptr;
}
BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
{
return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
}
void MetalDevice::set_error(const string &error)
{
static std::mutex s_error_mutex;
std::lock_guard<std::mutex> lock(s_error_mutex);
Device::set_error(error);
if (first_error) {
fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
fprintf(stderr,
"https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
first_error = false;
}
}
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
{
/* Assign an ID for this device which we can use to query whether async shader compilation
* requests are still relevant. */
thread_scoped_lock lock(existing_devices_mutex);
static int existing_devices_counter = 1;
device_id = existing_devices_counter++;
active_device_ids[device_id] = this;
}
mtlDevId = info.num;
/* select chosen device */
auto usable_devices = MetalInfo::get_usable_devices();
assert(mtlDevId < usable_devices.size());
mtlDevice = usable_devices[mtlDevId];
device_vendor = MetalInfo::get_device_vendor(mtlDevice);
assert(device_vendor != METAL_GPU_UNKNOWN);
metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str());
/* determine default storage mode based on whether UMA is supported */
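/* MTLResourceStorageModeShared gives a single CPU/GPU-visible allocation (zero-copy on UMA),
 * whereas MTLResourceStorageModeManaged keeps separate CPU and GPU copies that must be
 * synchronized explicitly (didModifyRange for CPU->GPU, blit synchronizeResource for GPU->CPU),
 * as done elsewhere in this file. */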
default_storage_mode = MTLResourceStorageModeManaged;
if (@available(macos 11.0, *)) {
if ([mtlDevice hasUnifiedMemory]) {
default_storage_mode = MTLResourceStorageModeShared;
}
}
texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL: {
max_threads_per_threadgroup = 64;
break;
}
case METAL_GPU_AMD: {
max_threads_per_threadgroup = 128;
break;
}
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
break;
}
}
if (auto metalrt = getenv("CYCLES_METALRT")) {
use_metalrt = (atoi(metalrt) != 0);
}
if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
capture_enabled = true;
}
if (device_vendor == METAL_GPU_APPLE) {
/* Set kernel_specialization_level based on user prefs. */
switch (info.kernel_optimization_level) {
case KERNEL_OPTIMIZATION_LEVEL_OFF:
kernel_specialization_level = PSO_GENERIC;
break;
default:
case KERNEL_OPTIMIZATION_LEVEL_INTERSECT:
kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
case KERNEL_OPTIMIZATION_LEVEL_FULL:
kernel_specialization_level = PSO_SPECIALIZED_SHADE;
break;
}
}
if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
kernel_specialization_level = (MetalPipelineType)atoi(envstr);
}
metal_printf("kernel_specialization_level = %s\n",
kernel_type_as_string(
(MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
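/* Argument encoder for the launch parameters. KernelParamsMetal is treated as a flat array of
 * device_ptr-sized fields (see the pointer arithmetic in const_copy_to and erase_allocation),
 * so a single pointer argument descriptor with a matching arrayLength covers the whole struct. */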
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
mtlBufferKernelParamsEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_params ]];
MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
arg_desc_texture.dataType = MTLDataTypeTexture;
arg_desc_texture.access = MTLArgumentAccessReadOnly;
mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
/* command queue for non-tracing work on the GPU */
mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
/* Acceleration structure arg encoder, if needed */
if (@available(macos 12.0, *)) {
if (use_metalrt) {
MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
[arg_desc_as release];
}
}
/* Build the arg encoder for the ancillary bindings */
{
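/* Ancillary argument buffer layout (indices assigned below): [0] metal_tex_2d, [1] metal_tex_3d,
 * then, when MetalRT is enabled, [2] accel_struct, [3] ift_default, [4] ift_shadow and
 * [5] ift_local. */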
NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
int index = 0;
MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
arg_desc_tex.dataType = MTLDataTypePointer;
arg_desc_tex.access = MTLArgumentAccessReadOnly;
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */
[arg_desc_tex release];
if (@available(macos 12.0, *)) {
if (use_metalrt) {
MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
arg_desc_as.index = index++;
[ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
[arg_desc_ift release];
[arg_desc_as release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
[ancillary_desc release];
}
[arg_desc_params release];
[arg_desc_texture release];
}
MetalDevice::~MetalDevice()
{
/* Cancel any async shader compilations that are in flight. */
cancel();
/* This lock safeguards against destruction during use (see other uses of
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
for (auto &tex : texture_slot_map) {
if (tex) {
[tex release];
tex = nil;
}
}
flush_delayed_free_list();
if (texture_bindings_2d) {
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
[texture_bindings_2d release];
[texture_bindings_3d release];
}
[mtlTextureArgEncoder release];
[mtlBufferKernelParamsEncoder release];
[mtlASArgEncoder release];
[mtlAncillaryArgEncoder release];
[mtlGeneralCommandQueue release];
[mtlDevice release];
texture_info.free();
}
bool MetalDevice::support_device(const uint kernel_features /*requested_features*/)
{
return true;
}
bool MetalDevice::check_peer_access(Device *peer_device)
{
assert(0);
/* does peer access make sense? */
return false;
}
bool MetalDevice::use_adaptive_compilation()
{
return DebugFlags().metal.adaptive_compile;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
if (use_adaptive_compilation()) {
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
NSProcessInfo *processInfo = [NSProcessInfo processInfo];
NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";
string &source = this->source[pso_type];
source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
source = path_source_replace_includes(source, path_get("source"));
/* Perform any required specialization on the source.
* With Metal function constants we can generate a single variant of the kernel source which can
* be repeatedly respecialized.
*/
string baked_constants;
/* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
* the same character length. Build a string of all active constant values which is then hashed
* in order to identify the PSO.
*/
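/* Illustrative example (member name chosen for clarity): "kernel_data.film.exposure" is
 * rewritten to "kernel_data_film_exposure". Swapping '.' for '_' keeps the identifier length
 * unchanged, so the replacement can be done in place on the generic source string. */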
if (pso_type != PSO_GENERIC) {
const double starttime = time_dt();
# define KERNEL_STRUCT_BEGIN(name, parent) \
string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
bool next_member_is_specialized = true;
# define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
/* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
if (next_member_is_specialized) { \
baked_constants += string(#parent "." #name "=") + \
to_string(_type(launch_params.data.parent.name)) + "\n"; \
} \
else { \
string_replace( \
source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
next_member_is_specialized = true; \
}
# include "kernel/data_template.h"
/* Opt in to all available specializations. This can be made more granular for the
* PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
* but the overhead should be negligible as these are very quick to (re)build and aren't
* serialized to disk via MTLBinaryArchives.
*/
global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
source = global_defines + source;
# if 0
metal_printf("================\n%s================\n\%s================\n",
global_defines.c_str(),
baked_constants.c_str());
# endif
/* Generate an MD5 from the source and include any baked constants. This is used when caching
* PSOs. */
MD5Hash md5;
md5.append(baked_constants);
md5.append(source);
if (use_metalrt) {
md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK));
}
source_md5[pso_type] = md5.get_hex();
}
bool MetalDevice::load_kernels(const uint _kernel_features)
{
kernel_features = _kernel_features;
/* check if GPU is supported */
if (!support_device(kernel_features))
return false;
/* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
 * This is necessary since objects may be reported to have motion if the Vector pass is
 * active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
/* Only request generic kernels if they aren't cached in memory. */
if (make_source_and_check_if_compile_needed(PSO_GENERIC)) {
/* If needed, load them asynchronously in order to responsively message progress to the user.
*/
int this_device_id = this->device_id;
auto compile_kernels_fn = ^() {
compile_and_load(this_device_id, PSO_GENERIC);
};
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
compile_kernels_fn);
}
return true;
}
bool MetalDevice::make_source_and_check_if_compile_needed(MetalPipelineType pso_type)
{
if (this->source[pso_type].empty()) {
make_source(pso_type, kernel_features);
}
return MetalDeviceKernels::should_load_kernels(this, pso_type);
}
void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
{
/* Thread-safe front-end compilation. Typically the MSL->AIR compilation can take a few seconds,
* so we avoid blocking device tear-down if the user cancels a render immediately. */
id<MTLDevice> mtlDevice;
string source;
MetalGPUVendor device_vendor;
/* Safely gather any state required for the MSL->AIR compilation. */
{
thread_scoped_lock lock(existing_devices_mutex);
/* Check whether the device still exists. */
MetalDevice *instance = get_device_by_ID(device_id, lock);
if (!instance) {
metal_printf("Ignoring %s compilation request - device no longer exists\n",
kernel_type_as_string(pso_type));
return;
}
if (!instance->make_source_and_check_if_compile_needed(pso_type)) {
/* We already have a full set of matching pipelines which are cached or queued. Return early
* to avoid redundant MTLLibrary compilation. */
metal_printf("Ignoreing %s compilation request - kernels already requested\n",
kernel_type_as_string(pso_type));
return;
}
mtlDevice = instance->mtlDevice;
device_vendor = instance->device_vendor;
source = instance->source[pso_type];
}
/* Perform the actual compilation using our cached context. The MetalDevice can safely destruct
* in this time. */
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
if (device_vendor == METAL_GPU_INTEL) {
[options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
}
}
# endif
options.fastMathEnabled = YES;
if (@available(macOS 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
}
if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
source);
}
double starttime = time_dt();
NSError *error = NULL;
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
[options release];
bool blocking_pso_build = (getenv("CYCLES_METAL_PROFILING") ||
MetalDeviceKernels::is_benchmark_warmup());
if (blocking_pso_build) {
MetalDeviceKernels::wait_for_all();
starttime = 0.0;
}
/* Save the compiled MTLLibrary and trigger the AIR->PSO builds (if the MetalDevice still
* exists). */
{
thread_scoped_lock lock(existing_devices_mutex);
if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
if (mtlLibrary) {
instance->mtlLibrary[pso_type] = mtlLibrary;
starttime = time_dt();
MetalDeviceKernels::load(instance, pso_type);
}
else {
NSString *err = [error localizedDescription];
instance->set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
}
}
if (starttime && blocking_pso_build) {
MetalDeviceKernels::wait_for_all();
metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
time_dt() - starttime,
kernel_type_as_string(pso_type));
}
}
void MetalDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying. */
need_texture_info = false;
texture_info.copy_to_device();
int num_textures = texture_info.size();
for (int tex = 0; tex < num_textures; tex++) {
uint64_t offset = tex * sizeof(void *);
id<MTLTexture> metal_texture = texture_slot_map[tex];
if (!metal_texture) {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
else {
MTLTextureType type = metal_texture.textureType;
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
}
}
if (default_storage_mode == MTLResourceStorageModeManaged) {
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
[texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
}
}
}
void MetalDevice::erase_allocation(device_memory &mem)
{
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
auto it = metal_mem_map.find(&mem);
if (it != metal_mem_map.end()) {
MetalMem *mmem = it->second.get();
/* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
if (mmem->pointer_index >= 0) {
device_ptr *pointers = (device_ptr *)&launch_params;
pointers[mmem->pointer_index] = 0;
}
metal_mem_map.erase(it);
}
}
bool MetalDevice::max_working_set_exceeded(size_t safety_margin) const
{
/* We're allowed to allocate beyond the safe working set size, but then if all resources are made
* resident we will get command buffer failures at render time. */
size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
return (stats.mem_used > available);
}
MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
{
size_t size = mem.memory_size();
mem.device_pointer = 0;
id<MTLBuffer> metal_buffer = nil;
MTLResourceOptions options = default_storage_mode;
/* Workaround for "bake" unit tests which fail if RenderBuffers is allocated with
* MTLResourceStorageModeShared. */
if (strstr(mem.name, "RenderBuffers")) {
options = MTLResourceStorageModeManaged;
}
if (size > 0) {
if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
options = MTLResourceStorageModePrivate;
}
metal_buffer = [mtlDevice newBufferWithLength:size options:options];
if (!metal_buffer) {
set_error("System is out of GPU memory");
return nullptr;
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
}
mem.device_size = metal_buffer.allocatedSize;
stats.mem_alloc(mem.device_size);
metal_buffer.label = [[NSString alloc] initWithFormat:@"%s", mem.name];
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
MetalMem *mmem = new MetalMem;
metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
mmem->mem = &mem;
mmem->mtlBuffer = metal_buffer;
mmem->offset = 0;
mmem->size = size;
if (options != MTLResourceStorageModePrivate) {
mmem->hostPtr = [metal_buffer contents];
}
else {
mmem->hostPtr = nullptr;
}
/* encode device_pointer as (MetalMem*) in order to handle resource relocation and device pointer
* recalculation */
mem.device_pointer = device_ptr(mmem);
if (metal_buffer.storageMode == MTLResourceStorageModeShared) {
/* Replace host pointer with our host allocation. */
if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
memcpy(mmem->hostPtr, mem.host_pointer, size);
mem.host_free();
mem.host_pointer = mmem->hostPtr;
}
mem.shared_pointer = mmem->hostPtr;
mem.shared_counter++;
mmem->use_UMA = true;
}
else {
mmem->use_UMA = false;
}
if (max_working_set_exceeded()) {
set_error("System is out of GPU memory");
return nullptr;
}
return mmem;
}
void MetalDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
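/* Skip the copy when the host pointer already aliases the shared UMA allocation; otherwise
 * mirror the host data into the Metal buffer and, for managed storage, flush the written range. */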
if (!metal_mem_map.at(&mem)->use_UMA || mem.host_pointer != mem.shared_pointer) {
MetalMem &mmem = *metal_mem_map.at(&mem);
memcpy(mmem.hostPtr, mem.host_pointer, mem.memory_size());
if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) {
[mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.memory_size())];
}
}
}
void MetalDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
size_t size = mmem.size;
/* If mmem.use_UMA is true, reference counting is used
 * to safely free memory. */
bool free_mtlBuffer = false;
if (mmem.use_UMA) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
free_mtlBuffer = true;
}
}
}
else {
free_mtlBuffer = true;
}
if (free_mtlBuffer) {
if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
/* Safely move the device-side data back to the host before it is freed. */
mem.host_pointer = mem.host_alloc(size);
memcpy(mem.host_pointer, mem.shared_pointer, size);
mmem.use_UMA = false;
}
mem.shared_pointer = 0;
/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}
erase_allocation(mem);
}
}
void MetalDevice::mem_alloc(device_memory &mem)
{
if (mem.type == MEM_TEXTURE) {
assert(!"mem_alloc not supported for textures.");
}
else if (mem.type == MEM_GLOBAL) {
generic_alloc(mem);
}
else {
generic_alloc(mem);
}
}
void MetalDevice::mem_copy_to(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
global_alloc(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
tex_alloc((device_texture &)mem);
}
else {
if (!mem.device_pointer) {
generic_alloc(mem);
}
generic_copy_to(mem);
}
}
void MetalDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
if (mem.host_pointer) {
bool subcopy = (w >= 0 && h >= 0);
const size_t size = subcopy ? (elem * w * h) : mem.memory_size();
const size_t offset = subcopy ? (elem * y * w) : 0;
if (mem.device_pointer) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
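/* Managed storage keeps separate CPU/GPU copies, so the GPU copy must be synchronized back
 * to the CPU-visible copy before reading it on the host. */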
id<MTLCommandBuffer> cmdBuffer = [mtlGeneralCommandQueue commandBuffer];
id<MTLBlitCommandEncoder> blitEncoder = [cmdBuffer blitCommandEncoder];
[blitEncoder synchronizeResource:mmem.mtlBuffer];
[blitEncoder endEncoding];
[cmdBuffer commit];
[cmdBuffer waitUntilCompleted];
}
if (mem.host_pointer != mmem.hostPtr) {
memcpy((uchar *)mem.host_pointer + offset, (uchar *)mmem.hostPtr + offset, size);
}
}
else {
memset((char *)mem.host_pointer + offset, 0, size);
}
}
}
void MetalDevice::mem_zero(device_memory &mem)
{
if (!mem.device_pointer) {
mem_alloc(mem);
}
if (!mem.device_pointer) {
return;
}
size_t size = mem.memory_size();
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
memset(mmem.hostPtr, 0, size);
if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
[mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)];
}
}
void MetalDevice::mem_free(device_memory &mem)
{
if (mem.type == MEM_GLOBAL) {
global_free(mem);
}
else if (mem.type == MEM_TEXTURE) {
tex_free((device_texture &)mem);
}
else {
generic_free(mem);
}
}
device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
{
/* METAL_WIP - revive if necessary */
assert(0);
return 0;
}
void MetalDevice::cancel()
{
/* Remove this device's ID from the list of active devices. Any pending compilation requests
* originating from this session will be cancelled. */
thread_scoped_lock lock(existing_devices_mutex);
if (device_id) {
active_device_ids.erase(device_id);
device_id = 0;
}
}
bool MetalDevice::is_ready(string &status) const
{
int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
if (num_loaded < DEVICE_KERNEL_NUM) {
status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
num_loaded,
DEVICE_KERNEL_NUM);
return false;
}
metal_printf("MetalDevice::is_ready(...) --> true\n");
return true;
}
void MetalDevice::optimize_for_scene(Scene *scene)
{
MetalPipelineType specialization_level = kernel_specialization_level;
if (!scene->params.background) {
/* In live viewport, don't specialize beyond intersection kernels for responsiveness. */
specialization_level = (MetalPipelineType)min(specialization_level, PSO_SPECIALIZED_INTERSECT);
}
/* For responsive rendering, specialize the kernels in the background, and only if there isn't an
* existing "optimize_for_scene" request in flight. */
int this_device_id = this->device_id;
auto specialize_kernels_fn = ^() {
for (int level = 1; level <= int(specialization_level); level++) {
compile_and_load(this_device_id, MetalPipelineType(level));
}
};
/* In normal use, we always compile the specialized kernels in the background. */
bool specialize_in_background = true;
/* Block if per-kernel profiling is enabled (to ensure a steady rendering rate). */
if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
specialize_in_background = false;
}
/* Block during benchmark warm-up to ensure kernels are cached prior to the observed run. */
if (MetalDeviceKernels::is_benchmark_warmup()) {
specialize_in_background = false;
}
if (specialize_in_background) {
if (!MetalDeviceKernels::any_specialization_happening_now()) {
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
specialize_kernels_fn);
}
else {
metal_printf("\"optimize_for_scene\" request already in flight - dropping request\n");
}
}
else {
specialize_kernels_fn();
}
}
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
if (strcmp(name, "data") == 0) {
assert(size == sizeof(KernelData));
memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
return;
}
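/* Helper: copy a block of device pointers into launch_params at the given byte offset, and
 * record each MetalMem's slot index so that erase_allocation() can blank out stale entries. */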
auto update_launch_pointers =
[&](size_t offset, void *data, size_t data_size, size_t pointers_size) {
memcpy((uint8_t *)&launch_params + offset, data, data_size);
MetalMem **mmem = (MetalMem **)data;
int pointer_count = pointers_size / sizeof(device_ptr);
int pointer_index = offset / sizeof(device_ptr);
for (int i = 0; i < pointer_count; i++) {
if (mmem[i]) {
mmem[i]->pointer_index = pointer_index + i;
}
}
};
/* Update data storage pointers in launch parameters. */
if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is a block of contiguous device pointers, up to sort_partition_divisor. */
const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
# define KERNEL_DATA_ARRAY(data_type, tex_name) \
else if (strcmp(name, #tex_name) == 0) \
{ \
update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
}
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
void MetalDevice::global_alloc(device_memory &mem)
{
if (mem.is_resident(this)) {
generic_alloc(mem);
generic_copy_to(mem);
}
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}
void MetalDevice::global_free(device_memory &mem)
{
if (mem.is_resident(this) && mem.device_pointer) {
generic_free(mem);
}
}
void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
{
generic_alloc(mem);
generic_copy_to(mem);
/* Resize once */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
/* Allocate some slots in advance, to reduce the
 * number of re-allocations. */
texture_info.resize(round_up(slot + 1, 128));
}
mem.info.data = (uint64_t)mem.device_pointer;
/* Set Mapping and tag that we need to (re-)upload to device */
texture_info[slot] = mem.info;
need_texture_info = true;
}
void MetalDevice::tex_alloc(device_texture &mem)
{
/* Check that dimensions fit within maximum allowable size.
* See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
if (mem.data_width > 16384 || mem.data_height > 16384) {
set_error(string_printf(
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
mem.data_width,
mem.data_height));
return;
}
MTLStorageMode storage_mode = MTLStorageModeManaged;
if (@available(macos 10.15, *)) {
if ([mtlDevice hasUnifiedMemory] &&
device_vendor !=
METAL_GPU_INTEL) { /* Intel GPUs don't support MTLStorageModeShared for MTLTextures */
storage_mode = MTLStorageModeShared;
}
}
/* General variables for both architectures */
string bind_name = mem.name;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();
/* sampler_index maps into the GPU's constant 'metal_samplers' array */
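/* The extension mode selects the base sampler; the +4 offset below selects the linear-filtered
 * variant (the table is assumed to hold the nearest-filtered samplers in its first four slots). */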
uint64_t sampler_index = mem.info.extension;
if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
sampler_index += 4;
}
/* Image Texture Storage */
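/* Note: Metal has no 3-component pixel formats, hence MTLPixelFormatInvalid in the 3-element
 * slot below; such textures are expected to have been padded to 4 components upstream. */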
MTLPixelFormat format;
switch (mem.data_type) {
case TYPE_UCHAR: {
MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
MTLPixelFormatRG8Unorm,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA8Unorm};
format = formats[mem.data_elements - 1];
} break;
case TYPE_UINT16: {
MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
MTLPixelFormatRG16Unorm,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA16Unorm};
format = formats[mem.data_elements - 1];
} break;
case TYPE_UINT: {
MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
MTLPixelFormatRG32Uint,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA32Uint};
format = formats[mem.data_elements - 1];
} break;
case TYPE_INT: {
MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
MTLPixelFormatRG32Sint,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA32Sint};
format = formats[mem.data_elements - 1];
} break;
case TYPE_FLOAT: {
MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
MTLPixelFormatRG32Float,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA32Float};
format = formats[mem.data_elements - 1];
} break;
case TYPE_HALF: {
MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
MTLPixelFormatRG16Float,
MTLPixelFormatInvalid,
MTLPixelFormatRGBA16Float};
format = formats[mem.data_elements - 1];
} break;
default:
assert(0);
return;
}
assert(format != MTLPixelFormatInvalid);
id<MTLTexture> mtlTexture = nil;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
if (mem.data_depth > 1) {
/* 3D texture */
MTLTextureDescriptor *desc;
desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
width:mem.data_width
height:mem.data_height
mipmapped:NO];
desc.storageMode = storage_mode;
desc.usage = MTLTextureUsageShaderRead;
desc.textureType = MTLTextureType3D;
desc.depth = mem.data_depth;
VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
if (!mtlTexture) {
set_error("System is out of GPU memory");
return;
}
const size_t imageBytes = src_pitch * mem.data_height;
for (size_t d = 0; d < mem.data_depth; d++) {
const size_t offset = d * imageBytes;
[mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
mipmapLevel:0
slice:0
withBytes:(uint8_t *)mem.host_pointer + offset
bytesPerRow:src_pitch
bytesPerImage:0];
}
}
else if (mem.data_height > 0) {
/* 2D texture */
MTLTextureDescriptor *desc;
desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
width:mem.data_width
height:mem.data_height
mipmapped:NO];
desc.storageMode = storage_mode;
desc.usage = MTLTextureUsageShaderRead;
VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
if (!mtlTexture) {
set_error("System is out of GPU memory");
return;
}
[mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
mipmapLevel:0
withBytes:mem.host_pointer
bytesPerRow:src_pitch];
}
else {
assert(0);
/* 1D texture, using linear memory. */
}
mem.device_pointer = (device_ptr)mtlTexture;
mem.device_size = size;
stats.mem_alloc(size);
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem *mmem = new MetalMem;
metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
mmem->mem = &mem;
mmem->mtlTexture = mtlTexture;
/* Resize once */
const uint slot = mem.slot;
if (slot >= texture_info.size()) {
/* Allocate some slots in advance, to reduce the
 * number of re-allocations. */
texture_info.resize(slot + 128);
texture_slot_map.resize(slot + 128);
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
if (texture_bindings_2d) {
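/* Defer freeing the outgrown binding tables: an in-flight command buffer may still
 * reference them (see flush_delayed_free_list). */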
delayed_free_list.push_back(texture_bindings_2d);
delayed_free_list.push_back(texture_bindings_3d);
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
}
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
}
}
if (@available(macos 10.14, *)) {
/* Optimize the texture for GPU access. */
id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
[blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
[blitCommandEncoder endEncoding];
[commandBuffer commit];
}
/* Set Mapping and tag that we need to (re-)upload to device */
texture_slot_map[slot] = mtlTexture;
texture_info[slot] = mem.info;
need_texture_info = true;
texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
if (max_working_set_exceeded()) {
set_error("System is out of GPU memory");
}
}
void MetalDevice::tex_free(device_texture &mem)
{
if (metal_mem_map.count(&mem)) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
texture_slot_map[mem.slot] = nil;
if (mmem.mtlTexture) {
/* Free bindless texture. */
delayed_free_list.push_back(mmem.mtlTexture);
mmem.mtlTexture = nil;
}
erase_allocation(mem);
}
}
unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
{
return make_unique<MetalDeviceQueue>(this);
}
bool MetalDevice::should_use_graphics_interop()
{
/* METAL_WIP - provide fast interop */
return false;
}
void MetalDevice::flush_delayed_free_list()
{
/* Free any Metal buffers that may have been freed by the host while a command
 * buffer was being generated. This function should be called after each
 * command buffer completes. */
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
for (auto &it : delayed_free_list) {
[it release];
}
delayed_free_list.clear();
}
void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
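/* BVH2 layouts are built on the host by the generic Device implementation; only MetalRT
 * layouts are built here as Metal acceleration structures. */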
if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
Device::build_bvh(bvh, progress, refit);
return;
}
BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
bvh_metal->motion_blur = motion_blur;
if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
}
}
}
if (max_working_set_exceeded()) {
set_error("System is out of GPU memory");
}
}
CCL_NAMESPACE_END
#endif