Fix #95477: Cycles Metal support for large textures #104579

Open
Michael Jones (Apple) wants to merge 5 commits from Michael-Jones/blender:LargeTextureSupport into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
7 changed files with 45 additions and 15 deletions
Showing only changes of commit b75f467c8f - Show all commits

View File

@ -182,7 +182,7 @@ class Device {
{
}
/* Return true if device is ready for rendering, or report status if not. */
/* Report status and return true if device is ready for rendering. */
virtual bool is_ready(string & /*status*/) const
{
return true;

View File

@ -74,6 +74,7 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
bool have_mtlbuffer_textures = false;
id<MTLArgumentEncoder> mtlTextureBufferArgEncoder = nil;
id<MTLBuffer> texture_buffers = nil;
std::vector<id<MTLBuffer>> texture_buffer_slot_map;

View File

@ -322,6 +322,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (pso_type == PSO_GENERIC || have_mtlbuffer_textures) {
/* Only enable MTLBuffer textures if needed as they add a small overhead. */
global_defines += "#define __KERNEL_METAL_BUFFER_TEXTURES__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@ -897,6 +902,17 @@ bool MetalDevice::is_ready(string &status) const
DEVICE_KERNEL_NUM);
return false;
}
if (int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
status = string_printf("%d kernels to optimize", num_requests);
}
else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
status = "Using optimized intersection kernels";
}
else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
status = "Using optimized kernels";
}
metal_printf("MetalDevice::is_ready(...) --> true\n");
return true;
}
@ -933,7 +949,7 @@ void MetalDevice::optimize_for_scene(Scene *scene)
}
if (specialize_in_background) {
if (!MetalDeviceKernels::any_specialization_happening_now()) {
if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
specialize_kernels_fn);
}
@ -1103,7 +1119,7 @@ void MetalDevice::tex_alloc(device_texture &mem)
if (mem.data_width > 16384 || mem.data_height > 16384) {
use_tex = false;
}
if (auto str = getenv("USE_TEX")) {
if (auto str = getenv("CYCLES_METAL_FORCE_MTLTEXTURE")) {
use_tex = atoi(str);
}
if (use_tex) {
@ -1189,12 +1205,12 @@ void MetalDevice::tex_alloc(device_texture &mem)
mmem->mtlBuffer = mtlBuffer;
}
else {
have_mtlbuffer_textures = true;
generic_alloc(mem);
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
mtlBuffer = metal_mem_map[&mem]->mtlBuffer;
mem.device_pointer = (device_ptr)mtlBuffer;
/* NOTE(review): leftover commented-out debug line (`mem.host_pointer = 0;`) — remove before landing. */
}
/* Resize once */

View File

@ -104,7 +104,7 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
bool any_specialization_happening_now();
int num_incomplete_specialization_requests();
int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_type);
bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
bool load(MetalDevice *device, MetalPipelineType pso_type);

View File

@ -857,16 +857,15 @@ void MetalDeviceKernels::wait_for_all()
}
}
bool MetalDeviceKernels::any_specialization_happening_now()
int MetalDeviceKernels::num_incomplete_specialization_requests()
{
/* Return the total number of incomplete specialization requests across all
 * ShaderCaches (typically at most 1 cache will have any). */
int total = 0;
for (int i = 0; i < g_shaderCacheCount; i++) {
if (g_shaderCache[i].second->incomplete_specialization_requests > 0) {
return true;
}
total += g_shaderCache[i].second->incomplete_specialization_requests;
}
return false;
return total;
}
int MetalDeviceKernels::get_loaded_kernel_count(MetalDevice const *device,

View File

@ -18,10 +18,7 @@ class MetalKernelContext {
: launch_params_metal(_launch_params_metal)
{}
/* texture fetch adapter functions */
typedef uint64_t ccl_gpu_tex_object_2D;
typedef uint64_t ccl_gpu_tex_object_3D;
#ifdef __KERNEL_METAL_BUFFER_TEXTURES__
template<typename T> ccl_device_forceinline T tex_fetch(device void* data, int64_t index)
{
return reinterpret_cast<ccl_global T *>(data)[index];
@ -162,13 +159,18 @@ class MetalKernelContext {
u[3] = (1.0f / 6.0f) * t * t * t; \
} \
(void)0
#endif /* __KERNEL_METAL_BUFFER_TEXTURES__ */
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int tex_id, float x, float y)
{
device const TextureInfo &info = kernel_data_fetch(texture_info, tex_id);
const uint tid(info.data);
const uint sid(info.data >> 32);
#ifndef __KERNEL_METAL_BUFFER_TEXTURES__
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], make_float2(x,y));
#else
if (sid < 256) {
return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], make_float2(x,y));
}
@ -216,6 +218,7 @@ class MetalKernelContext {
}
return r;
}
#endif
#endif
}
@ -225,6 +228,10 @@ class MetalKernelContext {
const uint tid(info.data);
const uint sid(info.data >> 32);
#ifndef __KERNEL_METAL_BUFFER_TEXTURES__
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], P);
#else
if (sid < 256) {
return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], P);
}
@ -323,6 +330,7 @@ class MetalKernelContext {
return r;
}
#endif
#endif
}
// clang-format on

View File

@ -706,6 +706,12 @@ void Session::update_status_time(bool show_pause, bool show_done)
string_printf("Sample %d/%d", current_sample, num_samples));
}
/* Append any device-specific status (such as background kernel optimization) */
string device_status;
if (device->is_ready(device_status) && !device_status.empty()) {
substatus += string_printf(" (%s)", device_status.c_str());
}
/* TODO(sergey): Denoising status from the path trace. */
if (show_pause) {