7 changed files with 156 additions and 6 deletions
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@ -886,9 +886,11 @@ typedef enum  {
 } nvrtcResult;

 typedef struct _nvrtcProgram* nvrtcProgram;
-
-
+// FRL_CGR
 /* Function types. */
+typedef int CUDAAPI tnvtxRangePushA(const char *msg); /* NVTX: returns nesting level, < 0 on error. */
+typedef int CUDAAPI tnvtxRangePop(void);              /* NVTX: returns nesting level, < 0 on error. */
+// FRL_CGR
 typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pStr);
 typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pStr);
 typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
@ -1130,6 +1132,8 @@ typedef nvrtcResult CUDAAPI tnvrtcGetLoweredName(nvrtcProgram prog, const char*


 /* Function declarations. */
+extern tnvtxRangePushA *nvtxRangePushA; /* NVTX entry point; presumably NULL until cuewInit(CUEW_INIT_NVTX) resolves it. */
+extern tnvtxRangePop *nvtxRangePop;     /* NVTX entry point; presumably NULL until cuewInit(CUEW_INIT_NVTX) resolves it. */
 extern tcuGetErrorString *cuGetErrorString;
 extern tcuGetErrorName *cuGetErrorName;
 extern tcuInit *cuInit;
@ -1378,7 +1382,10 @@ enum {

 enum {
 	CUEW_INIT_CUDA = 1,
-	CUEW_INIT_NVRTC = 2
+// FRL_CGR
+	CUEW_INIT_NVRTC = 2,
+	CUEW_INIT_NVTX = 4, /* Bit flag: cuewInit() tests it with & and then runs cuewNvtxInit(). */
+// FRL_CGR
 };

 int cuewInit(cuuint32_t flags);
--- a/extern/cuew/src/cuew.c
+++ b/extern/cuew/src/cuew.c
@ -66,10 +66,22 @@ typedef void* DynamicLibrary;
        _LIBRARY_FIND_CHECKED(nvrtc_lib, name)
 #define NVRTC_LIBRARY_FIND(name) _LIBRARY_FIND(nvrtc_lib, name)

+// FRL_CGR
+#define NVTX_LIBRARY_FIND_CHECKED(name) \
+        _LIBRARY_FIND_CHECKED(nvtx_lib, name) /* Look up in the NVTX handle, not the NVRTC one. */
+#define NVTX_LIBRARY_FIND(name) _LIBRARY_FIND(nvtx_lib, name)
+// FRL_CGR
+
 static DynamicLibrary cuda_lib;
 static DynamicLibrary nvrtc_lib;
+static DynamicLibrary nvtx_lib; /* Opened in cuewNvtxInit(), closed in cuewExitNvtx(). */

 /* Function definitions. */
+// FRL_CGR
+tnvtxRangePushA *nvtxRangePushA; /* Looked up by cuewNvtxInit() through NVTX_LIBRARY_FIND. */
+tnvtxRangePop *nvtxRangePop;     /* Looked up by cuewNvtxInit() through NVTX_LIBRARY_FIND. */
+// FRL_CGR
+
 tcuGetErrorString *cuGetErrorString;
 tcuGetErrorName *cuGetErrorName;
 tcuInit *cuInit;
@ -611,6 +623,16 @@ static int cuewCudaInit(void) {
  return result;
 }

+// FRL_CGR
+static void cuewExitNvtx(void) { /* atexit() handler registered by cuewNvtxInit(). */
+  if (nvtx_lib != NULL) { /* Test the NVTX handle itself, not nvrtc_lib. */
+    /* Ignore errors. */
+    dynamic_library_close(nvtx_lib);
+    nvtx_lib = NULL; /* Makes a repeated call a no-op. */
+  }
+}
+// FRL_CGR
+
 static void cuewExitNvrtc(void) {
  if (nvrtc_lib != NULL) {
    /*  Ignore errors. */
@ -681,6 +703,56 @@ static int cuewNvrtcInit(void) {
  return result;
 }

+// FRL_CGR
+/* Load the NVTX library and look up its entry points. Runs at most once; later calls return
+ * the cached result: CUEW_SUCCESS, CUEW_ERROR_ATEXIT_FAILED or CUEW_ERROR_OPEN_FAILED. */
+static int cuewNvtxInit(void) {
+  /* Candidate library paths, passed to dynamic_library_open_find(). */
+#ifdef _WIN32
+  /* Expected in c:/windows/system or similar, no path needed. */
+  const char *nvtx_paths[] = {"nvToolsExt64_101_0.dll", "nvToolsExt64_1.dll",
+                               NULL};
+#elif defined(__APPLE__)
+  /* Default installation path. */
+  const char *nvtx_paths[] = {"/usr/local/cuda/lib/libnvToolsExt.dylib", NULL};
+#else
+  const char *nvtx_paths[] = {"libnvToolsExt.so",
+#  if defined(__x86_64__) || defined(_M_X64)
+                               "/usr/local/cuda/lib64/libnvToolsExt.so",
+#  else
+                               "/usr/local/cuda/lib/libnvToolsExt.so",
+#  endif
+                               NULL};
+#endif
+  static int nvtx_initialized = 0;
+  static int result = 0;
+
+  if (nvtx_initialized) {
+    return result;
+  }
+
+  nvtx_initialized = 1;
+  /* Register the cleanup handler before loading, so the handle is closed at process exit. */
+  if (atexit(cuewExitNvtx)) {
+    result = CUEW_ERROR_ATEXIT_FAILED;
+    return result;
+  }
+
+  /* Load library. */
+  nvtx_lib = dynamic_library_open_find(nvtx_paths);
+  if (nvtx_lib == NULL) {
+    result = CUEW_ERROR_OPEN_FAILED;
+    return result;
+  }
+
+  /* Unchecked lookups: a missing symbol presumably leaves the pointer NULL - callers should test it. */
+  NVTX_LIBRARY_FIND(nvtxRangePushA);
+  NVTX_LIBRARY_FIND(nvtxRangePop);
+
+  result = CUEW_SUCCESS;
+  return result;
+}
+// FRL_CGR

 int cuewInit(cuuint32_t flags) {
  int result = CUEW_SUCCESS;
@ -698,6 +770,14 @@ int cuewInit(cuuint32_t flags) {
      return result;
    }
  }
+// FRL_CGR
+  if(flags & CUEW_INIT_NVTX) {
+    result = cuewNvtxInit();
+    if(result != CUEW_SUCCESS) {
+      return result;
+    }
+  }
+// FRL_CGR

  return result;
 }
--- a/intern/cycles/device/cuda/device.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@ -27,7 +27,13 @@ bool device_cuda_init()
    return result;

  initialized = true;
-  int cuew_result = cuewInit(CUEW_INIT_CUDA);
+  // FRL_CGR
+  int flags = CUEW_INIT_CUDA;
+#ifdef USE_SCOPED_MARKER
+  flags |= CUEW_INIT_NVTX;
+#endif
+  int cuew_result = cuewInit(flags);
+  // FRL_CGR
  if (cuew_result == CUEW_SUCCESS) {
    VLOG_INFO << "CUEW initialization succeeded";
    if (CUDADevice::have_precompiled_kernels()) {
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@ -987,7 +987,15 @@ int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int d
  }
  return value;
 }
+// FRL_CGR
+void CUDADevice::push_marker(const string name) { /* Opens an NVTX range labelled `name`. */
+  if (nvtxRangePushA != nullptr) { nvtxRangePushA(name.c_str()); } /* Pointer may be null if NVTX was not resolved. */
+}

+void CUDADevice::pop_marker() { /* Closes the most recently opened NVTX range. */
+  if (nvtxRangePop != nullptr) { nvtxRangePop(); } /* Pointer may be null if NVTX was not resolved. */
+}
+// FRL_CGR
 CCL_NAMESPACE_END

 #endif
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@ -100,6 +100,8 @@ class CUDADevice : public GPUDevice {
  int get_num_multiprocessors();
  int get_max_num_threads_per_multiprocessor();

+  virtual void push_marker(const string name) override; /* Forwards `name` to nvtxRangePushA. */
+  virtual void pop_marker() override;                   /* Forwards to nvtxRangePop. */
 protected:
  bool get_device_attribute(CUdevice_attribute attribute, int *value);
  int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -216,6 +216,16 @@ class Device {
  {
    return 0;
  }
+  // FRL_CGR END
+
+  // FRL_CGR BEGIN
+  virtual void push_marker(const string /*name*/)
+  { /* Does nothing by default; CUDADevice overrides it. */ }
+
+  virtual void pop_marker()
+  { /* Does nothing by default; CUDADevice overrides it. */ }
+
+  // FRL_CGR END

  /* Called after kernel texture setup, and prior to integrator state setup. */
  virtual void optimize_for_scene(Scene * /*scene*/)
@ -309,6 +319,31 @@ class Device {
  static uint devices_initialized_mask;
 };

+// FRL_CGR
+/* RAII helper: pushes the marker "<name><device->info.num>" on construction, pops it on destruction. */
+class ScopedMarker {
+  Device *_device; /* Not owned. May be null, in which case the marker is a no-op. */
+public:
+  ScopedMarker(Device *p_device, const string name) : _device(p_device) {
+    if (_device) { _device->push_marker(name + std::to_string(_device->info.num)); }
+  }
+  ~ScopedMarker() {
+    if (_device) { _device->pop_marker(); }
+  }
+  ScopedMarker(const ScopedMarker &) = delete; /* A copy would pop the same marker twice. */
+  ScopedMarker &operator=(const ScopedMarker &) = delete;
+};
+
+#define USE_SCOPED_MARKER /* NOTE(review): defined unconditionally, so the #else branch is currently dead - confirm intent. */
+#ifndef SCOPED_MARKER
+#   ifdef USE_SCOPED_MARKER
+#      define SCOPED_MARKER(device, msg) ScopedMarker scoped_marker(device, msg) /* Declares a local; use once per scope. */
+#   else
+#      define SCOPED_MARKER(device, msg) ((void)(device), (void)(msg)) /* No-op that still "uses" its arguments. */
+#   endif
+#endif
+// FRL_CGR
+
 /* Device, which is GPU, with some common functionality for GPU backends */
 class GPUDevice : public Device {
 protected:
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@ -702,6 +702,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                               Scene *scene,
                                               Progress &progress)
 {
+  SCOPED_MARKER(device, "GeometryManager::device_update_attributes");
  progress.set_status("Updating Mesh", "Computing attributes");

  /* gather per mesh requested attributes. as meshes may have multiple
@ -1042,11 +1043,12 @@ void GeometryManager::geom_calc_offset(Scene *scene, BVHLayout bvh_layout)
  }
 }

-void GeometryManager::device_update_mesh(Device *,
+void GeometryManager::device_update_mesh(Device *device,
                                         DeviceScene *dscene,
                                         Scene *scene,
                                         Progress &progress)
 {
+  SCOPED_MARKER(device, "GeometryManager::device_update_mesh");
  /* Count. */
  size_t vert_size = 0;
  size_t tri_size = 0;
@ -1769,6 +1771,7 @@ void GeometryManager::device_update(Device *device,
                                    Scene *scene,
                                    Progress &progress)
 {
+  SCOPED_MARKER(device, "GeometryManager::device_update");
  if (!need_update())
    return;

@ -1784,7 +1787,8 @@ void GeometryManager::device_update(Device *device,
        scene->update_stats->geometry.times.add_entry({"device_update (normals)", time});
      }
    });
-
+    {
+    SCOPED_MARKER(device, "Update face and vertex normals");
    foreach (Geometry *geom, scene->geometry) {
      if (geom->is_modified()) {
        if ((geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME)) {
@ -1820,6 +1824,7 @@ void GeometryManager::device_update(Device *device,
        }
      }
    }
+    }
  }

  if (progress.get_cancel()) {
@ -1828,6 +1833,7 @@ void GeometryManager::device_update(Device *device,

  /* Tessellate meshes that are using subdivision */
  if (total_tess_needed) {
+    SCOPED_MARKER(device, "Tesselate");
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
        scene->update_stats->geometry.times.add_entry(
@ -1877,6 +1883,7 @@ void GeometryManager::device_update(Device *device,
  /* Update images needed for true displacement. */
  bool old_need_object_flags_update = false;
  if (true_displacement_used || curve_shadow_transparency_used) {
+    SCOPED_MARKER(device, "Update displacement images");
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
        scene->update_stats->geometry.times.add_entry(
@ -1925,6 +1932,7 @@ void GeometryManager::device_update(Device *device,
  size_t num_bvh = 0;

  {
+    SCOPED_MARKER(device, "displace and shadow transp");
    /* Copy constant data needed by shader evaluation. */
    device->const_copy_to("data", &dscene->data, sizeof(dscene->data));

@ -1990,6 +1998,7 @@ void GeometryManager::device_update(Device *device,
  bool need_update_scene_bvh = (scene->bvh == nullptr ||
                                (update_flags & (TRANSFORM_MODIFIED | VISIBILITY_MODIFIED)) != 0);
  {
+    SCOPED_MARKER(device, "Build Object BVH");
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
        scene->update_stats->geometry.times.add_entry({"device_update (build object BVHs)", time});
@ -2025,6 +2034,7 @@ void GeometryManager::device_update(Device *device,

  /* Update objects. */
  {
+    SCOPED_MARKER(device, "compute bounds");
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
        scene->update_stats->geometry.times.add_entry({"device_update (compute bounds)", time});
@ -2040,6 +2050,7 @@ void GeometryManager::device_update(Device *device,
  }

  if (need_update_scene_bvh) {
+    SCOPED_MARKER(device, "update scene BVH");
    scoped_callback_timer timer([scene](double time) {
      if (scene->update_stats) {
        scene->update_stats->geometry.times.add_entry({"device_update (build scene BVH)", time});
@ -2123,6 +2134,7 @@ void GeometryManager::device_update(Device *device,

 void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool force_free)
 {
+  SCOPED_MARKER(device, "GeometryManager::device_free");
  dscene->bvh_nodes.free_if_need_realloc(force_free);
  dscene->bvh_leaf_nodes.free_if_need_realloc(force_free);
  dscene->object_node.free_if_need_realloc(force_free);