Alternative Upload geometry data in parallel to multiple GPUs using the "Multi-Device" #107552

Open
William Leeson wants to merge 137 commits from leesonw/blender-cluster:upload_changed into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
7 changed files with 156 additions and 6 deletions
Showing only changes of commit 72b918a9e2 - Show all commits

View File

@ -886,9 +886,11 @@ typedef enum {
} nvrtcResult;
typedef struct _nvrtcProgram* nvrtcProgram;
// FRL_CGR
/* Function types. */
/* NVTX range marker function types. The signatures follow nvToolsExt.h, where
 * both functions return an int (the nesting level of the range, negative on
 * error) rather than void. */
typedef int CUDAAPI tnvtxRangePushA(const char *msg);
typedef int CUDAAPI tnvtxRangePop(void);
// FRL_CGR
typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pStr);
typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pStr);
typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
@ -1130,6 +1132,8 @@ typedef nvrtcResult CUDAAPI tnvrtcGetLoweredName(nvrtcProgram prog, const char*
/* Function declarations. */
/* NVTX range marker entry points, exposed as function pointers. */
extern tnvtxRangePushA *nvtxRangePushA;
extern tnvtxRangePop *nvtxRangePop;
extern tcuGetErrorString *cuGetErrorString;
extern tcuGetErrorName *cuGetErrorName;
extern tcuInit *cuInit;
@ -1378,7 +1382,10 @@ enum {
enum {
CUEW_INIT_CUDA = 1,
CUEW_INIT_NVRTC = 2
// FRL_CGR
CUEW_INIT_NVRTC = 2,
CUEW_INIT_NVTX = 4,
// FRL_CGR
};
int cuewInit(cuuint32_t flags);

View File

@ -66,10 +66,22 @@ typedef void* DynamicLibrary;
_LIBRARY_FIND_CHECKED(nvrtc_lib, name)
#define NVRTC_LIBRARY_FIND(name) _LIBRARY_FIND(nvrtc_lib, name)
// FRL_CGR
/* Symbol lookup helpers for the NVTX library. Both variants must resolve
 * against nvtx_lib; looking NVTX symbols up in nvrtc_lib can never succeed. */
#define NVTX_LIBRARY_FIND_CHECKED(name) \
        _LIBRARY_FIND_CHECKED(nvtx_lib, name)
#define NVTX_LIBRARY_FIND(name) _LIBRARY_FIND(nvtx_lib, name)
// FRL_CGR
static DynamicLibrary cuda_lib;
static DynamicLibrary nvrtc_lib;
static DynamicLibrary nvtx_lib;
/* Function definitions. */
// FRL_CGR
/* NVTX range marker entry points. As file-scope pointers they start out NULL;
 * cuewNvtxInit() looks them up with NVTX_LIBRARY_FIND. */
tnvtxRangePushA *nvtxRangePushA;
tnvtxRangePop *nvtxRangePop;
// FRL_CGR
tcuGetErrorString *cuGetErrorString;
tcuGetErrorName *cuGetErrorName;
tcuInit *cuInit;
@ -611,6 +623,16 @@ static int cuewCudaInit(void) {
return result;
}
// FRL_CGR
/* Exit handler registered by cuewNvtxInit(): closes the NVTX library if it
 * was opened and clears the handle. */
static void cuewExitNvtx(void) {
  /* Test the NVTX handle itself; checking nvrtc_lib here could close a NULL
   * handle or skip unloading NVTX altogether. */
  if (nvtx_lib != NULL) {
    /* Ignore errors. */
    dynamic_library_close(nvtx_lib);
    nvtx_lib = NULL;
  }
}
// FRL_CGR
static void cuewExitNvrtc(void) {
if (nvrtc_lib != NULL) {
/* Ignore errors. */
@ -681,6 +703,56 @@ static int cuewNvrtcInit(void) {
return result;
}
// FRL_CGR
/* Load the NVTX library and resolve the range marker entry points.
 *
 * Only the first call does any work; later calls return the cached result of
 * that first attempt.
 *
 * Returns CUEW_SUCCESS when the library was opened and both entry points were
 * found, CUEW_ERROR_ATEXIT_FAILED when the exit handler could not be
 * registered, and CUEW_ERROR_OPEN_FAILED otherwise. */
static int cuewNvtxInit(void) {
  /* Library paths. */
#ifdef _WIN32
  /* Expected in c:/windows/system or similar, no path needed. */
  const char *nvtx_paths[] = {"nvToolsExt64_101_0.dll",
                              /* NOTE(review): the NVTX runtime is normally shipped
                               * under this name; kept as a fallback -- confirm. */
                              "nvToolsExt64_1.dll",
                              NULL};
#elif defined(__APPLE__)
  /* Default installation path. */
  const char *nvtx_paths[] = {"/usr/local/cuda/lib/libnvToolsExt.dylib", NULL};
#else
  const char *nvtx_paths[] = {"libnvToolsExt.so",
#  if defined(__x86_64__) || defined(_M_X64)
                              "/usr/local/cuda/lib64/libnvToolsExt.so",
#  else
                              "/usr/local/cuda/lib/libnvToolsExt.so",
#  endif
                              NULL};
#endif
  static int nvtx_initialized = 0;
  static int result = 0;
  int error;

  if (nvtx_initialized) {
    return result;
  }

  nvtx_initialized = 1;

  error = atexit(cuewExitNvtx);
  if (error) {
    result = CUEW_ERROR_ATEXIT_FAILED;
    return result;
  }

  /* Load library. */
  nvtx_lib = dynamic_library_open_find(nvtx_paths);

  if (nvtx_lib == NULL) {
    result = CUEW_ERROR_OPEN_FAILED;
    return result;
  }

  NVTX_LIBRARY_FIND(nvtxRangePushA);
  NVTX_LIBRARY_FIND(nvtxRangePop);

  /* Callers invoke these pointers directly, so only report success when both
   * were actually resolved. */
  if (nvtxRangePushA == NULL || nvtxRangePop == NULL) {
    result = CUEW_ERROR_OPEN_FAILED;
    return result;
  }

  result = CUEW_SUCCESS;

  return result;
}
// FRL_CGR
int cuewInit(cuuint32_t flags) {
int result = CUEW_SUCCESS;
@ -698,6 +770,14 @@ int cuewInit(cuuint32_t flags) {
return result;
}
}
// FRL_CGR
if(flags & CUEW_INIT_NVTX) {
result = cuewNvtxInit();
if(result != CUEW_SUCCESS) {
return result;
}
}
// FRL_CGR
return result;
}

View File

@ -27,7 +27,13 @@ bool device_cuda_init()
return result;
initialized = true;
int cuew_result = cuewInit(CUEW_INIT_CUDA);
// FRL_CGR
int flags = CUEW_INIT_CUDA;
#ifdef USE_SCOPED_MARKER
flags |= CUEW_INIT_NVTX;
#endif
int cuew_result = cuewInit(flags);
// FRL_CGR
if (cuew_result == CUEW_SUCCESS) {
VLOG_INFO << "CUEW initialization succeeded";
if (CUDADevice::have_precompiled_kernels()) {

View File

@ -987,7 +987,15 @@ int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int d
}
return value;
}
// FRL_CGR
/* Open a named NVTX range.
 *
 * nvtxRangePushA is a dynamically loaded function pointer, so it is checked
 * before use; when it is not loaded the marker is silently dropped. */
void CUDADevice::push_marker(const string name)
{
  if (nvtxRangePushA != nullptr) {
    nvtxRangePushA(name.c_str());
  }
}
/* Close the most recently opened NVTX range.
 *
 * nvtxRangePop is a dynamically loaded function pointer, so it is checked
 * before use; when it is not loaded this does nothing. */
void CUDADevice::pop_marker()
{
  if (nvtxRangePop != nullptr) {
    nvtxRangePop();
  }
}
// FRL_CGR
CCL_NAMESPACE_END
#endif

View File

@ -100,6 +100,8 @@ class CUDADevice : public GPUDevice {
int get_num_multiprocessors();
int get_max_num_threads_per_multiprocessor();
/* Profiling range markers; these override the virtual hooks of the base class. */
virtual void push_marker(const string name) override;
virtual void pop_marker() override;
protected:
bool get_device_attribute(CUdevice_attribute attribute, int *value);
int get_device_default_attribute(CUdevice_attribute attribute, int default_value);

View File

@ -216,6 +216,16 @@ class Device {
{
return 0;
}
// FRL_CGR END
// FRL_CGR BEGIN
/* Begin a named profiling range. The base implementation ignores the name and does nothing. */
virtual void push_marker(const string /*name*/) {}
/* End the current profiling range. The base implementation does nothing. */
virtual void pop_marker() {}
// FRL_CGR END
/* Called after kernel texture setup, and prior to integrator state setup. */
virtual void optimize_for_scene(Scene * /*scene*/)
@ -309,6 +319,31 @@ class Device {
static uint devices_initialized_mask;
};
// FRL_CGR
/* Scope guard for device profiling markers.
 *
 * The constructor pushes a marker on the device, labelled with `name` followed
 * by the device's `info.num`; the destructor pops it again. A null device is
 * accepted and turns the guard into a no-op. */
class ScopedMarker {
 private:
  Device *_device;

 public:
  ScopedMarker(Device *p_device, const string &name) : _device(p_device)
  {
    if (_device != nullptr) {
      _device->push_marker(name + std::to_string(_device->info.num));
    }
  }

  ~ScopedMarker()
  {
    if (_device != nullptr) {
      _device->pop_marker();
    }
  }

  /* Copying would pop the same marker twice. */
  ScopedMarker(const ScopedMarker &) = delete;
  ScopedMarker &operator=(const ScopedMarker &) = delete;
};
/* SCOPED_MARKER(device, msg) opens a profiling marker that lasts until the end
 * of the enclosing scope. The guard variable name includes the line number, so
 * several markers can share one scope and nested markers do not shadow each
 * other. Without USE_SCOPED_MARKER the macro only evaluates `device`, which
 * keeps that parameter from being reported as unused. */
#define USE_SCOPED_MARKER
#ifndef SCOPED_MARKER
#  ifdef USE_SCOPED_MARKER
#    define SCOPED_MARKER_CONCAT_IMPL(a, b) a##b
#    define SCOPED_MARKER_CONCAT(a, b) SCOPED_MARKER_CONCAT_IMPL(a, b)
#    define SCOPED_MARKER(device, msg) \
      ScopedMarker SCOPED_MARKER_CONCAT(scoped_marker_, __LINE__)(device, msg)
#  else
#    define SCOPED_MARKER(device, msg) ((void)(device))
#  endif
#endif
// FRL_CGR
/* Device, which is GPU, with some common functionality for GPU backends */
class GPUDevice : public Device {
protected:

View File

@ -702,6 +702,7 @@ void GeometryManager::device_update_attributes(Device *device,
Scene *scene,
Progress &progress)
{
SCOPED_MARKER(device, "GeometryManager::device_update_attributes");
progress.set_status("Updating Mesh", "Computing attributes");
/* gather per mesh requested attributes. as meshes may have multiple
@ -1042,11 +1043,12 @@ void GeometryManager::geom_calc_offset(Scene *scene, BVHLayout bvh_layout)
}
}
void GeometryManager::device_update_mesh(Device *,
void GeometryManager::device_update_mesh(Device *device,
DeviceScene *dscene,
Scene *scene,
Progress &progress)
{
SCOPED_MARKER(device, "GeometryManager::device_update_mesh");
/* Count. */
size_t vert_size = 0;
size_t tri_size = 0;
@ -1769,6 +1771,7 @@ void GeometryManager::device_update(Device *device,
Scene *scene,
Progress &progress)
{
SCOPED_MARKER(device, "GeometryManager::device_update");
if (!need_update())
return;
@ -1784,7 +1787,8 @@ void GeometryManager::device_update(Device *device,
scene->update_stats->geometry.times.add_entry({"device_update (normals)", time});
}
});
{
SCOPED_MARKER(device, "Update face and vertex normals");
foreach (Geometry *geom, scene->geometry) {
if (geom->is_modified()) {
if ((geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME)) {
@ -1820,6 +1824,7 @@ void GeometryManager::device_update(Device *device,
}
}
}
}
}
if (progress.get_cancel()) {
@ -1828,6 +1833,7 @@ void GeometryManager::device_update(Device *device,
/* Tessellate meshes that are using subdivision */
if (total_tess_needed) {
SCOPED_MARKER(device, "Tesselate");
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->geometry.times.add_entry(
@ -1877,6 +1883,7 @@ void GeometryManager::device_update(Device *device,
/* Update images needed for true displacement. */
bool old_need_object_flags_update = false;
if (true_displacement_used || curve_shadow_transparency_used) {
SCOPED_MARKER(device, "Update displacement images");
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->geometry.times.add_entry(
@ -1925,6 +1932,7 @@ void GeometryManager::device_update(Device *device,
size_t num_bvh = 0;
{
SCOPED_MARKER(device, "displace and shadow transp");
/* Copy constant data needed by shader evaluation. */
device->const_copy_to("data", &dscene->data, sizeof(dscene->data));
@ -1990,6 +1998,7 @@ void GeometryManager::device_update(Device *device,
bool need_update_scene_bvh = (scene->bvh == nullptr ||
(update_flags & (TRANSFORM_MODIFIED | VISIBILITY_MODIFIED)) != 0);
{
SCOPED_MARKER(device, "Build Object BVH");
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->geometry.times.add_entry({"device_update (build object BVHs)", time});
@ -2025,6 +2034,7 @@ void GeometryManager::device_update(Device *device,
/* Update objects. */
{
SCOPED_MARKER(device, "compute bounds");
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->geometry.times.add_entry({"device_update (compute bounds)", time});
@ -2040,6 +2050,7 @@ void GeometryManager::device_update(Device *device,
}
if (need_update_scene_bvh) {
SCOPED_MARKER(device, "update scene BVH");
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->geometry.times.add_entry({"device_update (build scene BVH)", time});
@ -2123,6 +2134,7 @@ void GeometryManager::device_update(Device *device,
void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool force_free)
{
SCOPED_MARKER(device, "GeometryManager::device_free");
dscene->bvh_nodes.free_if_need_realloc(force_free);
dscene->bvh_leaf_nodes.free_if_need_realloc(force_free);
dscene->object_node.free_if_need_realloc(force_free);