Tweaks for threading schedule for Threadripper2 and EPYC

The idea is to have the main thread and job threads scheduled on CPU dies which have direct access to memory (those are NUMA nodes 0 and 2). We also do this for new EPYC CPUs, since their NUMA nodes 1 and 3 do have access, but only to the higher-range DDR slots. By preferring nodes 0 and 2 on EPYC we make sure users with partially filled DDR slots have fast memory access. One thing which is not really solved yet is localization of memory allocation: we do not guarantee that memory is allocated on the DDR slot closest to the NUMA node, and hope that the memory manager of the OS acts in our favor.
2018-11-27 18:21:43 +01:00
parent b3e2c69416
commit ce927e15e0
5 changed files with 111 additions and 0 deletions
--- a/source/blender/blenlib/BLI_threads.h
+++ b/source/blender/blenlib/BLI_threads.h
@@ -204,6 +204,12 @@ void BLI_thread_queue_nowait(ThreadQueue *queue);
 #  define BLI_thread_local_set(name, value) name = value
 #endif  /* defined(__APPLE__) */

+/* **** Special functions to help performance on crazy NUMA setups. **** */
+
+/* Put process/thread on a NUMA node with fast memory access (Threadripper2/EPYC only). */
+void BLI_thread_put_process_on_fast_node(void);
+void BLI_thread_put_thread_on_fast_node(void);
+
 #ifdef __cplusplus
 }
 #endif
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -30,6 +30,7 @@ set(INC
 	../../../intern/guardedalloc
 	../../../intern/atomic
 	../../../intern/eigen
+	../../../intern/numaapi/include
 	../../../extern/wcwidth
 )

--- a/source/blender/blenlib/intern/threads.c
+++ b/source/blender/blenlib/intern/threads.c
@@ -37,6 +37,7 @@

 #include "BLI_listbase.h"
 #include "BLI_gsqueue.h"
+#include "BLI_system.h"
 #include "BLI_task.h"
 #include "BLI_threads.h"

@@ -55,6 +56,7 @@
 #endif

 #include "atomic_ops.h"
+#include "numaapi.h"

 #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__)
 #  define USE_APPLE_OMP_FIX
@@ -126,6 +128,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_t mainid;
+static bool is_numa_available = false;
 static unsigned int thread_levels = 0;  /* threads can be invoked inside threads */
 static int num_threads_override = 0;

@@ -155,6 +158,9 @@ void BLI_threadapi_init(void)
 	mainid = pthread_self();

 	BLI_spin_init(&_malloc_lock);
+	if (numaAPI_Initialize() == NUMAAPI_SUCCESS) {
+		is_numa_available = true;
+	}
 }

 void BLI_threadapi_exit(void)
@@ -840,3 +846,98 @@ void BLI_threaded_malloc_end(void)
 		MEM_set_lock_callback(NULL, NULL);
 	}
 }
+
+/* **** Special functions to help performance on crazy NUMA setups. **** */
+
+/* Report whether the CPU brand string names a Threadripper 2990WX/2950X or
+ * any EPYC. Computed on the first call and cached in function statics with
+ * no locking; main() calls this (via the process variant) right after
+ * BLI_threadapi_init(), presumably before any job thread gets here. */
+static bool check_is_threadripper2_alike_topology(void)
+{
+	/* NOTE: We hope operating system does not support CPU hotswap to
+	 * a different brand. And that SMP of different types is also not
+	 * encouraged by the system. */
+	static bool is_initialized = false;
+	static bool is_threadripper2 = false;
+	if (is_initialized) {
+		return is_threadripper2;
+	}
+	is_initialized = true;
+	char *cpu_brand = BLI_cpu_brand_string();
+	if (cpu_brand == NULL) {
+		return false;
+	}
+	/* NOTE: Only these two Threadripper models are matched. NUMA node 0 also
+	 * exists on earlier Threadrippers, so widening the check to the whole
+	 * family should be harmless, but that is not what the code does today. */
+	if (strstr(cpu_brand, "Threadripper") &&
+	    (strstr(cpu_brand, "2990WX") || strstr(cpu_brand, "2950X")))
+	{
+		is_threadripper2 = true;
+	}
+	/* NOTE: While all dies of EPYC have a memory controller, only two of
+	 * them have access to the lower-indexed DDR slots. Those dies are the
+	 * same ones that carry the memory controller on Threadripper2.
+	 * It is rather likely that a reasonable amount of users do not fill all
+	 * their DDR slots, leaving only two dies connected to a DDR slot with
+	 * actual memory in it. No per-model check is done for EPYC. */
+	if (strstr(cpu_brand, "EPYC")) {
+		is_threadripper2 = true;
+	}
+	/* The brand string is a heap copy (guarded-alloc); free it once cached. */
+	MEM_freeN(cpu_brand);
+	return is_threadripper2;
+}
+
+/* Ask numaapi to run the whole process on NUMA node 0. Does nothing when
+ * numaAPI_Initialize() did not succeed (is_numa_available is false). */
+static void threadripper_put_process_on_fast_node(void)
+{
+	/* NOTE: Technically NUMA nodes 0 and 2 could both be put in the
+	 * affinity mask, which would let the OS schedule threads more flexibly,
+	 * possibly increasing overall performance when multiple apps are
+	 * crunching numbers.
+	 *
+	 * However, if the scene fits into memory adjacent to a single die we do
+	 * not want the OS to re-schedule the process to another die, since that
+	 * would put it further away from memory allocated for the .blend file. */
+	const int fast_node = 0;
+	/* NOTE: Even if NUMA is available in the API but disabled in BIOS we
+	 * still proceed. There is then a single node, so nothing visibly changes. */
+	if (is_numa_available) {
+		numaAPI_RunProcessOnNode(fast_node);
+	}
+}
+
+/* Ask numaapi to run the thread on NUMA node 0, the node the process is put
+ * on by threadripper_put_process_on_fast_node(). No-op without NUMA API. */
+static void threadripper_put_thread_on_fast_node(void)
+{
+	/* NOTE: Using nodes 0 and 2 would let the operating system balance
+	 * processes/threads for maximum performance when multiple apps are
+	 * running. On the other hand we probably want the same node as the
+	 * main thread, since that is where the memory of the .blend file is
+	 * likely to be allocated. The main thread is currently on node 0, so
+	 * the thread goes there too.
+	 * With NUMA disabled in BIOS there is a single node and this changes
+	 * nothing visible. */
+	const int main_thread_node = 0;
+	if (is_numa_available) {
+		numaAPI_RunThreadOnNode(main_thread_node);
+	}
+}
+
+void BLI_thread_put_process_on_fast_node(void)
+{
+	if (check_is_threadripper2_alike_topology()) {  /* Threadripper 2990WX/2950X or EPYC. */
+		threadripper_put_process_on_fast_node();  /* No-op when NUMA API is unavailable. */
+	}
+}
+
+void BLI_thread_put_thread_on_fast_node(void)
+{
+	if (check_is_threadripper2_alike_topology()) {  /* Threadripper 2990WX/2950X or EPYC. */
+		threadripper_put_thread_on_fast_node();  /* No-op when NUMA API is unavailable. */
+	}
+}
--- a/source/blender/windowmanager/intern/wm_jobs.c
+++ b/source/blender/windowmanager/intern/wm_jobs.c
@@ -334,6 +334,7 @@ static void *do_job_thread(void *job_v)
 {
 	wmJob *wm_job = job_v;

+	BLI_thread_put_thread_on_fast_node();
 	wm_job->startjob(wm_job->run_customdata, &wm_job->stop, &wm_job->do_update, &wm_job->progress);
 	wm_job->ready = true;

--- a/source/creator/creator.c
+++ b/source/creator/creator.c
@@ -52,6 +52,7 @@
 #include "BLI_callbacks.h"
 #include "BLI_string.h"
 #include "BLI_system.h"
+#include "BLI_threads.h"

 /* mostly init functions */
 #include "BKE_appdir.h"
@@ -364,6 +365,7 @@ int main(
 	BKE_appdir_program_path_init(argv[0]);

 	BLI_threadapi_init();
+	BLI_thread_put_process_on_fast_node();

 	DNA_sdna_current_init();