2013-10-12 14:08:59 +00:00
|
|
|
/*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version 2
|
|
|
|
* of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software Foundation,
|
|
|
|
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
*/
|
|
|
|
|
2021-01-04 13:10:22 +11:00
|
|
|
/* Use a define instead of `#pragma once` because of `bmesh_iterators_inline.h` */
|
2013-10-12 14:08:59 +00:00
|
|
|
#ifndef __BLI_TASK_H__
|
2018-01-08 11:35:48 +01:00
|
|
|
#define __BLI_TASK_H__
|
|
|
|
|
|
|
|
#include <string.h> /* for memset() */
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2016-05-13 11:03:04 +02:00
|
|
|
struct ListBase;
|
|
|
|
|
2019-02-18 08:08:12 +11:00
|
|
|
/** \file
|
|
|
|
* \ingroup bli
|
2013-10-12 14:08:59 +00:00
|
|
|
*/
|
|
|
|
|
2020-05-08 18:16:39 +02:00
|
|
|
#include "BLI_threads.h"
|
|
|
|
#include "BLI_utildefines.h"
|
|
|
|
|
2013-10-12 14:08:59 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
2017-11-23 21:14:43 +01:00
|
|
|
struct BLI_mempool;
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/** \name Task Scheduler
|
2018-06-01 18:19:39 +02:00
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* Central scheduler that holds running threads ready to execute tasks.
|
|
|
|
* A single queue holds the task from all pools.
|
2013-10-12 14:08:59 +00:00
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* Initialize/exit must be called before/after any task pools are created/freed, and must
|
|
|
|
* be called from the main thread. All other scheduler and pool functions are thread-safe.
|
|
|
|
* \{ */
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2020-04-30 07:59:23 +02:00
|
|
|
void BLI_task_scheduler_init(void);
|
|
|
|
void BLI_task_scheduler_exit(void);
|
|
|
|
int BLI_task_scheduler_num_threads(void);
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/** \} */
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/** \name Task Pool
|
2013-10-12 14:08:59 +00:00
|
|
|
*
|
2020-05-09 17:01:40 +02:00
|
|
|
* Pool of tasks that will be executed by the central task scheduler. For each
|
2013-10-12 14:08:59 +00:00
|
|
|
* pool, we can wait for all tasks to be done, or cancel them before they are
|
|
|
|
* done.
|
|
|
|
*
|
|
|
|
* Running tasks may spawn new tasks.
|
|
|
|
*
|
|
|
|
* Pools may be nested, i.e. a thread running a task can create another task
|
|
|
|
* pool with smaller tasks. When other threads are busy they will continue
|
|
|
|
* working on their own tasks, if not they will join in, no new threads will
|
|
|
|
* be launched.
|
2021-12-09 20:01:44 +11:00
|
|
|
* \{ */
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-10-19 18:33:42 +11:00
|
|
|
/** Priority passed to the `BLI_task_pool_create*` functions. */
typedef enum eTaskPriority {
  TASK_PRIORITY_LOW,
  TASK_PRIORITY_HIGH,
} eTaskPriority;
|
2013-10-12 14:08:59 +00:00
|
|
|
|
|
|
|
typedef struct TaskPool TaskPool;
|
2020-04-30 07:59:23 +02:00
|
|
|
typedef void (*TaskRunFunction)(TaskPool *__restrict pool, void *taskdata);
|
2020-04-21 15:57:51 +02:00
|
|
|
typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata);
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Regular task pool that immediately starts executing tasks as soon as they
|
|
|
|
* are pushed, either on the current or another thread.
|
|
|
|
*/
|
2021-10-19 18:33:42 +11:00
|
|
|
TaskPool *BLI_task_pool_create(void *userdata, eTaskPriority priority);
|
2020-05-09 17:01:40 +02:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Background: always run tasks in a background thread, never immediately
|
|
|
|
* execute them. For running background jobs.
|
|
|
|
*/
|
2021-10-19 18:33:42 +11:00
|
|
|
TaskPool *BLI_task_pool_create_background(void *userdata, eTaskPriority priority);
|
2020-05-09 17:01:40 +02:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Background Serial: run tasks one after the other in the background,
|
|
|
|
* without parallelization between the tasks.
|
|
|
|
*/
|
2021-10-19 18:33:42 +11:00
|
|
|
TaskPool *BLI_task_pool_create_background_serial(void *userdata, eTaskPriority priority);
|
2020-05-09 17:01:40 +02:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Suspended: don't execute tasks until work_and_wait is called. This is slower
|
2020-05-09 17:01:40 +02:00
|
|
|
* as threads can't immediately start working. But it can be used if the data
|
2021-12-09 20:01:44 +11:00
|
|
|
* structures the threads operate on are not fully initialized until all tasks are created.
|
|
|
|
*/
|
2021-10-19 18:33:42 +11:00
|
|
|
TaskPool *BLI_task_pool_create_suspended(void *userdata, eTaskPriority priority);
|
2020-05-09 17:01:40 +02:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* No threads: immediately executes tasks on the same thread. For debugging.
|
|
|
|
*/
|
2020-04-30 07:59:23 +02:00
|
|
|
TaskPool *BLI_task_pool_create_no_threads(void *userdata);
|
2020-05-09 17:01:40 +02:00
|
|
|
|
2013-10-12 14:08:59 +00:00
|
|
|
void BLI_task_pool_free(TaskPool *pool);
|
|
|
|
|
|
|
|
void BLI_task_pool_push(TaskPool *pool,
|
|
|
|
TaskRunFunction run,
|
2016-05-10 09:55:58 +02:00
|
|
|
void *taskdata,
|
|
|
|
bool free_taskdata,
|
2020-04-09 15:51:44 +02:00
|
|
|
TaskFreeFunction freedata);
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Work and wait until all tasks are done.
|
|
|
|
*/
|
2013-10-12 14:08:59 +00:00
|
|
|
void BLI_task_pool_work_and_wait(TaskPool *pool);
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Cancel all tasks, keep worker threads running.
|
|
|
|
*/
|
2013-10-12 14:08:59 +00:00
|
|
|
void BLI_task_pool_cancel(TaskPool *pool);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* For worker threads, test if the current task pool is canceled. This function may
|
2021-01-15 17:21:14 +01:00
|
|
|
* only be called from worker threads and pool must be the task pool that the
|
2021-12-09 20:01:44 +11:00
|
|
|
* thread is currently executing a task from.
|
|
|
|
*/
|
2021-01-15 17:21:14 +01:00
|
|
|
bool BLI_task_pool_current_canceled(TaskPool *pool);
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Optional `userdata` pointer to pass along to run function.
|
|
|
|
*/
|
2020-04-21 15:36:35 +02:00
|
|
|
void *BLI_task_pool_user_data(TaskPool *pool);
|
2013-10-12 14:08:59 +00:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Optional mutex to use from run function.
|
|
|
|
*/
|
2013-10-12 14:08:59 +00:00
|
|
|
ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/** \} */
|
2018-01-05 16:33:13 +01:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/** \name Parallel for Routines
|
|
|
|
* \{ */
|
|
|
|
|
|
|
|
/**
 * Per-thread specific data passed to the callback.
 */
typedef struct TaskParallelTLS {
  /**
   * Copy of the user-specified chunk, which is copied from the original chunk to all worker
   * threads. This is similar to OpenMP's `firstprivate`.
   */
  void *userdata_chunk;
} TaskParallelTLS;
|
|
|
|
|
2018-01-10 12:49:51 +01:00
|
|
|
typedef void (*TaskParallelRangeFunc)(void *__restrict userdata,
|
2022-01-07 11:38:08 +11:00
|
|
|
int iter,
|
2019-07-30 14:56:47 +02:00
|
|
|
const TaskParallelTLS *__restrict tls);
|
2021-07-15 14:43:25 +10:00
|
|
|
|
|
|
|
typedef void (*TaskParallelInitFunc)(const void *__restrict userdata, void *__restrict chunk);
|
|
|
|
|
2020-04-17 10:00:54 +02:00
|
|
|
/**
 * Callback type used to join one user data chunk into another,
 * see the `func_reduce` member of #TaskParallelSettings.
 *
 * NOTE(review): presumably \a chunk is merged into \a chunk_join -- confirm against callers.
 */
typedef void (*TaskParallelReduceFunc)(const void *__restrict userdata,
                                       void *__restrict chunk_join,
                                       void *__restrict chunk);
|
|
|
|
|
|
|
|
typedef void (*TaskParallelFreeFunc)(const void *__restrict userdata, void *__restrict chunk);
|
2014-10-22 11:56:52 +02:00
|
|
|
|
2019-07-30 14:56:47 +02:00
|
|
|
typedef struct TaskParallelSettings {
|
2018-01-08 11:35:48 +01:00
|
|
|
/* Whether caller allows to do threading of the particular range.
|
|
|
|
* Usually set by some equation, which forces threading off when threading
|
|
|
|
* overhead becomes higher than speed benefit.
|
|
|
|
* BLI_task_parallel_range() by itself will always use threading when range
|
|
|
|
* is higher than a chunk size. As in, threading will always be performed.
|
|
|
|
*/
|
|
|
|
bool use_threading;
|
|
|
|
/* Each instance of looping chunks will get a copy of this data
|
|
|
|
* (similar to OpenMP's firstprivate).
|
|
|
|
*/
|
|
|
|
void *userdata_chunk; /* Pointer to actual data. */
|
2021-06-24 15:56:58 +10:00
|
|
|
size_t userdata_chunk_size; /* Size of that data. */
|
2018-01-08 11:35:48 +01:00
|
|
|
/* Function called from calling thread once whole range have been
|
|
|
|
* processed.
|
|
|
|
*/
|
2021-07-15 14:43:25 +10:00
|
|
|
/* Function called to initialize user data chunk,
|
|
|
|
* typically to allocate data, freed by `func_free`.
|
|
|
|
*/
|
|
|
|
TaskParallelInitFunc func_init;
|
2020-04-17 10:00:54 +02:00
|
|
|
/* Function called to join user data chunk into another, to reduce
|
|
|
|
* the result to the original userdata_chunk memory.
|
|
|
|
* The reduce functions should have no side effects, so that they
|
|
|
|
* can be run on any thread. */
|
|
|
|
TaskParallelReduceFunc func_reduce;
|
|
|
|
/* Function called to free data created by TaskParallelRangeFunc. */
|
|
|
|
TaskParallelFreeFunc func_free;
|
2018-01-08 12:08:18 +01:00
|
|
|
/* Minimum allowed number of range iterators to be handled by a single
|
|
|
|
* thread. This allows to achieve following:
|
|
|
|
* - Reduce amount of threading overhead.
|
|
|
|
* - Partially occupy thread pool with ranges which are computationally
|
|
|
|
* expensive, but which are smaller than amount of available threads.
|
|
|
|
* For example, it's possible to multi-thread [0 .. 64] range into 4
|
|
|
|
* thread which will be doing 16 iterators each.
|
|
|
|
* This is a preferred way to tell scheduler when to start threading than
|
|
|
|
* having a global use_threading switch based on just range size.
|
|
|
|
*/
|
|
|
|
int min_iter_per_thread;
|
2019-07-30 14:56:47 +02:00
|
|
|
} TaskParallelSettings;
|
2018-01-08 11:35:48 +01:00
|
|
|
|
2019-07-30 14:56:47 +02:00
|
|
|
BLI_INLINE void BLI_parallel_range_settings_defaults(TaskParallelSettings *settings);
|
2018-01-08 11:35:48 +01:00
|
|
|
|
2022-01-07 11:38:08 +11:00
|
|
|
void BLI_task_parallel_range(int start,
|
|
|
|
int stop,
|
2016-05-16 17:15:18 +02:00
|
|
|
void *userdata,
|
2018-01-05 16:33:13 +01:00
|
|
|
TaskParallelRangeFunc func,
|
2020-04-30 07:59:23 +02:00
|
|
|
const TaskParallelSettings *settings);
|
BLI_task: Add pooled threaded index range iterator, Take II.
This code allows to push a set of different operations all based on
iterations over a range of indices, and then process them all at once
over multiple threads.
This commit also adds unit tests for both old un-pooled, and new pooled
task_parallel_range family of functions, as well as some basic
performances tests.
This is mainly interesting for relatively low amount of individual
tasks, as expected.
E.g. performance tests on a 32 threads machine, for a set of 10
different tasks, shows following improvements when using pooled version
instead of ten sequential calls to BLI_task_parallel_range():
| Num Items | Sequential | Pooled | Speed-up |
| --------- | ---------- | ------- | -------- |
| 10K | 365 us | 138 us | 2.5 x |
| 100K | 877 us | 530 us | 1.66 x |
| 1000K | 5521 us | 4625 us | 1.25 x |
Differential Revision: https://developer.blender.org/D6189
Note: Compared to previous commit yesterday, this reworks atomic handling in
parallel iter code, and fixes a dummy double-free bug.
Now we should only use the two critical values for synchronization from
atomic calls results, which is the proper way to do things.
Reading a value after an atomic operation does not guarantee you will
get the latest value in all cases (especially on Windows release builds
it seems).
2019-11-26 14:26:47 +01:00
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* This data is shared between all tasks, its access needs thread lock or similar protection.
|
BLI_task: Add pooled threaded index range iterator, Take II.
This code allows to push a set of different operations all based on
iterations over a range of indices, and then process them all at once
over multiple threads.
This commit also adds unit tests for both old un-pooled, and new pooled
task_parallel_range family of functions, as well as some basic
performances tests.
This is mainly interesting for relatively low amount of individual
tasks, as expected.
E.g. performance tests on a 32 threads machine, for a set of 10
different tasks, shows following improvements when using pooled version
instead of ten sequential calls to BLI_task_parallel_range():
| Num Items | Sequential | Pooled | Speed-up |
| --------- | ---------- | ------- | -------- |
| 10K | 365 us | 138 us | 2.5 x |
| 100K | 877 us | 530 us | 1.66 x |
| 1000K | 5521 us | 4625 us | 1.25 x |
Differential Revision: https://developer.blender.org/D6189
Note: Compared to previous commit yesterday, this reworks atomic handling in
parallel iter code, and fixes a dummy double-free bug.
Now we should only use the two critical values for synchronization from
atomic calls results, which is the proper way to do things.
Reading a value after an atomic operation does not guarantee you will
get the latest value in all cases (especially on Windows release builds
it seems).
2019-11-26 14:26:47 +01:00
|
|
|
*/
|
BLI_task: Add new generic `BLI_task_parallel_iterator()`.
This new function is part of the 'parallel for loops' functions. It
takes an iterator callback to generate items to be processed, in
addition to the usual 'process' func callback.
This allows to use common code from BLI_task for a wide range of custom
iteratiors, whithout having to re-invent the wheel of the whole tasks &
data chuncks handling.
This supports all settings features from `BLI_task_parallel_range()`,
including dynamic and static (if total number of items is knwon)
scheduling, TLS data and its finalize callback, etc.
One question here is whether we should provide usercode with a spinlock
by default, or enforce it to always handle its own sync mechanism.
I kept it, since imho it will be needed very often, and generating one
is pretty cheap even if unused...
----------
Additionaly, this commit converts (currently unused)
`BLI_task_parallel_listbase()` to use that generic code. This was done
mostly as proof of concept, but performance-wise it shows some
interesting data, roughly:
- Very light processing (that should not be threaded anyway) is several
times slower, which is expected due to more overhead in loop management
code.
- Heavier processing can be up to 10% quicker (probably thanks to the
switch from dynamic to static scheduling, which reduces a lot locking
to fill-in the per-tasks chunks of data). Similar speed-up in
non-threaded case comes as a surprise though, not sure what can
explain that.
While this conversion is not really needed, imho we should keep it
(instead of existing code for that function), it's easier to have
complex handling logic in as few places as possible, for maintaining and
for improving it.
Note: That work was initially done to allow for D5372 to be possible... Unfortunately that one proved to be not better than orig code on performances point of view.
Reviewed By: sergey
Differential Revision: https://developer.blender.org/D5371
2019-10-30 12:23:45 +01:00
|
|
|
typedef struct TaskParallelIteratorStateShared {
|
|
|
|
/* Maximum amount of items to acquire at once. */
|
|
|
|
int chunk_size;
|
|
|
|
/* Next item to be acquired. */
|
|
|
|
void *next_item;
|
|
|
|
/* Index of the next item to be acquired. */
|
|
|
|
int next_index;
|
|
|
|
/* Indicates that end of iteration has been reached. */
|
|
|
|
bool is_finished;
|
|
|
|
/* Helper lock to protect access to this data in iterator getter callback,
|
|
|
|
* can be ignored (if the callback implements its own protection system, using atomics e.g.).
|
|
|
|
* Will be NULL when iterator is actually processed in a single thread. */
|
|
|
|
SpinLock *spin_lock;
|
|
|
|
} TaskParallelIteratorStateShared;
|
|
|
|
|
|
|
|
typedef void (*TaskParallelIteratorIterFunc)(void *__restrict userdata,
|
|
|
|
const TaskParallelTLS *__restrict tls,
|
|
|
|
void **r_next_item,
|
|
|
|
int *r_next_index,
|
|
|
|
bool *r_do_abort);
|
|
|
|
|
|
|
|
typedef void (*TaskParallelIteratorFunc)(void *__restrict userdata,
|
|
|
|
void *item,
|
|
|
|
int index,
|
|
|
|
const TaskParallelTLS *__restrict tls);
|
|
|
|
|
2021-12-10 21:42:06 +11:00
|
|
|
/**
|
|
|
|
* This function allows to parallelize for loops using a generic iterator.
|
|
|
|
*
|
|
|
|
* \param userdata: Common userdata passed to all instances of \a func.
|
|
|
|
* \param iter_func: Callback function used to generate chunks of items.
|
|
|
|
* \param init_item: The initial item, if necessary (may be NULL if unused).
|
|
|
|
* \param init_index: The initial index.
|
|
|
|
* \param tot_items: The total amount of items to iterate over
|
|
|
|
* (if unknown, set it to a negative number).
|
|
|
|
* \param func: Callback function.
|
|
|
|
* \param settings: See public API doc of TaskParallelSettings for description of all settings.
|
|
|
|
*
|
|
|
|
* \note Static scheduling is only available when \a tot_items is >= 0.
|
|
|
|
*/
|
BLI_task: Add new generic `BLI_task_parallel_iterator()`.
This new function is part of the 'parallel for loops' functions. It
takes an iterator callback to generate items to be processed, in
addition to the usual 'process' func callback.
This allows to use common code from BLI_task for a wide range of custom
iteratiors, whithout having to re-invent the wheel of the whole tasks &
data chuncks handling.
This supports all settings features from `BLI_task_parallel_range()`,
including dynamic and static (if total number of items is knwon)
scheduling, TLS data and its finalize callback, etc.
One question here is whether we should provide usercode with a spinlock
by default, or enforce it to always handle its own sync mechanism.
I kept it, since imho it will be needed very often, and generating one
is pretty cheap even if unused...
----------
Additionaly, this commit converts (currently unused)
`BLI_task_parallel_listbase()` to use that generic code. This was done
mostly as proof of concept, but performance-wise it shows some
interesting data, roughly:
- Very light processing (that should not be threaded anyway) is several
times slower, which is expected due to more overhead in loop management
code.
- Heavier processing can be up to 10% quicker (probably thanks to the
switch from dynamic to static scheduling, which reduces a lot locking
to fill-in the per-tasks chunks of data). Similar speed-up in
non-threaded case comes as a surprise though, not sure what can
explain that.
While this conversion is not really needed, imho we should keep it
(instead of existing code for that function), it's easier to have
complex handling logic in as few places as possible, for maintaining and
for improving it.
Note: That work was initially done to allow for D5372 to be possible... Unfortunately that one proved to be not better than orig code on performances point of view.
Reviewed By: sergey
Differential Revision: https://developer.blender.org/D5371
2019-10-30 12:23:45 +01:00
|
|
|
void BLI_task_parallel_iterator(void *userdata,
|
|
|
|
TaskParallelIteratorIterFunc iter_func,
|
|
|
|
void *init_item,
|
2022-01-07 11:38:08 +11:00
|
|
|
int init_index,
|
|
|
|
int tot_items,
|
BLI_task: Add new generic `BLI_task_parallel_iterator()`.
This new function is part of the 'parallel for loops' functions. It
takes an iterator callback to generate items to be processed, in
addition to the usual 'process' func callback.
This allows to use common code from BLI_task for a wide range of custom
iteratiors, whithout having to re-invent the wheel of the whole tasks &
data chuncks handling.
This supports all settings features from `BLI_task_parallel_range()`,
including dynamic and static (if total number of items is knwon)
scheduling, TLS data and its finalize callback, etc.
One question here is whether we should provide usercode with a spinlock
by default, or enforce it to always handle its own sync mechanism.
I kept it, since imho it will be needed very often, and generating one
is pretty cheap even if unused...
----------
Additionaly, this commit converts (currently unused)
`BLI_task_parallel_listbase()` to use that generic code. This was done
mostly as proof of concept, but performance-wise it shows some
interesting data, roughly:
- Very light processing (that should not be threaded anyway) is several
times slower, which is expected due to more overhead in loop management
code.
- Heavier processing can be up to 10% quicker (probably thanks to the
switch from dynamic to static scheduling, which reduces a lot locking
to fill-in the per-tasks chunks of data). Similar speed-up in
non-threaded case comes as a surprise though, not sure what can
explain that.
While this conversion is not really needed, imho we should keep it
(instead of existing code for that function), it's easier to have
complex handling logic in as few places as possible, for maintaining and
for improving it.
Note: That work was initially done to allow for D5372 to be possible... Unfortunately that one proved to be not better than orig code on performances point of view.
Reviewed By: sergey
Differential Revision: https://developer.blender.org/D5371
2019-10-30 12:23:45 +01:00
|
|
|
TaskParallelIteratorFunc func,
|
|
|
|
const TaskParallelSettings *settings);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* This function allows to parallelize for loops over ListBase items.
|
|
|
|
*
|
|
|
|
* \param listbase: The double linked list to loop over.
|
|
|
|
* \param userdata: Common userdata passed to all instances of \a func.
|
|
|
|
* \param func: Callback function.
|
|
|
|
* \param settings: See public API doc of ParallelRangeSettings for description of all settings.
|
|
|
|
*
|
|
|
|
* \note There is no static scheduling here,
|
|
|
|
* since it would need another full loop over items to count them.
|
|
|
|
*/
|
2016-05-13 11:03:04 +02:00
|
|
|
void BLI_task_parallel_listbase(struct ListBase *listbase,
|
|
|
|
void *userdata,
|
BLI_task: Add new generic `BLI_task_parallel_iterator()`.
This new function is part of the 'parallel for loops' functions. It
takes an iterator callback to generate items to be processed, in
addition to the usual 'process' func callback.
This allows to use common code from BLI_task for a wide range of custom
iteratiors, whithout having to re-invent the wheel of the whole tasks &
data chuncks handling.
This supports all settings features from `BLI_task_parallel_range()`,
including dynamic and static (if total number of items is knwon)
scheduling, TLS data and its finalize callback, etc.
One question here is whether we should provide usercode with a spinlock
by default, or enforce it to always handle its own sync mechanism.
I kept it, since imho it will be needed very often, and generating one
is pretty cheap even if unused...
----------
Additionaly, this commit converts (currently unused)
`BLI_task_parallel_listbase()` to use that generic code. This was done
mostly as proof of concept, but performance-wise it shows some
interesting data, roughly:
- Very light processing (that should not be threaded anyway) is several
times slower, which is expected due to more overhead in loop management
code.
- Heavier processing can be up to 10% quicker (probably thanks to the
switch from dynamic to static scheduling, which reduces a lot locking
to fill-in the per-tasks chunks of data). Similar speed-up in
non-threaded case comes as a surprise though, not sure what can
explain that.
While this conversion is not really needed, imho we should keep it
(instead of existing code for that function), it's easier to have
complex handling logic in as few places as possible, for maintaining and
for improving it.
Note: That work was initially done to allow for D5372 to be possible... Unfortunately that one proved to be not better than orig code on performances point of view.
Reviewed By: sergey
Differential Revision: https://developer.blender.org/D5371
2019-10-30 12:23:45 +01:00
|
|
|
TaskParallelIteratorFunc func,
|
|
|
|
const TaskParallelSettings *settings);
|
2016-05-13 11:03:04 +02:00
|
|
|
|
2017-11-23 21:14:43 +01:00
|
|
|
typedef struct MempoolIterData MempoolIterData;
|
2021-06-09 22:49:45 +10:00
|
|
|
|
|
|
|
typedef void (*TaskParallelMempoolFunc)(void *userdata,
|
|
|
|
MempoolIterData *iter,
|
|
|
|
const TaskParallelTLS *__restrict tls);
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* This function allows to parallelize for loops over Mempool items.
|
|
|
|
*
|
|
|
|
* \param mempool: The iterable BLI_mempool to loop over.
|
|
|
|
* \param userdata: Common userdata passed to all instances of \a func.
|
|
|
|
* \param func: Callback function.
|
|
|
|
* \param settings: See public API doc of TaskParallelSettings for description of all settings.
|
|
|
|
*
|
|
|
|
* \note There is no static scheduling here.
|
|
|
|
*/
|
2017-11-23 21:14:43 +01:00
|
|
|
void BLI_task_parallel_mempool(struct BLI_mempool *mempool,
|
|
|
|
void *userdata,
|
|
|
|
TaskParallelMempoolFunc func,
|
2021-06-09 22:49:45 +10:00
|
|
|
const TaskParallelSettings *settings);
|
2017-11-23 21:14:43 +01:00
|
|
|
|
2021-12-20 19:01:14 +11:00
|
|
|
/** TODO(sergey): Think of a better place for this. */
|
2019-07-30 14:56:47 +02:00
|
|
|
BLI_INLINE void BLI_parallel_range_settings_defaults(TaskParallelSettings *settings)
|
2018-01-08 11:35:48 +01:00
|
|
|
{
|
|
|
|
memset(settings, 0, sizeof(*settings));
|
|
|
|
settings->use_threading = true;
|
2019-07-30 14:36:59 +02:00
|
|
|
/* Use default heuristic to define actual chunk size. */
|
|
|
|
settings->min_iter_per_thread = 0;
|
2018-01-08 11:35:48 +01:00
|
|
|
}
|
|
|
|
|
2021-06-09 22:49:45 +10:00
|
|
|
/** Reset \a settings to its defaults: every member is zeroed, then threading is enabled. */
BLI_INLINE void BLI_parallel_mempool_settings_defaults(TaskParallelSettings *settings)
{
  memset(settings, 0, sizeof(*settings));
  settings->use_threading = true;
}
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/**
|
|
|
|
* Don't use this, store any thread specific data in `tls->userdata_chunk` instead.
|
|
|
|
* Only here for code to be removed.
|
|
|
|
*/
|
2020-04-30 07:59:23 +02:00
|
|
|
int BLI_task_parallel_thread_id(const TaskParallelTLS *tls);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/** \} */
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/** \name Task Graph Scheduling
|
|
|
|
*
|
|
|
|
* Task Graphs can be used to create a forest of directional trees and schedule work to any tree.
|
2020-05-25 12:24:56 +02:00
|
|
|
* The nodes in the graph can be run in separate threads.
|
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* \code{.unparsed}
|
2020-05-25 12:24:56 +02:00
|
|
|
* +---- [root] ----+
|
|
|
|
* | |
|
|
|
|
* v v
|
|
|
|
* [node_1] +---- [node_2] ----+
|
|
|
|
* | |
|
|
|
|
* v v
|
|
|
|
* [node_3] [node_4]
|
2021-12-09 20:01:44 +11:00
|
|
|
* \endcode
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* \code{.c}
|
|
|
|
* TaskGraph *task_graph = BLI_task_graph_create();
|
|
|
|
* TaskNode *root = BLI_task_graph_node_create(task_graph, root_exec, NULL, NULL);
|
|
|
|
* TaskNode *node_1 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
|
|
|
|
* TaskNode *node_2 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
|
|
|
|
* TaskNode *node_3 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
|
|
|
|
* TaskNode *node_4 = BLI_task_graph_node_create(task_graph, node_exec, NULL, NULL);
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* BLI_task_graph_edge_create(root, node_1);
|
|
|
|
* BLI_task_graph_edge_create(root, node_2);
|
|
|
|
* BLI_task_graph_edge_create(node_2, node_3);
|
|
|
|
* BLI_task_graph_edge_create(node_2, node_4);
|
|
|
|
* \endcode
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
|
|
|
* Any node can be triggered to start a chain of tasks. Normally you would trigger a root node but
|
|
|
|
* it is supported to start the chain of tasks anywhere in the forest or tree. When a node
|
|
|
|
* completes, the execution flow is forwarded via the created edges.
|
|
|
|
* When a child node has multiple parents the child node will be triggered once for each parent.
|
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* `BLI_task_graph_node_push_work(root);`
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
|
|
|
* In this example, after `root` is finished, `node_1` and `node_2` will be started.
|
|
|
|
* Only after `node_2` is finished `node_3` and `node_4` will be started.
|
|
|
|
*
|
|
|
|
* After scheduling work we need to wait until all the tasks have been finished.
|
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* `BLI_task_graph_work_and_wait(task_graph);`
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
|
|
|
* When finished you can clean up all the resources by freeing the task_graph. Nodes are owned by
|
|
|
|
* the graph and are freed with it; task_data will only be freed if a free_func was given.
|
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* `BLI_task_graph_free(task_graph);`
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
|
|
|
* Work can enter a tree on any node. Normally this would be the root_node.
|
|
|
|
* A `task_graph` can be reused, but the caller needs to make sure the task_data is reset.
|
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* Task-Data
|
|
|
|
* ---------
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
|
|
|
* Typically you want to give a task some data to work on.
|
2020-06-05 14:34:00 +10:00
|
|
|
* Task data can be shared with other nodes, but be careful not to free the data multiple times.
|
2021-12-09 20:01:44 +11:00
|
|
|
* Task data is freed when calling #BLI_task_graph_free.
|
2020-05-25 12:24:56 +02:00
|
|
|
*
|
2021-12-09 20:01:44 +11:00
|
|
|
* \code{.c}
|
|
|
|
* MyData *task_data = MEM_callocN(sizeof(MyData), __func__);
|
|
|
|
* TaskNode *root = BLI_task_graph_node_create(task_graph, root_exec, task_data, MEM_freeN);
|
|
|
|
* TaskNode *node_1 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
|
|
|
|
* TaskNode *node_2 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
|
|
|
|
* TaskNode *node_3 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
|
|
|
|
* TaskNode *node_4 = BLI_task_graph_node_create(task_graph, node_exec, task_data, NULL);
|
|
|
|
* \endcode
|
|
|
|
* \{ */
|
|
|
|
|
2020-05-25 12:24:56 +02:00
|
|
|
struct TaskGraph;
|
|
|
|
struct TaskNode;
|
|
|
|
|
|
|
|
typedef void (*TaskGraphNodeRunFunction)(void *__restrict task_data);
|
|
|
|
typedef void (*TaskGraphNodeFreeFunction)(void *task_data);
|
|
|
|
|
|
|
|
struct TaskGraph *BLI_task_graph_create(void);
|
|
|
|
void BLI_task_graph_work_and_wait(struct TaskGraph *task_graph);
|
|
|
|
void BLI_task_graph_free(struct TaskGraph *task_graph);
|
|
|
|
struct TaskNode *BLI_task_graph_node_create(struct TaskGraph *task_graph,
|
|
|
|
TaskGraphNodeRunFunction run,
|
2020-09-04 20:59:13 +02:00
|
|
|
void *user_data,
|
2020-05-25 12:24:56 +02:00
|
|
|
TaskGraphNodeFreeFunction free_func);
|
|
|
|
bool BLI_task_graph_node_push_work(struct TaskNode *task_node);
|
|
|
|
void BLI_task_graph_edge_create(struct TaskNode *from_node, struct TaskNode *to_node);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/** \} */
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/** \name Task Isolation
|
2021-06-14 23:50:24 +10:00
|
|
|
*
|
|
|
|
* Task isolation helps avoid unexpected task scheduling decisions that can lead to bugs if wrong
|
|
|
|
* assumptions were made. Typically that happens when doing "nested threading", i.e. one thread
|
|
|
|
* schedules a bunch of main-tasks and those spawn new sub-tasks.
|
|
|
|
*
|
|
|
|
* What can happen is that when a main-task waits for its sub-tasks to complete on other threads,
|
|
|
|
* another main-task is scheduled within the already running main-task. Generally, this is good,
|
|
|
|
* because it leads to better performance. However, sometimes code (often unintentionally) makes
|
|
|
|
* the assumption that at most one main-task runs on a thread at a time.
|
|
|
|
*
|
|
|
|
* The bugs often show themselves in two ways:
|
|
|
|
* - Deadlock, when a main-task holds a mutex while waiting for its sub-tasks to complete.
|
|
|
|
* - Data corruption, when a main-task makes wrong assumptions about a thread-local variable.
|
|
|
|
*
|
|
|
|
* Task isolation can avoid these bugs by making sure that a main-task does not start executing
|
|
|
|
* another main-task while waiting for its sub-tasks. More precisely, a function that runs in an
|
|
|
|
* isolated region is only allowed to run sub-tasks that were spawned in the same isolated region.
|
|
|
|
*
|
|
|
|
* Unfortunately, incorrect use of task isolation can lead to deadlocks itself. This can happen
|
|
|
|
* when threading primitives are used that separate spawning tasks from executing them. The problem
|
|
|
|
* occurs when a task is spawned in one isolated region while the tasks are waited for in another
|
|
|
|
* isolated region. In this setup, the thread that is waiting for the spawned tasks to complete
|
|
|
|
* cannot run the tasks itself. On a single thread, that causes a deadlock already. When there are
|
|
|
|
* multiple threads, another thread will typically run the task and avoid the deadlock. However, if
|
|
|
|
* this situation happens on all threads at the same time, all threads will deadlock. This happened
|
|
|
|
* in T88598.
|
2021-12-09 20:01:44 +11:00
|
|
|
* \{ */
|
|
|
|
|
2021-06-14 23:50:24 +10:00
|
|
|
void BLI_task_isolate(void (*func)(void *userdata), void *userdata);
|
|
|
|
|
2021-12-09 20:01:44 +11:00
|
|
|
/** \} */
|
|
|
|
|
2013-10-12 14:08:59 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif
|