Compare commits

..

1 Commits

Author SHA1 Message Date
e89a321732 move some texture nodes to c++ 2021-09-13 10:40:14 +02:00
76 changed files with 8406 additions and 6245 deletions

View File

@@ -747,7 +747,7 @@ set_and_warn_dependency(WITH_TBB WITH_CYCLES OFF)
set_and_warn_dependency(WITH_TBB WITH_USD OFF)
set_and_warn_dependency(WITH_TBB WITH_OPENIMAGEDENOISE OFF)
set_and_warn_dependency(WITH_TBB WITH_OPENVDB OFF)
#set_and_warn_dependency(WITH_TBB WITH_MOD_FLUID OFF)
set_and_warn_dependency(WITH_TBB WITH_MOD_FLUID OFF)
# NanoVDB requires OpenVDB to convert the data structure
set_and_warn_dependency(WITH_OPENVDB WITH_NANOVDB OFF)
@@ -1239,29 +1239,15 @@ endif()
if(WITH_OPENMP)
if(NOT OPENMP_CUSTOM)
find_package(OpenMP)
list(APPEND CMAKE_MODULE_PATH "${LLVM_LIBPATH}/cmake/openmp")
find_package(OpenMPTarget REQUIRED NVPTX)
endif()
if(OPENMP_FOUND)
if(NOT WITH_OPENMP_STATIC)
message(STATUS "============ No Static OpenMP")
message(${OpenMPTarget_NVPTX_FLAGS})
string(APPEND CMAKE_C_FLAGS " ${OpenMP_C_FLAGS}")
string(APPEND CMAKE_CXX_FLAGS " ${OpenMP_CXX_FLAGS}")
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${OpenMP_LINKER_FLAGS}")
string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${OpenMP_LINKER_FLAGS}")
string(APPEND CMAKE_C_FLAGS " -gline-tables-only -fopenmp-targets=nvptx64-nvidia-cuda")
string(APPEND CMAKE_CXX_FLAGS " -gline-tables-only -fopenmp-targets=nvptx64-nvidia-cuda")
string(APPEND CMAKE_EXE_LINKER_FLAGS " -fopenmp-targets=nvptx64-nvidia-cuda")
string(APPEND CMAKE_MODULE_LINKER_FLAGS " -fopenmp-targets=nvptx64-nvidia-cuda")
set(PLATFORM_LINKLIBS "${PLATFORM_LINKLIBS};-fopenmp=libomp;-fopenmp-targets=nvptx64-nvidia-cuda")
else()
message(STATUS "============= Static OpenMP")
# Typically avoid adding flags as defines but we can't
# pass OpenMP flags to the linker for static builds, meaning
# we can't add any OpenMP related flags to CFLAGS variables

View File

@@ -30,10 +30,6 @@ if(APPLE)
set(BUILD_CLANG_TOOLS ON)
endif()
if(UNIX AND NOT APPLE)
set(LLVM_BUILD_OPENMP ^^openmp)
set(LLVM_TARGETS ${LLVM_TARGETS} ^^NVPTX)
endif()
set(LLVM_EXTRA_ARGS
-DLLVM_USE_CRT_RELEASE=MD
@@ -44,18 +40,10 @@ set(LLVM_EXTRA_ARGS
-DLLVM_ENABLE_TERMINFO=OFF
-DLLVM_BUILD_LLVM_C_DYLIB=OFF
-DLLVM_ENABLE_UNWIND_TABLES=OFF
-DLLVM_ENABLE_PROJECTS=clang${LLVM_BUILD_CLANG_TOOLS_EXTRA}${LLVM_BUILD_OPENMP}
-DLLVM_ENABLE_PROJECTS=clang${LLVM_BUILD_CLANG_TOOLS_EXTRA}
${LLVM_XML2_ARGS}
)
if(UNIX AND NOT APPLE)
list(APPEND LLVM_EXTRA_ARGS
-DCLANG_OPENMP_NVPTX_DEFAULT_ARCH=sm_61
-DLIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES=61
)
endif()
if(WIN32)
set(LLVM_GENERATOR "Ninja")
else()

View File

@@ -63,8 +63,7 @@ if(EXISTS ${LIBDIR})
# OpenMP usually can't be statically linked into shared libraries,
# due to not being compiled with position independent code.
if(NOT WITH_PYTHON_MODULE)
#message(STATUS "============= ENABLING Static OpenMP")
#set(WITH_OPENMP_STATIC ON)
set(WITH_OPENMP_STATIC ON)
endif()
set(Boost_NO_BOOST_CMAKE ON)
set(BOOST_ROOT ${LIBDIR}/boost)

View File

@@ -222,10 +222,6 @@ if __name__ == "__main__":
# Test if we are building a specific release version.
branch = make_utils.git_branch(args.git_command)
if branch == 'HEAD':
sys.stderr.write('Blender git repository is in detached HEAD state, must be in a branch\n')
sys.exit(1)
tag = make_utils.git_tag(args.git_command)
release_version = make_utils.git_branch_release_version(branch, tag)

View File

@@ -70,7 +70,7 @@ def git_branch_release_version(branch, tag):
return release_version
def svn_libraries_base_url(release_version, branch=None):
def svn_libraries_base_url(release_version, branch):
if release_version:
svn_branch = "tags/blender-" + release_version + "-release"
elif branch:

View File

@@ -81,7 +81,7 @@ static double cos_vnvnvn(
normalize_vn_vnvn(dvec1, v1, v2, dims);
double d = dot_vnvn(dvec0, dvec1, dims);
/* sanity check */
d = maxV(-1.0, minV(1.0, d));
d = max(-1.0, min(1.0, d));
return d;
}

View File

@@ -29,9 +29,9 @@
* \ingroup curve_fit
*/
//#ifdef _MSC_VER
#ifdef _MSC_VER
# define _USE_MATH_DEFINES
//#endif
#endif
#include <math.h>
#include <float.h>
@@ -456,7 +456,7 @@ static double points_calc_circumference_factor(
const double len_tangent = dot < 0.0 ? len_vnvn(tan_l, tan_r, dims) : len_negated_vnvn(tan_l, tan_r, dims);
if (len_tangent > DBL_EPSILON) {
/* only clamp to avoid precision error */
double angle = acos(maxV(-fabs(dot), -1.0));
double angle = acos(max(-fabs(dot), -1.0));
/* Angle may be less than the length when the tangents define >180 degrees of the circle,
* (tangents that point away from each other).
* We could try support this but will likely cause extreme >1 scales which could cause other issues. */
@@ -607,7 +607,7 @@ static void cubic_from_points_offset_fallback(
for (uint k = 0; k < 2; k++) {
sub_vn_vnvn(tmp, p0, pt, dims);
project_vn_vnvn_normalized(tmp, tmp, a[k], dims);
dists[k] = maxV(dists[k], dot_vnvn(tmp, a[k], dims));
dists[k] = max(dists[k], dot_vnvn(tmp, a[k], dims));
}
}
@@ -796,7 +796,7 @@ static void cubic_from_points(
dist_sq_test += sq((pt[j] - center[j]) * clamp_scale);
}
#endif
dist_sq_max = maxV(dist_sq_max, dist_sq_test);
dist_sq_max = max(dist_sq_max, dist_sq_test);
}
}

View File

@@ -45,12 +45,12 @@ MINLINE double sq(const double d)
}
#ifndef _MSC_VER
MINLINE double minV(const double a, const double b)
MINLINE double min(const double a, const double b)
{
return b < a ? b : a;
}
MINLINE double maxV(const double a, const double b)
MINLINE double max(const double a, const double b)
{
return a < b ? b : a;
}

View File

@@ -13,7 +13,6 @@
// limitations under the License.
//
#include "draco/animation/keyframe_animation.h"
#include <iostream>
namespace draco {

View File

@@ -55,30 +55,26 @@ if(NOT WITH_MANTA_DEPENDENCIES)
add_definitions(-DNO_CNPY=1)
endif()
set(MANTA_HLP
helper
)
set(MANTA_PP
preprocessed
)
if(WITH_MANTA_DEPENDENCIES)
set(MANTA_DEP
dependencies
)
endif()
if(WITH_OPENMP)
add_definitions(-DOPENMP=1)
if(WITH_OPENMP_STATIC)
list(APPEND LIB
${OpenMP_LIBRARIES}
)
endif()
endif()
if(WITH_OPENMP)
add_definitions(-DOPENMP_OFFLOAD=1)
if(WITH_TBB)
add_definitions(-DTBB=1)
endif()
if(WITH_OPENVDB)
add_definitions(-DOPENVDB=1)
# OpenVDB headers use deprecated TBB headers, silence warning.
#add_definitions(-DTBB_SUPPRESS_DEPRECATED_MESSAGES=1)
add_definitions(-DTBB_SUPPRESS_DEPRECATED_MESSAGES=1)
endif()
if(WITH_OPENVDB_BLOSC)
@@ -94,12 +90,12 @@ if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
endif()
set(INC
preprocessed
preprocessed/fileio
preprocessed/python
preprocessed/plugin
helper/pwrapper
helper/util
${MANTA_PP}
${MANTA_PP}/fileio
${MANTA_PP}/python
${MANTA_PP}/plugin
${MANTA_HLP}/pwrapper
${MANTA_HLP}/util
)
if(WITH_MANTA_DEPENDENCIES)
@@ -119,6 +115,15 @@ if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
)
endif()
if(WITH_TBB)
list(APPEND INC_SYS
${TBB_INCLUDE_DIRS}
)
list(APPEND LIB
${TBB_LIBRARIES}
)
endif()
if(WITH_OPENVDB)
list(APPEND INC_SYS
${OPENVDB_INCLUDE_DIRS}
@@ -137,120 +142,120 @@ if(WITH_OPENVDB)
endif()
set(SRC
preprocessed/commonkernels.h
preprocessed/commonkernels.h.reg.cpp
preprocessed/conjugategrad.cpp
preprocessed/conjugategrad.h
preprocessed/conjugategrad.h.reg.cpp
preprocessed/edgecollapse.cpp
preprocessed/edgecollapse.h
preprocessed/edgecollapse.h.reg.cpp
preprocessed/fastmarch.cpp
preprocessed/fastmarch.h
preprocessed/fastmarch.h.reg.cpp
preprocessed/fileio/iogrids.cpp
preprocessed/fileio/iomeshes.cpp
preprocessed/fileio/ioparticles.cpp
preprocessed/fileio/ioutil.cpp
preprocessed/fileio/iovdb.cpp
preprocessed/fileio/mantaio.cpp
preprocessed/fileio/mantaio.h
preprocessed/fileio/mantaio.h.reg.cpp
preprocessed/fluidsolver.cpp
preprocessed/fluidsolver.h
preprocessed/fluidsolver.h.reg.cpp
preprocessed/general.cpp
preprocessed/general.h
preprocessed/general.h.reg.cpp
preprocessed/gitinfo.h
preprocessed/grid.cpp
preprocessed/grid.h
preprocessed/grid.h.reg.cpp
preprocessed/grid4d.cpp
preprocessed/grid4d.h
preprocessed/grid4d.h.reg.cpp
preprocessed/kernel.cpp
preprocessed/kernel.h
preprocessed/kernel.h.reg.cpp
preprocessed/levelset.cpp
preprocessed/levelset.h
preprocessed/levelset.h.reg.cpp
preprocessed/mesh.cpp
preprocessed/mesh.h
preprocessed/mesh.h.reg.cpp
preprocessed/movingobs.cpp
preprocessed/movingobs.h
preprocessed/movingobs.h.reg.cpp
preprocessed/multigrid.cpp
preprocessed/multigrid.h
preprocessed/multigrid.h.reg.cpp
preprocessed/noisefield.cpp
preprocessed/noisefield.h
preprocessed/noisefield.h.reg.cpp
preprocessed/particle.cpp
preprocessed/particle.h
preprocessed/particle.h.reg.cpp
preprocessed/plugin/advection.cpp
preprocessed/plugin/apic.cpp
preprocessed/plugin/extforces.cpp
preprocessed/plugin/fire.cpp
preprocessed/plugin/flip.cpp
preprocessed/plugin/fluidguiding.cpp
preprocessed/plugin/initplugins.cpp
preprocessed/plugin/kepsilon.cpp
preprocessed/plugin/meshplugins.cpp
preprocessed/plugin/pressure.cpp
preprocessed/plugin/ptsplugins.cpp
preprocessed/plugin/secondaryparticles.cpp
preprocessed/plugin/surfaceturbulence.cpp
# preprocessed/plugin/viscosity.cpp
preprocessed/plugin/vortexplugins.cpp
preprocessed/plugin/waveletturbulence.cpp
preprocessed/plugin/waves.cpp
preprocessed/python/defines.py
preprocessed/python/defines.py.reg.cpp
preprocessed/registration.cpp
preprocessed/shapes.cpp
preprocessed/shapes.h
preprocessed/shapes.h.reg.cpp
preprocessed/test.cpp
preprocessed/timing.cpp
preprocessed/timing.h
preprocessed/timing.h.reg.cpp
preprocessed/turbulencepart.cpp
preprocessed/turbulencepart.h
preprocessed/turbulencepart.h.reg.cpp
preprocessed/vortexpart.cpp
preprocessed/vortexpart.h
preprocessed/vortexpart.h.reg.cpp
preprocessed/vortexsheet.cpp
preprocessed/vortexsheet.h
preprocessed/vortexsheet.h.reg.cpp
${MANTA_PP}/commonkernels.h
${MANTA_PP}/commonkernels.h.reg.cpp
${MANTA_PP}/conjugategrad.cpp
${MANTA_PP}/conjugategrad.h
${MANTA_PP}/conjugategrad.h.reg.cpp
${MANTA_PP}/edgecollapse.cpp
${MANTA_PP}/edgecollapse.h
${MANTA_PP}/edgecollapse.h.reg.cpp
${MANTA_PP}/fastmarch.cpp
${MANTA_PP}/fastmarch.h
${MANTA_PP}/fastmarch.h.reg.cpp
${MANTA_PP}/fileio/iogrids.cpp
${MANTA_PP}/fileio/iomeshes.cpp
${MANTA_PP}/fileio/ioparticles.cpp
${MANTA_PP}/fileio/ioutil.cpp
${MANTA_PP}/fileio/iovdb.cpp
${MANTA_PP}/fileio/mantaio.cpp
${MANTA_PP}/fileio/mantaio.h
${MANTA_PP}/fileio/mantaio.h.reg.cpp
${MANTA_PP}/fluidsolver.cpp
${MANTA_PP}/fluidsolver.h
${MANTA_PP}/fluidsolver.h.reg.cpp
${MANTA_PP}/general.cpp
${MANTA_PP}/general.h
${MANTA_PP}/general.h.reg.cpp
${MANTA_PP}/gitinfo.h
${MANTA_PP}/grid.cpp
${MANTA_PP}/grid.h
${MANTA_PP}/grid.h.reg.cpp
${MANTA_PP}/grid4d.cpp
${MANTA_PP}/grid4d.h
${MANTA_PP}/grid4d.h.reg.cpp
${MANTA_PP}/kernel.cpp
${MANTA_PP}/kernel.h
${MANTA_PP}/kernel.h.reg.cpp
${MANTA_PP}/levelset.cpp
${MANTA_PP}/levelset.h
${MANTA_PP}/levelset.h.reg.cpp
${MANTA_PP}/mesh.cpp
${MANTA_PP}/mesh.h
${MANTA_PP}/mesh.h.reg.cpp
${MANTA_PP}/movingobs.cpp
${MANTA_PP}/movingobs.h
${MANTA_PP}/movingobs.h.reg.cpp
${MANTA_PP}/multigrid.cpp
${MANTA_PP}/multigrid.h
${MANTA_PP}/multigrid.h.reg.cpp
${MANTA_PP}/noisefield.cpp
${MANTA_PP}/noisefield.h
${MANTA_PP}/noisefield.h.reg.cpp
${MANTA_PP}/particle.cpp
${MANTA_PP}/particle.h
${MANTA_PP}/particle.h.reg.cpp
${MANTA_PP}/plugin/advection.cpp
${MANTA_PP}/plugin/apic.cpp
${MANTA_PP}/plugin/extforces.cpp
${MANTA_PP}/plugin/fire.cpp
${MANTA_PP}/plugin/flip.cpp
${MANTA_PP}/plugin/fluidguiding.cpp
${MANTA_PP}/plugin/initplugins.cpp
${MANTA_PP}/plugin/kepsilon.cpp
${MANTA_PP}/plugin/meshplugins.cpp
${MANTA_PP}/plugin/pressure.cpp
${MANTA_PP}/plugin/ptsplugins.cpp
${MANTA_PP}/plugin/secondaryparticles.cpp
${MANTA_PP}/plugin/surfaceturbulence.cpp
${MANTA_PP}/plugin/viscosity.cpp
${MANTA_PP}/plugin/vortexplugins.cpp
${MANTA_PP}/plugin/waveletturbulence.cpp
${MANTA_PP}/plugin/waves.cpp
${MANTA_PP}/python/defines.py
${MANTA_PP}/python/defines.py.reg.cpp
${MANTA_PP}/registration.cpp
${MANTA_PP}/shapes.cpp
${MANTA_PP}/shapes.h
${MANTA_PP}/shapes.h.reg.cpp
${MANTA_PP}/test.cpp
${MANTA_PP}/timing.cpp
${MANTA_PP}/timing.h
${MANTA_PP}/timing.h.reg.cpp
${MANTA_PP}/turbulencepart.cpp
${MANTA_PP}/turbulencepart.h
${MANTA_PP}/turbulencepart.h.reg.cpp
${MANTA_PP}/vortexpart.cpp
${MANTA_PP}/vortexpart.h
${MANTA_PP}/vortexpart.h.reg.cpp
${MANTA_PP}/vortexsheet.cpp
${MANTA_PP}/vortexsheet.h
${MANTA_PP}/vortexsheet.h.reg.cpp
helper/pwrapper/manta.h
helper/pwrapper/pclass.cpp
helper/pwrapper/pclass.h
helper/pwrapper/pconvert.cpp
helper/pwrapper/pconvert.h
helper/pwrapper/pvec3.cpp
helper/pwrapper/pythonInclude.h
helper/pwrapper/registry.cpp
helper/pwrapper/registry.h
helper/util/integrator.h
helper/util/interpol.h
helper/util/interpolHigh.h
helper/util/matrixbase.h
helper/util/mcubes.h
helper/util/quaternion.h
helper/util/randomstream.h
helper/util/rcmatrix.h
helper/util/simpleimage.cpp
helper/util/simpleimage.h
helper/util/solvana.h
helper/util/vector4d.cpp
helper/util/vector4d.h
helper/util/vectorbase.cpp
helper/util/vectorbase.h
${MANTA_HLP}/pwrapper/manta.h
${MANTA_HLP}/pwrapper/pclass.cpp
${MANTA_HLP}/pwrapper/pclass.h
${MANTA_HLP}/pwrapper/pconvert.cpp
${MANTA_HLP}/pwrapper/pconvert.h
${MANTA_HLP}/pwrapper/pvec3.cpp
${MANTA_HLP}/pwrapper/pythonInclude.h
${MANTA_HLP}/pwrapper/registry.cpp
${MANTA_HLP}/pwrapper/registry.h
${MANTA_HLP}/util/integrator.h
${MANTA_HLP}/util/interpol.h
${MANTA_HLP}/util/interpolHigh.h
${MANTA_HLP}/util/matrixbase.h
${MANTA_HLP}/util/mcubes.h
${MANTA_HLP}/util/quaternion.h
${MANTA_HLP}/util/randomstream.h
${MANTA_HLP}/util/rcmatrix.h
${MANTA_HLP}/util/simpleimage.cpp
${MANTA_HLP}/util/simpleimage.h
${MANTA_HLP}/util/solvana.h
${MANTA_HLP}/util/vector4d.cpp
${MANTA_HLP}/util/vector4d.h
${MANTA_HLP}/util/vectorbase.cpp
${MANTA_HLP}/util/vectorbase.h
)
if(WITH_MANTA_DEPENDENCIES)
@@ -261,34 +266,16 @@ if(WITH_MANTA_DEPENDENCIES)
endif()
if(WITH_MANTA_NUMPY AND WITH_PYTHON_NUMPY)
list(APPEND SRC
preprocessed/plugin/numpyconvert.cpp
preprocessed/plugin/tfplugins.cpp
helper/pwrapper/numpyWrap.cpp
helper/pwrapper/numpyWrap.h
${MANTA_PP}/plugin/numpyconvert.cpp
${MANTA_PP}/plugin/tfplugins.cpp
${MANTA_HLP}/pwrapper/numpyWrap.cpp
${MANTA_HLP}/pwrapper/numpyWrap.h
)
endif()
set(LIB
${PYTHON_LINKFLAGS}
${PYTHON_LIBRARIES}
${OPENVDB_LIBRARIES}
)
#blender_add_lib(extern_mantaflow "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
add_library(extern_mantaflow SHARED "${SRC}")
include_directories(preprocessed)
include_directories(preprocessed/fileio)
include_directories(preprocessed/python)
include_directories(preprocessed/plugin)
include_directories(helper/pwrapper)
include_directories(helper/util)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${ZLIB_INCLUDE_DIRS})
include_directories(${OPENVDB_INCLUDE_DIRS})
#target_include_directories(extern_mantaflow PRIVATE "${INC}")
target_link_libraries(extern_mantaflow PRIVATE "${LIB}")
blender_source_group(extern_mantaflow "${SRC}")
blender_add_lib(extern_mantaflow "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

View File

@@ -7,11 +7,8 @@
# ==================== 1) ENVIRONMENT SETUP =============================================
# YOUR INSTALLATION PATHS GO HERE:
MANTA_INSTALLATION=/home/sebbas/Developer/Mantaflow
BLENDER_INSTALLATION=/home/sebbas/Developer/Blender
CC=/home/sebbas/Developer/LLVM-Project/install/bin/clang
CXX=/home/sebbas/Developer/LLVM-Project/install/bin/clang++
MANTA_INSTALLATION=/Users/sebbas/Developer/Mantaflow/mantaflowDevelop
BLENDER_INSTALLATION=/Users/sebbas/Developer/Blender
# Try to check out Mantaflow repository before building?
CLEAN_REPOSITORY=0
@@ -23,13 +20,8 @@ WITH_DEPENDENCIES=0
USE_NUMPY=0
# Choose which multithreading platform to use for Mantaflow preprocessing
USE_OMP=1
USE_TBB=0
# Use OpenMP offloading too?
if [[ "$USE_OMP" -eq "1" ]]; then
USE_OMP_OFFLOAD=1
fi
USE_OMP=0
USE_TBB=1
if [[ "$USE_OMP" -eq "1" && "$USE_TBB" -eq "1" ]]; then
echo "Cannot build Mantaflow for OpenMP and TBB at the same time"
@@ -64,7 +56,7 @@ fi
MANTA_BUILD_PATH=$MANTA_INSTALLATION/build_blender/
mkdir -p $MANTA_BUILD_PATH
cd $MANTA_BUILD_PATH
cmake ../mantaflowgit -DGUI=0 -DOPENMP=$USE_OMP -DTBB=$USE_TBB -DOPENMP_OFFLOAD=$USE_OMP_OFFLOAD -DBLENDER=1 -DPREPDEBUG=1 -DNUMPY=$USE_NUMPY -DPYTHON_VERSION=3 -DCMAKE_C_COMPILER=$CC -DCMAKE_CXX_COMPILER=$CXX && make -j8
cmake ../mantaflowgit -DGUI=0 -DOPENMP=$USE_OMP -DTBB=$USE_TBB -DBLENDER=1 -DPREPDEBUG=1 -DNUMPY=$USE_NUMPY && make -j8
# ==================== 3) COPY MANTAFLOW FILES TO BLENDER ROOT ===========================

View File

@@ -1035,7 +1035,7 @@ template<class N, class T> struct RCFixedMatrix {
typedef RCMatrix<int, Real> Matrix;
typedef RCFixedMatrix<int, Real> FixedMatrix;
} // namespace Manta
}
#undef parallel_for
#undef parallel_end

File diff suppressed because it is too large Load Diff

View File

@@ -18,8 +18,6 @@
#include "conjugategrad.h"
#include "commonkernels.h"
#include <chrono>
using namespace std::chrono;
using namespace std;
namespace Manta {
@@ -215,9 +213,9 @@ struct GridDotProduct : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const Grid<Real> &a, const Grid<Real> &b, double &result)
inline void op(IndexInt idx, const Grid<Real> &a, const Grid<Real> &b, double &result)
{
result += (a(i, j, k) * b(i, j, k));
result += (a[idx] * b[idx]);
}
inline operator double()
{
@@ -237,39 +235,28 @@ struct GridDotProduct : public KernelBase {
return b;
}
typedef Grid<Real> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel GridDotProduct ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, a, b, result);
}
void run()
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const Grid<Real> &a = getArg0();
const Grid<Real> &b = getArg1();
#pragma omp target teams distribute parallel for reduction(+:result) collapse(2) schedule(static,1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, a, b, result);
}
{
this->result = result;
}
}
else {
const int k = 0;
const Grid<Real> &a = getArg0();
const Grid<Real> &b = getArg1();
#pragma omp target teams distribute parallel for reduction(+:result) collapse(1) schedule(static,1)
{
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, a, b, result);
}
{
this->result = result;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
GridDotProduct(GridDotProduct &o, tbb::split) : KernelBase(o), a(o.a), b(o.b), result(0.0)
{
}
void join(const GridDotProduct &o)
{
result += o.result;
}
const Grid<Real> &a;
const Grid<Real> &b;
@@ -328,21 +315,29 @@ struct InitSigma : public KernelBase {
return temp;
}
typedef Grid<Real> type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel InitSigma ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, dst, rhs, temp, sigma);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
double sigma = 0;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, flags, dst, rhs, temp, sigma);
#pragma omp critical
{
this->sigma += sigma;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
InitSigma(InitSigma &o, tbb::split)
: KernelBase(o), flags(o.flags), dst(o.dst), rhs(o.rhs), temp(o.temp), sigma(0)
{
}
void join(const InitSigma &o)
{
sigma += o.sigma;
}
const FlagGrid &flags;
Grid<Real> &dst;
@@ -361,9 +356,8 @@ struct UpdateSearchVec : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<Real> &dst, Grid<Real> &src, Real factor)
inline void op(IndexInt idx, Grid<Real> &dst, Grid<Real> &src, Real factor) const
{
const IndexInt idx = dst.index(i, j, k);
dst[idx] = src[idx] + factor * dst[idx];
}
inline Grid<Real> &getArg0()
@@ -381,35 +375,21 @@ struct UpdateSearchVec : public KernelBase {
return factor;
}
typedef Real type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel UpdateSearchVec ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, dst, src, factor);
}
void run()
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
Grid<Real> &dst = getArg0();
Grid<Real> &src = getArg1();
Real &factor = getArg2();
#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, dst, src, factor);
}
}
else {
const int k = 0;
Grid<Real> &dst = getArg0();
Grid<Real> &src = getArg1();
Real &factor = getArg2();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, dst, src, factor);
}
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &dst;
Grid<Real> &src;
@@ -426,10 +406,8 @@ GridCg<APPLYMAT>::GridCg(Grid<Real> &dst,
Grid<Real> &search,
const FlagGrid &flags,
Grid<Real> &tmp,
Grid<Real> *pA0,
Grid<Real> *pAi,
Grid<Real> *pAj,
Grid<Real> *pAk)
std::vector<Grid<Real> *> matrixAVec,
std::vector<Grid<Real> *> rhsVec)
: GridCgInterface(),
mInited(false),
mIterations(0),
@@ -439,10 +417,8 @@ GridCg<APPLYMAT>::GridCg(Grid<Real> &dst,
mSearch(search),
mFlags(flags),
mTmp(tmp),
mpA0(pA0),
mpAi(pAi),
mpAj(pAj),
mpAk(pAk),
mMatrixA(matrixAVec),
mVecRhs(rhsVec),
mPcMethod(PC_None),
mpPCA0(nullptr),
mpPCAi(nullptr),
@@ -460,37 +436,54 @@ template<class APPLYMAT> void GridCg<APPLYMAT>::doInit()
mInited = true;
mIterations = 0;
mDst.clear(1);
mResidual.copyFrom(mRhs, true, 1); // p=0, residual = b
mDst.clear();
mResidual.copyFrom(mRhs); // p=0, residual = b
if (mPcMethod == PC_ICP) {
// assertMsg(mDst.is3D(), "ICP only supports 3D grids so far");
InitPreconditionIncompCholesky(
mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
ApplyPreconditionIncompCholesky(
mTmp, mResidual, mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
assertMsg(mDst.is3D(), "ICP only supports 3D grids so far");
InitPreconditionIncompCholesky(mFlags,
*mpPCA0,
*mpPCAi,
*mpPCAj,
*mpPCAk,
*mMatrixA[0],
*mMatrixA[1],
*mMatrixA[2],
*mMatrixA[3]);
ApplyPreconditionIncompCholesky(mTmp,
mResidual,
mFlags,
*mpPCA0,
*mpPCAi,
*mpPCAj,
*mpPCAk,
*mMatrixA[0],
*mMatrixA[1],
*mMatrixA[2],
*mMatrixA[3]);
}
else if (mPcMethod == PC_mICP) {
// assertMsg(mDst.is3D(), "mICP only supports 3D grids so far");
InitPreconditionModifiedIncompCholesky2(mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
assertMsg(mDst.is3D(), "mICP only supports 3D grids so far");
InitPreconditionModifiedIncompCholesky2(
mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
ApplyPreconditionModifiedIncompCholesky2(
mTmp, mResidual, mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
mTmp, mResidual, mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
}
else if (mPcMethod == PC_MGP) {
InitPreconditionMultigrid(mMG, *mpA0, *mpAi, *mpAj, *mpAk, mAccuracy);
InitPreconditionMultigrid(
mMG, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3], mAccuracy);
ApplyPreconditionMultigrid(mMG, mTmp, mResidual);
}
else {
mTmp.copyFrom(mResidual, true, 1);
mTmp.copyFrom(mResidual);
}
mSearch.copyFrom(mTmp, true, 1);
mSearch.copyFrom(mTmp);
mSigma = GridDotProduct(mTmp, mResidual);
}
template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate()
{
auto start = high_resolution_clock::now();
if (!mInited)
doInit();
@@ -500,14 +493,7 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
// this could reinterpret the mpA pointers (not so clean right now)
// tmp = applyMat(search)
APPLYMAT(mFlags, mTmp, mSearch, *mpA0, *mpAi, *mpAj, *mpAk);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "APPLYMAT Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
APPLYMAT(mFlags, mTmp, mSearch, mMatrixA, mVecRhs);
// alpha = sigma/dot(tmp, search)
Real dp = GridDotProduct(mTmp, mSearch);
@@ -515,49 +501,35 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
if (fabs(dp) > 0.)
alpha = mSigma / (Real)dp;
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "GridDotProduct Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
gridScaledAdd<Real, Real>(mDst, mSearch, alpha); // dst += search * alpha
gridScaledAdd<Real, Real>(mResidual, mTmp, -alpha); // residual += tmp * -alpha
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "gridScaledAdd Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
if (mPcMethod == PC_ICP)
ApplyPreconditionIncompCholesky(
mTmp, mResidual, mFlags, *mpPCA0, *mpPCAi, *mpPCAj, *mpPCAk, *mpA0, *mpAi, *mpAj, *mpAk);
ApplyPreconditionIncompCholesky(mTmp,
mResidual,
mFlags,
*mpPCA0,
*mpPCAi,
*mpPCAj,
*mpPCAk,
*mMatrixA[0],
*mMatrixA[1],
*mMatrixA[2],
*mMatrixA[3]);
else if (mPcMethod == PC_mICP)
ApplyPreconditionModifiedIncompCholesky2(
mTmp, mResidual, mFlags, *mpPCA0, *mpA0, *mpAi, *mpAj, *mpAk);
mTmp, mResidual, mFlags, *mpPCA0, *mMatrixA[0], *mMatrixA[1], *mMatrixA[2], *mMatrixA[3]);
else if (mPcMethod == PC_MGP)
ApplyPreconditionMultigrid(mMG, mTmp, mResidual);
else
mTmp.copyFrom(mResidual, true, 1);
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "copyFrom Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
mTmp.copyFrom(mResidual);
// use the l2 norm of the residual for convergence check? (usually max norm is recommended
// instead)
if (this->mUseL2Norm) {
// std::cout << "USING L2" << std::endl;
mResNorm = GridSumSqr(mResidual).sum;
}
else {
// std::cout << "NOT USING L2" << std::endl;
mResNorm = mResidual.getMaxAbs();
}
@@ -567,43 +539,27 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
return false;
}
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "GridSumSqr Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
Real sigmaNew = GridDotProduct(mTmp, mResidual);
Real beta = sigmaNew / mSigma;
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "GridDotProduct Time taken: " << duration.count() << std::endl;
start = high_resolution_clock::now();
// search = tmp + beta * search
UpdateSearchVec(mSearch, mTmp, beta);
stop = high_resolution_clock::now();
duration = duration_cast<microseconds>(stop - start);
time += duration.count();
// std::cout << "UpdateSearchVec Time taken: " << duration.count() << std::endl;
// debMsg("GridCg::iterate i="<<mIterations<<" sigmaNew="<<sigmaNew<<" sigmaLast="<<mSigma<<"
// alpha="<<alpha<<" beta="<<beta<<" ", CG_DEBUGLEVEL);
debMsg("GridCg::iterate i=" << mIterations << " sigmaNew=" << sigmaNew << " sigmaLast=" << mSigma
<< " alpha=" << alpha << " beta=" << beta << " ",
CG_DEBUGLEVEL);
mSigma = sigmaNew;
if (!(mResNorm < 1e35)) {
if (mPcMethod == PC_MGP) {
// diverging solves can be caused by the static multigrid mode, we cannot detect this here,
// though only the pressure solve call "knows" whether the MG is static or dynamics...
// debMsg("GridCg::iterate: Warning - this diverging solve can be caused by the 'static' mode
// of the MG preconditioner. If the static mode is active, try switching to dynamic.", 1);
debMsg(
"GridCg::iterate: Warning - this diverging solve can be caused by the 'static' mode of "
"the MG preconditioner. If the static mode is active, try switching to dynamic.",
1);
}
// errMsg("GridCg::iterate: The CG solver diverged, residual norm > 1e30, stopping.");
errMsg("GridCg::iterate: The CG solver diverged, residual norm > 1e30, stopping.");
}
// debMsg("PB-CG-Norms::p"<<sqrt( GridOpNormNosqrt(mpDst, mpFlags).getValue() ) <<"
@@ -615,9 +571,8 @@ template<class APPLYMAT> bool GridCg<APPLYMAT>::iterate(Real &time)
template<class APPLYMAT> void GridCg<APPLYMAT>::solve(int maxIter)
{
Real time = 0;
for (int iter = 0; iter < maxIter; iter++) {
if (!iterate(time))
if (!iterate())
iter = maxIter;
}
return;
@@ -628,13 +583,13 @@ template<class APPLYMAT>
void GridCg<APPLYMAT>::setICPreconditioner(
PreconditionType method, Grid<Real> *A0, Grid<Real> *Ai, Grid<Real> *Aj, Grid<Real> *Ak)
{
// assertMsg(method==PC_ICP || method==PC_mICP, "GridCg<APPLYMAT>::setICPreconditioner: Invalid
// method specified.");
assertMsg(method == PC_ICP || method == PC_mICP,
"GridCg<APPLYMAT>::setICPreconditioner: Invalid method specified.");
mPcMethod = method;
if ((!A0->is3D())) {
if (gPrint2dWarning) {
// debMsg("ICP/mICP pre-conditioning only supported in 3D for now, disabling it.", 1);
debMsg("ICP/mICP pre-conditioning only supported in 3D for now, disabling it.", 1);
gPrint2dWarning = false;
}
mPcMethod = PC_None;
@@ -648,7 +603,7 @@ void GridCg<APPLYMAT>::setICPreconditioner(
template<class APPLYMAT>
void GridCg<APPLYMAT>::setMGPreconditioner(PreconditionType method, GridMg *MG)
{
// assertMsg(method==PC_MGP, "GridCg<APPLYMAT>::setMGPreconditioner: Invalid method specified.");
assertMsg(method == PC_MGP, "GridCg<APPLYMAT>::setMGPreconditioner: Invalid method specified.");
mPcMethod = method;
mMG = MG;
@@ -657,6 +612,9 @@ void GridCg<APPLYMAT>::setMGPreconditioner(PreconditionType method, GridMg *MG)
// explicit instantiation
template class GridCg<ApplyMatrix>;
template class GridCg<ApplyMatrix2D>;
template class GridCg<ApplyMatrixViscosityU>;
template class GridCg<ApplyMatrixViscosityV>;
template class GridCg<ApplyMatrixViscosityW>;
//*****************************************************************************
// diffusion for real and vec grids, e.g. for viscosity
@@ -697,33 +655,44 @@ void cgSolveDiffusion(const FlagGrid &flags,
}
}
GridCgInterface *gcg = nullptr;
GridCgInterface *gcg;
// note , no preconditioning for now...
const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
if (grid.getType() & GridBase::TypeReal) {
Grid<Real> &u = ((Grid<Real> &)grid);
rhs.copyFrom(u);
if (flags.is3D())
gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
else
gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
if (flags.is3D()) {
matA.push_back(&Ak);
gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, matA);
}
else {
gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, matA);
}
gcg->setAccuracy(cgAccuracy);
gcg->solve(maxIter);
// debMsg("FluidSolver::solveDiffusion iterations:"<<gcg->getIterations()<<",
// res:"<<gcg->getSigma(), CG_DEBUGLEVEL);
debMsg("FluidSolver::solveDiffusion iterations:" << gcg->getIterations()
<< ", res:" << gcg->getSigma(),
CG_DEBUGLEVEL);
}
else if ((grid.getType() & GridBase::TypeVec3) || (grid.getType() & GridBase::TypeMAC)) {
Grid<Vec3> &vec = ((Grid<Vec3> &)grid);
Grid<Real> u(parent);
vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
// core solve is same as for a regular real grid
if (flags.is3D())
gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
else
gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
if (flags.is3D()) {
matA.push_back(&Ak);
gcg = new GridCg<ApplyMatrix>(u, rhs, residual, search, flags, tmp, matA);
}
else {
gcg = new GridCg<ApplyMatrix2D>(u, rhs, residual, search, flags, tmp, matA);
}
gcg->setAccuracy(cgAccuracy);
// diffuse every component separately
@@ -733,14 +702,15 @@ void cgSolveDiffusion(const FlagGrid &flags,
rhs.copyFrom(u);
gcg->solve(maxIter);
// debMsg("FluidSolver::solveDiffusion vec3, iterations:"<<gcg->getIterations()<<",
// res:"<<gcg->getSigma(), CG_DEBUGLEVEL);
debMsg("FluidSolver::solveDiffusion vec3, iterations:" << gcg->getIterations()
<< ", res:" << gcg->getSigma(),
CG_DEBUGLEVEL);
setComponent(u, vec, component);
}
}
else {
// errMsg("cgSolveDiffusion: Grid Type is not supported (only Real, Vec3, MAC, or Levelset)");
errMsg("cgSolveDiffusion: Grid Type is not supported (only Real, Vec3, MAC, or Levelset)");
}
delete gcg;

View File

@@ -37,7 +37,7 @@ class GridCgInterface {
virtual ~GridCgInterface(){};
// solving functions
virtual bool iterate(Real &time) = 0;
virtual bool iterate() = 0;
virtual void solve(int maxIter) = 0;
// precond
@@ -78,16 +78,12 @@ template<class APPLYMAT> class GridCg : public GridCgInterface {
Grid<Real> &search,
const FlagGrid &flags,
Grid<Real> &tmp,
Grid<Real> *A0,
Grid<Real> *pAi,
Grid<Real> *pAj,
Grid<Real> *pAk);
~GridCg()
{
}
std::vector<Grid<Real> *> matrixAVec,
std::vector<Grid<Real> *> rhsVec = {});
~GridCg(){};
void doInit();
bool iterate(Real &time);
bool iterate();
void solve(int maxIter);
//! init pointers, and copy values from "normal" matrix
void setICPreconditioner(
@@ -133,7 +129,10 @@ template<class APPLYMAT> class GridCg : public GridCgInterface {
const FlagGrid &mFlags;
Grid<Real> &mTmp;
Grid<Real> *mpA0, *mpAi, *mpAj, *mpAk;
//! shape of A matrix defined here (e.g. diagonal, positive neighbor cells, etc)
std::vector<Grid<Real> *> mMatrixA;
//! shape of rhs vector defined here (e.g. 1 rhs for regular fluids solve, 3 rhs for viscosity)
std::vector<Grid<Real> *> mVecRhs;
PreconditionType mPcMethod;
//! preconditioning grids
@@ -154,32 +153,33 @@ struct ApplyMatrix : public KernelBase {
ApplyMatrix(const FlagGrid &flags,
Grid<Real> &dst,
const Grid<Real> &src,
Grid<Real> &A0,
Grid<Real> &Ai,
Grid<Real> &Aj,
Grid<Real> &Ak)
: KernelBase(&flags, 0), flags(flags), dst(dst), src(src), A0(A0), Ai(Ai), Aj(Aj), Ak(Ak)
const std::vector<Grid<Real> *> matrixA,
const std::vector<Grid<Real> *> vecRhs)
: KernelBase(&flags, 0), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
{
runMessage();
run();
}
inline void op(int i,
int j,
int k,
inline void op(IndexInt idx,
const FlagGrid &flags,
Grid<Real> &dst,
const Grid<Real> &src,
Grid<Real> &A0,
Grid<Real> &Ai,
Grid<Real> &Aj,
Grid<Real> &Ak)
const std::vector<Grid<Real> *> matrixA,
const std::vector<Grid<Real> *> vecRhs) const
{
const IndexInt idx = dst.index(i, j, k);
unusedParameter(vecRhs); // Not needed in this matrix application
if (matrixA.size() != 4)
errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
Grid<Real> &A0 = *matrixA[0];
Grid<Real> &Ai = *matrixA[1];
Grid<Real> &Aj = *matrixA[2];
Grid<Real> &Ak = *matrixA[3];
if (!flags.isFluid(idx)) {
dst[idx] = src[idx];
return;
}
const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
dst[idx] = src[idx] * A0[idx] + src[idx - X] * Ai[idx - X] + src[idx + X] * Ai[idx] +
src[idx - Y] * Aj[idx - Y] + src[idx + Y] * Aj[idx] + src[idx - Z] * Ak[idx - Z] +
@@ -200,71 +200,37 @@ struct ApplyMatrix : public KernelBase {
return src;
}
typedef Grid<Real> type2;
inline Grid<Real> &getArg3()
inline const std::vector<Grid<Real> *> &getArg3()
{
return A0;
return matrixA;
}
typedef Grid<Real> type3;
inline Grid<Real> &getArg4()
typedef std::vector<Grid<Real> *> type3;
inline const std::vector<Grid<Real> *> &getArg4()
{
return Ai;
return vecRhs;
}
typedef Grid<Real> type4;
inline Grid<Real> &getArg5()
typedef std::vector<Grid<Real> *> type4;
void runMessage()
{
return Aj;
}
typedef Grid<Real> type5;
inline Grid<Real> &getArg6()
debMsg("Executing kernel ApplyMatrix ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
return Ak;
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, dst, src, matrixA, vecRhs);
}
typedef Grid<Real> type6;
void runMessage(){};
void run()
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const FlagGrid &flags = getArg0();
Grid<Real> &dst = getArg1();
const Grid<Real> &src = getArg2();
Grid<Real> &A0 = getArg3();
Grid<Real> &Ai = getArg4();
Grid<Real> &Aj = getArg5();
Grid<Real> &Ak = getArg6();
#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
}
}
else {
const int k = 0;
const FlagGrid &flags = getArg0();
Grid<Real> &dst = getArg1();
const Grid<Real> &src = getArg2();
Grid<Real> &A0 = getArg3();
Grid<Real> &Ai = getArg4();
Grid<Real> &Aj = getArg5();
Grid<Real> &Ak = getArg6();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
}
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const FlagGrid &flags;
Grid<Real> &dst;
const Grid<Real> &src;
Grid<Real> &A0;
Grid<Real> &Ai;
Grid<Real> &Aj;
Grid<Real> &Ak;
const std::vector<Grid<Real> *> matrixA;
const std::vector<Grid<Real> *> vecRhs;
};
//! Kernel: Apply symmetric stored Matrix. 2D version
@@ -273,34 +239,32 @@ struct ApplyMatrix2D : public KernelBase {
ApplyMatrix2D(const FlagGrid &flags,
Grid<Real> &dst,
const Grid<Real> &src,
Grid<Real> &A0,
Grid<Real> &Ai,
Grid<Real> &Aj,
Grid<Real> &Ak)
: KernelBase(&flags, 0), flags(flags), dst(dst), src(src), A0(A0), Ai(Ai), Aj(Aj), Ak(Ak)
const std::vector<Grid<Real> *> matrixA,
const std::vector<Grid<Real> *> vecRhs)
: KernelBase(&flags, 0), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
{
runMessage();
run();
}
inline void op(int i,
int j,
int k,
inline void op(IndexInt idx,
const FlagGrid &flags,
Grid<Real> &dst,
const Grid<Real> &src,
Grid<Real> &A0,
Grid<Real> &Ai,
Grid<Real> &Aj,
Grid<Real> &Ak)
const std::vector<Grid<Real> *> matrixA,
const std::vector<Grid<Real> *> vecRhs) const
{
unusedParameter(Ak); // only there for parameter compatibility with ApplyMatrix
unusedParameter(vecRhs); // Not needed in this matrix application
if (matrixA.size() != 3)
errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
Grid<Real> &A0 = *matrixA[0];
Grid<Real> &Ai = *matrixA[1];
Grid<Real> &Aj = *matrixA[2];
const IndexInt idx = dst.index(i, j, k);
if (!flags.isFluid(idx)) {
dst[idx] = src[idx];
return;
}
const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
dst[idx] = src[idx] * A0[idx] + src[idx - X] * Ai[idx - X] + src[idx + X] * Ai[idx] +
src[idx - Y] * Aj[idx - Y] + src[idx + Y] * Aj[idx];
@@ -320,73 +284,387 @@ struct ApplyMatrix2D : public KernelBase {
return src;
}
typedef Grid<Real> type2;
inline Grid<Real> &getArg3()
inline const std::vector<Grid<Real> *> &getArg3()
{
return A0;
return matrixA;
}
typedef Grid<Real> type3;
inline Grid<Real> &getArg4()
typedef std::vector<Grid<Real> *> type3;
inline const std::vector<Grid<Real> *> &getArg4()
{
return Ai;
return vecRhs;
}
typedef Grid<Real> type4;
inline Grid<Real> &getArg5()
typedef std::vector<Grid<Real> *> type4;
void runMessage()
{
return Aj;
}
typedef Grid<Real> type5;
inline Grid<Real> &getArg6()
debMsg("Executing kernel ApplyMatrix2D ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
return Ak;
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, dst, src, matrixA, vecRhs);
}
typedef Grid<Real> type6;
void runMessage(){};
void run()
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const FlagGrid &flags = getArg0();
Grid<Real> &dst = getArg1();
const Grid<Real> &src = getArg2();
Grid<Real> &A0 = getArg3();
Grid<Real> &Ai = getArg4();
Grid<Real> &Aj = getArg5();
Grid<Real> &Ak = getArg6();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
}
}
else {
const int k = 0;
const FlagGrid &flags = getArg0();
Grid<Real> &dst = getArg1();
const Grid<Real> &src = getArg2();
Grid<Real> &A0 = getArg3();
Grid<Real> &Ai = getArg4();
Grid<Real> &Aj = getArg5();
Grid<Real> &Ak = getArg6();
#pragma omp target teams distribute parallel for collapse(1) schedule(static, 1)
{
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, src, A0, Ai, Aj, Ak);
}
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const FlagGrid &flags;
Grid<Real> &dst;
const Grid<Real> &src;
Grid<Real> &A0;
Grid<Real> &Ai;
Grid<Real> &Aj;
Grid<Real> &Ak;
const std::vector<Grid<Real> *> matrixA;
const std::vector<Grid<Real> *> vecRhs;
};
//! Kernel: apply the stored viscosity matrix to the U (x-face) velocity component.
//! matrixA layout (15 grids): [0] diagonal, [1-3] +i/+j/+k neighbors, [4-6] -i/-j/-k
//! neighbors of the U component itself, [7-14] cross-coupling coefficients into the
//! V and W components. vecRhs layout (2 grids): [0] = srcV, [1] = srcW.
struct ApplyMatrixViscosityU : public KernelBase {
  // Constructor immediately runs the kernel over the grid (bnd = 1, i.e. skips the
  // outermost layer of cells), matching the usual mantaflow KERNEL() pattern.
  ApplyMatrixViscosityU(const FlagGrid &flags,
                        Grid<Real> &dst,
                        const Grid<Real> &src,
                        const std::vector<Grid<Real> *> matrixA,
                        const std::vector<Grid<Real> *> vecRhs)
      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
  {
    runMessage();
    run();
  }
  // Per-cell operation: dst = A * src for one U cell, including the V/W coupling terms.
  inline void op(int i,
                 int j,
                 int k,
                 const FlagGrid &flags,
                 Grid<Real> &dst,
                 const Grid<Real> &src,
                 const std::vector<Grid<Real> *> matrixA,
                 const std::vector<Grid<Real> *> vecRhs) const
  {
    // Validate expected matrix/rhs shapes before dereferencing (errMsg aborts on failure).
    if (matrixA.size() != 15)
      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
    Grid<Real> &A0 = *matrixA[0];
    Grid<Real> &Aplusi = *matrixA[1];
    Grid<Real> &Aplusj = *matrixA[2];
    Grid<Real> &Aplusk = *matrixA[3];
    Grid<Real> &Aminusi = *matrixA[4];
    Grid<Real> &Aminusj = *matrixA[5];
    Grid<Real> &Aminusk = *matrixA[6];
    if (vecRhs.size() != 2)
      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
    Grid<Real> &srcV = *vecRhs[0];
    Grid<Real> &srcW = *vecRhs[1];
    // 7-point stencil on the U component itself.
    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
                   src(i, j, k - 1) * Aminusk(i, j, k);
    // Cross-coupling: entries [7-10] weight neighboring V faces, [11-14] neighboring W faces.
    dst(i, j, k) += srcV(i, j + 1, k) * (*matrixA[7])(i, j, k) +
                    srcV(i - 1, j + 1, k) * (*matrixA[8])(i, j, k) +
                    srcV(i, j, k) * (*matrixA[9])(i, j, k) +
                    srcV(i - 1, j, k) * (*matrixA[10])(i, j, k) +
                    srcW(i, j, k + 1) * (*matrixA[11])(i, j, k) +
                    srcW(i - 1, j, k + 1) * (*matrixA[12])(i, j, k) +
                    srcW(i, j, k) * (*matrixA[13])(i, j, k) +
                    srcW(i - 1, j, k) * (*matrixA[14])(i, j, k);
  }
  // Generated accessors (mantaflow kernel boilerplate).
  inline const FlagGrid &getArg0()
  {
    return flags;
  }
  typedef FlagGrid type0;
  inline Grid<Real> &getArg1()
  {
    return dst;
  }
  typedef Grid<Real> type1;
  inline const Grid<Real> &getArg2()
  {
    return src;
  }
  typedef Grid<Real> type2;
  inline const std::vector<Grid<Real> *> &getArg3()
  {
    return matrixA;
  }
  typedef std::vector<Grid<Real> *> type3;
  inline const std::vector<Grid<Real> *> &getArg4()
  {
    return vecRhs;
  }
  typedef std::vector<Grid<Real> *> type4;
  void runMessage()
  {
    debMsg("Executing kernel ApplyMatrixViscosityU ", 3);
    debMsg("Kernel range"
               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
           4);
  };
  // TBB body: 3D case parallelizes over k slices, 2D case over j rows (i/j start at 1
  // because the kernel was constructed with bnd = 1).
  void operator()(const tbb::blocked_range<IndexInt> &__r) const
  {
    const int _maxX = maxX;
    const int _maxY = maxY;
    if (maxZ > 1) {
      for (int k = __r.begin(); k != (int)__r.end(); k++)
        for (int j = 1; j < _maxY; j++)
          for (int i = 1; i < _maxX; i++)
            op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
    else {
      // NOTE(review): with k = 0, op() still reads k-1/k+1 neighbors; presumably this
      // kernel is only ever invoked on 3D grids — confirm against callers.
      const int k = 0;
      for (int j = __r.begin(); j != (int)__r.end(); j++)
        for (int i = 1; i < _maxX; i++)
          op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
  }
  void run()
  {
    if (maxZ > 1)
      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
    else
      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
  }
  const FlagGrid &flags;
  Grid<Real> &dst;
  const Grid<Real> &src;
  const std::vector<Grid<Real> *> matrixA;
  const std::vector<Grid<Real> *> vecRhs;
};
//! Kernel: apply the stored viscosity matrix to the V (y-face) velocity component.
//! matrixA layout (15 grids): [0] diagonal, [1-3] +i/+j/+k neighbors, [4-6] -i/-j/-k
//! neighbors of the V component itself, [7-14] cross-coupling coefficients into the
//! U and W components. vecRhs layout (2 grids): [0] = srcU, [1] = srcW.
struct ApplyMatrixViscosityV : public KernelBase {
  // Constructor immediately runs the kernel over the grid (bnd = 1).
  ApplyMatrixViscosityV(const FlagGrid &flags,
                        Grid<Real> &dst,
                        const Grid<Real> &src,
                        const std::vector<Grid<Real> *> matrixA,
                        const std::vector<Grid<Real> *> vecRhs)
      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
  {
    runMessage();
    run();
  }
  // Per-cell operation: dst = A * src for one V cell, including the U/W coupling terms.
  inline void op(int i,
                 int j,
                 int k,
                 const FlagGrid &flags,
                 Grid<Real> &dst,
                 const Grid<Real> &src,
                 const std::vector<Grid<Real> *> matrixA,
                 const std::vector<Grid<Real> *> vecRhs) const
  {
    // Validate expected matrix/rhs shapes before dereferencing (errMsg aborts on failure).
    if (matrixA.size() != 15)
      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
    Grid<Real> &A0 = *matrixA[0];
    Grid<Real> &Aplusi = *matrixA[1];
    Grid<Real> &Aplusj = *matrixA[2];
    Grid<Real> &Aplusk = *matrixA[3];
    Grid<Real> &Aminusi = *matrixA[4];
    Grid<Real> &Aminusj = *matrixA[5];
    Grid<Real> &Aminusk = *matrixA[6];
    if (vecRhs.size() != 2)
      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
    Grid<Real> &srcU = *vecRhs[0];
    Grid<Real> &srcW = *vecRhs[1];
    // 7-point stencil on the V component itself.
    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
                   src(i, j, k - 1) * Aminusk(i, j, k);
    // Cross-coupling: entries [7-10] weight neighboring U faces (offset in j), [11-14]
    // neighboring W faces (offset in j).
    dst(i, j, k) += srcU(i + 1, j, k) * (*matrixA[7])(i, j, k) +
                    srcU(i + 1, j - 1, k) * (*matrixA[8])(i, j, k) +
                    srcU(i, j, k) * (*matrixA[9])(i, j, k) +
                    srcU(i, j - 1, k) * (*matrixA[10])(i, j, k) +
                    srcW(i, j, k + 1) * (*matrixA[11])(i, j, k) +
                    srcW(i, j - 1, k + 1) * (*matrixA[12])(i, j, k) +
                    srcW(i, j, k) * (*matrixA[13])(i, j, k) +
                    srcW(i, j - 1, k) * (*matrixA[14])(i, j, k);
  }
  // Generated accessors (mantaflow kernel boilerplate).
  inline const FlagGrid &getArg0()
  {
    return flags;
  }
  typedef FlagGrid type0;
  inline Grid<Real> &getArg1()
  {
    return dst;
  }
  typedef Grid<Real> type1;
  inline const Grid<Real> &getArg2()
  {
    return src;
  }
  typedef Grid<Real> type2;
  inline const std::vector<Grid<Real> *> &getArg3()
  {
    return matrixA;
  }
  typedef std::vector<Grid<Real> *> type3;
  inline const std::vector<Grid<Real> *> &getArg4()
  {
    return vecRhs;
  }
  typedef std::vector<Grid<Real> *> type4;
  void runMessage()
  {
    debMsg("Executing kernel ApplyMatrixViscosityV ", 3);
    debMsg("Kernel range"
               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
           4);
  };
  // TBB body: 3D case parallelizes over k slices, 2D case over j rows (i/j start at 1
  // because the kernel was constructed with bnd = 1).
  void operator()(const tbb::blocked_range<IndexInt> &__r) const
  {
    const int _maxX = maxX;
    const int _maxY = maxY;
    if (maxZ > 1) {
      for (int k = __r.begin(); k != (int)__r.end(); k++)
        for (int j = 1; j < _maxY; j++)
          for (int i = 1; i < _maxX; i++)
            op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
    else {
      // NOTE(review): with k = 0, op() still reads k-1/k+1 neighbors; presumably this
      // kernel is only ever invoked on 3D grids — confirm against callers.
      const int k = 0;
      for (int j = __r.begin(); j != (int)__r.end(); j++)
        for (int i = 1; i < _maxX; i++)
          op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
  }
  void run()
  {
    if (maxZ > 1)
      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
    else
      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
  }
  const FlagGrid &flags;
  Grid<Real> &dst;
  const Grid<Real> &src;
  const std::vector<Grid<Real> *> matrixA;
  const std::vector<Grid<Real> *> vecRhs;
};
//! Kernel: apply the stored viscosity matrix to the W (z-face) velocity component.
//! matrixA layout (15 grids): [0] diagonal, [1-3] +i/+j/+k neighbors, [4-6] -i/-j/-k
//! neighbors of the W component itself, [7-14] cross-coupling coefficients into the
//! U and V components. vecRhs layout (2 grids): [0] = srcU, [1] = srcV.
struct ApplyMatrixViscosityW : public KernelBase {
  // Constructor immediately runs the kernel over the grid (bnd = 1).
  ApplyMatrixViscosityW(const FlagGrid &flags,
                        Grid<Real> &dst,
                        const Grid<Real> &src,
                        const std::vector<Grid<Real> *> matrixA,
                        const std::vector<Grid<Real> *> vecRhs)
      : KernelBase(&flags, 1), flags(flags), dst(dst), src(src), matrixA(matrixA), vecRhs(vecRhs)
  {
    runMessage();
    run();
  }
  // Per-cell operation: dst = A * src for one W cell, including the U/V coupling terms.
  inline void op(int i,
                 int j,
                 int k,
                 const FlagGrid &flags,
                 Grid<Real> &dst,
                 const Grid<Real> &src,
                 const std::vector<Grid<Real> *> matrixA,
                 const std::vector<Grid<Real> *> vecRhs) const
  {
    // Validate expected matrix/rhs shapes before dereferencing (errMsg aborts on failure).
    if (matrixA.size() != 15)
      errMsg("ConjugateGrad: Invalid A matrix in apply matrix step");
    Grid<Real> &A0 = *matrixA[0];
    Grid<Real> &Aplusi = *matrixA[1];
    Grid<Real> &Aplusj = *matrixA[2];
    Grid<Real> &Aplusk = *matrixA[3];
    Grid<Real> &Aminusi = *matrixA[4];
    Grid<Real> &Aminusj = *matrixA[5];
    Grid<Real> &Aminusk = *matrixA[6];
    if (vecRhs.size() != 2)
      errMsg("ConjugateGrad: Invalid rhs vector in apply matrix step");
    Grid<Real> &srcU = *vecRhs[0];
    Grid<Real> &srcV = *vecRhs[1];
    // 7-point stencil on the W component itself.
    dst(i, j, k) = src(i, j, k) * A0(i, j, k) + src(i + 1, j, k) * Aplusi(i, j, k) +
                   src(i, j + 1, k) * Aplusj(i, j, k) + src(i, j, k + 1) * Aplusk(i, j, k) +
                   src(i - 1, j, k) * Aminusi(i, j, k) + src(i, j - 1, k) * Aminusj(i, j, k) +
                   src(i, j, k - 1) * Aminusk(i, j, k);
    // Cross-coupling: entries [7-10] weight neighboring U faces (offset in k), [11-14]
    // neighboring V faces (offset in k).
    dst(i, j, k) += srcU(i + 1, j, k) * (*matrixA[7])(i, j, k) +
                    srcU(i + 1, j, k - 1) * (*matrixA[8])(i, j, k) +
                    srcU(i, j, k) * (*matrixA[9])(i, j, k) +
                    srcU(i, j, k - 1) * (*matrixA[10])(i, j, k) +
                    srcV(i, j + 1, k) * (*matrixA[11])(i, j, k) +
                    srcV(i, j + 1, k - 1) * (*matrixA[12])(i, j, k) +
                    srcV(i, j, k) * (*matrixA[13])(i, j, k) +
                    srcV(i, j, k - 1) * (*matrixA[14])(i, j, k);
  }
  // Generated accessors (mantaflow kernel boilerplate).
  inline const FlagGrid &getArg0()
  {
    return flags;
  }
  typedef FlagGrid type0;
  inline Grid<Real> &getArg1()
  {
    return dst;
  }
  typedef Grid<Real> type1;
  inline const Grid<Real> &getArg2()
  {
    return src;
  }
  typedef Grid<Real> type2;
  inline const std::vector<Grid<Real> *> &getArg3()
  {
    return matrixA;
  }
  typedef std::vector<Grid<Real> *> type3;
  inline const std::vector<Grid<Real> *> &getArg4()
  {
    return vecRhs;
  }
  typedef std::vector<Grid<Real> *> type4;
  void runMessage()
  {
    debMsg("Executing kernel ApplyMatrixViscosityW ", 3);
    debMsg("Kernel range"
               << " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
           4);
  };
  // TBB body: 3D case parallelizes over k slices, 2D case over j rows (i/j start at 1
  // because the kernel was constructed with bnd = 1).
  void operator()(const tbb::blocked_range<IndexInt> &__r) const
  {
    const int _maxX = maxX;
    const int _maxY = maxY;
    if (maxZ > 1) {
      for (int k = __r.begin(); k != (int)__r.end(); k++)
        for (int j = 1; j < _maxY; j++)
          for (int i = 1; i < _maxX; i++)
            op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
    else {
      // NOTE(review): with k = 0, op() still reads k-1/k+1 neighbors; presumably this
      // kernel is only ever invoked on 3D grids — confirm against callers.
      const int k = 0;
      for (int j = __r.begin(); j != (int)__r.end(); j++)
        for (int i = 1; i < _maxX; i++)
          op(i, j, k, flags, dst, src, matrixA, vecRhs);
    }
  }
  void run()
  {
    if (maxZ > 1)
      tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
    else
      tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
  }
  const FlagGrid &flags;
  Grid<Real> &dst;
  const Grid<Real> &src;
  const std::vector<Grid<Real> *> matrixA;
  const std::vector<Grid<Real> *> vecRhs;
};
/* NOTE: Use this template for new matrix application kernels
//! Template for matrix application kernels
KERNEL()
void ApplyMatrixTemplate (const FlagGrid& flags, Grid<Real>& dst, const Grid<Real>& src,
const std::vector<Grid<Real> *> matrixA, const std::vector<Grid<Real> *> vecRhs)
{
// The kernel must define how to use the grids from the matrixA and vecRhs lists
}
*/
//! Kernel: Construct the matrix for the poisson equation
struct MakeLaplaceMatrix : public KernelBase {
@@ -409,7 +687,7 @@ struct MakeLaplaceMatrix : public KernelBase {
Grid<Real> &Ai,
Grid<Real> &Aj,
Grid<Real> &Ak,
const MACGrid *fractions = 0)
const MACGrid *fractions = 0) const
{
if (!flags.isFluid(i, j, k))
return;
@@ -487,41 +765,36 @@ struct MakeLaplaceMatrix : public KernelBase {
return fractions;
}
typedef MACGrid type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MakeLaplaceMatrix ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const FlagGrid &flags = getArg0();
Grid<Real> &A0 = getArg1();
Grid<Real> &Ai = getArg2();
Grid<Real> &Aj = getArg3();
Grid<Real> &Ak = getArg4();
const MACGrid *fractions = getArg5();
#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
}
}
else {
const int k = 0;
const FlagGrid &flags = getArg0();
Grid<Real> &A0 = getArg1();
Grid<Real> &Ai = getArg2();
Grid<Real> &Aj = getArg3();
Grid<Real> &Ak = getArg4();
const MACGrid *fractions = getArg5();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, A0, Ai, Aj, Ak, fractions);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &A0;

View File

@@ -236,7 +236,13 @@ struct SetLevelsetBoundaries : public KernelBase {
return phi;
}
typedef Grid<Real> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel SetLevelsetBoundaries ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void run()
{
const int _maxX = maxX;
@@ -291,8 +297,14 @@ struct knExtrapolateMACSimple : public KernelBase {
runMessage();
run();
}
inline void op(
int i, int j, int k, MACGrid &vel, int distance, Grid<int> &tmp, const int d, const int c)
inline void op(int i,
int j,
int k,
MACGrid &vel,
int distance,
Grid<int> &tmp,
const int d,
const int c) const
{
static const Vec3i nb[6] = {Vec3i(1, 0, 0),
Vec3i(-1, 0, 0),
@@ -347,34 +359,36 @@ struct knExtrapolateMACSimple : public KernelBase {
return c;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knExtrapolateMACSimple ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, distance, tmp, d, c);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, distance, tmp, d, c);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, distance, tmp, d, c);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
MACGrid &vel;
int distance;
@@ -391,7 +405,7 @@ struct knExtrapolateIntoBnd : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, const MACGrid &velTmp)
inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, const MACGrid &velTmp) const
{
int c = 0;
Vec3 v(0, 0, 0);
@@ -453,34 +467,36 @@ struct knExtrapolateIntoBnd : public KernelBase {
return velTmp;
}
typedef MACGrid type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knExtrapolateIntoBnd ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTmp);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTmp);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTmp);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
FlagGrid &flags;
MACGrid &vel;
@@ -522,7 +538,8 @@ struct knUnprojectNormalComp : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, FlagGrid &flags, MACGrid &vel, Grid<Real> &phi, Real maxDist)
inline void op(
int i, int j, int k, FlagGrid &flags, MACGrid &vel, Grid<Real> &phi, Real maxDist) const
{
// apply inside, within range near obstacle surface
if (phi(i, j, k) > 0. || phi(i, j, k) < -maxDist)
@@ -556,34 +573,36 @@ struct knUnprojectNormalComp : public KernelBase {
return maxDist;
}
typedef Real type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knUnprojectNormalComp ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, phi, maxDist);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, phi, maxDist);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, phi, maxDist);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
FlagGrid &flags;
MACGrid &vel;
@@ -693,7 +712,7 @@ struct knExtrapolateMACFromWeight : public KernelBase {
Grid<Vec3> &weight,
int distance,
const int d,
const int c)
const int c) const
{
static const Vec3i nb[6] = {Vec3i(1, 0, 0),
Vec3i(-1, 0, 0),
@@ -747,34 +766,36 @@ struct knExtrapolateMACFromWeight : public KernelBase {
return c;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knExtrapolateMACFromWeight ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, weight, distance, d, c);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, weight, distance, d, c);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, weight, distance, d, c);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
MACGrid &vel;
Grid<Vec3> &weight;
@@ -990,8 +1011,14 @@ template<class S> struct knExtrapolateLsSimple : public KernelBase {
runMessage();
run();
}
inline void op(
int i, int j, int k, Grid<S> &val, int distance, Grid<int> &tmp, const int d, S direction)
inline void op(int i,
int j,
int k,
Grid<S> &val,
int distance,
Grid<int> &tmp,
const int d,
S direction) const
{
const int dim = (val.is3D() ? 3 : 2);
if (tmp(i, j, k) != 0)
@@ -1038,34 +1065,36 @@ template<class S> struct knExtrapolateLsSimple : public KernelBase {
return direction;
}
typedef S type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knExtrapolateLsSimple ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, val, distance, tmp, d, direction);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, val, distance, tmp, d, direction);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, val, distance, tmp, d, direction);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<S> &val;
int distance;
@@ -1081,7 +1110,7 @@ template<class S> struct knSetRemaining : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<S> &phi, Grid<int> &tmp, S distance)
inline void op(int i, int j, int k, Grid<S> &phi, Grid<int> &tmp, S distance) const
{
if (tmp(i, j, k) != 0)
return;
@@ -1102,34 +1131,36 @@ template<class S> struct knSetRemaining : public KernelBase {
return distance;
}
typedef S type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knSetRemaining ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, tmp, distance);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, tmp, distance);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, tmp, distance);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<S> &phi;
Grid<int> &tmp;

View File

@@ -1256,7 +1256,7 @@ struct knQuantize : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<Real> &grid, Real step)
inline void op(IndexInt idx, Grid<Real> &grid, Real step) const
{
quantizeReal(grid(idx), step);
}
@@ -1270,17 +1270,21 @@ struct knQuantize : public KernelBase {
return step;
}
typedef Real type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knQuantize ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, grid, step);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, grid, step);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &grid;
Real step;
@@ -1327,7 +1331,7 @@ struct knQuantizeVec3 : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<Vec3> &grid, Real step)
inline void op(IndexInt idx, Grid<Vec3> &grid, Real step) const
{
for (int c = 0; c < 3; ++c)
quantizeReal(grid(idx)[c], step);
@@ -1342,17 +1346,21 @@ struct knQuantizeVec3 : public KernelBase {
return step;
}
typedef Real type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knQuantizeVec3 ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, grid, step);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, grid, step);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Vec3> &grid;
Real step;

View File

@@ -66,7 +66,7 @@ void *safeGzopen(const char *filename, const char *mode)
#endif // NO_ZLIB != 1
}
#if OPENVDB == 1
#if defined(OPENVDB)
// Convert from OpenVDB value to Manta value.
template<class S, class T> void convertFrom(S &in, T *out)
{

View File

@@ -29,10 +29,10 @@
#if OPENVDB == 1
# include "openvdb/openvdb.h"
# include <openvdb/points/PointConversion.h>
# include <openvdb/points/PointCount.h>
# include <openvdb/tools/Clip.h>
# include <openvdb/tools/Dense.h>
# include "openvdb/points/PointConversion.h"
# include "openvdb/points/PointCount.h"
# include "openvdb/tools/Clip.h"
# include "openvdb/tools/Dense.h"
#endif
#define POSITION_NAME "P"
@@ -433,9 +433,9 @@ int writeObjectsVDB(const string &filename,
debMsg("Writing int grid '" << mantaGrid->getName() << "' to vdb file " << filename, 1);
Grid<int> *mantaIntGrid = (Grid<int> *)mantaGrid;
if (clipGrid && mantaIntGrid->saveSparse()) {
// assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
// exported grid must have the same size " << clipGrid->getSize() << " vs " <<
// mantaGrid->getSize());
assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
"writeObjectsVDB: Clip grid and exported grid must have the same size "
<< clipGrid->getSize() << " vs " << mantaGrid->getSize());
}
vdbGrid = exportVDB<int, openvdb::Int32Grid>(mantaIntGrid, clip, vdbClipGrid);
gridsVDB.push_back(vdbGrid);
@@ -448,9 +448,9 @@ int writeObjectsVDB(const string &filename,
// Only supply clip grid if real grid is not equal to the clip grid
openvdb::FloatGrid::Ptr tmpClipGrid = (mantaRealGrid == clipGrid) ? nullptr : vdbClipGrid;
if (clipGrid && mantaRealGrid->saveSparse()) {
// assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
// exported grid must have the same size " << clipGrid->getSize() << " vs " <<
// mantaGrid->getSize());
assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
"writeObjectsVDB: Clip grid and exported grid must have the same size "
<< clipGrid->getSize() << " vs " << mantaGrid->getSize());
}
vdbGrid = exportVDB<Real, openvdb::FloatGrid>(mantaRealGrid, clip, tmpClipGrid);
gridsVDB.push_back(vdbGrid);
@@ -461,9 +461,9 @@ int writeObjectsVDB(const string &filename,
openvdb::GRID_UNKNOWN;
Grid<Vec3> *mantaVec3Grid = (Grid<Vec3> *)mantaGrid;
if (clipGrid && mantaVec3Grid->saveSparse()) {
// assertMsg(clipGrid->getSize() == mantaGrid->getSize(), "writeObjectsVDB: Clip grid and
// exported grid must have the same size " << clipGrid->getSize() << " vs " <<
// mantaGrid->getSize());
assertMsg(clipGrid->getSize() == mantaGrid->getSize(),
"writeObjectsVDB: Clip grid and exported grid must have the same size "
<< clipGrid->getSize() << " vs " << mantaGrid->getSize());
}
vdbGrid = exportVDB<Vec3, openvdb::Vec3SGrid>(mantaVec3Grid, clip, vdbClipGrid);
gridsVDB.push_back(vdbGrid);
@@ -519,7 +519,7 @@ int writeObjectsVDB(const string &filename,
}
}
// Write only if the is at least one grid, optionally write with compression.
// Write only if there is at least one grid, optionally write with compression.
if (gridsVDB.size()) {
int vdb_flags = openvdb::io::COMPRESS_ACTIVE_MASK;
switch (compression) {
@@ -534,7 +534,8 @@ int writeObjectsVDB(const string &filename,
}
case COMPRESSION_BLOSC: {
# if OPENVDB_BLOSC == 1
vdb_flags |= openvdb::io::COMPRESS_BLOSC;
// Cannot use |= here, causes segfault with blosc 1.5.0 (== recommended version)
vdb_flags = openvdb::io::COMPRESS_BLOSC;
# else
debMsg("OpenVDB was built without Blosc support, using Zip compression instead", 1);
vdb_flags |= openvdb::io::COMPRESS_ZIP;
@@ -695,28 +696,36 @@ int readObjectsVDB(const string &filename, std::vector<PbClass *> *objects, floa
// Compare metadata with allocated grid setup. This prevents invalid index access.
if (notZero(metaRes) && metaRes != origRes) {
// debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
// Meta grid res " << metaRes << " vs " << origRes << " current grid size", 1);
debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName()
<< "' has not been read. Meta grid res " << metaRes
<< " vs " << origRes << " current grid size",
1);
readFailure++;
break;
}
if (notZero(metaVoxelSize) && metaVoxelSize != voxelSize) {
// debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
// Meta voxel size " << metaVoxelSize << " vs " << voxelSize << " current voxel size", 1);
debMsg("readObjectsVDB Warning: Grid '"
<< vdbGrid->getName() << "' has not been read. Meta voxel size "
<< metaVoxelSize << " vs " << voxelSize << " current voxel size",
1);
readFailure++;
break;
}
if (metaBBoxMax.x > origRes.x || metaBBoxMax.y > origRes.y || metaBBoxMax.z > origRes.z) {
// debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
// Vdb bbox max " << metaBBoxMax << " vs " << origRes << " current grid size", 1);
debMsg("readObjectsVDB Warning: Grid '"
<< vdbGrid->getName() << "' has not been read. Vdb bbox max " << metaBBoxMax
<< " vs " << origRes << " current grid size",
1);
readFailure++;
break;
}
const Vec3i origOrigin(0);
if (metaBBoxMin.x < origOrigin.x || metaBBoxMin.y < origOrigin.y ||
metaBBoxMin.z < origOrigin.z) {
// debMsg("readObjectsVDB Warning: Grid '" << vdbGrid->getName() << "' has not been read.
// Vdb bbox min " << metaBBoxMin << " vs " << origOrigin << " current grid origin", 1);
debMsg("readObjectsVDB Warning: Grid '"
<< vdbGrid->getName() << "' has not been read. Vdb bbox min " << metaBBoxMin
<< " vs " << origOrigin << " current grid origin",
1);
readFailure++;
break;
}

View File

@@ -384,6 +384,7 @@ class FluidSolver : public PbClass {
GridStorage<Real> mGrids4dReal;
GridStorage<Vec3> mGrids4dVec;
GridStorage<Vec4> mGrids4dVec4;
public:
PbArgs _args;
}

View File

@@ -242,6 +242,39 @@ inline bool c_isnan(float c)
return d != d;
}
//! Order two values in-place so that a <= b holds afterwards.
template<class T>
inline void sort(T &a, T &b)
{
  if (a > b) {
    std::swap(a, b);
  }
}
//! Order three values in-place so that a <= b <= c holds afterwards.
//! Implemented as a minimal three-step compare-exchange network.
template<class T>
inline void sort(T &a, T &b, T &c)
{
  if (a > b) {
    std::swap(a, b);
  }
  if (a > c) {
    std::swap(a, c);
  }
  if (b > c) {
    std::swap(b, c);
  }
}
//! Order four values in-place so that a <= b <= c <= d holds afterwards.
//! Five-comparator sorting network: (a,b) (c,d) (a,c) (b,d) (b,c).
template<class T>
inline void sort(T &a, T &b, T &c, T &d)
{
  if (a > b) {
    std::swap(a, b);
  }
  if (c > d) {
    std::swap(c, d);
  }
  if (a > c) {
    std::swap(a, c);
  }
  if (b > d) {
    std::swap(b, d);
  }
  if (b > c) {
    std::swap(b, c);
  }
}
} // namespace Manta
#endif

View File

@@ -1,3 +1,3 @@
#define MANTA_GIT_VERSION "commit 39b7a415721ecbf6643612a24e8eadd221aeb934"
#define MANTA_GIT_VERSION "commit d5d9a6c28daa8f21426d7a285f48639c0d8fd13f"

File diff suppressed because it is too large Load Diff

View File

@@ -383,15 +383,13 @@ class GridBase : public PbClass {
}
}
// TODO (sebbas): Moved attributes to public for now
protected:
GridType mType;
Vec3i mSize;
Real mDx;
bool m3D;
// precomputed Z shift: to ensure 2D compatibility, always use this instead of sx*sy !
bool m3D; // precomputed Z shift: to ensure 2D compatibility, always use this instead of sx*sy !
IndexInt mStrideZ;
protected:
public:
PbArgs _args;
}
@@ -403,7 +401,7 @@ class GridBase : public PbClass {
template<class T> class Grid : public GridBase {
public:
//! init new grid, values are set to zero
Grid(FluidSolver *parent, bool show = true, bool sparse = false, bool offload = false);
Grid(FluidSolver *parent, bool show = true, bool sparse = false);
static int _W_10(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
PbClass *obj = Pb::objFromPy(_self);
@@ -418,8 +416,7 @@ template<class T> class Grid : public GridBase {
FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
bool show = _args.getOpt<bool>("show", 1, true, &_lock);
bool sparse = _args.getOpt<bool>("sparse", 2, false, &_lock);
bool offload = _args.getOpt<bool>("offload", 3, false, &_lock);
obj = new Grid(parent, show, sparse, offload);
obj = new Grid(parent, show, sparse);
obj->registerObject(_self, &_args);
_args.check();
}
@@ -493,7 +490,7 @@ template<class T> class Grid : public GridBase {
}
//! set all cells to zero
void clear(bool isOmp = false);
void clear();
static PyObject *_W_13(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
@@ -504,10 +501,9 @@ template<class T> class Grid : public GridBase {
PyObject *_retval = nullptr;
{
ArgLocker _lock;
bool isOmp = _args.getOpt<bool>("isOmp", 0, false, &_lock);
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->clear(isOmp);
pbo->clear();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::clear", !noTiming);
@@ -595,11 +591,6 @@ template<class T> class Grid : public GridBase {
{
return mSaveSparse;
}
//! query if this grid should be saved as a sparse grid
inline bool isOffload()
{
return mOffload;
}
//! set data
inline void set(int i, int j, int k, T &val)
@@ -625,8 +616,9 @@ template<class T> class Grid : public GridBase {
return interpol<T>(mData, mSize, mStrideZ, pos);
case 2:
return interpolCubic<T>(mData, mSize, mStrideZ, pos);
default:
assertMsg(false, "Unknown interpolation order " << order);
}
// default: assertMsg(false, "Unknown interpolation order "<<order); }
return T(0.); // should never be reached, just to prevent compiler warnings
}
@@ -636,7 +628,7 @@ template<class T> class Grid : public GridBase {
//! content...
// Grid<T>& operator=(const Grid<T>& a);
//! copy content from other grid (use this one instead of operator= !)
Grid<T> &copyFrom(const Grid<T> &a, bool copyType = true, bool isOmp = false);
Grid<T> &copyFrom(const Grid<T> &a, bool copyType = true);
static PyObject *_W_14(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
@@ -649,9 +641,8 @@ template<class T> class Grid : public GridBase {
ArgLocker _lock;
const Grid<T> &a = *_args.getPtr<Grid<T>>("a", 0, &_lock);
bool copyType = _args.getOpt<bool>("copyType", 1, true, &_lock);
bool isOmp = _args.getOpt<bool>("isOmp", 2, false, &_lock);
pbo->_args.copy(_args);
_retval = toPy(pbo->copyFrom(a, copyType, isOmp));
_retval = toPy(pbo->copyFrom(a, copyType));
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::copyFrom", !noTiming);
@@ -1285,157 +1276,6 @@ template<class T> class Grid : public GridBase {
}
}
//! OpenMP data mapping from / to target device
void mapToOmp();
static PyObject *_W_38(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::mapToOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->mapToOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::mapToOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::mapToOmp", e.what());
return 0;
}
}
void mapFromOmp();
static PyObject *_W_39(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::mapFromOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->mapFromOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::mapFromOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::mapFromOmp", e.what());
return 0;
}
}
void mapAllocOmp();
static PyObject *_W_40(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::mapAllocOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->mapAllocOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::mapAllocOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::mapAllocOmp", e.what());
return 0;
}
}
void mapDeleteOmp();
static PyObject *_W_41(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::mapDeleteOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->mapDeleteOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::mapDeleteOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::mapDeleteOmp", e.what());
return 0;
}
}
void updateToOmp();
static PyObject *_W_42(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::updateToOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->updateToOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::updateToOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::updateToOmp", e.what());
return 0;
}
}
void updateFromOmp();
static PyObject *_W_43(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
Grid *pbo = dynamic_cast<Grid *>(Pb::objFromPy(_self));
bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
pbPreparePlugin(pbo->getParent(), "Grid::updateFromOmp", !noTiming);
PyObject *_retval = nullptr;
{
ArgLocker _lock;
pbo->_args.copy(_args);
_retval = getPyNone();
pbo->updateFromOmp();
pbo->_args.check();
}
pbFinalizePlugin(pbo->getParent(), "Grid::updateFromOmp", !noTiming);
return _retval;
}
catch (std::exception &e) {
pbSetError("Grid::updateFromOmp", e.what());
return 0;
}
}
// c++ only operators
template<class S> Grid<T> &operator+=(const Grid<S> &a);
template<class S> Grid<T> &operator+=(const S &a);
@@ -1459,12 +1299,10 @@ template<class T> class Grid : public GridBase {
return mData[index(i, j, k)];
}
// TODO (sebbas): Moved attributes to public for now
T *mData;
protected:
T *mData;
bool mExternalData; // True if mData is managed outside of the Fluidsolver
bool mSaveSparse; // True if this grid may be cached in a sparse structure
bool mOffload; // True if this grid shall be allocated on an OpenMP offload device too
public:
PbArgs _args;
}
@@ -1476,12 +1314,12 @@ template<class T> class Grid : public GridBase {
//! Special function for staggered grids
class MACGrid : public Grid<Vec3> {
public:
MACGrid(FluidSolver *parent, bool show = true, bool sparse = false, bool offload = false)
: Grid<Vec3>(parent, show, sparse, offload)
MACGrid(FluidSolver *parent, bool show = true, bool sparse = false)
: Grid<Vec3>(parent, show, sparse)
{
mType = (GridType)(TypeMAC | TypeVec3);
}
static int _W_44(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static int _W_38(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
PbClass *obj = Pb::objFromPy(_self);
if (obj)
@@ -1495,8 +1333,7 @@ class MACGrid : public Grid<Vec3> {
FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
bool show = _args.getOpt<bool>("show", 1, true, &_lock);
bool sparse = _args.getOpt<bool>("sparse", 2, false, &_lock);
bool offload = _args.getOpt<bool>("offload", 3, false, &_lock);
obj = new MACGrid(parent, show, sparse, offload);
obj = new MACGrid(parent, show, sparse);
obj->registerObject(_self, &_args);
_args.check();
}
@@ -1555,16 +1392,17 @@ class MACGrid : public Grid<Vec3> {
case 1:
return interpolComponent<comp>(mData, mSize, mStrideZ, pos);
case 2:
return interpolCubicMAC(mData, mSize, mStrideZ, pos)[comp];
} // warning - not yet optimized
// default: assertMsg(false, "Unknown interpolation order "<<order); }
return interpolCubicMAC(mData, mSize, mStrideZ, pos)[comp]; // warning - not yet optimized
default:
assertMsg(false, "Unknown interpolation order " << order);
}
return 0.; // should never be reached, just to prevent compiler warnings
}
//! set all boundary cells of a MAC grid to certain value (Dirchlet). Respects staggered grid
//! locations optionally, only set normal components
void setBoundMAC(Vec3 value, int boundaryWidth, bool normalOnly = false);
static PyObject *_W_45(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_39(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -1601,16 +1439,12 @@ class MACGrid : public Grid<Vec3> {
//! Special functions for FlagGrid
class FlagGrid : public Grid<int> {
public:
FlagGrid(FluidSolver *parent,
int dim = 3,
bool show = true,
bool sparse = false,
bool offload = false)
: Grid<int>(parent, show, sparse, offload)
FlagGrid(FluidSolver *parent, int dim = 3, bool show = true, bool sparse = false)
: Grid<int>(parent, show, sparse)
{
mType = (GridType)(TypeFlags | TypeInt);
}
static int _W_46(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static int _W_40(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
PbClass *obj = Pb::objFromPy(_self);
if (obj)
@@ -1625,8 +1459,7 @@ class FlagGrid : public Grid<int> {
int dim = _args.getOpt<int>("dim", 1, 3, &_lock);
bool show = _args.getOpt<bool>("show", 2, true, &_lock);
bool sparse = _args.getOpt<bool>("sparse", 3, false, &_lock);
bool offload = _args.getOpt<bool>("offload", 4, false, &_lock);
obj = new FlagGrid(parent, dim, show, sparse, offload);
obj = new FlagGrid(parent, dim, show, sparse);
obj->registerObject(_self, &_args);
_args.check();
}
@@ -1792,7 +1625,7 @@ class FlagGrid : public Grid<int> {
const std::string &inflow = " ",
const std::string &outflow = " ",
Grid<Real> *phiWalls = 0x00);
static PyObject *_W_47(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_41(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -1826,7 +1659,7 @@ class FlagGrid : public Grid<int> {
//! set fluid flags inside levelset (liquids)
void updateFromLevelset(LevelsetGrid &levelset);
static PyObject *_W_48(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_42(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -1853,7 +1686,7 @@ class FlagGrid : public Grid<int> {
//! set all cells (except obs/in/outflow) to type (fluid by default)
void fillGrid(int type = TypeFluid);
static PyObject *_W_49(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_43(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -1882,7 +1715,7 @@ class FlagGrid : public Grid<int> {
//! warning for large grids! only regular int returned (due to python interface)
//! optionally creates mask in RealGrid (1 where flag matches, 0 otherwise)
int countCells(int flag, int bnd = 0, Grid<Real> *mask = nullptr);
static PyObject *_W_50(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_44(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -2035,7 +1868,7 @@ template<class T, class S> struct gridAdd : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
{
me[idx] += other[idx];
}
@@ -2049,17 +1882,21 @@ template<class T, class S> struct gridAdd : public KernelBase {
return other;
}
typedef Grid<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridAdd ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<S> &other;
@@ -2070,7 +1907,7 @@ template<class T, class S> struct gridSub : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
{
me[idx] -= other[idx];
}
@@ -2084,17 +1921,21 @@ template<class T, class S> struct gridSub : public KernelBase {
return other;
}
typedef Grid<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridSub ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<S> &other;
@@ -2105,7 +1946,7 @@ template<class T, class S> struct gridMult : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
{
me[idx] *= other[idx];
}
@@ -2119,17 +1960,21 @@ template<class T, class S> struct gridMult : public KernelBase {
return other;
}
typedef Grid<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridMult ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<S> &other;
@@ -2140,7 +1985,7 @@ template<class T, class S> struct gridDiv : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other)
inline void op(IndexInt idx, Grid<T> &me, const Grid<S> &other) const
{
me[idx] /= other[idx];
}
@@ -2154,17 +1999,21 @@ template<class T, class S> struct gridDiv : public KernelBase {
return other;
}
typedef Grid<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridDiv ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<S> &other;
@@ -2175,7 +2024,7 @@ template<class T, class S> struct gridAddScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const S &other)
inline void op(IndexInt idx, Grid<T> &me, const S &other) const
{
me[idx] += other;
}
@@ -2189,17 +2038,21 @@ template<class T, class S> struct gridAddScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridAddScalar ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const S &other;
@@ -2210,7 +2063,7 @@ template<class T, class S> struct gridMultScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const S &other)
inline void op(IndexInt idx, Grid<T> &me, const S &other) const
{
me[idx] *= other;
}
@@ -2224,22 +2077,25 @@ template<class T, class S> struct gridMultScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridMultScalar ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const S &other;
};
template<class T, class S> struct gridScaledAdd : public KernelBase {
gridScaledAdd(Grid<T> &me, const Grid<T> &other, const S &factor)
: KernelBase(&me, 0), me(me), other(other), factor(factor)
@@ -2247,9 +2103,8 @@ template<class T, class S> struct gridScaledAdd : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<T> &me, const Grid<T> &other, const S &factor)
inline void op(IndexInt idx, Grid<T> &me, const Grid<T> &other, const S &factor) const
{
const IndexInt idx = me.index(i, j, k);
me[idx] += factor * other[idx];
}
inline Grid<T> &getArg0()
@@ -2267,35 +2122,21 @@ template<class T, class S> struct gridScaledAdd : public KernelBase {
return factor;
}
typedef S type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridScaledAdd ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other, factor);
}
void run()
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
Grid<T> &me = getArg0();
const Grid<T> &other = getArg1();
const S &factor = getArg2();
#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, me, other, factor);
}
}
else {
const int k = 0;
Grid<T> &me = getArg0();
const Grid<T> &other = getArg1();
const S &factor = getArg2();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, me, other, factor);
}
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<T> &other;
@@ -2308,7 +2149,7 @@ template<class T> struct gridSetConst : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &grid, T value)
inline void op(IndexInt idx, Grid<T> &grid, T value) const
{
grid[idx] = value;
}
@@ -2322,17 +2163,21 @@ template<class T> struct gridSetConst : public KernelBase {
return value;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel gridSetConst ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, grid, value);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, grid, value);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &grid;
T value;
@@ -2434,7 +2279,7 @@ template<class S> struct knInterpolateGridTempl : public KernelBase {
const Grid<S> &source,
const Vec3 &sourceFactor,
Vec3 offset,
int orderSpace = 1)
int orderSpace = 1) const
{
Vec3 pos = Vec3(i, j, k) * sourceFactor + offset;
if (!source.is3D())
@@ -2466,34 +2311,36 @@ template<class S> struct knInterpolateGridTempl : public KernelBase {
return orderSpace;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knInterpolateGridTempl ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, offset, orderSpace);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, offset, orderSpace);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, offset, orderSpace);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<S> &target;
const Grid<S> &source;

View File

@@ -8,11 +8,11 @@ namespace Manta {
#ifdef _C_FlagGrid
static const Pb::Register _R_26("FlagGrid", "FlagGrid", "Grid<int>");
template<> const char *Namify<FlagGrid>::S = "FlagGrid";
static const Pb::Register _R_27("FlagGrid", "FlagGrid", FlagGrid::_W_46);
static const Pb::Register _R_28("FlagGrid", "initDomain", FlagGrid::_W_47);
static const Pb::Register _R_29("FlagGrid", "updateFromLevelset", FlagGrid::_W_48);
static const Pb::Register _R_30("FlagGrid", "fillGrid", FlagGrid::_W_49);
static const Pb::Register _R_31("FlagGrid", "countCells", FlagGrid::_W_50);
static const Pb::Register _R_27("FlagGrid", "FlagGrid", FlagGrid::_W_40);
static const Pb::Register _R_28("FlagGrid", "initDomain", FlagGrid::_W_41);
static const Pb::Register _R_29("FlagGrid", "updateFromLevelset", FlagGrid::_W_42);
static const Pb::Register _R_30("FlagGrid", "fillGrid", FlagGrid::_W_43);
static const Pb::Register _R_31("FlagGrid", "countCells", FlagGrid::_W_44);
#endif
#ifdef _C_Grid
static const Pb::Register _R_32("Grid<int>", "Grid<int>", "GridBase");
@@ -45,104 +45,86 @@ static const Pb::Register _R_57("Grid<int>", "setBound", Grid<int>::_W_34);
static const Pb::Register _R_58("Grid<int>", "setBoundNeumann", Grid<int>::_W_35);
static const Pb::Register _R_59("Grid<int>", "getDataPointer", Grid<int>::_W_36);
static const Pb::Register _R_60("Grid<int>", "printGrid", Grid<int>::_W_37);
static const Pb::Register _R_61("Grid<int>", "mapToOmp", Grid<int>::_W_38);
static const Pb::Register _R_62("Grid<int>", "mapFromOmp", Grid<int>::_W_39);
static const Pb::Register _R_63("Grid<int>", "mapAllocOmp", Grid<int>::_W_40);
static const Pb::Register _R_64("Grid<int>", "mapDeleteOmp", Grid<int>::_W_41);
static const Pb::Register _R_65("Grid<int>", "updateToOmp", Grid<int>::_W_42);
static const Pb::Register _R_66("Grid<int>", "updateFromOmp", Grid<int>::_W_43);
static const Pb::Register _R_67("Grid<Real>", "Grid<Real>", "GridBase");
static const Pb::Register _R_61("Grid<Real>", "Grid<Real>", "GridBase");
template<> const char *Namify<Grid<Real>>::S = "Grid<Real>";
static const Pb::Register _R_68("Grid<Real>", "Grid", Grid<Real>::_W_10);
static const Pb::Register _R_69("Grid<Real>", "save", Grid<Real>::_W_11);
static const Pb::Register _R_70("Grid<Real>", "load", Grid<Real>::_W_12);
static const Pb::Register _R_71("Grid<Real>", "clear", Grid<Real>::_W_13);
static const Pb::Register _R_72("Grid<Real>", "copyFrom", Grid<Real>::_W_14);
static const Pb::Register _R_73("Grid<Real>", "getGridType", Grid<Real>::_W_15);
static const Pb::Register _R_74("Grid<Real>", "add", Grid<Real>::_W_16);
static const Pb::Register _R_75("Grid<Real>", "sub", Grid<Real>::_W_17);
static const Pb::Register _R_76("Grid<Real>", "setConst", Grid<Real>::_W_18);
static const Pb::Register _R_77("Grid<Real>", "addConst", Grid<Real>::_W_19);
static const Pb::Register _R_78("Grid<Real>", "addScaled", Grid<Real>::_W_20);
static const Pb::Register _R_79("Grid<Real>", "mult", Grid<Real>::_W_21);
static const Pb::Register _R_80("Grid<Real>", "multConst", Grid<Real>::_W_22);
static const Pb::Register _R_81("Grid<Real>", "safeDivide", Grid<Real>::_W_23);
static const Pb::Register _R_82("Grid<Real>", "clamp", Grid<Real>::_W_24);
static const Pb::Register _R_83("Grid<Real>", "stomp", Grid<Real>::_W_25);
static const Pb::Register _R_84("Grid<Real>", "permuteAxes", Grid<Real>::_W_26);
static const Pb::Register _R_85("Grid<Real>", "permuteAxesCopyToGrid", Grid<Real>::_W_27);
static const Pb::Register _R_86("Grid<Real>", "join", Grid<Real>::_W_28);
static const Pb::Register _R_87("Grid<Real>", "getMaxAbs", Grid<Real>::_W_29);
static const Pb::Register _R_88("Grid<Real>", "getMax", Grid<Real>::_W_30);
static const Pb::Register _R_89("Grid<Real>", "getMin", Grid<Real>::_W_31);
static const Pb::Register _R_90("Grid<Real>", "getL1", Grid<Real>::_W_32);
static const Pb::Register _R_91("Grid<Real>", "getL2", Grid<Real>::_W_33);
static const Pb::Register _R_92("Grid<Real>", "setBound", Grid<Real>::_W_34);
static const Pb::Register _R_93("Grid<Real>", "setBoundNeumann", Grid<Real>::_W_35);
static const Pb::Register _R_94("Grid<Real>", "getDataPointer", Grid<Real>::_W_36);
static const Pb::Register _R_95("Grid<Real>", "printGrid", Grid<Real>::_W_37);
static const Pb::Register _R_96("Grid<Real>", "mapToOmp", Grid<Real>::_W_38);
static const Pb::Register _R_97("Grid<Real>", "mapFromOmp", Grid<Real>::_W_39);
static const Pb::Register _R_98("Grid<Real>", "mapAllocOmp", Grid<Real>::_W_40);
static const Pb::Register _R_99("Grid<Real>", "mapDeleteOmp", Grid<Real>::_W_41);
static const Pb::Register _R_100("Grid<Real>", "updateToOmp", Grid<Real>::_W_42);
static const Pb::Register _R_101("Grid<Real>", "updateFromOmp", Grid<Real>::_W_43);
static const Pb::Register _R_102("Grid<Vec3>", "Grid<Vec3>", "GridBase");
static const Pb::Register _R_62("Grid<Real>", "Grid", Grid<Real>::_W_10);
static const Pb::Register _R_63("Grid<Real>", "save", Grid<Real>::_W_11);
static const Pb::Register _R_64("Grid<Real>", "load", Grid<Real>::_W_12);
static const Pb::Register _R_65("Grid<Real>", "clear", Grid<Real>::_W_13);
static const Pb::Register _R_66("Grid<Real>", "copyFrom", Grid<Real>::_W_14);
static const Pb::Register _R_67("Grid<Real>", "getGridType", Grid<Real>::_W_15);
static const Pb::Register _R_68("Grid<Real>", "add", Grid<Real>::_W_16);
static const Pb::Register _R_69("Grid<Real>", "sub", Grid<Real>::_W_17);
static const Pb::Register _R_70("Grid<Real>", "setConst", Grid<Real>::_W_18);
static const Pb::Register _R_71("Grid<Real>", "addConst", Grid<Real>::_W_19);
static const Pb::Register _R_72("Grid<Real>", "addScaled", Grid<Real>::_W_20);
static const Pb::Register _R_73("Grid<Real>", "mult", Grid<Real>::_W_21);
static const Pb::Register _R_74("Grid<Real>", "multConst", Grid<Real>::_W_22);
static const Pb::Register _R_75("Grid<Real>", "safeDivide", Grid<Real>::_W_23);
static const Pb::Register _R_76("Grid<Real>", "clamp", Grid<Real>::_W_24);
static const Pb::Register _R_77("Grid<Real>", "stomp", Grid<Real>::_W_25);
static const Pb::Register _R_78("Grid<Real>", "permuteAxes", Grid<Real>::_W_26);
static const Pb::Register _R_79("Grid<Real>", "permuteAxesCopyToGrid", Grid<Real>::_W_27);
static const Pb::Register _R_80("Grid<Real>", "join", Grid<Real>::_W_28);
static const Pb::Register _R_81("Grid<Real>", "getMaxAbs", Grid<Real>::_W_29);
static const Pb::Register _R_82("Grid<Real>", "getMax", Grid<Real>::_W_30);
static const Pb::Register _R_83("Grid<Real>", "getMin", Grid<Real>::_W_31);
static const Pb::Register _R_84("Grid<Real>", "getL1", Grid<Real>::_W_32);
static const Pb::Register _R_85("Grid<Real>", "getL2", Grid<Real>::_W_33);
static const Pb::Register _R_86("Grid<Real>", "setBound", Grid<Real>::_W_34);
static const Pb::Register _R_87("Grid<Real>", "setBoundNeumann", Grid<Real>::_W_35);
static const Pb::Register _R_88("Grid<Real>", "getDataPointer", Grid<Real>::_W_36);
static const Pb::Register _R_89("Grid<Real>", "printGrid", Grid<Real>::_W_37);
static const Pb::Register _R_90("Grid<Vec3>", "Grid<Vec3>", "GridBase");
template<> const char *Namify<Grid<Vec3>>::S = "Grid<Vec3>";
static const Pb::Register _R_103("Grid<Vec3>", "Grid", Grid<Vec3>::_W_10);
static const Pb::Register _R_104("Grid<Vec3>", "save", Grid<Vec3>::_W_11);
static const Pb::Register _R_105("Grid<Vec3>", "load", Grid<Vec3>::_W_12);
static const Pb::Register _R_106("Grid<Vec3>", "clear", Grid<Vec3>::_W_13);
static const Pb::Register _R_107("Grid<Vec3>", "copyFrom", Grid<Vec3>::_W_14);
static const Pb::Register _R_108("Grid<Vec3>", "getGridType", Grid<Vec3>::_W_15);
static const Pb::Register _R_109("Grid<Vec3>", "add", Grid<Vec3>::_W_16);
static const Pb::Register _R_110("Grid<Vec3>", "sub", Grid<Vec3>::_W_17);
static const Pb::Register _R_111("Grid<Vec3>", "setConst", Grid<Vec3>::_W_18);
static const Pb::Register _R_112("Grid<Vec3>", "addConst", Grid<Vec3>::_W_19);
static const Pb::Register _R_113("Grid<Vec3>", "addScaled", Grid<Vec3>::_W_20);
static const Pb::Register _R_114("Grid<Vec3>", "mult", Grid<Vec3>::_W_21);
static const Pb::Register _R_115("Grid<Vec3>", "multConst", Grid<Vec3>::_W_22);
static const Pb::Register _R_116("Grid<Vec3>", "safeDivide", Grid<Vec3>::_W_23);
static const Pb::Register _R_117("Grid<Vec3>", "clamp", Grid<Vec3>::_W_24);
static const Pb::Register _R_118("Grid<Vec3>", "stomp", Grid<Vec3>::_W_25);
static const Pb::Register _R_119("Grid<Vec3>", "permuteAxes", Grid<Vec3>::_W_26);
static const Pb::Register _R_120("Grid<Vec3>", "permuteAxesCopyToGrid", Grid<Vec3>::_W_27);
static const Pb::Register _R_121("Grid<Vec3>", "join", Grid<Vec3>::_W_28);
static const Pb::Register _R_122("Grid<Vec3>", "getMaxAbs", Grid<Vec3>::_W_29);
static const Pb::Register _R_123("Grid<Vec3>", "getMax", Grid<Vec3>::_W_30);
static const Pb::Register _R_124("Grid<Vec3>", "getMin", Grid<Vec3>::_W_31);
static const Pb::Register _R_125("Grid<Vec3>", "getL1", Grid<Vec3>::_W_32);
static const Pb::Register _R_126("Grid<Vec3>", "getL2", Grid<Vec3>::_W_33);
static const Pb::Register _R_127("Grid<Vec3>", "setBound", Grid<Vec3>::_W_34);
static const Pb::Register _R_128("Grid<Vec3>", "setBoundNeumann", Grid<Vec3>::_W_35);
static const Pb::Register _R_129("Grid<Vec3>", "getDataPointer", Grid<Vec3>::_W_36);
static const Pb::Register _R_130("Grid<Vec3>", "printGrid", Grid<Vec3>::_W_37);
static const Pb::Register _R_131("Grid<Vec3>", "mapToOmp", Grid<Vec3>::_W_38);
static const Pb::Register _R_132("Grid<Vec3>", "mapFromOmp", Grid<Vec3>::_W_39);
static const Pb::Register _R_133("Grid<Vec3>", "mapAllocOmp", Grid<Vec3>::_W_40);
static const Pb::Register _R_134("Grid<Vec3>", "mapDeleteOmp", Grid<Vec3>::_W_41);
static const Pb::Register _R_135("Grid<Vec3>", "updateToOmp", Grid<Vec3>::_W_42);
static const Pb::Register _R_136("Grid<Vec3>", "updateFromOmp", Grid<Vec3>::_W_43);
static const Pb::Register _R_91("Grid<Vec3>", "Grid", Grid<Vec3>::_W_10);
static const Pb::Register _R_92("Grid<Vec3>", "save", Grid<Vec3>::_W_11);
static const Pb::Register _R_93("Grid<Vec3>", "load", Grid<Vec3>::_W_12);
static const Pb::Register _R_94("Grid<Vec3>", "clear", Grid<Vec3>::_W_13);
static const Pb::Register _R_95("Grid<Vec3>", "copyFrom", Grid<Vec3>::_W_14);
static const Pb::Register _R_96("Grid<Vec3>", "getGridType", Grid<Vec3>::_W_15);
static const Pb::Register _R_97("Grid<Vec3>", "add", Grid<Vec3>::_W_16);
static const Pb::Register _R_98("Grid<Vec3>", "sub", Grid<Vec3>::_W_17);
static const Pb::Register _R_99("Grid<Vec3>", "setConst", Grid<Vec3>::_W_18);
static const Pb::Register _R_100("Grid<Vec3>", "addConst", Grid<Vec3>::_W_19);
static const Pb::Register _R_101("Grid<Vec3>", "addScaled", Grid<Vec3>::_W_20);
static const Pb::Register _R_102("Grid<Vec3>", "mult", Grid<Vec3>::_W_21);
static const Pb::Register _R_103("Grid<Vec3>", "multConst", Grid<Vec3>::_W_22);
static const Pb::Register _R_104("Grid<Vec3>", "safeDivide", Grid<Vec3>::_W_23);
static const Pb::Register _R_105("Grid<Vec3>", "clamp", Grid<Vec3>::_W_24);
static const Pb::Register _R_106("Grid<Vec3>", "stomp", Grid<Vec3>::_W_25);
static const Pb::Register _R_107("Grid<Vec3>", "permuteAxes", Grid<Vec3>::_W_26);
static const Pb::Register _R_108("Grid<Vec3>", "permuteAxesCopyToGrid", Grid<Vec3>::_W_27);
static const Pb::Register _R_109("Grid<Vec3>", "join", Grid<Vec3>::_W_28);
static const Pb::Register _R_110("Grid<Vec3>", "getMaxAbs", Grid<Vec3>::_W_29);
static const Pb::Register _R_111("Grid<Vec3>", "getMax", Grid<Vec3>::_W_30);
static const Pb::Register _R_112("Grid<Vec3>", "getMin", Grid<Vec3>::_W_31);
static const Pb::Register _R_113("Grid<Vec3>", "getL1", Grid<Vec3>::_W_32);
static const Pb::Register _R_114("Grid<Vec3>", "getL2", Grid<Vec3>::_W_33);
static const Pb::Register _R_115("Grid<Vec3>", "setBound", Grid<Vec3>::_W_34);
static const Pb::Register _R_116("Grid<Vec3>", "setBoundNeumann", Grid<Vec3>::_W_35);
static const Pb::Register _R_117("Grid<Vec3>", "getDataPointer", Grid<Vec3>::_W_36);
static const Pb::Register _R_118("Grid<Vec3>", "printGrid", Grid<Vec3>::_W_37);
#endif
#ifdef _C_GridBase
static const Pb::Register _R_137("GridBase", "GridBase", "PbClass");
static const Pb::Register _R_119("GridBase", "GridBase", "PbClass");
template<> const char *Namify<GridBase>::S = "GridBase";
static const Pb::Register _R_138("GridBase", "GridBase", GridBase::_W_0);
static const Pb::Register _R_139("GridBase", "getSizeX", GridBase::_W_1);
static const Pb::Register _R_140("GridBase", "getSizeY", GridBase::_W_2);
static const Pb::Register _R_141("GridBase", "getSizeZ", GridBase::_W_3);
static const Pb::Register _R_142("GridBase", "getSize", GridBase::_W_4);
static const Pb::Register _R_143("GridBase", "is3D", GridBase::_W_5);
static const Pb::Register _R_144("GridBase", "is4D", GridBase::_W_6);
static const Pb::Register _R_145("GridBase", "getSizeT", GridBase::_W_7);
static const Pb::Register _R_146("GridBase", "getStrideT", GridBase::_W_8);
static const Pb::Register _R_147("GridBase", "setName", GridBase::_W_9);
static const Pb::Register _R_120("GridBase", "GridBase", GridBase::_W_0);
static const Pb::Register _R_121("GridBase", "getSizeX", GridBase::_W_1);
static const Pb::Register _R_122("GridBase", "getSizeY", GridBase::_W_2);
static const Pb::Register _R_123("GridBase", "getSizeZ", GridBase::_W_3);
static const Pb::Register _R_124("GridBase", "getSize", GridBase::_W_4);
static const Pb::Register _R_125("GridBase", "is3D", GridBase::_W_5);
static const Pb::Register _R_126("GridBase", "is4D", GridBase::_W_6);
static const Pb::Register _R_127("GridBase", "getSizeT", GridBase::_W_7);
static const Pb::Register _R_128("GridBase", "getStrideT", GridBase::_W_8);
static const Pb::Register _R_129("GridBase", "setName", GridBase::_W_9);
#endif
#ifdef _C_MACGrid
static const Pb::Register _R_148("MACGrid", "MACGrid", "Grid<Vec3>");
static const Pb::Register _R_130("MACGrid", "MACGrid", "Grid<Vec3>");
template<> const char *Namify<MACGrid>::S = "MACGrid";
static const Pb::Register _R_149("MACGrid", "MACGrid", MACGrid::_W_44);
static const Pb::Register _R_150("MACGrid", "setBoundMAC", MACGrid::_W_45);
static const Pb::Register _R_131("MACGrid", "MACGrid", MACGrid::_W_38);
static const Pb::Register _R_132("MACGrid", "setBoundMAC", MACGrid::_W_39);
#endif
static const Pb::Register _R_7("GridType_TypeNone", 0);
static const Pb::Register _R_8("GridType_TypeReal", 1);
@@ -273,24 +255,6 @@ void PbRegister_file_7()
KEEP_UNUSED(_R_130);
KEEP_UNUSED(_R_131);
KEEP_UNUSED(_R_132);
KEEP_UNUSED(_R_133);
KEEP_UNUSED(_R_134);
KEEP_UNUSED(_R_135);
KEEP_UNUSED(_R_136);
KEEP_UNUSED(_R_137);
KEEP_UNUSED(_R_138);
KEEP_UNUSED(_R_139);
KEEP_UNUSED(_R_140);
KEEP_UNUSED(_R_141);
KEEP_UNUSED(_R_142);
KEEP_UNUSED(_R_143);
KEEP_UNUSED(_R_144);
KEEP_UNUSED(_R_145);
KEEP_UNUSED(_R_146);
KEEP_UNUSED(_R_147);
KEEP_UNUSED(_R_148);
KEEP_UNUSED(_R_149);
KEEP_UNUSED(_R_150);
}
}
} // namespace Manta

View File

@@ -179,21 +179,29 @@ struct kn4dMinReal : public KernelBase {
return val;
}
typedef Grid4d<Real> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMinReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, minVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real minVal = std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, minVal);
#pragma omp critical
{
this->minVal = min(minVal, this->minVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMinReal(kn4dMinReal &o, tbb::split)
: KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
{
}
void join(const kn4dMinReal &o)
{
minVal = min(minVal, o.minVal);
}
Grid4d<Real> &val;
Real minVal;
@@ -226,21 +234,29 @@ struct kn4dMaxReal : public KernelBase {
return val;
}
typedef Grid4d<Real> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMaxReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, maxVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real maxVal = -std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, maxVal);
#pragma omp critical
{
this->maxVal = max(maxVal, this->maxVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMaxReal(kn4dMaxReal &o, tbb::split)
: KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
{
}
void join(const kn4dMaxReal &o)
{
maxVal = max(maxVal, o.maxVal);
}
Grid4d<Real> &val;
Real maxVal;
@@ -273,21 +289,29 @@ struct kn4dMinInt : public KernelBase {
return val;
}
typedef Grid4d<int> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMinInt ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, minVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
int minVal = std::numeric_limits<int>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, minVal);
#pragma omp critical
{
this->minVal = min(minVal, this->minVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMinInt(kn4dMinInt &o, tbb::split)
: KernelBase(o), val(o.val), minVal(std::numeric_limits<int>::max())
{
}
void join(const kn4dMinInt &o)
{
minVal = min(minVal, o.minVal);
}
Grid4d<int> &val;
int minVal;
@@ -320,21 +344,29 @@ struct kn4dMaxInt : public KernelBase {
return val;
}
typedef Grid4d<int> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMaxInt ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, maxVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
int maxVal = std::numeric_limits<int>::min();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, maxVal);
#pragma omp critical
{
this->maxVal = max(maxVal, this->maxVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMaxInt(kn4dMaxInt &o, tbb::split)
: KernelBase(o), val(o.val), maxVal(std::numeric_limits<int>::min())
{
}
void join(const kn4dMaxInt &o)
{
maxVal = max(maxVal, o.maxVal);
}
Grid4d<int> &val;
int maxVal;
@@ -368,21 +400,29 @@ template<class VEC> struct kn4dMinVec : public KernelBase {
return val;
}
typedef Grid4d<VEC> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMinVec ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, minVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real minVal = std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, minVal);
#pragma omp critical
{
this->minVal = min(minVal, this->minVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMinVec(kn4dMinVec &o, tbb::split)
: KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
{
}
void join(const kn4dMinVec &o)
{
minVal = min(minVal, o.minVal);
}
Grid4d<VEC> &val;
Real minVal;
@@ -416,21 +456,29 @@ template<class VEC> struct kn4dMaxVec : public KernelBase {
return val;
}
typedef Grid4d<VEC> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMaxVec ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, maxVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real maxVal = -std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, maxVal);
#pragma omp critical
{
this->maxVal = max(maxVal, this->maxVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
kn4dMaxVec(kn4dMaxVec &o, tbb::split)
: KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
{
}
void join(const kn4dMaxVec &o)
{
maxVal = max(maxVal, o.maxVal);
}
Grid4d<VEC> &val;
Real maxVal;
@@ -459,7 +507,7 @@ template<class T> struct kn4dSetConstReal : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, T val)
inline void op(IndexInt idx, Grid4d<T> &me, T val) const
{
me[idx] = val;
}
@@ -473,17 +521,21 @@ template<class T> struct kn4dSetConstReal : public KernelBase {
return val;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dSetConstReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, val);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, val);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
T val;
@@ -494,7 +546,7 @@ template<class T> struct kn4dAddConstReal : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, T val)
inline void op(IndexInt idx, Grid4d<T> &me, T val) const
{
me[idx] += val;
}
@@ -508,17 +560,21 @@ template<class T> struct kn4dAddConstReal : public KernelBase {
return val;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dAddConstReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, val);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, val);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
T val;
@@ -529,7 +585,7 @@ template<class T> struct kn4dMultConst : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, T val)
inline void op(IndexInt idx, Grid4d<T> &me, T val) const
{
me[idx] *= val;
}
@@ -543,17 +599,21 @@ template<class T> struct kn4dMultConst : public KernelBase {
return val;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dMultConst ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, val);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, val);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
T val;
@@ -564,7 +624,7 @@ template<class T> struct kn4dClamp : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, T min, T max)
inline void op(IndexInt idx, Grid4d<T> &me, T min, T max) const
{
me[idx] = clamp(me[idx], min, max);
}
@@ -583,17 +643,21 @@ template<class T> struct kn4dClamp : public KernelBase {
return max;
}
typedef T type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel kn4dClamp ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, min, max);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, min, max);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
T min;
@@ -724,7 +788,7 @@ struct knGetComp4d : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, const Grid4d<Vec4> &src, Grid4d<Real> &dst, int c)
inline void op(IndexInt idx, const Grid4d<Vec4> &src, Grid4d<Real> &dst, int c) const
{
dst[idx] = src[idx][c];
}
@@ -743,17 +807,21 @@ struct knGetComp4d : public KernelBase {
return c;
}
typedef int type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knGetComp4d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, src, dst, c);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, src, dst, c);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const Grid4d<Vec4> &src;
Grid4d<Real> &dst;
@@ -767,7 +835,7 @@ struct knSetComp4d : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, const Grid4d<Real> &src, Grid4d<Vec4> &dst, int c)
inline void op(IndexInt idx, const Grid4d<Real> &src, Grid4d<Vec4> &dst, int c) const
{
dst[idx][c] = src[idx];
}
@@ -786,17 +854,21 @@ struct knSetComp4d : public KernelBase {
return c;
}
typedef int type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSetComp4d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, src, dst, c);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, src, dst, c);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const Grid4d<Real> &src;
Grid4d<Vec4> &dst;
@@ -887,7 +959,7 @@ template<class T> struct knSetBnd4d : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, int t, Grid4d<T> &grid, T value, int w)
inline void op(int i, int j, int k, int t, Grid4d<T> &grid, T value, int w) const
{
bool bnd = (i <= w || i >= grid.getSizeX() - 1 - w || j <= w || j >= grid.getSizeY() - 1 - w ||
k <= w || k >= grid.getSizeZ() - 1 - w || t <= w || t >= grid.getSizeT() - 1 - w);
@@ -909,47 +981,50 @@ template<class T> struct knSetBnd4d : public KernelBase {
return w;
}
typedef int type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knSetBnd4d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
<< " "
" t "
<< minT << " - " << maxT,
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxT > 1) {
const int _maxZ = maxZ;
#pragma omp parallel
{
#pragma omp for
for (int t = 0; t < maxT; t++)
for (int k = 0; k < _maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, value, w);
}
for (int t = __r.begin(); t != (int)__r.end(); t++)
for (int k = 0; k < maxZ; k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, value, w);
}
else if (maxZ > 1) {
const int t = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, value, w);
}
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, value, w);
}
else {
const int t = 0;
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, value, w);
}
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, value, w);
}
}
void run()
{
if (maxT > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
}
else if (maxZ > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
}
else {
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
}
Grid4d<T> &grid;
@@ -968,7 +1043,7 @@ template<class T> struct knSetBnd4dNeumann : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, int t, Grid4d<T> &grid, int w)
inline void op(int i, int j, int k, int t, Grid4d<T> &grid, int w) const
{
bool set = false;
int si = i, sj = j, sk = k, st = t;
@@ -1017,47 +1092,50 @@ template<class T> struct knSetBnd4dNeumann : public KernelBase {
return w;
}
typedef int type1;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knSetBnd4dNeumann ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
<< " "
" t "
<< minT << " - " << maxT,
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxT > 1) {
const int _maxZ = maxZ;
#pragma omp parallel
{
#pragma omp for
for (int t = 0; t < maxT; t++)
for (int k = 0; k < _maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, w);
}
for (int t = __r.begin(); t != (int)__r.end(); t++)
for (int k = 0; k < maxZ; k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, w);
}
else if (maxZ > 1) {
const int t = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, w);
}
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, w);
}
else {
const int t = 0;
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, grid, w);
}
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, grid, w);
}
}
void run()
{
if (maxT > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
}
else if (maxZ > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
}
else {
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
}
Grid4d<T> &grid;
@@ -1251,7 +1329,7 @@ template<class S> struct knSetRegion4d : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, int t, Grid4d<S> &dst, Vec4 start, Vec4 end, S value)
inline void op(int i, int j, int k, int t, Grid4d<S> &dst, Vec4 start, Vec4 end, S value) const
{
Vec4 p(i, j, k, t);
for (int c = 0; c < 4; ++c)
@@ -1279,47 +1357,50 @@ template<class S> struct knSetRegion4d : public KernelBase {
return value;
}
typedef S type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knSetRegion4d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
<< " "
" t "
<< minT << " - " << maxT,
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxT > 1) {
const int _maxZ = maxZ;
#pragma omp parallel
{
#pragma omp for
for (int t = 0; t < maxT; t++)
for (int k = 0; k < _maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
for (int t = __r.begin(); t != (int)__r.end(); t++)
for (int k = 0; k < maxZ; k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
else if (maxZ > 1) {
const int t = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
else {
const int t = 0;
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, dst, start, end, value);
}
}
void run()
{
if (maxT > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
}
else if (maxZ > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
}
else {
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
}
Grid4d<S> &dst;
@@ -1538,7 +1619,7 @@ template<class S> struct knInterpol4d : public KernelBase {
Grid4d<S> &target,
Grid4d<S> &source,
const Vec4 &srcFac,
const Vec4 &offset)
const Vec4 &offset) const
{
Vec4 pos = Vec4(i, j, k, t) * srcFac + offset;
target(i, j, k, t) = source.getInterpolated(pos);
@@ -1563,47 +1644,50 @@ template<class S> struct knInterpol4d : public KernelBase {
return offset;
}
typedef Vec4 type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knInterpol4d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
<< " "
" t "
<< minT << " - " << maxT,
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxT > 1) {
const int _maxZ = maxZ;
#pragma omp parallel
{
#pragma omp for
for (int t = 0; t < maxT; t++)
for (int k = 0; k < _maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
for (int t = __r.begin(); t != (int)__r.end(); t++)
for (int k = 0; k < maxZ; k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
else if (maxZ > 1) {
const int t = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
else {
const int t = 0;
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, srcFac, offset);
}
}
void run()
{
if (maxT > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
}
else if (maxZ > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
}
else {
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
}
Grid4d<S> &target;

View File

@@ -326,6 +326,7 @@ class Grid4dBase : public PbClass {
// precomputed Z,T shift: to ensure 2D compatibility, always use this instead of sx*sy !
IndexInt mStrideZ;
IndexInt mStrideT;
public:
PbArgs _args;
}
@@ -950,6 +951,7 @@ template<class T> class Grid4d : public Grid4dBase {
protected:
T *mData;
public:
PbArgs _args;
}
@@ -1025,7 +1027,7 @@ template<class T, class S> struct Grid4dAdd : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
{
me[idx] += other[idx];
}
@@ -1039,17 +1041,21 @@ template<class T, class S> struct Grid4dAdd : public KernelBase {
return other;
}
typedef Grid4d<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dAdd ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<S> &other;
@@ -1060,7 +1066,7 @@ template<class T, class S> struct Grid4dSub : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
{
me[idx] -= other[idx];
}
@@ -1074,17 +1080,21 @@ template<class T, class S> struct Grid4dSub : public KernelBase {
return other;
}
typedef Grid4d<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dSub ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<S> &other;
@@ -1095,7 +1105,7 @@ template<class T, class S> struct Grid4dMult : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
{
me[idx] *= other[idx];
}
@@ -1109,17 +1119,21 @@ template<class T, class S> struct Grid4dMult : public KernelBase {
return other;
}
typedef Grid4d<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dMult ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<S> &other;
@@ -1130,7 +1144,7 @@ template<class T, class S> struct Grid4dDiv : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<S> &other) const
{
me[idx] /= other[idx];
}
@@ -1144,17 +1158,21 @@ template<class T, class S> struct Grid4dDiv : public KernelBase {
return other;
}
typedef Grid4d<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dDiv ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<S> &other;
@@ -1165,7 +1183,7 @@ template<class T, class S> struct Grid4dAddScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const S &other)
inline void op(IndexInt idx, Grid4d<T> &me, const S &other) const
{
me[idx] += other;
}
@@ -1179,17 +1197,21 @@ template<class T, class S> struct Grid4dAddScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dAddScalar ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const S &other;
@@ -1200,7 +1222,7 @@ template<class T, class S> struct Grid4dMultScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const S &other)
inline void op(IndexInt idx, Grid4d<T> &me, const S &other) const
{
me[idx] *= other;
}
@@ -1214,17 +1236,21 @@ template<class T, class S> struct Grid4dMultScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dMultScalar ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const S &other;
@@ -1236,7 +1262,7 @@ template<class T, class S> struct Grid4dScaledAdd : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other, const S &factor)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other, const S &factor) const
{
me[idx] += factor * other[idx];
}
@@ -1255,17 +1281,21 @@ template<class T, class S> struct Grid4dScaledAdd : public KernelBase {
return factor;
}
typedef S type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dScaledAdd ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other, factor);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other, factor);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<T> &other;
@@ -1278,7 +1308,7 @@ template<class T> struct Grid4dSafeDiv : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other)
inline void op(IndexInt idx, Grid4d<T> &me, const Grid4d<T> &other) const
{
me[idx] = safeDivide(me[idx], other[idx]);
}
@@ -1292,17 +1322,21 @@ template<class T> struct Grid4dSafeDiv : public KernelBase {
return other;
}
typedef Grid4d<T> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dSafeDiv ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
const Grid4d<T> &other;
@@ -1313,7 +1347,7 @@ template<class T> struct Grid4dSetConst : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid4d<T> &me, T value)
inline void op(IndexInt idx, Grid4d<T> &me, T value) const
{
me[idx] = value;
}
@@ -1327,17 +1361,21 @@ template<class T> struct Grid4dSetConst : public KernelBase {
return value;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel Grid4dSetConst ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, value);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, value);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid4d<T> &me;
T value;
@@ -1435,7 +1473,7 @@ template<class S> struct KnInterpolateGrid4dTempl : public KernelBase {
Grid4d<S> &target,
Grid4d<S> &source,
const Vec4 &sourceFactor,
Vec4 offset)
Vec4 offset) const
{
Vec4 pos = Vec4(i, j, k, t) * sourceFactor + offset;
if (!source.is3D())
@@ -1464,47 +1502,50 @@ template<class S> struct KnInterpolateGrid4dTempl : public KernelBase {
return offset;
}
typedef Vec4 type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnInterpolateGrid4dTempl ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ
<< " "
" t "
<< minT << " - " << maxT,
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxT > 1) {
const int _maxZ = maxZ;
#pragma omp parallel
{
#pragma omp for
for (int t = 0; t < maxT; t++)
for (int k = 0; k < _maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
for (int t = __r.begin(); t != (int)__r.end(); t++)
for (int k = 0; k < maxZ; k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
else if (maxZ > 1) {
const int t = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < maxY; j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
else {
const int t = 0;
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < maxX; i++)
op(i, j, k, t, target, source, sourceFactor, offset);
}
}
void run()
{
if (maxT > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minT, maxT), *this);
}
else if (maxZ > 1) {
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
}
else {
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
}
Grid4d<S> &target;

View File

@@ -71,6 +71,19 @@ class ParticleBase;
for (int j = bnd; j < (grid).getSizeY() - bnd; ++j) \
for (int i = bnd; i < (grid).getSizeX() - bnd; ++i)
#define FOR_NEIGHBORS_BND(grid, radius, bnd) \
for (int zj = ((grid).is3D() ? std::max(bnd, k - radius) : 0); \
zj <= ((grid).is3D() ? std::min(k + radius, (grid).getSizeZ() - 1 - bnd) : 0); \
zj++) \
for (int yj = std::max(bnd, j - radius); \
yj <= std::min(j + radius, (grid).getSizeY() - 1 - bnd); \
yj++) \
for (int xj = std::max(bnd, i - radius); \
xj <= std::min(i + radius, (grid).getSizeX() - 1 - bnd); \
xj++)
#define FOR_NEIGHBORS(grid, radius) FOR_NEIGHBORS_BND(grid, radius, 0)
//! Basic data structure for kernel data, initialized based on kernel type (e.g. single, idx, etc).
struct KernelBase {
int maxX, maxY, maxZ, minZ, maxT, minT;

View File

@@ -62,7 +62,7 @@ struct InitFmIn : public KernelBase {
Grid<int> &fmFlags,
Grid<Real> &phi,
bool ignoreWalls,
int obstacleType)
int obstacleType) const
{
const IndexInt idx = flags.index(i, j, k);
const Real v = phi[idx];
@@ -104,34 +104,36 @@ struct InitFmIn : public KernelBase {
return obstacleType;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel InitFmIn ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
Grid<int> &fmFlags;
@@ -163,7 +165,7 @@ struct InitFmOut : public KernelBase {
Grid<int> &fmFlags,
Grid<Real> &phi,
bool ignoreWalls,
int obstacleType)
int obstacleType) const
{
const IndexInt idx = flags.index(i, j, k);
const Real v = phi[idx];
@@ -203,34 +205,36 @@ struct InitFmOut : public KernelBase {
return obstacleType;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel InitFmOut ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, ignoreWalls, obstacleType);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
Grid<int> &fmFlags;
@@ -265,7 +269,7 @@ struct SetUninitialized : public KernelBase {
Grid<Real> &phi,
const Real val,
int ignoreWalls,
int obstacleType)
int obstacleType) const
{
if (ignoreWalls) {
if ((fmFlags(i, j, k) != FlagInited) && ((flags(i, j, k) & obstacleType) == 0)) {
@@ -307,34 +311,36 @@ struct SetUninitialized : public KernelBase {
return obstacleType;
}
typedef int type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel SetUninitialized ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, fmFlags, phi, val, ignoreWalls, obstacleType);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const Grid<int> &flags;
Grid<int> &fmFlags;
@@ -365,8 +371,7 @@ inline bool isAtInterface(const Grid<int> &fmFlags, Grid<Real> &phi, const Vec3i
//************************************************************************
// Levelset class def
LevelsetGrid::LevelsetGrid(FluidSolver *parent, bool show, bool offload)
: Grid<Real>(parent, show, false /*sparse*/, offload)
LevelsetGrid::LevelsetGrid(FluidSolver *parent, bool show) : Grid<Real>(parent, show)
{
mType = (GridType)(TypeLevelset | TypeReal);
}
@@ -389,7 +394,7 @@ struct KnJoin : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b)
inline void op(IndexInt idx, Grid<Real> &a, const Grid<Real> &b) const
{
a[idx] = min(a[idx], b[idx]);
}
@@ -403,17 +408,21 @@ struct KnJoin : public KernelBase {
return b;
}
typedef Grid<Real> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnJoin ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, a, b);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, a, b);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &a;
const Grid<Real> &b;
@@ -431,8 +440,11 @@ struct KnSubtract : public KernelBase {
runMessage();
run();
}
inline void op(
IndexInt idx, Grid<Real> &a, const Grid<Real> &b, const FlagGrid *flags, int subtractType)
inline void op(IndexInt idx,
Grid<Real> &a,
const Grid<Real> &b,
const FlagGrid *flags,
int subtractType) const
{
if (flags && ((*flags)(idx)&subtractType) == 0)
return;
@@ -459,17 +471,21 @@ struct KnSubtract : public KernelBase {
return subtractType;
}
typedef int type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnSubtract ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, a, b, flags, subtractType);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, a, b, flags, subtractType);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &a;
const Grid<Real> &b;
@@ -652,7 +668,7 @@ struct KnFillApply : public KernelBase {
Grid<int> &visited,
const Real value,
const int boundaryWidth,
const bool outside)
const bool outside) const
{
if (visited(i, j, k) == ID_VISITED)
@@ -690,34 +706,36 @@ struct KnFillApply : public KernelBase {
return outside;
}
typedef bool type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnFillApply ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = boundaryWidth; j < _maxY; j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, target, visited, value, boundaryWidth, outside);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = boundaryWidth; j < _maxY; j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, target, visited, value, boundaryWidth, outside);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, target, visited, value, boundaryWidth, outside);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(boundaryWidth, maxY), *this);
}
Grid<Real> &target;
Grid<int> &visited;

View File

@@ -27,7 +27,7 @@ class Mesh;
//! Special function for levelsets
class LevelsetGrid : public Grid<Real> {
public:
LevelsetGrid(FluidSolver *parent, bool show = true, bool offload = false);
LevelsetGrid(FluidSolver *parent, bool show = true);
static int _W_0(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
PbClass *obj = Pb::objFromPy(_self);
@@ -41,8 +41,7 @@ class LevelsetGrid : public Grid<Real> {
ArgLocker _lock;
FluidSolver *parent = _args.getPtr<FluidSolver>("parent", 0, &_lock);
bool show = _args.getOpt<bool>("show", 1, true, &_lock);
bool offload = _args.getOpt<bool>("offload", 2, false, &_lock);
obj = new LevelsetGrid(parent, show, offload);
obj = new LevelsetGrid(parent, show);
obj->registerObject(_self, &_args);
_args.check();
}
@@ -267,6 +266,7 @@ class LevelsetGrid : public Grid<Real> {
}
static Real invalidTimeValue();
public:
PbArgs _args;
}

File diff suppressed because it is too large Load Diff

View File

@@ -796,6 +796,7 @@ class Mesh : public PbClass {
std::vector<MeshDataImpl<int> *>
mMdataInt; //! indicate that mdata of this mesh is copied, and needs to be freed
bool mFreeMdata;
public:
PbArgs _args;
}
@@ -881,6 +882,7 @@ class MeshDataBase : public PbClass {
protected:
Mesh *mMesh;
public:
PbArgs _args;
}
@@ -1645,6 +1647,7 @@ template<class T> class MeshDataImpl : public MeshDataBase {
//! optionally , we might have an associated grid from which to grab new data
Grid<T> *mpGridSource; //! unfortunately , we need to distinguish mac vs regular vec3
bool mGridSourceMAC;
public:
PbArgs _args;
}

View File

@@ -154,6 +154,7 @@ class MovingObstacle : public PbClass {
int mEmptyType;
int mID;
static int sIDcnt;
public:
PbArgs _args;
}

View File

@@ -428,7 +428,7 @@ struct knCopyA : public KernelBase {
const Grid<Real> *pA0,
const Grid<Real> *pAi,
const Grid<Real> *pAj,
const Grid<Real> *pAk)
const Grid<Real> *pAk) const
{
A0[idx * stencilSize0 + 0] = (*pA0)[idx];
A0[idx * stencilSize0 + 1] = (*pAi)[idx];
@@ -476,17 +476,21 @@ struct knCopyA : public KernelBase {
return pAk;
}
typedef Grid<Real> type7;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knCopyA ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, sizeRef, A0, stencilSize0, is3D, pA0, pAi, pAj, pAk);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, sizeRef, A0, stencilSize0, is3D, pA0, pAi, pAj, pAk);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &sizeRef;
std::vector<Real> &A0;
@@ -519,7 +523,7 @@ struct knActivateVertices : public KernelBase {
std::vector<Real> &A0,
bool &nonZeroStencilSumFound,
bool &trivialEquationsFound,
const GridMg &mg)
const GridMg &mg) const
{
// active vertices on level 0 are vertices with non-zero diagonal entry in A
type_0[idx] = GridMg::vtInactive;
@@ -568,17 +572,21 @@ struct knActivateVertices : public KernelBase {
return mg;
}
typedef GridMg type4;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knActivateVertices ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, type_0, A0, nonZeroStencilSumFound, trivialEquationsFound, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, type_0, A0, nonZeroStencilSumFound, trivialEquationsFound, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<GridMg::VertexType> &type_0;
std::vector<Real> &A0;
@@ -634,7 +642,7 @@ struct knSetRhs : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg)
inline void op(IndexInt idx, std::vector<Real> &b, const Grid<Real> &rhs, const GridMg &mg) const
{
b[idx] = rhs[idx];
@@ -658,17 +666,21 @@ struct knSetRhs : public KernelBase {
return mg;
}
typedef GridMg type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSetRhs ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, b, rhs, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, b, rhs, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &b;
const Grid<Real> &rhs;
@@ -690,7 +702,7 @@ template<class T> struct knSet : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<T> &data, T value)
inline void op(IndexInt idx, std::vector<T> &data, T value) const
{
data[idx] = value;
}
@@ -704,17 +716,21 @@ template<class T> struct knSet : public KernelBase {
return value;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSet ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, data, value);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, data, value);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<T> &data;
T value;
@@ -727,7 +743,7 @@ template<class T> struct knCopyToVector : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<T> &dst, const Grid<T> &src)
inline void op(IndexInt idx, std::vector<T> &dst, const Grid<T> &src) const
{
dst[idx] = src[idx];
}
@@ -741,17 +757,21 @@ template<class T> struct knCopyToVector : public KernelBase {
return src;
}
typedef Grid<T> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knCopyToVector ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, dst, src);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, dst, src);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<T> &dst;
const Grid<T> &src;
@@ -764,7 +784,7 @@ template<class T> struct knCopyToGrid : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, const std::vector<T> &src, Grid<T> &dst)
inline void op(IndexInt idx, const std::vector<T> &src, Grid<T> &dst) const
{
dst[idx] = src[idx];
}
@@ -778,17 +798,21 @@ template<class T> struct knCopyToGrid : public KernelBase {
return dst;
}
typedef Grid<T> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knCopyToGrid ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, src, dst);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, src, dst);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const std::vector<T> &src;
Grid<T> &dst;
@@ -801,7 +825,7 @@ template<class T> struct knAddAssign : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<T> &dst, const std::vector<T> &src)
inline void op(IndexInt idx, std::vector<T> &dst, const std::vector<T> &src) const
{
dst[idx] += src[idx];
}
@@ -815,17 +839,21 @@ template<class T> struct knAddAssign : public KernelBase {
return src;
}
typedef std::vector<T> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knAddAssign ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, dst, src);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, dst, src);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<T> &dst;
const std::vector<T> &src;
@@ -902,7 +930,7 @@ struct knActivateCoarseVertices : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<GridMg::VertexType> &type, int unused)
inline void op(IndexInt idx, std::vector<GridMg::VertexType> &type, int unused) const
{
// set all remaining 'free' vertices to 'removed',
if (type[idx] == GridMg::vtFree)
@@ -924,17 +952,21 @@ struct knActivateCoarseVertices : public KernelBase {
return unused;
}
typedef int type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knActivateCoarseVertices ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, type, unused);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, type, unused);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<GridMg::VertexType> &type;
int unused;
@@ -1020,8 +1052,11 @@ struct knGenCoarseGridOperator : public KernelBase {
runMessage();
run();
}
inline void op(
IndexInt idx, std::vector<Real> &sizeRef, std::vector<Real> &A, int l, const GridMg &mg)
inline void op(IndexInt idx,
std::vector<Real> &sizeRef,
std::vector<Real> &A,
int l,
const GridMg &mg) const
{
if (mg.mType[l][idx] == GridMg::vtInactive)
return;
@@ -1143,17 +1178,21 @@ struct knGenCoarseGridOperator : public KernelBase {
return mg;
}
typedef GridMg type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knGenCoarseGridOperator ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, sizeRef, A, l, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for schedule(static, 1)
for (IndexInt i = 0; i < _sz; i++)
op(i, sizeRef, A, l, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &sizeRef;
std::vector<Real> &A;
@@ -1193,7 +1232,7 @@ struct knSmoothColor : public KernelBase {
const Vec3i &blockSize,
const std::vector<Vec3i> &colorOffs,
int l,
const GridMg &mg)
const GridMg &mg) const
{
Vec3i blockOff(int(idx) % blockSize.x,
(int(idx) % (blockSize.x * blockSize.y)) / blockSize.x,
@@ -1279,17 +1318,21 @@ struct knSmoothColor : public KernelBase {
return mg;
}
typedef GridMg type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSmoothColor ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, numBlocks, x, blockSize, colorOffs, l, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for schedule(static, 1)
for (IndexInt i = 0; i < _sz; i++)
op(i, numBlocks, x, blockSize, colorOffs, l, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ThreadSize &numBlocks;
std::vector<Real> &x;
@@ -1343,7 +1386,7 @@ struct knCalcResidual : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<Real> &r, int l, const GridMg &mg)
inline void op(IndexInt idx, std::vector<Real> &r, int l, const GridMg &mg) const
{
if (mg.mType[l][idx] == GridMg::vtInactive)
return;
@@ -1400,17 +1443,21 @@ struct knCalcResidual : public KernelBase {
return mg;
}
typedef GridMg type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knCalcResidual ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, r, l, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for schedule(static, 1)
for (IndexInt i = 0; i < _sz; i++)
op(i, r, l, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &r;
int l;
@@ -1459,21 +1506,29 @@ struct knResidualNormSumSqr : public KernelBase {
return mg;
}
typedef GridMg type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knResidualNormSumSqr ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, r, l, mg, result);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real result = Real(0);
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, r, l, mg, result);
#pragma omp critical
{
this->result += result;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
knResidualNormSumSqr(knResidualNormSumSqr &o, tbb::split)
: KernelBase(o), r(o.r), l(o.l), mg(o.mg), result(Real(0))
{
}
void join(const knResidualNormSumSqr &o)
{
result += o.result;
}
const vector<Real> &r;
int l;
@@ -1645,7 +1700,7 @@ struct knRestrict : public KernelBase {
std::vector<Real> &dst,
const std::vector<Real> &src,
int l_dst,
const GridMg &mg)
const GridMg &mg) const
{
if (mg.mType[l_dst][idx] == GridMg::vtInactive)
return;
@@ -1691,17 +1746,21 @@ struct knRestrict : public KernelBase {
return mg;
}
typedef GridMg type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knRestrict ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, dst, src, l_dst, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for schedule(static, 1)
for (IndexInt i = 0; i < _sz; i++)
op(i, dst, src, l_dst, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &dst;
const std::vector<Real> &src;
@@ -1725,7 +1784,7 @@ struct knInterpolate : public KernelBase {
std::vector<Real> &dst,
const std::vector<Real> &src,
int l_dst,
const GridMg &mg)
const GridMg &mg) const
{
if (mg.mType[l_dst][idx] == GridMg::vtInactive)
return;
@@ -1768,17 +1827,21 @@ struct knInterpolate : public KernelBase {
return mg;
}
typedef GridMg type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knInterpolate ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, dst, src, l_dst, mg);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for schedule(static, 1)
for (IndexInt i = 0; i < _sz; i++)
op(i, dst, src, l_dst, mg);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<Real> &dst;
const std::vector<Real> &src;

View File

@@ -236,6 +236,7 @@ class WaveletNoiseField : public PbClass {
static int randomSeed;
// global reference count for noise tile
static std::atomic<int> mNoiseReferenceCount;
public:
PbArgs _args;
}

View File

@@ -455,7 +455,7 @@ template<class T, class S> struct knPdataAdd : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
{
me[idx] += other[idx];
}
@@ -469,17 +469,21 @@ template<class T, class S> struct knPdataAdd : public KernelBase {
return other;
}
typedef ParticleDataImpl<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataAdd ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<S> &other;
@@ -491,7 +495,7 @@ template<class T, class S> struct knPdataSub : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
{
me[idx] -= other[idx];
}
@@ -505,17 +509,21 @@ template<class T, class S> struct knPdataSub : public KernelBase {
return other;
}
typedef ParticleDataImpl<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataSub ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<S> &other;
@@ -527,7 +535,7 @@ template<class T, class S> struct knPdataMult : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
{
me[idx] *= other[idx];
}
@@ -541,17 +549,21 @@ template<class T, class S> struct knPdataMult : public KernelBase {
return other;
}
typedef ParticleDataImpl<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataMult ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<S> &other;
@@ -563,7 +575,7 @@ template<class T, class S> struct knPdataDiv : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<S> &other) const
{
me[idx] /= other[idx];
}
@@ -577,17 +589,21 @@ template<class T, class S> struct knPdataDiv : public KernelBase {
return other;
}
typedef ParticleDataImpl<S> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataDiv ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<S> &other;
@@ -599,7 +615,7 @@ template<class T> struct knPdataSafeDiv : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<T> &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const ParticleDataImpl<T> &other) const
{
me[idx] = safeDivide(me[idx], other[idx]);
}
@@ -613,17 +629,21 @@ template<class T> struct knPdataSafeDiv : public KernelBase {
return other;
}
typedef ParticleDataImpl<T> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataSafeDiv ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<T> &other;
@@ -636,7 +656,7 @@ template<class T, class S> struct knPdataSetScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
{
me[idx] = other;
}
@@ -650,17 +670,21 @@ template<class T, class S> struct knPdataSetScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataSetScalar ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const S &other;
@@ -672,7 +696,7 @@ template<class T, class S> struct knPdataAddScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
{
me[idx] += other;
}
@@ -686,17 +710,21 @@ template<class T, class S> struct knPdataAddScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataAddScalar ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const S &other;
@@ -708,7 +736,7 @@ template<class T, class S> struct knPdataMultScalar : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const S &other) const
{
me[idx] *= other;
}
@@ -722,17 +750,21 @@ template<class T, class S> struct knPdataMultScalar : public KernelBase {
return other;
}
typedef S type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataMultScalar ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const S &other;
@@ -747,7 +779,7 @@ template<class T, class S> struct knPdataScaledAdd : public KernelBase {
inline void op(IndexInt idx,
ParticleDataImpl<T> &me,
const ParticleDataImpl<T> &other,
const S &factor)
const S &factor) const
{
me[idx] += factor * other[idx];
}
@@ -766,17 +798,21 @@ template<class T, class S> struct knPdataScaledAdd : public KernelBase {
return factor;
}
typedef S type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataScaledAdd ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other, factor);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other, factor);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const ParticleDataImpl<T> &other;
@@ -790,7 +826,7 @@ template<class T> struct knPdataClamp : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin, const T vmax)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin, const T vmax) const
{
me[idx] = clamp(me[idx], vmin, vmax);
}
@@ -809,17 +845,21 @@ template<class T> struct knPdataClamp : public KernelBase {
return vmax;
}
typedef T type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataClamp ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, vmin, vmax);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, vmin, vmax);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const T vmin;
@@ -832,7 +872,7 @@ template<class T> struct knPdataClampMin : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmin) const
{
me[idx] = std::max(vmin, me[idx]);
}
@@ -846,17 +886,21 @@ template<class T> struct knPdataClampMin : public KernelBase {
return vmin;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataClampMin ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, vmin);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, vmin);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const T vmin;
@@ -868,7 +912,7 @@ template<class T> struct knPdataClampMax : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmax)
inline void op(IndexInt idx, ParticleDataImpl<T> &me, const T vmax) const
{
me[idx] = std::min(vmax, me[idx]);
}
@@ -882,17 +926,21 @@ template<class T> struct knPdataClampMax : public KernelBase {
return vmax;
}
typedef T type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataClampMax ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, vmax);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, vmax);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const T vmax;
@@ -905,7 +953,7 @@ struct knPdataClampMinVec3 : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmin)
inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmin) const
{
me[idx].x = std::max(vmin, me[idx].x);
me[idx].y = std::max(vmin, me[idx].y);
@@ -921,17 +969,21 @@ struct knPdataClampMinVec3 : public KernelBase {
return vmin;
}
typedef Real type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataClampMinVec3 ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, vmin);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, vmin);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<Vec3> &me;
const Real vmin;
@@ -944,7 +996,7 @@ struct knPdataClampMaxVec3 : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmax)
inline void op(IndexInt idx, ParticleDataImpl<Vec3> &me, const Real vmax) const
{
me[idx].x = std::min(vmax, me[idx].x);
me[idx].y = std::min(vmax, me[idx].y);
@@ -960,17 +1012,21 @@ struct knPdataClampMaxVec3 : public KernelBase {
return vmax;
}
typedef Real type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataClampMaxVec3 ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, vmax);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, vmax);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<Vec3> &me;
const Real vmax;
@@ -1015,7 +1071,7 @@ template<class T, class S> struct knPdataSetScalarIntFlag : public KernelBase {
ParticleDataImpl<T> &me,
const S &other,
const ParticleDataImpl<int> &t,
const int itype)
const int itype) const
{
if (t[idx] & itype)
me[idx] = other;
@@ -1040,17 +1096,21 @@ template<class T, class S> struct knPdataSetScalarIntFlag : public KernelBase {
return itype;
}
typedef int type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knPdataSetScalarIntFlag ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other, t, itype);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other, t, itype);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<T> &me;
const S &other;
@@ -1163,21 +1223,29 @@ template<typename T> struct KnPtsSum : public KernelBase {
return itype;
}
typedef int type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnPtsSum ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, t, itype, result);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
T result = T(0.);
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, t, itype, result);
#pragma omp critical
{
this->result += result;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
KnPtsSum(KnPtsSum &o, tbb::split)
: KernelBase(o), val(o.val), t(o.t), itype(o.itype), result(T(0.))
{
}
void join(const KnPtsSum &o)
{
result += o.result;
}
const ParticleDataImpl<T> &val;
const ParticleDataImpl<int> *t;
@@ -1207,21 +1275,28 @@ template<typename T> struct KnPtsSumSquare : public KernelBase {
return val;
}
typedef ParticleDataImpl<T> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnPtsSumSquare ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, result);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real result = 0.;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, result);
#pragma omp critical
{
this->result += result;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
KnPtsSumSquare(KnPtsSumSquare &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
{
}
void join(const KnPtsSumSquare &o)
{
result += o.result;
}
const ParticleDataImpl<T> &val;
Real result;
@@ -1249,21 +1324,28 @@ template<typename T> struct KnPtsSumMagnitude : public KernelBase {
return val;
}
typedef ParticleDataImpl<T> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnPtsSumMagnitude ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, result);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real result = 0.;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, result);
#pragma omp critical
{
this->result += result;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
KnPtsSumMagnitude(KnPtsSumMagnitude &o, tbb::split) : KernelBase(o), val(o.val), result(0.)
{
}
void join(const KnPtsSumMagnitude &o)
{
result += o.result;
}
const ParticleDataImpl<T> &val;
Real result;
@@ -1310,21 +1392,29 @@ struct CompPdata_Min : public KernelBase {
return val;
}
typedef ParticleDataImpl<T> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel CompPdata_Min ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, minVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real minVal = std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, minVal);
#pragma omp critical
{
this->minVal = min(minVal, this->minVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
CompPdata_Min(CompPdata_Min &o, tbb::split)
: KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
{
}
void join(const CompPdata_Min &o)
{
minVal = min(minVal, o.minVal);
}
const ParticleDataImpl<T> &val;
Real minVal;
@@ -1357,21 +1447,29 @@ struct CompPdata_Max : public KernelBase {
return val;
}
typedef ParticleDataImpl<T> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel CompPdata_Max ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, maxVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real maxVal = -std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, maxVal);
#pragma omp critical
{
this->maxVal = max(maxVal, this->maxVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
CompPdata_Max(CompPdata_Max &o, tbb::split)
: KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
{
}
void join(const CompPdata_Max &o)
{
maxVal = max(maxVal, o.maxVal);
}
const ParticleDataImpl<T> &val;
Real maxVal;
@@ -1447,21 +1545,29 @@ struct CompPdata_MinVec3 : public KernelBase {
return val;
}
typedef ParticleDataImpl<Vec3> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel CompPdata_MinVec3 ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, minVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real minVal = std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, minVal);
#pragma omp critical
{
this->minVal = min(minVal, this->minVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
CompPdata_MinVec3(CompPdata_MinVec3 &o, tbb::split)
: KernelBase(o), val(o.val), minVal(std::numeric_limits<Real>::max())
{
}
void join(const CompPdata_MinVec3 &o)
{
minVal = min(minVal, o.minVal);
}
const ParticleDataImpl<Vec3> &val;
Real minVal;
@@ -1493,21 +1599,29 @@ struct CompPdata_MaxVec3 : public KernelBase {
return val;
}
typedef ParticleDataImpl<Vec3> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel CompPdata_MaxVec3 ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, val, maxVal);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
Real maxVal = -std::numeric_limits<Real>::max();
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, val, maxVal);
#pragma omp critical
{
this->maxVal = max(maxVal, this->maxVal);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
CompPdata_MaxVec3(CompPdata_MaxVec3 &o, tbb::split)
: KernelBase(o), val(o.val), maxVal(-std::numeric_limits<Real>::max())
{
}
void join(const CompPdata_MaxVec3 &o)
{
maxVal = max(maxVal, o.maxVal);
}
const ParticleDataImpl<Vec3> &val;
Real maxVal;

View File

@@ -205,6 +205,7 @@ class ParticleBase : public PbClass {
//! custom seed for particle systems, used by plugins
int mSeed; //! fix global random seed storage, used mainly by functions in this class
static int globalSeed;
public:
PbArgs _args;
}
@@ -628,6 +629,7 @@ template<class S> class ParticleSystem : public ParticleBase {
std::vector<S> mData;
//! reduce storage , called by doCompress
virtual void compress();
public:
PbArgs _args;
}
@@ -918,6 +920,7 @@ class ParticleIndexSystem : public ParticleSystem<ParticleIndexData> {
return -1;
}
};
public:
PbArgs _args;
}
@@ -982,6 +985,7 @@ template<class DATA, class CON> class ConnectedParticleSystem : public ParticleS
protected:
std::vector<CON> mSegments;
virtual void compress();
public:
PbArgs _args;
}
@@ -1071,6 +1075,7 @@ class ParticleDataBase : public PbClass {
protected:
ParticleBase *mpParticleSys;
public:
PbArgs _args;
}
@@ -1843,6 +1848,7 @@ template<class T> class ParticleDataImpl : public ParticleDataBase {
//! optionally , we might have an associated grid from which to grab new data
Grid<T> *mpGridSource; //! unfortunately , we need to distinguish mac vs regular vec3
bool mGridSourceMAC;
public:
PbArgs _args;
}
@@ -1906,17 +1912,19 @@ template<class S> void ParticleSystem<S>::transformPositions(Vec3i dimOld, Vec3i
// check for deletion/invalid position, otherwise return velocity
template<class S> struct GridAdvectKernel : public KernelBase {
GridAdvectKernel(std::vector<S> &p,
const MACGrid &vel,
const FlagGrid &flags,
const Real dt,
const bool deleteInObstacle,
const bool stopInObstacle,
const bool skipNew,
const ParticleDataImpl<int> *ptype,
const int exclude)
: KernelBase(p.size()),
template<class S> struct _GridAdvectKernel : public KernelBase {
_GridAdvectKernel(const KernelBase &base,
std::vector<S> &p,
const MACGrid &vel,
const FlagGrid &flags,
const Real dt,
const bool deleteInObstacle,
const bool stopInObstacle,
const bool skipNew,
const ParticleDataImpl<int> *ptype,
const int exclude,
std::vector<Vec3> &u)
: KernelBase(base),
p(p),
vel(vel),
flags(flags),
@@ -1926,10 +1934,8 @@ template<class S> struct GridAdvectKernel : public KernelBase {
skipNew(skipNew),
ptype(ptype),
exclude(exclude),
u((size))
u(u)
{
runMessage();
run();
}
inline void op(IndexInt idx,
std::vector<S> &p,
@@ -1941,7 +1947,7 @@ template<class S> struct GridAdvectKernel : public KernelBase {
const bool skipNew,
const ParticleDataImpl<int> *ptype,
const int exclude,
std::vector<Vec3> &u)
std::vector<Vec3> &u) const
{
if ((p[idx].flag & ParticleBase::PDELETE) || (ptype && ((*ptype)[idx] & exclude)) ||
(skipNew && (p[idx].flag & ParticleBase::PNEW))) {
@@ -1962,6 +1968,66 @@ template<class S> struct GridAdvectKernel : public KernelBase {
}
u[idx] = vel.getInterpolated(p[idx].pos) * dt;
}
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, vel, flags, dt, deleteInObstacle, stopInObstacle, skipNew, ptype, exclude, u);
}
void run()
{
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<S> &p;
const MACGrid &vel;
const FlagGrid &flags;
const Real dt;
const bool deleteInObstacle;
const bool stopInObstacle;
const bool skipNew;
const ParticleDataImpl<int> *ptype;
const int exclude;
std::vector<Vec3> &u;
};
template<class S> struct GridAdvectKernel : public KernelBase {
GridAdvectKernel(std::vector<S> &p,
const MACGrid &vel,
const FlagGrid &flags,
const Real dt,
const bool deleteInObstacle,
const bool stopInObstacle,
const bool skipNew,
const ParticleDataImpl<int> *ptype,
const int exclude)
: KernelBase(p.size()),
_inner(KernelBase(p.size()),
p,
vel,
flags,
dt,
deleteInObstacle,
stopInObstacle,
skipNew,
ptype,
exclude,
u),
p(p),
vel(vel),
flags(flags),
dt(dt),
deleteInObstacle(deleteInObstacle),
stopInObstacle(stopInObstacle),
skipNew(skipNew),
ptype(ptype),
exclude(exclude),
u((size))
{
runMessage();
run();
}
void run()
{
_inner.run();
}
inline operator std::vector<Vec3>()
{
return u;
@@ -2015,18 +2081,14 @@ template<class S> struct GridAdvectKernel : public KernelBase {
return exclude;
}
typedef int type8;
void runMessage(){};
void run()
void runMessage()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, vel, flags, dt, deleteInObstacle, stopInObstacle, skipNew, ptype, exclude, u);
}
}
debMsg("Executing kernel GridAdvectKernel ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
_GridAdvectKernel<S> _inner;
std::vector<S> &p;
const MACGrid &vel;
const FlagGrid &flags;
@@ -2050,7 +2112,7 @@ template<class S> struct KnDeleteInObstacle : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, std::vector<S> &p, const FlagGrid &flags)
inline void op(IndexInt idx, std::vector<S> &p, const FlagGrid &flags) const
{
if (p[idx].flag & ParticleBase::PDELETE)
return;
@@ -2068,17 +2130,21 @@ template<class S> struct KnDeleteInObstacle : public KernelBase {
return flags;
}
typedef FlagGrid type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnDeleteInObstacle ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, flags);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, flags);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<S> &p;
const FlagGrid &flags;
@@ -2123,7 +2189,7 @@ template<class S> struct KnClampPositions : public KernelBase {
ParticleDataImpl<Vec3> *posOld = nullptr,
bool stopInObstacle = true,
const ParticleDataImpl<int> *ptype = nullptr,
const int exclude = 0)
const int exclude = 0) const
{
if (p[idx].flag & ParticleBase::PDELETE)
return;
@@ -2169,17 +2235,21 @@ template<class S> struct KnClampPositions : public KernelBase {
return exclude;
}
typedef int type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnClampPositions ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, flags, posOld, stopInObstacle, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, flags, posOld, stopInObstacle, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
std::vector<S> &p;
const FlagGrid &flags;
@@ -2271,7 +2341,13 @@ template<class S> struct KnProjectParticles : public KernelBase {
return rand;
}
typedef RandomStream type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnProjectParticles ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void run()
{
const IndexInt _sz = size;
@@ -2313,7 +2389,7 @@ template<class S> struct KnProjectOutOfBnd : public KernelBase {
const Real bnd,
const bool *axis,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (!part.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
return;
@@ -2362,17 +2438,21 @@ template<class S> struct KnProjectOutOfBnd : public KernelBase {
return exclude;
}
typedef int type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnProjectOutOfBnd ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, part, flags, bnd, axis, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, part, flags, bnd, axis, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleSystem<S> &part;
const FlagGrid &flags;

View File

@@ -59,7 +59,7 @@ template<class T> struct SemiLagrange : public KernelBase {
Real dt,
bool isLevelset,
int orderSpace,
int orderTrace)
int orderTrace) const
{
if (orderTrace == 1) {
// traceback position
@@ -117,34 +117,36 @@ template<class T> struct SemiLagrange : public KernelBase {
return orderTrace;
}
typedef int type7;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel SemiLagrange ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, isLevelset, orderSpace, orderTrace);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -187,7 +189,7 @@ struct SemiLagrangeMAC : public KernelBase {
const MACGrid &src,
Real dt,
int orderSpace,
int orderTrace)
int orderTrace) const
{
if (orderTrace == 1) {
// get currect velocity at MAC position
@@ -257,34 +259,36 @@ struct SemiLagrangeMAC : public KernelBase {
return orderTrace;
}
typedef int type6;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel SemiLagrangeMAC ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, src, dt, orderSpace, orderTrace);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -327,7 +331,7 @@ template<class T> struct MacCormackCorrect : public KernelBase {
const Grid<T> &bwd,
Real strength,
bool isLevelSet,
bool isMAC = false)
bool isMAC = false) const
{
dst[idx] = fwd[idx];
@@ -376,17 +380,21 @@ template<class T> struct MacCormackCorrect : public KernelBase {
return isMAC;
}
typedef bool type7;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel MacCormackCorrect ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const FlagGrid &flags;
Grid<T> &dst;
@@ -432,7 +440,7 @@ template<class T> struct MacCormackCorrectMAC : public KernelBase {
const Grid<T> &bwd,
Real strength,
bool isLevelSet,
bool isMAC = false)
bool isMAC = false) const
{
bool skip[3] = {false, false, false};
@@ -497,34 +505,36 @@ template<class T> struct MacCormackCorrectMAC : public KernelBase {
return isMAC;
}
typedef bool type7;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MacCormackCorrectMAC ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dst, old, fwd, bwd, strength, isLevelSet, isMAC);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<T> &dst;
@@ -752,7 +762,7 @@ template<class T> struct MacCormackClamp : public KernelBase {
const Grid<T> &orig,
const Grid<T> &fwd,
Real dt,
const int clampMode)
const int clampMode) const
{
T dval = dst(i, j, k);
Vec3i gridUpper = flags.getSize() - 1;
@@ -820,34 +830,36 @@ template<class T> struct MacCormackClamp : public KernelBase {
return clampMode;
}
typedef int type6;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MacCormackClamp ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -889,7 +901,7 @@ struct MacCormackClampMAC : public KernelBase {
const MACGrid &orig,
const MACGrid &fwd,
Real dt,
const int clampMode)
const int clampMode) const
{
Vec3 pos(i, j, k);
Vec3 dval = dst(i, j, k);
@@ -945,34 +957,36 @@ struct MacCormackClampMAC : public KernelBase {
return clampMode;
}
typedef int type6;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MacCormackClampMAC ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, dst, orig, fwd, dt, clampMode);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -1002,39 +1016,27 @@ void fnAdvectSemiLagrange(FluidSolver *parent,
bool levelset = orig.getType() & GridBase::TypeLevelset;
// forward step
GridType *fwd = new GridType(parent, true, false, false);
SemiLagrange<T>(flags, vel, *fwd, orig, dt, levelset, orderSpace, orderTrace);
GridType fwd(parent);
SemiLagrange<T>(flags, vel, fwd, orig, dt, levelset, orderSpace, orderTrace);
if (order == 1) {
#if OPENMP && OPENMP_OFFLOAD
orig.copyFrom(*fwd, true, false);
#else
orig.swap(*fwd);
#endif
orig.swap(fwd);
}
else if (order == 2) { // MacCormack
GridType bwd(parent);
GridType *newGrid = new GridType(parent, true, false, false);
GridType newGrid(parent);
// bwd <- backwards step
SemiLagrange<T>(flags, vel, bwd, *fwd, -dt, levelset, orderSpace, orderTrace);
SemiLagrange<T>(flags, vel, bwd, fwd, -dt, levelset, orderSpace, orderTrace);
// newGrid <- compute correction
MacCormackCorrect<T>(flags, *newGrid, orig, *fwd, bwd, strength, levelset);
MacCormackCorrect<T>(flags, newGrid, orig, fwd, bwd, strength, levelset);
// clamp values
MacCormackClamp<T>(flags, vel, *newGrid, orig, *fwd, dt, clampMode);
MacCormackClamp<T>(flags, vel, newGrid, orig, fwd, dt, clampMode);
#if OPENMP && OPENMP_OFFLOAD
orig.copyFrom(*newGrid, true, false);
#else
orig.swap(*newGrid);
#endif
if (newGrid)
delete newGrid;
orig.swap(newGrid);
}
if (fwd)
delete fwd;
}
// outflow functions
@@ -1085,7 +1087,7 @@ struct extrapolateVelConvectiveBC : public KernelBase {
const MACGrid &vel,
MACGrid &velDst,
const MACGrid &velPrev,
Real timeStep)
Real timeStep) const
{
if (flags.isOutflow(i, j, k)) {
const Vec3 bulkVel = getBulkVel(flags, vel, i, j, k);
@@ -1152,34 +1154,36 @@ struct extrapolateVelConvectiveBC : public KernelBase {
return timeStep;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel extrapolateVelConvectiveBC ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velDst, velPrev, timeStep);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velDst, velPrev, timeStep);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velDst, velPrev, timeStep);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -1196,7 +1200,8 @@ struct copyChangedVels : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel)
inline void op(
int i, int j, int k, const FlagGrid &flags, const MACGrid &velDst, MACGrid &vel) const
{
if (flags.isOutflow(i, j, k))
vel(i, j, k) = velDst(i, j, k);
@@ -1216,34 +1221,36 @@ struct copyChangedVels : public KernelBase {
return vel;
}
typedef MACGrid type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel copyChangedVels ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, velDst, vel);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, velDst, vel);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, velDst, vel);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &velDst;
@@ -1268,7 +1275,7 @@ struct knResetPhiInObs : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const FlagGrid &flags, Grid<Real> &sdf)
inline void op(int i, int j, int k, const FlagGrid &flags, Grid<Real> &sdf) const
{
if (flags.isObstacle(i, j, k) && (sdf(i, j, k) < 0.)) {
sdf(i, j, k) = 0.1;
@@ -1284,34 +1291,36 @@ struct knResetPhiInObs : public KernelBase {
return sdf;
}
typedef Grid<Real> type1;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knResetPhiInObs ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, sdf);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, sdf);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, sdf);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &sdf;
@@ -1369,45 +1378,33 @@ void fnAdvectSemiLagrange<MACGrid>(FluidSolver *parent,
Real dt = parent->getDt();
// forward step
MACGrid *fwd = new MACGrid(parent, true, false, false);
SemiLagrangeMAC(flags, vel, *fwd, orig, dt, orderSpace, orderTrace);
MACGrid fwd(parent);
SemiLagrangeMAC(flags, vel, fwd, orig, dt, orderSpace, orderTrace);
if (orderSpace != 1) {
debMsg("Warning higher order for MAC grids not yet implemented...", 1);
}
if (order == 1) {
applyOutflowBC(flags, *fwd, orig, dt);
#if OPENMP && OPENMP_OFFLOAD
orig.copyFrom(*fwd, true, false);
#else
orig.swap(*fwd);
#endif
applyOutflowBC(flags, fwd, orig, dt);
orig.swap(fwd);
}
else if (order == 2) { // MacCormack
MACGrid bwd(parent);
MACGrid *newGrid = new MACGrid(parent, true, false, false);
MACGrid newGrid(parent);
// bwd <- backwards step
SemiLagrangeMAC(flags, vel, bwd, *fwd, -dt, orderSpace, orderTrace);
SemiLagrangeMAC(flags, vel, bwd, fwd, -dt, orderSpace, orderTrace);
// newGrid <- compute correction
MacCormackCorrectMAC<Vec3>(flags, *newGrid, orig, *fwd, bwd, strength, false, true);
MacCormackCorrectMAC<Vec3>(flags, newGrid, orig, fwd, bwd, strength, false, true);
// clamp values
MacCormackClampMAC(flags, vel, *newGrid, orig, *fwd, dt, clampMode);
MacCormackClampMAC(flags, vel, newGrid, orig, fwd, dt, clampMode);
applyOutflowBC(flags, *newGrid, orig, dt);
#if OPENMP && OPENMP_OFFLOAD
orig.copyFrom(*newGrid, true, false);
#else
orig.swap(*newGrid);
#endif
if (newGrid)
delete newGrid;
applyOutflowBC(flags, newGrid, orig, dt);
orig.swap(newGrid);
}
if (fwd)
delete fwd;
}
//! Perform semi-lagrangian advection of target Real- or Vec3 grid

View File

@@ -239,7 +239,13 @@ struct knApicMapLinearVec3ToMACGrid : public KernelBase {
return boundaryWidth;
}
typedef int type9;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knApicMapLinearVec3ToMACGrid ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void run()
{
const IndexInt _sz = size;
@@ -364,7 +370,7 @@ struct knApicMapLinearMACGridToVec3 : public KernelBase {
const FlagGrid &flags,
const ParticleDataImpl<int> *ptype,
const int exclude,
const int boundaryWidth)
const int boundaryWidth) const
{
if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
return;
@@ -503,17 +509,21 @@ struct knApicMapLinearMACGridToVec3 : public KernelBase {
return boundaryWidth;
}
typedef int type9;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knApicMapLinearMACGridToVec3 ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, vp, cpx, cpy, cpz, p, vg, flags, ptype, exclude, boundaryWidth);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, vp, cpx, cpy, cpz, p, vg, flags, ptype, exclude, boundaryWidth);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<Vec3> &vp;
ParticleDataImpl<Vec3> &cpx;

View File

@@ -52,7 +52,7 @@ struct KnApplyForceField : public KernelBase {
const Grid<Vec3> &force,
const Grid<Real> *include,
bool additive,
bool isMAC)
bool isMAC) const
{
bool curFluid = flags.isFluid(i, j, k);
bool curEmpty = flags.isEmpty(i, j, k);
@@ -105,34 +105,36 @@ struct KnApplyForceField : public KernelBase {
return isMAC;
}
typedef bool type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyForceField ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, include, additive, isMAC);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, include, additive, isMAC);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, include, additive, isMAC);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
MACGrid &vel;
@@ -163,7 +165,7 @@ struct KnApplyForce : public KernelBase {
MACGrid &vel,
Vec3 force,
const Grid<Real> *exclude,
bool additive)
bool additive) const
{
bool curFluid = flags.isFluid(i, j, k);
bool curEmpty = flags.isEmpty(i, j, k);
@@ -204,34 +206,36 @@ struct KnApplyForce : public KernelBase {
return additive;
}
typedef bool type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyForce ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, exclude, additive);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, exclude, additive);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force, exclude, additive);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
MACGrid &vel;
@@ -342,7 +346,7 @@ struct KnAddBuoyancy : public KernelBase {
const FlagGrid &flags,
const Grid<Real> &factor,
MACGrid &vel,
Vec3 strength)
Vec3 strength) const
{
if (!flags.isFluid(i, j, k))
return;
@@ -373,34 +377,36 @@ struct KnAddBuoyancy : public KernelBase {
return strength;
}
typedef Vec3 type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnAddBuoyancy ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, factor, vel, strength);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, factor, vel, strength);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, factor, vel, strength);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const Grid<Real> &factor;
@@ -656,7 +662,7 @@ struct KnSetInflow : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, MACGrid &vel, int dim, int p0, const Vec3 &val)
inline void op(int i, int j, int k, MACGrid &vel, int dim, int p0, const Vec3 &val) const
{
Vec3i p(i, j, k);
if (p[dim] == p0 || p[dim] == p0 + 1)
@@ -682,34 +688,36 @@ struct KnSetInflow : public KernelBase {
return val;
}
typedef Vec3 type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnSetInflow ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, dim, p0, val);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, dim, p0, val);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, dim, p0, val);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid &vel;
int dim;
@@ -776,7 +784,8 @@ struct KnSetWallBcs : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel)
inline void op(
int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const MACGrid *obvel) const
{
bool curFluid = flags.isFluid(i, j, k);
@@ -839,34 +848,36 @@ struct KnSetWallBcs : public KernelBase {
return obvel;
}
typedef MACGrid type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnSetWallBcs ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, obvel);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, obvel);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, obvel);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
MACGrid &vel;
@@ -901,7 +912,7 @@ struct KnSetWallBcsFrac : public KernelBase {
MACGrid &velTarget,
const MACGrid *obvel,
const Grid<Real> *phiObs,
const int &boundaryWidth = 0)
const int &boundaryWidth = 0) const
{
bool curFluid = flags.isFluid(i, j, k);
bool curObs = flags.isObstacle(i, j, k);
@@ -1014,34 +1025,36 @@ struct KnSetWallBcsFrac : public KernelBase {
return boundaryWidth;
}
typedef int type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnSetWallBcsFrac ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, velTarget, obvel, phiObs, boundaryWidth);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -1114,7 +1127,8 @@ struct KnAddForceIfLower : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force)
inline void op(
int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Vec3> &force) const
{
bool curFluid = flags.isFluid(i, j, k);
bool curEmpty = flags.isEmpty(i, j, k);
@@ -1159,34 +1173,36 @@ struct KnAddForceIfLower : public KernelBase {
return force;
}
typedef Grid<Vec3> type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnAddForceIfLower ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, force);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
MACGrid &vel;
@@ -1250,7 +1266,7 @@ struct KnConfForce : public KernelBase {
const Grid<Real> &grid,
const Grid<Vec3> &curl,
Real str,
const Grid<Real> *strGrid)
const Grid<Real> *strGrid) const
{
Vec3 grad = 0.5 * Vec3(grid(i + 1, j, k) - grid(i - 1, j, k),
grid(i, j + 1, k) - grid(i, j - 1, k),
@@ -1287,34 +1303,36 @@ struct KnConfForce : public KernelBase {
return strGrid;
}
typedef Grid<Real> type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnConfForce ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, force, grid, curl, str, strGrid);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, force, grid, curl, str, strGrid);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, force, grid, curl, str, strGrid);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<Vec3> &force;
const Grid<Real> &grid;
@@ -1496,7 +1514,7 @@ struct KnDissolveSmoke : public KernelBase {
int speed,
bool logFalloff,
float dydx,
float fac)
float fac) const
{
bool curFluid = flags.isFluid(i, j, k);
@@ -1584,34 +1602,36 @@ struct KnDissolveSmoke : public KernelBase {
return fac;
}
typedef float type9;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel KnDissolveSmoke ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, heat, red, green, blue, speed, logFalloff, dydx, fac);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) for 3D grids,
// else j-rows [0, maxY) for the 2D case.
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &density;

View File

@@ -71,7 +71,7 @@ struct KnProcessBurn : public KernelBase {
Real ignitionTemp,
Real maxTemp,
Real dt,
Vec3 flameSmokeColor)
Vec3 flameSmokeColor) const
{
// Save initial values
Real origFuel = fuel(i, j, k);
@@ -179,44 +179,19 @@ struct KnProcessBurn : public KernelBase {
return flameSmokeColor;
}
typedef Vec3 type12;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel KnProcessBurn ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
j,
k,
fuel,
density,
react,
red,
green,
blue,
heat,
burningRate,
flameSmoke,
ignitionTemp,
maxTemp,
dt,
flameSmokeColor);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
@@ -235,8 +210,35 @@ struct KnProcessBurn : public KernelBase {
maxTemp,
dt,
flameSmokeColor);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i,
j,
k,
fuel,
density,
react,
red,
green,
blue,
heat,
burningRate,
flameSmoke,
ignitionTemp,
maxTemp,
dt,
flameSmokeColor);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY)
// (1-based because the stencil skips the boundary layer).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<Real> &fuel;
Grid<Real> &density;
@@ -342,7 +344,7 @@ struct KnUpdateFlame : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const Grid<Real> &react, Grid<Real> &flame)
inline void op(int i, int j, int k, const Grid<Real> &react, Grid<Real> &flame) const
{
if (react(i, j, k) > 0.0f)
flame(i, j, k) = pow(react(i, j, k), 0.5f);
@@ -359,34 +361,36 @@ struct KnUpdateFlame : public KernelBase {
return flame;
}
typedef Grid<Real> type1;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel KnUpdateFlame ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, react, flame);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, react, flame);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, react, flame);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const Grid<Real> &react;
Grid<Real> &flame;

View File

@@ -283,7 +283,7 @@ struct knClearFluidFlags : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, FlagGrid &flags, int dummy = 0)
inline void op(int i, int j, int k, FlagGrid &flags, int dummy = 0) const
{
if (flags.isFluid(i, j, k)) {
flags(i, j, k) = (flags(i, j, k) | FlagGrid::TypeEmpty) & ~FlagGrid::TypeFluid;
@@ -299,34 +299,36 @@ struct knClearFluidFlags : public KernelBase {
return dummy;
}
typedef int type1;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knClearFluidFlags ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dummy);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dummy);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, dummy);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
FlagGrid &flags;
int dummy;
@@ -340,7 +342,7 @@ struct knSetNbObstacle : public KernelBase {
run();
}
inline void op(
int i, int j, int k, FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs)
int i, int j, int k, FlagGrid &nflags, const FlagGrid &flags, const Grid<Real> &phiObs) const
{
if (phiObs(i, j, k) > 0.)
return;
@@ -379,34 +381,36 @@ struct knSetNbObstacle : public KernelBase {
return phiObs;
}
typedef Grid<Real> type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knSetNbObstacle ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, nflags, flags, phiObs);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, nflags, flags, phiObs);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, nflags, flags, phiObs);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
FlagGrid &nflags;
const FlagGrid &flags;
@@ -434,11 +438,7 @@ void markFluidCells(const BasicParticleSystem &parts,
if (phiObs) {
FlagGrid tmp(flags);
knSetNbObstacle(tmp, flags, *phiObs);
#if OPENMP && OPENMP_OFFLOAD
flags.copyFrom(tmp, true, false);
#else
flags.swap(tmp);
#endif
}
}
static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
@@ -818,37 +818,33 @@ struct ComputeUnionLevelsetPindex : public KernelBase {
LevelsetGrid &phi,
const Real radius,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5); // shifted by half cell
Real phiv = radius * 1.0; // outside
const int r = int(radius) + 1;
int r = int(radius) + 1;
int rZ = phi.is3D() ? r : 0;
for (int zj = k - rZ; zj <= k + rZ; zj++)
for (int yj = j - r; yj <= j + r; yj++)
for (int xj = i - r; xj <= i + r; xj++) {
if (!phi.isInBounds(Vec3i(xj, yj, zj)))
continue;
FOR_NEIGHBORS(phi, r)
{
// note, for the particle indices in indexSys the access is periodic (ie, dont skip for
// eg inBounds(sx,10,10)
IndexInt isysIdxS = index.index(xj, yj, zj);
IndexInt pStart = index(isysIdxS), pEnd = 0;
if (phi.isInBounds(isysIdxS + 1))
pEnd = index(isysIdxS + 1);
else
pEnd = indexSys.size();
// note, for the particle indices in indexSys the access is periodic (ie, dont skip for eg
// inBounds(sx,10,10)
IndexInt isysIdxS = index.index(xj, yj, zj);
IndexInt pStart = index(isysIdxS), pEnd = 0;
if (phi.isInBounds(isysIdxS + 1))
pEnd = index(isysIdxS + 1);
else
pEnd = indexSys.size();
// now loop over particles in cell
for (IndexInt p = pStart; p < pEnd; ++p) {
const int psrc = indexSys[p].sourceIndex;
if (ptype && ((*ptype)[psrc] & exclude))
continue;
const Vec3 pos = parts[psrc].pos;
phiv = std::min(phiv, fabs(norm(gridPos - pos)) - radius);
}
}
// now loop over particles in cell
for (IndexInt p = pStart; p < pEnd; ++p) {
const int psrc = indexSys[p].sourceIndex;
if (ptype && ((*ptype)[psrc] & exclude))
continue;
const Vec3 pos = parts[psrc].pos;
phiv = std::min(phiv, fabs(norm(gridPos - pos)) - radius);
}
}
phi(i, j, k) = phiv;
}
inline const Grid<int> &getArg0()
@@ -886,34 +882,36 @@ struct ComputeUnionLevelsetPindex : public KernelBase {
return exclude;
}
typedef int type6;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel ComputeUnionLevelsetPindex ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, index, parts, indexSys, phi, radius, ptype, exclude);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const Grid<int> &index;
const BasicParticleSystem &parts;
@@ -1017,46 +1015,42 @@ struct ComputeAveragedLevelsetWeight : public KernelBase {
const ParticleDataImpl<int> *ptype,
const int exclude,
Grid<Vec3> *save_pAcc = nullptr,
Grid<Real> *save_rAcc = nullptr)
Grid<Real> *save_rAcc = nullptr) const
{
const Vec3 gridPos = Vec3(i, j, k) + Vec3(0.5); // shifted by half cell
Real phiv = radius * 1.0; // outside
// loop over neighborhood, similar to ComputeUnionLevelsetPindex
const Real sradiusInv = 1. / (4. * radius * radius);
int r = int(1. * radius) + 1;
int rZ = phi.is3D() ? r : 0;
const int r = int(radius) + 1;
// accumulators
Real wacc = 0.;
Vec3 pacc = Vec3(0.);
Real racc = 0.;
for (int zj = k - rZ; zj <= k + rZ; zj++)
for (int yj = j - r; yj <= j + r; yj++)
for (int xj = i - r; xj <= i + r; xj++) {
if (!phi.isInBounds(Vec3i(xj, yj, zj)))
continue;
FOR_NEIGHBORS(phi, r)
{
IndexInt isysIdxS = index.index(xj, yj, zj);
IndexInt pStart = index(isysIdxS), pEnd = 0;
if (phi.isInBounds(isysIdxS + 1))
pEnd = index(isysIdxS + 1);
else
pEnd = indexSys.size();
for (IndexInt p = pStart; p < pEnd; ++p) {
IndexInt psrc = indexSys[p].sourceIndex;
if (ptype && ((*ptype)[psrc] & exclude))
continue;
IndexInt isysIdxS = index.index(xj, yj, zj);
IndexInt pStart = index(isysIdxS), pEnd = 0;
if (phi.isInBounds(isysIdxS + 1))
pEnd = index(isysIdxS + 1);
else
pEnd = indexSys.size();
for (IndexInt p = pStart; p < pEnd; ++p) {
IndexInt psrc = indexSys[p].sourceIndex;
if (ptype && ((*ptype)[psrc] & exclude))
continue;
Vec3 pos = parts[psrc].pos;
Real s = normSquare(gridPos - pos) * sradiusInv;
// Real w = std::max(0., cubed(1.-s) );
Real w = std::max(0., (1. - s)); // a bit smoother
wacc += w;
racc += radius * w;
pacc += pos * w;
}
}
Vec3 pos = parts[psrc].pos;
Real s = normSquare(gridPos - pos) * sradiusInv;
// Real w = std::max(0., cubed(1.-s) );
Real w = std::max(0., (1. - s)); // a bit smoother
wacc += w;
racc += radius * w;
pacc += pos * w;
}
}
if (wacc > VECTOR_EPSILON) {
racc /= wacc;
@@ -1115,45 +1109,36 @@ struct ComputeAveragedLevelsetWeight : public KernelBase {
return save_rAcc;
}
typedef Grid<Real> type8;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel ComputeAveragedLevelsetWeight ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i,
j,
k,
parts,
index,
indexSys,
phi,
radius,
ptype,
exclude,
save_pAcc,
save_rAcc);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, parts, index, indexSys, phi, radius, ptype, exclude, save_pAcc, save_rAcc);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const BasicParticleSystem &parts;
const Grid<int> &index;
@@ -1180,7 +1165,7 @@ template<class T> struct knSmoothGrid : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor)
inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
{
T val = me(i, j, k) + me(i + 1, j, k) + me(i - 1, j, k) + me(i, j + 1, k) + me(i, j - 1, k);
if (me.is3D()) {
@@ -1203,34 +1188,36 @@ template<class T> struct knSmoothGrid : public KernelBase {
return factor;
}
typedef Real type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knSmoothGrid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const Grid<T> &me;
Grid<T> &tmp;
@@ -1244,7 +1231,7 @@ template<class T> struct knSmoothGridNeg : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor)
inline void op(int i, int j, int k, const Grid<T> &me, Grid<T> &tmp, Real factor) const
{
T val = me(i, j, k) + me(i + 1, j, k) + me(i - 1, j, k) + me(i, j + 1, k) + me(i, j - 1, k);
if (me.is3D()) {
@@ -1271,34 +1258,36 @@ template<class T> struct knSmoothGridNeg : public KernelBase {
return factor;
}
typedef Real type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knSmoothGridNeg ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, me, tmp, factor);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const Grid<T> &me;
Grid<T> &tmp;
@@ -1407,7 +1396,7 @@ struct correctLevelset : public KernelBase {
const Grid<Real> &rAcc,
const Real radius,
const Real t_low,
const Real t_high)
const Real t_high) const
{
if (rAcc(i, j, k) <= VECTOR_EPSILON)
return; // outside nothing happens
@@ -1474,34 +1463,36 @@ struct correctLevelset : public KernelBase {
return t_high;
}
typedef Real type5;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel correctLevelset ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, phi, pAcc, rAcc, radius, t_low, t_high);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [1, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
LevelsetGrid &phi;
const Grid<Vec3> &pAcc;
@@ -1633,7 +1624,7 @@ struct knPushOutofObs : public KernelBase {
const Real shift,
const Real thresh,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (!parts.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
return;
@@ -1684,17 +1675,21 @@ struct knPushOutofObs : public KernelBase {
return exclude;
}
typedef int type6;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knPushOutofObs ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
// TBB worker body: apply op() to every particle index in this sub-range,
// forwarding the kernel's captured arguments.
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, parts, flags, phiObs, shift, thresh, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, parts, flags, phiObs, shift, thresh, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystem &parts;
const FlagGrid &flags;
@@ -1764,7 +1759,10 @@ template<class T> struct knSafeDivReal : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<T> &me, const Grid<Real> &other, Real cutoff = VECTOR_EPSILON)
inline void op(IndexInt idx,
Grid<T> &me,
const Grid<Real> &other,
Real cutoff = VECTOR_EPSILON) const
{
if (other[idx] < cutoff) {
me[idx] = 0.;
@@ -1789,17 +1787,21 @@ template<class T> struct knSafeDivReal : public KernelBase {
return cutoff;
}
typedef Real type2;
void runMessage(){};
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knSafeDivReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
// TBB worker body: apply op() (safe element-wise division) to every
// linear cell index in this sub-range.
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, me, other, cutoff);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, me, other, cutoff);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<T> &me;
const Grid<Real> &other;
@@ -1877,7 +1879,13 @@ struct knMapLinearVec3ToMACGrid : public KernelBase {
return exclude;
}
typedef int type6;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knMapLinearVec3ToMACGrid ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void run()
{
const IndexInt _sz = size;
@@ -2014,7 +2022,13 @@ template<class T> struct knMapLinear : public KernelBase {
return psource;
}
typedef ParticleDataImpl<T> type4;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knMapLinear ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void run()
{
const IndexInt _sz = size;
@@ -2139,7 +2153,7 @@ template<class T> struct knMapFromGrid : public KernelBase {
inline void op(IndexInt idx,
const BasicParticleSystem &p,
const Grid<T> &gsrc,
ParticleDataImpl<T> &target)
ParticleDataImpl<T> &target) const
{
if (!p.isActive(idx))
return;
@@ -2160,17 +2174,21 @@ template<class T> struct knMapFromGrid : public KernelBase {
return target;
}
typedef ParticleDataImpl<T> type2;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knMapFromGrid ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
// TBB worker body: apply op() to every particle index in this sub-range.
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, gsrc, target);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, gsrc, target);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &p;
const Grid<T> &gsrc;
@@ -2280,7 +2298,7 @@ struct knMapLinearMACGridToVec3_PIC : public KernelBase {
const MACGrid &vel,
ParticleDataImpl<Vec3> &pvel,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
return;
@@ -2317,17 +2335,21 @@ struct knMapLinearMACGridToVec3_PIC : public KernelBase {
return exclude;
}
typedef int type5;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knMapLinearMACGridToVec3_PIC ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
// TBB worker body: apply op() to every particle index in this sub-range.
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, flags, vel, pvel, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, flags, vel, pvel, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &p;
const FlagGrid &flags;
@@ -2416,7 +2438,7 @@ struct knMapLinearMACGridToVec3_FLIP : public KernelBase {
ParticleDataImpl<Vec3> &pvel,
const Real flipRatio,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (!p.isActive(idx) || (ptype && ((*ptype)[idx] & exclude)))
return;
@@ -2464,17 +2486,21 @@ struct knMapLinearMACGridToVec3_FLIP : public KernelBase {
return exclude;
}
typedef int type7;
void runMessage(){};
// Debug logging only: kernel name (level 3) and particle count (level 4).
void runMessage()
{
debMsg("Executing kernel knMapLinearMACGridToVec3_FLIP ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
// TBB worker body: apply op() to every particle index in this sub-range.
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, flags, vel, oldVel, pvel, flipRatio, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, flags, vel, oldVel, pvel, flipRatio, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &p;
const FlagGrid &flags;
@@ -2566,7 +2592,7 @@ struct knCombineVels : public KernelBase {
MACGrid &combineVel,
const LevelsetGrid *phi,
Real narrowBand,
Real thresh)
Real thresh) const
{
int idx = vel.index(i, j, k);
@@ -2622,34 +2648,36 @@ struct knCombineVels : public KernelBase {
return thresh;
}
typedef Real type5;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel knCombineVels ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, vel, w, combineVel, phi, narrowBand, thresh);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid &vel;
const Grid<Vec3> &w;

View File

@@ -62,7 +62,7 @@ struct apply1DKernelDirX : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
{
int nx = in.getSizeX();
int kn = kernel.n;
@@ -91,34 +91,36 @@ struct apply1DKernelDirX : public KernelBase {
return kernel;
}
typedef Matrix type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel apply1DKernelDirX ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const MACGrid &in;
MACGrid &out;
@@ -134,7 +136,7 @@ struct apply1DKernelDirY : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
{
int ny = in.getSizeY();
int kn = kernel.n;
@@ -163,34 +165,36 @@ struct apply1DKernelDirY : public KernelBase {
return kernel;
}
typedef Matrix type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel apply1DKernelDirY ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const MACGrid &in;
MACGrid &out;
@@ -206,7 +210,7 @@ struct apply1DKernelDirZ : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel)
inline void op(int i, int j, int k, const MACGrid &in, MACGrid &out, const Matrix &kernel) const
{
int nz = in.getSizeZ();
int kn = kernel.n;
@@ -235,34 +239,36 @@ struct apply1DKernelDirZ : public KernelBase {
return kernel;
}
typedef Matrix type2;
void runMessage(){};
void run()
// Debug logging only: kernel name (level 3) and grid range (level 4).
void runMessage()
{
debMsg("Executing kernel apply1DKernelDirZ ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, in, out, kernel);
}
}
// Launch the TBB sweep: k-slices [minZ, maxZ) in 3D, else j-rows [0, maxY).
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const MACGrid &in;
MACGrid &out;
@@ -563,88 +569,197 @@ void prox_f(MACGrid &v,
// *****************************************************************************
// TODO (sebbas): Disabled for now
// // re-uses main pressure solve from pressure.cpp
// void solvePressure(
// MACGrid& vel, Grid<Real>& pressure, const FlagGrid& flags, Real cgAccuracy = 1e-3,
// const Grid<Real>* phi = nullptr,
// const Grid<Real>* perCellCorr = nullptr,
// const MACGrid* fractions = nullptr,
// const MACGrid* obvel = nullptr,
// Real gfClamp = 1e-04,
// Real cgMaxIterFac = 1.5,
// bool precondition = true,
// int preconditioner = 1,
// bool enforceCompatibility = false,
// bool useL2Norm = false,
// bool zeroPressureFixing = false,
// const Grid<Real> *curv = nullptr,
// const Real surfTens = 0.0,
// Grid<Real>* retRhs = nullptr );
// re-uses main pressure solve from pressure.cpp
// Forward declaration of the main pressure solve (defined elsewhere,
// per the comment above: pressure.cpp). Declared here with defaulted
// optional inputs so fluid guiding can reuse it.
// NOTE(review): parameter semantics are not visible in this file —
// verify against the definition in pressure.cpp.
void solvePressure(MACGrid &vel,
Grid<Real> &pressure,
const FlagGrid &flags,
Real cgAccuracy = 1e-3,
const Grid<Real> *phi = nullptr,
const Grid<Real> *perCellCorr = nullptr,
const MACGrid *fractions = nullptr,
const MACGrid *obvel = nullptr,
Real gfClamp = 1e-04,
Real cgMaxIterFac = 1.5,
bool precondition = true,
int preconditioner = 1,
bool enforceCompatibility = false,
bool useL2Norm = false,
bool zeroPressureFixing = false,
const Grid<Real> *curv = nullptr,
const Real surfTens = 0.0,
Grid<Real> *retRhs = nullptr);
//! Main function for fluid guiding; includes the "regular" pressure solve.
//! Iteratively blends the current velocity `vel` towards the guiding
//! velocity `velT` (weighted per cell by `weight`) while re-enforcing the
//! pressure solve each iteration. Uses a primal/dual-style scheme with
//! auxiliary grids x, y, z (see the "dual/slack variables" below).
//! NOTE(review): theta/tau/sigma are the step parameters of that scheme --
//! confirm their exact roles against the guiding paper before tuning.
//!
//! \param vel        velocity grid; overwritten in place with the guided result
//! \param velT       target/guiding velocity
//! \param pressure   pressure grid used by the inner solvePressure() calls
//! \param weight     per-cell guiding weight (enters precomputeInvA)
//! \param blurRadius radius of the precomputed separable blur kernel
//! \param epsRel,epsAbs  relative/absolute tolerances of the stopping test
//! \param maxIters   iteration cap
//! The remaining parameters are forwarded unchanged to solvePressure().
void PD_fluid_guiding(MACGrid &vel,
                      MACGrid &velT,
                      Grid<Real> &pressure,
                      FlagGrid &flags,
                      Grid<Real> &weight,
                      int blurRadius = 5,
                      Real theta = 1.0,
                      Real tau = 1.0,
                      Real sigma = 1.0,
                      Real epsRel = 1e-3,
                      Real epsAbs = 1e-3,
                      int maxIters = 200,
                      Grid<Real> *phi = nullptr,
                      Grid<Real> *perCellCorr = nullptr,
                      MACGrid *fractions = nullptr,
                      MACGrid *obvel = nullptr,
                      Real gfClamp = 1e-04,
                      Real cgMaxIterFac = 1.5,
                      Real cgAccuracy = 1e-3,
                      int preconditioner = 1,
                      bool zeroPressureFixing = false,
                      const Grid<Real> *curv = nullptr,
                      const Real surfTens = 0.)
{
  FluidSolver *parent = vel.getParent();

  // initialize dual/slack variables; velC keeps a copy of the input velocity
  MACGrid velC = MACGrid(parent);
  velC.copyFrom(vel);
  MACGrid x = MACGrid(parent);
  MACGrid y = MACGrid(parent);
  MACGrid z = MACGrid(parent);
  MACGrid x0 = MACGrid(parent);
  MACGrid z0 = MACGrid(parent);

  // precomputation: separable blur kernel, Q, and the (inverse) system grid
  ADMM_precompute_Separable(blurRadius);
  MACGrid Q = MACGrid(parent);
  precomputeQ(Q, flags, velT, velC, gBlurKernel, sigma);
  MACGrid invA = MACGrid(parent);
  precomputeInvA(invA, weight, sigma);

  // main iteration loop
  int iter = 0;
  for (iter = 0; iter < maxIters; iter++) {
    // x-update: proximal step. Effectively
    //   x = x0 - sigma * (prox_f(x0/sigma + y) - y)   with x0 = previous x
    x0.copyFrom(x);
    x.multConst(1.0 / sigma);
    x.add(y);
    prox_f(x, flags, Q, velC, sigma, invA);
    x.multConst(-sigma);
    x.addScaled(y, sigma);
    x.add(x0);

    // z-update: gradient step z -= tau * x, then re-enforce the pressure solve
    z0.copyFrom(z);
    z.addScaled(x, -tau);
    Real cgAccuracyAdaptive = cgAccuracy;

    solvePressure(z,
                  pressure,
                  flags,
                  cgAccuracyAdaptive,
                  phi,
                  perCellCorr,
                  fractions,
                  obvel,
                  gfClamp,
                  cgMaxIterFac,
                  true,
                  preconditioner,
                  false,
                  false,
                  zeroPressureFixing,
                  curv,
                  surfTens);

    // y-update: extrapolation, y = z + theta * (z - z0)
    y.copyFrom(z);
    y.sub(z0);
    y.multConst(theta);
    y.add(z);

    // stopping criterion: residual norm below the combined abs/rel tolerance
    // (skipped on the very first iteration, where z0 is not yet meaningful)
    bool stop = (iter > 0 && getRNorm(z, z0) < getEpsDual(epsAbs, epsRel, z));

    if (stop || (iter == maxIters - 1))
      break;
  }

  // vel_new = z
  vel.copyFrom(z);
  debMsg("PD_fluid_guiding iterations:" << iter, 1);
}
// Auto-generated Python-binding wrapper for PD_fluid_guiding().
// Unpacks positional/keyword arguments via PbArgs, holds an ArgLocker over
// the fetched grid references while the C++ implementation runs, and returns
// Py_None on success (0 with a Python error set on failure).
// NOTE(review): the argument names, positions and defaults below must stay
// in sync with the PD_fluid_guiding() C++ signature -- this glue is normally
// regenerated by the mantaflow preprocessor, so avoid hand-editing logic here.
static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
  try {
    PbArgs _args(_linargs, _kwds);
    FluidSolver *parent = _args.obtainParent();
    bool noTiming = _args.getOpt<bool>("notiming", -1, 0);
    pbPreparePlugin(parent, "PD_fluid_guiding", !noTiming);
    PyObject *_retval = nullptr;
    {
      ArgLocker _lock;
      // required grid arguments (positions 0-4)
      MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
      MACGrid &velT = *_args.getPtr<MACGrid>("velT", 1, &_lock);
      Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
      FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
      Grid<Real> &weight = *_args.getPtr<Grid<Real>>("weight", 4, &_lock);
      // optional guiding parameters (positions 5-11)
      int blurRadius = _args.getOpt<int>("blurRadius", 5, 5, &_lock);
      Real theta = _args.getOpt<Real>("theta", 6, 1.0, &_lock);
      Real tau = _args.getOpt<Real>("tau", 7, 1.0, &_lock);
      Real sigma = _args.getOpt<Real>("sigma", 8, 1.0, &_lock);
      Real epsRel = _args.getOpt<Real>("epsRel", 9, 1e-3, &_lock);
      Real epsAbs = _args.getOpt<Real>("epsAbs", 10, 1e-3, &_lock);
      int maxIters = _args.getOpt<int>("maxIters", 11, 200, &_lock);
      // optional arguments forwarded to the inner pressure solve (12-22)
      Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 12, nullptr, &_lock);
      Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>("perCellCorr", 13, nullptr, &_lock);
      MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 14, nullptr, &_lock);
      MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 15, nullptr, &_lock);
      Real gfClamp = _args.getOpt<Real>("gfClamp", 16, 1e-04, &_lock);
      Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 17, 1.5, &_lock);
      Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 18, 1e-3, &_lock);
      int preconditioner = _args.getOpt<int>("preconditioner", 19, 1, &_lock);
      bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 20, false, &_lock);
      const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 21, nullptr, &_lock);
      const Real surfTens = _args.getOpt<Real>("surfTens", 22, 0., &_lock);
      // the wrapped function returns void; the binding returns None
      _retval = getPyNone();
      PD_fluid_guiding(vel,
                       velT,
                       pressure,
                       flags,
                       weight,
                       blurRadius,
                       theta,
                       tau,
                       sigma,
                       epsRel,
                       epsAbs,
                       maxIters,
                       phi,
                       perCellCorr,
                       fractions,
                       obvel,
                       gfClamp,
                       cgMaxIterFac,
                       cgAccuracy,
                       preconditioner,
                       zeroPressureFixing,
                       curv,
                       surfTens);
      // raises on unconsumed/unknown arguments
      _args.check();
    }
    pbFinalizePlugin(parent, "PD_fluid_guiding", !noTiming);
    return _retval;
  }
  catch (std::exception &e) {
    // translate the C++ exception into a Python error; 0 signals failure
    pbSetError("PD_fluid_guiding", e.what());
    return 0;
  }
}
// Static registration: exposes _W_2 to the manta Python module under the
// name "PD_fluid_guiding" at static-initialization time.
static const Pb::Register _RP_PD_fluid_guiding("", "PD_fluid_guiding", _W_2);
extern "C" {
// Called from the central plugin registry; referencing the static Register
// object (via KEEP_UNUSED) presumably prevents the linker/optimizer from
// discarding this translation unit's registration -- confirm against the
// KEEP_UNUSED macro definition.
void PbRegister_PD_fluid_guiding()
{
  KEEP_UNUSED(_RP_PD_fluid_guiding);
}
}
//! reset precomputation
void releaseBlurPrecomp()
@@ -653,7 +768,7 @@ void releaseBlurPrecomp()
gBlurKernelRadius = -1;
gBlurKernel = 0.f;
}
static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
static PyObject *_W_3(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
try {
PbArgs _args(_linargs, _kwds);
@@ -675,7 +790,7 @@ static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
return 0;
}
}
static const Pb::Register _RP_releaseBlurPrecomp("", "releaseBlurPrecomp", _W_2);
static const Pb::Register _RP_releaseBlurPrecomp("", "releaseBlurPrecomp", _W_3);
extern "C" {
void PbRegister_releaseBlurPrecomp()
{

View File

@@ -56,7 +56,7 @@ struct KnApplyNoiseInfl : public KernelBase {
const WaveletNoiseField &noise,
const Grid<Real> &sdf,
Real scale,
Real sigma)
Real sigma) const
{
if (!flags.isFluid(i, j, k) || sdf(i, j, k) > sigma)
return;
@@ -96,34 +96,36 @@ struct KnApplyNoiseInfl : public KernelBase {
return sigma;
}
typedef Real type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyNoiseInfl ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale, sigma);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale, sigma);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale, sigma);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &density;
@@ -200,7 +202,7 @@ struct KnAddNoise : public KernelBase {
Grid<Real> &density,
const WaveletNoiseField &noise,
const Grid<Real> *sdf,
Real scale)
Real scale) const
{
if (!flags.isFluid(i, j, k) || (sdf && (*sdf)(i, j, k) > 0.))
return;
@@ -231,34 +233,36 @@ struct KnAddNoise : public KernelBase {
return scale;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnAddNoise ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, noise, sdf, scale);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &density;
@@ -325,7 +329,7 @@ template<class T> struct knSetPdataNoise : public KernelBase {
const BasicParticleSystem &parts,
ParticleDataImpl<T> &pdata,
const WaveletNoiseField &noise,
Real scale)
Real scale) const
{
pdata[idx] = noise.evaluate(parts.getPos(idx)) * scale;
}
@@ -349,17 +353,21 @@ template<class T> struct knSetPdataNoise : public KernelBase {
return scale;
}
typedef Real type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSetPdataNoise ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, parts, pdata, noise, scale);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, parts, pdata, noise, scale);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &parts;
ParticleDataImpl<T> &pdata;
@@ -381,7 +389,7 @@ template<class T> struct knSetPdataNoiseVec : public KernelBase {
const BasicParticleSystem &parts,
ParticleDataImpl<T> &pdata,
const WaveletNoiseField &noise,
Real scale)
Real scale) const
{
pdata[idx] = noise.evaluateVec(parts.getPos(idx)) * scale;
}
@@ -405,17 +413,21 @@ template<class T> struct knSetPdataNoiseVec : public KernelBase {
return scale;
}
typedef Real type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSetPdataNoiseVec ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, parts, pdata, noise, scale);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, parts, pdata, noise, scale);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &parts;
ParticleDataImpl<T> &pdata;
@@ -676,7 +688,7 @@ struct KnApplyEmission : public KernelBase {
const Grid<Real> &source,
const Grid<Real> *emissionTexture,
bool isAbsolute,
int type)
int type) const
{
// if type is given, only apply emission when celltype matches type from flaggrid
// and if emission texture is given, only apply emission when some emission is present at cell
@@ -721,34 +733,36 @@ struct KnApplyEmission : public KernelBase {
return type;
}
typedef int type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyEmission ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, source, emissionTexture, isAbsolute, type);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &target;
@@ -823,7 +837,7 @@ struct KnApplyDensity : public KernelBase {
Grid<Real> &density,
const Grid<Real> &sdf,
Real value,
Real sigma)
Real sigma) const
{
if (!flags.isFluid(i, j, k) || sdf(i, j, k) > sigma)
return;
@@ -854,34 +868,36 @@ struct KnApplyDensity : public KernelBase {
return sigma;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyDensity ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, sdf, value, sigma);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, sdf, value, sigma);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, density, sdf, value, sigma);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &density;
@@ -1025,7 +1041,7 @@ struct KnResetInObstacle : public KernelBase {
Grid<Real> *red,
Grid<Real> *green,
Grid<Real> *blue,
Real resetValue)
Real resetValue) const
{
if (!flags.isObstacle(i, j, k))
return;
@@ -1099,34 +1115,36 @@ struct KnResetInObstacle : public KernelBase {
return resetValue;
}
typedef Real type9;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnResetInObstacle ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, density, heat, fuel, flame, red, green, blue, resetValue);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
FlagGrid &flags;
MACGrid &vel;
@@ -1674,7 +1692,7 @@ struct KnUpdateFractions : public KernelBase {
const Grid<Real> &phiObs,
MACGrid &fractions,
const int &boundaryWidth,
const Real fracThreshold)
const Real fracThreshold) const
{
// walls at domain bounds and inner objects
@@ -1769,34 +1787,36 @@ struct KnUpdateFractions : public KernelBase {
return fracThreshold;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnUpdateFractions ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, phiObs, fractions, boundaryWidth, fracThreshold);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
const Grid<Real> &phiObs;
@@ -1876,7 +1896,7 @@ struct KnUpdateFlagsObs : public KernelBase {
const Grid<Real> &phiObs,
const Grid<Real> *phiOut,
const Grid<Real> *phiIn,
int boundaryWidth)
int boundaryWidth) const
{
bool isObs = false;
@@ -1944,34 +1964,36 @@ struct KnUpdateFlagsObs : public KernelBase {
return boundaryWidth;
}
typedef int type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnUpdateFlagsObs ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = boundaryWidth; j < _maxY; j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = boundaryWidth; j < _maxY; j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = boundaryWidth; i < _maxX; i++)
op(i, j, k, flags, fractions, phiObs, phiOut, phiIn, boundaryWidth);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(boundaryWidth, maxY), *this);
}
FlagGrid &flags;
const MACGrid *fractions;
@@ -2045,7 +2067,7 @@ struct kninitVortexVelocity : public KernelBase {
const Grid<Real> &phiObs,
MACGrid &vel,
const Vec3 &center,
const Real &radius)
const Real &radius) const
{
if (phiObs(i, j, k) >= -1.) {
@@ -2093,34 +2115,36 @@ struct kninitVortexVelocity : public KernelBase {
return radius;
}
typedef Real type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel kninitVortexVelocity ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phiObs, vel, center, radius);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phiObs, vel, center, radius);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phiObs, vel, center, radius);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const Grid<Real> &phiObs;
MACGrid &vel;
@@ -2294,7 +2318,7 @@ template<class T> struct knBlurGrid : public KernelBase {
Grid<T> &originGrid,
Grid<T> &targetGrid,
GaussianKernelCreator &gkSigma,
int cdir)
int cdir) const
{
targetGrid(i, j, k) = convolveGrid<T>(originGrid, gkSigma, Vec3(i, j, k), cdir);
}
@@ -2318,34 +2342,36 @@ template<class T> struct knBlurGrid : public KernelBase {
return cdir;
}
typedef int type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knBlurGrid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, targetGrid, gkSigma, cdir);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<T> &originGrid;
Grid<T> &targetGrid;
@@ -2386,7 +2412,7 @@ struct KnBlurMACGridGauss : public KernelBase {
MACGrid &originGrid,
MACGrid &target,
GaussianKernelCreator &gkSigma,
int cdir)
int cdir) const
{
Vec3 pos(i, j, k);
Vec3 step(1.0, 0.0, 0.0);
@@ -2436,34 +2462,36 @@ struct KnBlurMACGridGauss : public KernelBase {
return cdir;
}
typedef int type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnBlurMACGridGauss ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, target, gkSigma, cdir);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, target, gkSigma, cdir);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, originGrid, target, gkSigma, cdir);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid &originGrid;
MACGrid &target;

View File

@@ -61,7 +61,7 @@ struct KnTurbulenceClamp : public KernelBase {
Real minK,
Real maxK,
Real minNu,
Real maxNu)
Real maxNu) const
{
Real eps = egrid[idx];
Real ke = clamp(kgrid[idx], minK, maxK);
@@ -104,17 +104,21 @@ struct KnTurbulenceClamp : public KernelBase {
return maxNu;
}
typedef Real type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnTurbulenceClamp ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, kgrid, egrid, minK, maxK, minNu, maxNu);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, kgrid, egrid, minK, maxK, minNu, maxNu);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &kgrid;
Grid<Real> &egrid;
@@ -159,7 +163,7 @@ struct KnComputeProduction : public KernelBase {
Grid<Real> &prod,
Grid<Real> &nuT,
Grid<Real> *strain,
Real pscale = 1.0f)
Real pscale = 1.0f) const
{
Real curEps = eps(i, j, k);
if (curEps > 0) {
@@ -230,34 +234,36 @@ struct KnComputeProduction : public KernelBase {
return pscale;
}
typedef Real type7;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnComputeProduction ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, ke, eps, prod, nuT, strain, pscale);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const MACGrid &vel;
const Grid<Vec3> &velCenter;
@@ -339,7 +345,7 @@ struct KnAddTurbulenceSource : public KernelBase {
run();
}
inline void op(
IndexInt idx, Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt)
IndexInt idx, Grid<Real> &kgrid, Grid<Real> &egrid, const Grid<Real> &pgrid, Real dt) const
{
Real eps = egrid[idx], prod = pgrid[idx], ke = kgrid[idx];
if (ke <= 0)
@@ -373,17 +379,21 @@ struct KnAddTurbulenceSource : public KernelBase {
return dt;
}
typedef Real type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnAddTurbulenceSource ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, kgrid, egrid, pgrid, dt);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, kgrid, egrid, pgrid, dt);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Real> &kgrid;
Grid<Real> &egrid;

View File

@@ -138,10 +138,9 @@ struct MakeRhs : public KernelBase {
}
}
// TODO (sebbas): Disabled for now
// per cell divergence correction (optional)
// if(perCellCorr)
// set += perCellCorr->get(i,j,k);
if (perCellCorr)
set += perCellCorr->get(i, j, k);
// obtain sum, cell count
sum += set;
@@ -199,61 +198,19 @@ struct MakeRhs : public KernelBase {
return gfClamp;
}
typedef Real type9;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MakeRhs ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const FlagGrid &flags = getArg0();
Grid<Real> &rhs = getArg1();
const MACGrid &vel = getArg2();
const Grid<Real> *perCellCorr = getArg3();
const MACGrid *fractions = getArg4();
const MACGrid *obvel = getArg5();
const Grid<Real> *phi = getArg6();
const Grid<Real> *curv = getArg7();
const Real &surfTens = getArg8();
const Real &gfClamp = getArg9();
#pragma omp target teams distribute parallel for reduction(+:cnt, sum) collapse(3) schedule(static,1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
j,
k,
flags,
rhs,
vel,
perCellCorr,
fractions,
obvel,
phi,
curv,
surfTens,
gfClamp,
cnt,
sum);
}
{
this->sum = sum;
}
}
else {
const int k = 0;
const FlagGrid &flags = getArg0();
Grid<Real> &rhs = getArg1();
const MACGrid &vel = getArg2();
const Grid<Real> *perCellCorr = getArg3();
const MACGrid *fractions = getArg4();
const MACGrid *obvel = getArg5();
const Grid<Real> *phi = getArg6();
const Grid<Real> *curv = getArg7();
const Real &surfTens = getArg8();
const Real &gfClamp = getArg9();
#pragma omp target teams distribute parallel for reduction(+:cnt, sum) collapse(2) schedule(static,1)
{
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
@@ -271,11 +228,55 @@ struct MakeRhs : public KernelBase {
gfClamp,
cnt,
sum);
}
{
this->sum = sum;
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i,
j,
k,
flags,
rhs,
vel,
perCellCorr,
fractions,
obvel,
phi,
curv,
surfTens,
gfClamp,
cnt,
sum);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
MakeRhs(MakeRhs &o, tbb::split)
: KernelBase(o),
flags(o.flags),
rhs(o.rhs),
vel(o.vel),
perCellCorr(o.perCellCorr),
fractions(o.fractions),
obvel(o.obvel),
phi(o.phi),
curv(o.curv),
surfTens(o.surfTens),
gfClamp(o.gfClamp),
cnt(0),
sum(0)
{
}
void join(const MakeRhs &o)
{
cnt += o.cnt;
sum += o.sum;
}
const FlagGrid &flags;
Grid<Real> &rhs;
@@ -301,7 +302,7 @@ struct knCorrectVelocity : public KernelBase {
run();
}
inline void op(
int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure)
int i, int j, int k, const FlagGrid &flags, MACGrid &vel, const Grid<Real> &pressure) const
{
const IndexInt idx = flags.index(i, j, k);
if (flags.isFluid(idx)) {
@@ -352,35 +353,36 @@ struct knCorrectVelocity : public KernelBase {
return pressure;
}
typedef Grid<Real> type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knCorrectVelocity ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
const FlagGrid &flags = getArg0();
MACGrid &vel = getArg1();
const Grid<Real> &pressure = getArg2();
#pragma omp target teams distribute parallel for collapse(3) schedule(static, 1)
{
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, pressure);
}
}
else {
const int k = 0;
const FlagGrid &flags = getArg0();
MACGrid &vel = getArg1();
const Grid<Real> &pressure = getArg2();
#pragma omp target teams distribute parallel for collapse(2) schedule(static, 1)
{
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, pressure);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, vel, pressure);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
MACGrid &vel;
@@ -439,7 +441,7 @@ struct ApplyGhostFluidDiagonal : public KernelBase {
Grid<Real> &A0,
const FlagGrid &flags,
const Grid<Real> &phi,
const Real gfClamp)
const Real gfClamp) const
{
const int X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
const IndexInt idx = flags.index(i, j, k);
@@ -481,34 +483,36 @@ struct ApplyGhostFluidDiagonal : public KernelBase {
return gfClamp;
}
typedef Real type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel ApplyGhostFluidDiagonal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, A0, flags, phi, gfClamp);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, A0, flags, phi, gfClamp);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, A0, flags, phi, gfClamp);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<Real> &A0;
const FlagGrid &flags;
@@ -547,7 +551,7 @@ struct knCorrectVelocityGhostFluid : public KernelBase {
const Grid<Real> &phi,
Real gfClamp,
const Grid<Real> *curv,
const Real surfTens)
const Real surfTens) const
{
const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
const IndexInt idx = flags.index(i, j, k);
@@ -636,34 +640,36 @@ struct knCorrectVelocityGhostFluid : public KernelBase {
return surfTens;
}
typedef Real type6;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knCorrectVelocityGhostFluid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp, curv, surfTens);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
MACGrid &vel;
const FlagGrid &flags;
@@ -704,7 +710,7 @@ struct knReplaceClampedGhostFluidVels : public KernelBase {
const FlagGrid &flags,
const Grid<Real> &pressure,
const Grid<Real> &phi,
Real gfClamp)
Real gfClamp) const
{
const IndexInt idx = flags.index(i, j, k);
const IndexInt X = flags.getStrideX(), Y = flags.getStrideY(), Z = flags.getStrideZ();
@@ -752,34 +758,36 @@ struct knReplaceClampedGhostFluidVels : public KernelBase {
return gfClamp;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knReplaceClampedGhostFluidVels ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, flags, pressure, phi, gfClamp);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
MACGrid &vel;
const FlagGrid &flags;
@@ -814,21 +822,28 @@ struct CountEmptyCells : public KernelBase {
return flags;
}
typedef FlagGrid type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel CountEmptyCells ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, numEmpty);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
int numEmpty = 0;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, flags, numEmpty);
#pragma omp critical
{
this->numEmpty += numEmpty;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
CountEmptyCells(CountEmptyCells &o, tbb::split) : KernelBase(o), flags(o.flags), numEmpty(0)
{
}
void join(const CountEmptyCells &o)
{
numEmpty += o.numEmpty;
}
const FlagGrid &flags;
int numEmpty;
@@ -949,19 +964,11 @@ void computePressureRhs(Grid<Real> &rhs,
const Real surfTens = 0.)
{
// compute divergence and init right hand side
// auto kernMakeRhs = new MakeRhs(flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv,
// surfTens, gfClamp );
printf("pressure = %p, flags = %p, rhs = %p, vel = %p\n",
pressure.mData,
flags.mData,
rhs.mData,
vel.mData);
MakeRhs(flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp);
MakeRhs kernMakeRhs(
flags, rhs, vel, perCellCorr, fractions, obvel, phi, curv, surfTens, gfClamp);
// TODO (sebbas): Disabled for now
// if(enforceCompatibility)
// rhs += (Real)(-kernMakeRhs->sum / (Real)kernMakeRhs->cnt);
// delete kernMakeRhs;
if (enforceCompatibility)
rhs += (Real)(-kernMakeRhs.sum / (Real)kernMakeRhs.cnt);
}
static PyObject *_W_1(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
@@ -1043,13 +1050,6 @@ void solvePressureSystem(Grid<Real> &rhs,
MACGrid &vel,
Grid<Real> &pressure,
const FlagGrid &flags,
Grid<Real> *residual = nullptr,
Grid<Real> *search = nullptr,
Grid<Real> *A0 = nullptr,
Grid<Real> *Ai = nullptr,
Grid<Real> *Aj = nullptr,
Grid<Real> *Ak = nullptr,
Grid<Real> *tmp = nullptr,
Real cgAccuracy = 1e-3,
const Grid<Real> *phi = nullptr,
const Grid<Real> *perCellCorr = nullptr,
@@ -1069,37 +1069,19 @@ void solvePressureSystem(Grid<Real> &rhs,
// reserve temp grids
FluidSolver *parent = flags.getParent();
bool cleanUp = false;
if (!residual) {
residual = new Grid<Real>(parent, true, false, true);
search = new Grid<Real>(parent, true, false, true);
A0 = new Grid<Real>(parent, true, false, true);
Ai = new Grid<Real>(parent, true, false, true);
Aj = new Grid<Real>(parent, true, false, true);
Ak = new Grid<Real>(parent, true, false, true);
tmp = new Grid<Real>(parent, true, false, true);
cleanUp = true;
}
else {
residual->clear(true);
search->clear(true);
A0->clear(true);
Ai->clear(true);
Aj->clear(true);
Ak->clear(true);
tmp->clear(true);
}
std::cout << "HERE 5" << std::endl;
Grid<Real> residual(parent);
Grid<Real> search(parent);
Grid<Real> A0(parent);
Grid<Real> Ai(parent);
Grid<Real> Aj(parent);
Grid<Real> Ak(parent);
Grid<Real> tmp(parent);
// setup matrix and boundaries
MakeLaplaceMatrix(flags, *A0, *Ai, *Aj, *Ak, fractions);
// MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak, fractions);
MakeLaplaceMatrix(flags, A0, Ai, Aj, Ak, fractions);
// TODO (sebbas): Disabled for now
if (phi) {
ApplyGhostFluidDiagonal(*A0, flags, *phi, gfClamp);
ApplyGhostFluidDiagonal(A0, flags, *phi, gfClamp);
}
// check whether we need to fix some pressure value...
@@ -1143,8 +1125,7 @@ void solvePressureSystem(Grid<Real> &rhs,
// debMsg("No empty cells! Fixing pressure of cell "<<fixPidx<<" to zero",1);
}
if (fixPidx >= 0) {
fixPressure(fixPidx, Real(0), rhs, *A0, *Ai, *Aj, *Ak);
// fixPressure(fixPidx, Real(0), rhs, A0, Ai, Aj, Ak);
fixPressure(fixPidx, Real(0), rhs, A0, Ai, Aj, Ak);
static bool msgOnce = false;
if (!msgOnce) {
debMsg("Pinning pressure of cell " << fixPidx << " to zero", 2);
@@ -1152,21 +1133,20 @@ void solvePressureSystem(Grid<Real> &rhs,
}
}
}
std::cout << "HERE 6" << std::endl;
// CG setup
// note: the last factor increases the max iterations for 2d, which right now can't use a
// preconditioner
GridCgInterface *gcg;
if (vel.is3D())
gcg = new GridCg<ApplyMatrix>(pressure, rhs, *residual, *search, flags, *tmp, A0, Ai, Aj, Ak);
// gcg = new GridCg<ApplyMatrix> (pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj,
// &Ak);
else
gcg = new GridCg<ApplyMatrix2D>(
pressure, rhs, *residual, *search, flags, *tmp, A0, Ai, Aj, Ak);
// gcg = new GridCg<ApplyMatrix2D>(pressure, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj,
// &Ak);
vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
if (vel.is3D()) {
matA.push_back(&Ak);
gcg = new GridCg<ApplyMatrix>(pressure, rhs, residual, search, flags, tmp, matA);
}
else {
gcg = new GridCg<ApplyMatrix2D>(pressure, rhs, residual, search, flags, tmp, matA);
}
gcg->setAccuracy(cgAccuracy);
gcg->setUseL2Norm(useL2Norm);
@@ -1175,7 +1155,6 @@ void solvePressureSystem(Grid<Real> &rhs,
Grid<Real> *pca0 = nullptr, *pca1 = nullptr, *pca2 = nullptr, *pca3 = nullptr;
GridMg *pmg = nullptr;
std::cout << "HERE 7" << std::endl;
// optional preconditioning
if (preconditioner == PcMIC) {
@@ -1201,12 +1180,10 @@ void solvePressureSystem(Grid<Real> &rhs,
gcg->setMGPreconditioner(GridCgInterface::PC_MGP, pmg);
}
std::cout << "HERE 8" << std::endl;
// CG solve
Real time = 0;
for (int iter = 0; iter < maxIter; iter++) {
if (!gcg->iterate(time))
if (!gcg->iterate())
iter = maxIter;
if (iter < maxIter)
debMsg("FluidSolver::solvePressure iteration " << iter
@@ -1216,26 +1193,8 @@ void solvePressureSystem(Grid<Real> &rhs,
debMsg("FluidSolver::solvePressure done. Iterations:" << gcg->getIterations()
<< ", residual:" << gcg->getResNorm(),
2);
// std::cout << "TIME: " << time << std::endl;
// Cleanup
if (cleanUp) {
if (residual)
delete residual;
if (search)
delete search;
if (A0)
delete A0;
if (Ai)
delete Ai;
if (Aj)
delete Aj;
if (Ak)
delete Ak;
if (tmp)
delete tmp;
}
if (gcg)
delete gcg;
if (pca0)
@@ -1266,40 +1225,26 @@ static PyObject *_W_2(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
MACGrid &vel = *_args.getPtr<MACGrid>("vel", 1, &_lock);
Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 2, &_lock);
const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 3, &_lock);
Grid<Real> *residual = _args.getPtrOpt<Grid<Real>>("residual", 4, nullptr, &_lock);
Grid<Real> *search = _args.getPtrOpt<Grid<Real>>("search", 5, nullptr, &_lock);
Grid<Real> *A0 = _args.getPtrOpt<Grid<Real>>("A0", 6, nullptr, &_lock);
Grid<Real> *Ai = _args.getPtrOpt<Grid<Real>>("Ai", 7, nullptr, &_lock);
Grid<Real> *Aj = _args.getPtrOpt<Grid<Real>>("Aj", 8, nullptr, &_lock);
Grid<Real> *Ak = _args.getPtrOpt<Grid<Real>>("Ak", 9, nullptr, &_lock);
Grid<Real> *tmp = _args.getPtrOpt<Grid<Real>>("tmp", 10, nullptr, &_lock);
Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 11, 1e-3, &_lock);
const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 12, nullptr, &_lock);
Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 4, 1e-3, &_lock);
const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 5, nullptr, &_lock);
const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>(
"perCellCorr", 13, nullptr, &_lock);
const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 14, nullptr, &_lock);
Real gfClamp = _args.getOpt<Real>("gfClamp", 15, 1e-04, &_lock);
Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 16, 1.5, &_lock);
bool precondition = _args.getOpt<bool>("precondition", 17, true, &_lock);
int preconditioner = _args.getOpt<int>("preconditioner", 18, PcMIC, &_lock);
"perCellCorr", 6, nullptr, &_lock);
const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 7, nullptr, &_lock);
Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
const bool enforceCompatibility = _args.getOpt<bool>(
"enforceCompatibility", 19, false, &_lock);
const bool useL2Norm = _args.getOpt<bool>("useL2Norm", 20, false, &_lock);
const bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 21, false, &_lock);
const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 22, nullptr, &_lock);
const Real surfTens = _args.getOpt<Real>("surfTens", 23, 0., &_lock);
"enforceCompatibility", 12, false, &_lock);
const bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
const bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, nullptr, &_lock);
const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
_retval = getPyNone();
solvePressureSystem(rhs,
vel,
pressure,
flags,
residual,
search,
A0,
Ai,
Aj,
Ak,
tmp,
cgAccuracy,
phi,
perCellCorr,
@@ -1425,13 +1370,6 @@ void PbRegister_correctVelocity()
void solvePressure(MACGrid &vel,
Grid<Real> &pressure,
const FlagGrid &flags,
Grid<Real> *residual = nullptr,
Grid<Real> *search = nullptr,
Grid<Real> *A0 = nullptr,
Grid<Real> *Ai = nullptr,
Grid<Real> *Aj = nullptr,
Grid<Real> *Ak = nullptr,
Grid<Real> *tmp = nullptr,
Real cgAccuracy = 1e-3,
const Grid<Real> *phi = nullptr,
const Grid<Real> *perCellCorr = nullptr,
@@ -1473,13 +1411,6 @@ void solvePressure(MACGrid &vel,
vel,
pressure,
flags,
residual,
search,
A0,
Ai,
Aj,
Aj,
tmp,
cgAccuracy,
phi,
perCellCorr,
@@ -1511,11 +1442,10 @@ void solvePressure(MACGrid &vel,
curv,
surfTens);
// TODO (sebbas): Disabled for now
// optionally , return RHS
// if(retRhs) {
// retRhs->copyFrom(rhs);
// }
if (retRhs) {
retRhs->copyFrom(rhs);
}
}
static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
{
@@ -1530,40 +1460,26 @@ static PyObject *_W_4(PyObject *_self, PyObject *_linargs, PyObject *_kwds)
MACGrid &vel = *_args.getPtr<MACGrid>("vel", 0, &_lock);
Grid<Real> &pressure = *_args.getPtr<Grid<Real>>("pressure", 1, &_lock);
const FlagGrid &flags = *_args.getPtr<FlagGrid>("flags", 2, &_lock);
Grid<Real> *residual = _args.getPtrOpt<Grid<Real>>("residual", 3, nullptr, &_lock);
Grid<Real> *search = _args.getPtrOpt<Grid<Real>>("search", 4, nullptr, &_lock);
Grid<Real> *A0 = _args.getPtrOpt<Grid<Real>>("A0", 5, nullptr, &_lock);
Grid<Real> *Ai = _args.getPtrOpt<Grid<Real>>("Ai", 6, nullptr, &_lock);
Grid<Real> *Aj = _args.getPtrOpt<Grid<Real>>("Aj", 7, nullptr, &_lock);
Grid<Real> *Ak = _args.getPtrOpt<Grid<Real>>("Ak", 8, nullptr, &_lock);
Grid<Real> *tmp = _args.getPtrOpt<Grid<Real>>("tmp", 9, nullptr, &_lock);
Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 10, 1e-3, &_lock);
const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 11, nullptr, &_lock);
Real cgAccuracy = _args.getOpt<Real>("cgAccuracy", 3, 1e-3, &_lock);
const Grid<Real> *phi = _args.getPtrOpt<Grid<Real>>("phi", 4, nullptr, &_lock);
const Grid<Real> *perCellCorr = _args.getPtrOpt<Grid<Real>>(
"perCellCorr", 12, nullptr, &_lock);
const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 13, nullptr, &_lock);
const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 14, nullptr, &_lock);
Real gfClamp = _args.getOpt<Real>("gfClamp", 15, 1e-04, &_lock);
Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 16, 1.5, &_lock);
bool precondition = _args.getOpt<bool>("precondition", 17, true, &_lock);
int preconditioner = _args.getOpt<int>("preconditioner", 18, PcMIC, &_lock);
bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 19, false, &_lock);
bool useL2Norm = _args.getOpt<bool>("useL2Norm", 20, false, &_lock);
bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 21, false, &_lock);
const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 22, nullptr, &_lock);
const Real surfTens = _args.getOpt<Real>("surfTens", 23, 0., &_lock);
Grid<Real> *retRhs = _args.getPtrOpt<Grid<Real>>("retRhs", 24, nullptr, &_lock);
"perCellCorr", 5, nullptr, &_lock);
const MACGrid *fractions = _args.getPtrOpt<MACGrid>("fractions", 6, nullptr, &_lock);
const MACGrid *obvel = _args.getPtrOpt<MACGrid>("obvel", 7, nullptr, &_lock);
Real gfClamp = _args.getOpt<Real>("gfClamp", 8, 1e-04, &_lock);
Real cgMaxIterFac = _args.getOpt<Real>("cgMaxIterFac", 9, 1.5, &_lock);
bool precondition = _args.getOpt<bool>("precondition", 10, true, &_lock);
int preconditioner = _args.getOpt<int>("preconditioner", 11, PcMIC, &_lock);
bool enforceCompatibility = _args.getOpt<bool>("enforceCompatibility", 12, false, &_lock);
bool useL2Norm = _args.getOpt<bool>("useL2Norm", 13, false, &_lock);
bool zeroPressureFixing = _args.getOpt<bool>("zeroPressureFixing", 14, false, &_lock);
const Grid<Real> *curv = _args.getPtrOpt<Grid<Real>>("curv", 15, nullptr, &_lock);
const Real surfTens = _args.getOpt<Real>("surfTens", 16, 0., &_lock);
Grid<Real> *retRhs = _args.getPtrOpt<Grid<Real>>("retRhs", 17, nullptr, &_lock);
_retval = getPyNone();
solvePressure(vel,
pressure,
flags,
residual,
search,
A0,
Ai,
Aj,
Ak,
tmp,
cgAccuracy,
phi,
perCellCorr,

View File

@@ -34,7 +34,7 @@ struct KnAddForcePvel : public KernelBase {
ParticleDataImpl<Vec3> &v,
const Vec3 &da,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (ptype && ((*ptype)[idx] & exclude))
return;
@@ -60,17 +60,21 @@ struct KnAddForcePvel : public KernelBase {
return exclude;
}
typedef int type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnAddForcePvel ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, v, da, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, v, da, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<Vec3> &v;
const Vec3 &da;
@@ -146,7 +150,7 @@ struct KnUpdateVelocityFromDeltaPos : public KernelBase {
const ParticleDataImpl<Vec3> &x_prev,
const Real over_dt,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (ptype && ((*ptype)[idx] & exclude))
return;
@@ -182,17 +186,21 @@ struct KnUpdateVelocityFromDeltaPos : public KernelBase {
return exclude;
}
typedef int type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnUpdateVelocityFromDeltaPos ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, v, x_prev, over_dt, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, v, x_prev, over_dt, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystem &p;
ParticleDataImpl<Vec3> &v;
@@ -265,7 +273,7 @@ struct KnStepEuler : public KernelBase {
const ParticleDataImpl<Vec3> &v,
const Real dt,
const ParticleDataImpl<int> *ptype,
const int exclude)
const int exclude) const
{
if (ptype && ((*ptype)[idx] & exclude))
return;
@@ -296,17 +304,21 @@ struct KnStepEuler : public KernelBase {
return exclude;
}
typedef int type4;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnStepEuler ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, v, dt, ptype, exclude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, v, dt, ptype, exclude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystem &p;
const ParticleDataImpl<Vec3> &v;
@@ -381,7 +393,7 @@ struct KnSetPartType : public KernelBase {
const int mark,
const int stype,
const FlagGrid &flags,
const int cflag)
const int cflag) const
{
if (flags.isInBounds(part.getPos(idx), 0) && (flags.getAt(part.getPos(idx)) & cflag) &&
(ptype[idx] & stype))
@@ -417,17 +429,21 @@ struct KnSetPartType : public KernelBase {
return cflag;
}
typedef int type5;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnSetPartType ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, ptype, part, mark, stype, flags, cflag);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, ptype, part, mark, stype, flags, cflag);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
ParticleDataImpl<int> &ptype;
const BasicParticleSystem &part;

View File

@@ -99,7 +99,7 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
const Real scaleFromManta,
const int itype = FlagGrid::TypeFluid,
const int jtype = FlagGrid::TypeObstacle | FlagGrid::TypeOutflow |
FlagGrid::TypeInflow)
FlagGrid::TypeInflow) const
{
if (!(flags(i, j, k) & itype))
@@ -253,48 +253,19 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
return jtype;
}
typedef int type16;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knFlipComputeSecondaryParticlePotentials ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = radius; j < _maxY; j++)
for (int i = radius; i < _maxX; i++)
op(i,
j,
k,
potTA,
potWC,
potKE,
neighborRatio,
flags,
v,
normal,
radius,
tauMinTA,
tauMaxTA,
tauMinWC,
tauMaxWC,
tauMinKE,
tauMaxKE,
scaleFromManta,
itype,
jtype);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = radius; j < _maxY; j++)
for (int i = radius; i < _maxX; i++)
op(i,
@@ -317,8 +288,39 @@ struct knFlipComputeSecondaryParticlePotentials : public KernelBase {
scaleFromManta,
itype,
jtype);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = radius; i < _maxX; i++)
op(i,
j,
k,
potTA,
potWC,
potKE,
neighborRatio,
flags,
v,
normal,
radius,
tauMinTA,
tauMaxTA,
tauMinWC,
tauMaxWC,
tauMinKE,
tauMaxKE,
scaleFromManta,
itype,
jtype);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(radius, maxY), *this);
}
Grid<Real> &potTA;
Grid<Real> &potWC;
@@ -668,7 +670,13 @@ struct knFlipSampleSecondaryParticlesMoreCylinders : public KernelBase {
return rand;
}
typedef RandomStream type17;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipSampleSecondaryParticlesMoreCylinders ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void run()
{
const int _maxX = maxX;
@@ -922,7 +930,13 @@ struct knFlipSampleSecondaryParticles : public KernelBase {
return rand;
}
typedef RandomStream type17;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipSampleSecondaryParticles ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void run()
{
const int _maxX = maxX;
@@ -1178,7 +1192,7 @@ struct knFlipUpdateSecondaryParticlesLinear : public KernelBase {
const Real c_b,
const Real dt,
const int exclude,
const int antitunneling)
const int antitunneling) const
{
if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
@@ -1328,32 +1342,36 @@ struct knFlipUpdateSecondaryParticlesLinear : public KernelBase {
return antitunneling;
}
typedef int type14;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipUpdateSecondaryParticlesLinear ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx,
pts_sec,
v_sec,
l_sec,
f_sec,
flags,
v,
neighborRatio,
gravity,
k_b,
k_d,
c_s,
c_b,
dt,
exclude,
antitunneling);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i,
pts_sec,
v_sec,
l_sec,
f_sec,
flags,
v,
neighborRatio,
gravity,
k_b,
k_d,
c_s,
c_b,
dt,
exclude,
antitunneling);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystem &pts_sec;
ParticleDataImpl<Vec3> &v_sec;
@@ -1431,7 +1449,7 @@ struct knFlipUpdateSecondaryParticlesCubic : public KernelBase {
const Real dt,
const int exclude,
const int antitunneling,
const int itype)
const int itype) const
{
if (!pts_sec.isActive(idx) || pts_sec[idx].flag & exclude)
@@ -1637,34 +1655,38 @@ struct knFlipUpdateSecondaryParticlesCubic : public KernelBase {
return itype;
}
typedef int type16;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipUpdateSecondaryParticlesCubic ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx,
pts_sec,
v_sec,
l_sec,
f_sec,
flags,
v,
neighborRatio,
radius,
gravity,
k_b,
k_d,
c_s,
c_b,
dt,
exclude,
antitunneling,
itype);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i,
pts_sec,
v_sec,
l_sec,
f_sec,
flags,
v,
neighborRatio,
radius,
gravity,
k_b,
k_d,
c_s,
c_b,
dt,
exclude,
antitunneling,
itype);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystem &pts_sec;
ParticleDataImpl<Vec3> &v_sec;
@@ -1834,7 +1856,7 @@ struct knFlipDeleteParticlesInObstacle : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, BasicParticleSystem &pts, const FlagGrid &flags)
inline void op(IndexInt idx, BasicParticleSystem &pts, const FlagGrid &flags) const
{
if (!pts.isActive(idx))
@@ -1863,17 +1885,21 @@ struct knFlipDeleteParticlesInObstacle : public KernelBase {
return flags;
}
typedef FlagGrid type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipDeleteParticlesInObstacle ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, pts, flags);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, pts, flags);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystem &pts;
const FlagGrid &flags;
@@ -2005,7 +2031,7 @@ struct knSetFlagsFromLevelset : public KernelBase {
FlagGrid &flags,
const Grid<Real> &phi,
const int exclude = FlagGrid::TypeObstacle,
const int itype = FlagGrid::TypeFluid)
const int itype = FlagGrid::TypeFluid) const
{
if (phi(idx) < 0 && !(flags(idx) & exclude))
flags(idx) = itype;
@@ -2030,17 +2056,21 @@ struct knSetFlagsFromLevelset : public KernelBase {
return itype;
}
typedef int type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knSetFlagsFromLevelset ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, flags, phi, exclude, itype);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, flags, phi, exclude, itype);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
FlagGrid &flags;
const Grid<Real> &phi;
@@ -2096,7 +2126,7 @@ struct knSetMACFromLevelset : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, MACGrid &v, const Grid<Real> &phi, const Vec3 c)
inline void op(int i, int j, int k, MACGrid &v, const Grid<Real> &phi, const Vec3 c) const
{
if (phi.getInterpolated(Vec3(i, j, k)) > 0)
v(i, j, k) = c;
@@ -2116,34 +2146,36 @@ struct knSetMACFromLevelset : public KernelBase {
return c;
}
typedef Vec3 type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knSetMACFromLevelset ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, v, phi, c);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, v, phi, c);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, v, phi, c);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid &v;
const Grid<Real> &phi;
@@ -2236,7 +2268,7 @@ struct knFlipComputePotentialTrappedAir : public KernelBase {
const Real tauMax,
const Real scaleFromManta,
const int itype = FlagGrid::TypeFluid,
const int jtype = FlagGrid::TypeFluid)
const int jtype = FlagGrid::TypeFluid) const
{
if (!(flags(i, j, k) & itype))
@@ -2310,34 +2342,36 @@ struct knFlipComputePotentialTrappedAir : public KernelBase {
return jtype;
}
typedef int type8;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knFlipComputePotentialTrappedAir ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, pot, flags, v, radius, tauMin, tauMax, scaleFromManta, itype, jtype);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<Real> &pot;
const FlagGrid &flags;
@@ -2438,7 +2472,7 @@ struct knFlipComputePotentialKineticEnergy : public KernelBase {
const Real tauMin,
const Real tauMax,
const Real scaleFromManta,
const int itype = FlagGrid::TypeFluid)
const int itype = FlagGrid::TypeFluid) const
{
if (!(flags(i, j, k) & itype))
@@ -2486,34 +2520,36 @@ struct knFlipComputePotentialKineticEnergy : public KernelBase {
return itype;
}
typedef int type6;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knFlipComputePotentialKineticEnergy ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, pot, flags, v, tauMin, tauMax, scaleFromManta, itype);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<Real> &pot;
const FlagGrid &flags;
@@ -2614,7 +2650,7 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
const Real tauMax,
const Real scaleFromManta,
const int itype = FlagGrid::TypeFluid,
const int jtype = FlagGrid::TypeFluid)
const int jtype = FlagGrid::TypeFluid) const
{
if (!(flags(i, j, k) & itype))
@@ -2700,41 +2736,19 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
return jtype;
}
typedef int type9;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knFlipComputePotentialWaveCrest ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
j,
k,
pot,
flags,
v,
radius,
normal,
tauMin,
tauMax,
scaleFromManta,
itype,
jtype);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i,
@@ -2750,8 +2764,20 @@ struct knFlipComputePotentialWaveCrest : public KernelBase {
scaleFromManta,
itype,
jtype);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, pot, flags, v, radius, normal, tauMin, tauMax, scaleFromManta, itype, jtype);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
Grid<Real> &pot;
const FlagGrid &flags;
@@ -2834,7 +2860,7 @@ struct knFlipComputeSurfaceNormals : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, Grid<Vec3> &normal, const Grid<Real> &phi)
inline void op(IndexInt idx, Grid<Vec3> &normal, const Grid<Real> &phi) const
{
normal[idx] = getNormalized(normal[idx]);
}
@@ -2848,17 +2874,21 @@ struct knFlipComputeSurfaceNormals : public KernelBase {
return phi;
}
typedef Grid<Real> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel knFlipComputeSurfaceNormals ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, normal, phi);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, normal, phi);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
Grid<Vec3> &normal;
const Grid<Real> &phi;
@@ -2928,7 +2958,7 @@ struct knFlipUpdateNeighborRatio : public KernelBase {
Grid<Real> &neighborRatio,
const int radius,
const int itype = FlagGrid::TypeFluid,
const int jtype = FlagGrid::TypeObstacle)
const int jtype = FlagGrid::TypeObstacle) const
{
if (!(flags(i, j, k) & itype))
@@ -2978,34 +3008,36 @@ struct knFlipUpdateNeighborRatio : public KernelBase {
return jtype;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knFlipUpdateNeighborRatio ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, neighborRatio, radius, itype, jtype);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, neighborRatio, radius, itype, jtype);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, neighborRatio, radius, itype, jtype);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &neighborRatio;

View File

@@ -569,7 +569,7 @@ struct advectSurfacePoints : public KernelBase {
inline void op(IndexInt idx,
BasicParticleSystemWrapper &surfacePoints,
const BasicParticleSystemWrapper &coarseParticles,
const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos)
const ParticleDataImplVec3Wrapper &coarseParticlesPrevPos) const
{
if (surfacePoints.isActive(idx)) {
Vec3 avgDisplacement(0, 0, 0);
@@ -606,17 +606,21 @@ struct advectSurfacePoints : public KernelBase {
return coarseParticlesPrevPos;
}
typedef ParticleDataImplVec3Wrapper type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel advectSurfacePoints ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, coarseParticles, coarseParticlesPrevPos);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, coarseParticles, coarseParticlesPrevPos);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystemWrapper &surfacePoints;
const BasicParticleSystemWrapper &coarseParticles;
@@ -669,7 +673,7 @@ struct computeSurfaceNormals : public KernelBase {
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const BasicParticleSystemWrapper &coarseParticles,
ParticleDataImpl<Vec3> &surfaceNormals)
ParticleDataImpl<Vec3> &surfaceNormals) const
{
Vec3 pos = surfacePoints.getPos(idx);
@@ -739,17 +743,21 @@ struct computeSurfaceNormals : public KernelBase {
return surfaceNormals;
}
typedef ParticleDataImpl<Vec3> type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceNormals ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, coarseParticles, surfaceNormals);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, coarseParticles, surfaceNormals);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const BasicParticleSystemWrapper &coarseParticles;
@@ -772,7 +780,7 @@ struct computeAveragedNormals : public KernelBase {
}
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const ParticleDataImpl<Vec3> &surfaceNormals)
const ParticleDataImpl<Vec3> &surfaceNormals) const
{
Vec3 pos = surfacePoints.getPos(idx);
Vec3 newNormal = Vec3(0, 0, 0);
@@ -792,17 +800,21 @@ struct computeAveragedNormals : public KernelBase {
return surfaceNormals;
}
typedef ParticleDataImpl<Vec3> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeAveragedNormals ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -820,7 +832,7 @@ struct assignNormals : public KernelBase {
}
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
ParticleDataImpl<Vec3> &surfaceNormals)
ParticleDataImpl<Vec3> &surfaceNormals) const
{
surfaceNormals[idx] = tempSurfaceVec3[idx];
}
@@ -834,17 +846,21 @@ struct assignNormals : public KernelBase {
return surfaceNormals;
}
typedef ParticleDataImpl<Vec3> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel assignNormals ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Vec3> &surfaceNormals;
@@ -947,7 +963,7 @@ struct computeSurfaceDensities : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, const BasicParticleSystemWrapper &surfacePoints, void *dummy)
inline void op(IndexInt idx, const BasicParticleSystemWrapper &surfacePoints, void *dummy) const
{
Vec3 pos = surfacePoints.getPos(idx);
Real density = 0;
@@ -968,17 +984,21 @@ struct computeSurfaceDensities : public KernelBase {
return dummy;
}
typedef void type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceDensities ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, dummy);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, dummy);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
void *dummy;
@@ -996,7 +1016,7 @@ struct computeSurfaceDisplacements : public KernelBase {
}
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const ParticleDataImpl<Vec3> &surfaceNormals)
const ParticleDataImpl<Vec3> &surfaceNormals) const
{
Vec3 pos = surfacePoints.getPos(idx);
Vec3 normal = surfaceNormals[idx];
@@ -1048,17 +1068,21 @@ struct computeSurfaceDisplacements : public KernelBase {
return surfaceNormals;
}
typedef ParticleDataImpl<Vec3> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceDisplacements ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1071,7 +1095,7 @@ struct applySurfaceDisplacements : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, BasicParticleSystemWrapper &surfacePoints, void *dummy)
inline void op(IndexInt idx, BasicParticleSystemWrapper &surfacePoints, void *dummy) const
{
surfacePoints.setPos(idx, surfacePoints.getPos(idx) + tempSurfaceVec3[idx]);
}
@@ -1085,17 +1109,21 @@ struct applySurfaceDisplacements : public KernelBase {
return dummy;
}
typedef void type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel applySurfaceDisplacements ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, dummy);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, dummy);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystemWrapper &surfacePoints;
void *dummy;
@@ -1124,7 +1152,7 @@ struct constrainSurface : public KernelBase {
}
inline void op(IndexInt idx,
BasicParticleSystemWrapper &surfacePoints,
const BasicParticleSystemWrapper &coarseParticles)
const BasicParticleSystemWrapper &coarseParticles) const
{
Vec3 pos = surfacePoints.getPos(idx);
Real level = computeConstraintLevel(coarseParticles, surfacePoints.getPos(idx));
@@ -1151,17 +1179,21 @@ struct constrainSurface : public KernelBase {
return coarseParticles;
}
typedef BasicParticleSystemWrapper type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel constrainSurface ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, coarseParticles);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, coarseParticles);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
BasicParticleSystemWrapper &surfacePoints;
const BasicParticleSystemWrapper &coarseParticles;
@@ -1188,7 +1220,7 @@ struct interpolateNewWaveData : public KernelBase {
ParticleDataImpl<Real> &surfaceWaveH,
ParticleDataImpl<Real> &surfaceWaveDtH,
ParticleDataImpl<Real> &surfaceWaveSeed,
ParticleDataImpl<Real> &surfaceWaveSeedAmplitude)
ParticleDataImpl<Real> &surfaceWaveSeedAmplitude) const
{
if (surfacePoints.getStatus(idx) & ParticleBase::PNEW) {
Vec3 pos = surfacePoints.getPos(idx);
@@ -1238,22 +1270,26 @@ struct interpolateNewWaveData : public KernelBase {
return surfaceWaveSeedAmplitude;
}
typedef ParticleDataImpl<Real> type4;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel interpolateNewWaveData ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx,
surfacePoints,
surfaceWaveH,
surfaceWaveDtH,
surfaceWaveSeed,
surfaceWaveSeedAmplitude);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i,
surfacePoints,
surfaceWaveH,
surfaceWaveDtH,
surfaceWaveSeed,
surfaceWaveSeedAmplitude);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Real> &surfaceWaveH;
@@ -1309,7 +1345,7 @@ struct addSeed : public KernelBase {
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
ParticleDataImpl<Real> &surfaceWaveH,
const ParticleDataImpl<Real> &surfaceWaveSeed)
const ParticleDataImpl<Real> &surfaceWaveSeed) const
{
surfaceWaveH[idx] += surfaceWaveSeed[idx];
}
@@ -1328,17 +1364,21 @@ struct addSeed : public KernelBase {
return surfaceWaveSeed;
}
typedef ParticleDataImpl<Real> type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel addSeed ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceWaveH, surfaceWaveSeed);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceWaveH, surfaceWaveSeed);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Real> &surfaceWaveH;
@@ -1360,7 +1400,7 @@ struct computeSurfaceWaveNormal : public KernelBase {
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const ParticleDataImpl<Vec3> &surfaceNormals,
const ParticleDataImpl<Real> &surfaceWaveH)
const ParticleDataImpl<Real> &surfaceWaveH) const
{
Vec3 pos = surfacePoints.getPos(idx);
@@ -1424,17 +1464,21 @@ struct computeSurfaceWaveNormal : public KernelBase {
return surfaceWaveH;
}
typedef ParticleDataImpl<Real> type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceWaveNormal ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals, surfaceWaveH);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1456,7 +1500,7 @@ struct computeSurfaceWaveLaplacians : public KernelBase {
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const ParticleDataImpl<Vec3> &surfaceNormals,
const ParticleDataImpl<Real> &surfaceWaveH)
const ParticleDataImpl<Real> &surfaceWaveH) const
{
Real laplacian = 0;
Real wTotal = 0;
@@ -1517,17 +1561,21 @@ struct computeSurfaceWaveLaplacians : public KernelBase {
return surfaceWaveH;
}
typedef ParticleDataImpl<Real> type2;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceWaveLaplacians ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals, surfaceWaveH);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals, surfaceWaveH);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1552,7 +1600,7 @@ struct evolveWave : public KernelBase {
const BasicParticleSystemWrapper &surfacePoints,
ParticleDataImpl<Real> &surfaceWaveH,
ParticleDataImpl<Real> &surfaceWaveDtH,
const ParticleDataImpl<Real> &surfaceWaveSeed)
const ParticleDataImpl<Real> &surfaceWaveSeed) const
{
surfaceWaveDtH[idx] += params.waveSpeed * params.waveSpeed * params.dt * tempSurfaceFloat[idx];
surfaceWaveDtH[idx] /= (1 + params.dt * params.waveDamping);
@@ -1587,17 +1635,21 @@ struct evolveWave : public KernelBase {
return surfaceWaveSeed;
}
typedef ParticleDataImpl<Real> type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel evolveWave ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceWaveH, surfaceWaveDtH, surfaceWaveSeed);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Real> &surfaceWaveH;
@@ -1617,7 +1669,7 @@ struct computeSurfaceCurvature : public KernelBase {
}
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
const ParticleDataImpl<Vec3> &surfaceNormals)
const ParticleDataImpl<Vec3> &surfaceNormals) const
{
Vec3 pPos = surfacePoints.getPos(idx);
Real wTotal = 0;
@@ -1658,17 +1710,21 @@ struct computeSurfaceCurvature : public KernelBase {
return surfaceNormals;
}
typedef ParticleDataImpl<Vec3> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel computeSurfaceCurvature ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceNormals);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceNormals);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
const ParticleDataImpl<Vec3> &surfaceNormals;
@@ -1686,7 +1742,7 @@ struct smoothCurvature : public KernelBase {
}
inline void op(IndexInt idx,
const BasicParticleSystemWrapper &surfacePoints,
ParticleDataImpl<Real> &surfaceWaveSource)
ParticleDataImpl<Real> &surfaceWaveSource) const
{
Vec3 pPos = surfacePoints.getPos(idx);
Real curv = 0;
@@ -1712,17 +1768,21 @@ struct smoothCurvature : public KernelBase {
return surfaceWaveSource;
}
typedef ParticleDataImpl<Real> type1;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel smoothCurvature ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceWaveSource);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceWaveSource);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Real> &surfaceWaveSource;
@@ -1746,7 +1806,7 @@ struct seedWaves : public KernelBase {
const BasicParticleSystemWrapper &surfacePoints,
ParticleDataImpl<Real> &surfaceWaveSeed,
ParticleDataImpl<Real> &surfaceWaveSeedAmplitude,
ParticleDataImpl<Real> &surfaceWaveSource)
ParticleDataImpl<Real> &surfaceWaveSource) const
{
Real source = smoothstep(params.waveSeedingCurvatureThresholdRegionCenter -
params.waveSeedingCurvatureThresholdRegionRadius,
@@ -1790,17 +1850,21 @@ struct seedWaves : public KernelBase {
return surfaceWaveSource;
}
typedef ParticleDataImpl<Real> type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel seedWaves ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, surfacePoints, surfaceWaveSeed, surfaceWaveSeedAmplitude, surfaceWaveSource);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
const BasicParticleSystemWrapper &surfacePoints;
ParticleDataImpl<Real> &surfaceWaveSeed;

File diff suppressed because it is too large Load Diff

View File

@@ -188,7 +188,8 @@ struct KnAcceleration : public KernelBase {
runMessage();
run();
}
inline void op(IndexInt idx, MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt)
inline void op(
IndexInt idx, MACGrid &a, const MACGrid &v1, const MACGrid &v0, const Real idt) const
{
a[idx] = (v1[idx] - v0[idx]) * idt;
}
@@ -212,17 +213,21 @@ struct KnAcceleration : public KernelBase {
return idt;
}
typedef Real type3;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnAcceleration ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, a, v1, v0, idt);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, a, v1, v0, idt);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
MACGrid &a;
const MACGrid &v1;
@@ -571,17 +576,18 @@ void VICintegration(VortexSheetMesh &mesh,
// prepare CG solver
const int maxIter = (int)(cgMaxIterFac * vel.getSize().max());
vector<Grid<Real> *> matA{&A0, &Ai, &Aj, &Ak};
GridCgInterface *gcg = new GridCg<ApplyMatrix>(
solution, rhs, residual, search, flags, temp1, &A0, &Ai, &Aj, &Ak);
solution, rhs, residual, search, flags, temp1, matA);
gcg->setAccuracy(cgAccuracy);
gcg->setUseL2Norm(true);
gcg->setICPreconditioner(
(GridCgInterface::PreconditionType)precondition, &pca0, &pca1, &pca2, &pca3);
// iterations
Real time = 0;
for (int iter = 0; iter < maxIter; iter++) {
if (!gcg->iterate(time))
if (!gcg->iterate())
iter = maxIter;
}
debMsg("VICintegration CG iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(),

View File

@@ -170,7 +170,7 @@ struct KnInterpolateMACGrid : public KernelBase {
const MACGrid &source,
const Vec3 &sourceFactor,
const Vec3 &off,
int orderSpace)
int orderSpace) const
{
Vec3 pos = Vec3(i, j, k) * sourceFactor + off;
@@ -207,34 +207,36 @@ struct KnInterpolateMACGrid : public KernelBase {
return orderSpace;
}
typedef int type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnInterpolateMACGrid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, off, orderSpace);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, off, orderSpace);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, target, source, sourceFactor, off, orderSpace);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid &target;
const MACGrid &source;
@@ -317,7 +319,7 @@ struct knApplySimpleNoiseVec3 : public KernelBase {
Grid<Vec3> &target,
const WaveletNoiseField &noise,
Real scale,
const Grid<Real> *weight)
const Grid<Real> *weight) const
{
if (!flags.isFluid(i, j, k))
return;
@@ -351,34 +353,36 @@ struct knApplySimpleNoiseVec3 : public KernelBase {
return weight;
}
typedef Grid<Real> type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knApplySimpleNoiseVec3 ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Vec3> &target;
@@ -457,7 +461,7 @@ struct knApplySimpleNoiseReal : public KernelBase {
Grid<Real> &target,
const WaveletNoiseField &noise,
Real scale,
const Grid<Real> *weight)
const Grid<Real> *weight) const
{
if (!flags.isFluid(i, j, k))
return;
@@ -491,34 +495,36 @@ struct knApplySimpleNoiseReal : public KernelBase {
return weight;
}
typedef Grid<Real> type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knApplySimpleNoiseReal ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, target, noise, scale, weight);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &target;
@@ -609,7 +615,7 @@ struct knApplyNoiseVec3 : public KernelBase {
const Grid<Real> *weight,
const Grid<Vec3> *uv,
bool uvInterpol,
const Vec3 &sourceFactor)
const Vec3 &sourceFactor) const
{
if (!flags.isFluid(i, j, k))
return;
@@ -688,40 +694,19 @@ struct knApplyNoiseVec3 : public KernelBase {
return sourceFactor;
}
typedef Vec3 type8;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knApplyNoiseVec3 ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i,
j,
k,
flags,
target,
noise,
scale,
scaleSpatial,
weight,
uv,
uvInterpol,
sourceFactor);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i,
@@ -736,8 +721,31 @@ struct knApplyNoiseVec3 : public KernelBase {
uv,
uvInterpol,
sourceFactor);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i,
j,
k,
flags,
target,
noise,
scale,
scaleSpatial,
weight,
uv,
uvInterpol,
sourceFactor);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
Grid<Vec3> &target;
@@ -826,7 +834,7 @@ struct KnApplyComputeEnergy : public KernelBase {
run();
}
inline void op(
int i, int j, int k, const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy)
int i, int j, int k, const FlagGrid &flags, const MACGrid &vel, Grid<Real> &energy) const
{
Real e = 0.f;
if (flags.isFluid(i, j, k)) {
@@ -850,34 +858,36 @@ struct KnApplyComputeEnergy : public KernelBase {
return energy;
}
typedef Grid<Real> type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnApplyComputeEnergy ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, energy);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, energy);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, flags, vel, energy);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const FlagGrid &flags;
const MACGrid &vel;
@@ -1009,7 +1019,7 @@ struct KnComputeStrainRateMag : public KernelBase {
run();
}
inline void op(
int i, int j, int k, const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod)
int i, int j, int k, const MACGrid &vel, const Grid<Vec3> &velCenter, Grid<Real> &prod) const
{
// compute Sij = 1/2 * (dU_i/dx_j + dU_j/dx_i)
Vec3 diag = Vec3(vel(i + 1, j, k).x, vel(i, j + 1, k).y, 0.) - vel(i, j, k);
@@ -1046,34 +1056,36 @@ struct KnComputeStrainRateMag : public KernelBase {
return prod;
}
typedef Grid<Real> type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel KnComputeStrainRateMag ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, prod);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, prod);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, vel, velCenter, prod);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const MACGrid &vel;
const Grid<Vec3> &velCenter;

View File

@@ -38,7 +38,7 @@ struct knCalcSecDeriv2d : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, const Grid<Real> &v, Grid<Real> &ret)
inline void op(int i, int j, int k, const Grid<Real> &v, Grid<Real> &ret) const
{
ret(i, j, k) = (-4. * v(i, j, k) + v(i - 1, j, k) + v(i + 1, j, k) + v(i, j - 1, k) +
v(i, j + 1, k));
@@ -53,34 +53,36 @@ struct knCalcSecDeriv2d : public KernelBase {
return ret;
}
typedef Grid<Real> type1;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knCalcSecDeriv2d ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, v, ret);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, v, ret);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, v, ret);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const Grid<Real> &v;
Grid<Real> &ret;
@@ -149,42 +151,43 @@ struct knTotalSum : public KernelBase {
return h;
}
typedef Grid<Real> type0;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel knTotalSum ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
double sum = 0;
#pragma omp for nowait
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, h, sum);
#pragma omp critical
{
this->sum += sum;
}
}
}
else {
const int k = 0;
#pragma omp parallel
{
double sum = 0;
#pragma omp for nowait
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, h, sum);
#pragma omp critical
{
this->sum += sum;
}
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, h, sum);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
knTotalSum(knTotalSum &o, tbb::split) : KernelBase(o), h(o.h), sum(0)
{
}
void join(const knTotalSum &o)
{
sum += o.sum;
}
Grid<Real> &h;
double sum;
@@ -293,7 +296,7 @@ struct MakeRhsWE : public KernelBase {
const Grid<Real> &ut,
const Grid<Real> &utm1,
Real s,
bool crankNic = false)
bool crankNic = false) const
{
rhs(i, j, k) = (2. * ut(i, j, k) - utm1(i, j, k));
if (crankNic) {
@@ -331,34 +334,36 @@ struct MakeRhsWE : public KernelBase {
return crankNic;
}
typedef bool type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel MakeRhsWE ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 1; j < _maxY; j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 1; i < _maxX; i++)
op(i, j, k, flags, rhs, ut, utm1, s, crankNic);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(1, maxY), *this);
}
const FlagGrid &flags;
Grid<Real> &rhs;
@@ -418,17 +423,21 @@ void cgSolveWE(const FlagGrid &flags,
const int maxIter = (int)(cgMaxIterFac * flags.getSize().max()) * (flags.is3D() ? 1 : 4);
GridCgInterface *gcg;
if (flags.is3D())
gcg = new GridCg<ApplyMatrix>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
else
gcg = new GridCg<ApplyMatrix2D>(out, rhs, residual, search, flags, tmp, &A0, &Ai, &Aj, &Ak);
vector<Grid<Real> *> matA{&A0, &Ai, &Aj};
if (flags.is3D()) {
matA.push_back(&Ak);
gcg = new GridCg<ApplyMatrix>(out, rhs, residual, search, flags, tmp, matA);
}
else {
gcg = new GridCg<ApplyMatrix2D>(out, rhs, residual, search, flags, tmp, matA);
}
gcg->setAccuracy(cgAccuracy);
// no preconditioning for now...
Real time = 0;
for (int iter = 0; iter < maxIter; iter++) {
if (!gcg->iterate(time))
if (!gcg->iterate())
iter = maxIter;
}
debMsg("cgSolveWaveEq iterations:" << gcg->getIterations() << ", res:" << gcg->getSigma(), 1);

View File

@@ -89,6 +89,7 @@ extern void PbRegister_processBurn();
extern void PbRegister_updateFlame();
extern void PbRegister_getSpiralVelocity();
extern void PbRegister_setGradientYWeight();
extern void PbRegister_PD_fluid_guiding();
extern void PbRegister_releaseBlurPrecomp();
extern void PbRegister_KEpsilonComputeProduction();
extern void PbRegister_KEpsilonSources();
@@ -144,6 +145,7 @@ extern void PbRegister_flipComputeSurfaceNormals();
extern void PbRegister_flipUpdateNeighborRatio();
extern void PbRegister_particleSurfaceTurbulence();
extern void PbRegister_debugCheckParts();
extern void PbRegister_applyViscosity();
extern void PbRegister_markAsFixed();
extern void PbRegister_texcoordInflow();
extern void PbRegister_meshSmokeInflow();
@@ -285,6 +287,7 @@ void MantaEnsureRegistration()
PbRegister_updateFlame();
PbRegister_getSpiralVelocity();
PbRegister_setGradientYWeight();
PbRegister_PD_fluid_guiding();
PbRegister_releaseBlurPrecomp();
PbRegister_KEpsilonComputeProduction();
PbRegister_KEpsilonSources();
@@ -340,6 +343,7 @@ void MantaEnsureRegistration()
PbRegister_flipUpdateNeighborRatio();
PbRegister_particleSurfaceTurbulence();
PbRegister_debugCheckParts();
PbRegister_applyViscosity();
PbRegister_markAsFixed();
PbRegister_texcoordInflow();
PbRegister_meshSmokeInflow();

View File

@@ -52,7 +52,8 @@ template<class T> struct ApplyShapeToGrid : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<T> *grid, Shape *shape, T value, FlagGrid *respectFlags)
inline void op(
int i, int j, int k, Grid<T> *grid, Shape *shape, T value, FlagGrid *respectFlags) const
{
if (respectFlags && respectFlags->isObstacle(i, j, k))
return;
@@ -79,34 +80,36 @@ template<class T> struct ApplyShapeToGrid : public KernelBase {
return respectFlags;
}
typedef FlagGrid type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel ApplyShapeToGrid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<T> *grid;
Shape *shape;
@@ -138,7 +141,7 @@ template<class T> struct ApplyShapeToGridSmooth : public KernelBase {
Real sigma,
Real shift,
T value,
FlagGrid *respectFlags)
FlagGrid *respectFlags) const
{
if (respectFlags && respectFlags->isObstacle(i, j, k))
return;
@@ -178,34 +181,36 @@ template<class T> struct ApplyShapeToGridSmooth : public KernelBase {
return respectFlags;
}
typedef FlagGrid type5;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel ApplyShapeToGridSmooth ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, phi, sigma, shift, value, respectFlags);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<T> *grid;
Grid<Real> &phi;
@@ -225,7 +230,7 @@ struct ApplyShapeToMACGrid : public KernelBase {
run();
}
inline void op(
int i, int j, int k, MACGrid *grid, Shape *shape, Vec3 value, FlagGrid *respectFlags)
int i, int j, int k, MACGrid *grid, Shape *shape, Vec3 value, FlagGrid *respectFlags) const
{
if (respectFlags && respectFlags->isObstacle(i, j, k))
return;
@@ -256,34 +261,36 @@ struct ApplyShapeToMACGrid : public KernelBase {
return respectFlags;
}
typedef FlagGrid type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel ApplyShapeToMACGrid ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, grid, shape, value, respectFlags);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
MACGrid *grid;
Shape *shape;
@@ -422,7 +429,7 @@ struct BoxSDF : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<Real> &phi, const Vec3 &p1, const Vec3 &p2)
inline void op(int i, int j, int k, Grid<Real> &phi, const Vec3 &p1, const Vec3 &p2) const
{
const Vec3 p(i + 0.5, j + 0.5, k + 0.5);
if (p.x <= p2.x && p.x >= p1.x && p.y <= p2.y && p.y >= p1.y && p.z <= p2.z && p.z >= p1.z) {
@@ -498,34 +505,36 @@ struct BoxSDF : public KernelBase {
return p2;
}
typedef Vec3 type2;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel BoxSDF ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, p1, p2);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, p1, p2);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, p1, p2);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<Real> &phi;
const Vec3 &p1;
@@ -638,7 +647,7 @@ struct SphereSDF : public KernelBase {
runMessage();
run();
}
inline void op(int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 scale)
inline void op(int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 scale) const
{
phi(i, j, k) = norm((Vec3(i + 0.5, j + 0.5, k + 0.5) - center) / scale) - radius;
}
@@ -662,34 +671,36 @@ struct SphereSDF : public KernelBase {
return scale;
}
typedef Vec3 type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel SphereSDF ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, scale);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, scale);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, scale);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<Real> &phi;
Vec3 center;
@@ -763,7 +774,7 @@ struct CylinderSDF : public KernelBase {
run();
}
inline void op(
int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 zaxis, Real maxz)
int i, int j, int k, Grid<Real> &phi, Vec3 center, Real radius, Vec3 zaxis, Real maxz) const
{
Vec3 p = Vec3(i + 0.5, j + 0.5, k + 0.5) - center;
Real z = fabs(dot(p, zaxis));
@@ -809,34 +820,36 @@ struct CylinderSDF : public KernelBase {
return maxz;
}
typedef Real type4;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel CylinderSDF ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, zaxis, maxz);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, zaxis, maxz);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, phi, center, radius, zaxis, maxz);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
Grid<Real> &phi;
Vec3 center;
@@ -907,8 +920,13 @@ struct SlopeSDF : public KernelBase {
runMessage();
run();
}
inline void op(
int i, int j, int k, const Vec3 &n, Grid<Real> &phiObs, const Real &fac, const Real &origin)
inline void op(int i,
int j,
int k,
const Vec3 &n,
Grid<Real> &phiObs,
const Real &fac,
const Real &origin) const
{
phiObs(i, j, k) = (n.x * (double)i + n.y * (double)j + n.z * (double)k - origin) * fac;
@@ -933,34 +951,36 @@ struct SlopeSDF : public KernelBase {
return origin;
}
typedef Real type3;
void runMessage(){};
void run()
void runMessage()
{
debMsg("Executing kernel SlopeSDF ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
const int _maxX = maxX;
const int _maxY = maxY;
if (maxZ > 1) {
#pragma omp parallel
{
#pragma omp for
for (int k = minZ; k < maxZ; k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, n, phiObs, fac, origin);
}
}
else {
const int k = 0;
#pragma omp parallel
{
#pragma omp for
for (int k = __r.begin(); k != (int)__r.end(); k++)
for (int j = 0; j < _maxY; j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, n, phiObs, fac, origin);
}
}
else {
const int k = 0;
for (int j = __r.begin(); j != (int)__r.end(); j++)
for (int i = 0; i < _maxX; i++)
op(i, j, k, n, phiObs, fac, origin);
}
}
void run()
{
if (maxZ > 1)
tbb::parallel_for(tbb::blocked_range<IndexInt>(minZ, maxZ), *this);
else
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, maxY), *this);
}
const Vec3 &n;
Grid<Real> &phiObs;

View File

@@ -269,6 +269,7 @@ class Shape : public PbClass {
protected:
GridType mType;
public:
PbArgs _args;
}
@@ -319,6 +320,7 @@ class NullShape : public Shape {
{
gridSetConst<Real>(phi, 1000.0f);
}
public:
PbArgs _args;
}
@@ -394,6 +396,7 @@ class Box : public Shape {
protected:
Vec3 mP0, mP1;
public:
PbArgs _args;
}
@@ -455,6 +458,7 @@ class Sphere : public Shape {
protected:
Vec3 mCenter, mScale;
Real mRadius;
public:
PbArgs _args;
}
@@ -579,6 +583,7 @@ class Cylinder : public Shape {
protected:
Vec3 mCenter, mZDir;
Real mRadius, mZ;
public:
PbArgs _args;
}
@@ -655,6 +660,7 @@ class Slope : public Shape {
Real mAnglexy, mAngleyz;
Real mOrigin;
Vec3 mGs;
public:
PbArgs _args;
}

View File

@@ -50,21 +50,28 @@ struct reductionTest : public KernelBase {
return v;
}
typedef Grid<Real> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel reductionTest ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, v, sum);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
double sum = 0;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, v, sum);
#pragma omp critical
{
this->sum += sum;
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
reductionTest(reductionTest &o, tbb::split) : KernelBase(o), v(o.v), sum(0)
{
}
void join(const reductionTest &o)
{
sum += o.sum;
}
const Grid<Real> &v;
double sum;
@@ -94,21 +101,28 @@ struct minReduction : public KernelBase {
return v;
}
typedef Grid<Real> type0;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel minReduction ", 3);
debMsg("Kernel range"
<< " x " << maxX << " y " << maxY << " z " << minZ << " - " << maxZ << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r)
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, v, sum);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
double sum = 0;
#pragma omp for nowait
for (IndexInt i = 0; i < _sz; i++)
op(i, v, sum);
#pragma omp critical
{
this->sum = min(sum, this->sum);
}
}
tbb::parallel_reduce(tbb::blocked_range<IndexInt>(0, size), *this);
}
minReduction(minReduction &o, tbb::split) : KernelBase(o), v(o.v), sum(0)
{
}
void join(const minReduction &o)
{
sum = min(sum, o.sum);
}
const Grid<Real> &v;
double sum;

View File

@@ -136,7 +136,7 @@ struct KnSynthesizeTurbulence : public KernelBase {
int octaves,
Real scale,
Real invL0,
Real kmin)
Real kmin) const
{
const Real PERSISTENCE = 0.56123f;
@@ -217,17 +217,21 @@ struct KnSynthesizeTurbulence : public KernelBase {
return kmin;
}
typedef Real type9;
void runMessage(){};
void runMessage()
{
debMsg("Executing kernel KnSynthesizeTurbulence ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, p, flags, noise, kGrid, alpha, dt, octaves, scale, invL0, kmin);
}
void run()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, p, flags, noise, kGrid, alpha, dt, octaves, scale, invL0, kmin);
}
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
TurbulenceParticleSystem &p;
FlagGrid &flags;

View File

@@ -199,6 +199,7 @@ class TurbulenceParticleSystem : public ParticleSystem<TurbulenceParticleData> {
private:
WaveletNoiseField &noise;
public:
PbArgs _args;
}

View File

@@ -60,24 +60,56 @@ inline Vec3 VortexKernel(const Vec3 &p, const vector<VortexParticleData> &vp, Re
return u;
}
struct KnVpAdvectMesh : public KernelBase {
KnVpAdvectMesh(vector<Node> &nodes, const vector<VortexParticleData> &vp, Real scale)
: KernelBase(nodes.size()), nodes(nodes), vp(vp), scale(scale), u((size))
struct _KnVpAdvectMesh : public KernelBase {
_KnVpAdvectMesh(const KernelBase &base,
vector<Node> &nodes,
const vector<VortexParticleData> &vp,
Real scale,
vector<Vec3> &u)
: KernelBase(base), nodes(nodes), vp(vp), scale(scale), u(u)
{
runMessage();
run();
}
inline void op(IndexInt idx,
vector<Node> &nodes,
const vector<VortexParticleData> &vp,
Real scale,
vector<Vec3> &u)
vector<Vec3> &u) const
{
if (nodes[idx].flags & Mesh::NfFixed)
u[idx] = 0.0;
else
u[idx] = VortexKernel(nodes[idx].pos, vp, scale);
}
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, nodes, vp, scale, u);
}
void run()
{
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
vector<Node> &nodes;
const vector<VortexParticleData> &vp;
Real scale;
vector<Vec3> &u;
};
struct KnVpAdvectMesh : public KernelBase {
KnVpAdvectMesh(vector<Node> &nodes, const vector<VortexParticleData> &vp, Real scale)
: KernelBase(nodes.size()),
_inner(KernelBase(nodes.size()), nodes, vp, scale, u),
nodes(nodes),
vp(vp),
scale(scale),
u((size))
{
runMessage();
run();
}
void run()
{
_inner.run();
}
inline operator vector<Vec3>()
{
return u;
@@ -101,38 +133,63 @@ struct KnVpAdvectMesh : public KernelBase {
return scale;
}
typedef Real type2;
void runMessage(){};
void run()
void runMessage()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, nodes, vp, scale, u);
}
}
debMsg("Executing kernel KnVpAdvectMesh ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
_KnVpAdvectMesh _inner;
vector<Node> &nodes;
const vector<VortexParticleData> &vp;
Real scale;
vector<Vec3> u;
};
struct KnVpAdvectSelf : public KernelBase {
KnVpAdvectSelf(vector<VortexParticleData> &vp, Real scale)
: KernelBase(vp.size()), vp(vp), scale(scale), u((size))
struct _KnVpAdvectSelf : public KernelBase {
_KnVpAdvectSelf(const KernelBase &base,
vector<VortexParticleData> &vp,
Real scale,
vector<Vec3> &u)
: KernelBase(base), vp(vp), scale(scale), u(u)
{
runMessage();
run();
}
inline void op(IndexInt idx, vector<VortexParticleData> &vp, Real scale, vector<Vec3> &u)
inline void op(IndexInt idx, vector<VortexParticleData> &vp, Real scale, vector<Vec3> &u) const
{
if (vp[idx].flag & ParticleBase::PDELETE)
u[idx] = 0.0;
else
u[idx] = VortexKernel(vp[idx].pos, vp, scale);
}
void operator()(const tbb::blocked_range<IndexInt> &__r) const
{
for (IndexInt idx = __r.begin(); idx != (IndexInt)__r.end(); idx++)
op(idx, vp, scale, u);
}
void run()
{
tbb::parallel_for(tbb::blocked_range<IndexInt>(0, size), *this);
}
vector<VortexParticleData> &vp;
Real scale;
vector<Vec3> &u;
};
struct KnVpAdvectSelf : public KernelBase {
KnVpAdvectSelf(vector<VortexParticleData> &vp, Real scale)
: KernelBase(vp.size()),
_inner(KernelBase(vp.size()), vp, scale, u),
vp(vp),
scale(scale),
u((size))
{
runMessage();
run();
}
void run()
{
_inner.run();
}
inline operator vector<Vec3>()
{
return u;
@@ -151,18 +208,14 @@ struct KnVpAdvectSelf : public KernelBase {
return scale;
}
typedef Real type1;
void runMessage(){};
void run()
void runMessage()
{
const IndexInt _sz = size;
#pragma omp parallel
{
#pragma omp for
for (IndexInt i = 0; i < _sz; i++)
op(i, vp, scale, u);
}
}
debMsg("Executing kernel KnVpAdvectSelf ", 3);
debMsg("Kernel range"
<< " size " << size << " ",
4);
};
_KnVpAdvectSelf _inner;
vector<VortexParticleData> &vp;
Real scale;
vector<Vec3> u;

View File

@@ -127,6 +127,7 @@ class VortexParticleSystem : public ParticleSystem<VortexParticleData> {
}
virtual ParticleBase *clone();
public:
PbArgs _args;
}

View File

@@ -240,6 +240,7 @@ class VortexSheetMesh : public Mesh {
VorticityChannel mVorticity;
TexCoord3Channel mTex1, mTex2;
TurbulenceChannel mTurb;
public:
PbArgs _args;
}

View File

@@ -59,8 +59,8 @@ int MANTA::with_debug(0);
MANTA::MANTA(int *res, FluidModifierData *fmd)
: mCurrentID(++solverID), mMaxRes(fmd->domain->maxres)
{
//if (with_debug)
cout << "============= FLUID: " << mCurrentID << " with res(" << res[0] << ", " << res[1] << ", " << res[2]
if (with_debug)
cout << "FLUID: " << mCurrentID << " with res(" << res[0] << ", " << res[1] << ", " << res[2]
<< ")" << endl;
FluidDomainSettings *fds = fmd->domain;
@@ -279,7 +279,6 @@ MANTA::MANTA(int *res, FluidModifierData *fmd)
}
/* All requested initializations must not fail in constructor. */
BLI_assert(initSuccess);
UNUSED_VARS(initSuccess);
updatePointers(fmd);
}

View File

@@ -273,32 +273,13 @@ def fluid_adapt_time_step_$ID$():\n\
const std::string fluid_alloc =
"\n\
mantaMsg('Fluid alloc data')\n\
flags_s$ID$ = s$ID$.create(FlagGrid, name='$NAME_FLAGS$', offload=True)\n\
mantaMsg('Fluid alloc data vel')\n\
vel_s$ID$ = s$ID$.create(MACGrid, name='$NAME_VELOCITY$', sparse=True, offload=True)\n\
flags_s$ID$ = s$ID$.create(FlagGrid, name='$NAME_FLAGS$')\n\
vel_s$ID$ = s$ID$.create(MACGrid, name='$NAME_VELOCITY$', sparse=True)\n\
velTmp_s$ID$ = s$ID$.create(MACGrid, name='$NAME_VELOCITYTMP$', sparse=True)\n\
x_vel_s$ID$ = s$ID$.create(RealGrid, name='$NAME_VELOCITY_X$')\n\
y_vel_s$ID$ = s$ID$.create(RealGrid, name='$NAME_VELOCITY_Y$')\n\
z_vel_s$ID$ = s$ID$.create(RealGrid, name='$NAME_VELOCITY_Z$')\n\
mantaMsg('Fluid alloc data pressure')\n\
pressure_s$ID$ = s$ID$.create(RealGrid, name='$NAME_PRESSURE$', offload=True)\n\
mantaMsg('Fluid alloc data rhs')\n\
rhs_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data A0')\n\
A0_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data Ai')\n\
Ai_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data Aj')\n\
Aj_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data Ak')\n\
Ak_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data search')\n\
search_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data residual')\n\
residual_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data tmp')\n\
tmp_s$ID$ = s$ID$.create(RealGrid, offload=True)\n\
mantaMsg('Fluid alloc data 6')\n\
pressure_s$ID$ = s$ID$.create(RealGrid, name='$NAME_PRESSURE$')\n\
phiObs_s$ID$ = s$ID$.create(LevelsetGrid, name='$NAME_PHIOBS$')\n\
phiSIn_s$ID$ = s$ID$.create(LevelsetGrid, name='$NAME_PHISIN$') # helper for static flow objects\n\
phiIn_s$ID$ = s$ID$.create(LevelsetGrid, name='$NAME_PHIIN$')\n\
@@ -317,8 +298,7 @@ phiOut_s$ID$.setConst(9999)\n\
\n\
# Keep track of important objects in dict to load them later on\n\
fluid_data_dict_final_s$ID$ = { 'vel' : vel_s$ID$ }\n\
fluid_data_dict_resume_s$ID$ = { 'phiObs' : phiObs_s$ID$, 'phiIn' : phiIn_s$ID$, 'phiOut' : phiOut_s$ID$, 'flags' : flags_s$ID$, 'velTmp' : velTmp_s$ID$ }\n\
mantaMsg('Fluid alloc DONE')\n";
fluid_data_dict_resume_s$ID$ = { 'phiObs' : phiObs_s$ID$, 'phiIn' : phiIn_s$ID$, 'phiOut' : phiOut_s$ID$, 'flags' : flags_s$ID$, 'velTmp' : velTmp_s$ID$ }\n";
const std::string fluid_alloc_obstacle =
"\n\
@@ -497,8 +477,7 @@ mantaMsg('Delete guiding solver')\n\
if 'sg$ID$' in globals(): del sg$ID$\n\
\n\
# Release unreferenced memory (if there is some left)\n\
gc.collect()\n\
mantaMsg('Done deleting')\n";
gc.collect()\n";
//////////////////////////////////////////////////////////////////////
// BAKE

View File

@@ -274,14 +274,9 @@ def liquid_step_$ID$():\n\
velTmp_s$ID$.copyFrom(vel_s$ID$)\n\
\n\
mantaMsg('Advecting phi')\n\
#phi_s$ID$.updateToOmp()\n\
#vel_s$ID$.updateToOmp()\n\
advectSemiLagrange(flags=flags_s$ID$, vel=vel_s$ID$, grid=phi_s$ID$, order=1) # first order is usually enough\n\
\n\
mantaMsg('Advecting velocity')\n\
advectSemiLagrange(flags=flags_s$ID$, vel=vel_s$ID$, grid=vel_s$ID$, order=2)\n\
#phi_s$ID$.updateFromOmp()\n\
#vel_s$ID$.updateFromOmp()\n\
\n\
# create level set of particles\n\
gridParticleIndex(parts=pp_s$ID$, flags=flags_s$ID$, indexSys=pindex_s$ID$, index=gpi_s$ID$)\n\
@@ -328,45 +323,21 @@ def liquid_step_$ID$():\n\
getLaplacian(laplacian=curvature_s$ID$, grid=phi_s$ID$)\n\
curvature_s$ID$.clamp(-1.0, 1.0)\n\
\n\
#vel_s$ID$.updateToOmp()\n\
setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
#vel_s$ID$.updateFromOmp()\n\
\n\
if using_viscosity_s$ID$:\n\
viscosity_s$ID$.setConst(viscosityValue_s$ID$)\n\
applyViscosity(flags=flags_s$ID$, phi=phi_s$ID$, vel=vel_s$ID$, volumes=volumes_s$ID$, viscosity=viscosity_s$ID$)\n\
\n\
#vel_s$ID$.updateToOmp()\n\
setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
#vel_s$ID$.updateFromOmp()\n\
\n\
if using_guiding_s$ID$:\n\
mantaMsg('Guiding and pressure')\n\
PD_fluid_guiding(vel=vel_s$ID$, velT=velT_s$ID$, flags=flags_s$ID$, phi=phi_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, weight=weightGuide_s$ID$, blurRadius=beta_sg$ID$, pressure=pressure_s$ID$, tau=tau_sg$ID$, sigma=sigma_sg$ID$, theta=theta_sg$ID$, zeroPressureFixing=domainClosed_s$ID$)\n\
else:\n\
print('Pressure')\n\
# openmp sync to device\n\
flags_s$ID$.updateToOmp()\n\
vel_s$ID$.updateToOmp()\n\
print('Pressure 2')\n\
\n\
#solvePressure(flags=flags_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, obvel=obvel_s$ID$ if using_fractions_s$ID$ else None, zeroPressureFixing=domainClosed_s$ID$)\n\
computePressureRhs(rhs=rhs_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, preconditioner=PcNone)\n\
print('Pressure 21')\n\
solvePressureSystem(rhs=rhs_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, useL2Norm=True, preconditioner=PcNone, residual=residual_s$ID$, search=search_s$ID$, A0=A0_s$ID$, Ai=Ai_s$ID$, Aj=Aj_s$ID$, Ak=Ak_s$ID$, tmp=tmp_s$ID$)\n\
print('Pressure 22')\n\
correctVelocity(vel=vel_s$ID$, pressure=pressure_s$ID$, flags=flags_s$ID$, preconditioner=PcNone)\n\
\n\
print('Pressure 3')\n\
# openmp sync from device\n\
pressure_s$ID$.updateFromOmp()\n\
vel_s$ID$.updateFromOmp()\n\
mantaMsg('Pressure')\n\
solvePressure(flags=flags_s$ID$, vel=vel_s$ID$, pressure=pressure_s$ID$, curv=curvature_s$ID$, surfTens=surfaceTension_s$ID$, fractions=fractions_s$ID$, obvel=obvel_s$ID$ if using_fractions_s$ID$ else None, zeroPressureFixing=domainClosed_s$ID$)\n\
\n\
extrapolateMACSimple(flags=flags_s$ID$, vel=vel_s$ID$, distance=4, intoObs=True if using_fractions_s$ID$ else False)\n\
\n\
#vel_s$ID$.updateToOmp()\n\
setWallBcs(flags=flags_s$ID$, vel=vel_s$ID$, obvel=None if using_fractions_s$ID$ else obvel_s$ID$, phiObs=phiObs_s$ID$, fractions=fractions_s$ID$)\n\
#vel_s$ID$.updateFromOmp()\n\
\n\
if not using_fractions_s$ID$:\n\
extrapolateMACSimple(flags=flags_s$ID$, vel=vel_s$ID$)\n\

View File

@@ -48,7 +48,6 @@ class SpellChecker:
"equi", # equi-angular, etc.
"fader",
"globbing",
"gridded",
"haptics",
"hasn", # hasn't
"hetero",
@@ -65,14 +64,12 @@ class SpellChecker:
"mplayer",
"ons", # add-ons
"pong", # ping pong
"resumable",
"scalable",
"shadeless",
"shouldn", # shouldn't
"smoothen",
"spacings",
"teleport", "teleporting",
"tangency",
"vertices",
"wasn", # wasn't
@@ -176,13 +173,11 @@ class SpellChecker:
"precalculate",
"precomputing",
"prefetch",
"preload",
"premultiply", "premultiplied",
"prepass",
"prepend",
"preprocess", "preprocessing", "preprocessor", "preprocessed",
"preprocess", "preprocessing", "preprocessor",
"preseek",
"preselect", "preselected",
"promillage",
"pushdown",
"raytree",
@@ -190,10 +185,8 @@ class SpellChecker:
"realtime",
"reinject", "reinjected",
"rekey",
"relink",
"remesh",
"reprojection", "reproject", "reprojecting",
"resample",
"resize",
"restpose",
"resync", "resynced",
@@ -233,7 +226,6 @@ class SpellChecker:
"todo",
"tradeoff",
"un",
"unadjust", "unadjusted",
"unassociate", "unassociated",
"unbake",
"uncheck",
@@ -396,7 +388,6 @@ class SpellChecker:
"boid", "boids",
"ceil",
"compressibility",
"coplanar",
"curvilinear",
"equiangular",
"equisolid",
@@ -405,7 +396,6 @@ class SpellChecker:
"gettext",
"hashable",
"hotspot",
"hydrostatic",
"interocular",
"intrinsics",
"irradiance",
@@ -505,7 +495,6 @@ class SpellChecker:
"perlin",
"phong",
"pinlight",
"posterize",
"qi",
"radiosity",
"raycast", "raycasting",

View File

@@ -74,12 +74,6 @@ char *BKE_packedfile_unpack_to_file(struct ReportList *reports,
const char *local_name,
struct PackedFile *pf,
enum ePF_FileStatus how);
char *BKE_packedfile_unpack(struct Main *bmain,
struct ReportList *reports,
struct ID *id,
const char *orig_file_name,
struct PackedFile *pf,
enum ePF_FileStatus how);
int BKE_packedfile_unpack_vfont(struct Main *bmain,
struct ReportList *reports,
struct VFont *vfont,

View File

@@ -576,42 +576,26 @@ static void unpack_generate_paths(const char *name,
}
}
char *BKE_packedfile_unpack(Main *bmain,
ReportList *reports,
ID *id,
const char *orig_file_path,
PackedFile *pf,
enum ePF_FileStatus how)
{
char localname[FILE_MAX], absname[FILE_MAX];
char *new_name = NULL;
if (id != NULL) {
unpack_generate_paths(
orig_file_path, id, absname, localname, sizeof(absname), sizeof(localname));
new_name = BKE_packedfile_unpack_to_file(
reports, BKE_main_blendfile_path(bmain), absname, localname, pf, how);
}
return new_name;
}
int BKE_packedfile_unpack_vfont(Main *bmain,
ReportList *reports,
VFont *vfont,
enum ePF_FileStatus how)
{
char localname[FILE_MAX], absname[FILE_MAX];
char *newname;
int ret_value = RET_ERROR;
if (vfont) {
char *new_file_path = BKE_packedfile_unpack(
bmain, reports, (ID *)vfont, vfont->filepath, vfont->packedfile, how);
if (new_file_path != NULL) {
if (vfont != NULL) {
unpack_generate_paths(
vfont->filepath, (ID *)vfont, absname, localname, sizeof(absname), sizeof(localname));
newname = BKE_packedfile_unpack_to_file(
reports, BKE_main_blendfile_path(bmain), absname, localname, vfont->packedfile, how);
if (newname != NULL) {
ret_value = RET_OK;
BKE_packedfile_free(vfont->packedfile);
vfont->packedfile = NULL;
BLI_strncpy(vfont->filepath, new_file_path, sizeof(vfont->filepath));
MEM_freeN(new_file_path);
BLI_strncpy(vfont->filepath, newname, sizeof(vfont->filepath));
MEM_freeN(newname);
}
}
@@ -623,14 +607,18 @@ int BKE_packedfile_unpack_sound(Main *bmain,
bSound *sound,
enum ePF_FileStatus how)
{
char localname[FILE_MAX], absname[FILE_MAX];
char *newname;
int ret_value = RET_ERROR;
if (sound != NULL) {
char *new_file_path = BKE_packedfile_unpack(
bmain, reports, (ID *)sound, sound->filepath, sound->packedfile, how);
if (new_file_path != NULL) {
BLI_strncpy(sound->filepath, new_file_path, sizeof(sound->filepath));
MEM_freeN(new_file_path);
unpack_generate_paths(
sound->filepath, (ID *)sound, absname, localname, sizeof(absname), sizeof(localname));
newname = BKE_packedfile_unpack_to_file(
reports, BKE_main_blendfile_path(bmain), absname, localname, sound->packedfile, how);
if (newname != NULL) {
BLI_strncpy(sound->filepath, newname, sizeof(sound->filepath));
MEM_freeN(newname);
BKE_packedfile_free(sound->packedfile);
sound->packedfile = NULL;
@@ -653,11 +641,16 @@ int BKE_packedfile_unpack_image(Main *bmain,
if (ima != NULL) {
while (ima->packedfiles.last) {
char localname[FILE_MAX], absname[FILE_MAX];
char *newname;
ImagePackedFile *imapf = ima->packedfiles.last;
char *new_file_path = BKE_packedfile_unpack(
bmain, reports, (ID *)ima, imapf->filepath, imapf->packedfile, how);
if (new_file_path != NULL) {
unpack_generate_paths(
imapf->filepath, (ID *)ima, absname, localname, sizeof(absname), sizeof(localname));
newname = BKE_packedfile_unpack_to_file(
reports, BKE_main_blendfile_path(bmain), absname, localname, imapf->packedfile, how);
if (newname != NULL) {
ImageView *iv;
ret_value = ret_value == RET_ERROR ? RET_ERROR : RET_OK;
@@ -667,14 +660,14 @@ int BKE_packedfile_unpack_image(Main *bmain,
/* update the new corresponding view filepath */
iv = BLI_findstring(&ima->views, imapf->filepath, offsetof(ImageView, filepath));
if (iv) {
BLI_strncpy(iv->filepath, new_file_path, sizeof(imapf->filepath));
BLI_strncpy(iv->filepath, newname, sizeof(imapf->filepath));
}
/* keep the new name in the image for non-pack specific reasons */
if (how != PF_REMOVE) {
BLI_strncpy(ima->filepath, new_file_path, sizeof(imapf->filepath));
BLI_strncpy(ima->filepath, newname, sizeof(imapf->filepath));
}
MEM_freeN(new_file_path);
MEM_freeN(newname);
}
else {
ret_value = RET_ERROR;
@@ -697,14 +690,18 @@ int BKE_packedfile_unpack_volume(Main *bmain,
Volume *volume,
enum ePF_FileStatus how)
{
char localname[FILE_MAX], absname[FILE_MAX];
char *newfilepath;
int ret_value = RET_ERROR;
if (volume != NULL) {
char *new_file_path = BKE_packedfile_unpack(
bmain, reports, (ID *)volume, volume->filepath, volume->packedfile, how);
if (new_file_path != NULL) {
BLI_strncpy(volume->filepath, new_file_path, sizeof(volume->filepath));
MEM_freeN(new_file_path);
unpack_generate_paths(
volume->filepath, (ID *)volume, absname, localname, sizeof(absname), sizeof(localname));
newfilepath = BKE_packedfile_unpack_to_file(
reports, BKE_main_blendfile_path(bmain), absname, localname, volume->packedfile, how);
if (newfilepath != NULL) {
BLI_strncpy(volume->filepath, newfilepath, sizeof(volume->filepath));
MEM_freeN(newfilepath);
BKE_packedfile_free(volume->packedfile);
volume->packedfile = NULL;

View File

@@ -695,7 +695,7 @@ static void duplidata_value_free(void *val)
static void duplidata_key_free(void *key)
{
DupliKey *dupli_key = (DupliKey *)key;
if (dupli_key->ob_data == dupli_key->ob->data) {
if (dupli_key->ob_data == NULL) {
drw_batch_cache_generate_requested(dupli_key->ob);
}
else {

View File

@@ -578,8 +578,8 @@ static bool ed_undo_is_init_poll(bContext *C)
* it will be part of the exception when attempting to call undo in background mode. */
CTX_wm_operator_poll_msg_set(
C,
"Undo disabled at startup in background-mode "
"(call `ed.undo_push()` to explicitly initialize the undo-system)");
"Undo disabled at startup in background-mode. "
"Call `ed.undo_push()` to explicitly initialize the undo-system.");
return false;
}
return true;

View File

@@ -151,14 +151,12 @@ template<typename NodePtr> class GFieldBase {
friend bool operator==(const GFieldBase &a, const GFieldBase &b)
{
/* Two nodes can compare equal even when their pointer is not the same. For example, two
* "Position" nodes are the same. */
return *a.node_ == *b.node_ && a.node_output_index_ == b.node_output_index_;
return &*a.node_ == &*b.node_ && a.node_output_index_ == b.node_output_index_;
}
uint64_t hash() const
{
return get_default_hash_2(*node_, node_output_index_);
return get_default_hash_2(node_, node_output_index_);
}
const fn::CPPType &cpp_type() const

View File

@@ -76,7 +76,7 @@
.adapt_margin = 4, \
.adapt_res = 0, \
.adapt_threshold = 0.02f, \
.maxres = 64, \
.maxres = 32, \
.solver_res = 3, \
.border_collisions = 0, \
.flags = FLUID_DOMAIN_USE_DISSOLVE_LOG | FLUID_DOMAIN_USE_ADAPTIVE_TIME, \

View File

@@ -41,8 +41,8 @@ const EnumPropertyItem rna_enum_attribute_type_items[] = {
{CD_PROP_FLOAT, "FLOAT", 0, "Float", "Floating-point value"},
{CD_PROP_INT32, "INT", 0, "Integer", "32-bit integer"},
{CD_PROP_FLOAT3, "FLOAT_VECTOR", 0, "Vector", "3D vector with floating-point values"},
{CD_PROP_COLOR, "FLOAT_COLOR", 0, "Color", "RGBA color with floating-point values"},
{CD_MLOOPCOL, "BYTE_COLOR", 0, "Byte Color", "RGBA color with 8-bit values"},
{CD_PROP_COLOR, "FLOAT_COLOR", 0, "Color", "RGBA color with floating-point precisions"},
{CD_MLOOPCOL, "BYTE_COLOR", 0, "Byte Color", "RGBA color with 8-bit precision"},
{CD_PROP_STRING, "STRING", 0, "String", "Text string"},
{CD_PROP_BOOL, "BOOLEAN", 0, "Boolean", "True or false"},
{CD_PROP_FLOAT2, "FLOAT2", 0, "2D Vector", "2D vector with floating-point values"},
@@ -54,8 +54,8 @@ const EnumPropertyItem rna_enum_attribute_type_with_auto_items[] = {
{CD_PROP_FLOAT, "FLOAT", 0, "Float", "Floating-point value"},
{CD_PROP_INT32, "INT", 0, "Integer", "32-bit integer"},
{CD_PROP_FLOAT3, "FLOAT_VECTOR", 0, "Vector", "3D vector with floating-point values"},
{CD_PROP_COLOR, "FLOAT_COLOR", 0, "Color", "RGBA color with floating-point values"},
{CD_MLOOPCOL, "BYTE_COLOR", 0, "Byte Color", "RGBA color with 8-bit values"},
{CD_PROP_COLOR, "FLOAT_COLOR", 0, "Color", "RGBA color with floating-point precisions"},
{CD_MLOOPCOL, "BYTE_COLOR", 0, "Byte Color", "RGBA color with 8-bit precision"},
{CD_PROP_STRING, "STRING", 0, "String", "Text string"},
{CD_PROP_BOOL, "BOOLEAN", 0, "Boolean", "True or false"},
{CD_PROP_FLOAT2, "FLOAT2", 0, "2D Vector", "2D vector with floating-point values"},
@@ -443,7 +443,7 @@ static void rna_def_attribute_float_vector(BlenderRNA *brna)
srna = RNA_def_struct(brna, "FloatVectorAttribute", "Attribute");
RNA_def_struct_sdna(srna, "CustomDataLayer");
RNA_def_struct_ui_text(
srna, "Float Vector Attribute", "Vector geometry attribute, with floating-point values");
srna, "Float Vector Attribute", "Vector geometry attribute, with floating-point precision");
prop = RNA_def_property(srna, "data", PROP_COLLECTION, PROP_NONE);
RNA_def_property_struct_type(prop, "FloatVectorAttributeValue");
@@ -479,7 +479,7 @@ static void rna_def_attribute_float_color(BlenderRNA *brna)
srna = RNA_def_struct(brna, "FloatColorAttribute", "Attribute");
RNA_def_struct_sdna(srna, "CustomDataLayer");
RNA_def_struct_ui_text(
srna, "Float Color Attribute", "Color geometry attribute, with floating-point values");
srna, "Float Color Attribute", "Color geometry attribute, with floating-point precision");
prop = RNA_def_property(srna, "data", PROP_COLLECTION, PROP_NONE);
RNA_def_property_struct_type(prop, "FloatColorAttributeValue");
@@ -514,7 +514,7 @@ static void rna_def_attribute_byte_color(BlenderRNA *brna)
srna = RNA_def_struct(brna, "ByteColorAttribute", "Attribute");
RNA_def_struct_sdna(srna, "CustomDataLayer");
RNA_def_struct_ui_text(
srna, "Byte Color Attribute", "Color geometry attribute, with 8-bit values");
srna, "Byte Color Attribute", "Color geometry attribute, with 8-bit precision");
prop = RNA_def_property(srna, "data", PROP_COLLECTION, PROP_NONE);
RNA_def_property_struct_type(prop, "ByteColorAttributeValue");
@@ -639,7 +639,7 @@ static void rna_def_attribute_float2(BlenderRNA *brna)
srna = RNA_def_struct(brna, "Float2Attribute", "Attribute");
RNA_def_struct_sdna(srna, "CustomDataLayer");
RNA_def_struct_ui_text(
srna, "Float2 Attribute", "2D vector geometry attribute, with floating-point values");
srna, "Float2 Attribute", "2D vector geometry attribute, with floating-point precision");
prop = RNA_def_property(srna, "data", PROP_COLLECTION, PROP_NONE);
RNA_def_property_struct_type(prop, "Float2AttributeValue");

View File

@@ -2845,7 +2845,7 @@ static void rna_def_tool_settings(BlenderRNA *brna)
"IMAGE",
ICON_IMAGE_DATA,
"Image",
"Stick stroke to the image"},
"Strick stroke to the image"},
/* Weird, GP_PROJECT_VIEWALIGN is inverted. */
{0, "VIEW", ICON_RESTRICT_VIEW_ON, "View", "Stick stroke to the view"},
{0, NULL, 0, NULL, NULL},

View File

@@ -44,7 +44,7 @@ static Mesh *create_ico_sphere_mesh(const int subdivisions, const float radius)
BMO_op_callf(bm,
BMO_FLAG_DEFAULTS,
"create_icosphere subdivisions=%i radius=%f matrix=%m4 calc_uvs=%b",
"create_icosphere subdivisions=%i diameter=%f matrix=%m4 calc_uvs=%b",
subdivisions,
std::abs(radius),
transform.values,

View File

@@ -688,13 +688,6 @@ if(UNIX AND NOT APPLE)
DESTINATION ${TARGETDIR_VER}/python/lib/python${PYTHON_VERSION}/site-packages
)
endif()
if(WITH_MOD_FLUID)
message(STATUS "============== Here")
install(
PROGRAMS $<TARGET_FILE:extern_mantaflow>
DESTINATION ${TARGETDIR_VER}/python/lib/python${PYTHON_VERSION}/site-packages
)
endif()
elseif(WIN32)
set(BLENDER_TEXT_FILES_DESTINATION ".")