diff --git a/third_party/mikktspace/README.md b/third_party/mikktspace/README.md
new file mode 100644
index 000000000..9fda1559e
--- /dev/null
+++ b/third_party/mikktspace/README.md
@@ -0,0 +1,4 @@
+# MikkTSpace
+A common standard for tangent space used in baking tools to produce normal maps.
+
+More information can be found at http://www.mikktspace.com/.
diff --git a/third_party/mikktspace/mikk_atomic_hash_set.hh b/third_party/mikktspace/mikk_atomic_hash_set.hh
new file mode 100644
index 000000000..aa8214603
--- /dev/null
+++ b/third_party/mikktspace/mikk_atomic_hash_set.hh
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: Apache-2.0
+ *
+ * Original code:
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Modifications:
+ * Copyright 2022 Blender Foundation
+ */
+
+/* Simplified version of Folly's AtomicHashArray
+ * (https://github.com/facebook/folly/blob/main/folly/AtomicHashArray.h).
+ *
+ * Notable changes:
+ * - Standalone and header-only.
+ * - Behaves like a set, not like a map: There's no value type anymore, only keys.
+ * - Capacity check logic has been removed; the code assumes you know the required size in
+ *   advance.
+ * - Custom allocator support has been removed.
+ * - Erase has been removed.
+ * - Find has been removed.
+ */
+
+/** \file
+ * \ingroup mikktspace
+ */
+
+#pragma once
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#endif
+
+#include <atomic>
+#include <type_traits>
+
+namespace mikk {
+
+struct AtomicHashSetLinearProbeFcn {
+  /* Linear probing: continue with the directly following cell. */
+  inline size_t operator()(size_t idx, size_t /* numProbes */, size_t capacity) const
+  {
+    const size_t next = idx + 1;
+    /* One conditional subtraction wraps past the end; this avoids a slow modulus. */
+    return LIKELY(next < capacity) ? next : (next - capacity);
+  }
+};
+
+struct AtomicHashSetQuadraticProbeFcn {
+  /* Quadratic probing: the step size is the number of probes done so far. */
+  inline size_t operator()(size_t idx, size_t numProbes, size_t capacity) const
+  {
+    const size_t next = idx + numProbes;
+    /* One conditional subtraction replaces the slow modulus (sum must stay < 2 * capacity). */
+    return LIKELY(next < capacity) ? next : (next - capacity);
+  }
+};
+
+template<class KeyT,
+         bool isAtomic,
+         class KeyHash = std::hash<KeyT>,
+         class KeyEqual = std::equal_to<KeyT>,
+         class ProbeFcn = AtomicHashSetLinearProbeFcn>
+class AtomicHashSet {
+  static_assert((std::is_convertible<KeyT, int32_t>::value ||
+                 std::is_convertible<KeyT, int64_t>::value ||
+                 std::is_convertible<KeyT, const void *>::value),
+                "You are trying to use AtomicHashSet with disallowed key "
+                "types.  You must use atomically compare-and-swappable integer "
+                "keys, or a different container class.");
+  /* Fixed-capacity open-addressing set; a cell holds either `kEmptyKey_` or an inserted key. */
+ public:
+  const size_t capacity_;
+  const KeyT kEmptyKey_;
+
+  KeyHash hasher_;
+  KeyEqual equalityChecker_;
+
+ private:
+  size_t kAnchorMask_;
+  /* When using a single thread, we can avoid overhead by not bothering with atomic cells. */
+  typedef typename std::conditional<isAtomic, std::atomic<KeyT>, KeyT>::type cell_type;
+  std::vector<cell_type> cells_;
+
+ public:
+  struct Config {
+    KeyT emptyKey;
+    double maxLoadFactor;
+    double growthFactor;  // not read by this simplified set, which never grows
+    size_t capacity;  // if positive, overrides maxLoadFactor
+
+    //  Cannot have constexpr ctor because some compilers rightly complain.
+    Config() : emptyKey((KeyT)-1), maxLoadFactor(0.8), growthFactor(-1), capacity(0) {}
+  };
+
+  /* Instead of a mess of arguments, we take a max size and a Config struct to
+   * simulate named ctor parameters.  The cell count defaults to maxSize / maxLoadFactor + 1;
+   * a positive Config::capacity is used directly instead of that computed value. In that
+   * case the caller has to make sure it is large enough for every key that gets inserted,
+   * because the set never grows. */
+  AtomicHashSet(size_t maxSize,
+                KeyHash hasher = KeyHash(),
+                KeyEqual equalityChecker = KeyEqual(),
+                const Config &c = Config())
+      : capacity_(c.capacity > 0 ? c.capacity : size_t(double(maxSize) / c.maxLoadFactor) + 1),
+        kEmptyKey_(c.emptyKey),
+        hasher_(hasher),
+        equalityChecker_(equalityChecker),
+        cells_(capacity_)
+  {
+    /* Get next power of two. Could be done more efficiently with builtin_clz, but this is not
+     * performance-critical. */
+    kAnchorMask_ = 1;
+    while (kAnchorMask_ < capacity_)
+      kAnchorMask_ *= 2;
+    /* Get mask for lower bits. */
+    kAnchorMask_ -= 1;
+
+    /* Not great, but the best we can do to support both atomic and non-atomic cells
+     * since std::atomic doesn't have a copy constructor so cells_(capacity_, kEmptyKey_)
+     * in the initializer list won't work. */
+    std::fill((KeyT *)cells_.data(), (KeyT *)cells_.data() + capacity_, kEmptyKey_);
+  }
+
+  AtomicHashSet(const AtomicHashSet &) = delete;
+  AtomicHashSet &operator=(const AtomicHashSet &) = delete;
+
+  ~AtomicHashSet() = default;
+
+  /* Sequential overload: compare-and-store; on failure `existingKey` gets the cell's key. */
+  bool tryUpdateCell(KeyT *cell, KeyT &existingKey, KeyT newKey)
+  {
+    if (*cell == existingKey) {
+      *cell = newKey;
+      return true;
+    }
+    existingKey = *cell;
+    return false;
+  }
+
+  /* Atomic overload: strong CAS; on failure `existingKey` gets the cell's current key. */
+  bool tryUpdateCell(std::atomic<KeyT> *cell, KeyT &existingKey, KeyT newKey)
+  {
+    return cell->compare_exchange_strong(existingKey, newKey, std::memory_order_acq_rel);
+  }
+  /* Returns {key, true} if inserted, {equal stored key, false} if present, else {empty, false}. */
+  std::pair<KeyT, bool> emplace(KeyT key)
+  {
+    size_t idx = keyToAnchorIdx(key);
+    size_t numProbes = 0;
+    for (;;) {
+      cell_type *cell = &cells_[idx];
+      KeyT existingKey = kEmptyKey_;
+      /* Try to replace empty cell with our key. */
+      if (tryUpdateCell(cell, existingKey, key)) {
+        /* Cell was empty, we're done. */
+        return std::make_pair(key, true);
+      }
+
+      /* Cell was not empty, check if the existing key is equal. */
+      if (equalityChecker_(existingKey, key)) {
+        /* Found equal element, we're done. */
+        return std::make_pair(existingKey, false);
+      }
+
+      /* Continue to next cell according to probe strategy. */
+      ++numProbes;
+      if (UNLIKELY(numProbes >= capacity_)) {
+        // probed every cell...fail
+        assert(false);
+        return std::make_pair(kEmptyKey_, false);
+      }
+
+      idx = ProbeFcn()(idx, numProbes, capacity_);
+    }
+  }
+
+ private:
+  inline size_t keyToAnchorIdx(const KeyT k) const
+  {
+    const size_t hashVal = hasher_(k);
+    const size_t probe = hashVal & kAnchorMask_;  // cheap mask first, modulus only if needed
+    return LIKELY(probe < capacity_) ? probe : hashVal % capacity_;
+  }
+
+};  // AtomicHashSet
+
+}  // namespace mikk
diff --git a/third_party/mikktspace/mikk_float3.hh b/third_party/mikktspace/mikk_float3.hh
new file mode 100644
index 000000000..fc70b5f5f
--- /dev/null
+++ b/third_party/mikktspace/mikk_float3.hh
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/** \file
+ * \ingroup mikktspace
+ */
+
+#pragma once
+
+#include <cmath>
+
+namespace mikk {
+
+struct float3 {
+  float x, y, z;
+
+  float3() = default;
+  /* Reads three consecutive floats starting at `ptr`. */
+  float3(const float *ptr) : x{ptr[0]}, y{ptr[1]}, z{ptr[2]} {}
+
+  float3(const float (*ptr)[3]) : float3((const float *)ptr) {}
+  /* Broadcasts a single value to all three components. */
+  explicit float3(float value) : x(value), y(value), z(value) {}
+
+  explicit float3(int value) : x((float)value), y((float)value), z((float)value) {}
+
+  float3(float x_, float y_, float z_) : x{x_}, y{y_}, z{z_} {}
+
+  static float3 zero()
+  {
+    return {0.0f, 0.0f, 0.0f};
+  }
+
+  friend float3 operator*(const float3 &a, float b)
+  {
+    return {a.x * b, a.y * b, a.z * b};
+  }
+
+  friend float3 operator*(float b, const float3 &a)
+  {
+    return {a.x * b, a.y * b, a.z * b};
+  }
+
+  friend float3 operator-(const float3 &a, const float3 &b)
+  {
+    return {a.x - b.x, a.y - b.y, a.z - b.z};
+  }
+
+  friend float3 operator-(const float3 &a)
+  {
+    return {-a.x, -a.y, -a.z};
+  }
+  /* Exact component-wise comparison, no tolerance. */
+  friend bool operator==(const float3 &a, const float3 &b)
+  {
+    return a.x == b.x && a.y == b.y && a.z == b.z;
+  }
+
+  float length_squared() const
+  {
+    return x * x + y * y + z * z;
+  }
+  /* std::sqrt selects the float overload; a bare sqrt may resolve to the C double version. */
+  float length() const
+  {
+    return std::sqrt(length_squared());
+  }
+
+  static float distance(const float3 &a, const float3 &b)
+  {
+    return (a - b).length();
+  }
+
+  friend float3 operator+(const float3 &a, const float3 &b)
+  {
+    return {a.x + b.x, a.y + b.y, a.z + b.z};
+  }
+
+  void operator+=(const float3 &b)
+  {
+    this->x += b.x;
+    this->y += b.y;
+    this->z += b.z;
+  }
+  /* Component-wise product (not a dot or cross product). */
+  friend float3 operator*(const float3 &a, const float3 &b)
+  {
+    return {a.x * b.x, a.y * b.y, a.z * b.z};
+  }
+  /* Returns the unit-length vector; a vector of length zero is returned unchanged. */
+  float3 normalize() const
+  {
+    const float len = length();
+    return (len != 0.0f) ? *this * (1.0f / len) : *this;
+  }
+  /* Sum of the three components. */
+  float reduce_add() const
+  {
+    return x + y + z;
+  }
+};
+
+inline float dot(const float3 &a, const float3 &b)
+{
+  return a.x * b.x + a.y * b.y + a.z * b.z; /* Sum of the component-wise products. */
+}
+
+inline float distance(const float3 &a, const float3 &b)
+{
+  return (a - b).length(); /* Length of the difference vector. */
+}
+
+/* Removes from v its component along n and normalizes the remainder. */
+inline float3 project(const float3 &n, const float3 &v)
+{
+  return (v - dot(n, v) * n).normalize();
+}
+
+}  // namespace mikk
diff --git a/third_party/mikktspace/mikk_util.hh b/third_party/mikktspace/mikk_util.hh
new file mode 100644
index 000000000..c848fcada
--- /dev/null
+++ b/third_party/mikktspace/mikk_util.hh
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/** \file
+ * \ingroup mikktspace
+ */
+
+#pragma once
+
+#include <cassert>
+#include <cmath>
+
+#ifndef M_PI_F
+#  define M_PI_F (3.1415926535897932f) /* pi */
+#endif
+
+namespace mikk {
+
+inline bool not_zero(const float fX)
+{
+  return FLT_MIN < fabsf(fX); /* True when the magnitude exceeds FLT_MIN. */
+}
+
+/* Packs a face index (upper 30 bits) and a vertex index (lower 2 bits) into one integer. */
+static uint pack_index(const uint face, const uint vert)
+{
+  assert(vert < 4);
+  return (face * 4u) + (vert & 3u);
+}
+
+static void unpack_index(uint &face, uint &vert, const uint indexIn)
+{
+  vert = indexIn % 4u; /* Lower 2 bits. */
+  face = indexIn / 4u; /* Remaining upper bits. */
+}
+
+/* From intern/cycles/util/math_fast.h */
+inline float fast_acosf(float x)
+{
+  const float mag = fabsf(x);
+  /* Clamp to at most 1; computing 1 - (1 - mag) also crushes denormals to zero. */
+  const float t = (mag < 1.0f) ? 1.0f - (1.0f - mag) : 1.0f;
+  /* Polynomial approximation based on http://www.pouet.net/topic.php?which=9132&page=2
+   * 85% accurate (ULP 0)
+   * Examined 2130706434 values of acos:
+   *   15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
+   * Examined 2130706434 values of acos:
+   *   15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
+   */
+  const float r = sqrtf(1.0f - t) *
+                  (1.5707963267f + t * (-0.213300989f + t * (0.077980478f + t * -0.02164095f)));
+  return (x < 0) ? (M_PI_F - r) : r;
+}
+
+static uint rotl(uint x, uint k)
+{
+  return (x << (k & 31u)) | (x >> ((32u - k) & 31u)); /* Masked: defined for every k. */
+}
+
+static uint hash_uint3(uint kx, uint ky, uint kz)
+{
+  /* Every lane starts from the same seed before its key is added in. */
+  const uint seed = 0xdeadbeef + (2 << 2) + 13;
+
+  uint a = seed + kx;
+  uint b = seed + ky;
+  uint c = seed + kz;
+  /* Mixing rounds; each lane is updated from the lane written just before it. */
+  c = (c ^ b) - rotl(b, 14);
+  a = (a ^ c) - rotl(c, 11);
+  b = (b ^ a) - rotl(a, 25);
+  c = (c ^ b) - rotl(b, 16);
+
+  return c;
+}
+
+static uint hash_uint3_fast(const uint x, const uint y, const uint z)
+{
+  return (z * 83492791u) ^ (y * 19349663u) ^ (x * 73856093u);
+}
+
+static uint float_as_uint(const float v)
+{
+  return *reinterpret_cast<const uint *>(&v); /* Reinterprets the bits, no conversion. */
+}
+
+static float uint_as_float(const uint v)
+{
+  return *reinterpret_cast<const float *>(&v); /* Reinterprets the bits, no conversion. */
+}
+
+static uint hash_float3_fast(const float x, const float y, const float z)
+{ /* Hashes raw bit patterns: 0.0f and -0.0f compare equal but hash differently. */
+  return hash_uint3_fast(float_as_uint(x), float_as_uint(y), float_as_uint(z));
+}
+
+static uint hash_float3x3(const float3 &x, const float3 &y, const float3 &z)
+{
+  const uint hx = hash_float3_fast(x.x, x.y, x.z);
+  const uint hy = hash_float3_fast(y.x, y.y, y.z);
+  return hash_uint3(hx, hy, hash_float3_fast(z.x, z.y, z.z));
+}
+
+template<typename T, typename KeyGetter>
+void radixsort(std::vector<T> &data, std::vector<T> &data2, KeyGetter getKey)
+{
+  typedef decltype(getKey(data[0])) key_t;
+  constexpr size_t datasize = sizeof(key_t);
+  static_assert(datasize % 2 == 0);
+  static_assert(std::is_integral<key_t>::value);
+  /* One 8-bit pass per key byte; bins[pass][b + 1] counts the keys whose byte `pass` equals b. */
+  uint bins[datasize][257] = {{0}};
+
+  /* Count number of elements per bin. */
+  for (const T &item : data) {
+    key_t key = getKey(item);
+    for (uint pass = 0; pass < datasize; pass++)
+      bins[pass][((key >> (8 * pass)) & 0xff) + 1]++;
+  }
+  /* Compute prefix sum to find position of each bin in the sorted array. Because counts were
+   * stored at index b + 1, bins[pass][b] becomes the first output slot for byte value b. */
+  for (uint pass = 0; pass < datasize; pass++) {
+    for (uint i = 2; i < 256; i++) {
+      bins[pass][i] += bins[pass][i - 1];
+    }
+  }
+  /* data2 is scratch space written at indices below data.size(); it must be at least as big. */
+  int shift = 0;
+  for (uint pass = 0; pass < datasize; pass++, shift += 8) {
+    /* Insert the elements in their correct location based on their bin. */
+    for (const T &item : data) {
+      uint pos = bins[pass][(getKey(item) >> shift) & 0xff]++;
+      data2[pos] = item;
+    }
+    /* Swap arrays. The number of passes is even (see the static_assert above), so the
+     * sorted result ends up back in `data`. */
+    std::swap(data, data2);
+  }
+}
+
+static void float_add_atomic(float *val, float add)
+{
+  /* Atomic floats only exist from C++20 onward, so the float is treated as a 32-bit integer.
+   * This relies on `std::atomic<uint32_t>` having the same in-memory layout as `uint32_t`,
+   * which holds in practice and lets a 32-bit CAS implement the addition without external
+   * libraries or compiler-specific builtins. */
+  auto *bits = reinterpret_cast<std::atomic<uint32_t> *>(val);
+  bool swapped = false;
+  /* Retry until no other thread changed the value between the load and the CAS. */
+  while (!swapped) {
+    uint32_t expected = bits->load();
+    const uint32_t desired = float_as_uint(uint_as_float(expected) + add);
+    swapped = bits->compare_exchange_weak(expected, desired);
+  }
+}
+
+}  // namespace mikk
diff --git a/third_party/mikktspace/mikktspace.hh b/third_party/mikktspace/mikktspace.hh
new file mode 100644
index 000000000..14f7b26a3
--- /dev/null
+++ b/third_party/mikktspace/mikktspace.hh
@@ -0,0 +1,824 @@
+/* SPDX-License-Identifier: Apache-2.0
+ *
+ * Original C code:
+ * Copyright 2011 by Morten S. Mikkelsen.
+ *
+ * C++ rewrite:
+ * Copyright 2022 Blender Foundation
+ */
+
+/** \file
+ * \ingroup mikktspace
+ */
+
+#include <algorithm>
+#include <cassert>
+
+#ifdef WITH_TBB
+#  include <tbb/parallel_for.h>
+#endif
+
+#include "mikk_atomic_hash_set.hh"
+#include "mikk_float3.hh"
+#include "mikk_util.hh"
+
+namespace mikk {
+
+static constexpr uint UNSET_ENTRY = 0xffffffffu; /* Marks an unassigned neighbor/group slot. */
+
+template<typename Mesh> class Mikktspace {
+  struct Triangle {
+    /* Triangle adjacent to each edge, consulted during group assignment. */
+    std::array<uint, 3> neighbor;
+    /* Group assigned to each of the three vertices. */
+    std::array<uint, 3> group;
+    /* Packed vertex indices forming this triangle. */
+    std::array<uint, 3> vertices;
+
+    /* Tangent computed for the face; later accumulated into a group. */
+    float3 tangent;
+
+    /* Face this triangle was created from. */
+    uint faceIdx;
+    /* Position of the first TSpace belonging to this triangle's vertices. */
+    uint tSpaceIdx;
+
+    /* For each triangle vertex, the corner of the original face it
+     * corresponds to (relevant for quads). */
+    std::array<uint8_t, 3> faceVertex;
+
+    // flags
+    bool markDegenerate : 1;
+    bool quadOneDegenTri : 1;
+    bool groupWithAny : 1;
+    bool orientPreserving : 1;
+
+    Triangle(uint faceIdx_, uint tSpaceIdx_)
+        : neighbor{{UNSET_ENTRY, UNSET_ENTRY, UNSET_ENTRY}},
+          group{{UNSET_ENTRY, UNSET_ENTRY, UNSET_ENTRY}},
+          tangent{0.0f},
+          faceIdx{faceIdx_},
+          tSpaceIdx{tSpaceIdx_},
+          markDegenerate{false},
+          quadOneDegenTri{false},
+          groupWithAny{true},
+          orientPreserving{false}
+    {
+    }
+
+    void setVertices(uint8_t i0, uint8_t i1, uint8_t i2)
+    {
+      const uint8_t corners[3] = {i0, i1, i2};
+      /* Record the face-local corner and the packed (face, corner) index side by side. */
+      for (uint k = 0; k < 3; k++) {
+        faceVertex[k] = corners[k];
+        vertices[k] = pack_index(faceIdx, corners[k]);
+      }
+    }
+  };
+
+  struct Group {
+    float3 tangent;
+    uint vertexRepresentative;
+    bool orientPreserving;
+    /* Starts with a zero tangent that triangle tangents are summed into. */
+    Group(uint vertexRepresentative_, bool orientPreserving_)
+        : tangent(0.0f),
+          vertexRepresentative(vertexRepresentative_),
+          orientPreserving(orientPreserving_)
+    {
+    }
+
+    void normalizeTSpace()
+    {
+      tangent = tangent.normalize();
+    }
+    /* Adds the tangent through float_add_atomic, one call per component. */
+    void accumulateTSpaceAtomic(float3 v_tangent)
+    {
+      float_add_atomic(&tangent.x, v_tangent.x);
+      float_add_atomic(&tangent.y, v_tangent.y);
+      float_add_atomic(&tangent.z, v_tangent.z);
+    }
+    /* Adds the tangent with plain, non-atomic float additions. */
+    void accumulateTSpace(float3 v_tangent)
+    {
+      tangent = tangent + v_tangent;
+    }
+  };
+
+  struct TSpace {
+    float3 tangent = float3(1.0f, 0.0f, 0.0f);
+    uint counter = 0;
+    bool orientPreserving = false;
+
+    void accumulateGroup(const Group &group)
+    {
+      assert(counter < 2);
+
+      const bool isFirstGroup = (counter == 0);
+      /* Averaging two identical tangents would still introduce a slight difference through
+       * floating point rounding, which results in tangent space splits later on. The
+       * stored tangent is therefore left untouched when the incoming tangent compares
+       * equal to it. */
+      if (isFirstGroup) {
+        tangent = group.tangent;
+      }
+      else if (!(tangent == group.tangent)) {
+        tangent = (tangent + group.tangent).normalize();
+      }
+
+      orientPreserving = group.orientPreserving;
+      counter++;
+    }
+  };
+
+  Mesh &mesh;
+
+  std::vector<Triangle> triangles;
+  std::vector<TSpace> tSpaces;
+  std::vector<Group> groups;
+
+  uint nrTSpaces, nrFaces, nrTriangles, totalTriangles;
+
+  int nrThreads;
+  bool isParallel;
+
+ public:
+  Mikktspace(Mesh &mesh_) : mesh(mesh_) {} /* Only stores the mesh reference; no work yet. */
+
+  void genTangSpace()
+  {
+    nrFaces = uint(mesh.GetNumFaces());
+    /* Threading is only used with TBB, more than one thread and more than 10000 faces. */
+#ifdef WITH_TBB
+    nrThreads = tbb::this_task_arena::max_concurrency();
+    isParallel = (nrThreads > 1) && (nrFaces > 10000);
+#else
+    nrThreads = 1;
+    isParallel = false;
+#endif
+
+    // make an initial triangle --> face index list (also sets nrTriangles and nrTSpaces)
+    generateInitialVerticesIndexList();
+
+    if (nrTriangles == 0) {
+      return; /* No triangles or quads in the mesh: nothing to compute or report. */
+    }
+
+    // make a welded index list of identical positions and attributes (pos, norm, texc)
+    generateSharedVerticesIndexList();
+
+    // mark all triangle pairs that belong to a quad with only one
+    // good triangle. These need special treatment in degenEpilogue().
+    // Additionally, move all good triangles to the start of
+    // triangles[] without changing order and
+    // put the degenerate triangles last.
+    degenPrologue();
+
+    if (nrTriangles == 0) {
+      // No non-degenerate triangles are left, so just use default-initialized tangent spaces
+      tSpaces.resize(nrTSpaces);
+    }
+    else {
+      // evaluate triangle level attributes and neighbor list
+      initTriangle();
+
+      // match up edge pairs
+      buildNeighbors();
+
+      // based on the 4 rules, identify groups based on connectivity
+      build4RuleGroups();
+
+      // make tspaces, each group is split up into subgroups.
+      // Finally a tangent space is made for every resulting subgroup
+      generateTSpaces();
+
+      // degenerate quads with one good triangle will be fixed by copying a space from
+      // the good triangle to the coinciding vertex.
+      // all other degenerate triangles will just copy a space from any good triangle
+      // with the same welded index in vertices[].
+      degenEpilogue();
+    }
+    /* Report results: tSpaces holds one entry per corner of each 3- or 4-sided face, in order. */
+    uint index = 0;
+    for (uint f = 0; f < nrFaces; f++) {
+      const uint verts = mesh.GetNumVerticesOfFace(f);
+      if (verts != 3 && verts != 4) {
+        continue;
+      }
+
+      // set data: consume tSpaces sequentially, one entry per face corner
+      for (uint i = 0; i < verts; i++) {
+        const TSpace &tSpace = tSpaces[index++];
+        mesh.SetTangentSpace(f, i, tSpace.tangent, tSpace.orientPreserving);
+      }
+    }
+  }
+
+ protected:
+  template<typename F> void runParallel(uint start, uint end, F func)
+  { /* Calls func(i) for every i in [start, end). */
+#ifdef WITH_TBB
+    if (isParallel) {
+      tbb::parallel_for(start, end, func);
+    }
+    else
+#endif
+    { /* Serial path: used without TBB or when isParallel is false. */
+      for (uint i = start; i < end; i++) {
+        func(i);
+      }
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+  float3 getPosition(uint vertexID)
+  {
+    /* Low 2 bits select the corner, the remaining bits select the face. */
+    uint corner = vertexID & 0x3, face = vertexID >> 2;
+    return mesh.GetPosition(face, corner);
+  }
+
+  float3 getNormal(uint vertexID)
+  {
+    /* Low 2 bits select the corner, the remaining bits select the face. */
+    uint corner = vertexID & 0x3, face = vertexID >> 2;
+    return mesh.GetNormal(face, corner);
+  }
+
+  float3 getTexCoord(uint vertexID)
+  {
+    /* Low 2 bits select the corner, the remaining bits select the face. */
+    uint corner = vertexID & 0x3, face = vertexID >> 2;
+    return mesh.GetTexCoord(face, corner);
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+  void generateInitialVerticesIndexList()
+  {
+    nrTriangles = 0;
+    for (uint f = 0; f < nrFaces; f++) {
+      const uint verts = mesh.GetNumVerticesOfFace(f);
+      if (verts == 3) {
+        nrTriangles += 1;
+      }
+      else if (verts == 4) {
+        nrTriangles += 2;
+      }
+    }
+    /* Reserving up front also keeps the triA/triB references below valid across emplace_back. */
+    triangles.reserve(nrTriangles);
+    /* Create one Triangle per triangle face and two per quad; other polygons are skipped. */
+    nrTSpaces = 0;
+    for (uint f = 0; f < nrFaces; f++) {
+      const uint verts = mesh.GetNumVerticesOfFace(f);
+      if (verts != 3 && verts != 4)
+        continue;
+
+      uint tA = uint(triangles.size());
+      triangles.emplace_back(f, nrTSpaces);
+      Triangle &triA = triangles[tA];
+
+      if (verts == 3) {
+        triA.setVertices(0, 1, 2);
+      }
+      else {
+        uint tB = uint(triangles.size());
+        triangles.emplace_back(f, nrTSpaces);
+        Triangle &triB = triangles[tB];
+
+        // need an order independent way to evaluate tspace on quads. This is done by
+        // splitting along the shortest diagonal in texture space; if both UV diagonals
+        // have equal length, the diagonals in position space decide instead.
+        float distSQ_02 = (mesh.GetTexCoord(f, 2) - mesh.GetTexCoord(f, 0)).length_squared();
+        float distSQ_13 = (mesh.GetTexCoord(f, 3) - mesh.GetTexCoord(f, 1)).length_squared();
+        bool quadDiagIs_02;
+        if (distSQ_02 != distSQ_13)
+          quadDiagIs_02 = (distSQ_02 < distSQ_13);
+        else {
+          distSQ_02 = (mesh.GetPosition(f, 2) - mesh.GetPosition(f, 0)).length_squared();
+          distSQ_13 = (mesh.GetPosition(f, 3) - mesh.GetPosition(f, 1)).length_squared();
+          quadDiagIs_02 = !(distSQ_13 < distSQ_02);
+        }
+
+        if (quadDiagIs_02) {
+          triA.setVertices(0, 1, 2);
+          triB.setVertices(0, 2, 3);
+        }
+        else {
+          triA.setVertices(0, 1, 3);
+          triB.setVertices(1, 2, 3);
+        }
+      }
+      /* Every face corner gets its own TSpace slot; both triangles of a quad share one base. */
+      nrTSpaces += verts;
+    }
+  }
+
+  struct VertexHash {
+    Mikktspace<Mesh> *mikk; /* Used to look up the attributes behind a packed vertex index. */
+    inline uint operator()(const uint &k) const
+    {
+      return hash_float3x3(mikk->getPosition(k), mikk->getNormal(k), mikk->getTexCoord(k));
+    }
+  };
+
+  struct VertexEqual {
+    Mikktspace<Mesh> *mikk;
+    inline bool operator()(const uint &kA, const uint &kB) const
+    {
+      const bool sameTexCoord = (mikk->getTexCoord(kA) == mikk->getTexCoord(kB));
+      const bool sameNormal = sameTexCoord && (mikk->getNormal(kA) == mikk->getNormal(kB));
+      return sameNormal && (mikk->getPosition(kA) == mikk->getPosition(kB));
+    }
+  };
+
+  /* Merge identical vertices.
+   * To find vertices with identical position, normal and texcoord, every triangle corner is
+   * inserted into a hash set that is keyed on a hash of the 9 values. Hash collisions are
+   * resolved by the set's equality check, which compares the actual attributes. If an equal
+   * vertex is already in the set, the corner is redirected to it, merging the duplicates.
+   */
+  template<bool isAtomic> void generateSharedVerticesIndexList_impl()
+  {
+    const uint cornerCount = nrTriangles * 3;
+    AtomicHashSet<uint, isAtomic, VertexHash, VertexEqual> welded(cornerCount, {this}, {this});
+    runParallel(0u, nrTriangles, [&](uint t) {
+      for (uint &vertex : triangles[t].vertices) {
+        const auto found = welded.emplace(vertex);
+        if (!found.second) {
+          vertex = found.first; /* An equal vertex is already stored: reuse its index. */
+        }
+      }
+    });
+  }
+  void generateSharedVerticesIndexList()
+  {
+    /* Pick the set variant with atomic cells only when the work is run in parallel. */
+    if (!isParallel) {
+      generateSharedVerticesIndexList_impl<false>();
+      return;
+    }
+    generateSharedVerticesIndexList_impl<true>();
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  /////////////////////////////////// Degenerate triangles ////////////////////////////////////
+
+  void degenPrologue()
+  {
+    // Mark all degenerate triangles (at least two corners at exactly the same position)
+    totalTriangles = nrTriangles;
+    std::atomic<uint> degenTriangles(0);
+    runParallel(0u, totalTriangles, [&](uint t) {
+      const float3 p0 = getPosition(triangles[t].vertices[0]);
+      const float3 p1 = getPosition(triangles[t].vertices[1]);
+      const float3 p2 = getPosition(triangles[t].vertices[2]);
+      if (p0 == p1 || p0 == p2 || p1 == p2)  // degenerate
+      {
+        triangles[t].markDegenerate = true;
+        degenTriangles.fetch_add(1);
+      }
+    });
+    nrTriangles -= degenTriangles.load();
+    /* From here on nrTriangles counts only the non-degenerate triangles. */
+    if (totalTriangles == nrTriangles) {
+      return;
+    }
+
+    // locate quads with only one good triangle (both halves of the quad get flagged)
+    runParallel(0u, totalTriangles - 1, [&](uint t) {
+      Triangle &triangleA = triangles[t], &triangleB = triangles[t + 1];
+      if (triangleA.faceIdx != triangleB.faceIdx) {
+        /* Individual triangle, skip. */
+        return;
+      }
+      if (triangleA.markDegenerate != triangleB.markDegenerate) {
+        triangleA.quadOneDegenTri = true;
+        triangleB.quadOneDegenTri = true;
+      }
+    });
+    /* Move good triangles to the front, keeping relative order; degenerate ones go last. */
+    std::stable_partition(triangles.begin(), triangles.end(), [](const Triangle &tri) {
+      return !tri.markDegenerate;
+    });
+  }
+
+  void degenEpilogue()
+  {
+    if (nrTriangles == totalTriangles) {
+      return;
+    }
+    /* Map each welded vertex index to one (triangle, corner) of a good triangle that uses it. */
+    std::unordered_map<uint, uint> goodTriangleMap;
+    for (uint t = 0; t < nrTriangles; t++) {
+      for (uint i = 0; i < 3; i++) {
+        goodTriangleMap.emplace(triangles[t].vertices[i], pack_index(t, i));
+      }
+    }
+
+    // deal with degenerate triangles
+    // punishment for degenerate triangles is O(nrTriangles) extra memory.
+    for (uint t = nrTriangles; t < totalTriangles; t++) {
+      // degenerate triangles on a quad with one good triangle are skipped
+      // here but processed in the next loop
+      if (triangles[t].quadOneDegenTri) {
+        continue;
+      }
+
+      for (uint i = 0; i < 3; i++) {
+        const auto entry = goodTriangleMap.find(triangles[t].vertices[i]);
+        if (entry == goodTriangleMap.end()) {
+          // Matching vertex from good triangle is not found.
+          continue;
+        }
+
+        uint tSrc, iSrc;
+        unpack_index(tSrc, iSrc, entry->second);
+        const uint iSrcVert = triangles[tSrc].faceVertex[iSrc];
+        const uint iSrcOffs = triangles[tSrc].tSpaceIdx;
+        const uint iDstVert = triangles[t].faceVertex[i];
+        const uint iDstOffs = triangles[t].tSpaceIdx;
+        // copy tspace from the good triangle's corner to the degenerate triangle's corner
+        tSpaces[iDstOffs + iDstVert] = tSpaces[iSrcOffs + iSrcVert];
+      }
+    }
+
+    // deal with degenerate quads with one good triangle
+    for (uint t = 0; t < nrTriangles; t++) {
+      // this triangle belongs to a quad where the
+      // other triangle is degenerate
+      if (!triangles[t].quadOneDegenTri) {
+        continue;
+      }
+      uint vertFlag = (1u << triangles[t].faceVertex[0]) | (1u << triangles[t].faceVertex[1]) |
+                      (1u << triangles[t].faceVertex[2]);
+      uint missingFaceVertex = 0; /* Stays 0 when corners 1, 2 and 3 are all used. */
+      if ((vertFlag & 2) == 0)
+        missingFaceVertex = 1;
+      else if ((vertFlag & 4) == 0)
+        missingFaceVertex = 2;
+      else if ((vertFlag & 8) == 0)
+        missingFaceVertex = 3;
+      /* Copy the tspace of the good corner that has the same position as the unused corner. */
+      uint faceIdx = triangles[t].faceIdx;
+      float3 dstP = mesh.GetPosition(faceIdx, missingFaceVertex);
+      bool found = false;
+      for (uint i = 0; i < 3; i++) {
+        const uint faceVertex = triangles[t].faceVertex[i];
+        const float3 srcP = mesh.GetPosition(faceIdx, faceVertex);
+        if (srcP == dstP) {
+          const uint offset = triangles[t].tSpaceIdx;
+          tSpaces[offset + missingFaceVertex] = tSpaces[offset + faceVertex];
+          found = true;
+          break;
+        }
+      }
+      assert(found);
+      (void)found; /* Keeps `found` referenced when assert() compiles to nothing. */
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // returns twice the (unsigned) area of the triangle in texture space
+  float calcTexArea(uint tri)
+  {
+    const Triangle &triangle = triangles[tri];
+    const float3 uv0 = getTexCoord(triangle.vertices[0]);
+    const float3 uv1 = getTexCoord(triangle.vertices[1]);
+    const float3 uv2 = getTexCoord(triangle.vertices[2]);
+    /* Edge vectors in UV space, both starting at the first corner. */
+    const float3 edge1 = uv1 - uv0;
+    const float3 edge2 = uv2 - uv0;
+    /* The 2D cross product of the two edges is the signed doubled area. */
+    const float signedAreaSTx2 = edge1.x * edge2.y - edge1.y * edge2.x;
+    return fabsf(signedAreaSTx2);
+  }
+
+
+  void initTriangle()
+  {
+    // The Triangle flags are initialized by its constructor when the triangles are created
+    // in generateInitialVerticesIndexList(), which is called before this function.
+
+    // evaluate first order derivatives
+    runParallel(0u, nrTriangles, [&](uint t) {
+      Triangle &triangle = triangles[t];
+
+      // initial values
+      const float3 v1 = getPosition(triangle.vertices[0]);
+      const float3 v2 = getPosition(triangle.vertices[1]);
+      const float3 v3 = getPosition(triangle.vertices[2]);
+      const float3 t1 = getTexCoord(triangle.vertices[0]);
+      const float3 t2 = getTexCoord(triangle.vertices[1]);
+      const float3 t3 = getTexCoord(triangle.vertices[2]);
+
+      const float t21x = t2.x - t1.x;
+      const float t21y = t2.y - t1.y;
+      const float t31x = t3.x - t1.x;
+      const float t31y = t3.y - t1.y;
+      const float3 d1 = v2 - v1, d2 = v3 - v1;
+
+      const float signedAreaSTx2 = t21x * t31y - t21y * t31x;
+      const float3 vOs = (t31y * d1) - (t21y * d2);   // eq 18
+      const float3 vOt = (-t31x * d1) + (t21x * d2);  // eq 19
+      /* A positive signed UV area is what marks the triangle as orientation preserving. */
+      triangle.orientPreserving = (signedAreaSTx2 > 0);
+
+      if (not_zero(signedAreaSTx2)) {
+        const float lenOs2 = vOs.length_squared();
+        const float lenOt2 = vOt.length_squared();
+        const float fS = triangle.orientPreserving ? 1.0f : (-1.0f);
+        if (not_zero(lenOs2))
+          triangle.tangent = vOs * (fS / sqrtf(lenOs2));
+
+        // if this is a good triangle
+        if (not_zero(lenOs2) && not_zero(lenOt2))
+          triangle.groupWithAny = false;
+      }
+    });
+
+    // force otherwise healthy quads to a fixed orientation
+    runParallel(0u, nrTriangles - 1, [&](uint t) {
+      Triangle &triangleA = triangles[t], &triangleB = triangles[t + 1];
+      if (triangleA.faceIdx != triangleB.faceIdx) {
+        // this is not a quad
+        return;
+      }
+
+      // bad triangles should already have been removed by
+      // degenPrologue(), but just in case check that neither are degenerate
+      if (!(triangleA.markDegenerate || triangleB.markDegenerate)) {
+        // if this happens the quad has extremely bad mapping!!
+        if (triangleA.orientPreserving != triangleB.orientPreserving) {
+          bool chooseOrientFirstTri = false;
+          if (triangleB.groupWithAny)
+            chooseOrientFirstTri = true;
+          else if (calcTexArea(t) >= calcTexArea(t + 1))
+            chooseOrientFirstTri = true;
+
+          // force match: the chosen triangle's orientation is copied onto the other one
+          const uint t0 = chooseOrientFirstTri ? t : (t + 1);
+          const uint t1 = chooseOrientFirstTri ? (t + 1) : t;
+          triangles[t1].orientPreserving = triangles[t0].orientPreserving;
+        }
+      }
+    });
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  /////////////////////////////////////////// Edges ///////////////////////////////////////////
+
+  /* One bucket of triangle edges for the neighbor search. Edges are matched
+   * against each other only inside a shard, never across shards. */
+  struct NeighborShard {
+    /* A triangle edge: `key` is the value the entries get sorted and compared by,
+     * `data` is the packed (triangle, edge) index decoded with unpack_index(). */
+    struct Entry {
+      Entry(uint32_t key_, uint data_) : key(key_), data(data_) {}
+      uint key, data;
+    };
+    std::vector<Entry> entries;
+
+    /* Creates an empty shard with room reserved for `capacity` entries. */
+    NeighborShard(size_t capacity)
+    {
+      entries.reserve(capacity);
+    }
+
+    /* Sorts the entries by key, then links every still unlinked edge with the first
+     * later edge of equal key that runs between the same two vertices in the
+     * opposite direction. Results are written to `neighbor` of both triangles. */
+    void buildNeighbors(Mikktspace<Mesh> *mikk)
+    {
+      /* Entries are appended while iterating over t, so with a stable sort there
+       * is no need to additionally order them by t. */
+      {
+        std::vector<Entry> scratch(entries.size(), Entry(0, 0));
+        radixsort(entries, scratch, [](const Entry &e) { return e.key; });
+      }
+
+      for (uint first = 0; first < entries.size(); first++) {
+        const Entry &entryA = entries[first];
+        uint tA, iA;
+        unpack_index(tA, iA, entryA.data);
+        Mikktspace<Mesh>::Triangle &triA = mikk->triangles[tA];
+
+        // this edge was already paired up by an earlier entry
+        if (triA.neighbor[iA] != UNSET_ENTRY) {
+          continue;
+        }
+
+        const uint i0A = triA.vertices[iA];
+        const uint i1A = triA.vertices[(iA == 2) ? 0 : (iA + 1)];
+
+        // candidates are the directly following entries that share the key
+        for (uint second = first + 1;
+             second < entries.size() && entries[second].key == entryA.key;
+             second++) {
+          uint tB, iB;
+          unpack_index(tB, iB, entries[second].data);
+          Mikktspace<Mesh>::Triangle &triB = mikk->triangles[tB];
+
+          if (triB.neighbor[iB] != UNSET_ENTRY) {
+            continue;
+          }
+
+          // the other edge is read in reverse direction
+          const uint i1B = triB.vertices[iB];
+          const uint i0B = triB.vertices[(iB == 2) ? 0 : (iB + 1)];
+          if (i0A != i0B || i1A != i1B) {
+            continue;
+          }
+
+          triA.neighbor[iA] = tB;
+          triB.neighbor[iB] = tA;
+          break;
+        }
+      }
+    }
+  };
+
+  /* Fills the `neighbor` slots of all triangles: every triangle edge is hashed by its
+   * two vertex indices, distributed into shards by that hash, and each shard then
+   * matches its edges via NeighborShard::buildNeighbors(). */
+  void buildNeighbors()
+  {
+    /* In order to parallelize the processing, we divide the vertices into shards.
+     * Since only vertex pairs with the same key will be checked, we can process
+     * shards independently as long as we ensure that all vertices with the same
+     * key go into the same shard.
+     * This is done by hashing the key to get the shard index of each vertex.
+     */
+    // TODO: Two-step filling that first counts and then fills? Could be parallel then.
+    const uint targetNrShards = isParallel ? uint(4 * nrThreads) : 1;
+    /* nrShards is the smallest power of two >= targetNrShards, hashShift selects the
+     * matching number of top bits of a 32 bit hash. */
+    uint nrShards = 1, hashShift = 32;
+    while (nrShards < targetNrShards) {
+      nrShards *= 2;
+      hashShift -= 1;
+    }
+
+    /* Reserve 25% extra to account for variation due to hashing.
+     * Computed in double so that 3 * nrTriangles cannot wrap around. */
+    const size_t reserveSize = size_t(3.0 * double(nrTriangles) * 1.25 / nrShards);
+
+    /* Construct every shard in place. Filling the vector with copies of one prototype
+     * shard would not keep the reservation, since a copied std::vector is not required
+     * to preserve the capacity of its source. */
+    std::vector<NeighborShard> shards;
+    shards.reserve(nrShards);
+    for (uint s = 0; s < nrShards; s++) {
+      shards.emplace_back(reserveSize);
+    }
+
+    for (uint t = 0; t < nrTriangles; t++) {
+      Triangle &triangle = triangles[t];
+      for (uint i = 0; i < 3; i++) {
+        const uint i0 = triangle.vertices[i];
+        const uint i1 = triangle.vertices[(i != 2) ? (i + 1) : 0];
+        /* Order the pair so that both directions of an edge produce the same hash. */
+        const uint high = std::max(i0, i1), low = std::min(i0, i1);
+        const uint hash = hash_uint3(high, low, 0);
+        /* TODO: Reusing the hash here means less hash space inside each shard.
+         * Computing a second hash with a different seed is probably not worth it? */
+        /* With a single shard hashShift is still 32; shifting a 32 bit value by 32 is
+         * undefined, so only shift when there is more than one shard. */
+        const uint shard = (nrShards > 1) ? (hash >> hashShift) : 0;
+        shards[shard].entries.emplace_back(hash, pack_index(t, i));
+      }
+    }
+
+    runParallel(0u, nrShards, [&](uint s) { shards[s].buildNeighbors(this); });
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+  /* Tries to add triangle `t` to group `groupId` at the corner that uses the group's
+   * representative vertex, then continues with the triangles across the two edges
+   * touching that corner. Stops at unset neighbors, at corners that already have a
+   * group and at triangles whose orientation differs from the group's. */
+  void assignRecur(const uint t, uint groupId)
+  {
+    if (t == UNSET_ENTRY) {
+      return;
+    }
+
+    Triangle &tri = triangles[t];
+    Group &grp = groups[groupId];
+
+    // find the corner of this triangle that refers to the group's vertex
+    uint corner = 3;
+    for (uint c = 0; c < 3; c++) {
+      if (tri.vertices[c] == grp.vertexRepresentative) {
+        corner = c;
+        break;
+      }
+    }
+    assert(corner < 3);
+
+    // corner already has a group, nothing left to do
+    if (tri.group[corner] != UNSET_ENTRY) {
+      return;
+    }
+
+    // A group-with-anything triangle takes over the orientation of the first group
+    // that reaches it, i.e. as long as none of its corners has a group yet.
+    // This is the only existing order dependency in the code!!
+    const bool noCornerGrouped = (tri.group[0] == UNSET_ENTRY) &&
+                                 (tri.group[1] == UNSET_ENTRY) &&
+                                 (tri.group[2] == UNSET_ENTRY);
+    if (tri.groupWithAny && noCornerGrouped) {
+      tri.orientPreserving = grp.orientPreserving;
+    }
+
+    // triangles of opposite orientation never share a group
+    if (tri.orientPreserving != grp.orientPreserving) {
+      return;
+    }
+
+    tri.group[corner] = groupId;
+
+    // continue across the edge starting at this corner, then the edge ending at it
+    const uint prevEdge = (corner == 0) ? 2 : (corner - 1);
+    const uint t_L = tri.neighbor[corner];
+    const uint t_R = tri.neighbor[prevEdge];
+    assignRecur(t_L, groupId);
+    assignRecur(t_R, groupId);
+  }
+
+  /* Creates the groups: every corner of a non group-with-anything triangle that has
+   * no group yet opens a new one, which assignRecur() then extends to the triangles
+   * across the two edges touching that corner. */
+  void build4RuleGroups()
+  {
+    /* NOTE: This could be parallelized by grouping all [t, i] pairs into
+     * shards by hash(triangles[t].vertices[i]). This way, each shard can be processed
+     * independently and in parallel.
+     * However, the `groupWithAny` logic needs special handling (e.g. lock a mutex when
+     * encountering a `groupWithAny` triangle, then sort it out, then unlock and proceed). */
+    for (uint t = 0; t < nrTriangles; t++) {
+      Triangle &tri = triangles[t];
+      for (uint corner = 0; corner < 3; corner++) {
+        // group-with-anything triangles never open a group themselves
+        if (tri.groupWithAny) {
+          continue;
+        }
+        // corner was already claimed by a previously created group
+        if (tri.group[corner] != UNSET_ENTRY) {
+          continue;
+        }
+
+        // open a new group represented by this corner's vertex and orientation
+        const uint newGroupId = uint(groups.size());
+        tri.group[corner] = newGroupId;
+        groups.emplace_back(tri.vertices[corner], bool(tri.orientPreserving));
+
+        // extend the group across both edges that touch this corner
+        const uint prevEdge = (corner == 0) ? 2 : (corner - 1);
+        const uint t_L = tri.neighbor[corner];
+        const uint t_R = tri.neighbor[prevEdge];
+        assignRecur(t_L, newGroupId);
+        assignRecur(t_R, newGroupId);
+      }
+    }
+  }
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+  /* Adds the weighted tangent of triangle `t` to the group of each of its corners.
+   * With `atomic` set the group's accumulateTSpaceAtomic() is used, otherwise
+   * accumulateTSpace(). */
+  template<bool atomic> void accumulateTSpaces(uint t)
+  {
+    const Triangle &tri = triangles[t];
+    // group-with-anything triangles do not contribute
+    if (tri.groupWithAny) {
+      return;
+    }
+
+    /* Todo: Vectorize?
+     * Also: Could add special case for flat shading, when all normals are equal half of the fCos
+     * projections and two of the three tangent projections are unnecessary. */
+    std::array<float3, 3> n, p;
+    for (uint c = 0; c < 3; c++) {
+      const uint vert = tri.vertices[c];
+      n[c] = getNormal(vert);
+      p[c] = getPosition(vert);
+    }
+
+    // per corner: dot product of the two adjacent edges, each passed through project()
+    std::array<float, 3> fCos;
+    for (uint c = 0; c < 3; c++) {
+      const uint next = (c + 1) % 3;
+      const uint prev = (c + 2) % 3;
+      fCos[c] = dot(project(n[c], p[next] - p[c]), project(n[c], p[prev] - p[c]));
+    }
+
+    for (uint c = 0; c < 3; c++) {
+      const uint groupId = tri.group[c];
+      if (groupId == UNSET_ENTRY) {
+        continue;
+      }
+
+      // the clamp keeps the argument inside [-1, 1] before it is handed to fast_acosf()
+      float3 tangent = project(n[c], tri.tangent) *
+                       fast_acosf(std::clamp(fCos[c], -1.0f, 1.0f));
+      if constexpr (atomic) {
+        groups[groupId].accumulateTSpaceAtomic(tangent);
+      }
+      else {
+        groups[groupId].accumulateTSpace(tangent);
+      }
+    }
+  }
+
+  /* Produces the final tangent spaces in three steps: accumulate every triangle's
+   * contribution into its groups (in parallel when `isParallel` is set), call
+   * normalizeTSpace() on every group, then feed each corner's group into the
+   * `tSpaces` entry of the face vertex it belongs to. */
+  void generateTSpaces()
+  {
+    if (isParallel) {
+      /* Several triangles may write to the same group concurrently, hence the atomic variant. */
+      runParallel(0u, nrTriangles, [&](uint t) { accumulateTSpaces<true>(t); });
+    }
+    else {
+      for (uint t = 0; t < nrTriangles; t++) {
+        accumulateTSpaces<false>(t);
+      }
+    }
+
+    /* TODO: Worth parallelizing? Probably not. */
+    for (Group &group : groups) {
+      group.normalizeTSpace();
+    }
+
+    tSpaces.resize(nrTSpaces);
+
+    for (uint t = 0; t < nrTriangles; t++) {
+      Triangle &triangle = triangles[t];
+      for (uint i = 0; i < 3; i++) {
+        uint groupId = triangle.group[i];
+        if (groupId == UNSET_ENTRY) {
+          continue;
+        }
+        /* Bind by reference: `groups` is not modified in this loop, so there is no
+         * need to copy the group for every triangle corner. */
+        const Group &group = groups[groupId];
+        assert(triangle.orientPreserving == group.orientPreserving);
+
+        // output tspace
+        const uint offset = triangle.tSpaceIdx;
+        const uint faceVertex = triangle.faceVertex[i];
+        tSpaces[offset + faceVertex].accumulateGroup(group);
+      }
+    }
+  }
+};
+
+}  // namespace mikk