From 388f48cb09bee6a734bc752c3d44d0c093e0410e Mon Sep 17 00:00:00 2001
From: Thomas Barlow
Date: Wed, 25 Jan 2023 03:40:32 +0000
Subject: [PATCH] FBX Export: Base patch for numpy speedup

This is a base patch that all other separate numpy patches to the FBX
exporter rely on.

Add support for writing bytes from numpy arrays like the already
supported Python arrays.

Add numpy and helper function imports to fbx_utils.py and
export_fbx_bin.py to simplify subsequent patches.

Add astype_view_signedness utility function for viewing unsigned
integer data as signed only when the itemsizes match, to avoid copying
arrays unnecessarily with `numpy.ndarray.astype(new_type, copy=False)`.

Add numpy versions of the vcos_transformed_gen and nors_transformed_gen
mesh transform helpers. 4d output is supported following comments in
nors_transformed_gen, though it remains unused.

Given tests of 1000 to 200000 vectors:
The most common use case is where the matrix is None (when the
bake_space_transform option of the exporter is disabled, which is the
default); this case is ~44-105 times faster.
When bake_space_transform is enabled, geom_mat_co is usually a matrix
containing only scaling; this case is ~14-65 times faster.
When bake_space_transform is enabled, geom_mat_no is usually the
identity matrix; this case is ~18-170 times faster.

Add helper functions for performing faster uniqueness along the first
axis when a sorted result is not needed. The sorting part of
numpy.unique is often what takes the most time in the subsequent
patches. These helpers can run numpy.unique many times faster when the
second axis of an array has more than one element, because viewing each
row as a single element of a larger dtype makes every comparison during
the sort a single scalar comparison instead of an element-by-element
comparison of two rows.

This patch on its own makes no change to exported files.
---
 io_scene_fbx/encode_bin.py     |  92 +++++++----
 io_scene_fbx/export_fbx_bin.py |   5 +-
 io_scene_fbx/fbx_utils.py      | 241 +++++++++++++++++++++++++++++++++
 3 files changed, 307 insertions(+), 31 deletions(-)

diff --git a/io_scene_fbx/encode_bin.py b/io_scene_fbx/encode_bin.py
index 516221e66..8433134a7 100644
--- a/io_scene_fbx/encode_bin.py
+++ b/io_scene_fbx/encode_bin.py
@@ -9,6 +9,7 @@ except:
 
 from struct import pack
 import array
+import numpy as np
 import zlib
 
 _BLOCK_SENTINEL_LENGTH = 13
@@ -112,17 +113,7 @@ class FBXElem:
         self.props_type.append(data_types.STRING)
         self.props.append(data)
 
-    def _add_array_helper(self, data, array_type, prop_type):
-        assert(isinstance(data, array.array))
-        assert(data.typecode == array_type)
-
-        length = len(data)
-
-        if _IS_BIG_ENDIAN:
-            data = data[:]
-            data.byteswap()
-        data = data.tobytes()
-
+    def _add_array_helper(self, data, prop_type, length):
         # mimic behavior of fbxconverter (also common sense)
         # we could make this configurable.
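+        # FBX array properties are written with an encoding flag: 0 means the raw little-endian bytes are
+        # stored as-is, 1 means they are zlib-deflate compressed first. The 128-byte threshold below mirrors
+        # fbxconverter's behaviour.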
         encoding = 0 if len(data) <= 128 else 1
@@ -138,35 +129,78 @@ class FBXElem:
         self.props_type.append(prop_type)
         self.props.append(data)
 
+    def _add_parray_helper(self, data, array_type, prop_type):
+        assert(isinstance(data, array.array))
+        assert(data.typecode == array_type)
+
+        length = len(data)
+
+        if _IS_BIG_ENDIAN:
+            data = data[:]
+            data.byteswap()
+        data = data.tobytes()
+
+        self._add_array_helper(data, prop_type, length)
+
+    def _add_ndarray_helper(self, data, dtype, prop_type):
+        assert(isinstance(data, np.ndarray))
+        assert(data.dtype == dtype)
+
+        length = data.size
+
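+        # FBX stores array data little-endian, so on a big-endian machine an array whose dtype is in native
+        # (big-endian) byte order has to be byteswapped before conversion to bytes; an array already in
+        # non-native (little-endian) order can be written as-is.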
+        if _IS_BIG_ENDIAN and data.dtype.isnative:
+            data = data.byteswap()
+        data = data.tobytes()
+
+        self._add_array_helper(data, prop_type, length)
+
     def add_int32_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_INT32, data)
-        self._add_array_helper(data, data_types.ARRAY_INT32, data_types.INT32_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, np.int32, data_types.INT32_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_INT32, data)
+            self._add_parray_helper(data, data_types.ARRAY_INT32, data_types.INT32_ARRAY)
 
     def add_int64_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_INT64, data)
-        self._add_array_helper(data, data_types.ARRAY_INT64, data_types.INT64_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, np.int64, data_types.INT64_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_INT64, data)
+            self._add_parray_helper(data, data_types.ARRAY_INT64, data_types.INT64_ARRAY)
 
     def add_float32_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_FLOAT32, data)
-        self._add_array_helper(data, data_types.ARRAY_FLOAT32, data_types.FLOAT32_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, np.float32, data_types.FLOAT32_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_FLOAT32, data)
+            self._add_parray_helper(data, data_types.ARRAY_FLOAT32, data_types.FLOAT32_ARRAY)
 
     def add_float64_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_FLOAT64, data)
-        self._add_array_helper(data, data_types.ARRAY_FLOAT64, data_types.FLOAT64_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, np.float64, data_types.FLOAT64_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_FLOAT64, data)
+            self._add_parray_helper(data, data_types.ARRAY_FLOAT64, data_types.FLOAT64_ARRAY)
 
     def add_bool_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_BOOL, data)
-        self._add_array_helper(data, data_types.ARRAY_BOOL, data_types.BOOL_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, bool, data_types.BOOL_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_BOOL, data)
+            self._add_parray_helper(data, data_types.ARRAY_BOOL, data_types.BOOL_ARRAY)
 
     def add_byte_array(self, data):
-        if not isinstance(data, array.array):
-            data = array.array(data_types.ARRAY_BYTE, data)
-        self._add_array_helper(data, data_types.ARRAY_BYTE, data_types.BYTE_ARRAY)
+        if isinstance(data, np.ndarray):
+            self._add_ndarray_helper(data, np.byte, data_types.BYTE_ARRAY)
+        else:
+            if not isinstance(data, array.array):
+                data = array.array(data_types.ARRAY_BYTE, data)
+            self._add_parray_helper(data, data_types.ARRAY_BYTE, data_types.BYTE_ARRAY)
 
 # -------------------------
 # internal helper functions
diff --git a/io_scene_fbx/export_fbx_bin.py b/io_scene_fbx/export_fbx_bin.py
index c5cd93a7c..f600c7cfd 100644
--- a/io_scene_fbx/export_fbx_bin.py
+++ b/io_scene_fbx/export_fbx_bin.py
@@ -6,6 +6,7 @@
 import array
 import datetime
 import math
+import numpy as np
 import os
 import time
 
@@ -46,9 +47,9 @@ from .fbx_utils import (
     # Miscellaneous utils.
     PerfMon,
     units_blender_to_fbx_factor, units_convertor, units_convertor_iter,
-    matrix4_to_array, similar_values, similar_values_iter,
+    matrix4_to_array, similar_values, similar_values_iter, astype_view_signedness, fast_first_axis_unique,
     # Mesh transform helpers.
-    vcos_transformed_gen, nors_transformed_gen,
+    vcos_transformed_gen, nors_transformed_gen, vcos_transformed, nors_transformed,
     # UUID from key.
     get_fbx_uuid_from_key,
     # Key generators.
diff --git a/io_scene_fbx/fbx_utils.py b/io_scene_fbx/fbx_utils.py
index e52cd9eb1..9698a3609 100644
--- a/io_scene_fbx/fbx_utils.py
+++ b/io_scene_fbx/fbx_utils.py
@@ -9,6 +9,7 @@ import time
 from collections import namedtuple
 from collections.abc import Iterable
 from itertools import zip_longest, chain
+import numpy as np
 
 import bpy
 import bpy_extras
@@ -272,6 +273,246 @@ def nors_transformed_gen(raw_nors, m=None):
     return gen if m is None else (m @ Vector(v) for v in gen)
 
 
+def _mat4_vec3_array_multiply(mat4, vec3_array, dtype=None, return_4d=False):
+    """Multiply a 4d matrix by each 3d vector in an array and return as an array of either 3d or 4d vectors.
+
+    A view of the input array is returned if return_4d=False, the dtype matches the input array and either the
+    matrix is None or, ignoring the last row, is a 3x3 identity matrix with no translation:
+    ┌1, 0, 0, 0┐
+    │0, 1, 0, 0│
+    └0, 0, 1, 0┘
+
+    When dtype=None, it defaults to the dtype of the input array."""
+    return_dtype = dtype if dtype is not None else vec3_array.dtype
+    vec3_array = vec3_array.reshape(-1, 3)
+
+    # Multiplying a 4d mathutils.Matrix by a 3d mathutils.Vector implicitly extends the Vector to 4d during the
+    # calculation by appending 1.0 to the Vector and then the 4d result is truncated back to 3d.
+    # Numpy does not do an implicit extension to 4d, so it would have to be done explicitly by extending the
+    # entire vec3_array to 4d.
+    # However, since the w component of the vectors is always 1.0, the last column can be excluded from the
+    # multiplication and then added to every multiplied vector afterwards, which avoids having to make a 4d
+    # copy of vec3_array beforehand.
+    # For a single column vector:
+    # ┌a, b, c, d┐   ┌x┐   ┌ax+by+cz+d┐
+    # │e, f, g, h│ @ │y│ = │ex+fy+gz+h│
+    # │i, j, k, l│   │z│   │ix+jy+kz+l│
+    # └m, n, o, p┘   └1┘   └mx+ny+oz+p┘
+    #
+    # ┌a, b, c┐   ┌x┐   ┌d┐   ┌ax+by+cz┐   ┌d┐   ┌ax+by+cz+d┐
+    # │e, f, g│ @ │y│ + │h│ = │ex+fy+gz│ + │h│ = │ex+fy+gz+h│
+    # │i, j, k│   └z┘   │l│   │ix+jy+kz│   │l│   │ix+jy+kz+l│
+    # └m, n, o┘         └p┘   └mx+ny+oz┘   └p┘   └mx+ny+oz+p┘
+
+    # column_vector_multiplication in mathutils_Vector.c uses double precision math for Matrix @ Vector by
+    # casting the matrix's values to double precision and then casts back to single precision when returning
+    # the result, so at least double precision math is always used to match standard Blender behaviour.
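+    # np.result_type applies numpy's type promotion rules, so single precision input is promoted to double
+    # precision for the math (e.g. np.result_type(np.double, np.float32) is float64), while input that is
+    # already double precision or higher keeps its own dtype.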
+    math_precision = np.result_type(np.double, vec3_array)
+
+    to_multiply = None
+    to_add = None
+    w_to_set = 1.0
+    if mat4 is not None:
+        mat_np = np.array(mat4, dtype=math_precision)
+        # Identity matrix is compared against to check if any matrix multiplication is required.
+        identity = np.identity(4, dtype=math_precision)
+        if not return_4d:
+            # If returning 3d, the entire last row of the matrix can be ignored because it only affects the w
+            # component.
+            mat_np = mat_np[:3]
+            identity = identity[:3]
+
+        # Split mat_np into the columns to multiply and the column to add afterwards.
+        # First 3 columns
+        multiply_columns = mat_np[:, :3]
+        multiply_identity = identity[:, :3]
+        # Last column only
+        add_column = mat_np.T[3]
+
+        # Analyze the split parts of the matrix to figure out if there is anything to multiply and anything to
+        # add.
+        if not np.array_equal(multiply_columns, multiply_identity):
+            to_multiply = multiply_columns
+
+        if return_4d and to_multiply is None:
+            # When there's nothing to multiply, the w component of add_column can be set directly into the
+            # array because mx+ny+oz+p becomes 0x+0y+0z+p where p is add_column[3].
+            w_to_set = add_column[3]
+            # Replace add_column with a view of only the translation.
+            add_column = add_column[:3]
+
+        if add_column.any():
+            to_add = add_column
+
+    if to_multiply is None:
+        # If there's anything to add, ensure it's added using the precision being used for math.
+        array_dtype = math_precision if to_add is not None else return_dtype
+        if return_4d:
+            multiplied_vectors = np.empty((len(vec3_array), 4), dtype=array_dtype)
+            multiplied_vectors[:, :3] = vec3_array
+            multiplied_vectors[:, 3] = w_to_set
+        else:
+            # If there's anything to add, ensure a copy is made so that the input vec3_array isn't modified.
+            multiplied_vectors = vec3_array.astype(array_dtype, copy=to_add is not None)
+    else:
+        # Matrix multiplication has the signature (n,k) @ (k,m) -> (n,m).
+        # Where v is the number of vectors in vec3_array and d is the number of vector dimensions to return:
+        # to_multiply has shape (d,3), vec3_array has shape (v,3) and the result should have shape (v,d).
+        # Either vec3_array or to_multiply must be transposed:
+        # Can transpose vec3_array and then transpose the result:
+        #   (v,3).T -> (3,v); (d,3) @ (3,v) -> (d,v); (d,v).T -> (v,d)
+        # Or transpose to_multiply and swap the order of multiplication:
+        #   (d,3).T -> (3,d); (v,3) @ (3,d) -> (v,d)
+        # There's no, or negligible, performance difference between the two options, however, the result of the
+        # latter will be C contiguous in memory, making it faster to convert to flattened bytes with
+        # .tobytes().
+        multiplied_vectors = vec3_array @ to_multiply.T
+
+    if to_add is not None:
+        for axis, to_add_to_axis in zip(multiplied_vectors.T, to_add):
+            if to_add_to_axis != 0:
+                axis += to_add_to_axis
+
+    # Cast to the desired return type before returning.
+    return multiplied_vectors.astype(return_dtype, copy=False)
+
+
+def vcos_transformed(raw_cos, m=None, dtype=None):
+    return _mat4_vec3_array_multiply(m, raw_cos, dtype)
+
+
+def nors_transformed(raw_nors, m=None, dtype=None):
+    # Great, now normals are also expected 4D!
+    # XXX Back to 3D normals for now!
+    # return _mat4_vec3_array_multiply(m, raw_nors, dtype, return_4d=True)
+    return _mat4_vec3_array_multiply(m, raw_nors, dtype)
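+
+
+# A hypothetical usage sketch of the helpers above (not part of this patch): "me" stands in for a
+# bpy.types.Mesh and "geom_mat_co" for the exporter's geometry matrix; the subsequent numpy patches are
+# expected to use a similar pattern.
+#     t_co = np.empty(len(me.vertices) * 3, dtype=np.single)
+#     me.vertices.foreach_get("co", t_co)
+#     t_co = vcos_transformed(t_co, geom_mat_co, np.float64)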
+
+
+def astype_view_signedness(arr, new_dtype):
+    """Unsafely views arr as new_dtype if the itemsize and byteorder of arr match but the signedness does not,
+    otherwise calls np.ndarray.astype with copy=False.
+
+    The benefit of copy=False is that if the array can be safely viewed as the new type, then a view is made,
+    instead of a copy with the new type.
+
+    Unsigned types can't be viewed safely as signed or vice-versa, meaning that a copy would always be made by
+    .astype(..., copy=False).
+
+    This is intended for viewing uintc data (a common Blender C type with variable itemsize, though usually 4
+    bytes, so uint32) as int32 (a common FBX type), when the itemsizes match."""
+    arr_dtype = arr.dtype
+
+    if not isinstance(new_dtype, np.dtype):
+        # new_dtype could be a type instance or a string, but it needs to be a dtype to compare its itemsize,
+        # byteorder and kind.
+        new_dtype = np.dtype(new_dtype)
+
+    # For simplicity, only dtypes of the same itemsize and byteorder, but opposite signedness, are handled.
+    # Everything else is left to .astype.
+    arr_kind = arr_dtype.kind
+    new_kind = new_dtype.kind
+    if (
+            # Signed and unsigned int are opposite in terms of signedness. Other types don't have signedness.
+            ((arr_kind == 'i' and new_kind == 'u') or (arr_kind == 'u' and new_kind == 'i'))
+            and arr_dtype.itemsize == new_dtype.itemsize
+            and arr_dtype.byteorder == new_dtype.byteorder
+    ):
+        # new_dtype has opposite signedness and matching itemsize and byteorder, so return a view of the new
+        # type.
+        return arr.view(new_dtype)
+    else:
+        return arr.astype(new_dtype, copy=False)
+
+
+def fast_first_axis_flat(ar):
+    """Get a flat view (or a copy if a view is not possible) of the input array whereby each element is a
+    single element of a dtype that is fast to sort, sorts according to individual bytes and contains the data
+    for an entire row (and any further dimensions) of the input array.
+
+    Since the dtype of the view could sort in a different order to the dtype of the input array, this isn't
+    typically useful for actual sorting, but it is useful for sorting-based uniqueness, such as np.unique."""
+    # When the input is 1d, each row is a single element, so each element is viewed as the new dtype on its
+    # own.
+    elements_per_row = math.prod(ar.shape[1:])
+    row_itemsize = ar.itemsize * elements_per_row
+
+    # Get a dtype with itemsize that equals row_itemsize.
+    # Integer types sort the fastest, but are only available for specific itemsizes.
+    uint_dtypes_by_itemsize = {1: np.uint8, 2: np.uint16, 4: np.uint32, 8: np.uint64}
+    # Signed/unsigned makes no noticeable speed difference, but using unsigned will result in ordering
+    # according to individual bytes like the other, non-integer types.
+    if row_itemsize in uint_dtypes_by_itemsize:
+        entire_row_dtype = uint_dtypes_by_itemsize[row_itemsize]
+    else:
+        # When using kind='stable' sorting, numpy only uses radix sort with integer types, but it's still
+        # significantly faster to sort by a single item per row instead of multiple row elements or multiple
+        # structured type fields.
+        # Construct a flexible size dtype with matching itemsize.
+        # Should always be 4 because each character in a unicode string is UCS4.
+        str_itemsize = np.dtype((np.str_, 1)).itemsize
+        if row_itemsize % str_itemsize == 0:
+            # Unicode strings seem to be slightly faster to sort than bytes.
+            entire_row_dtype = np.dtype((np.str_, row_itemsize // str_itemsize))
+        else:
+            # Byte strings seem to be slightly faster to sort than raw data (np.void).
+            entire_row_dtype = np.dtype((np.bytes_, row_itemsize))
+
+    # View each element along the first axis as a single element of entire_row_dtype.
+    # Reshape to flat first (this makes a copy if a view is not possible).
+    ar = ar.reshape(-1)
+    # To view as a dtype of a different size, the last axis (the entire array in NumPy 1.22 and earlier) must
+    # be C-contiguous.
+    if row_itemsize != ar.itemsize and not ar.flags.c_contiguous:
+        ar = np.ascontiguousarray(ar)
+    return ar.view(entire_row_dtype)
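+
+
+# Illustrative example (hypothetical values): a C-contiguous (n, 2) float32 array has 8-byte rows, so
+# fast_first_axis_flat views it as an (n,) np.uint64 array; this is the building block that
+# fast_first_axis_unique below sorts on.
+#     rows = np.zeros((4, 2), dtype=np.float32)
+#     flat = fast_first_axis_flat(rows)
+#     assert flat.dtype == np.uint64 and flat.shape == (4,)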
+
+
+def fast_first_axis_unique(ar, return_unique=True, return_index=False, return_inverse=False, return_counts=False):
+    """np.unique with axis=0 but optimised for when the input array has multiple elements per row, and the
+    returned unique array doesn't need to be sorted.
+
+    Arrays with more than one element per row are more costly to sort in np.unique due to being compared one
+    row-element at a time, like comparing tuples.
+
+    By viewing each entire row as a single non-structured element, much faster sorting can be achieved. Since
+    the values are viewed as a different type to their original, the returned array of unique values may not
+    be sorted according to their original type.
+
+    The array of unique values can be excluded from the returned tuple by specifying return_unique=False.
+
+    Float type caveats:
+    All elements of -0.0 in the input array will be replaced with 0.0 to ensure that both values are collapsed
+    into one.
+    NaN values can have lots of different byte representations (e.g. signalling/quiet and custom payloads).
+    Only the duplicates of each unique byte representation will be collapsed into one."""
+    # At least something should always be returned.
+    assert(return_unique or return_index or return_inverse or return_counts)
+    # Only signed integer, unsigned integer and floating-point kinds of data are allowed. Other kinds of data
+    # have not been tested.
+    assert(ar.dtype.kind in "iuf")
+
+    # Floating-point types have different byte representations for -0.0 and 0.0. Collapse them together by
+    # replacing all -0.0 in the input array with 0.0.
+    if ar.dtype.kind == 'f':
+        ar[ar == -0.0] = 0.0
+
+    # It's a bit annoying that the unique array is always calculated even when it might not be needed, but it
+    # is generally insignificant compared to the cost of sorting.
+    result = np.unique(fast_first_axis_flat(ar), return_index=return_index,
+                       return_inverse=return_inverse, return_counts=return_counts)
+
+    if return_unique:
+        unique = result[0] if isinstance(result, tuple) else result
+        # View in the original dtype.
+        unique = unique.view(ar.dtype)
+        # Return the same number of elements per row and any extra dimensions per row as the input array.
+        unique.shape = (-1, *ar.shape[1:])
+        if isinstance(result, tuple):
+            return (unique,) + result[1:]
+        else:
+            return unique
+    else:
+        # Remove the first element, the unique array.
+        result = result[1:]
+        if len(result) == 1:
+            # Unpack single element tuples.
+            return result[0]
+        else:
+            return result
+
+
 # ##### UIDs code. #####
 
 # ID class (mere int).
-- 
2.30.2