Merge remote-tracking branch 'origin/master' into blender2.8

2016-11-02 10:48:06 +01:00
parent fb6f42dc4f 13ee9b8ebe
commit a41bbd3053
195 changed files with 6126 additions and 3391 deletions
@@ -187,7 +187,7 @@ The next table describes the information in the file-header.
 </table>

 <p>
-<a href="http://en.wikipedia.org/wiki/Endianness">Endianness</a> addresses the way values are ordered in a sequence of bytes(see the <a href="#example-endianess">example</a> below):
+<a href="https://en.wikipedia.org/wiki/Endianness">Endianness</a> addresses the way values are ordered in a sequence of bytes(see the <a href="#example-endianess">example</a> below):
 </p>

 <ul>
@@ -699,7 +699,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -1145,7 +1145,7 @@ HTML_EXTRA_FILES       =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1752,7 +1752,7 @@ LATEX_SOURCE_CODE      = NO

 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.

@@ -24,8 +24,8 @@ Then, call ``bpy.app.translations.register(__name__, your_dict)`` in your ``regi
 The ``Manage UI translations`` add-on has several functions to help you collect strings to translate, and
 generate the needed python code (the translation dictionary), as well as optional intermediary po files
 if you want some... See
-`How to Translate Blender <http://wiki.blender.org/index.php/Dev:Doc/Process/Translate_Blender>`_ and
-`Using i18n in Blender Code <http://wiki.blender.org/index.php/Dev:Source/Interface/Internationalization>`_
+`How to Translate Blender <https://wiki.blender.org/index.php/Dev:Doc/Process/Translate_Blender>`_ and
+`Using i18n in Blender Code <https://wiki.blender.org/index.php/Dev:Source/Interface/Internationalization>`_
 for more info.

 Module References
@@ -49,7 +49,7 @@ vec2d[:] = vec3d[:2]


 # Vectors support 'swizzle' operations
-# See http://en.wikipedia.org/wiki/Swizzling_(computer_graphics)
+# See https://en.wikipedia.org/wiki/Swizzling_(computer_graphics)
 vec.xyz = vec.zyx
 vec.xy = vec4d.zw
 vec.xyz = vec4d.wzz
@@ -23,7 +23,7 @@ The features exposed closely follow the C API,
 giving python access to the functions used by blenders own mesh editing tools.

 For an overview of BMesh data types and how they reference each other see:
-`BMesh Design Document <http://wiki.blender.org/index.php/Dev:2.6/Source/Modeling/BMesh/Design>`_ .
+`BMesh Design Document <https://wiki.blender.org/index.php/Dev:Source/Modeling/BMesh/Design>`_ .


 .. note::
@@ -31,13 +31,12 @@ For an overview of BMesh data types and how they reference each other see:
   **Disk** and **Radial** data is not exposed by the python api since this is for internal use only.


-.. warning::
-
-   TODO items are...
+.. warning:: TODO items are...

   * add access to BMesh **walkers**
   * add custom-data manipulation functions add/remove/rename.

+
 Example Script
 --------------

@@ -18,7 +18,7 @@ amongst our own scripts and make it easier to use python scripts from other proj

 Using our style guide for your own scripts makes it easier if you eventually want to contribute them to blender.

-This style guide is known as pep8 and can be found `here <http://www.python.org/dev/peps/pep-0008>`_
+This style guide is known as pep8 and can be found `here <https://www.python.org/dev/peps/pep-0008/>`_

 A brief listing of pep8 criteria.

@@ -316,7 +316,7 @@ use to join a list of strings (the list may be temporary). In the following exam


 Join is fastest on many strings,
-`string formatting <http://docs.python.org/py3k/library/string.html#string-formatting>`__
+`string formatting <https://wiki.blender.org/index.php/Dev:Source/Modeling/BMesh/Design>`__
 is quite fast too (better for converting data types). String arithmetic is slowest.


@@ -1,3 +1,4 @@
+
 *******
 Gotchas
 *******
@@ -38,7 +39,6 @@ but some operators are more picky about when they run.
 In most cases you can figure out what context an operator needs
 simply be seeing how it's used in Blender and thinking about what it does.

-
 Unfortunately if you're still stuck - the only way to **really** know
 whats going on is to read the source code for the poll function and see what its checking.

@@ -82,7 +82,6 @@ it should be reported to the bug tracker.
 Stale Data
 ==========

-
 No updates after setting values
 -------------------------------

@@ -174,8 +173,8 @@ In this situation you can...

 .. _info_gotcha_mesh_faces:

-NGons and Tessellation Faces
-============================
+N-Gons and Tessellation Faces
+=============================

 Since 2.63 NGons are supported, this adds some complexity
 since in some cases you need to access triangles/quads still (some exporters for example).
@@ -509,7 +508,7 @@ Unicode Problems
 Python supports many different encodings so there is nothing stopping you from
 writing a script in ``latin1`` or ``iso-8859-15``.

-See `pep-0263 <http://www.python.org/dev/peps/pep-0263/>`_
+See `pep-0263 <https://www.python.org/dev/peps/pep-0263/>`_

 However this complicates matters for Blender's Python API because ``.blend`` files don't have an explicit encoding.

@@ -657,7 +656,7 @@ Here are some general hints to avoid running into these problems.
 .. note::

   To find the line of your script that crashes you can use the ``faulthandler`` module.
-   See `faulthandler docs <http://docs.python.org/dev/library/faulthandler.html>`_.
+   See the `faulthandler docs <https://docs.python.org/dev/library/faulthandler.html>`_.

   While the crash may be in Blenders C/C++ code,
   this can help a lot to track down the area of the script that causes the crash.
@@ -43,8 +43,7 @@ scene manipulation, automation, defining your own toolset and customization.

 On startup Blender scans the ``scripts/startup/`` directory for Python modules and imports them.
 The exact location of this directory depends on your installation.
-`See the directory layout docs
-<https://www.blender.org/manual/getting_started/installing_blender/directorylayout.html>`__
+See the :ref:`directory layout docs <blender_manual:getting-started_installing-config-directories>`.


 Script Loading
@@ -92,7 +91,7 @@ variable which Blender uses to read metadata such as name, author, category and

 The User Preferences add-on listing uses **bl_info** to display information about each add-on.

-`See Add-ons <http://wiki.blender.org/index.php/Dev:2.5/Py/Scripts/Guidelines/Addons>`__
+`See Add-ons <https://wiki.blender.org/index.php/Dev:Py/Scripts/Guidelines/Addons>`__
 for details on the ``bl_info`` dictionary.


@@ -51,8 +51,7 @@ A quick list of helpful things to know before starting:
  | ``scripts/startup/bl_operators`` for operators.

  Exact location depends on platform, see:
-  `Configuration and Data Paths
-  <https://www.blender.org/manual/getting_started/installing_blender/directorylayout.html>`__.
+  :ref:`Configuration and Data Paths <blender_manual:getting-started_installing-config-directories>`.


 Running Scripts
@@ -27,7 +27,7 @@ There are 3 main uses for the terminal, these are:

 .. note::

-   For Linux and OSX users this means starting the terminal first, then running Blender from within it.
+   For Linux and macOS users this means starting the terminal first, then running Blender from within it.
   On Windows the terminal can be enabled from the help menu.


@@ -306,7 +306,7 @@ Advantages include:
 This is marked advanced because to run Blender as a Python module requires a special build option.

 For instructions on building see
-`Building Blender as a Python module <http://wiki.blender.org/index.php/User:Ideasman42/BlenderAsPyModule>`_
+`Building Blender as a Python module <https://wiki.blender.org/index.php/User:Ideasman42/BlenderAsPyModule>`_


 Python Safety (Build Option)
@@ -232,7 +232,7 @@ if you want it to be enabled on restart, press *Save as Default*.
      print(addon_utils.paths())

   More is written on this topic here:
-   `Directory Layout <https://www.blender.org/manual/getting_started/installing_blender/directorylayout.html>`_
+   :ref:`Directory Layout <blender_manual:getting-started_installing-config-directories>`.


 Your Second Add-on
@@ -630,6 +630,6 @@ Here are some sites you might like to check on after completing this tutorial.
  *Great info for those who are still learning Python.*
 - `Blender Development (Wiki) <https://wiki.blender.org/index.php/Dev:Contents>`_ -
  *Blender Development, general information and helpful links.*
- `Blender Artists (Coding Section) <http://blenderartists.org/forum/forumdisplay.php?47-Coding>`_ -
+- `Blender Artists (Coding Section) <https://blenderartists.org/forum/forumdisplay.php?47-Coding>`_ -
  *forum where people ask Python development questions*

@@ -73,10 +73,12 @@ set(SRC
 	internal/ceres/file.cc
 	internal/ceres/generated/partitioned_matrix_view_d_d_d.cc
 	internal/ceres/generated/schur_eliminator_d_d_d.cc
+	internal/ceres/gradient_checker.cc
 	internal/ceres/gradient_checking_cost_function.cc
 	internal/ceres/gradient_problem.cc
 	internal/ceres/gradient_problem_solver.cc
 	internal/ceres/implicit_schur_complement.cc
+	internal/ceres/is_close.cc
 	internal/ceres/iterative_schur_complement_solver.cc
 	internal/ceres/lapack.cc
 	internal/ceres/levenberg_marquardt_strategy.cc
@@ -116,6 +118,7 @@ set(SRC
 	internal/ceres/triplet_sparse_matrix.cc
 	internal/ceres/trust_region_minimizer.cc
 	internal/ceres/trust_region_preprocessor.cc
+	internal/ceres/trust_region_step_evaluator.cc
 	internal/ceres/trust_region_strategy.cc
 	internal/ceres/types.cc
 	internal/ceres/wall_time.cc
@@ -204,6 +207,7 @@ set(SRC
 	internal/ceres/householder_vector.h
 	internal/ceres/implicit_schur_complement.h
 	internal/ceres/integral_types.h
+	internal/ceres/is_close.h
 	internal/ceres/iterative_schur_complement_solver.h
 	internal/ceres/lapack.h
 	internal/ceres/levenberg_marquardt_strategy.h
@@ -248,6 +252,7 @@ set(SRC
 	internal/ceres/triplet_sparse_matrix.h
 	internal/ceres/trust_region_minimizer.h
 	internal/ceres/trust_region_preprocessor.h
+	internal/ceres/trust_region_step_evaluator.h
 	internal/ceres/trust_region_strategy.h
 	internal/ceres/visibility_based_preconditioner.h
 	internal/ceres/wall_time.h
@@ -173,26 +173,5 @@ if(WITH_OPENMP)
 	)
 endif()

-TEST_UNORDERED_MAP_SUPPORT()
-if(HAVE_STD_UNORDERED_MAP_HEADER)
-	if(HAVE_UNORDERED_MAP_IN_STD_NAMESPACE)
-		add_definitions(-DCERES_STD_UNORDERED_MAP)
-	else()
-		if(HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
-			add_definitions(-DCERES_STD_UNORDERED_MAP_IN_TR1_NAMESPACE)
-		else()
-			add_definitions(-DCERES_NO_UNORDERED_MAP)
-			message(STATUS "Replacing unordered_map/set with map/set (warning: slower!)")
-		endif()
-	endif()
-else()
-	if(HAVE_UNORDERED_MAP_IN_TR1_NAMESPACE)
-		add_definitions(-DCERES_TR1_UNORDERED_MAP)
-	else()
-		add_definitions(-DCERES_NO_UNORDERED_MAP)
-		message(STATUS "Replacing unordered_map/set with map/set (warning: slower!)")
-	endif()
-endif()
-
 blender_add_lib(extern_ceres "\${SRC}" "\${INC}" "\${INC_SYS}")
 EOF
@@ -149,6 +149,7 @@ internal/ceres/generated/schur_eliminator_4_4_d.cc
 internal/ceres/generated/schur_eliminator_d_d_d.cc
 internal/ceres/generate_eliminator_specialization.py
 internal/ceres/generate_partitioned_matrix_view_specializations.py
+internal/ceres/gradient_checker.cc
 internal/ceres/gradient_checking_cost_function.cc
 internal/ceres/gradient_checking_cost_function.h
 internal/ceres/gradient_problem.cc
@@ -160,6 +161,8 @@ internal/ceres/householder_vector.h
 internal/ceres/implicit_schur_complement.cc
 internal/ceres/implicit_schur_complement.h
 internal/ceres/integral_types.h
+internal/ceres/is_close.cc
+internal/ceres/is_close.h
 internal/ceres/iterative_schur_complement_solver.cc
 internal/ceres/iterative_schur_complement_solver.h
 internal/ceres/lapack.cc
@@ -243,6 +246,8 @@ internal/ceres/trust_region_minimizer.cc
 internal/ceres/trust_region_minimizer.h
 internal/ceres/trust_region_preprocessor.cc
 internal/ceres/trust_region_preprocessor.h
+internal/ceres/trust_region_step_evaluator.cc
+internal/ceres/trust_region_step_evaluator.h
 internal/ceres/trust_region_strategy.cc
 internal/ceres/trust_region_strategy.h
 internal/ceres/types.cc
@@ -130,7 +130,8 @@ class CostFunctionToFunctor {
    const int num_parameter_blocks =
        (N0 > 0) + (N1 > 0) + (N2 > 0) + (N3 > 0) + (N4 > 0) +
        (N5 > 0) + (N6 > 0) + (N7 > 0) + (N8 > 0) + (N9 > 0);
-    CHECK_EQ(parameter_block_sizes.size(), num_parameter_blocks);
+    CHECK_EQ(static_cast<int>(parameter_block_sizes.size()),
+             num_parameter_blocks);

    CHECK_EQ(N0, parameter_block_sizes[0]);
    if (parameter_block_sizes.size() > 1) CHECK_EQ(N1, parameter_block_sizes[1]);  // NOLINT
@@ -357,6 +357,28 @@ class CERES_EXPORT Covariance {
                                  const double*> >& covariance_blocks,
      Problem* problem);

+  // Compute a part of the covariance matrix.
+  //
+  // The vector parameter_blocks contains the parameter blocks that
+  // are used for computing the covariance matrix. From this vector
+  // all covariance pairs are generated. This allows the covariance
+  // estimation algorithm to only compute and store these blocks.
+  //
+  // parameter_blocks cannot contain duplicates. Bad things will
+  // happen if they do.
+  //
+  // Note that the list of covariance_blocks is only used to determine
+  // what parts of the covariance matrix are computed. The full
+  // Jacobian is used to do the computation, i.e. they do not have an
+  // impact on what part of the Jacobian is used for computation.
+  //
+  // The return value indicates the success or failure of the
+  // covariance computation. Please see the documentation for
+  // Covariance::Options for more on the conditions under which this
+  // function returns false.
+  bool Compute(const std::vector<const double*>& parameter_blocks,
+               Problem* problem);
+
  // Return the block of the cross-covariance matrix corresponding to
  // parameter_block1 and parameter_block2.
  //
@@ -394,6 +416,40 @@ class CERES_EXPORT Covariance {
                                        const double* parameter_block2,
                                        double* covariance_block) const;

+  // Return the covariance matrix corresponding to all parameter_blocks.
+  //
+  // Compute must be called before calling GetCovarianceMatrix and all
+  // parameter_blocks must have been present in the vector
+  // parameter_blocks when Compute was called. Otherwise
+  // GetCovarianceMatrix returns false.
+  //
+  // covariance_matrix must point to a memory location that can store
+  // the size of the covariance matrix. The covariance matrix will be
+  // a square matrix whose row and column count is equal to the sum of
+  // the sizes of the individual parameter blocks. The covariance
+  // matrix will be a row-major matrix.
+  bool GetCovarianceMatrix(const std::vector<const double *> &parameter_blocks,
+                           double *covariance_matrix);
+
+  // Return the covariance matrix corresponding to parameter_blocks
+  // in the tangent space if a local parameterization is associated
+  // with one of the parameter blocks else returns the covariance
+  // matrix in the ambient space.
+  //
+  // Compute must be called before calling GetCovarianceMatrix and all
+  // parameter_blocks must have been present in the vector
+  // parameters_blocks when Compute was called. Otherwise
+  // GetCovarianceMatrix returns false.
+  //
+  // covariance_matrix must point to a memory location that can store
+  // the size of the covariance matrix. The covariance matrix will be
+  // a square matrix whose row and column count is equal to the sum of
+  // the sizes of the tangent spaces of the individual parameter
+  // blocks. The covariance matrix will be a row-major matrix.
+  bool GetCovarianceMatrixInTangentSpace(
+      const std::vector<const double*>& parameter_blocks,
+      double* covariance_matrix);
+
 private:
  internal::scoped_ptr<internal::CovarianceImpl> impl_;
 };
@@ -85,22 +85,6 @@ class DynamicNumericDiffCostFunction : public CostFunction {
        options_(options) {
  }

-  // Deprecated. New users should avoid using this constructor. Instead, use the
-  // constructor with NumericDiffOptions.
-  DynamicNumericDiffCostFunction(
-      const CostFunctor* functor,
-      Ownership ownership,
-      double relative_step_size)
-      : functor_(functor),
-        ownership_(ownership),
-        options_() {
-    LOG(WARNING) << "This constructor is deprecated and will be removed in "
-                    "a future version. Please use the NumericDiffOptions "
-                    "constructor instead.";
-
-    options_.relative_step_size = relative_step_size;
-  }
-
  virtual ~DynamicNumericDiffCostFunction() {
    if (ownership_ != TAKE_OWNERSHIP) {
      functor_.release();
@@ -138,19 +122,19 @@ class DynamicNumericDiffCostFunction : public CostFunction {
    std::vector<double> parameters_copy(parameters_size);
    std::vector<double*> parameters_references_copy(block_sizes.size());
    parameters_references_copy[0] = &parameters_copy[0];
-    for (int block = 1; block < block_sizes.size(); ++block) {
+    for (size_t block = 1; block < block_sizes.size(); ++block) {
      parameters_references_copy[block] = parameters_references_copy[block - 1]
          + block_sizes[block - 1];
    }

    // Copy the parameters into the local temp space.
-    for (int block = 0; block < block_sizes.size(); ++block) {
+    for (size_t block = 0; block < block_sizes.size(); ++block) {
      memcpy(parameters_references_copy[block],
             parameters[block],
             block_sizes[block] * sizeof(*parameters[block]));
    }

-    for (int block = 0; block < block_sizes.size(); ++block) {
+    for (size_t block = 0; block < block_sizes.size(); ++block) {
      if (jacobians[block] != NULL &&
          !NumericDiff<CostFunctor, method, DYNAMIC,
                       DYNAMIC, DYNAMIC, DYNAMIC, DYNAMIC, DYNAMIC,
@@ -27,194 +27,121 @@
 // POSSIBILITY OF SUCH DAMAGE.
 // Copyright 2007 Google Inc. All Rights Reserved.
 //
-// Author: wjr@google.com (William Rucklidge)
-//
-// This file contains a class that exercises a cost function, to make sure
-// that it is computing reasonable derivatives. It compares the Jacobians
-// computed by the cost function with those obtained by finite
-// differences.
+// Authors: wjr@google.com (William Rucklidge),
+//          keir@google.com (Keir Mierle),
+//          dgossow@google.com (David Gossow)

 #ifndef CERES_PUBLIC_GRADIENT_CHECKER_H_
 #define CERES_PUBLIC_GRADIENT_CHECKER_H_

-#include <cstddef>
-#include <algorithm>
 #include <vector>
+#include <string>

+#include "ceres/cost_function.h"
+#include "ceres/dynamic_numeric_diff_cost_function.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/fixed_array.h"
 #include "ceres/internal/macros.h"
 #include "ceres/internal/scoped_ptr.h"
-#include "ceres/numeric_diff_cost_function.h"
+#include "ceres/local_parameterization.h"
 #include "glog/logging.h"

 namespace ceres {

-// An object that exercises a cost function, to compare the answers that it
-// gives with derivatives estimated using finite differencing.
+// GradientChecker compares the Jacobians returned by a cost function against
+// derivatives estimated using finite differencing.
 //
-// The only likely usage of this is for testing.
+// The condition enforced is that
+//
+//    (J_actual(i, j) - J_numeric(i, j))
+//   ------------------------------------  <  relative_precision
+//   max(J_actual(i, j), J_numeric(i, j))
+//
+// where J_actual(i, j) is the jacobian as computed by the supplied cost
+// function (by the user) multiplied by the local parameterization Jacobian
+// and J_numeric is the jacobian as computed by finite differences, multiplied
+// by the local parameterization Jacobian as well.
 //
 // How to use: Fill in an array of pointers to parameter blocks for your
-// CostFunction, and then call Probe(). Check that the return value is
-// 'true'. See prober_test.cc for an example.
-//
-// This is templated similarly to NumericDiffCostFunction, as it internally
-// uses that.
-template <typename CostFunctionToProbe,
-          int M = 0, int N0 = 0, int N1 = 0, int N2 = 0, int N3 = 0, int N4 = 0>
+// CostFunction, and then call Probe(). Check that the return value is 'true'.
 class GradientChecker {
 public:
-  // Here we stash some results from the probe, for later
-  // inspection.
-  struct GradientCheckResults {
-    // Computed cost.
-    Vector cost;
+  // This will not take ownership of the cost function or local
+  // parameterizations.
+  //
+  // function: The cost function to probe.
+  // local_parameterization: A vector of local parameterizations for each
+  // parameter. May be NULL or contain NULL pointers to indicate that the
+  // respective parameter does not have a local parameterization.
+  // options: Options to use for numerical differentiation.
+  GradientChecker(
+      const CostFunction* function,
+      const std::vector<const LocalParameterization*>* local_parameterizations,
+      const NumericDiffOptions& options);

-    // The sizes of these matrices are dictated by the cost function's
-    // parameter and residual block sizes. Each vector's length will
-    // term->parameter_block_sizes().size(), and each matrix is the
-    // Jacobian of the residual with respect to the corresponding parameter
-    // block.
+  // Contains results from a call to Probe for later inspection.
+  struct ProbeResults {
+    // The return value of the cost function.
+    bool return_value;
+
+    // Computed residual vector.
+    Vector residuals;
+
+    // The sizes of the Jacobians below are dictated by the cost function's
+    // parameter block size and residual block sizes. If a parameter block
+    // has a local parameterization associated with it, the size of the "local"
+    // Jacobian will be determined by the local parameterization dimension and
+    // residual block size, otherwise it will be identical to the regular
+    // Jacobian.

    // Derivatives as computed by the cost function.
-    std::vector<Matrix> term_jacobians;
+    std::vector<Matrix> jacobians;

-    // Derivatives as computed by finite differencing.
-    std::vector<Matrix> finite_difference_jacobians;
+    // Derivatives as computed by the cost function in local space.
+    std::vector<Matrix> local_jacobians;

-    // Infinity-norm of term_jacobians - finite_difference_jacobians.
-    double error_jacobians;
+    // Derivatives as computed by nuerical differentiation in local space.
+    std::vector<Matrix> numeric_jacobians;
+
+    // Derivatives as computed by nuerical differentiation in local space.
+    std::vector<Matrix> local_numeric_jacobians;
+
+    // Contains the maximum relative error found in the local Jacobians.
+    double maximum_relative_error;
+
+    // If an error was detected, this will contain a detailed description of
+    // that error.
+    std::string error_log;
  };

-  // Checks the Jacobian computed by a cost function.
+  // Call the cost function, compute alternative Jacobians using finite
+  // differencing and compare results. If local parameterizations are given,
+  // the Jacobians will be multiplied by the local parameterization Jacobians
+  // before performing the check, which effectively means that all errors along
+  // the null space of the local parameterization will be ignored.
+  // Returns false if the Jacobians don't match, the cost function return false,
+  // or if the cost function returns different residual when called with a
+  // Jacobian output argument vs. calling it without. Otherwise returns true.
  //
-  // probe_point: The parameter values at which to probe.
-  // error_tolerance: A threshold for the infinity-norm difference
-  // between the Jacobians. If the Jacobians differ by more than
-  // this amount, then the probe fails.
-  //
-  // term: The cost function to test. Not retained after this call returns.
-  //
-  // results: On return, the two Jacobians (and other information)
-  // will be stored here.  May be NULL.
+  // parameters: The parameter values at which to probe.
+  // relative_precision: A threshold for the relative difference between the
+  // Jacobians. If the Jacobians differ by more than this amount, then the
+  // probe fails.
+  // results: On return, the Jacobians (and other information) will be stored
+  // here. May be NULL.
  //
  // Returns true if no problems are detected and the difference between the
  // Jacobians is less than error_tolerance.
-  static bool Probe(double const* const* probe_point,
-                    double error_tolerance,
-                    CostFunctionToProbe *term,
-                    GradientCheckResults* results) {
-    CHECK_NOTNULL(probe_point);
-    CHECK_NOTNULL(term);
-    LOG(INFO) << "-------------------- Starting Probe() --------------------";
-
-    // We need a GradientCheckeresults, whether or not they supplied one.
-    internal::scoped_ptr<GradientCheckResults> owned_results;
-    if (results == NULL) {
-      owned_results.reset(new GradientCheckResults);
-      results = owned_results.get();
-    }
-
-    // Do a consistency check between the term and the template parameters.
-    CHECK_EQ(M, term->num_residuals());
-    const int num_residuals = M;
-    const std::vector<int32>& block_sizes = term->parameter_block_sizes();
-    const int num_blocks = block_sizes.size();
-
-    CHECK_LE(num_blocks, 5) << "Unable to test functions that take more "
-                            << "than 5 parameter blocks";
-    if (N0) {
-      CHECK_EQ(N0, block_sizes[0]);
-      CHECK_GE(num_blocks, 1);
-    } else {
-      CHECK_LT(num_blocks, 1);
-    }
-    if (N1) {
-      CHECK_EQ(N1, block_sizes[1]);
-      CHECK_GE(num_blocks, 2);
-    } else {
-      CHECK_LT(num_blocks, 2);
-    }
-    if (N2) {
-      CHECK_EQ(N2, block_sizes[2]);
-      CHECK_GE(num_blocks, 3);
-    } else {
-      CHECK_LT(num_blocks, 3);
-    }
-    if (N3) {
-      CHECK_EQ(N3, block_sizes[3]);
-      CHECK_GE(num_blocks, 4);
-    } else {
-      CHECK_LT(num_blocks, 4);
-    }
-    if (N4) {
-      CHECK_EQ(N4, block_sizes[4]);
-      CHECK_GE(num_blocks, 5);
-    } else {
-      CHECK_LT(num_blocks, 5);
-    }
-
-    results->term_jacobians.clear();
-    results->term_jacobians.resize(num_blocks);
-    results->finite_difference_jacobians.clear();
-    results->finite_difference_jacobians.resize(num_blocks);
-
-    internal::FixedArray<double*> term_jacobian_pointers(num_blocks);
-    internal::FixedArray<double*>
-        finite_difference_jacobian_pointers(num_blocks);
-    for (int i = 0; i < num_blocks; i++) {
-      results->term_jacobians[i].resize(num_residuals, block_sizes[i]);
-      term_jacobian_pointers[i] = results->term_jacobians[i].data();
-      results->finite_difference_jacobians[i].resize(
-          num_residuals, block_sizes[i]);
-      finite_difference_jacobian_pointers[i] =
-          results->finite_difference_jacobians[i].data();
-    }
-    results->cost.resize(num_residuals, 1);
-
-    CHECK(term->Evaluate(probe_point, results->cost.data(),
-                         term_jacobian_pointers.get()));
-    NumericDiffCostFunction<CostFunctionToProbe, CENTRAL, M, N0, N1, N2, N3, N4>
-        numeric_term(term, DO_NOT_TAKE_OWNERSHIP);
-    CHECK(numeric_term.Evaluate(probe_point, results->cost.data(),
-                                finite_difference_jacobian_pointers.get()));
-
-    results->error_jacobians = 0;
-    for (int i = 0; i < num_blocks; i++) {
-      Matrix jacobian_difference = results->term_jacobians[i] -
-          results->finite_difference_jacobians[i];
-      results->error_jacobians =
-          std::max(results->error_jacobians,
-                   jacobian_difference.lpNorm<Eigen::Infinity>());
-    }
-
-    LOG(INFO) << "========== term-computed derivatives ==========";
-    for (int i = 0; i < num_blocks; i++) {
-      LOG(INFO) << "term_computed block " << i;
-      LOG(INFO) << "\n" << results->term_jacobians[i];
-    }
-
-    LOG(INFO) << "========== finite-difference derivatives ==========";
-    for (int i = 0; i < num_blocks; i++) {
-      LOG(INFO) << "finite_difference block " << i;
-      LOG(INFO) << "\n" << results->finite_difference_jacobians[i];
-    }
-
-    LOG(INFO) << "========== difference ==========";
-    for (int i = 0; i < num_blocks; i++) {
-      LOG(INFO) << "difference block " << i;
-      LOG(INFO) << (results->term_jacobians[i] -
-                    results->finite_difference_jacobians[i]);
-    }
-
-    LOG(INFO) << "||difference|| = " << results->error_jacobians;
-
-    return results->error_jacobians < error_tolerance;
-  }
+  bool Probe(double const* const* parameters,
+             double relative_precision,
+             ProbeResults* results) const;

 private:
  CERES_DISALLOW_IMPLICIT_CONSTRUCTORS(GradientChecker);
+
+  std::vector<const LocalParameterization*> local_parameterizations_;
+  const CostFunction* function_;
+  internal::scoped_ptr<CostFunction> finite_diff_cost_function_;
 };

 }  // namespace ceres
@@ -33,9 +33,8 @@

 // This file needs to compile as c code.
 #ifdef __cplusplus
-
+#include <cstddef>
 #include "ceres/internal/config.h"
-
 #if defined(CERES_TR1_MEMORY_HEADER)
 #include <tr1/memory>
 #else
@@ -50,6 +49,25 @@ using std::tr1::shared_ptr;
 using std::shared_ptr;
 #endif

+// We allocate some Eigen objects on the stack and other places they
+// might not be aligned to 16-byte boundaries.  If we have C++11, we
+// can specify their alignment anyway, and thus can safely enable
+// vectorization on those matrices; in C++99, we are out of luck.  Figure out
+// what case we're in and write macros that do the right thing.
+#ifdef CERES_USE_CXX11
+namespace port_constants {
+static constexpr size_t kMaxAlignBytes =
+    // Work around a GCC 4.8 bug
+    // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56019) where
+    // std::max_align_t is misplaced.
+#if defined (__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 8
+    alignof(::max_align_t);
+#else
+    alignof(std::max_align_t);
+#endif
+}  // namespace port_constants
+#endif
+
 }  // namespace ceres

 #endif  // __cplusplus
@@ -69,7 +69,7 @@ struct CERES_EXPORT IterationSummary {
  // Step was numerically valid, i.e., all values are finite and the
  // step reduces the value of the linearized model.
  //
-  // Note: step_is_valid is false when iteration = 0.
+  // Note: step_is_valid is always true when iteration = 0.
  bool step_is_valid;

  // Step did not reduce the value of the objective function
@@ -77,7 +77,7 @@ struct CERES_EXPORT IterationSummary {
  // acceptance criterion used by the non-monotonic trust region
  // algorithm.
  //
-  // Note: step_is_nonmonotonic is false when iteration = 0;
+  // Note: step_is_nonmonotonic is always false when iteration = 0;
  bool step_is_nonmonotonic;

  // Whether or not the minimizer accepted this step or not. If the
@@ -89,7 +89,7 @@ struct CERES_EXPORT IterationSummary {
  // relative decrease is not sufficient, the algorithm may accept the
  // step and the step is declared successful.
  //
-  // Note: step_is_successful is false when iteration = 0.
+  // Note: step_is_successful is always true when iteration = 0.
  bool step_is_successful;

  // Value of the objective function.
@@ -164,6 +164,7 @@

 #include "Eigen/Core"
 #include "ceres/fpclassify.h"
+#include "ceres/internal/port.h"

 namespace ceres {

@@ -227,21 +228,23 @@ struct Jet {
  T a;

  // The infinitesimal part.
-  //
-  // Note the Eigen::DontAlign bit is needed here because this object
-  // gets allocated on the stack and as part of other arrays and
-  // structs. Forcing the right alignment there is the source of much
-  // pain and suffering. Even if that works, passing Jets around to
-  // functions by value has problems because the C++ ABI does not
-  // guarantee alignment for function arguments.
-  //
-  // Setting the DontAlign bit prevents Eigen from using SSE for the
-  // various operations on Jets. This is a small performance penalty
-  // since the AutoDiff code will still expose much of the code as
-  // statically sized loops to the compiler. But given the subtle
-  // issues that arise due to alignment, especially when dealing with
-  // multiple platforms, it seems to be a trade off worth making.
+
+  // We allocate Jets on the stack and other places they
+  // might not be aligned to 16-byte boundaries.  If we have C++11, we
+  // can specify their alignment anyway, and thus can safely enable
+  // vectorization on those matrices; in C++99, we are out of luck.  Figure out
+  // what case we're in and do the right thing.
+#ifndef CERES_USE_CXX11
+  // fall back to safe version:
  Eigen::Matrix<T, N, 1, Eigen::DontAlign> v;
+#else
+  static constexpr bool kShouldAlignMatrix =
+      16 <= ::ceres::port_constants::kMaxAlignBytes;
+  static constexpr int kAlignHint = kShouldAlignMatrix ?
+      Eigen::AutoAlign : Eigen::DontAlign;
+  static constexpr size_t kAlignment = kShouldAlignMatrix ? 16 : 1;
+  alignas(kAlignment) Eigen::Matrix<T, N, 1, kAlignHint> v;
+#endif
 };

 // Unary +
@@ -388,6 +391,8 @@ inline double atan    (double x) { return std::atan(x);     }
 inline double sinh    (double x) { return std::sinh(x);     }
 inline double cosh    (double x) { return std::cosh(x);     }
 inline double tanh    (double x) { return std::tanh(x);     }
+inline double floor   (double x) { return std::floor(x);    }
+inline double ceil    (double x) { return std::ceil(x);     }
 inline double pow  (double x, double y) { return std::pow(x, y);   }
 inline double atan2(double y, double x) { return std::atan2(y, x); }

@@ -482,10 +487,51 @@ Jet<T, N> tanh(const Jet<T, N>& f) {
  return Jet<T, N>(tanh_a, tmp * f.v);
 }

+// The floor function should be used with extreme care as this operation will
+// result in a zero derivative which provides no information to the solver.
+//
+// floor(a + h) ~= floor(a) + 0
+template <typename T, int N> inline
+Jet<T, N> floor(const Jet<T, N>& f) {
+  return Jet<T, N>(floor(f.a));
+}
+
+// The ceil function should be used with extreme care as this operation will
+// result in a zero derivative which provides no information to the solver.
+//
+// ceil(a + h) ~= ceil(a) + 0
+template <typename T, int N> inline
+Jet<T, N> ceil(const Jet<T, N>& f) {
+  return Jet<T, N>(ceil(f.a));
+}
+
 // Bessel functions of the first kind with integer order equal to 0, 1, n.
-inline double BesselJ0(double x) { return j0(x); }
-inline double BesselJ1(double x) { return j1(x); }
-inline double BesselJn(int n, double x) { return jn(n, x); }
+//
+// Microsoft has deprecated the j[0,1,n]() POSIX Bessel functions in favour of
+// _j[0,1,n]().  Where available on MSVC, use _j[0,1,n]() to avoid deprecated
+// function errors in client code (the specific warning is suppressed when
+// Ceres itself is built).
+inline double BesselJ0(double x) {
+#if defined(_MSC_VER) && defined(_j0)
+  return _j0(x);
+#else
+  return j0(x);
+#endif
+}
+inline double BesselJ1(double x) {
+#if defined(_MSC_VER) && defined(_j1)
+  return _j1(x);
+#else
+  return j1(x);
+#endif
+}
+inline double BesselJn(int n, double x) {
+#if defined(_MSC_VER) && defined(_jn)
+  return _jn(n, x);
+#else
+  return jn(n, x);
+#endif
+}

 // For the formulae of the derivatives of the Bessel functions see the book:
 // Olver, Lozier, Boisvert, Clark, NIST Handbook of Mathematical Functions,
@@ -743,7 +789,15 @@ template<typename T, int N> inline       Jet<T, N>  ei_pow (const Jet<T, N>& x,
 // strange compile errors.
 template <typename T, int N>
 inline std::ostream &operator<<(std::ostream &s, const Jet<T, N>& z) {
-  return s << "[" << z.a << " ; " << z.v.transpose() << "]";
+  s << "[" << z.a << " ; ";
+  for (int i = 0; i < N; ++i) {
+    s << z.v[i];
+    if (i != N - 1) {
+      s << ", ";
+    }
+  }
+  s << "]";
+  return s;
 }

 }  // namespace ceres
@@ -757,6 +811,7 @@ struct NumTraits<ceres::Jet<T, N> > {
  typedef ceres::Jet<T, N> Real;
  typedef ceres::Jet<T, N> NonInteger;
  typedef ceres::Jet<T, N> Nested;
+  typedef ceres::Jet<T, N> Literal;

  static typename ceres::Jet<T, N> dummy_precision() {
    return ceres::Jet<T, N>(1e-12);
@@ -777,6 +832,21 @@ struct NumTraits<ceres::Jet<T, N> > {
    HasFloatingPoint = 1,
    RequireInitialization = 1
  };
+
+  template<bool Vectorized>
+  struct Div {
+    enum {
+#if defined(EIGEN_VECTORIZE_AVX)
+      AVX = true,
+#else
+      AVX = false,
+#endif
+
+      // Assuming that for Jets, division is as expensive as
+      // multiplication.
+      Cost = 3
+    };
+  };
 };

 }  // namespace Eigen
@@ -211,6 +211,28 @@ class CERES_EXPORT QuaternionParameterization : public LocalParameterization {
  virtual int LocalSize() const { return 3; }
 };

+// Implements the quaternion local parameterization for Eigen's representation
+// of the quaternion. Eigen uses a different internal memory layout for the
+// elements of the quaternion than what is commonly used. Specifically, Eigen
+// stores the elements in memory as [x, y, z, w] where the real part is last
+// whereas it is typically stored first. Note, when creating an Eigen quaternion
+// through the constructor the elements are accepted in w, x, y, z order. Since
+// Ceres operates on parameter blocks which are raw double pointers this
+// difference is important and requires a different parameterization.
+//
+// Plus(x, delta) = [sin(|delta|) delta / |delta|, cos(|delta|)] * x
+// with * being the quaternion multiplication operator.
+class EigenQuaternionParameterization : public ceres::LocalParameterization {
+ public:
+  virtual ~EigenQuaternionParameterization() {}
+  virtual bool Plus(const double* x,
+                    const double* delta,
+                    double* x_plus_delta) const;
+  virtual bool ComputeJacobian(const double* x,
+                               double* jacobian) const;
+  virtual int GlobalSize() const { return 4; }
+  virtual int LocalSize() const { return 3; }
+};

 // This provides a parameterization for homogeneous vectors which are commonly
 // used in Structure for Motion problems.  One example where they are used is
@@ -206,29 +206,6 @@ class NumericDiffCostFunction
    }
  }

-  // Deprecated. New users should avoid using this constructor. Instead, use the
-  // constructor with NumericDiffOptions.
-  NumericDiffCostFunction(CostFunctor* functor,
-                          Ownership ownership,
-                          int num_residuals,
-                          const double relative_step_size)
-      :functor_(functor),
-       ownership_(ownership),
-       options_() {
-    LOG(WARNING) << "This constructor is deprecated and will be removed in "
-                    "a future version. Please use the NumericDiffOptions "
-                    "constructor instead.";
-
-    if (kNumResiduals == DYNAMIC) {
-      SizedCostFunction<kNumResiduals,
-                        N0, N1, N2, N3, N4,
-                        N5, N6, N7, N8, N9>
-          ::set_num_residuals(num_residuals);
-    }
-
-    options_.relative_step_size = relative_step_size;
-  }
-
  ~NumericDiffCostFunction() {
    if (ownership_ != TAKE_OWNERSHIP) {
      functor_.release();
@@ -309,6 +309,9 @@ class CERES_EXPORT Problem {
  // Allow the indicated parameter block to vary during optimization.
  void SetParameterBlockVariable(double* values);

+  // Returns true if a parameter block is set constant, and false otherwise.
+  bool IsParameterBlockConstant(double* values) const;
+
  // Set the local parameterization for one of the parameter blocks.
  // The local_parameterization is owned by the Problem by default. It
  // is acceptable to set the same parameterization for multiple
@@ -461,6 +464,10 @@ class CERES_EXPORT Problem {
  // parameter block has a local parameterization, then it contributes
  // "LocalSize" entries to the gradient vector (and the number of
  // columns in the jacobian).
+  //
+  // Note 3: This function cannot be called while the problem is being
+  // solved, for example it cannot be called from an IterationCallback
+  // at the end of an iteration during a solve.
  bool Evaluate(const EvaluateOptions& options,
                double* cost,
                std::vector<double>* residuals,
@@ -48,7 +48,6 @@
 #include <algorithm>
 #include <cmath>
 #include <limits>
-#include "glog/logging.h"

 namespace ceres {

@@ -418,7 +417,6 @@ template <typename T>
 inline void EulerAnglesToRotationMatrix(const T* euler,
                                        const int row_stride_parameter,
                                        T* R) {
-  CHECK_EQ(row_stride_parameter, 3);
  EulerAnglesToRotationMatrix(euler, RowMajorAdapter3x3(R));
 }

@@ -496,7 +494,6 @@ void QuaternionToRotation(const T q[4],
  QuaternionToScaledRotation(q, R);

  T normalizer = q[0]*q[0] + q[1]*q[1] + q[2]*q[2] + q[3]*q[3];
-  CHECK_NE(normalizer, T(0));
  normalizer = T(1) / normalizer;

  for (int i = 0; i < 3; ++i) {
@@ -134,7 +134,7 @@ class CERES_EXPORT Solver {
      trust_region_problem_dump_format_type = TEXTFILE;
      check_gradients = false;
      gradient_check_relative_precision = 1e-8;
-      numeric_derivative_relative_step_size = 1e-6;
+      gradient_check_numeric_derivative_relative_step_size = 1e-6;
      update_state_every_iteration = false;
    }

@@ -701,12 +701,22 @@ class CERES_EXPORT Solver {
    // this number, then the jacobian for that cost term is dumped.
    double gradient_check_relative_precision;

-    // Relative shift used for taking numeric derivatives. For finite
-    // differencing, each dimension is evaluated at slightly shifted
-    // values; for the case of central difference, this is what gets
-    // evaluated:
+    // WARNING: This option only applies to the to the numeric
+    // differentiation used for checking the user provided derivatives
+    // when when Solver::Options::check_gradients is true. If you are
+    // using NumericDiffCostFunction and are interested in changing
+    // the step size for numeric differentiation in your cost
+    // function, please have a look at
+    // include/ceres/numeric_diff_options.h.
    //
-    //   delta = numeric_derivative_relative_step_size;
+    // Relative shift used for taking numeric derivatives when
+    // Solver::Options::check_gradients is true.
+    //
+    // For finite differencing, each dimension is evaluated at
+    // slightly shifted values; for the case of central difference,
+    // this is what gets evaluated:
+    //
+    //   delta = gradient_check_numeric_derivative_relative_step_size;
    //   f_initial  = f(x)
    //   f_forward  = f((1 + delta) * x)
    //   f_backward = f((1 - delta) * x)
@@ -723,7 +733,7 @@ class CERES_EXPORT Solver {
    // theory a good choice is sqrt(eps) * x, which for doubles means
    // about 1e-8 * x. However, I have found this number too
    // optimistic. This number should be exposed for users to change.
-    double numeric_derivative_relative_step_size;
+    double gradient_check_numeric_derivative_relative_step_size;

    // If true, the user's parameter blocks are updated at the end of
    // every Minimizer iteration, otherwise they are updated when the
@@ -801,6 +811,13 @@ class CERES_EXPORT Solver {
    // Number of times inner iterations were performed.
    int num_inner_iteration_steps;

+    // Total number of iterations inside the line search algorithm
+    // across all invocations. We call these iterations "steps" to
+    // distinguish them from the outer iterations of the line search
+    // and trust region minimizer algorithms which call the line
+    // search algorithm as a subroutine.
+    int num_line_search_steps;
+
    // All times reported below are wall times.

    // When the user calls Solve, before the actual optimization
@@ -32,7 +32,7 @@
 #define CERES_PUBLIC_VERSION_H_

 #define CERES_VERSION_MAJOR 1
-#define CERES_VERSION_MINOR 11
+#define CERES_VERSION_MINOR 12
 #define CERES_VERSION_REVISION 0

 // Classic CPP stringifcation; the extra level of indirection allows the
@@ -46,6 +46,7 @@ namespace internal {
 using std::make_pair;
 using std::pair;
 using std::vector;
+using std::adjacent_find;

 void CompressedRowJacobianWriter::PopulateJacobianRowAndColumnBlockVectors(
    const Program* program, CompressedRowSparseMatrix* jacobian) {
@@ -140,12 +141,21 @@ SparseMatrix* CompressedRowJacobianWriter::CreateJacobian() const {

    // Sort the parameters by their position in the state vector.
    sort(parameter_indices.begin(), parameter_indices.end());
-    CHECK(unique(parameter_indices.begin(), parameter_indices.end()) ==
-          parameter_indices.end())
-          << "Ceres internal error:  "
-          << "Duplicate parameter blocks detected in a cost function. "
-          << "This should never happen. Please report this to "
-          << "the Ceres developers.";
+    if (adjacent_find(parameter_indices.begin(), parameter_indices.end()) !=
+        parameter_indices.end()) {
+      std::string parameter_block_description;
+      for (int j = 0; j < num_parameter_blocks; ++j) {
+        ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
+        parameter_block_description +=
+            parameter_block->ToString() + "\n";
+      }
+      LOG(FATAL) << "Ceres internal error: "
+                 << "Duplicate parameter blocks detected in a cost function. "
+                 << "This should never happen. Please report this to "
+                 << "the Ceres developers.\n"
+                 << "Residual Block: " << residual_block->ToString() << "\n"
+                 << "Parameter Blocks: " << parameter_block_description;
+    }

    // Update the row indices.
    const int num_residuals = residual_block->NumResiduals();
@@ -38,6 +38,7 @@

 namespace ceres {

+using std::make_pair;
 using std::pair;
 using std::vector;

@@ -54,6 +55,12 @@ bool Covariance::Compute(
  return impl_->Compute(covariance_blocks, problem->problem_impl_.get());
 }

+bool Covariance::Compute(
+    const vector<const double*>& parameter_blocks,
+    Problem* problem) {
+  return impl_->Compute(parameter_blocks, problem->problem_impl_.get());
+}
+
 bool Covariance::GetCovarianceBlock(const double* parameter_block1,
                                    const double* parameter_block2,
                                    double* covariance_block) const {
@@ -73,4 +80,20 @@ bool Covariance::GetCovarianceBlockInTangentSpace(
                                                          covariance_block);
 }

+bool Covariance::GetCovarianceMatrix(
+    const vector<const double*>& parameter_blocks,
+    double* covariance_matrix) {
+  return impl_->GetCovarianceMatrixInTangentOrAmbientSpace(parameter_blocks,
+                                                           true,  // ambient
+                                                           covariance_matrix);
+}
+
+bool Covariance::GetCovarianceMatrixInTangentSpace(
+    const std::vector<const double *>& parameter_blocks,
+    double *covariance_matrix) {
+  return impl_->GetCovarianceMatrixInTangentOrAmbientSpace(parameter_blocks,
+                                                           false,  // tangent
+                                                           covariance_matrix);
+}
+
 }  // namespace ceres
@@ -36,6 +36,8 @@

 #include <algorithm>
 #include <cstdlib>
+#include <numeric>
+#include <sstream>
 #include <utility>
 #include <vector>

@@ -43,6 +45,7 @@
 #include "Eigen/SparseQR"
 #include "Eigen/SVD"

+#include "ceres/collections_port.h"
 #include "ceres/compressed_col_sparse_matrix_utils.h"
 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/covariance.h"
@@ -51,6 +54,7 @@
 #include "ceres/map_util.h"
 #include "ceres/parameter_block.h"
 #include "ceres/problem_impl.h"
+#include "ceres/residual_block.h"
 #include "ceres/suitesparse.h"
 #include "ceres/wall_time.h"
 #include "glog/logging.h"
@@ -61,6 +65,7 @@ namespace internal {
 using std::make_pair;
 using std::map;
 using std::pair;
+using std::sort;
 using std::swap;
 using std::vector;

@@ -86,8 +91,38 @@ CovarianceImpl::CovarianceImpl(const Covariance::Options& options)
 CovarianceImpl::~CovarianceImpl() {
 }

+template <typename T> void CheckForDuplicates(vector<T> blocks) {
+  sort(blocks.begin(), blocks.end());
+  typename vector<T>::iterator it =
+      std::adjacent_find(blocks.begin(), blocks.end());
+  if (it != blocks.end()) {
+    // In case there are duplicates, we search for their location.
+    map<T, vector<int> > blocks_map;
+    for (int i = 0; i < blocks.size(); ++i) {
+      blocks_map[blocks[i]].push_back(i);
+    }
+
+    std::ostringstream duplicates;
+    while (it != blocks.end()) {
+      duplicates << "(";
+      for (int i = 0; i < blocks_map[*it].size() - 1; ++i) {
+        duplicates << blocks_map[*it][i] << ", ";
+      }
+      duplicates << blocks_map[*it].back() << ")";
+      it = std::adjacent_find(it + 1, blocks.end());
+      if (it < blocks.end()) {
+        duplicates << " and ";
+      }
+    }
+
+    LOG(FATAL) << "Covariance::Compute called with duplicate blocks at "
+               << "indices " << duplicates.str();
+  }
+}
+
 bool CovarianceImpl::Compute(const CovarianceBlocks& covariance_blocks,
                             ProblemImpl* problem) {
+  CheckForDuplicates<pair<const double*, const double*> >(covariance_blocks);
  problem_ = problem;
  parameter_block_to_row_index_.clear();
  covariance_matrix_.reset(NULL);
@@ -97,6 +132,20 @@ bool CovarianceImpl::Compute(const CovarianceBlocks& covariance_blocks,
  return is_valid_;
 }

+bool CovarianceImpl::Compute(const vector<const double*>& parameter_blocks,
+                             ProblemImpl* problem) {
+  CheckForDuplicates<const double*>(parameter_blocks);
+  CovarianceBlocks covariance_blocks;
+  for (int i = 0; i < parameter_blocks.size(); ++i) {
+    for (int j = i; j < parameter_blocks.size(); ++j) {
+      covariance_blocks.push_back(make_pair(parameter_blocks[i],
+                                            parameter_blocks[j]));
+    }
+  }
+
+  return Compute(covariance_blocks, problem);
+}
+
 bool CovarianceImpl::GetCovarianceBlockInTangentOrAmbientSpace(
    const double* original_parameter_block1,
    const double* original_parameter_block2,
@@ -120,9 +169,17 @@ bool CovarianceImpl::GetCovarianceBlockInTangentOrAmbientSpace(
    ParameterBlock* block2 =
        FindOrDie(parameter_map,
                  const_cast<double*>(original_parameter_block2));
+
    const int block1_size = block1->Size();
    const int block2_size = block2->Size();
-    MatrixRef(covariance_block, block1_size, block2_size).setZero();
+    const int block1_local_size = block1->LocalSize();
+    const int block2_local_size = block2->LocalSize();
+    if (!lift_covariance_to_ambient_space) {
+      MatrixRef(covariance_block, block1_local_size, block2_local_size)
+          .setZero();
+    } else {
+      MatrixRef(covariance_block, block1_size, block2_size).setZero();
+    }
    return true;
  }

@@ -240,6 +297,94 @@ bool CovarianceImpl::GetCovarianceBlockInTangentOrAmbientSpace(
  return true;
 }

+bool CovarianceImpl::GetCovarianceMatrixInTangentOrAmbientSpace(
+    const vector<const double*>& parameters,
+    bool lift_covariance_to_ambient_space,
+    double* covariance_matrix) const {
+  CHECK(is_computed_)
+      << "Covariance::GetCovarianceMatrix called before Covariance::Compute";
+  CHECK(is_valid_)
+      << "Covariance::GetCovarianceMatrix called when Covariance::Compute "
+      << "returned false.";
+
+  const ProblemImpl::ParameterMap& parameter_map = problem_->parameter_map();
+  // For OpenMP compatibility we need to define these vectors in advance
+  const int num_parameters = parameters.size();
+  vector<int> parameter_sizes;
+  vector<int> cum_parameter_size;
+  parameter_sizes.reserve(num_parameters);
+  cum_parameter_size.resize(num_parameters + 1);
+  cum_parameter_size[0] = 0;
+  for (int i = 0; i < num_parameters; ++i) {
+    ParameterBlock* block =
+        FindOrDie(parameter_map, const_cast<double*>(parameters[i]));
+    if (lift_covariance_to_ambient_space) {
+      parameter_sizes.push_back(block->Size());
+    } else {
+      parameter_sizes.push_back(block->LocalSize());
+    }
+  }
+  std::partial_sum(parameter_sizes.begin(), parameter_sizes.end(),
+                   cum_parameter_size.begin() + 1);
+  const int max_covariance_block_size =
+      *std::max_element(parameter_sizes.begin(), parameter_sizes.end());
+  const int covariance_size = cum_parameter_size.back();
+
+  // Assemble the blocks in the covariance matrix.
+  MatrixRef covariance(covariance_matrix, covariance_size, covariance_size);
+  const int num_threads = options_.num_threads;
+  scoped_array<double> workspace(
+      new double[num_threads * max_covariance_block_size *
+                 max_covariance_block_size]);
+
+  bool success = true;
+
+// The collapse() directive is only supported in OpenMP 3.0 and higher. OpenMP
+// 3.0 was released in May 2008 (hence the version number).
+#if _OPENMP >= 200805
+#  pragma omp parallel for num_threads(num_threads) schedule(dynamic) collapse(2)
+#else
+#  pragma omp parallel for num_threads(num_threads) schedule(dynamic)
+#endif
+  for (int i = 0; i < num_parameters; ++i) {
+    for (int j = 0; j < num_parameters; ++j) {
+      // The second loop can't start from j = i for compatibility with OpenMP
+      // collapse command. The conditional serves as a workaround
+      if (j >= i) {
+        int covariance_row_idx = cum_parameter_size[i];
+        int covariance_col_idx = cum_parameter_size[j];
+        int size_i = parameter_sizes[i];
+        int size_j = parameter_sizes[j];
+#ifdef CERES_USE_OPENMP
+        int thread_id = omp_get_thread_num();
+#else
+        int thread_id = 0;
+#endif
+        double* covariance_block =
+            workspace.get() +
+            thread_id * max_covariance_block_size * max_covariance_block_size;
+        if (!GetCovarianceBlockInTangentOrAmbientSpace(
+                parameters[i], parameters[j], lift_covariance_to_ambient_space,
+                covariance_block)) {
+          success = false;
+        }
+
+        covariance.block(covariance_row_idx, covariance_col_idx,
+                         size_i, size_j) =
+            MatrixRef(covariance_block, size_i, size_j);
+
+        if (i != j) {
+          covariance.block(covariance_col_idx, covariance_row_idx,
+                           size_j, size_i) =
+              MatrixRef(covariance_block, size_i, size_j).transpose();
+
+        }
+      }
+    }
+  }
+  return success;
+}
+
 // Determine the sparsity pattern of the covariance matrix based on
 // the block pairs requested by the user.
 bool CovarianceImpl::ComputeCovarianceSparsity(
@@ -252,18 +397,28 @@ bool CovarianceImpl::ComputeCovarianceSparsity(
  vector<double*> all_parameter_blocks;
  problem->GetParameterBlocks(&all_parameter_blocks);
  const ProblemImpl::ParameterMap& parameter_map = problem->parameter_map();
+  HashSet<ParameterBlock*> parameter_blocks_in_use;
+  vector<ResidualBlock*> residual_blocks;
+  problem->GetResidualBlocks(&residual_blocks);
+
+  for (int i = 0; i < residual_blocks.size(); ++i) {
+    ResidualBlock* residual_block = residual_blocks[i];
+    parameter_blocks_in_use.insert(residual_block->parameter_blocks(),
+                                   residual_block->parameter_blocks() +
+                                   residual_block->NumParameterBlocks());
+  }
+
  constant_parameter_blocks_.clear();
  vector<double*>& active_parameter_blocks =
      evaluate_options_.parameter_blocks;
  active_parameter_blocks.clear();
  for (int i = 0; i < all_parameter_blocks.size(); ++i) {
    double* parameter_block = all_parameter_blocks[i];
-
    ParameterBlock* block = FindOrDie(parameter_map, parameter_block);
-    if (block->IsConstant()) {
-      constant_parameter_blocks_.insert(parameter_block);
-    } else {
+    if (!block->IsConstant() && (parameter_blocks_in_use.count(block) > 0)) {
      active_parameter_blocks.push_back(parameter_block);
+    } else {
+      constant_parameter_blocks_.insert(parameter_block);
    }
  }

@@ -386,8 +541,8 @@ bool CovarianceImpl::ComputeCovarianceValues() {
  switch (options_.algorithm_type) {
    case DENSE_SVD:
      return ComputeCovarianceValuesUsingDenseSVD();
-#ifndef CERES_NO_SUITESPARSE
    case SUITE_SPARSE_QR:
+#ifndef CERES_NO_SUITESPARSE
      return ComputeCovarianceValuesUsingSuiteSparseQR();
 #else
      LOG(ERROR) << "SuiteSparse is required to use the "
@@ -624,7 +779,10 @@ bool CovarianceImpl::ComputeCovarianceValuesUsingDenseSVD() {
      if (automatic_truncation) {
        break;
      } else {
-        LOG(ERROR) << "Cholesky factorization of J'J is not reliable. "
+        LOG(ERROR) << "Error: Covariance matrix is near rank deficient "
+                   << "and the user did not specify a non-zero"
+                   << "Covariance::Options::null_space_rank "
+                   << "to enable the computation of a Pseudo-Inverse. "
                   << "Reciprocal condition number: "
                   << singular_value_ratio * singular_value_ratio << " "
                   << "min_reciprocal_condition_number: "
@@ -55,12 +55,21 @@ class CovarianceImpl {
                                  const double*> >& covariance_blocks,
      ProblemImpl* problem);

+  bool Compute(
+      const std::vector<const double*>& parameter_blocks,
+      ProblemImpl* problem);
+
  bool GetCovarianceBlockInTangentOrAmbientSpace(
      const double* parameter_block1,
      const double* parameter_block2,
      bool lift_covariance_to_ambient_space,
      double* covariance_block) const;

+  bool GetCovarianceMatrixInTangentOrAmbientSpace(
+      const std::vector<const double*>& parameters,
+      bool lift_covariance_to_ambient_space,
+      double *covariance_matrix) const;
+
  bool ComputeCovarianceSparsity(
      const std::vector<std::pair<const double*,
                                  const double*> >& covariance_blocks,
@@ -0,0 +1,276 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2016 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wjr@google.com (William Rucklidge),
+//          keir@google.com (Keir Mierle),
+//          dgossow@google.com (David Gossow)
+
+#include "ceres/gradient_checker.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "ceres/is_close.h"
+#include "ceres/stringprintf.h"
+#include "ceres/types.h"
+
+namespace ceres {
+
+using internal::IsClose;
+using internal::StringAppendF;
+using internal::StringPrintf;
+using std::string;
+using std::vector;
+
+namespace {
+// Evaluate the cost function and transform the returned Jacobians to
+// the local space of the respective local parameterizations.
+bool EvaluateCostFunction(
+    const ceres::CostFunction* function,
+    double const* const * parameters,
+    const std::vector<const ceres::LocalParameterization*>&
+        local_parameterizations,
+    Vector* residuals,
+    std::vector<Matrix>* jacobians,
+    std::vector<Matrix>* local_jacobians) {
+  CHECK_NOTNULL(residuals);
+  CHECK_NOTNULL(jacobians);
+  CHECK_NOTNULL(local_jacobians);
+
+  const vector<int32>& block_sizes = function->parameter_block_sizes();
+  const int num_parameter_blocks = block_sizes.size();
+
+  // Allocate Jacobian matrices in local space.
+  local_jacobians->resize(num_parameter_blocks);
+  vector<double*> local_jacobian_data(num_parameter_blocks);
+  for (int i = 0; i < num_parameter_blocks; ++i) {
+    int block_size = block_sizes.at(i);
+    if (local_parameterizations.at(i) != NULL) {
+      block_size = local_parameterizations.at(i)->LocalSize();
+    }
+    local_jacobians->at(i).resize(function->num_residuals(), block_size);
+    local_jacobians->at(i).setZero();
+    local_jacobian_data.at(i) = local_jacobians->at(i).data();
+  }
+
+  // Allocate Jacobian matrices in global space.
+  jacobians->resize(num_parameter_blocks);
+  vector<double*> jacobian_data(num_parameter_blocks);
+  for (int i = 0; i < num_parameter_blocks; ++i) {
+    jacobians->at(i).resize(function->num_residuals(), block_sizes.at(i));
+    jacobians->at(i).setZero();
+    jacobian_data.at(i) = jacobians->at(i).data();
+  }
+
+  // Compute residuals & jacobians.
+  CHECK_NE(0, function->num_residuals());
+  residuals->resize(function->num_residuals());
+  residuals->setZero();
+  if (!function->Evaluate(parameters, residuals->data(),
+                          jacobian_data.data())) {
+    return false;
+  }
+
+  // Convert Jacobians from global to local space.
+  for (size_t i = 0; i < local_jacobians->size(); ++i) {
+    if (local_parameterizations.at(i) == NULL) {
+      local_jacobians->at(i) = jacobians->at(i);
+    } else {
+      int global_size = local_parameterizations.at(i)->GlobalSize();
+      int local_size = local_parameterizations.at(i)->LocalSize();
+      CHECK_EQ(jacobians->at(i).cols(), global_size);
+      Matrix global_J_local(global_size, local_size);
+      local_parameterizations.at(i)->ComputeJacobian(
+          parameters[i], global_J_local.data());
+      local_jacobians->at(i) = jacobians->at(i) * global_J_local;
+    }
+  }
+  return true;
+}
+} // namespace
+
+GradientChecker::GradientChecker(
+      const CostFunction* function,
+      const vector<const LocalParameterization*>* local_parameterizations,
+      const NumericDiffOptions& options) :
+        function_(function) {
+  CHECK_NOTNULL(function);
+  if (local_parameterizations != NULL) {
+    local_parameterizations_ = *local_parameterizations;
+  } else {
+    local_parameterizations_.resize(function->parameter_block_sizes().size(),
+                                    NULL);
+  }
+  DynamicNumericDiffCostFunction<CostFunction, CENTRAL>*
+      finite_diff_cost_function =
+      new DynamicNumericDiffCostFunction<CostFunction, CENTRAL>(
+          function, DO_NOT_TAKE_OWNERSHIP, options);
+  finite_diff_cost_function_.reset(finite_diff_cost_function);
+
+  const vector<int32>& parameter_block_sizes =
+      function->parameter_block_sizes();
+  const int num_parameter_blocks = parameter_block_sizes.size();
+  for (int i = 0; i < num_parameter_blocks; ++i) {
+    finite_diff_cost_function->AddParameterBlock(parameter_block_sizes[i]);
+  }
+  finite_diff_cost_function->SetNumResiduals(function->num_residuals());
+}
+
+bool GradientChecker::Probe(double const* const * parameters,
+                            double relative_precision,
+                            ProbeResults* results_param) const {
+  int num_residuals = function_->num_residuals();
+
+  // Make sure that we have a place to store results, no matter if the user has
+  // provided an output argument.
+  ProbeResults* results;
+  ProbeResults results_local;
+  if (results_param != NULL) {
+    results = results_param;
+    results->residuals.resize(0);
+    results->jacobians.clear();
+    results->numeric_jacobians.clear();
+    results->local_jacobians.clear();
+    results->local_numeric_jacobians.clear();
+    results->error_log.clear();
+  } else {
+    results = &results_local;
+  }
+  results->maximum_relative_error = 0.0;
+  results->return_value = true;
+
+  // Evaluate the derivative using the user supplied code.
+  vector<Matrix>& jacobians = results->jacobians;
+  vector<Matrix>& local_jacobians = results->local_jacobians;
+  if (!EvaluateCostFunction(function_, parameters, local_parameterizations_,
+                       &results->residuals, &jacobians, &local_jacobians)) {
+    results->error_log = "Function evaluation with Jacobians failed.";
+    results->return_value = false;
+  }
+
+  // Evaluate the derivative using numeric derivatives.
+  vector<Matrix>& numeric_jacobians = results->numeric_jacobians;
+  vector<Matrix>& local_numeric_jacobians = results->local_numeric_jacobians;
+  Vector finite_diff_residuals;
+  if (!EvaluateCostFunction(finite_diff_cost_function_.get(), parameters,
+                            local_parameterizations_, &finite_diff_residuals,
+                            &numeric_jacobians, &local_numeric_jacobians)) {
+    results->error_log += "\nFunction evaluation with numerical "
+        "differentiation failed.";
+    results->return_value = false;
+  }
+
+  if (!results->return_value) {
+    return false;
+  }
+
+  for (int i = 0; i < num_residuals; ++i) {
+    if (!IsClose(
+        results->residuals[i],
+        finite_diff_residuals[i],
+        relative_precision,
+        NULL,
+        NULL)) {
+      results->error_log = "Function evaluation with and without Jacobians "
+          "resulted in different residuals.";
+      LOG(INFO) << results->residuals.transpose();
+      LOG(INFO) << finite_diff_residuals.transpose();
+      return false;
+    }
+  }
+
+  // See if any elements have relative error larger than the threshold.
+  int num_bad_jacobian_components = 0;
+  double& worst_relative_error = results->maximum_relative_error;
+  worst_relative_error = 0;
+
+  // Accumulate the error message for all the jacobians, since it won't get
+  // output if there are no bad jacobian components.
+  string error_log;
+  for (int k = 0; k < function_->parameter_block_sizes().size(); k++) {
+    StringAppendF(&error_log,
+                  "========== "
+                  "Jacobian for " "block %d: (%ld by %ld)) "
+                  "==========\n",
+                  k,
+                  static_cast<long>(local_jacobians[k].rows()),
+                  static_cast<long>(local_jacobians[k].cols()));
+    // The funny spacing creates appropriately aligned column headers.
+    error_log +=
+        " block  row  col        user dx/dy    num diff dx/dy         "
+        "abs error    relative error         parameter          residual\n";
+
+    for (int i = 0; i < local_jacobians[k].rows(); i++) {
+      for (int j = 0; j < local_jacobians[k].cols(); j++) {
+        double term_jacobian = local_jacobians[k](i, j);
+        double finite_jacobian = local_numeric_jacobians[k](i, j);
+        double relative_error, absolute_error;
+        bool bad_jacobian_entry =
+            !IsClose(term_jacobian,
+                     finite_jacobian,
+                     relative_precision,
+                     &relative_error,
+                     &absolute_error);
+        worst_relative_error = std::max(worst_relative_error, relative_error);
+
+        StringAppendF(&error_log,
+                      "%6d %4d %4d %17g %17g %17g %17g %17g %17g",
+                      k, i, j,
+                      term_jacobian, finite_jacobian,
+                      absolute_error, relative_error,
+                      parameters[k][j],
+                      results->residuals[i]);
+
+        if (bad_jacobian_entry) {
+          num_bad_jacobian_components++;
+          StringAppendF(
+              &error_log,
+              " ------ (%d,%d,%d) Relative error worse than %g",
+              k, i, j, relative_precision);
+        }
+        error_log += "\n";
+      }
+    }
+  }
+
+  // Since there were some bad errors, dump comprehensive debug info.
+  if (num_bad_jacobian_components) {
+    string header = StringPrintf("\nDetected %d bad Jacobian component(s). "
+        "Worst relative error was %g.\n",
+        num_bad_jacobian_components,
+        worst_relative_error);
+     results->error_log = header + "\n" + error_log;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace ceres
@@ -26,7 +26,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
-// Author: keir@google.com (Keir Mierle)
+// Authors: keir@google.com (Keir Mierle),
+//          dgossow@google.com (David Gossow)

 #include "ceres/gradient_checking_cost_function.h"

@@ -36,7 +37,7 @@
 #include <string>
 #include <vector>

-#include "ceres/cost_function.h"
+#include "ceres/gradient_checker.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/scoped_ptr.h"
 #include "ceres/parameter_block.h"
@@ -59,55 +60,25 @@ using std::vector;

 namespace {

-// True if x and y have an absolute relative difference less than
-// relative_precision and false otherwise. Stores the relative and absolute
-// difference in relative/absolute_error if non-NULL.
-bool IsClose(double x, double y, double relative_precision,
-             double *relative_error,
-             double *absolute_error) {
-  double local_absolute_error;
-  double local_relative_error;
-  if (!absolute_error) {
-    absolute_error = &local_absolute_error;
-  }
-  if (!relative_error) {
-    relative_error = &local_relative_error;
-  }
-  *absolute_error = abs(x - y);
-  *relative_error = *absolute_error / max(abs(x), abs(y));
-  if (x == 0 || y == 0) {
-    // If x or y is exactly zero, then relative difference doesn't have any
-    // meaning. Take the absolute difference instead.
-    *relative_error = *absolute_error;
-  }
-  return abs(*relative_error) < abs(relative_precision);
-}
-
 class GradientCheckingCostFunction : public CostFunction {
 public:
-  GradientCheckingCostFunction(const CostFunction* function,
-                               const NumericDiffOptions& options,
-                               double relative_precision,
-                               const string& extra_info)
+  GradientCheckingCostFunction(
+      const CostFunction* function,
+      const std::vector<const LocalParameterization*>* local_parameterizations,
+      const NumericDiffOptions& options,
+      double relative_precision,
+      const string& extra_info,
+      GradientCheckingIterationCallback* callback)
      : function_(function),
+        gradient_checker_(function, local_parameterizations, options),
        relative_precision_(relative_precision),
-        extra_info_(extra_info) {
-    DynamicNumericDiffCostFunction<CostFunction, CENTRAL>*
-        finite_diff_cost_function =
-        new DynamicNumericDiffCostFunction<CostFunction, CENTRAL>(
-            function,
-            DO_NOT_TAKE_OWNERSHIP,
-            options);
-
+        extra_info_(extra_info),
+        callback_(callback) {
+    CHECK_NOTNULL(callback_);
    const vector<int32>& parameter_block_sizes =
        function->parameter_block_sizes();
-    for (int i = 0; i < parameter_block_sizes.size(); ++i) {
-      finite_diff_cost_function->AddParameterBlock(parameter_block_sizes[i]);
-    }
    *mutable_parameter_block_sizes() = parameter_block_sizes;
    set_num_residuals(function->num_residuals());
-    finite_diff_cost_function->SetNumResiduals(num_residuals());
-    finite_diff_cost_function_.reset(finite_diff_cost_function);
  }

  virtual ~GradientCheckingCostFunction() { }
@@ -120,133 +91,92 @@ class GradientCheckingCostFunction : public CostFunction {
      return function_->Evaluate(parameters, residuals, NULL);
    }

-    int num_residuals = function_->num_residuals();
+    GradientChecker::ProbeResults results;
+    bool okay = gradient_checker_.Probe(parameters,
+                                        relative_precision_,
+                                        &results);

-    // Make space for the jacobians of the two methods.
-    const vector<int32>& block_sizes = function_->parameter_block_sizes();
-    vector<Matrix> term_jacobians(block_sizes.size());
-    vector<Matrix> finite_difference_jacobians(block_sizes.size());
-    vector<double*> term_jacobian_pointers(block_sizes.size());
-    vector<double*> finite_difference_jacobian_pointers(block_sizes.size());
-    for (int i = 0; i < block_sizes.size(); i++) {
-      term_jacobians[i].resize(num_residuals, block_sizes[i]);
-      term_jacobian_pointers[i] = term_jacobians[i].data();
-      finite_difference_jacobians[i].resize(num_residuals, block_sizes[i]);
-      finite_difference_jacobian_pointers[i] =
-          finite_difference_jacobians[i].data();
-    }
-
-    // Evaluate the derivative using the user supplied code.
-    if (!function_->Evaluate(parameters,
-                             residuals,
-                             &term_jacobian_pointers[0])) {
-      LOG(WARNING) << "Function evaluation failed.";
+    // If the cost function returned false, there's nothing we can say about
+    // the gradients.
+    if (results.return_value == false) {
      return false;
    }

-    // Evaluate the derivative using numeric derivatives.
-    finite_diff_cost_function_->Evaluate(
-        parameters,
-        residuals,
-        &finite_difference_jacobian_pointers[0]);
+    // Copy the residuals.
+    const int num_residuals = function_->num_residuals();
+    MatrixRef(residuals, num_residuals, 1) = results.residuals;

-    // See if any elements have relative error larger than the threshold.
-    int num_bad_jacobian_components = 0;
-    double worst_relative_error = 0;
-
-    // Accumulate the error message for all the jacobians, since it won't get
-    // output if there are no bad jacobian components.
-    string m;
+    // Copy the original jacobian blocks into the jacobians array.
+    const vector<int32>& block_sizes = function_->parameter_block_sizes();
    for (int k = 0; k < block_sizes.size(); k++) {
-      // Copy the original jacobian blocks into the jacobians array.
      if (jacobians[k] != NULL) {
        MatrixRef(jacobians[k],
-                  term_jacobians[k].rows(),
-                  term_jacobians[k].cols()) = term_jacobians[k];
-      }
-
-      StringAppendF(&m,
-                    "========== "
-                    "Jacobian for " "block %d: (%ld by %ld)) "
-                    "==========\n",
-                    k,
-                    static_cast<long>(term_jacobians[k].rows()),
-                    static_cast<long>(term_jacobians[k].cols()));
-      // The funny spacing creates appropriately aligned column headers.
-      m += " block  row  col        user dx/dy    num diff dx/dy         "
-           "abs error    relative error         parameter          residual\n";
-
-      for (int i = 0; i < term_jacobians[k].rows(); i++) {
-        for (int j = 0; j < term_jacobians[k].cols(); j++) {
-          double term_jacobian = term_jacobians[k](i, j);
-          double finite_jacobian = finite_difference_jacobians[k](i, j);
-          double relative_error, absolute_error;
-          bool bad_jacobian_entry =
-              !IsClose(term_jacobian,
-                       finite_jacobian,
-                       relative_precision_,
-                       &relative_error,
-                       &absolute_error);
-          worst_relative_error = max(worst_relative_error, relative_error);
-
-          StringAppendF(&m, "%6d %4d %4d %17g %17g %17g %17g %17g %17g",
-                        k, i, j,
-                        term_jacobian, finite_jacobian,
-                        absolute_error, relative_error,
-                        parameters[k][j],
-                        residuals[i]);
-
-          if (bad_jacobian_entry) {
-            num_bad_jacobian_components++;
-            StringAppendF(
-                &m, " ------ (%d,%d,%d) Relative error worse than %g",
-                k, i, j, relative_precision_);
-          }
-          m += "\n";
-        }
+                  results.jacobians[k].rows(),
+                  results.jacobians[k].cols()) = results.jacobians[k];
      }
    }

-    // Since there were some bad errors, dump comprehensive debug info.
-    if (num_bad_jacobian_components) {
-      string header = StringPrintf("Detected %d bad jacobian component(s). "
-                                   "Worst relative error was %g.\n",
-                                   num_bad_jacobian_components,
-                                   worst_relative_error);
-      if (!extra_info_.empty()) {
-        header += "Extra info for this residual: " + extra_info_ + "\n";
-      }
-      LOG(WARNING) << "\n" << header << m;
+    if (!okay) {
+      std::string error_log = "Gradient Error detected!\nExtra info for "
+          "this residual: " + extra_info_ + "\n" + results.error_log;
+      callback_->SetGradientErrorDetected(error_log);
    }
    return true;
  }

 private:
  const CostFunction* function_;
-  internal::scoped_ptr<CostFunction> finite_diff_cost_function_;
+  GradientChecker gradient_checker_;
  double relative_precision_;
  string extra_info_;
+  GradientCheckingIterationCallback* callback_;
 };

 }  // namespace

-CostFunction *CreateGradientCheckingCostFunction(
-    const CostFunction *cost_function,
+GradientCheckingIterationCallback::GradientCheckingIterationCallback()
+    : gradient_error_detected_(false) {
+}
+
+CallbackReturnType GradientCheckingIterationCallback::operator()(
+    const IterationSummary& summary) {
+  if (gradient_error_detected_) {
+    LOG(ERROR)<< "Gradient error detected. Terminating solver.";
+    return SOLVER_ABORT;
+  }
+  return SOLVER_CONTINUE;
+}
+void GradientCheckingIterationCallback::SetGradientErrorDetected(
+    std::string& error_log) {
+  mutex_.Lock();
+  gradient_error_detected_ = true;
+  error_log_ += "\n" + error_log;
+  mutex_.Unlock();
+}
+
+CostFunction* CreateGradientCheckingCostFunction(
+    const CostFunction* cost_function,
+    const std::vector<const LocalParameterization*>* local_parameterizations,
    double relative_step_size,
    double relative_precision,
-    const string& extra_info) {
+    const std::string& extra_info,
+    GradientCheckingIterationCallback* callback) {
  NumericDiffOptions numeric_diff_options;
  numeric_diff_options.relative_step_size = relative_step_size;

  return new GradientCheckingCostFunction(cost_function,
+                                          local_parameterizations,
                                          numeric_diff_options,
-                                          relative_precision,
-                                          extra_info);
+                                          relative_precision, extra_info,
+                                          callback);
 }

-ProblemImpl* CreateGradientCheckingProblemImpl(ProblemImpl* problem_impl,
-                                               double relative_step_size,
-                                               double relative_precision) {
+ProblemImpl* CreateGradientCheckingProblemImpl(
+    ProblemImpl* problem_impl,
+    double relative_step_size,
+    double relative_precision,
+    GradientCheckingIterationCallback* callback) {
+  CHECK_NOTNULL(callback);
  // We create new CostFunctions by wrapping the original CostFunction
  // in a gradient checking CostFunction. So its okay for the
  // ProblemImpl to take ownership of it and destroy it. The
@@ -260,6 +190,9 @@ ProblemImpl* CreateGradientCheckingProblemImpl(ProblemImpl* problem_impl,
  gradient_checking_problem_options.local_parameterization_ownership =
      DO_NOT_TAKE_OWNERSHIP;

+  NumericDiffOptions numeric_diff_options;
+  numeric_diff_options.relative_step_size = relative_step_size;
+
  ProblemImpl* gradient_checking_problem_impl = new ProblemImpl(
      gradient_checking_problem_options);

@@ -294,19 +227,26 @@ ProblemImpl* CreateGradientCheckingProblemImpl(ProblemImpl* problem_impl,
    string extra_info = StringPrintf(
        "Residual block id %d; depends on parameters [", i);
    vector<double*> parameter_blocks;
+    vector<const LocalParameterization*> local_parameterizations;
+    parameter_blocks.reserve(residual_block->NumParameterBlocks());
+    local_parameterizations.reserve(residual_block->NumParameterBlocks());
    for (int j = 0; j < residual_block->NumParameterBlocks(); ++j) {
      ParameterBlock* parameter_block = residual_block->parameter_blocks()[j];
      parameter_blocks.push_back(parameter_block->mutable_user_state());
      StringAppendF(&extra_info, "%p", parameter_block->mutable_user_state());
      extra_info += (j < residual_block->NumParameterBlocks() - 1) ? ", " : "]";
+      local_parameterizations.push_back(problem_impl->GetParameterization(
+          parameter_block->mutable_user_state()));
    }

    // Wrap the original CostFunction in a GradientCheckingCostFunction.
    CostFunction* gradient_checking_cost_function =
-        CreateGradientCheckingCostFunction(residual_block->cost_function(),
-                                           relative_step_size,
-                                           relative_precision,
-                                           extra_info);
+        new GradientCheckingCostFunction(residual_block->cost_function(),
+                                         &local_parameterizations,
+                                         numeric_diff_options,
+                                         relative_precision,
+                                         extra_info,
+                                         callback);

    // The const_cast is necessary because
    // ProblemImpl::AddResidualBlock can potentially take ownership of
@@ -26,7 +26,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 // POSSIBILITY OF SUCH DAMAGE.
 //
-// Author: keir@google.com (Keir Mierle)
+// Authors: keir@google.com (Keir Mierle),
+//          dgossow@google.com (David Gossow)

 #ifndef CERES_INTERNAL_GRADIENT_CHECKING_COST_FUNCTION_H_
 #define CERES_INTERNAL_GRADIENT_CHECKING_COST_FUNCTION_H_
@@ -34,50 +35,76 @@
 #include <string>

 #include "ceres/cost_function.h"
+#include "ceres/iteration_callback.h"
+#include "ceres/local_parameterization.h"
+#include "ceres/mutex.h"

 namespace ceres {
 namespace internal {

 class ProblemImpl;

-// Creates a CostFunction that checks the jacobians that cost_function computes
-// with finite differences. Bad results are logged; required precision is
-// controlled by relative_precision and the numeric differentiation step size is
-// controlled with relative_step_size. See solver.h for a better explanation of
-// relative_step_size. Caller owns result.
-//
-// The condition enforced is that
-//
-//    (J_actual(i, j) - J_numeric(i, j))
-//   ------------------------------------  <  relative_precision
-//   max(J_actual(i, j), J_numeric(i, j))
-//
-// where J_actual(i, j) is the jacobian as computed by the supplied cost
-// function (by the user) and J_numeric is the jacobian as computed by finite
-// differences.
-//
-// Note: This is quite inefficient and is intended only for debugging.
+// Callback that collects information about gradient checking errors, and
+// will abort the solve as soon as an error occurs.
+class GradientCheckingIterationCallback : public IterationCallback {
+ public:
+  GradientCheckingIterationCallback();
+
+  // Will return SOLVER_CONTINUE until a gradient error has been detected,
+  // then return SOLVER_ABORT.
+  virtual CallbackReturnType operator()(const IterationSummary& summary);
+
+  // Notify this that a gradient error has occurred (thread safe).
+  void SetGradientErrorDetected(std::string& error_log);
+
+  // Retrieve error status (not thread safe).
+  bool gradient_error_detected() const { return gradient_error_detected_; }
+  const std::string& error_log() const { return error_log_; }
+ private:
+  bool gradient_error_detected_;
+  std::string error_log_;
+  // Mutex protecting member variables.
+  ceres::internal::Mutex mutex_;
+};
+
+// Creates a CostFunction that checks the Jacobians that cost_function computes
+// with finite differences. This API is only intended for unit tests that intend
+// to  check the functionality of the GradientCheckingCostFunction
+// implementation directly.
 CostFunction* CreateGradientCheckingCostFunction(
    const CostFunction* cost_function,
+    const std::vector<const LocalParameterization*>* local_parameterizations,
    double relative_step_size,
    double relative_precision,
-    const std::string& extra_info);
+    const std::string& extra_info,
+    GradientCheckingIterationCallback* callback);

-// Create a new ProblemImpl object from the input problem_impl, where
-// each CostFunctions in problem_impl are wrapped inside a
-// GradientCheckingCostFunctions. This gives us a ProblemImpl object
-// which checks its derivatives against estimates from numeric
-// differentiation everytime a ResidualBlock is evaluated.
+// Create a new ProblemImpl object from the input problem_impl, where all
+// cost functions are wrapped so that each time their Evaluate method is called,
+// an additional check is performed that compares the Jacobians computed by
+// the original cost function with alternative Jacobians computed using
+// numerical differentiation. If local parameterizations are given for any
+// parameters, the Jacobians will be compared in the local space instead of the
+// ambient space. For details on the gradient checking procedure, see the
+// documentation of the GradientChecker class. If an error is detected in any
+// iteration, the respective cost function will notify the
+// GradientCheckingIterationCallback.
+//
+// The caller owns the returned ProblemImpl object.
+//
+// Note: This is quite inefficient and is intended only for debugging.
 //
 // relative_step_size and relative_precision are parameters to control
 // the numeric differentiation and the relative tolerance between the
 // jacobian computed by the CostFunctions in problem_impl and
-// jacobians obtained by numerically differentiating them. For more
-// details see the documentation for
-// CreateGradientCheckingCostFunction above.
-ProblemImpl* CreateGradientCheckingProblemImpl(ProblemImpl* problem_impl,
-                                               double relative_step_size,
-                                               double relative_precision);
+// jacobians obtained by numerically differentiating them. See the
+// documentation of 'numeric_derivative_relative_step_size' in solver.h for a
+// better explanation.
+ProblemImpl* CreateGradientCheckingProblemImpl(
+    ProblemImpl* problem_impl,
+    double relative_step_size,
+    double relative_precision,
+    GradientCheckingIterationCallback* callback);

 }  // namespace internal
 }  // namespace ceres
@@ -84,6 +84,12 @@ Solver::Options GradientProblemSolverOptionsToSolverOptions(

 }  // namespace

+bool GradientProblemSolver::Options::IsValid(std::string* error) const {
+  const Solver::Options solver_options =
+      GradientProblemSolverOptionsToSolverOptions(*this);
+  return solver_options.IsValid(error);
+}
+
 GradientProblemSolver::~GradientProblemSolver() {
 }

@@ -99,8 +105,6 @@ void GradientProblemSolver::Solve(const GradientProblemSolver::Options& options,
  using internal::SetSummaryFinalCost;

  double start_time = WallTimeInSeconds();
-  Solver::Options solver_options =
-      GradientProblemSolverOptionsToSolverOptions(options);

  *CHECK_NOTNULL(summary) = Summary();
  summary->num_parameters                    = problem.NumParameters();
@@ -112,14 +116,16 @@ void GradientProblemSolver::Solve(const GradientProblemSolver::Options& options,
  summary->nonlinear_conjugate_gradient_type = options.nonlinear_conjugate_gradient_type;  //  NOLINT

  // Check validity
-  if (!solver_options.IsValid(&summary->message)) {
+  if (!options.IsValid(&summary->message)) {
    LOG(ERROR) << "Terminating: " << summary->message;
    return;
  }

-  // Assuming that the parameter blocks in the program have been
-  Minimizer::Options minimizer_options;
-  minimizer_options = Minimizer::Options(solver_options);
+  // TODO(sameeragarwal): This is a bit convoluted, we should be able
+  // to convert to minimizer options directly, but this will do for
+  // now.
+  Minimizer::Options minimizer_options =
+      Minimizer::Options(GradientProblemSolverOptionsToSolverOptions(options));
  minimizer_options.evaluator.reset(new GradientProblemEvaluator(problem));

  scoped_ptr<IterationCallback> logging_callback;
@@ -0,0 +1,59 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2016 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keir@google.com (Keir Mierle), dgossow@google.com (David Gossow)
+
+#include "ceres/is_close.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace ceres {
+namespace internal {
+bool IsClose(double x, double y, double relative_precision,
+             double *relative_error,
+             double *absolute_error) {
+  double local_absolute_error;
+  double local_relative_error;
+  if (!absolute_error) {
+    absolute_error = &local_absolute_error;
+  }
+  if (!relative_error) {
+    relative_error = &local_relative_error;
+  }
+  *absolute_error = std::fabs(x - y);
+  *relative_error = *absolute_error / std::max(std::fabs(x), std::fabs(y));
+  if (x == 0 || y == 0) {
+    // If x or y is exactly zero, then relative difference doesn't have any
+    // meaning. Take the absolute difference instead.
+    *relative_error = *absolute_error;
+  }
+  return *relative_error < std::fabs(relative_precision);
+}
+}  // namespace internal
+}  // namespace ceres
@@ -0,0 +1,51 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2016 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keir@google.com (Keir Mierle), dgossow@google.com (David Gossow)
+//
+// Utility routine for comparing two values.
+
+#ifndef CERES_INTERNAL_IS_CLOSE_H_
+#define CERES_INTERNAL_IS_CLOSE_H_
+
+namespace ceres {
+namespace internal {
+// Returns true if x and y have a relative (unsigned) difference less than
+// relative_precision and false otherwise. Stores the relative and absolute
+// difference in relative/absolute_error if non-NULL. If one of the two values
+// is exactly zero, the absolute difference will be compared, and relative_error
+// will be set to the absolute difference.
+bool IsClose(double x,
+             double y,
+             double relative_precision,
+             double *relative_error,
+             double *absolute_error);
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_IS_CLOSE_H_
@@ -191,6 +191,7 @@ void LineSearchMinimizer::Minimize(const Minimizer::Options& options,
      options.line_search_sufficient_curvature_decrease;
  line_search_options.max_step_expansion =
      options.max_line_search_step_expansion;
+  line_search_options.is_silent = options.is_silent;
  line_search_options.function = &line_search_function;

  scoped_ptr<LineSearch>
@@ -341,10 +342,12 @@ void LineSearchMinimizer::Minimize(const Minimizer::Options& options,
          "as the step was valid when it was selected by the line search.";
      LOG_IF(WARNING, is_not_silent) << "Terminating: " << summary->message;
      break;
-    } else if (!Evaluate(evaluator,
-                         x_plus_delta,
-                         &current_state,
-                         &summary->message)) {
+    }
+
+    if (!Evaluate(evaluator,
+                  x_plus_delta,
+                  &current_state,
+                  &summary->message)) {
      summary->termination_type = FAILURE;
      summary->message =
          "Step failed to evaluate. This should not happen as the step was "
@@ -352,15 +355,17 @@ void LineSearchMinimizer::Minimize(const Minimizer::Options& options,
          summary->message;
      LOG_IF(WARNING, is_not_silent) << "Terminating: " << summary->message;
      break;
-    } else {
-      x = x_plus_delta;
    }

+    // Compute the norm of the step in the ambient space.
+    iteration_summary.step_norm = (x_plus_delta - x).norm();
+    x = x_plus_delta;
+
    iteration_summary.gradient_max_norm = current_state.gradient_max_norm;
    iteration_summary.gradient_norm = sqrt(current_state.gradient_squared_norm);
    iteration_summary.cost_change = previous_state.cost - current_state.cost;
    iteration_summary.cost = current_state.cost + summary->fixed_cost;
-    iteration_summary.step_norm = delta.norm();
+
    iteration_summary.step_is_valid = true;
    iteration_summary.step_is_successful = true;
    iteration_summary.step_size =  current_state.step_size;
@@ -376,6 +381,13 @@ void LineSearchMinimizer::Minimize(const Minimizer::Options& options,
        WallTimeInSeconds() - start_time
        + summary->preprocessor_time_in_seconds;

+    // Iterations inside the line search algorithm are considered
+    // 'steps' in the broader context, to distinguish these inner
+    // iterations from from the outer iterations of the line search
+    // minimizer. The number of line search steps is the total number
+    // of inner line search iterations (or steps) across the entire
+    // minimization.
+    summary->num_line_search_steps +=  line_search_summary.num_iterations;
    summary->line_search_cost_evaluation_time_in_seconds +=
        line_search_summary.cost_evaluation_time_in_seconds;
    summary->line_search_gradient_evaluation_time_in_seconds +=
@@ -30,6 +30,8 @@

 #include "ceres/local_parameterization.h"

+#include <algorithm>
+#include "Eigen/Geometry"
 #include "ceres/householder_vector.h"
 #include "ceres/internal/eigen.h"
 #include "ceres/internal/fixed_array.h"
@@ -87,28 +89,17 @@ bool IdentityParameterization::MultiplyByJacobian(const double* x,
 }

 SubsetParameterization::SubsetParameterization(
-    int size,
-    const vector<int>& constant_parameters)
-    : local_size_(size - constant_parameters.size()),
-      constancy_mask_(size, 0) {
-  CHECK_GT(constant_parameters.size(), 0)
-      << "The set of constant parameters should contain at least "
-      << "one element. If you do not wish to hold any parameters "
-      << "constant, then do not use a SubsetParameterization";
-
+    int size, const vector<int>& constant_parameters)
+    : local_size_(size - constant_parameters.size()), constancy_mask_(size, 0) {
  vector<int> constant = constant_parameters;
-  sort(constant.begin(), constant.end());
-  CHECK(unique(constant.begin(), constant.end()) == constant.end())
+  std::sort(constant.begin(), constant.end());
+  CHECK_GE(constant.front(), 0)
+      << "Indices indicating constant parameter must be greater than zero.";
+  CHECK_LT(constant.back(), size)
+      << "Indices indicating constant parameter must be less than the size "
+      << "of the parameter block.";
+  CHECK(std::adjacent_find(constant.begin(), constant.end()) == constant.end())
      << "The set of constant parameters cannot contain duplicates";
-  CHECK_LT(constant_parameters.size(), size)
-      << "Number of parameters held constant should be less "
-      << "than the size of the parameter block. If you wish "
-      << "to hold the entire parameter block constant, then a "
-      << "efficient way is to directly mark it as constant "
-      << "instead of using a LocalParameterization to do so.";
-  CHECK_GE(*min_element(constant.begin(), constant.end()), 0);
-  CHECK_LT(*max_element(constant.begin(), constant.end()), size);
-
  for (int i = 0; i < constant_parameters.size(); ++i) {
    constancy_mask_[constant_parameters[i]] = 1;
  }
@@ -129,6 +120,10 @@ bool SubsetParameterization::Plus(const double* x,

 bool SubsetParameterization::ComputeJacobian(const double* x,
                                             double* jacobian) const {
+  if (local_size_ == 0) {
+    return true;
+  }
+
  MatrixRef m(jacobian, constancy_mask_.size(), local_size_);
  m.setZero();
  for (int i = 0, j = 0; i < constancy_mask_.size(); ++i) {
@@ -143,6 +138,10 @@ bool SubsetParameterization::MultiplyByJacobian(const double* x,
                                               const int num_rows,
                                               const double* global_matrix,
                                               double* local_matrix) const {
+  if (local_size_ == 0) {
+    return true;
+  }
+
  for (int row = 0; row < num_rows; ++row) {
    for (int col = 0, j = 0; col < constancy_mask_.size(); ++col) {
      if (!constancy_mask_[col]) {
@@ -184,6 +183,39 @@ bool QuaternionParameterization::ComputeJacobian(const double* x,
  return true;
 }

+bool EigenQuaternionParameterization::Plus(const double* x_ptr,
+                                           const double* delta,
+                                           double* x_plus_delta_ptr) const {
+  Eigen::Map<Eigen::Quaterniond> x_plus_delta(x_plus_delta_ptr);
+  Eigen::Map<const Eigen::Quaterniond> x(x_ptr);
+
+  const double norm_delta =
+      sqrt(delta[0] * delta[0] + delta[1] * delta[1] + delta[2] * delta[2]);
+  if (norm_delta > 0.0) {
+    const double sin_delta_by_delta = sin(norm_delta) / norm_delta;
+
+    // Note, in the constructor w is first.
+    Eigen::Quaterniond delta_q(cos(norm_delta),
+                               sin_delta_by_delta * delta[0],
+                               sin_delta_by_delta * delta[1],
+                               sin_delta_by_delta * delta[2]);
+    x_plus_delta = delta_q * x;
+  } else {
+    x_plus_delta = x;
+  }
+
+  return true;
+}
+
+bool EigenQuaternionParameterization::ComputeJacobian(const double* x,
+                                                      double* jacobian) const {
+  jacobian[0] =  x[3]; jacobian[1]  =  x[2]; jacobian[2]  = -x[1];  // NOLINT
+  jacobian[3] = -x[2]; jacobian[4]  =  x[3]; jacobian[5]  =  x[0];  // NOLINT
+  jacobian[6] =  x[1]; jacobian[7]  = -x[0]; jacobian[8]  =  x[3];  // NOLINT
+  jacobian[9] = -x[0]; jacobian[10] = -x[1]; jacobian[11] = -x[2];  // NOLINT
+  return true;
+}
+
 HomogeneousVectorParameterization::HomogeneousVectorParameterization(int size)
    : size_(size) {
  CHECK_GT(size_, 1) << "The size of the homogeneous vector needs to be "
@@ -332,9 +364,9 @@ bool ProductParameterization::ComputeJacobian(const double* x,
    if (!param->ComputeJacobian(x + x_cursor, buffer.get())) {
      return false;
    }
-
    jacobian.block(x_cursor, delta_cursor, global_size, local_size)
        = MatrixRef(buffer.get(), global_size, local_size);
+
    delta_cursor += local_size;
    x_cursor += global_size;
  }
@@ -67,7 +67,7 @@ FindOrDie(const Collection& collection,
 // If the key is present in the map then the value associated with that
 // key is returned, otherwise the value passed as a default is returned.
 template <class Collection>
-const typename Collection::value_type::second_type&
+const typename Collection::value_type::second_type
 FindWithDefault(const Collection& collection,
                const typename Collection::value_type::first_type& key,
                const typename Collection::value_type::second_type& value) {
@@ -161,25 +161,34 @@ class ParameterBlock {
  // does not take ownership of the parameterization.
  void SetParameterization(LocalParameterization* new_parameterization) {
    CHECK(new_parameterization != NULL) << "NULL parameterization invalid.";
+    // Nothing to do if the new parameterization is the same as the
+    // old parameterization.
+    if (new_parameterization == local_parameterization_) {
+      return;
+    }
+
+    CHECK(local_parameterization_ == NULL)
+        << "Can't re-set the local parameterization; it leads to "
+        << "ambiguous ownership. Current local parameterization is: "
+        << local_parameterization_;
+
    CHECK(new_parameterization->GlobalSize() == size_)
        << "Invalid parameterization for parameter block. The parameter block "
        << "has size " << size_ << " while the parameterization has a global "
        << "size of " << new_parameterization->GlobalSize() << ". Did you "
        << "accidentally use the wrong parameter block or parameterization?";
-    if (new_parameterization != local_parameterization_) {
-      CHECK(local_parameterization_ == NULL)
-          << "Can't re-set the local parameterization; it leads to "
-          << "ambiguous ownership.";
-      local_parameterization_ = new_parameterization;
-      local_parameterization_jacobian_.reset(
-          new double[local_parameterization_->GlobalSize() *
-                     local_parameterization_->LocalSize()]);
-      CHECK(UpdateLocalParameterizationJacobian())
-          << "Local parameterization Jacobian computation failed for x: "
-          << ConstVectorRef(state_, Size()).transpose();
-    } else {
-      // Ignore the case that the parameterizations match.
-    }
+
+    CHECK_GT(new_parameterization->LocalSize(), 0)
+        << "Invalid parameterization. Parameterizations must have a positive "
+        << "dimensional tangent space.";
+
+    local_parameterization_ = new_parameterization;
+    local_parameterization_jacobian_.reset(
+        new double[local_parameterization_->GlobalSize() *
+                   local_parameterization_->LocalSize()]);
+    CHECK(UpdateLocalParameterizationJacobian())
+        << "Local parameterization Jacobian computation failed for x: "
+        << ConstVectorRef(state_, Size()).transpose();
  }

  void SetUpperBound(int index, double upper_bound) {
@@ -174,6 +174,10 @@ void Problem::SetParameterBlockVariable(double* values) {
  problem_impl_->SetParameterBlockVariable(values);
 }

+bool Problem::IsParameterBlockConstant(double* values) const {
+  return problem_impl_->IsParameterBlockConstant(values);
+}
+
 void Problem::SetParameterization(
    double* values,
    LocalParameterization* local_parameterization) {
@@ -249,10 +249,11 @@ ResidualBlock* ProblemImpl::AddResidualBlock(
    // Check for duplicate parameter blocks.
    vector<double*> sorted_parameter_blocks(parameter_blocks);
    sort(sorted_parameter_blocks.begin(), sorted_parameter_blocks.end());
-    vector<double*>::const_iterator duplicate_items =
-        unique(sorted_parameter_blocks.begin(),
-               sorted_parameter_blocks.end());
-    if (duplicate_items != sorted_parameter_blocks.end()) {
+    const bool has_duplicate_items =
+        (std::adjacent_find(sorted_parameter_blocks.begin(),
+                            sorted_parameter_blocks.end())
+         != sorted_parameter_blocks.end());
+    if (has_duplicate_items) {
      string blocks;
      for (int i = 0; i < parameter_blocks.size(); ++i) {
        blocks += StringPrintf(" %p ", parameter_blocks[i]);
@@ -572,6 +573,16 @@ void ProblemImpl::SetParameterBlockConstant(double* values) {
  parameter_block->SetConstant();
 }

+bool ProblemImpl::IsParameterBlockConstant(double* values) const {
+  const ParameterBlock* parameter_block =
+      FindWithDefault(parameter_block_map_, values, NULL);
+  CHECK(parameter_block != NULL)
+    << "Parameter block not found: " << values << ". You must add the "
+    << "parameter block to the problem before it can be queried.";
+
+  return parameter_block->IsConstant();
+}
+
 void ProblemImpl::SetParameterBlockVariable(double* values) {
  ParameterBlock* parameter_block =
      FindWithDefault(parameter_block_map_, values, NULL);
@@ -128,6 +128,8 @@ class ProblemImpl {

  void SetParameterBlockConstant(double* values);
  void SetParameterBlockVariable(double* values);
+  bool IsParameterBlockConstant(double* values) const;
+
  void SetParameterization(double* values,
                           LocalParameterization* local_parameterization);
  const LocalParameterization* GetParameterization(double* values) const;
@@ -142,6 +142,11 @@ void OrderingForSparseNormalCholeskyUsingSuiteSparse(
                                                   ordering);
  }

+  VLOG(2) << "Block ordering stats: "
+          << " flops: " << ss.mutable_cc()->fl
+          << " lnz  : " << ss.mutable_cc()->lnz
+          << " anz  : " << ss.mutable_cc()->anz;
+
  ss.Free(block_jacobian_transpose);
 #endif  // CERES_NO_SUITESPARSE
 }
@@ -127,7 +127,7 @@ class ResidualBlock {
  int index() const { return index_; }
  void set_index(int index) { index_ = index; }

-  std::string ToString() {
+  std::string ToString() const {
    return StringPrintf("{residual block; index=%d}", index_);
  }

@@ -33,6 +33,7 @@
 #include <algorithm>
 #include <ctime>
 #include <set>
+#include <sstream>
 #include <vector>

 #include "ceres/block_random_access_dense_matrix.h"
@@ -563,6 +564,12 @@ SparseSchurComplementSolver::SolveReducedLinearSystemUsingEigen(
    // worse than the one computed using the block version of the
    // algorithm.
    simplicial_ldlt_->analyzePattern(eigen_lhs);
+    if (VLOG_IS_ON(2)) {
+      std::stringstream ss;
+      simplicial_ldlt_->dumpMemory(ss);
+      VLOG(2) << "Symbolic Analysis\n"
+              << ss.str();
+    }
    event_logger.AddEvent("Analysis");
    if (simplicial_ldlt_->info() != Eigen::Success) {
      summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
@@ -94,7 +94,7 @@ bool CommonOptionsAreValid(const Solver::Options& options, string* error) {
  OPTION_GT(num_linear_solver_threads, 0);
  if (options.check_gradients) {
    OPTION_GT(gradient_check_relative_precision, 0.0);
-    OPTION_GT(numeric_derivative_relative_step_size, 0.0);
+    OPTION_GT(gradient_check_numeric_derivative_relative_step_size, 0.0);
  }
  return true;
 }
@@ -351,6 +351,7 @@ void PreSolveSummarize(const Solver::Options& options,
  summary->dense_linear_algebra_library_type  = options.dense_linear_algebra_library_type;  //  NOLINT
  summary->dogleg_type                        = options.dogleg_type;
  summary->inner_iteration_time_in_seconds    = 0.0;
+  summary->num_line_search_steps              = 0;
  summary->line_search_cost_evaluation_time_in_seconds = 0.0;
  summary->line_search_gradient_evaluation_time_in_seconds = 0.0;
  summary->line_search_polynomial_minimization_time_in_seconds = 0.0;
@@ -495,21 +496,28 @@ void Solver::Solve(const Solver::Options& options,
  // values provided by the user.
  program->SetParameterBlockStatePtrsToUserStatePtrs();

+  // If gradient_checking is enabled, wrap all cost functions in a
+  // gradient checker and install a callback that terminates if any gradient
+  // error is detected.
  scoped_ptr<internal::ProblemImpl> gradient_checking_problem;
+  internal::GradientCheckingIterationCallback gradient_checking_callback;
+  Solver::Options modified_options = options;
  if (options.check_gradients) {
+    modified_options.callbacks.push_back(&gradient_checking_callback);
    gradient_checking_problem.reset(
        CreateGradientCheckingProblemImpl(
            problem_impl,
-            options.numeric_derivative_relative_step_size,
-            options.gradient_check_relative_precision));
+            options.gradient_check_numeric_derivative_relative_step_size,
+            options.gradient_check_relative_precision,
+            &gradient_checking_callback));
    problem_impl = gradient_checking_problem.get();
    program = problem_impl->mutable_program();
  }

  scoped_ptr<Preprocessor> preprocessor(
-      Preprocessor::Create(options.minimizer_type));
+      Preprocessor::Create(modified_options.minimizer_type));
  PreprocessedProblem pp;
-  const bool status = preprocessor->Preprocess(options, problem_impl, &pp);
+  const bool status = preprocessor->Preprocess(modified_options, problem_impl, &pp);
  summary->fixed_cost = pp.fixed_cost;
  summary->preprocessor_time_in_seconds = WallTimeInSeconds() - start_time;

@@ -534,6 +542,13 @@ void Solver::Solve(const Solver::Options& options,
  summary->postprocessor_time_in_seconds =
      WallTimeInSeconds() - postprocessor_start_time;

+  // If the gradient checker reported an error, we want to report FAILURE
+  // instead of USER_FAILURE and provide the error log.
+  if (gradient_checking_callback.gradient_error_detected()) {
+    summary->termination_type = FAILURE;
+    summary->message = gradient_checking_callback.error_log();
+  }
+
  summary->total_time_in_seconds = WallTimeInSeconds() - start_time;
 }

@@ -556,6 +571,7 @@ Solver::Summary::Summary()
      num_successful_steps(-1),
      num_unsuccessful_steps(-1),
      num_inner_iteration_steps(-1),
+      num_line_search_steps(-1),
      preprocessor_time_in_seconds(-1.0),
      minimizer_time_in_seconds(-1.0),
      postprocessor_time_in_seconds(-1.0),
@@ -696,16 +712,14 @@ string Solver::Summary::FullReport() const {
                  num_linear_solver_threads_given,
                  num_linear_solver_threads_used);

-    if (IsSchurType(linear_solver_type_used)) {
-      string given;
-      StringifyOrdering(linear_solver_ordering_given, &given);
-      string used;
-      StringifyOrdering(linear_solver_ordering_used, &used);
-      StringAppendF(&report,
-                    "Linear solver ordering %22s %24s\n",
-                    given.c_str(),
-                    used.c_str());
-    }
+    string given;
+    StringifyOrdering(linear_solver_ordering_given, &given);
+    string used;
+    StringifyOrdering(linear_solver_ordering_used, &used);
+    StringAppendF(&report,
+                  "Linear solver ordering %22s %24s\n",
+                  given.c_str(),
+                  used.c_str());

    if (inner_iterations_given) {
      StringAppendF(&report,
@@ -784,9 +798,14 @@ string Solver::Summary::FullReport() const {
                  num_inner_iteration_steps);
  }

-  const bool print_line_search_timing_information =
-      minimizer_type == LINE_SEARCH ||
-      (minimizer_type == TRUST_REGION && is_constrained);
+  const bool line_search_used =
+      (minimizer_type == LINE_SEARCH ||
+       (minimizer_type == TRUST_REGION && is_constrained));
+
+  if (line_search_used) {
+    StringAppendF(&report, "Line search steps              % 14d\n",
+                  num_line_search_steps);
+  }

  StringAppendF(&report, "\nTime (in seconds):\n");
  StringAppendF(&report, "Preprocessor        %25.4f\n",
@@ -794,13 +813,13 @@ string Solver::Summary::FullReport() const {

  StringAppendF(&report, "\n  Residual evaluation %23.4f\n",
                residual_evaluation_time_in_seconds);
-  if (print_line_search_timing_information) {
+  if (line_search_used) {
    StringAppendF(&report, "    Line search cost evaluation    %10.4f\n",
                  line_search_cost_evaluation_time_in_seconds);
  }
  StringAppendF(&report, "  Jacobian evaluation %23.4f\n",
                jacobian_evaluation_time_in_seconds);
-  if (print_line_search_timing_information) {
+  if (line_search_used) {
    StringAppendF(&report, "    Line search gradient evaluation    %6.4f\n",
                  line_search_gradient_evaluation_time_in_seconds);
  }
@@ -815,7 +834,7 @@ string Solver::Summary::FullReport() const {
                  inner_iteration_time_in_seconds);
  }

-  if (print_line_search_timing_information) {
+  if (line_search_used) {
    StringAppendF(&report, "  Line search polynomial minimization  %.4f\n",
                  line_search_polynomial_minimization_time_in_seconds);
  }
@@ -33,6 +33,7 @@
 #include <algorithm>
 #include <cstring>
 #include <ctime>
+#include <sstream>

 #include "ceres/compressed_row_sparse_matrix.h"
 #include "ceres/cxsparse.h"
@@ -71,6 +72,12 @@ LinearSolver::Summary SimplicialLDLTSolve(

  if (do_symbolic_analysis) {
    solver->analyzePattern(lhs);
+    if (VLOG_IS_ON(2)) {
+      std::stringstream ss;
+      solver->dumpMemory(ss);
+      VLOG(2) << "Symbolic Analysis\n"
+              << ss.str();
+    }
    event_logger->AddEvent("Analyze");
    if (solver->info() != Eigen::Success) {
      summary.termination_type = LINEAR_SOLVER_FATAL_ERROR;
@@ -43,14 +43,27 @@ namespace internal {

 using std::string;

-#ifdef _MSC_VER
-enum { IS_COMPILER_MSVC = 1 };
-#if _MSC_VER < 1800
-#define va_copy(d, s) ((d) = (s))
-#endif
+// va_copy() was defined in the C99 standard.  However, it did not appear in the
+// C++ standard until C++11.  This means that if Ceres is being compiled with a
+// strict pre-C++11 standard (e.g. -std=c++03), va_copy() will NOT be defined,
+// as we are using the C++ compiler (it would however be defined if we were
+// using the C compiler).  Note however that both GCC & Clang will in fact
+// define va_copy() when compiling for C++ if the C++ standard is not explicitly
+// specified (i.e. no -std=c++<XX> arg), even though it should not strictly be
+// defined unless -std=c++11 (or greater) was passed.
+#if !defined(va_copy)
+#if defined (__GNUC__)
+// On GCC/Clang, if va_copy() is not defined (C++ standard < C++11 explicitly
+// specified), use the internal __va_copy() version, which should be present
+// in even very old GCC versions.
+#define va_copy(d, s) __va_copy(d, s)
 #else
-enum { IS_COMPILER_MSVC = 0 };
-#endif
+// Some older versions of MSVC do not have va_copy(), in which case define it.
+// Although this is required for older MSVC versions, it should also work for
+// other non-GCC/Clang compilers which also do not defined va_copy().
+#define va_copy(d, s) ((d) = (s))
+#endif  // defined (__GNUC__)
+#endif  // !defined(va_copy)

 void StringAppendV(string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer
@@ -71,13 +84,13 @@ void StringAppendV(string* dst, const char* format, va_list ap) {
      return;
    }

-    if (IS_COMPILER_MSVC) {
-      // Error or MSVC running out of space.  MSVC 8.0 and higher
-      // can be asked about space needed with the special idiom below:
-      va_copy(backup_ap, ap);
-      result = vsnprintf(NULL, 0, format, backup_ap);
-      va_end(backup_ap);
-    }
+#if defined (_MSC_VER)
+    // Error or MSVC running out of space.  MSVC 8.0 and higher
+    // can be asked about space needed with the special idiom below:
+    va_copy(backup_ap, ap);
+    result = vsnprintf(NULL, 0, format, backup_ap);
+    va_end(backup_ap);
+#endif

    if (result < 0) {
      // Just an error.
@@ -1,5 +1,5 @@
 // Ceres Solver - A fast non-linear least squares minimizer
-// Copyright 2015 Google Inc. All rights reserved.
+// Copyright 2016 Google Inc. All rights reserved.
 // http://ceres-solver.org/
 //
 // Redistribution and use in source and binary forms, with or without
@@ -31,35 +31,136 @@
 #ifndef CERES_INTERNAL_TRUST_REGION_MINIMIZER_H_
 #define CERES_INTERNAL_TRUST_REGION_MINIMIZER_H_

+#include "ceres/internal/eigen.h"
+#include "ceres/internal/scoped_ptr.h"
 #include "ceres/minimizer.h"
 #include "ceres/solver.h"
+#include "ceres/sparse_matrix.h"
+#include "ceres/trust_region_step_evaluator.h"
+#include "ceres/trust_region_strategy.h"
 #include "ceres/types.h"

 namespace ceres {
 namespace internal {

-// Generic trust region minimization algorithm. The heavy lifting is
-// done by a TrustRegionStrategy object passed in as part of options.
+// Generic trust region minimization algorithm.
 //
 // For example usage, see SolverImpl::Minimize.
 class TrustRegionMinimizer : public Minimizer {
 public:
-  ~TrustRegionMinimizer() {}
+  ~TrustRegionMinimizer();
+
+  // This method is not thread safe.
  virtual void Minimize(const Minimizer::Options& options,
                        double* parameters,
-                        Solver::Summary* summary);
+                        Solver::Summary* solver_summary);

 private:
-  void Init(const Minimizer::Options& options);
-  void EstimateScale(const SparseMatrix& jacobian, double* scale) const;
-  bool MaybeDumpLinearLeastSquaresProblem(const int iteration,
-                                          const SparseMatrix* jacobian,
-                                          const double* residuals,
-                                          const double* step) const;
+  void Init(const Minimizer::Options& options,
+            double* parameters,
+            Solver::Summary* solver_summary);
+  bool IterationZero();
+  bool FinalizeIterationAndCheckIfMinimizerCanContinue();
+  bool ComputeTrustRegionStep();
+
+  bool EvaluateGradientAndJacobian();
+  void ComputeCandidatePointAndEvaluateCost();
+
+  void DoLineSearch(const Vector& x,
+                    const Vector& gradient,
+                    const double cost,
+                    Vector* delta);
+  void DoInnerIterationsIfNeeded();
+
+  bool ParameterToleranceReached();
+  bool FunctionToleranceReached();
+  bool GradientToleranceReached();
+  bool MaxSolverTimeReached();
+  bool MaxSolverIterationsReached();
+  bool MinTrustRegionRadiusReached();
+
+  bool IsStepSuccessful();
+  void HandleUnsuccessfulStep();
+  bool HandleSuccessfulStep();
+  bool HandleInvalidStep();

  Minimizer::Options options_;
+
+  // These pointers are shortcuts to objects passed to the
+  // TrustRegionMinimizer. The TrustRegionMinimizer does not own them.
+  double* parameters_;
+  Solver::Summary* solver_summary_;
+  Evaluator* evaluator_;
+  SparseMatrix* jacobian_;
+  TrustRegionStrategy* strategy_;
+
+  scoped_ptr<TrustRegionStepEvaluator> step_evaluator_;
+
+  bool is_not_silent_;
+  bool inner_iterations_are_enabled_;
+  bool inner_iterations_were_useful_;
+
+  // Summary of the current iteration.
+  IterationSummary iteration_summary_;
+
+  // Dimensionality of the problem in the ambient space.
+  int num_parameters_;
+  // Dimensionality of the problem in the tangent space. This is the
+  // number of columns in the Jacobian.
+  int num_effective_parameters_;
+  // Length of the residual vector, also the number of rows in the Jacobian.
+  int num_residuals_;
+
+  // Current point.
+  Vector x_;
+  // Residuals at x_;
+  Vector residuals_;
+  // Gradient at x_.
+  Vector gradient_;
+  // Solution computed by the inner iterations.
+  Vector inner_iteration_x_;
+  // model_residuals = J * trust_region_step
+  Vector model_residuals_;
+  Vector negative_gradient_;
+  // projected_gradient_step = Plus(x, -gradient), an intermediate
+  // quantity used to compute the projected gradient norm.
+  Vector projected_gradient_step_;
+  // The step computed by the trust region strategy. If Jacobi scaling
+  // is enabled, this is a vector in the scaled space.
+  Vector trust_region_step_;
+  // The current proposal for how far the trust region algorithm
+  // thinks we should move. In the most basic case, it is just the
+  // trust_region_step_ with the Jacobi scaling undone. If bounds
+  // constraints are present, then it is the result of the projected
+  // line search.
+  Vector delta_;
+  // candidate_x  = Plus(x, delta)
+  Vector candidate_x_;
+  // Scaling vector to scale the columns of the Jacobian.
+  Vector jacobian_scaling_;
+
+  // Euclidean norm of x_.
+  double x_norm_;
+  // Cost at x_.
+  double x_cost_;
+  // Minimum cost encountered up till now.
+  double minimum_cost_;
+  // How much did the trust region strategy reduce the cost of the
+  // linearized Gauss-Newton model.
+  double model_cost_change_;
+  // Cost at candidate_x_.
+  double candidate_cost_;
+
+  // Time at which the minimizer was started.
+  double start_time_in_secs_;
+  // Time at which the current iteration was started.
+  double iteration_start_time_in_secs_;
+  // Number of consecutive steps where the minimizer loop computed a
+  // numerically invalid step.
+  int num_consecutive_invalid_steps_;
 };

 }  // namespace internal
 }  // namespace ceres
+
 #endif  // CERES_INTERNAL_TRUST_REGION_MINIMIZER_H_
@@ -0,0 +1,107 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2016 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: sameeragarwal@google.com (Sameer Agarwal)
+
+#include <algorithm>
+#include "ceres/trust_region_step_evaluator.h"
+#include "glog/logging.h"
+
+namespace ceres {
+namespace internal {
+
+TrustRegionStepEvaluator::TrustRegionStepEvaluator(
+    const double initial_cost,
+    const int max_consecutive_nonmonotonic_steps)
+    : max_consecutive_nonmonotonic_steps_(max_consecutive_nonmonotonic_steps),
+      minimum_cost_(initial_cost),
+      current_cost_(initial_cost),
+      reference_cost_(initial_cost),
+      candidate_cost_(initial_cost),
+      accumulated_reference_model_cost_change_(0.0),
+      accumulated_candidate_model_cost_change_(0.0),
+      num_consecutive_nonmonotonic_steps_(0){
+}
+
+double TrustRegionStepEvaluator::StepQuality(
+    const double cost,
+    const double model_cost_change) const {
+  const double relative_decrease = (current_cost_ - cost) / model_cost_change;
+  const double historical_relative_decrease =
+      (reference_cost_ - cost) /
+      (accumulated_reference_model_cost_change_ + model_cost_change);
+  return std::max(relative_decrease, historical_relative_decrease);
+}
+
+void TrustRegionStepEvaluator::StepAccepted(
+    const double cost,
+    const double model_cost_change) {
+  // Algorithm 10.1.2 from Trust Region Methods by Conn, Gould &
+  // Toint.
+  //
+  // Step 3a
+  current_cost_ = cost;
+  accumulated_candidate_model_cost_change_ += model_cost_change;
+  accumulated_reference_model_cost_change_ += model_cost_change;
+
+  // Step 3b.
+  if (current_cost_ < minimum_cost_) {
+    minimum_cost_ = current_cost_;
+    num_consecutive_nonmonotonic_steps_ = 0;
+    candidate_cost_ = current_cost_;
+    accumulated_candidate_model_cost_change_ = 0.0;
+  } else {
+    // Step 3c.
+    ++num_consecutive_nonmonotonic_steps_;
+    if (current_cost_ > candidate_cost_) {
+      candidate_cost_ = current_cost_;
+      accumulated_candidate_model_cost_change_ = 0.0;
+    }
+  }
+
+  // Step 3d.
+  //
+  // At this point we have made too many non-monotonic steps and
+  // we are going to reset the value of the reference iterate so
+  // as to force the algorithm to descend.
+  //
+  // Note: In the original algorithm by Toint, this step was only
+  // executed if the step was non-monotonic, but that would not handle
+  // the case of max_consecutive_nonmonotonic_steps = 0. The small
+  // modification of doing this always handles that corner case
+  // correctly.
+  if (num_consecutive_nonmonotonic_steps_ ==
+      max_consecutive_nonmonotonic_steps_) {
+    reference_cost_ = candidate_cost_;
+    accumulated_reference_model_cost_change_ =
+        accumulated_candidate_model_cost_change_;
+  }
+}
+
+}  // namespace internal
+}  // namespace ceres
@@ -0,0 +1,122 @@
+// Ceres Solver - A fast non-linear least squares minimizer
+// Copyright 2016 Google Inc. All rights reserved.
+// http://ceres-solver.org/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// * Neither the name of Google Inc. nor the names of its contributors may be
+//   used to endorse or promote products derived from this software without
+//   specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: sameeragarwal@google.com (Sameer Agarwal)
+
+#ifndef CERES_INTERNAL_TRUST_REGION_STEP_EVALUATOR_H_
+#define CERES_INTERNAL_TRUST_REGION_STEP_EVALUATOR_H_
+
+namespace ceres {
+namespace internal {
+
+// The job of the TrustRegionStepEvaluator is to evaluate the quality
+// of a step, i.e., how the cost of a step compares with the reduction
+// in the objective of the trust region problem.
+//
+// Classic trust region methods are descent methods, in that they only
+// accept a point if it strictly reduces the value of the objective
+// function. They do this by measuring the quality of a step as
+//
+//   cost_change / model_cost_change.
+//
+// Relaxing the monotonic descent requirement allows the algorithm to
+// be more efficient in the long term at the cost of some local
+// increase in the value of the objective function.
+//
+// This is because allowing for non-decreasing objective function
+// values in a principled manner allows the algorithm to "jump over
+// boulders" as the method is not restricted to move into narrow
+// valleys while preserving its convergence properties.
+//
+// The parameter max_consecutive_nonmonotonic_steps controls the
+// window size used by the step selection algorithm to accept
+// non-monotonic steps. Setting this parameter to zero, recovers the
+// classic montonic descent algorithm.
+//
+// Based on algorithm 10.1.2 (page 357) of "Trust Region
+// Methods" by Conn Gould & Toint, or equations 33-40 of
+// "Non-monotone trust-region algorithms for nonlinear
+// optimization subject to convex constraints" by Phil Toint,
+// Mathematical Programming, 77, 1997.
+//
+// Example usage:
+//
+// TrustRegionStepEvaluator* step_evaluator = ...
+//
+// cost = ... // Compute the non-linear objective function value.
+// model_cost_change = ... // Change in the value of the trust region objective.
+// if (step_evaluator->StepQuality(cost, model_cost_change) > threshold) {
+//   x = x + delta;
+//   step_evaluator->StepAccepted(cost, model_cost_change);
+// }
+class TrustRegionStepEvaluator {
+ public:
+  // initial_cost is as the name implies the cost of the starting
+  // state of the trust region minimizer.
+  //
+  // max_consecutive_nonmonotonic_steps controls the window size used
+  // by the step selection algorithm to accept non-monotonic
+  // steps. Setting this parameter to zero, recovers the classic
+  // montonic descent algorithm.
+  TrustRegionStepEvaluator(double initial_cost,
+                           int max_consecutive_nonmonotonic_steps);
+
+  // Return the quality of the step given its cost and the decrease in
+  // the cost of the model. model_cost_change has to be positive.
+  double StepQuality(double cost, double model_cost_change) const;
+
+  // Inform the step evaluator that a step with the given cost and
+  // model_cost_change has been accepted by the trust region
+  // minimizer.
+  void StepAccepted(double cost, double model_cost_change);
+
+ private:
+  const int max_consecutive_nonmonotonic_steps_;
+  // The minimum cost encountered up till now.
+  double minimum_cost_;
+  // The current cost of the trust region minimizer as informed by the
+  // last call to StepAccepted.
+  double current_cost_;
+  double reference_cost_;
+  double candidate_cost_;
+  // Accumulated model cost since the last time the reference model
+  // cost was updated, i.e., when a step with cost less than the
+  // current known minimum cost is accepted.
+  double accumulated_reference_model_cost_change_;
+  // Accumulated model cost since the last time the candidate model
+  // cost was updated, i.e., a non-monotonic step was taken with a
+  // cost that was greater than the current candidate cost.
+  double accumulated_candidate_model_cost_change_;
+  // Number of steps taken since the last time minimum_cost was updated.
+  int num_consecutive_nonmonotonic_steps_;
+};
+
+}  // namespace internal
+}  // namespace ceres
+
+#endif  // CERES_INTERNAL_TRUST_REGION_STEP_EVALUATOR_H_
@@ -86,20 +86,20 @@ class TrustRegionStrategy {
  struct PerSolveOptions {
    PerSolveOptions()
        : eta(0),
-          dump_filename_base(""),
          dump_format_type(TEXTFILE) {
    }

    // Forcing sequence for inexact solves.
    double eta;

+    DumpFormatType dump_format_type;
+
    // If non-empty and dump_format_type is not CONSOLE, the trust
    // regions strategy will write the linear system to file(s) with
    // name starting with dump_filename_base.  If dump_format_type is
    // CONSOLE then dump_filename_base will be ignored and the linear
    // system will be written to the standard error.
    std::string dump_filename_base;
-    DumpFormatType dump_format_type;
  };

  struct Summary {
@@ -21,10 +21,10 @@ set(INC
 )

 set(SRC
-	range_tree.hh
-	range_tree_c_api.h
+	range_tree.h
+	intern/generic_alloc_impl.h

-	range_tree_c_api.cc
+	intern/range_tree.c
 )

 blender_add_lib(extern_rangetree "${SRC}" "${INC}" "")
@@ -1,5 +1,5 @@
 Project: RangeTree
-URL: https://github.com/nicholasbishop/RangeTree
-License: GPLv2+
-Upstream version: c4ecf6bb7dfd
+URL: https://github.com/ideasman42/rangetree-c
+License: Apache 2.0
+Upstream version: 40ebed8aa209
 Local modifications: None
@@ -1,13 +0,0 @@
-* Overview
-  Basic class for storing non-overlapping scalar ranges. Underlying
-  representation is a C++ STL set for fast lookups.
-
-* License
-  GPL version 2 or later (see COPYING)
-
-* Author Note
-  This implementation is intended for storing free unique IDs in a new
-  undo system for BMesh in Blender, but could be useful elsewhere.
-
-* Website
-  https://github.com/nicholasbishop/RangeTree
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "Apache License")
+ * with the following modification; you may not use this file except in
+ * compliance with the Apache License and the following modification to it:
+ * Section 6. Trademarks. is deleted and replaced with:
+ *
+ * 6. Trademarks. This License does not grant permission to use the trade
+ *   names, trademarks, service marks, or product names of the Licensor
+ *   and its affiliates, except as required to comply with Section 4(c) of
+ *   the License and to reproduce the content of the NOTICE file.
+ *
+ * You may obtain a copy of the Apache License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Apache License with the above modification is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the Apache License for the specific
+ * language governing permissions and limitations under the Apache License.
+ */
+
+/**
+ * Simple Memory Chunking Allocator
+ * ================================
+ *
+ * Defines need to be set:
+ * - #TPOOL_IMPL_PREFIX: Prefix to use for the API.
+ * - #TPOOL_ALLOC_TYPE: Struct type this pool handles.
+ * - #TPOOL_STRUCT: Name for pool struct name.
+ * - #TPOOL_CHUNK_SIZE: Chunk size (optional), use 64kb when not defined.
+ *
+ * \note #TPOOL_ALLOC_TYPE must be at least ``sizeof(void *)``.
+ *
+ * Defines the API, uses #TPOOL_IMPL_PREFIX to prefix each function.
+ *
+ * - *_pool_create()
+ * - *_pool_destroy()
+ * - *_pool_clear()
+ *
+ * - *_pool_elem_alloc()
+ * - *_pool_elem_calloc()
+ * - *_pool_elem_free()
+ */
+
+/* check we're not building directly */
+#if !defined(TPOOL_IMPL_PREFIX) || \
+    !defined(TPOOL_ALLOC_TYPE) || \
+    !defined(TPOOL_STRUCT)
+#  error "This file can't be compiled directly, include in another source file"
+#endif
+
+#define _CONCAT_AUX(MACRO_ARG1, MACRO_ARG2) MACRO_ARG1 ## MACRO_ARG2
+#define _CONCAT(MACRO_ARG1, MACRO_ARG2) _CONCAT_AUX(MACRO_ARG1, MACRO_ARG2)
+#define _TPOOL_PREFIX(id) _CONCAT(TPOOL_IMPL_PREFIX, _##id)
+
+/* local identifiers */
+#define pool_create		_TPOOL_PREFIX(pool_create)
+#define pool_destroy	_TPOOL_PREFIX(pool_destroy)
+#define pool_clear		_TPOOL_PREFIX(pool_clear)
+
+#define pool_elem_alloc		_TPOOL_PREFIX(pool_elem_alloc)
+#define pool_elem_calloc	_TPOOL_PREFIX(pool_elem_calloc)
+#define pool_elem_free		_TPOOL_PREFIX(pool_elem_free)
+
+/* private identifiers (only for this file, undefine after) */
+#define pool_alloc_chunk	_TPOOL_PREFIX(pool_alloc_chunk)
+#define TPoolChunk			_TPOOL_PREFIX(TPoolChunk)
+#define TPoolChunkElemFree	_TPOOL_PREFIX(TPoolChunkElemFree)
+
+#ifndef TPOOL_CHUNK_SIZE
+#define  TPOOL_CHUNK_SIZE (1 << 16)  /* 64kb */
+#define _TPOOL_CHUNK_SIZE_UNDEF
+#endif
+
+#ifndef UNLIKELY
+#  ifdef __GNUC__
+#    define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#  else
+#    define UNLIKELY(x)     (x)
+#  endif
+#endif
+
+#ifdef __GNUC__
+#  define MAYBE_UNUSED __attribute__((unused))
+#else
+#  define MAYBE_UNUSED
+#endif
+
+
+struct TPoolChunk {
+	struct TPoolChunk *prev;
+	unsigned int    size;
+	unsigned int    bufsize;
+	TPOOL_ALLOC_TYPE buf[0];
+};
+
+struct TPoolChunkElemFree {
+	struct TPoolChunkElemFree *next;
+};
+
+struct TPOOL_STRUCT {
+	/* Always keep at least one chunk (never NULL) */
+	struct TPoolChunk *chunk;
+	/* when NULL, allocate a new chunk */
+	struct TPoolChunkElemFree *free;
+};
+
+/**
+ * Number of elems to include per #TPoolChunk when no reserved size is passed,
+ * or we allocate past the reserved number.
+ *
+ * \note Optimize number for 64kb allocs.
+ */
+#define _TPOOL_CHUNK_DEFAULT_NUM \
+	(((1 << 16) - sizeof(struct TPoolChunk)) / sizeof(TPOOL_ALLOC_TYPE))
+
+
+/** \name Internal Memory Management
+ * \{ */
+
+static struct TPoolChunk *pool_alloc_chunk(
+        unsigned int tot_elems, struct TPoolChunk *chunk_prev)
+{
+	struct TPoolChunk *chunk = malloc(
+	        sizeof(struct TPoolChunk) + (sizeof(TPOOL_ALLOC_TYPE) * tot_elems));
+	chunk->prev = chunk_prev;
+	chunk->bufsize = tot_elems;
+	chunk->size = 0;
+	return chunk;
+}
+
+static TPOOL_ALLOC_TYPE *pool_elem_alloc(struct TPOOL_STRUCT *pool)
+{
+	TPOOL_ALLOC_TYPE *elem;
+
+	if (pool->free) {
+		elem = (TPOOL_ALLOC_TYPE *)pool->free;
+		pool->free = pool->free->next;
+	}
+	else {
+		struct TPoolChunk *chunk = pool->chunk;
+		if (UNLIKELY(chunk->size == chunk->bufsize)) {
+			chunk = pool->chunk = pool_alloc_chunk(_TPOOL_CHUNK_DEFAULT_NUM, chunk);
+		}
+		elem = &chunk->buf[chunk->size++];
+	}
+
+	return elem;
+}
+
+MAYBE_UNUSED
+static TPOOL_ALLOC_TYPE *pool_elem_calloc(struct TPOOL_STRUCT *pool)
+{
+	TPOOL_ALLOC_TYPE *elem = pool_elem_alloc(pool);
+	memset(elem, 0, sizeof(*elem));
+	return elem;
+}
+
+static void pool_elem_free(struct TPOOL_STRUCT *pool, TPOOL_ALLOC_TYPE *elem)
+{
+	struct TPoolChunkElemFree *elem_free = (struct TPoolChunkElemFree *)elem;
+	elem_free->next = pool->free;
+	pool->free = elem_free;
+}
+
+static void pool_create(struct TPOOL_STRUCT *pool, unsigned int tot_reserve)
+{
+	pool->chunk = pool_alloc_chunk((tot_reserve > 1) ? tot_reserve : _TPOOL_CHUNK_DEFAULT_NUM, NULL);
+	pool->free = NULL;
+}
+
+MAYBE_UNUSED
+static void pool_clear(struct TPOOL_STRUCT *pool)
+{
+	/* Remove all except the last chunk */
+	while (pool->chunk->prev) {
+		struct TPoolChunk *chunk_prev = pool->chunk->prev;
+		free(pool->chunk);
+		pool->chunk = chunk_prev;
+	}
+	pool->chunk->size = 0;
+	pool->free = NULL;
+}
+
+static void pool_destroy(struct TPOOL_STRUCT *pool)
+{
+	struct TPoolChunk *chunk = pool->chunk;
+	do {
+		struct TPoolChunk *chunk_prev;
+		chunk_prev = chunk->prev;
+		free(chunk);
+		chunk = chunk_prev;
+	} while (chunk);
+
+	pool->chunk = NULL;
+	pool->free = NULL;
+}
+
+/** \} */
+
+#undef _TPOOL_CHUNK_DEFAULT_NUM
+#undef _CONCAT_AUX
+#undef _CONCAT
+#undef _TPOOL_PREFIX
+
+#undef TPoolChunk
+#undef TPoolChunkElemFree
+
+#ifdef _TPOOL_CHUNK_SIZE_UNDEF
+#  undef  TPOOL_CHUNK_SIZE
+#  undef _TPOOL_CHUNK_SIZE_UNDEF
+#endif
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2016, Campbell Barton.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "Apache License")
+ * with the following modification; you may not use this file except in
+ * compliance with the Apache License and the following modification to it:
+ * Section 6. Trademarks. is deleted and replaced with:
+ *
+ * 6. Trademarks. This License does not grant permission to use the trade
+ *   names, trademarks, service marks, or product names of the Licensor
+ *   and its affiliates, except as required to comply with Section 4(c) of
+ *   the License and to reproduce the content of the NOTICE file.
+ *
+ * You may obtain a copy of the Apache License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Apache License with the above modification is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the Apache License for the specific
+ * language governing permissions and limitations under the Apache License.
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include <assert.h>
+
+#include "range_tree.h"
+
+typedef unsigned int uint;
+
+/* Use binary-tree for lookups, else fallback to full search */
+#define USE_BTREE
+/* Use memory pool for nodes, else do individual allocations */
+#define USE_TPOOL
+
+/* Node representing a range in the RangeTreeUInt. */
+typedef struct Node {
+	struct Node *next, *prev;
+
+	/* range (inclusive) */
+	uint min, max;
+
+#ifdef USE_BTREE
+	/* Left leaning red-black tree, for reference implementation see:
+	 * https://gitlab.com/ideasman42/btree-mini-py */
+	struct Node *left, *right;
+	/* RED/BLACK */
+	bool color;
+#endif
+} Node;
+
+#ifdef USE_TPOOL
+/* rt_pool_* pool allocator */
+#define TPOOL_IMPL_PREFIX  rt_node
+#define TPOOL_ALLOC_TYPE   Node
+#define TPOOL_STRUCT       ElemPool_Node
+#include "generic_alloc_impl.h"
+#undef TPOOL_IMPL_PREFIX
+#undef TPOOL_ALLOC_TYPE
+#undef TPOOL_STRUCT
+#endif  /* USE_TPOOL */
+
+typedef struct LinkedList {
+	Node *first, *last;
+} LinkedList;
+
+typedef struct RangeTreeUInt {
+	uint range[2];
+	LinkedList list;
+#ifdef USE_BTREE
+	Node *root;
+#endif
+#ifdef USE_TPOOL
+	struct ElemPool_Node epool;
+#endif
+} RangeTreeUInt;
+
+/* ------------------------------------------------------------------------- */
+/* List API */
+
+static void list_push_front(LinkedList *list, Node *node)
+{
+	if (list->first != NULL) {
+		node->next = list->first;
+		node->next->prev = node;
+		node->prev = NULL;
+	}
+	else {
+		list->last = node;
+	}
+	list->first = node;
+}
+
+static void list_push_back(LinkedList *list, Node *node)
+{
+	if (list->first != NULL) {
+		node->prev = list->last;
+		node->prev->next = node;
+		node->next = NULL;
+	}
+	else {
+		list->first = node;
+	}
+	list->last = node;
+}
+
+static void list_push_after(LinkedList *list, Node *node_prev, Node *node_new)
+{
+	/* node_new before node_next */
+
+	/* empty list */
+	if (list->first == NULL) {
+		list->first = node_new;
+		list->last = node_new;
+		return;
+	}
+
+	/* insert at head of list */
+	if (node_prev == NULL) {
+		node_new->prev = NULL;
+		node_new->next = list->first;
+		node_new->next->prev = node_new;
+		list->first = node_new;
+		return;
+	}
+
+	/* at end of list */
+	if (list->last == node_prev) {
+		list->last = node_new;
+	}
+
+	node_new->next = node_prev->next;
+	node_new->prev = node_prev;
+	node_prev->next = node_new;
+	if (node_new->next) {
+		node_new->next->prev = node_new;
+	}
+}
+
+static void list_push_before(LinkedList *list, Node *node_next, Node *node_new)
+{
+	/* node_new before node_next */
+
+	/* empty list */
+	if (list->first == NULL) {
+		list->first = node_new;
+		list->last = node_new;
+		return;
+	}
+
+	/* insert at end of list */
+	if (node_next == NULL) {
+		node_new->prev = list->last;
+		node_new->next = NULL;
+		list->last->next = node_new;
+		list->last = node_new;
+		return;
+	}
+
+	/* at beginning of list */
+	if (list->first == node_next) {
+		list->first = node_new;
+	}
+
+	node_new->next = node_next;
+	node_new->prev = node_next->prev;
+	node_next->prev = node_new;
+	if (node_new->prev) {
+		node_new->prev->next = node_new;
+	}
+}
+
+static void list_remove(LinkedList *list, Node *node)
+{
+	if (node->next != NULL) {
+		node->next->prev = node->prev;
+	}
+	if (node->prev != NULL) {
+		node->prev->next = node->next;
+	}
+
+	if (list->last == node) {
+		list->last = node->prev;
+	}
+	if (list->first == node) {
+		list->first = node->next;
+	}
+}
+
+static void list_clear(LinkedList *list)
+{
+	list->first = NULL;
+	list->last = NULL;
+}
+
+/* end list API */
+
+
+/* forward declarations */
+static void rt_node_free(RangeTreeUInt *rt, Node *node);
+
+
+#ifdef USE_BTREE
+
+#ifdef DEBUG
+static bool rb_is_balanced_root(const Node *root);
+#endif
+
+/* ------------------------------------------------------------------------- */
+/* Internal BTree API
+ *
+ * Left-leaning red-black tree.
+ */
+
+/* use minimum, could use max too since nodes never overlap */
+#define KEY(n) ((n)->min)
+
+enum {
+	RED = 0,
+	BLACK = 1,
+};
+
+
+static bool is_red(const Node *node)
+{
+	return (node && (node->color == RED));
+}
+
+static int key_cmp(uint key1, uint key2)
+{
+	return (key1 == key2) ? 0 : ((key1 < key2) ? -1 : 1);
+}
+
+/* removed from the tree */
+static void rb_node_invalidate(Node *node)
+{
+#ifdef DEBUG
+	node->left = NULL;
+	node->right = NULL;
+	node->color = false;
+#else
+	(void)node;
+#endif
+}
+
+static void rb_flip_color(Node *node)
+{
+	node->color ^= 1;
+	node->left->color ^= 1;
+	node->right->color ^= 1;
+}
+
+static Node *rb_rotate_left(Node *left)
+{
+	/* Make a right-leaning 3-node lean to the left. */
+	Node *right = left->right;
+	left->right = right->left;
+	right->left = left;
+	right->color = left->color;
+	left->color = RED;
+	return right;
+}
+
+static Node *rb_rotate_right(Node *right)
+{
+	/* Make a left-leaning 3-node lean to the right. */
+	Node *left = right->left;
+	right->left = left->right;
+	left->right = right;
+	left->color = right->color;
+	right->color = RED;
+	return left;
+}
+
+/* Fixup colors when insert happened */
+static Node *rb_fixup_insert(Node *node)
+{
+	if (is_red(node->right) && !is_red(node->left)) {
+		node = rb_rotate_left(node);
+	}
+	if (is_red(node->left) && is_red(node->left->left)) {
+		node = rb_rotate_right(node);
+	}
+
+	if (is_red(node->left) && is_red(node->right)) {
+		rb_flip_color(node);
+	}
+
+	return node;
+}
+
+static Node *rb_insert_recursive(Node *node, Node *node_to_insert)
+{
+	if (node == NULL) {
+		return node_to_insert;
+	}
+
+	const int cmp = key_cmp(KEY(node_to_insert), KEY(node));
+	if (cmp == 0) {
+		/* caller ensures no collisions */
+		assert(0);
+	}
+	else if (cmp == -1) {
+		node->left = rb_insert_recursive(node->left, node_to_insert);
+	}
+	else {
+		node->right = rb_insert_recursive(node->right, node_to_insert);
+	}
+
+	return rb_fixup_insert(node);
+}
+
+static Node *rb_insert_root(Node *root, Node *node_to_insert)
+{
+	root = rb_insert_recursive(root, node_to_insert);
+	root->color = BLACK;
+	return root;
+}
+
+static Node *rb_move_red_to_left(Node *node)
+{
+	/* Assuming that h is red and both h->left and h->left->left
+	 * are black, make h->left or one of its children red.
+	 */
+	rb_flip_color(node);
+	if (node->right && is_red(node->right->left)) {
+		node->right = rb_rotate_right(node->right);
+		node = rb_rotate_left(node);
+		rb_flip_color(node);
+	}
+	return node;
+}
+
+static Node *rb_move_red_to_right(Node *node)
+{
+	/* Assuming that h is red and both h->right and h->right->left
+	 * are black, make h->right or one of its children red.
+	 */
+	rb_flip_color(node);
+	if (node->left && is_red(node->left->left)) {
+		node = rb_rotate_right(node);
+		rb_flip_color(node);
+	}
+	return node;
+}
+
+/* Fixup colors when remove happened */
+static Node *rb_fixup_remove(Node *node)
+{
+	if (is_red(node->right)) {
+		node = rb_rotate_left(node);
+	}
+	if (is_red(node->left) && is_red(node->left->left)) {
+		node = rb_rotate_right(node);
+	}
+	if (is_red(node->left) && is_red(node->right)) {
+		rb_flip_color(node);
+	}
+	return node;
+}
+
+static Node *rb_pop_min_recursive(Node *node, Node **r_node_pop)
+{
+	if (node == NULL) {
+		return NULL;
+	}
+	if (node->left == NULL) {
+		rb_node_invalidate(node);
+		*r_node_pop = node;
+		return NULL;
+	}
+	if ((!is_red(node->left)) && (!is_red(node->left->left))) {
+		node = rb_move_red_to_left(node);
+	}
+	node->left = rb_pop_min_recursive(node->left, r_node_pop);
+	return rb_fixup_remove(node);
+}
+
+static Node *rb_remove_recursive(Node *node, const Node *node_to_remove)
+{
+	if (node == NULL) {
+		return NULL;
+	}
+	if (key_cmp(KEY(node_to_remove), KEY(node)) == -1) {
+		if (node->left != NULL) {
+			if ((!is_red(node->left)) && (!is_red(node->left->left))) {
+				node = rb_move_red_to_left(node);
+			}
+		}
+		node->left = rb_remove_recursive(node->left, node_to_remove);
+	}
+	else {
+		if (is_red(node->left)) {
+			node = rb_rotate_right(node);
+		}
+		if ((node == node_to_remove) && (node->right == NULL)) {
+			rb_node_invalidate(node);
+			return NULL;
+		}
+		assert(node->right != NULL);
+		if ((!is_red(node->right)) && (!is_red(node->right->left))) {
+			node = rb_move_red_to_right(node);
+		}
+
+		if (node == node_to_remove) {
+			/* minor improvement over original method:
+			 * no need to double lookup min */
+			Node *node_free;  /* will always be set */
+			node->right = rb_pop_min_recursive(node->right, &node_free);
+
+			node_free->left = node->left;
+			node_free->right = node->right;
+			node_free->color = node->color;
+
+			rb_node_invalidate(node);
+			node = node_free;
+		}
+		else {
+			node->right = rb_remove_recursive(node->right, node_to_remove);
+		}
+	}
+	return rb_fixup_remove(node);
+}
+
+static Node *rb_btree_remove(Node *root, const Node *node_to_remove)
+{
+	root = rb_remove_recursive(root, node_to_remove);
+	if (root != NULL) {
+		root->color = BLACK;
+	}
+	return root;
+}
+
+/*
+ * Returns the node closest to and including 'key',
+ * excluding anything below.
+ */
+static Node *rb_get_or_upper_recursive(Node *n, const uint key)
+{
+	if (n == NULL) {
+		return NULL;
+	}
+	const int cmp_upper = key_cmp(KEY(n), key);
+	if (cmp_upper == 0) {
+		return n;  // exact match
+	}
+	else if (cmp_upper == 1) {
+		assert(KEY(n) >= key);
+		Node *n_test = rb_get_or_upper_recursive(n->left, key);
+		return n_test ? n_test : n;
+	}
+	else {  // cmp_upper == -1
+		return rb_get_or_upper_recursive(n->right, key);
+	}
+}
+
+/*
+ * Returns the node closest to and including 'key',
+ * excluding anything above.
+ */
+static Node *rb_get_or_lower_recursive(Node *n, const uint key)
+{
+	if (n == NULL) {
+		return NULL;
+	}
+	const int cmp_lower = key_cmp(KEY(n), key);
+	if (cmp_lower == 0) {
+		return n;  // exact match
+	}
+	else if (cmp_lower == -1) {
+		assert(KEY(n) <= key);
+		Node *n_test = rb_get_or_lower_recursive(n->right, key);
+		return n_test ? n_test : n;
+	}
+	else {  // cmp_lower == 1
+		return rb_get_or_lower_recursive(n->left, key);
+	}
+}
+
+#ifdef DEBUG
+
+static bool rb_is_balanced_recursive(const Node *node, int black)
+{
+	// Does every path from the root to a leaf have the given number
+	// of black links?
+	if (node == NULL) {
+		return black == 0;
+	}
+	if (!is_red(node)) {
+		black--;
+	}
+	return rb_is_balanced_recursive(node->left, black) &&
+	       rb_is_balanced_recursive(node->right, black);
+}
+
+static bool rb_is_balanced_root(const Node *root)
+{
+	// Do all paths from root to leaf have same number of black edges?
+	int black = 0;     // number of black links on path from root to min
+	const Node *node = root;
+	while (node != NULL) {
+		if (!is_red(node)) {
+			black++;
+		}
+		node = node->left;
+	}
+	return rb_is_balanced_recursive(root, black);
+}
+
+#endif  // DEBUG
+
+
+/* End BTree API */
+#endif  // USE_BTREE
+
+
+/* ------------------------------------------------------------------------- */
+/* Internal RangeTreeUInt API */
+
+#ifdef _WIN32
+#define inline __inline
+#endif
+
+static inline Node *rt_node_alloc(RangeTreeUInt *rt)
+{
+#ifdef USE_TPOOL
+	return rt_node_pool_elem_alloc(&rt->epool);
+#else
+	(void)rt;
+	return malloc(sizeof(Node));
+#endif
+}
+
+static Node *rt_node_new(RangeTreeUInt *rt, uint min, uint max)
+{
+	Node *node = rt_node_alloc(rt);
+
+	assert(min <= max);
+	node->prev = NULL;
+	node->next = NULL;
+	node->min = min;
+	node->max = max;
+#ifdef USE_BTREE
+	node->left = NULL;
+	node->right = NULL;
+#endif
+	return node;
+}
+
+static void rt_node_free(RangeTreeUInt *rt, Node *node)
+{
+#ifdef USE_TPOOL
+	rt_node_pool_elem_free(&rt->epool, node);
+#else
+	(void)rt;
+	free(node);
+#endif
+}
+
+#ifdef USE_BTREE
+static void rt_btree_insert(RangeTreeUInt *rt, Node *node)
+{
+	node->color = RED;
+	node->left = NULL;
+	node->right = NULL;
+	rt->root = rb_insert_root(rt->root, node);
+}
+#endif
+
+static void rt_node_add_back(RangeTreeUInt *rt, Node *node)
+{
+	list_push_back(&rt->list, node);
+#ifdef USE_BTREE
+	rt_btree_insert(rt, node);
+#endif
+}
+static void rt_node_add_front(RangeTreeUInt *rt, Node *node)
+{
+	list_push_front(&rt->list, node);
+#ifdef USE_BTREE
+	rt_btree_insert(rt, node);
+#endif
+}
+static void rt_node_add_before(RangeTreeUInt *rt, Node *node_next, Node *node)
+{
+	list_push_before(&rt->list, node_next, node);
+#ifdef USE_BTREE
+	rt_btree_insert(rt, node);
+#endif
+}
+static void rt_node_add_after(RangeTreeUInt *rt, Node *node_prev, Node *node)
+{
+	list_push_after(&rt->list, node_prev, node);
+#ifdef USE_BTREE
+	rt_btree_insert(rt, node);
+#endif
+}
+
+static void rt_node_remove(RangeTreeUInt *rt, Node *node)
+{
+	list_remove(&rt->list, node);
+#ifdef USE_BTREE
+	rt->root = rb_btree_remove(rt->root, node);
+#endif
+	rt_node_free(rt, node);
+}
+
+static Node *rt_find_node_from_value(RangeTreeUInt *rt, const uint value)
+{
+#ifdef USE_BTREE
+	Node *node = rb_get_or_lower_recursive(rt->root, value);
+	if (node != NULL) {
+		if ((value >= node->min) && (value <= node->max)) {
+			return node;
+		}
+	}
+	return NULL;
+#else
+	for (Node *node = rt->list.first; node; node = node->next) {
+		if ((value >= node->min) && (value <= node->max)) {
+			return node;
+		}
+	}
+	return NULL;
+#endif // USE_BTREE
+}
+
+static void rt_find_node_pair_around_value(RangeTreeUInt *rt, const uint value,
+                                           Node **r_node_prev, Node **r_node_next)
+{
+	if (value < rt->list.first->min) {
+		*r_node_prev = NULL;
+		*r_node_next = rt->list.first;
+		return;
+	}
+	else if (value > rt->list.last->max) {
+		*r_node_prev = rt->list.last;
+		*r_node_next = NULL;
+		return;
+	}
+	else {
+#ifdef USE_BTREE
+		Node *node_next = rb_get_or_upper_recursive(rt->root, value);
+		if (node_next != NULL) {
+			Node *node_prev = node_next->prev;
+			if ((node_prev->max < value) && (value < node_next->min)) {
+				*r_node_prev = node_prev;
+				*r_node_next = node_next;
+				return;
+			}
+		}
+#else
+		Node *node_prev = rt->list.first;
+		Node *node_next;
+		while ((node_next = node_prev->next)) {
+			if ((node_prev->max < value) && (value < node_next->min)) {
+				*r_node_prev = node_prev;
+				*r_node_next = node_next;
+				return;
+			}
+			node_prev = node_next;
+		}
+#endif // USE_BTREE
+	}
+	*r_node_prev = NULL;
+	*r_node_next = NULL;
+}
+
+
+/* ------------------------------------------------------------------------- */
+/* Public API */
+
+static RangeTreeUInt *rt_create_empty(uint min, uint max)
+{
+	RangeTreeUInt *rt = malloc(sizeof(*rt));
+	rt->range[0] = min;
+	rt->range[1] = max;
+
+	list_clear(&rt->list);
+
+#ifdef USE_BTREE
+	rt->root = NULL;
+#endif
+#ifdef USE_TPOOL
+	rt_node_pool_create(&rt->epool, 512);
+#endif
+
+	return rt;
+}
+
+RangeTreeUInt *range_tree_uint_alloc(uint min, uint max)
+{
+	RangeTreeUInt *rt = rt_create_empty(min, max);
+
+	Node *node = rt_node_new(rt, min, max);
+	rt_node_add_front(rt, node);
+	return rt;
+}
+
+void range_tree_uint_free(RangeTreeUInt *rt)
+{
+#ifdef DEBUG
+#ifdef USE_BTREE
+	assert(rb_is_balanced_root(rt->root));
+#endif
+#endif
+
+#ifdef USE_TPOOL
+
+	rt_node_pool_destroy(&rt->epool);
+#else
+	for (Node *node = rt->list.first, *node_next; node; node = node_next) {
+		node_next = node->next;
+		rt_node_free(rt, node);
+	}
+#endif
+
+	free(rt);
+}
+
+#ifdef USE_BTREE
+static Node *rt_copy_recursive(RangeTreeUInt *rt_dst, const Node *node_src)
+{
+	if (node_src == NULL) {
+		return NULL;
+	}
+
+	Node *node_dst = rt_node_alloc(rt_dst);
+
+	*node_dst = *node_src;
+	node_dst->left = rt_copy_recursive(rt_dst, node_dst->left);
+	list_push_back(&rt_dst->list, node_dst);
+	node_dst->right = rt_copy_recursive(rt_dst, node_dst->right);
+
+	return node_dst;
+}
+#endif  // USE_BTREE
+
+RangeTreeUInt *range_tree_uint_copy(const RangeTreeUInt *rt_src)
+{
+	RangeTreeUInt *rt_dst = rt_create_empty(rt_src->range[0], rt_src->range[1]);
+#ifdef USE_BTREE
+	rt_dst->root = rt_copy_recursive(rt_dst, rt_src->root);
+#else
+	for (Node *node_src = rt_src->list.first; node_src; node_src = node_src->next) {
+		Node *node_dst = rt_node_alloc(rt_dst);
+		*node_dst = *node_src;
+		list_push_back(&rt_dst->list, node_dst);
+	}
+#endif
+	return rt_dst;
+}
+
+/**
+ * Return true if the tree has the value (not taken).
+ */
+bool range_tree_uint_has(RangeTreeUInt *rt, const uint value)
+{
+	assert(value >= rt->range[0] && value <= rt->range[1]);
+	Node *node = rt_find_node_from_value(rt, value);
+	return (node != NULL);
+}
+
+static void range_tree_uint_take_impl(RangeTreeUInt *rt, const uint value, Node *node)
+{
+	assert(node == rt_find_node_from_value(rt, value));
+	if (node->min == value) {
+		if (node->max != value) {
+			node->min += 1;
+		}
+		else {
+			assert(node->min == node->max);
+			rt_node_remove(rt, node);
+		}
+	}
+	else if (node->max == value) {
+		node->max -= 1;
+	}
+	else {
+		Node *node_next = rt_node_new(rt, value + 1, node->max);
+		node->max = value - 1;
+		rt_node_add_after(rt, node, node_next);
+	}
+}
+
+void range_tree_uint_take(RangeTreeUInt *rt, const uint value)
+{
+	Node *node = rt_find_node_from_value(rt, value);
+	assert(node != NULL);
+	range_tree_uint_take_impl(rt, value, node);
+}
+
+bool range_tree_uint_retake(RangeTreeUInt *rt, const uint value)
+{
+	Node *node = rt_find_node_from_value(rt, value);
+	if (node != NULL) {
+		range_tree_uint_take_impl(rt, value, node);
+		return true;
+	}
+	else {
+		return false;
+	}
+}
+
+uint range_tree_uint_take_any(RangeTreeUInt *rt)
+{
+	Node *node = node = rt->list.first;
+	uint value = node->min;
+	if (value == node->max) {
+		rt_node_remove(rt, node);
+	}
+	else {
+		node->min += 1;
+	}
+	return value;
+}
+
+void range_tree_uint_release(RangeTreeUInt *rt, const uint value)
+{
+	bool touch_prev, touch_next;
+	Node *node_prev, *node_next;
+
+	if (rt->list.first != NULL) {
+		rt_find_node_pair_around_value(rt, value, &node_prev, &node_next);
+		/* the value must have been already taken */
+		assert(node_prev || node_next);
+
+		/* Cases:
+		 * 1) fill the gap between prev & next (two spans into one span).
+		 * 2) touching prev, (grow node_prev->max up one).
+		 * 3) touching next, (grow node_next->min down one).
+		 * 4) touching neither, add a new segment. */
+		touch_prev = (node_prev != NULL && node_prev->max + 1 == value);
+		touch_next = (node_next != NULL && node_next->min - 1 == value);
+	}
+	else {
+		// we could handle this case (4) inline,
+		// since its not a common case - use regular logic.
+		node_prev = node_next = NULL;
+		touch_prev = false;
+		touch_next = false;
+	}
+
+	if (touch_prev && touch_next) {  // 1)
+		node_prev->max = node_next->max;
+		rt_node_remove(rt, node_next);
+	}
+	else if (touch_prev) {  // 2)
+		assert(node_prev->max + 1 == value);
+		node_prev->max = value;
+	}
+	else if (touch_next) {  // 3)
+		assert(node_next->min - 1 == value);
+		node_next->min = value;
+	}
+	else {  // 4)
+		Node *node_new = rt_node_new(rt, value, value);
+		if (node_prev != NULL) {
+			rt_node_add_after(rt, node_prev, node_new);
+		}
+		else if (node_next != NULL) {
+			rt_node_add_before(rt, node_next, node_new);
+		}
+		else {
+			assert(rt->list.first == NULL);
+			rt_node_add_back(rt, node_new);
+		}
+	}
+}
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Campbell Barton.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "Apache License")
+ * with the following modification; you may not use this file except in
+ * compliance with the Apache License and the following modification to it:
+ * Section 6. Trademarks. is deleted and replaced with:
+ *
+ * 6. Trademarks. This License does not grant permission to use the trade
+ *   names, trademarks, service marks, or product names of the Licensor
+ *   and its affiliates, except as required to comply with Section 4(c) of
+ *   the License and to reproduce the content of the NOTICE file.
+ *
+ * You may obtain a copy of the Apache License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the Apache License with the above modification is
+ * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the Apache License for the specific
+ * language governing permissions and limitations under the Apache License.
+ */
+
+#ifndef __RANGE_TREE_H__
+#define __RANGE_TREE_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct RangeTreeUInt RangeTreeUInt;
+
+struct RangeTreeUInt *range_tree_uint_alloc(unsigned int min, unsigned int max);
+void                  range_tree_uint_free(struct RangeTreeUInt *rt);
+struct RangeTreeUInt *range_tree_uint_copy(const struct RangeTreeUInt *rt_src);
+
+bool         range_tree_uint_has(struct RangeTreeUInt *rt, const unsigned int value);
+void         range_tree_uint_take(struct RangeTreeUInt *rt, const unsigned int value);
+bool         range_tree_uint_retake(struct RangeTreeUInt *rt, const unsigned int value);
+unsigned int range_tree_uint_take_any(struct RangeTreeUInt *rt);
+void         range_tree_uint_release(struct RangeTreeUInt *rt, const unsigned int value);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __RANGE_TREE_H__ */
@@ -1,251 +0,0 @@
-/* This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
-*/
-
-#include <cassert>
-#include <climits>
-#include <iostream>
-#include <set>
-
-#ifndef RANGE_TREE_DEBUG_PRINT_FUNCTION
-#    define RANGE_TREE_DEBUG_PRINT_FUNCTION 0
-#endif
-
-template <typename T>
-struct RangeTree {
-	struct Range {
-		Range(T min_, T max_)
-			: min(min_), max(max_), single(min_ == max_) {
-			assert(min_ <= max_);
-		}
-
-		Range(T t)
-			: min(t), max(t), single(true)
-		{}
-
-		Range& operator=(const Range& v) {
-			*this = v;
-			return *this;
-		}
-
-		bool operator<(const Range& v) const {
-			return max < v.min;
-		}
-
-		const T min;
-		const T max;
-		const bool single;
-	};
-
-	typedef std::set<Range> Tree;
-	typedef typename Tree::iterator TreeIter;
-	typedef typename Tree::reverse_iterator TreeIterReverse;
-	typedef typename Tree::const_iterator TreeIterConst;
-
-	/* Initialize with a single range from 'min' to 'max', inclusive. */
-	RangeTree(T min, T max) {
-		tree.insert(Range(min, max));
-	}
-
-	/* Initialize with a single range from 0 to 'max', inclusive. */
-	RangeTree(T max) {
-		tree.insert(Range(0, max));
-	}
-
-	RangeTree(const RangeTree<T>& src) {
-		tree = src.tree;
-	}
-
-	/* Remove 't' from the associated range in the tree. Precondition:
-	   a range including 't' must exist in the tree. */
-	void take(T t) {
-		#if RANGE_TREE_DEBUG_PRINT_FUNCTION
-		std::cout << __func__ << "(" << t << ")\n";
-		#endif
-
-		/* Find the range that includes 't' and its neighbors */
-		TreeIter iter = tree.find(Range(t));
-		assert(iter != tree.end());
-		Range cur = *iter;
-
-		/* Remove the original range (note that this does not
-		   invalidate the prev/next iterators) */
-		tree.erase(iter);
-
-		/* Construct two new ranges that together cover the original
-		   range, except for 't' */
-		if (t > cur.min)
-			tree.insert(Range(cur.min, t - 1));
-		if (t + 1 <= cur.max)
-			tree.insert(Range(t + 1, cur.max));
-	}
-
-	/* clone of 'take' that checks if the item exists */
-	bool retake(T t) {
-		#if RANGE_TREE_DEBUG_PRINT_FUNCTION
-		std::cout << __func__ << "(" << t << ")\n";
-		#endif
-
-		TreeIter iter = tree.find(Range(t));
-		if (iter == tree.end()) {
-			return false;
-		}
-
-		Range cur = *iter;
-		tree.erase(iter);
-		if (t > cur.min)
-			tree.insert(Range(cur.min, t - 1));
-		if (t + 1 <= cur.max)
-			tree.insert(Range(t + 1, cur.max));
-
-		return true;
-	}
-
-
-	/* Take the first element out of the first range in the
-	   tree. Precondition: tree must not be empty. */
-	T take_any() {
-		#if RANGE_TREE_DEBUG_PRINT_FUNCTION
-		std::cout << __func__ << "()\n";
-		#endif
-
-		/* Find the first element */
-		TreeIter iter = tree.begin();
-		assert(iter != tree.end());
-		T first = iter->min;
-
-		/* Take the first element */
-		take(first);
-		return first;
-	}
-
-	/* Return 't' to the tree, either expanding/merging existing
-	   ranges or adding a range to cover it. Precondition: 't' cannot
-	   be in an existing range. */
-	void release(T t) {
-		#if RANGE_TREE_DEBUG_PRINT_FUNCTION
-		std::cout << __func__ << "(" << t << ")\n";
-		#endif
-
-		/* TODO: these cases should be simplified/unified */
-
-		TreeIter right = tree.upper_bound(t);
-		if (right != tree.end()) {
-			TreeIter left = right;
-			if (left != tree.begin())
-				--left;
-
-			if (left == right) {
-				/* 't' lies before any existing ranges */
-				if (t + 1 == left->min) {
-					/* 't' lies directly before the first range,
-					   resize and replace that range */
-					const Range r(t, left->max);
-					tree.erase(left);
-					tree.insert(r);
-				}
-				else {
-					/* There's a gap between 't' and the first range,
-					   add a new range */
-					tree.insert(Range(t));
-				}
-			}
-			else if ((left->max + 1 == t) &&
-				(t + 1 == right->min)) {
-				/* 't' fills a hole. Remove left and right, and insert a
-				   new range that covers both. */
-				const Range r(left->min, right->max);
-				tree.erase(left);
-				tree.erase(right);
-				tree.insert(r);
-			}
-			else if (left->max + 1 == t) {
-				/* 't' lies directly after 'left' range, resize and
-				   replace that range */
-				const Range r(left->min, t);
-				tree.erase(left);
-				tree.insert(r);
-			}
-			else if (t + 1 == right->min) {
-				/* 't' lies directly before 'right' range, resize and
-				   replace that range */
-				const Range r(t, right->max);
-				tree.erase(right);
-				tree.insert(r);
-			}
-			else {
-				/* There's a gap between 't' and both adjacent ranges,
-				   add a new range */
-				tree.insert(Range(t));
-			}
-		}
-		else {
-			/* 't' lies after any existing ranges */
-			right = tree.end();
-			right--;
-			if (right->max + 1 == t) {
-				/* 't' lies directly after last range, resize and
-				   replace that range */
-				const Range r(right->min, t);
-				tree.erase(right);
-				tree.insert(r);
-			}
-			else {
-				/* There's a gap between the last range and 't', add a
-				   new range */
-				tree.insert(Range(t));
-			}
-		}
-	}
-
-	bool has(T t) const {
-		TreeIterConst iter = tree.find(Range(t));
-		return (iter != tree.end()) && (t <= iter->max);
-	}
-
-	bool has_range(T min, T max) const {
-		TreeIterConst iter = tree.find(Range(min, max));
-		return (iter != tree.end()) && (min == iter->min && max == iter->max);
-	}
-
-	bool empty() const {
-		return tree.empty();
-	}
-
-	int size() const {
-		return tree.size();
-	}
-
-	void print() const {
-		std::cout << "RangeTree:\n";
-		for (TreeIterConst iter = tree.begin(); iter != tree.end(); ++iter) {
-			const Range& r = *iter;
-			if (r.single)
-				std::cout << "  [" << r.min << "]\n";
-			else
-				std::cout << "  [" << r.min << ", " << r.max << "]\n";
-		}
-		if (empty())
-			std::cout << "  <empty>";
-		std::cout << "\n";
-	}
-
-	unsigned int allocation_lower_bound() const {
-		return tree.size() * sizeof(Range);
-	}
-
-private:
-	Tree tree;
-};
@@ -1,92 +0,0 @@
-/* This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
-*/
-
-#include "range_tree.hh"
-
-/* Give RangeTreeUInt a real type rather than the opaque struct type
-   defined for external use. */
-#define RANGE_TREE_C_API_INTERNAL
-typedef RangeTree<unsigned> RangeTreeUInt;
-
-#include "range_tree_c_api.h"
-
-RangeTreeUInt *range_tree_uint_alloc(unsigned min, unsigned max)
-{
-	return new RangeTreeUInt(min, max);
-}
-
-RangeTreeUInt *range_tree_uint_copy(RangeTreeUInt *src)
-{
-	return new RangeTreeUInt(*src);
-}
-
-void range_tree_uint_free(RangeTreeUInt *rt)
-{
-	delete rt;
-}
-
-void range_tree_uint_take(RangeTreeUInt *rt, unsigned v)
-{
-	rt->take(v);
-}
-
-bool range_tree_uint_retake(RangeTreeUInt *rt, unsigned v)
-{
-	return rt->retake(v);
-}
-
-unsigned range_tree_uint_take_any(RangeTreeUInt *rt)
-{
-	return rt->take_any();
-}
-
-void range_tree_uint_release(RangeTreeUInt *rt, unsigned v)
-{
-	rt->release(v);
-}
-
-bool range_tree_uint_has(const RangeTreeUInt *rt, unsigned v)
-{
-	return rt->has(v);
-}
-
-bool range_tree_uint_has_range(
-        const RangeTreeUInt *rt,
-        unsigned vmin,
-        unsigned vmax)
-{
-	return rt->has_range(vmin, vmax);
-}
-
-bool range_tree_uint_empty(const RangeTreeUInt *rt)
-{
-	return rt->empty();
-}
-
-unsigned range_tree_uint_size(const RangeTreeUInt *rt)
-{
-	return rt->size();
-}
-
-void range_tree_uint_print(const RangeTreeUInt *rt)
-{
-	rt->print();
-}
-
-unsigned int range_tree_uint_allocation_lower_bound(const RangeTreeUInt *rt)
-{
-	return rt->allocation_lower_bound();
-}
@@ -1,62 +0,0 @@
-/* This program is free software; you can redistribute it and/or
-   modify it under the terms of the GNU General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-   02110-1301, USA.
-*/
-
-#ifndef __RANGE_TREE_C_API_H__
-#define __RANGE_TREE_C_API_H__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Simple C-accessible wrapper for RangeTree<unsigned> */
-
-#ifndef RANGE_TREE_C_API_INTERNAL
-typedef struct RangeTreeUInt RangeTreeUInt;
-#endif
-
-RangeTreeUInt *range_tree_uint_alloc(unsigned min, unsigned max);
-
-RangeTreeUInt *range_tree_uint_copy(RangeTreeUInt *src);
-
-void range_tree_uint_free(RangeTreeUInt *rt);
-
-void range_tree_uint_take(RangeTreeUInt *rt, unsigned v);
-
-bool range_tree_uint_retake(RangeTreeUInt *rt, unsigned v);
-
-unsigned range_tree_uint_take_any(RangeTreeUInt *rt);
-
-void range_tree_uint_release(RangeTreeUInt *rt, unsigned v);
-
-bool range_tree_uint_has(const RangeTreeUInt *rt, unsigned v);
-
-bool range_tree_uint_has_range(
-        const RangeTreeUInt *rt,
-        unsigned vmin, unsigned vmax);
-
-bool range_tree_uint_empty(const RangeTreeUInt *rt);
-
-unsigned range_tree_uint_size(const RangeTreeUInt *rt);
-
-void range_tree_uint_print(const RangeTreeUInt *rt);
-
-unsigned int range_tree_uint_allocation_lower_bound(const RangeTreeUInt *rt);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* __RANGE_TREE_C_API_H__ */
@@ -2698,7 +2698,7 @@ Device_set_doppler_factor(Device *self, PyObject *args, void* nothing)

 PyDoc_STRVAR(M_aud_Device_distance_model_doc,
 			 "The distance model of the device.\n\n"
-			 ".. seealso:: http://connect.creativelabs.com/openal/Documentation/OpenAL%201.1%20Specification.htm#_Toc199835864");
+			 ".. seealso:: `OpenAL documentation <https://www.openal.org/documentation>`");

 static PyObject *
 Device_get_distance_model(Device *self, void* nothing)
@@ -266,6 +266,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                description="Sample all lights (for indirect samples), rather than randomly picking one",
                default=True,
                )
+        cls.light_sampling_threshold = FloatProperty(
+                name="Light Sampling Threshold",
+                description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
+                            "Zero disables the test and never ignores lights",
+                min=0.0, max=1.0,
+                default=0.05,
+                )

        cls.caustics_reflective = BoolProperty(
                name="Reflective Caustics",
@@ -166,6 +166,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):

        sub.prop(cscene, "sample_clamp_direct")
        sub.prop(cscene, "sample_clamp_indirect")
+        sub.prop(cscene, "light_sampling_threshold")

        if cscene.progressive == 'PATH' or use_branched_path(context) is False:
            col = split.column()
@@ -153,6 +153,7 @@ void BlenderSync::sync_light(BL::Object& b_parent,
 	/* location and (inverted!) direction */
 	light->co = transform_get_column(&tfm, 3);
 	light->dir = -transform_get_column(&tfm, 2);
+	light->tfm = tfm;

 	/* shader */
 	vector<Shader*> used_shaders;
@@ -276,6 +276,7 @@ void BlenderSync::sync_integrator()

 	integrator->sample_all_lights_direct = get_boolean(cscene, "sample_all_lights_direct");
 	integrator->sample_all_lights_indirect = get_boolean(cscene, "sample_all_lights_indirect");
+	integrator->light_sampling_threshold = get_float(cscene, "light_sampling_threshold");

 	int diffuse_samples = get_int(cscene, "diffuse_samples");
 	int glossy_samples = get_int(cscene, "glossy_samples");
@@ -177,6 +177,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
 	../util/util_half.h
+	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
 	../util/util_static_assert.h
@@ -21,6 +21,36 @@ struct QBVHStackItem {
 	float dist;
 };

+ccl_device_inline void qbvh_near_far_idx_calc(const float3& idir,
+                                              int *ccl_restrict near_x,
+                                              int *ccl_restrict near_y,
+                                              int *ccl_restrict near_z,
+                                              int *ccl_restrict far_x,
+                                              int *ccl_restrict far_y,
+                                              int *ccl_restrict far_z)
+
+{
+#ifdef __KERNEL_SSE__
+	*near_x = 0; *far_x = 1;
+	*near_y = 2; *far_y = 3;
+	*near_z = 4; *far_z = 5;
+
+	const size_t mask = movemask(ssef(idir.m128));
+
+	const int mask_x = mask & 1;
+	const int mask_y = (mask & 2) >> 1;
+	const int mask_z = (mask & 4) >> 2;
+
+	*near_x += mask_x; *far_x -= mask_x;
+	*near_y += mask_y; *far_y -= mask_y;
+	*near_z += mask_z; *far_z -= mask_z;
+#else
+	if(idir.x >= 0.0f) { *near_x = 0; *far_x = 1; } else { *near_x = 1; *far_x = 0; }
+	if(idir.y >= 0.0f) { *near_y = 2; *far_y = 3; } else { *near_y = 3; *far_y = 2; }
+	if(idir.z >= 0.0f) { *near_z = 4; *far_z = 5; } else { *near_z = 5; *far_z = 4; }
+#endif
+}
+
 /* TOOD(sergey): Investigate if using intrinsics helps for both
 * stack item swap and float comparison.
 */
@@ -92,10 +92,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);

 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -392,9 +391,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;

-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					qbvh_near_far_idx_calc(idir,
+					                       &near_x, &near_y, &near_z,
+					                       &far_x, &far_y, &far_z);
 					tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -450,9 +449,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			isect_t = tmax;
 			isect_array->t = isect_t;

-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -101,10 +101,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);

 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -102,10 +102,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);

 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -427,9 +426,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					qbvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t, &node_dist);
 #  endif

-					if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-					if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-					if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+					qbvh_near_far_idx_calc(idir,
+					                       &near_x, &near_y, &near_z,
+					                       &far_x, &far_y, &far_z);
 					tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 					dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -469,9 +468,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -87,10 +87,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);

 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -303,9 +302,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						qbvh_near_far_idx_calc(idir,
+						                       &near_x, &near_y, &near_z,
+						                       &far_x, &far_y, &far_z);
 						tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 						dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -349,9 +348,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
 #  endif

-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -91,10 +91,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	/* Offsets to select the side that becomes the lower or upper bound. */
 	int near_x, near_y, near_z;
 	int far_x, far_y, far_z;
-
-	if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-	if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-	if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+	qbvh_near_far_idx_calc(idir,
+	                       &near_x, &near_y, &near_z,
+	                       &far_x, &far_y, &far_z);

 	IsectPrecalc isect_precalc;
 	triangle_intersect_precalc(dir, &isect_precalc);
@@ -354,9 +353,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
 #  endif

-						if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-						if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-						if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+						qbvh_near_far_idx_calc(idir,
+						                       &near_x, &near_y, &near_z,
+						                       &far_x, &far_y, &far_z);
 						tfar = ssef(isect_t);
 						idir4 = sse3f(ssef(idir.x), ssef(idir.y), ssef(idir.z));
 #  if BVH_FEATURE(BVH_HAIR)
@@ -420,9 +419,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			isect_t = tmax;
 			isect_array->t = isect_t;

-			if(idir.x >= 0.0f) { near_x = 0; far_x = 1; } else { near_x = 1; far_x = 0; }
-			if(idir.y >= 0.0f) { near_y = 2; far_y = 3; } else { near_y = 3; far_y = 2; }
-			if(idir.z >= 0.0f) { near_z = 4; far_z = 5; } else { near_z = 5; far_z = 4; }
+			qbvh_near_far_idx_calc(idir,
+			                       &near_x, &near_y, &near_z,
+			                       &far_x, &far_y, &far_z);
 			tfar = ssef(isect_t);
 #  if BVH_FEATURE(BVH_HAIR)
 			dir4 = sse3f(ssef(dir.x), ssef(dir.y), ssef(dir.z));
@@ -55,6 +55,21 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg, int object
 	return tfm;
 }

+/* Lamp to world space transformation */
+
+ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+{
+	int offset = lamp*LIGHT_SIZE + (inverse? 8 : 5);
+
+	Transform tfm;
+	tfm.x = kernel_tex_fetch(__light_data, offset + 0);
+	tfm.y = kernel_tex_fetch(__light_data, offset + 1);
+	tfm.z = kernel_tex_fetch(__light_data, offset + 2);
+	tfm.w = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
+
+	return tfm;
+}
+
 /* Object to world space transformation for motion vectors */

 ccl_device_inline Transform object_fetch_vector_transform(KernelGlobals *kg, int object, enum ObjectVectorTransform type)
@@ -376,15 +391,33 @@ ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
 ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 {
 	/* clamp absolute values by exp2f(-80.0f) to avoid division by zero when calculating inverse direction */
-	float ooeps = 8.271806E-25f;
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE2__)
+	const ssef oopes(8.271806E-25f,8.271806E-25f,8.271806E-25f,0.0f);
+	const ssef mask = _mm_cmpgt_ps(fabs(dir), oopes);
+	const ssef signdir = signmsk(dir.m128) | oopes;
+#  ifndef __KERNEL_AVX__
+	ssef res = mask & ssef(dir);
+	res = _mm_or_ps(res,_mm_andnot_ps(mask, signdir));
+#  else
+	ssef res = _mm_blendv_ps(signdir, dir, mask);
+#  endif
+	return float3(res);
+#else  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
+	const float ooeps = 8.271806E-25f;
 	return make_float3((fabsf(dir.x) > ooeps)? dir.x: copysignf(ooeps, dir.x),
 	                   (fabsf(dir.y) > ooeps)? dir.y: copysignf(ooeps, dir.y),
 	                   (fabsf(dir.z) > ooeps)? dir.z: copysignf(ooeps, dir.z));
+#endif  /* __KERNEL_SSE__ && __KERNEL_SSE2__ */
 }

 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
+	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
+#if defined(__KERNEL_SSE__) && 0
+	return rcp(dir);
+#else
 	return 1.0f / dir;
+#endif
 }

 /* Transform ray into object space to enter static object in BVH */
@@ -59,21 +59,33 @@ void triangle_intersect_precalc(float3 dir,
                                IsectPrecalc *isect_precalc)
 {
 	/* Calculate dimension where the ray direction is maximal. */
+#ifndef __KERNEL_SSE__
 	int kz = util_max_axis(make_float3(fabsf(dir.x),
 	                                   fabsf(dir.y),
 	                                   fabsf(dir.z)));
 	int kx = kz + 1; if(kx == 3) kx = 0;
 	int ky = kx + 1; if(ky == 3) ky = 0;
+#else
+	int kx, ky, kz;
+	/* Avoiding mispredicted branch on direction. */
+	kz = util_max_axis(fabs(dir));
+	static const char inc_xaxis[] = {1, 2, 0, 55};
+	static const char inc_yaxis[] = {2, 0, 1, 55};
+	kx = inc_xaxis[kz];
+	ky = inc_yaxis[kz];
+#endif
+
+	float dir_kz = IDX(dir, kz);

 	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(IDX(dir, kz) < 0.0f) {
+	if(dir_kz < 0.0f) {
 		int tmp = kx;
 		kx = ky;
 		ky = tmp;
 	}

 	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / IDX(dir, kz);
+	float inv_dir_z = 1.0f / dir_kz;
 	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
 	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
 	isect_precalc->Sz = inv_dir_z;
@@ -108,7 +120,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);

-#if defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
 	const avxf avxf_P(P.m128, P.m128);

 	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
@@ -270,7 +282,7 @@ ccl_device_inline void triangle_intersect_subsurface(
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);

-#if defined(__KERNEL_AVX2__)
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
 	const avxf avxf_P(P.m128, P.m128);

 	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
@@ -96,7 +96,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 	}
 }

-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,6 +115,36 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float3 value)
 	}
 }

+ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
+{
+#ifdef __PASSES__
+	if(eval->use_light_pass) {
+		eval->diffuse *= value;
+		eval->glossy *= value;
+		eval->transmission *= value;
+		eval->subsurface *= value;
+		eval->scatter *= value;
+
+		/* skipping transparent, this function is used by for eval(), will be zero then */
+	}
+	else
+		eval->diffuse *= value;
+#else
+	eval->diffuse *= value;
+#endif
+}
+
+ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+{
+#ifdef __PASSES__
+	if(eval->use_light_pass) {
+		return eval->diffuse + eval->glossy + eval->transmission + eval->subsurface + eval->scatter;
+	}
+	else
+#endif
+	return eval->diffuse;
+}
+
 /* Path Radiance
 *
 * We accumulate different render passes separately. After summing at the end
@@ -193,8 +223,7 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
-			float3 sum = (bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->transparent +
-						  bsdf_eval->subsurface + bsdf_eval->scatter) * inverse_pdf;
+			float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
 			*throughput *= sum;
 		}
 	}
@@ -264,8 +293,7 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 		}
 		else {
 			/* indirectly visible lighting after BSDF bounce */
-			float3 sum = bsdf_eval->diffuse + bsdf_eval->glossy + bsdf_eval->transmission + bsdf_eval->subsurface + bsdf_eval->scatter;
-			L->indirect += throughput*sum*shadow;
+			L->indirect += throughput*bsdf_eval_sum(bsdf_eval)*shadow;
 		}
 	}
 	else
@@ -63,7 +63,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,

 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
 		}

 		/* sample emission */
@@ -320,7 +320,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	                         P, Ng, Ng,
 	                         shader, object, prim,
 	                         u, v, 1.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED));
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         LAMP_NONE);
 	sd.I = sd.N;

 	/* update differentials */
@@ -65,7 +65,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		shader_setup_from_sample(kg, emission_sd,
 		                         ls->P, ls->Ng, I,
 		                         ls->shader, ls->object, ls->prim,
-		                         ls->u, ls->v, t, time, false);
+		                         ls->u, ls->v, t, time, false, ls->lamp);

 		ls->Ng = ccl_fetch(emission_sd, Ng);

@@ -94,7 +94,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
                                         ccl_addr_space PathState *state,
                                         Ray *ray,
                                         BsdfEval *eval,
-                                         bool *is_lamp)
+                                         bool *is_lamp,
+                                         float rand_terminate)
 {
 	if(ls->pdf == 0.0f)
 		return false;
@@ -134,7 +135,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 #endif

-	bsdf_eval_mul(eval, light_eval/ls->pdf);
+	bsdf_eval_mul3(eval, light_eval/ls->pdf);

 #ifdef __PASSES__
 	/* use visibility flag to skip lights */
@@ -155,6 +156,16 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;

+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+		if(probability < 1.0f) {
+			if(rand_terminate >= probability) {
+				return false;
+			}
+			bsdf_eval_mul(eval, 1.0f / probability);
+		}
+	}
+
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
 		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
@@ -297,7 +297,7 @@ ccl_device_inline float background_portal_pdf(KernelGlobals *kg,
 		float3 axisu = make_float3(data1.y, data1.z, data1.w);
 		float3 axisv = make_float3(data2.y, data2.z, data2.w);

-		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL))
+		if(!ray_quad_intersect(P, direction, 1e-4f, FLT_MAX, lightpos, axisu, axisv, dir, NULL, NULL, NULL, NULL))
 			continue;

 		portal_pdf += area_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
@@ -585,6 +585,10 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 					return false;
 				}
 			}
+			float2 uv = map_to_sphere(ls->Ng);
+			ls->u = uv.x;
+			ls->v = uv.y;
+
 			ls->pdf *= lamp_light_pdf(kg, ls->Ng, -ls->D, ls->t);
 		}
 		else {
@@ -600,11 +604,16 @@ ccl_device_inline bool lamp_light_sample(KernelGlobals *kg,
 				return false;
 			}

+			float3 inplane = ls->P;
 			ls->pdf = area_light_sample(P, &ls->P,
 			                          axisu, axisv,
 			                          randu, randv,
 			                          true);

+			inplane = ls->P - inplane;
+			ls->u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu)) + 0.5f;
+			ls->v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv)) + 0.5f;
+
 			ls->Ng = D;
 			ls->D = normalize_len(ls->P - P, &ls->t);

@@ -706,6 +715,9 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 			if(ls->eval_fac == 0.0f)
 				return false;
 		}
+		float2 uv = map_to_sphere(ls->Ng);
+		ls->u = uv.x;
+		ls->v = uv.y;

 		/* compute pdf */
 		if(ls->t != FLT_MAX)
@@ -730,8 +742,10 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,

 		float3 light_P = make_float3(data0.y, data0.z, data0.w);

-		if(!ray_quad_intersect(P, D, 0.0f, t,
-		                       light_P, axisu, axisv, Ng, &ls->P, &ls->t))
+		if(!ray_quad_intersect(P, D, 0.0f, t, light_P,
+		                       axisu, axisv, Ng,
+		                       &ls->P, &ls->t,
+		                       &ls->u, &ls->v))
 		{
 			return false;
 		}
@@ -887,4 +901,3 @@ ccl_device int light_select_num_samples(KernelGlobals *kg, int index)
 }

 CCL_NAMESPACE_END
-
@@ -53,6 +53,47 @@

 CCL_NAMESPACE_BEGIN

+ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
+                                        ShaderData *sd,
+                                        ShaderData *emission_sd,
+                                        PathRadiance *L,
+                                        PathState *state,
+                                        RNG *rng,
+                                        float3 throughput,
+                                        float3 ao_alpha)
+{
+	/* todo: solve correlation */
+	float bsdf_u, bsdf_v;
+
+	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+	float ao_factor = kernel_data.background.ao_factor;
+	float3 ao_N;
+	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+	float3 ao_D;
+	float ao_pdf;
+
+	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		Ray light_ray;
+		float3 ao_shadow;
+
+		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+		light_ray.D = ao_D;
+		light_ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+		light_ray.time = ccl_fetch(sd, time);
+#endif
+		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.dD = differential3_zero();
+
+		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
+			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+		}
+	}
+}
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                     ShaderData *sd,
                                     ShaderData *emission_sd,
@@ -305,40 +346,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-			float ao_factor = kernel_data.background.ao_factor;
-			float3 ao_N;
-			float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			float3 ao_D;
-			float ao_pdf;
-			float3 ao_alpha = make_float3(0.0f, 0.0f, 0.0f);
-
-			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-			if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
-				Ray light_ray;
-				float3 ao_shadow;
-
-				light_ray.P = ray_offset(sd->P, sd->Ng);
-				light_ray.D = ao_D;
-				light_ray.t = kernel_data.background.ao_distance;
-#  ifdef __OBJECT_MOTION__
-				light_ray.time = sd->time;
-#  endif
-				light_ray.dP = sd->dP;
-				light_ray.dD = differential3_zero();
-
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
-					path_radiance_accum_ao(L,
-					                       throughput,
-					                       ao_alpha,
-					                       ao_bsdf,
-					                       ao_shadow,
-					                       state->bounce);
-				}
-			}
+			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
 #endif

@@ -394,46 +402,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 	}
 }

-ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
-                                        ShaderData *sd,
-                                        ShaderData *emission_sd,
-                                        PathRadiance *L,
-                                        PathState *state,
-                                        RNG *rng,
-                                        float3 throughput)
-{
-	/* todo: solve correlation */
-	float bsdf_u, bsdf_v;
-
-	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
-	float ao_factor = kernel_data.background.ao_factor;
-	float3 ao_N;
-	float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-	float3 ao_D;
-	float ao_pdf;
-	float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
-	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
-		Ray light_ray;
-		float3 ao_shadow;
-
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
-		light_ray.D = ao_D;
-		light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
-#endif
-		light_ray.dP = ccl_fetch(sd, dP);
-		light_ray.dD = differential3_zero();
-
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
-			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
-	}
-}
-
 #ifdef __SUBSURFACE__
 #  ifndef __KERNEL_CUDA__
 ccl_device
@@ -851,7 +819,6 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		}
 		else if(probability != 1.0f) {
 			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
-
 			if(terminate >= probability)
 				break;

@@ -861,7 +828,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
+			kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
 		}
 #endif

@@ -14,6 +14,8 @@
 * limitations under the License.
 */

+#include "util_hash.h"
+
 CCL_NAMESPACE_BEGIN

 ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
@@ -28,6 +30,10 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,

 	int num_samples = kernel_data.integrator.aa_samples;

+	if(sample == 0) {
+		*rng_state = hash_int_2d(x, y);
+	}
+
 	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);

 	/* sample camera ray */
@@ -49,6 +49,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 			for(int j = 0; j < num_samples; j++) {
 				float light_u, light_v;
 				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);

 				LightSample ls;
 				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
@@ -57,7 +58,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
 						ls.pdf *= 2.0f;

-					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;

@@ -79,6 +80,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
 				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);

 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
@@ -90,7 +92,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;

-					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;

@@ -108,11 +110,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
 		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+		float terminate = path_state_rng_light_termination(kg, rng, state);

 		LightSample ls;
 		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
 			/* sample random light */
-			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;

@@ -210,7 +213,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_

 	LightSample ls;
 	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
-		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+		float terminate = path_state_rng_light_termination(kg, rng, state);
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;

@@ -48,7 +48,8 @@ ccl_device_inline void kernel_path_volume_connect_light(

 	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
 	{
-		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+		float terminate = path_state_rng_light_termination(kg, rng, state);
+		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;

@@ -161,7 +162,8 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
 						ls.pdf *= 2.0f;

-					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;

@@ -209,7 +211,8 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;

-					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;

@@ -246,7 +249,8 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 		/* todo: split up light_sample so we don't have to call it again with new position */
 		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
-			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp)) {
+			float terminate = path_state_rng_light_termination(kg, rng, state);
+			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;

@@ -235,7 +235,7 @@ ccl_device_inline void spherical_stereo_transform(KernelGlobals *kg, float3 *P,
 	if(kernel_data.cam.pole_merge_angle_to > 0.0f) {
 		const float pole_merge_angle_from = kernel_data.cam.pole_merge_angle_from,
 		            pole_merge_angle_to = kernel_data.cam.pole_merge_angle_to;
-		float altitude = fabsf(safe_asinf(D->z));
+		float altitude = fabsf(safe_asinf((*D).z));
 		if(altitude > pole_merge_angle_to) {
 			interocular_offset = 0.0f;
 		}
@@ -300,6 +300,23 @@ ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RN
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }

+/* Utitility functions to get light termination value, since it might not be needed in many cases. */
+ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state)
+{
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+	}
+	return 0.0f;
+}
+
+ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+{
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
+	}
+	return 0.0f;
+}
+
 ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
 {
 	/* path is splitting into a branch, adjust so that each branch
@@ -242,7 +242,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                int shader, int object, int prim,
                                                float u, float v, float t,
                                                float time,
-                                                bool object_space)
+                                                bool object_space,
+                                                int lamp)
 {
 	/* vectors */
 	ccl_fetch(sd, P) = P;
@@ -250,7 +251,12 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 	ccl_fetch(sd, Ng) = Ng;
 	ccl_fetch(sd, I) = I;
 	ccl_fetch(sd, shader) = shader;
-	ccl_fetch(sd, type) = (prim == PRIM_NONE)? PRIMITIVE_NONE: PRIMITIVE_TRIANGLE;
+	if(prim != PRIM_NONE)
+		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+	else if(lamp != LAMP_NONE)
+		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+	else
+		ccl_fetch(sd, type) = PRIMITIVE_NONE;

 	/* primitive */
 #ifdef __INSTANCING__
@@ -270,11 +276,15 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,

 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
+#endif
+	}
+	else if(lamp != LAMP_NONE) {
+		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
+		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
 	}

+#ifdef __OBJECT_MOTION__
 	ccl_fetch(sd, time) = time;
-#else
-	}
 #endif

 	/* transform into world space */
@@ -357,7 +367,8 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 	                         P, Ng, I,
 	                         shader, object, prim,
 	                         u, v, 0.0f, 0.5f,
-	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED));
+	                         !(kernel_tex_fetch(__object_flag, object) & SD_TRANSFORM_APPLIED),
+	                         LAMP_NONE);
 }

 /* ShaderData setup from ray into background */
@@ -561,7 +572,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
 		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, make_float3(weight, weight, weight));
+			bsdf_eval_mul(eval, weight);
 		}
 	}
 }
@@ -37,7 +37,7 @@ CCL_NAMESPACE_BEGIN
 /* constants */
 #define OBJECT_SIZE 		12
 #define OBJECT_VECTOR_SIZE	6
-#define LIGHT_SIZE			5
+#define LIGHT_SIZE		11
 #define FILTER_TABLE_SIZE	1024
 #define RAMP_TABLE_SIZE		256
 #define SHUTTER_TABLE_SIZE		256
@@ -250,7 +250,7 @@ enum PathTraceDimension {
 	PRNG_LIGHT = 3,
 	PRNG_LIGHT_U = 4,
 	PRNG_LIGHT_V = 5,
-	PRNG_UNUSED_3 = 6,
+	PRNG_LIGHT_TERMINATE = 6,
 	PRNG_TERMINATE = 7,

 #ifdef __VOLUME__
@@ -552,6 +552,8 @@ typedef enum PrimitiveType {
 	PRIMITIVE_MOTION_TRIANGLE = 2,
 	PRIMITIVE_CURVE = 4,
 	PRIMITIVE_MOTION_CURVE = 8,
+	/* Lamp primitive is not included below on purpose, since it is no real traceable primitive */
+	PRIMITIVE_LAMP = 16,

 	PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE|PRIMITIVE_MOTION_TRIANGLE),
 	PRIMITIVE_ALL_CURVE = (PRIMITIVE_CURVE|PRIMITIVE_MOTION_CURVE),
@@ -1121,8 +1123,9 @@ typedef struct KernelIntegrator {
 	float volume_step_size;
 	int volume_samples;

+	float light_inv_rr_threshold;
+
 	int pad1;
-	int pad2;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);

@@ -42,6 +42,7 @@
 #    define __KERNEL_SSE41__
 #  endif
 #  ifdef __AVX__
+#    define __KERNEL_SSE__
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
@@ -20,6 +20,7 @@
 
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #  define __KERNEL_SSE3__
 #  define __KERNEL_SSSE3__
@@ -166,6 +166,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 			tfm = transform_transpose(tfm);
 			COPY_MATRIX44(&result, &tfm);

+			return true;
+		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_tfm);
+			COPY_MATRIX44(&result, &tfm);
+
 			return true;
 		}
 	}
@@ -196,6 +202,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 			itfm = transform_transpose(itfm);
 			COPY_MATRIX44(&result, &itfm);

+			return true;
+		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_itfm);
+			COPY_MATRIX44(&result, &tfm);
+
 			return true;
 		}
 	}
@@ -285,6 +297,12 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result
 			tfm = transform_transpose(tfm);
 			COPY_MATRIX44(&result, &tfm);

+			return true;
+		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_tfm);
+			COPY_MATRIX44(&result, &tfm);
+
 			return true;
 		}
 	}
@@ -310,6 +328,12 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44
 			tfm = transform_transpose(tfm);
 			COPY_MATRIX44(&result, &tfm);

+			return true;
+		}
+		else if(sd->type == PRIMITIVE_LAMP) {
+			Transform tfm = transform_transpose(sd->ob_itfm);
+			COPY_MATRIX44(&result, &tfm);
+
 			return true;
 		}
 	}
@@ -28,7 +28,7 @@ float brick_noise(int n) /* fast integer noise */
 	return 0.5 * ((float)nn / 1073741824.0);
 }

-float brick(point p, float mortar_size, float bias,
+float brick(point p, float mortar_size, float mortar_smooth, float bias,
 	float BrickWidth, float row_height, float offset_amount, int offset_frequency,
 	float squash_amount, int squash_frequency, float tint)
 {
@@ -51,9 +51,17 @@ float brick(point p, float mortar_size, float bias,

 	tint = clamp((brick_noise((rownum << 16) + (bricknum & 65535)) + bias), 0.0, 1.0);

-	return (x < mortar_size || y < mortar_size ||
-	        x > (brick_width - mortar_size) ||
-	        y > (row_height - mortar_size)) ? 1.0 : 0.0;
+	float min_dist = min(min(x, y), min(brick_width - x, row_height - y));
+	if(min_dist >= mortar_size) {
+		return 0.0;
+	}
+	else if(mortar_smooth == 0.0) {
+		return 1.0;
+	}
+	else {
+		min_dist = 1.0 - min_dist/mortar_size;
+		return smoothstep(0.0, mortar_smooth, min_dist);
+	}
 }

 shader node_brick_texture(
@@ -69,6 +77,7 @@ shader node_brick_texture(
 	color Mortar = 0.0,
 	float Scale = 5.0,
 	float MortarSize = 0.02,
+	float MortarSmooth = 0.0,
 	float Bias = 0.0,
 	float BrickWidth = 0.5,
 	float RowHeight = 0.25,
@@ -83,7 +92,7 @@ shader node_brick_texture(
 	float tint = 0.0;
 	color Col = Color1;
 	
-	Fac = brick(p * Scale, MortarSize, Bias, BrickWidth, RowHeight,
+	Fac = brick(p * Scale, MortarSize, MortarSmooth, Bias, BrickWidth, RowHeight,
 		offset, offset_frequency, squash, squash_frequency, tint);
 		
 	if (Fac != 1.0) {
@@ -91,6 +100,6 @@ shader node_brick_texture(
 		Col = facm * Color1 + tint * Color2;
 	}
 	
-	Color = (Fac == 1.0) ? Mortar : Col;
+	Color = mix(Col, Mortar, Fac);
 }

@@ -72,6 +72,7 @@ ccl_device char kernel_direct_lighting(
 			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 			float light_u, light_v;
 			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, rng, state);

 			LightSample ls;
 			if(light_sample(kg,
@@ -88,7 +89,7 @@ ccl_device char kernel_direct_lighting(

 				BsdfEval L_light;
 				bool is_lamp;
-				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp)) {
+				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 					/* Write intermediate data to global memory to access from
 					 * the next kernel.
 					 */
@@ -27,7 +27,7 @@ ccl_device_noinline float brick_noise(int n) /* fast integer noise */
 	return 0.5f * ((float)nn / 1073741824.0f);
 }

-ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
+ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float mortar_smooth, float bias,
 	float brick_width, float row_height, float offset_amount, int offset_frequency,
 	float squash_amount, int squash_frequency)
 {
@@ -47,30 +47,41 @@ ccl_device_noinline float2 svm_brick(float3 p, float mortar_size, float bias,
 	x = (p.x+offset) - brick_width*bricknum;
 	y = p.y - row_height*rownum;

-	return make_float2(
-		saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias)),
+	float tint = saturate((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias));
+	float min_dist = min(min(x, y), min(brick_width - x, row_height - y));

-		(x < mortar_size || y < mortar_size ||
-		x > (brick_width - mortar_size) ||
-		y > (row_height - mortar_size)) ? 1.0f : 0.0f);
+	float mortar;
+	if(min_dist >= mortar_size) {
+		mortar = 0.0f;
+	}
+	else if(mortar_smooth == 0.0f) {
+		mortar = 1.0f;
+	}
+	else {
+		min_dist = 1.0f - min_dist/mortar_size;
+		mortar = (min_dist < mortar_smooth)? smoothstepf(min_dist / mortar_smooth) : 1.0f;
+	}
+
+	return make_float2(tint, mortar);
 }

 ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {	
 	uint4 node2 = read_node(kg, offset);
 	uint4 node3 = read_node(kg, offset);
+	uint4 node4 = read_node(kg, offset);
 	
 	/* Input and Output Sockets */
 	uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
 	uint mortar_size_offset, bias_offset, brick_width_offset, row_height_offset;
-	uint color_offset, fac_offset;
+	uint color_offset, fac_offset, mortar_smooth_offset;
 	
 	/* RNA properties */
 	uint offset_frequency, squash_frequency;
 	
 	decode_node_uchar4(node.y, &co_offset, &color1_offset, &color2_offset, &mortar_offset);
 	decode_node_uchar4(node.z, &scale_offset, &mortar_size_offset, &bias_offset, &brick_width_offset);
-	decode_node_uchar4(node.w, &row_height_offset, &color_offset, &fac_offset, NULL);
+	decode_node_uchar4(node.w, &row_height_offset, &color_offset, &fac_offset, &mortar_smooth_offset);
 	
 	decode_node_uchar4(node2.x, &offset_frequency, &squash_frequency, NULL, NULL);

@@ -82,13 +93,14 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	
 	float scale = stack_load_float_default(stack, scale_offset, node2.y);
 	float mortar_size = stack_load_float_default(stack, mortar_size_offset, node2.z);
+	float mortar_smooth = stack_load_float_default(stack, mortar_smooth_offset, node4.x);
 	float bias = stack_load_float_default(stack, bias_offset, node2.w);
 	float brick_width = stack_load_float_default(stack, brick_width_offset, node3.x);
 	float row_height = stack_load_float_default(stack, row_height_offset, node3.y);
 	float offset_amount = __int_as_float(node3.z);
 	float squash_amount = __int_as_float(node3.w);
 	
-	float2 f2 = svm_brick(co*scale, mortar_size, bias, brick_width, row_height,
+	float2 f2 = svm_brick(co*scale, mortar_size, mortar_smooth, bias, brick_width, row_height,
 		offset_amount, offset_frequency, squash_amount, squash_frequency);

 	float tint = f2.x;
@@ -100,7 +112,7 @@ ccl_device void svm_node_tex_brick(KernelGlobals *kg, ShaderData *sd, float *sta
 	}

 	if(stack_valid(color_offset))
-		stack_store_float3(stack, color_offset, (f == 1.0f)? mortar: color1);
+		stack_store_float3(stack, color_offset, color1*(1.0f-f) + mortar*f);
 	if(stack_valid(fac_offset))
 		stack_store_float(stack, fac_offset, f);
 }
@@ -49,7 +49,7 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP))
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
@@ -131,7 +131,7 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP))
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
@@ -216,7 +216,7 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 		}
 		case NODE_TEXCO_NORMAL: {
 			data = ccl_fetch(sd, N);
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP))
 				object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
@@ -135,15 +135,7 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	/* allocate rng state */
 	rng_state.resize(params.width, params.height);

-	uint *init_state = rng_state.resize(params.width, params.height);
-	int x, y, width = params.width, height = params.height;
-	
-	for(y = 0; y < height; y++)
-		for(x = 0; x < width; x++)
-			init_state[y*width + x] = hash_int_2d(params.full_x+x, params.full_y+y);
-
 	device->mem_alloc(rng_state, MEM_READ_WRITE);
-	device->mem_copy_to(rng_state);
 }

 bool RenderBuffers::copy_from_device()
--- a/Show More
+++ b/Show More