From 5322ff3b482cf85fb7f5f2e3df909d63b6fc292f Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 11 Oct 2016 14:48:15 +0200
Subject: [PATCH 01/27] Make console message more clear for --scene argument

---
 source/blender/blenkernel/intern/scene.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/blender/blenkernel/intern/scene.c b/source/blender/blenkernel/intern/scene.c
index 0d204461d16..6e1f11cb526 100644
--- a/source/blender/blenkernel/intern/scene.c
+++ b/source/blender/blenkernel/intern/scene.c
@@ -906,7 +906,7 @@ Scene *BKE_scene_set_name(Main *bmain, const char *name)
 	Scene *sce = (Scene *)BKE_libblock_find_name_ex(bmain, ID_SCE, name);
 	if (sce) {
 		BKE_scene_set_background(bmain, sce);
-		printf("Scene switch: '%s' in file: '%s'\n", name, bmain->name);
+		printf("Scene switch for render: '%s' in file: '%s'\n", name, bmain->name);
 		return sce;
 	}
 

From 17603b9f01ca1d70b327a6b81d9a751b510a7f04 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 11 Oct 2016 15:35:14 +0200
Subject: [PATCH 02/27] Fix objects added via py being on the wrong layer when
 viewport is decoupled from scene

---
 release/scripts/modules/bpy_extras/object_utils.py |  8 ++++++--
 source/blender/makesrna/intern/rna_space.c         | 12 ++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/release/scripts/modules/bpy_extras/object_utils.py b/release/scripts/modules/bpy_extras/object_utils.py
index c2c306e5145..87bb84b5844 100644
--- a/release/scripts/modules/bpy_extras/object_utils.py
+++ b/release/scripts/modules/bpy_extras/object_utils.py
@@ -145,8 +145,12 @@ def object_data_add(context, obdata, operator=None, use_active_layer=True, name=
                 base.layers_from_view(context.space_data)
                 base.layers[scene.active_layer] = True
             else:
-                base.layers = [True if i == scene.active_layer
-                               else False for i in range(len(scene.layers))]
+                if v3d and not v3d.lock_camera_and_layers:
+                    base.layers = [True if i == v3d.active_layer
+                                   else False for i in range(len(v3d.layers))]
+                else:
+                    base.layers = [True if i == scene.active_layer
+                                   else False for i in range(len(scene.layers))]
         else:
             if v3d:
                 base.layers_from_view(context.space_data)
diff --git a/source/blender/makesrna/intern/rna_space.c b/source/blender/makesrna/intern/rna_space.c
index b6c393280ba..8fb99703e9b 100644
--- a/source/blender/makesrna/intern/rna_space.c
+++ b/source/blender/makesrna/intern/rna_space.c
@@ -522,6 +522,13 @@ static void rna_SpaceView3D_layer_set(PointerRNA *ptr, const int *values)
 	v3d->lay = ED_view3d_scene_layer_set(v3d->lay, values, &v3d->layact);
 }
 
+static int rna_SpaceView3D_active_layer_get(PointerRNA *ptr)
+{
+	View3D *v3d = (View3D *)(ptr->data);
+
+	return (int)(log(v3d->layact) / M_LN2);
+}
+
 static void rna_SpaceView3D_layer_update(Main *bmain, Scene *UNUSED(scene), PointerRNA *UNUSED(ptr))
 {
 	DAG_on_visible_update(bmain, false);
@@ -2647,6 +2654,11 @@ static void rna_def_space_view3d(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "Visible Layers", "Layers visible in this 3D View");
 	RNA_def_property_update(prop, NC_SPACE | ND_SPACE_VIEW3D, "rna_SpaceView3D_layer_update");
 
+	prop = RNA_def_property(srna, "active_layer", PROP_INT, PROP_NONE);
+	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE | PROP_EDITABLE);
+	RNA_def_property_int_funcs(prop, "rna_SpaceView3D_active_layer_get", NULL, NULL);
+	RNA_def_property_ui_text(prop, "Active Layer", "Active 3D view layer index");
+
 	prop = RNA_def_property(srna, "layers_local_view", PROP_BOOLEAN, PROP_LAYER_MEMBER);
 	RNA_def_property_boolean_sdna(prop, NULL, "lay", 0x01000000);
 	RNA_def_property_array(prop, 8);

From 06215c71c075369c4910dfbe11b35e5ed176bc22 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Tue, 11 Oct 2016 17:08:00 +0200
Subject: [PATCH 03/27] Fix T49629: Graph editor normalize function doesn't
 work on f-curves with a constant key value

Technically it is a regression in behavior and should go into 2.78a.
---
 source/blender/editors/animation/anim_draw.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/source/blender/editors/animation/anim_draw.c b/source/blender/editors/animation/anim_draw.c
index f8b98ebb8b7..33e44d73894 100644
--- a/source/blender/editors/animation/anim_draw.c
+++ b/source/blender/editors/animation/anim_draw.c
@@ -350,6 +350,10 @@ static float normalization_factor_get(Scene *scene, FCurve *fcu, short flag, flo
 			}
 			offset = -min_coord - range / 2.0f;
 		}
+		else if (max_coord == min_coord) {
+			factor = 1.0f;
+			offset = -min_coord;
+		}
 	}
 	BLI_assert(factor != 0.0f);
 	if (r_offset) {

From dc95c3137256e36314c9b1125f7b9f2f6e51ce78 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 12 Oct 2016 00:07:11 +0200
Subject: [PATCH 04/27] Fix T49502: file browser on OS X not highlighting
 external drives.

---
 source/blender/editors/space_file/fsmenu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/source/blender/editors/space_file/fsmenu.c b/source/blender/editors/space_file/fsmenu.c
index 72034b4f828..631ff06a77a 100644
--- a/source/blender/editors/space_file/fsmenu.c
+++ b/source/blender/editors/space_file/fsmenu.c
@@ -518,14 +518,18 @@ void fsmenu_read_system(struct FSMenu *fsmenu, int read_bookmarks)
 		CFURLEnumeratorRef volEnum = CFURLEnumeratorCreateForMountedVolumes(NULL, kCFURLEnumeratorSkipInvisibles, NULL);
 		
 		while (result != kCFURLEnumeratorEnd) {
-			unsigned char defPath[FILE_MAX];
+			char defPath[FILE_MAX];
 
 			result = CFURLEnumeratorGetNextURL(volEnum, &cfURL, NULL);
 			if (result != kCFURLEnumeratorSuccess)
 				continue;
 			
 			CFURLGetFileSystemRepresentation(cfURL, false, (UInt8 *)defPath, FILE_MAX);
-			fsmenu_insert_entry(fsmenu, FS_CATEGORY_SYSTEM, (char *)defPath, NULL, FS_INSERT_SORTED);
+
+			/* Add end slash for consistency with other platforms */
+			BLI_add_slash(defPath);
+
+			fsmenu_insert_entry(fsmenu, FS_CATEGORY_SYSTEM, defPath, NULL, FS_INSERT_SORTED);
 		}
 		
 		CFRelease(volEnum);

From acbbcdfbb8d63ad783410120db12ab1540822a0e Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 10:01:30 +0200
Subject: [PATCH 05/27] Fix T49622: Grease pencil not rendering out of VSE

---
 source/blender/editors/render/render_opengl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/blender/editors/render/render_opengl.c b/source/blender/editors/render/render_opengl.c
index 6f3694d1db8..0716c062ab9 100644
--- a/source/blender/editors/render/render_opengl.c
+++ b/source/blender/editors/render/render_opengl.c
@@ -442,7 +442,7 @@ static void add_gpencil_renderpass(OGLRender *oglrender, RenderResult *rr, Rende
 	if (BLI_listbase_is_empty(&gpd->layers)) {
 		return;
 	}
-	if ((oglrender->v3d->flag2 & V3D_SHOW_GPENCIL) == 0) {
+	if (oglrender->v3d != NULL && (oglrender->v3d->flag2 & V3D_SHOW_GPENCIL) == 0) {
 		return;
 	}
 

From edd9d89673f7f737d367fca5a896e478f6ff66f4 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 11:54:33 +0200
Subject: [PATCH 06/27] Cycles: Cleanup, style

---
 intern/cycles/kernel/kernel_subsurface.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index f03fe288a0c..1575fa4e2db 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -230,7 +230,7 @@ ccl_device_inline
 #endif
 int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
-        SubsurfaceIntersection* ss_isect,
+        SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
         ShaderClosure *sc,
         uint *lcg_state,

From cc951726673e0e7e9969ce83f90efdabde289791 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 12:12:28 +0200
Subject: [PATCH 07/27] Cycles: Fix use of uninitialized variable in SSS

When a ray hits a curve segment with an SSS shader, it was possible for
an uninitialized hit_P variable to be used for sampling.

This seems to have been the reason for our headache with differences
between AVX2 and SSE4 render results here, so now we can revert all the
nasty ifdef-ed inline policies.
---
 intern/cycles/kernel/kernel_subsurface.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 1575fa4e2db..955aa8a346f 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -330,6 +330,10 @@ int subsurface_scatter_multi_intersect(
 			                                          verts);
 		}
 #endif  /* __OBJECT_MOTION__ */
+		else {
+			ss_isect->weight[hit] = make_float3(0.0f, 0.0f, 0.0f);
+			continue;
+		}
 
 		float3 hit_Ng = ss_isect->Ng[hit];
 		if(ss_isect->hits[hit].object != OBJECT_NONE) {

From 87d08a5dc183c9d5025433be137970423c4824e7 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 12:15:24 +0200
Subject: [PATCH 08/27] Cycles: Get rid of ifdef-ed noinline policy

---
 intern/cycles/kernel/kernel_subsurface.h | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 955aa8a346f..52c05b85aee 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -85,16 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
 	return NULL;
 }
 
-#ifndef __KERNEL_GPU__
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-float3 subsurface_scatter_eval(ShaderData *sd,
-                               ShaderClosure *sc,
-                               float disk_r,
-                               float r,
-                               bool all)
+ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
+                                                 ShaderClosure *sc,
+                                                 float disk_r,
+                                                 float r,
+                                                 bool all)
 {
 #ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
@@ -223,12 +218,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 /* Subsurface scattering step, from a point on the surface to other
  * nearby points on the same object.
  */
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_inline
-#endif
-int subsurface_scatter_multi_intersect(
+ccl_device_inline int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
         SubsurfaceIntersection *ss_isect,
         ShaderData *sd,

From fa62a989b4d66e58dc21bd319f9adc84bdd8721e Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 12:54:31 +0200
Subject: [PATCH 09/27] Cycles: Enable SSE options of math module for AVX2
 kernels

Currently this does not give a measurable difference, but it is required
groundwork for some upcoming further optimization of AVX2 kernels.
---
 intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 7351e2bad6b..1a416e771ee 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -20,6 +20,7 @@
 
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #  define __KERNEL_SSE3__
 #  define __KERNEL_SSSE3__

From 6a4ec3ca43b3aaade29a3642f3c6a6138b89e4b8 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 13:03:15 +0200
Subject: [PATCH 10/27] Cycles: Add new avxf vectorized data type

Based on existing ssef data type and to my knowledge it's also what happens in
Embree nowadays.

Inspired by Maxym Dmytrychenko and required for the upcoming triangle
intersection commit.

Hopefully the copyright message is correct.
---
 intern/cycles/kernel/kernel_compat_cpu.h |  15 ++
 intern/cycles/util/CMakeLists.txt        |   1 +
 intern/cycles/util/util_avxf.h           | 185 +++++++++++++++++++++++
 intern/cycles/util/util_simd.h           |   1 +
 4 files changed, 202 insertions(+)
 create mode 100644 intern/cycles/util/util_avxf.h

diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 7b30df04550..9d1f3bdc918 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,6 +71,20 @@ template<typename T> struct texture  {
 		return data[index];
 	}
 
+#ifdef __KERNEL_AVX__
+	/* Reads 256 bits but indexes in blocks of 128 bits to maintain
+	 * compatibility with existing indices and data structures.
+	 */
+	ccl_always_inline avxf fetch_avxf(const int index)
+	{
+		kernel_assert(index >= 0 && (index+1) < width);
+		ssef *ssefData = (ssef*)data;
+		ssef *ssefNodeData = &ssefData[index];
+		return _mm256_loadu_ps((float *)ssefNodeData);
+	}
+
+#endif
+
 #ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
 /* Macros to handle different memory storage on different devices */
 
 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index f5674bdc15c..02ee4cd6774 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
 	util_sky_model.cpp
 	util_sky_model.h
 	util_sky_model_data.h
+	util_avxf.h
 	util_sseb.h
 	util_ssef.h
 	util_ssei.h
diff --git a/intern/cycles/util/util_avxf.h b/intern/cycles/util/util_avxf.h
new file mode 100644
index 00000000000..2db2c4dad1a
--- /dev/null
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2016 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0(the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+struct avxf
+{
+	typedef avxf Float;
+
+	enum { size = 8 };  /* Number of SIMD elements. */
+
+	union {
+		__m256 m256;
+		float f[8];
+		int i[8];
+	};
+
+	__forceinline avxf           () {}
+	__forceinline avxf           (const avxf& other) { m256 = other.m256; }
+	__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
+
+	__forceinline avxf(const __m256 a) : m256(a) {}
+	__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}
+
+	__forceinline operator const __m256&(void) const { return m256; }
+	__forceinline operator       __m256&(void)       { return m256; }
+
+	__forceinline avxf          (float a) : m256(_mm256_set1_ps(a)) {}
+
+	__forceinline avxf(float high32x4, float low32x4) :
+	   m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
+
+	__forceinline avxf(float a3, float a2, float a1, float a0) :
+	   m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
+
+	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
+		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+
+
+	__forceinline avxf(int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+
+	__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	__forceinline avxf(__m128 a, __m128 b)
+	{
+		const __m256 foo = _mm256_castps128_ps256(a);
+		m256 = _mm256_insertf128_ps(foo, b, 1);
+	}
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }
+__forceinline const avxf operator +(const avxf& a, const float& b) { return a + avxf(b); }
+__forceinline const avxf operator +(const float& a, const avxf& b) { return avxf(a) + b; }
+
+__forceinline const avxf operator -(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }
+__forceinline const avxf operator -(const avxf& a, const float& b) { return a - avxf(b); }
+__forceinline const avxf operator -(const float& a, const avxf& b) { return avxf(a) - b; }
+
+__forceinline const avxf operator *(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }
+__forceinline const avxf operator *(const avxf& a, const float& b) { return a * avxf(b); }
+__forceinline const avxf operator *(const float& a, const avxf& b) { return avxf(a) * b; }
+
+__forceinline const avxf operator /(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256,b.m256); }
+__forceinline const avxf operator /(const avxf& a, const float& b) { return a/avxf(b); }
+__forceinline const avxf operator /(const float& a, const avxf& b) { return avxf(a)/b; }
+
+__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256,b.m256); }
+
+__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256,b.m256); }
+
+__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf shuffle(const avxf& a, const __m256i &shuf) {
+	return _mm256_permutevar_ps(a, shuf);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf shuffle(const avxf& a) {
+	return _mm256_permutevar_ps(a, _mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0,i1,i2,i3>(a,a);
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return shuffle<i0,i0,i0,i0>(a, b);
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0>(a,a);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf& a) {
+#ifdef __KERNEL_AVX2__
+	return  _mm256_permutevar8x32_ps(a,_mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
+#else
+	float temp[8];
+	_mm256_storeu_ps((float*)&temp, a);
+	return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
+#endif
+}
+
+template<int S0, int S1, int S2, int S3,int S4,int S5,int S6, int S7>
+ccl_device_inline const avxf set_sign_bit(const avxf &a)
+{
+	return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31,S2 << 31,S1 << 31,S0 << 31);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3,size_t S4,size_t S5,size_t S6, size_t S7>
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return _mm256_blend_ps(a,b,S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3 >
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf madd (const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fmadd_ps(a,b,c);
+#else
+	return c+(a*b);
+#endif
+}
+
+__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fnmadd_ps(a, b, c);
+#else
+	return c-(a*b);
+#endif
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8d4d79068d6..f4f460d6cf6 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
 #include "util_sseb.h"
 #include "util_ssei.h"
 #include "util_ssef.h"
+#include "util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 

From 42aeb608e75ec976c0bb3d91ca14b49371e43e6d Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 13:46:25 +0200
Subject: [PATCH 11/27] Cycles: Implement AVX2 version of triangle_intersect

This commit basically vectorizes existing code using AVX2 instructions
(without modifying algorithm itself). This gives quite nice speedups:

  BMW:        -8%
  Classroom:  -5%
  Cat:        -5%
  Koro:       +1%
  Barcelona:  -8%

That's on Linux machine, reported performance improvement on Windows
goes up to 20%.

Not currently sure why Koro is somewhat slower, since it mainly uses
curve intersection tests. Could it be timing noise? Or something with
cache utilization perhaps? In any case, the speedup in other scenes makes
me think that the current state is acceptable for an initial implementation.

This is again inspired by Maxym Dmytrychenko.
---
 .../kernel/geom/geom_triangle_intersect.h     | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index dd5328220ab..b505bd54e5e 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -107,6 +107,67 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 
 	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
+
+#if defined(__KERNEL_AVX2__)
+	const avxf avxf_P(P.m128, P.m128);
+
+	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
+	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
+
+	const avxf AB = tri_ab - avxf_P;
+	const avxf BC = tri_bc - avxf_P;
+
+	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
+
+	const avxf AB_k = shuffle(AB, permuteMask);
+	const avxf BC_k = shuffle(BC, permuteMask);
+
+	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
+	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
+
+	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
+	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
+
+	const avxf Sxy(Sy, Sx, Sy, Sx);
+
+	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
+	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
+
+	float ABBC_kz_array[8];
+	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
+
+	const float A_kz = ABBC_kz_array[0];
+	const float B_kz = ABBC_kz_array[2];
+	const float C_kz = ABBC_kz_array[6];
+
+	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
+	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
+
+	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
+
+	/* W           U                             V
+	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
+	 */
+	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
+
+	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
+
+	/* Calculate scaled barycentric coordinates. */
+	float WUVW_array[4];
+	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
+
+	const float W = WUVW_array[0];
+	const float U = WUVW_array[1];
+	const float V = WUVW_array[2];
+
+	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
+	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
+	                                               _mm256_setzero_ps(), 0));
+
+	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
+		return false;
+	}
+#else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
@@ -135,6 +196,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	{
 		return false;
 	}
+#endif
 
 	/* Calculate determinant. */
 	float det = U + V + W;

From e588106d459207f04d28cfc3456355343d413446 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 14:23:29 +0200
Subject: [PATCH 12/27] Cycles: Use more SSE intrinsics for float3 type

This gives about 5% speedup on AVX2 kernels (other kernels still
have SSE disabled for math operations) and this solves the slowdown
of the Koro scene mentioned in the previous commit.

The title says it all actually. This commit also contains
changes to pass float3 as const reference in affected functions.

This should make MSVC happier without breaking OpenCL because it's
only done in areas which are ifdef-ed for non-OpenCL.

Another patch based on inspiration from Maxym Dmytrychenko, thanks!
---
 intern/cycles/util/util_math.h  | 78 ++++++++++++++++++++++++++-------
 intern/cycles/util/util_types.h | 15 ++++++-
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 89a882d9b9d..c98407b1f77 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -424,53 +424,87 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
 	return make_float3(-a.x, -a.y, -a.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
 	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
 }
 
-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }
 
-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	return make_float3(f/a.x, f/a.y, f/a.z);
+#ifdef __KERNEL_SSE__
+	/* Exact division: _mm_rcp_ps() is only a ~12 bit approximation. */
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
+#else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+#endif
 }
 
-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
 {
 	float invf = 1.0f/f;
-	return make_float3(a.x*invf, a.y*invf, a.z*invf);
+	return a * invf;
 }
 
-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+#ifdef __KERNEL_SSE__
+	/* Exact division: _mm_rcp_ps() is only a ~12 bit approximation. */
+	return float3(_mm_div_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
 }
 
-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
 }
 
-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
 {
 	return a = a + b;
 }
@@ -505,6 +539,15 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }
 
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
 ccl_device_inline float dot(const float4 a, const float4 b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
@@ -563,13 +606,14 @@ ccl_device_inline float3 saturate3(float3 a)
 ccl_device_inline float3 normalize_len(const float3 a, float *t)
 {
 	*t = len(a);
-	return a/(*t);
+	float x = 1.0f / *t;
+	return a*x;
 }
 
 ccl_device_inline float3 safe_normalize(const float3 a)
 {
 	float t = len(a);
-	return (t != 0.0f)? a/t: a;
+	return (t != 0.0f)? a * (1.0f/t) : a;
 }
 
 ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 6af65f88a02..a000fae4bd6 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -174,6 +174,9 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int3(const int3& a) { m128 = a.m128; }
+	int3& operator =(const int3& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -193,6 +196,9 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int4(const int4& a) : m128(a.m128) {}
+	int4& operator=(const int4& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -237,9 +243,12 @@ struct ccl_try_align(16) float3 {
 	};
 
 	__forceinline float3() {}
-	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline float3(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float3(const float3& a) : m128(a.m128) {}
+	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
 #else
 	float x, y, z, w;
 #endif
@@ -259,6 +268,10 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4(const __m128 a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float4(const float4& a) : m128(a.m128) {}
+	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
 #else
 	float x, y, z, w;
 #endif

From 22cdf441018f6ff358fed68aae33cf6dada5119e Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 14:48:59 +0200
Subject: [PATCH 13/27] Cycles: Use const reference for register variables in
 non-OpenCL code

This is something tested by @LazyDodo and suggested by Maxym to make
MSVC happier.
---
 intern/cycles/util/util_math.h | 100 ++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c98407b1f77..ce2e4e5c30d 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -233,7 +233,7 @@ ccl_device_inline int mod(int x, int m)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline bool is_zero(const float2 a)
+ccl_device_inline bool is_zero(const float2& a)
 {
 	return (a.x == 0.0f && a.y == 0.0f);
 }
@@ -242,7 +242,7 @@ ccl_device_inline bool is_zero(const float2 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float average(const float2 a)
+ccl_device_inline float average(const float2& a)
 {
 	return (a.x + a.y)*(1.0f/2.0f);
 }
@@ -251,58 +251,58 @@ ccl_device_inline float average(const float2 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float2 operator-(const float2 a)
+ccl_device_inline float2 operator-(const float2& a)
 {
 	return make_float2(-a.x, -a.y);
 }
 
-ccl_device_inline float2 operator*(const float2 a, const float2 b)
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
 {
 	return make_float2(a.x*b.x, a.y*b.y);
 }
 
-ccl_device_inline float2 operator*(const float2 a, float f)
+ccl_device_inline float2 operator*(const float2& a, float f)
 {
 	return make_float2(a.x*f, a.y*f);
 }
 
-ccl_device_inline float2 operator*(float f, const float2 a)
+ccl_device_inline float2 operator*(float f, const float2& a)
 {
 	return make_float2(a.x*f, a.y*f);
 }
 
-ccl_device_inline float2 operator/(float f, const float2 a)
+ccl_device_inline float2 operator/(float f, const float2& a)
 {
 	return make_float2(f/a.x, f/a.y);
 }
 
-ccl_device_inline float2 operator/(const float2 a, float f)
+ccl_device_inline float2 operator/(const float2& a, float f)
 {
 	float invf = 1.0f/f;
 	return make_float2(a.x*invf, a.y*invf);
 }
 
-ccl_device_inline float2 operator/(const float2 a, const float2 b)
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
 {
 	return make_float2(a.x/b.x, a.y/b.y);
 }
 
-ccl_device_inline float2 operator+(const float2 a, const float2 b)
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
 {
 	return make_float2(a.x+b.x, a.y+b.y);
 }
 
-ccl_device_inline float2 operator-(const float2 a, const float2 b)
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
 {
 	return make_float2(a.x-b.x, a.y-b.y);
 }
 
-ccl_device_inline float2 operator+=(float2& a, const float2 b)
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
 {
 	return a = a + b;
 }
 
-ccl_device_inline float2 operator*=(float2& a, const float2 b)
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
 {
 	return a = a * b;
 }
@@ -312,7 +312,7 @@ ccl_device_inline float2 operator*=(float2& a, float f)
 	return a = a * f;
 }
 
-ccl_device_inline float2 operator/=(float2& a, const float2 b)
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
 {
 	return a = a / b;
 }
@@ -324,12 +324,12 @@ ccl_device_inline float2 operator/=(float2& a, float f)
 }
 
 
-ccl_device_inline float dot(const float2 a, const float2 b)
+ccl_device_inline float dot(const float2& a, const float2& b)
 {
 	return a.x*b.x + a.y*b.y;
 }
 
-ccl_device_inline float cross(const float2 a, const float2 b)
+ccl_device_inline float cross(const float2& a, const float2& b)
 {
 	return (a.x*b.y - a.y*b.x);
 }
@@ -343,59 +343,59 @@ ccl_device_inline bool operator==(const int2 a, const int2 b)
 	return (a.x == b.x && a.y == b.y);
 }
 
-ccl_device_inline float len(const float2 a)
+ccl_device_inline float len(const float2& a)
 {
 	return sqrtf(dot(a, a));
 }
 
-ccl_device_inline float2 normalize(const float2 a)
+ccl_device_inline float2 normalize(const float2& a)
 {
 	return a/len(a);
 }
 
-ccl_device_inline float2 normalize_len(const float2 a, float *t)
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
 {
 	*t = len(a);
 	return a/(*t);
 }
 
-ccl_device_inline float2 safe_normalize(const float2 a)
+ccl_device_inline float2 safe_normalize(const float2& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }
 
-ccl_device_inline bool operator==(const float2 a, const float2 b)
+ccl_device_inline bool operator==(const float2& a, const float2& b)
 {
 	return (a.x == b.x && a.y == b.y);
 }
 
-ccl_device_inline bool operator!=(const float2 a, const float2 b)
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
 {
 	return !(a == b);
 }
 
-ccl_device_inline float2 min(float2 a, float2 b)
+ccl_device_inline float2 min(const float2& a, const float2& b)
 {
 	return make_float2(min(a.x, b.x), min(a.y, b.y));
 }
 
-ccl_device_inline float2 max(float2 a, float2 b)
+ccl_device_inline float2 max(const float2& a, const float2& b)
 {
 	return make_float2(max(a.x, b.x), max(a.y, b.y));
 }
 
-ccl_device_inline float2 clamp(float2 a, float2 mn, float2 mx)
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
 {
 	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float2 fabs(float2 a)
+ccl_device_inline float2 fabs(const float2& a)
 {
 	return make_float2(fabsf(a.x), fabsf(a.y));
 }
 
-ccl_device_inline float2 as_float2(const float4 a)
+ccl_device_inline float2 as_float2(const float4& a)
 {
 	return make_float2(a.x, a.y);
 }
@@ -413,7 +413,7 @@ ccl_device_inline void print_float2(const char *label, const float2& a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float2 interp(float2 a, float2 b, float t)
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -509,7 +509,7 @@ ccl_device_inline float3 operator+=(float3& a, const float3& b)
 	return a = a + b;
 }
 
-ccl_device_inline float3 operator*=(float3& a, const float3 b)
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
 {
 	return a = a * b;
 }
@@ -519,7 +519,7 @@ ccl_device_inline float3 operator*=(float3& a, float f)
 	return a = a * f;
 }
 
-ccl_device_inline float3 operator/=(float3& a, const float3 b)
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
 {
 	return a = a / b;
 }
@@ -530,7 +530,7 @@ ccl_device_inline float3 operator/=(float3& a, float f)
 	return a = a * invf;
 }
 
-ccl_device_inline float dot(const float3 a, const float3 b)
+ccl_device_inline float dot(const float3& a, const float3& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -548,7 +548,7 @@ ccl_device_inline float dot_xy(const float3& a, const float3& b)
 #endif
 }
 
-ccl_device_inline float dot(const float4 a, const float4 b)
+ccl_device_inline float dot(const float4& a, const float4& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
@@ -557,7 +557,7 @@ ccl_device_inline float dot(const float4 a, const float4 b)
 #endif
 }
 
-ccl_device_inline float3 cross(const float3 a, const float3 b)
+ccl_device_inline float3 cross(const float3& a, const float3& b)
 {
 	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
 	return r;
@@ -581,12 +581,12 @@ ccl_device_inline float len_squared(const float3 a)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float len_squared(const float4 a)
+ccl_device_inline float len_squared(const float4& a)
 {
 	return dot(a, a);
 }
 
-ccl_device_inline float3 normalize(const float3 a)
+ccl_device_inline float3 normalize(const float3& a)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -624,7 +624,7 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline bool operator==(const float3 a, const float3 b)
+ccl_device_inline bool operator==(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -633,12 +633,12 @@ ccl_device_inline bool operator==(const float3 a, const float3 b)
 #endif
 }
 
-ccl_device_inline bool operator!=(const float3 a, const float3 b)
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
 {
 	return !(a == b);
 }
 
-ccl_device_inline float3 min(float3 a, float3 b)
+ccl_device_inline float3 min(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -647,7 +647,7 @@ ccl_device_inline float3 min(float3 a, float3 b)
 #endif
 }
 
-ccl_device_inline float3 max(float3 a, float3 b)
+ccl_device_inline float3 max(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -656,12 +656,12 @@ ccl_device_inline float3 max(float3 a, float3 b)
 #endif
 }
 
-ccl_device_inline float3 clamp(float3 a, float3 mn, float3 mx)
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
 {
 	return min(max(a, mn), mx);
 }
 
-ccl_device_inline float3 fabs(float3 a)
+ccl_device_inline float3 fabs(const float3& a)
 {
 #ifdef __KERNEL_SSE__
 	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
@@ -714,7 +714,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)
 
 #ifndef __KERNEL_OPENCL__
 
-ccl_device_inline float3 mix(float3 a, float3 b, float t)
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -877,7 +877,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
 #endif
 }
 
-ccl_device_inline int4 operator>=(float4 a, float4 b)
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
@@ -895,7 +895,7 @@ ccl_device_inline int4 operator<=(const float4& a, const float4& b)
 #endif
 }
 
-ccl_device_inline bool operator==(const float4 a, const float4 b)
+ccl_device_inline bool operator==(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -937,23 +937,23 @@ ccl_device_inline float average(const float4& a)
 	return reduce_add(a) * 0.25f;
 }
 
-ccl_device_inline float len(const float4 a)
+ccl_device_inline float len(const float4& a)
 {
 	return sqrtf(dot(a, a));
 }
 
-ccl_device_inline float4 normalize(const float4 a)
+ccl_device_inline float4 normalize(const float4& a)
 {
 	return a/len(a);
 }
 
-ccl_device_inline float4 safe_normalize(const float4 a)
+ccl_device_inline float4 safe_normalize(const float4& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }
 
-ccl_device_inline float4 min(float4 a, float4 b)
+ccl_device_inline float4 min(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -962,7 +962,7 @@ ccl_device_inline float4 min(float4 a, float4 b)
 #endif
 }
 
-ccl_device_inline float4 max(float4 a, float4 b)
+ccl_device_inline float4 max(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -1234,7 +1234,7 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
 
 /* Triangle */
 
-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
+ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
 {
 	return len(cross(v3 - v2, v1 - v2))*0.5f;
 }

From a54242503e3fd50dcd4d3af8b39e1d2a6e138e45 Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Wed, 12 Oct 2016 16:17:43 +0200
Subject: [PATCH 14/27] FFmpeg: Fix off by one error in number of detected
 frames in matroska container

Seems to be a rounding error. Hopefully the new code still handles the error fixed
back in SVN revision 28901 and still gives the proper frame number for Hjalti.

What could possibly go wrong here..
---
 source/blender/imbuf/intern/anim_movie.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/blender/imbuf/intern/anim_movie.c b/source/blender/imbuf/intern/anim_movie.c
index d378ca9a78c..a40b257b75b 100644
--- a/source/blender/imbuf/intern/anim_movie.c
+++ b/source/blender/imbuf/intern/anim_movie.c
@@ -522,9 +522,9 @@ static int startffmpeg(struct anim *anim)
 		anim->duration = pFormatCtx->streams[videoStream]->nb_frames;
 	}
 	else {
-		anim->duration = ceil(pFormatCtx->duration *
-		                      av_q2d(frame_rate) /
-		                      AV_TIME_BASE);
+		anim->duration = (int)(pFormatCtx->duration *
+		                       av_q2d(frame_rate) /
+		                       AV_TIME_BASE + 0.5f);
 	}
 
 	frs_num = frame_rate.num;

From 5f16382195a04d5bb418c1d8fa666110b368404d Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 12 Oct 2016 13:10:09 +0200
Subject: [PATCH 15/27] Fix T49636: material draw mode crash with displacement
 and missing group input node.

---
 source/blender/nodes/shader/node_shader_tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/blender/nodes/shader/node_shader_tree.c b/source/blender/nodes/shader/node_shader_tree.c
index 8523b7275bf..40d1cfdfcb0 100644
--- a/source/blender/nodes/shader/node_shader_tree.c
+++ b/source/blender/nodes/shader/node_shader_tree.c
@@ -327,7 +327,8 @@ static void ntree_shader_link_builtin_group_normal(
 	/* Need to update tree so all node instances nodes gets proper sockets. */
 	bNode *group_input_node = ntreeFindType(group_ntree, NODE_GROUP_INPUT);
 	node_group_verify(ntree, group_node, &group_ntree->id);
-	node_group_input_verify(group_ntree, group_input_node, &group_ntree->id);
+	if (group_input_node)
+		node_group_input_verify(group_ntree, group_input_node, &group_ntree->id);
 	ntreeUpdateTree(G.main, group_ntree);
 	/* Assumes sockets are always added at the end. */
 	bNodeSocket *group_node_normal_socket = group_node->inputs.last;
@@ -370,7 +371,7 @@ static void ntree_shader_link_builtin_group_normal(
 		                                 group_displacement_socket);
 		ntreeUpdateTree(G.main, group_ntree);
 	}
-	else {
+	else if (group_input_node) {
 		/* Connect group node normal input. */
 		nodeAddLink(ntree,
 		            node_from, socket_from,

From 21e65d7457a4fbe7712630fe94425e49772e0c9d Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 12 Oct 2016 17:35:03 +0200
Subject: [PATCH 16/27] Fix build error with WITH_CYCLES_NATIVE_ONLY and recent
 AVX2 changes.

---
 intern/cycles/kernel/kernels/cpu/kernel.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index f11c85d5f6a..1559b0d7322 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -45,6 +45,7 @@
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
+#    define __KERNEL_SSE__
 #    define __KERNEL_AVX2__
 #  endif
 #endif

From 7f5441b9167e46761c944584719a556d4604da46 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 12 Oct 2016 18:36:41 +0200
Subject: [PATCH 17/27] Fix T49640: Cycles constant folding incorrect for
 texture coordinates.

---
 intern/cycles/render/graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index ed8c7056aaa..131ec824be3 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -321,8 +321,8 @@ void ShaderGraph::finalize(Scene *scene,
 	 * modified afterwards. */
 
 	if(!finalized) {
-		clean(scene);
 		default_inputs(do_osl);
+		clean(scene);
 		refine_bump_nodes();
 
 		if(do_bump)

From 2d03edb458bda6f2d3b84ede0d49f8e7b78594f7 Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel <brechtvanlommel@gmail.com>
Date: Wed, 12 Oct 2016 20:15:38 +0200
Subject: [PATCH 18/27] Fix T49631: radial control operators not using DPI
 properly.

---
 source/blender/windowmanager/intern/wm_operators.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/blender/windowmanager/intern/wm_operators.c b/source/blender/windowmanager/intern/wm_operators.c
index 87ef0596e52..b9fd4d2e762 100644
--- a/source/blender/windowmanager/intern/wm_operators.c
+++ b/source/blender/windowmanager/intern/wm_operators.c
@@ -2872,8 +2872,8 @@ void WM_OT_straightline_gesture(wmOperatorType *ot)
 
 /* *********************** radial control ****************** */
 
-#define WM_RADIAL_CONTROL_DISPLAY_SIZE (200)
-#define WM_RADIAL_CONTROL_DISPLAY_MIN_SIZE (35)
+#define WM_RADIAL_CONTROL_DISPLAY_SIZE (200 * UI_DPI_FAC)
+#define WM_RADIAL_CONTROL_DISPLAY_MIN_SIZE (35 * UI_DPI_FAC)
 #define WM_RADIAL_CONTROL_DISPLAY_WIDTH (WM_RADIAL_CONTROL_DISPLAY_SIZE - WM_RADIAL_CONTROL_DISPLAY_MIN_SIZE)
 #define WM_RADIAL_MAX_STR 10
 
@@ -3150,7 +3150,7 @@ static void radial_control_paint_cursor(bContext *C, int x, int y, void *customd
 	if (rmin > 0.0f)
 		glutil_draw_lined_arc(0.0, (float)(M_PI * 2.0), rmin, 40);
 
-	BLF_size(fontid, 1.5 * fstyle_points, 1.0f / U.dpi);
+	BLF_size(fontid, 1.5 * fstyle_points * U.pixelsize, U.dpi);
 	BLF_enable(fontid, BLF_SHADOW);
 	BLF_shadow(fontid, 3, (const float[4]){0.0f, 0.0f, 0.0f, 0.5f});
 	BLF_shadow_offset(fontid, 1, -1);

From 8d573aa0ecb6143f239d5bfeb3842c565461c1e4 Mon Sep 17 00:00:00 2001
From: Dalai Felinto <dfelinto@gmail.com>
Date: Thu, 13 Oct 2016 00:28:07 +0000
Subject: [PATCH 19/27] wrong alpha set for timeline theme (fixup for
 rBf329ebe3)

---
 source/blender/editors/interface/resources.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/source/blender/editors/interface/resources.c b/source/blender/editors/interface/resources.c
index 79fa7a7571a..dad5eb70612 100644
--- a/source/blender/editors/interface/resources.c
+++ b/source/blender/editors/interface/resources.c
@@ -2743,6 +2743,10 @@ void init_userdef_do_versions(void)
 		for (btheme = U.themes.first; btheme; btheme = btheme->next) {
 			rgba_char_args_set(btheme->tv3d.vertex_bevel, 0, 165, 255, 255);
 			rgba_char_args_set(btheme->tv3d.edge_bevel, 0, 165, 255, 255);
+
+			/* 3dView Keyframe Indicators */
+			btheme->tv3d.time_keyframe[3] = 0xFF;
+			btheme->tv3d.time_gp_keyframe[3] = 0xFF;
 		}
 	}
 

From 786c0966ec19e95b37c352f3fdc74cd36803537a Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Wed, 12 Oct 2016 21:28:11 +0200
Subject: [PATCH 20/27] Cleanup: UI layout: remove unused and confusing
 parameter.

Things are complicated enough like that, no need to add useless noise on
top of it!
---
 .../editors/interface/interface_layout.c        | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c
index 976b5ed1193..b52068d8bd1 100644
--- a/source/blender/editors/interface/interface_layout.c
+++ b/source/blender/editors/interface/interface_layout.c
@@ -184,11 +184,8 @@ static const char *ui_item_name_add_colon(const char *name, char namestr[UI_MAX_
 	return name;
 }
 
-static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment, int *offset)
+static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment)
 {
-	if (offset)
-		*offset = 0;
-
 	/* available == 0 is unlimited */
 	if (available == 0)
 		return item;
@@ -2110,7 +2107,7 @@ static void ui_litem_layout_row(uiLayout *litem)
 			minw = ui_litem_min_width(itemw);
 
 			if (w - lastw > 0)
-				neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment, NULL);
+				neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment);
 			else
 				neww = 0;  /* no space left, all will need clamping to minimum size */
 
@@ -2144,12 +2141,12 @@ static void ui_litem_layout_row(uiLayout *litem)
 
 		if (item->flag) {
 			/* fixed minimum size items */
-			itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment, NULL);
+			itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment);
 			fixedx += itemw;
 		}
 		else {
 			/* free size item */
-			itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment, NULL);
+			itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment);
 			freex += itemw;
 		}
 
@@ -2469,7 +2466,7 @@ static void ui_litem_layout_column_flow(uiLayout *litem)
 	uiLayoutItemFlow *flow = (uiLayoutItemFlow *)litem;
 	uiItem *item;
 	int col, x, y, w, emh, emy, miny, itemw, itemh;
-	int toth, totitem, offset;
+	int toth, totitem;
 
 	/* compute max needed width and total height */
 	toth = 0;
@@ -2493,11 +2490,11 @@ static void ui_litem_layout_column_flow(uiLayout *litem)
 	col = 0;
 	for (item = litem->items.first; item; item = item->next) {
 		ui_item_size(item, NULL, &itemh);
-		itemw = ui_item_fit(1, x - litem->x, flow->totcol, w, col == flow->totcol - 1, litem->alignment, &offset);
+		itemw = ui_item_fit(1, x - litem->x, flow->totcol, w, col == flow->totcol - 1, litem->alignment);
 	
 		y -= itemh;
 		emy -= itemh;
-		ui_item_position(item, x + offset, y, itemw, itemh);
+		ui_item_position(item, x, y, itemw, itemh);
 		y -= style->buttonspacey;
 		miny = min_ii(miny, y);
 

From 918e6cf4c9b74ff96ad06753ef9e541837eb3b22 Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Thu, 13 Oct 2016 10:21:38 +0200
Subject: [PATCH 21/27] Fix T49635: column_flow Layout - last column is too
 small.

Column flow layout was abusing ui_item_fit in a weird way, which was
broken for last-column items.

Now use dedicated code instead, which basically spreads the available width as
equally as possible between all columns.
---
 source/blender/editors/interface/interface_layout.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c
index b52068d8bd1..875522e01c6 100644
--- a/source/blender/editors/interface/interface_layout.c
+++ b/source/blender/editors/interface/interface_layout.c
@@ -2488,10 +2488,12 @@ static void ui_litem_layout_column_flow(uiLayout *litem)
 
 	/* create column per column */
 	col = 0;
+	w = (litem->w - (flow->totcol - 1) * style->columnspace) / flow->totcol;
 	for (item = litem->items.first; item; item = item->next) {
-		ui_item_size(item, NULL, &itemh);
-		itemw = ui_item_fit(1, x - litem->x, flow->totcol, w, col == flow->totcol - 1, litem->alignment);
-	
+		ui_item_size(item, &itemw, &itemh);
+
+		itemw = (litem->alignment == UI_LAYOUT_ALIGN_EXPAND) ? w : min_ii(w, itemw);
+
 		y -= itemh;
 		emy -= itemh;
 		ui_item_position(item, x, y, itemw, itemh);
@@ -2500,10 +2502,13 @@ static void ui_litem_layout_column_flow(uiLayout *litem)
 
 		/* decide to go to next one */
 		if (col < flow->totcol - 1 && emy <= -emh) {
-			x += itemw + style->columnspace;
+			x += w + style->columnspace;
 			y = litem->y;
 			emy = 0; /* need to reset height again for next column */
 			col++;
+
+			/*  (<     remaining width     > - <      space between remaining columns      >) / <remamining columns > */
+			w = ((litem->w - (x - litem->x)) - (flow->totcol - col - 1) * style->columnspace) / (flow->totcol - col);
 		}
 	}
 

From 5c651554e2f98bff1cfdff0a6a6f029453dc3309 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sybren=20A=2E=20St=C3=BCvel?= <sybren@stuvel.eu>
Date: Thu, 13 Oct 2016 13:32:08 +0200
Subject: [PATCH 22/27] Proxy appending: re-establish link to proxies when they
 are made local

This allows appending of an entire scene from another blend file into this one,
even when that blend file contains proxified armatures.

This replaces the approach from commit 1cdc54dc7db85766.

Thanks @sergey for the help.
---
 source/blender/blenkernel/BKE_object.h     |  1 +
 source/blender/blenkernel/intern/library.c | 49 ++++------------------
 source/blender/blenkernel/intern/object.c  | 14 ++++++-
 3 files changed, 23 insertions(+), 41 deletions(-)

diff --git a/source/blender/blenkernel/BKE_object.h b/source/blender/blenkernel/BKE_object.h
index 1b3e05d11ae..cf07a178fe8 100644
--- a/source/blender/blenkernel/BKE_object.h
+++ b/source/blender/blenkernel/BKE_object.h
@@ -108,6 +108,7 @@ struct Object *BKE_object_lod_matob_get(struct Object *ob, struct Scene *scene);
 struct Object *BKE_object_copy_ex(struct Main *bmain, struct Object *ob, bool copy_caches);
 struct Object *BKE_object_copy(struct Main *bmain, struct Object *ob);
 void BKE_object_make_local(struct Main *bmain, struct Object *ob, const bool lib_local);
+void BKE_object_make_local_ex(struct Main *bmain, struct Object *ob, const bool lib_local, const bool clear_proxy);
 bool BKE_object_is_libdata(struct Object *ob);
 bool BKE_object_obdata_is_libdata(struct Object *ob);
 
diff --git a/source/blender/blenkernel/intern/library.c b/source/blender/blenkernel/intern/library.c
index c66ac31982c..a687d16a910 100644
--- a/source/blender/blenkernel/intern/library.c
+++ b/source/blender/blenkernel/intern/library.c
@@ -1592,15 +1592,6 @@ void id_clear_lib_data_ex(Main *bmain, ID *id, const bool id_in_mainlist)
 	if ((key = BKE_key_from_id(id))) {
 		id_clear_lib_data_ex(bmain, &key->id, id_in_mainlist);  /* sigh, why are keys in Main? */
 	}
-
-	if (GS(id->name) == ID_OB) {
-		Object *object = (Object *)id;
-		if (object->proxy_from != NULL) {
-			object->proxy_from->proxy = NULL;
-			object->proxy_from->proxy_group = NULL;
-		}
-		object->proxy = object->proxy_from = object->proxy_group = NULL;
-	}
 }
 
 void id_clear_lib_data(Main *bmain, ID *id)
@@ -1675,7 +1666,15 @@ void BKE_library_make_local(Main *bmain, const Library *lib, const bool untagged
 				if (lib == NULL || id->lib == lib) {
 					if (id->lib) {
 						/* In this specific case, we do want to make ID local even if it has no local usage yet... */
-						id_make_local(bmain, id, false, true);
+						if (GS(id->name) == ID_OB) {
+							/* Special case for objects because we don't want proxy pointers to be
+							 * cleared yet. This will happen down the road in this function.
+							 */
+							BKE_object_make_local_ex(bmain, (Object*)id, true, false);
+						}
+						else {
+							id_make_local(bmain, id, false, true);
+						}
 					}
 					else {
 						id->tag &= ~(LIB_TAG_EXTERN | LIB_TAG_INDIRECT | LIB_TAG_NEW);
@@ -1715,36 +1714,6 @@ void BKE_library_make_local(Main *bmain, const Library *lib, const bool untagged
 					bool is_local = false, is_lib = false;
 
 					BKE_library_ID_test_usages(bmain, id, &is_local, &is_lib);
-
-					/* Attempt to re-link appended proxy objects. This allows appending of an entire scene
-					 * from another blend file into this one, even when that blend file contains proxified
-					 * armatures. Since the proxified object needs to be linked (not local), this will
-					 * only work when the "Localize all" checkbox is disabled.
-					 * TL;DR: this is a dirty hack on top of an already weak feature (proxies). */
-					if (GS(id->name) == ID_OB && ((Object *)id)->proxy != NULL) {
-						Object *ob = (Object *)id;
-						Object *ob_new = (Object *)id->newid;
-
-						/* Proxies only work when the proxified object is linked-in from a library. */
-						if (ob->proxy->id.lib == NULL) {
-							printf("Warning, proxy object %s will loose its link to %s, because the "
-							       "proxified object is local.\n", id->newid->name, ob->proxy->id.name);
-						}
-						/* We can only switch the proxy'ing to a made-local proxy if it is no longer
-						 * referred to from a library. Not checking for local use; if new local proxy
-						 * was not used locally would be a nasty bug! */
-						else if (is_local || is_lib) {
-							printf("Warning, made-local proxy object %s will loose its link to %s, "
-							       "because the linked-in proxy is referenced (is_local=%i, is_lib=%i).\n",
-							       id->newid->name, ob->proxy->id.name, is_local, is_lib);
-						}
-						else {
-							/* we can switch the proxy'ing from the linked-in to the made-local proxy. */
-							BKE_object_make_proxy(ob_new, ob->proxy, ob->proxy_group);
-							ob->proxy = ob->proxy_from = ob->proxy_group = NULL;
-						}
-					}
-
 					if (!is_local && !is_lib) {
 						BKE_libblock_free(bmain, id);
 						do_loop = true;
diff --git a/source/blender/blenkernel/intern/object.c b/source/blender/blenkernel/intern/object.c
index a059055a49e..5bcf31ba45b 100644
--- a/source/blender/blenkernel/intern/object.c
+++ b/source/blender/blenkernel/intern/object.c
@@ -1181,7 +1181,7 @@ Object *BKE_object_copy(Main *bmain, Object *ob)
 	return BKE_object_copy_ex(bmain, ob, false);
 }
 
-void BKE_object_make_local(Main *bmain, Object *ob, const bool lib_local)
+void BKE_object_make_local_ex(Main *bmain, Object *ob, const bool lib_local, const bool clear_proxy)
 {
 	bool is_local = false, is_lib = false;
 
@@ -1201,6 +1201,13 @@ void BKE_object_make_local(Main *bmain, Object *ob, const bool lib_local)
 		if (!is_lib) {
 			id_clear_lib_data(bmain, &ob->id);
 			BKE_id_expand_local(&ob->id);
+			if (clear_proxy) {
+				if (ob->proxy_from != NULL) {
+					ob->proxy_from->proxy = NULL;
+					ob->proxy_from->proxy_group = NULL;
+				}
+				ob->proxy = ob->proxy_from = ob->proxy_group = NULL;
+			}
 		}
 		else {
 			Object *ob_new = BKE_object_copy(bmain, ob);
@@ -1215,6 +1222,11 @@ void BKE_object_make_local(Main *bmain, Object *ob, const bool lib_local)
 	}
 }
 
+void BKE_object_make_local(Main *bmain, Object *ob, const bool lib_local)
+{
+	BKE_object_make_local_ex(bmain, ob, lib_local, true);
+}
+
 /* Returns true if the Object is from an external blend file (libdata) */
 bool BKE_object_is_libdata(Object *ob)
 {

From 6917de6919408f5ceecf578b884fa207fee67262 Mon Sep 17 00:00:00 2001
From: Joshua Leung <aligorith@gmail.com>
Date: Fri, 14 Oct 2016 01:18:53 +1300
Subject: [PATCH 23/27] Fix: Grease Pencil palettes were missing a RNA path
 callback

---
 source/blender/makesrna/intern/rna_gpencil.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/source/blender/makesrna/intern/rna_gpencil.c b/source/blender/makesrna/intern/rna_gpencil.c
index 9b881c13347..7ba89538b18 100644
--- a/source/blender/makesrna/intern/rna_gpencil.c
+++ b/source/blender/makesrna/intern/rna_gpencil.c
@@ -783,6 +783,16 @@ static void rna_GPencilPalette_info_set(PointerRNA *ptr, const char *value)
 	               sizeof(palette->info));
 }
 
+static char *rna_GPencilPalette_path(PointerRNA *ptr)
+{
+	bGPDpalette *palette = ptr->data;
+	char name_esc[sizeof(palette->info) * 2];
+	
+	BLI_strescape(name_esc, palette->info, sizeof(name_esc));
+	
+	return BLI_sprintfN("palettes[\"%s\"]", name_esc);
+}
+
 static char *rna_GPencilPalette_color_path(PointerRNA *ptr)
 {
 	bGPdata *gpd = ptr->id.data;
@@ -1510,6 +1520,7 @@ static void rna_def_gpencil_palette(BlenderRNA *brna)
 	srna = RNA_def_struct(brna, "GPencilPalette", NULL);
 	RNA_def_struct_sdna(srna, "bGPDpalette");
 	RNA_def_struct_ui_text(srna, "Grease Pencil Palette", "Collection of related palettes");
+	RNA_def_struct_path_func(srna, "rna_GPencilPalette_path");
 	RNA_def_struct_ui_icon(srna, ICON_COLOR);
 
 	/* Name */

From 6027550e83879ec9ff4de3cb5bf43db54594771b Mon Sep 17 00:00:00 2001
From: Bastien Montagne <montagne29@wanadoo.fr>
Date: Thu, 13 Oct 2016 16:20:29 +0200
Subject: [PATCH 24/27] Usual UI/i18n tweaks & fixes.

---
 release/scripts/modules/bl_i18n_utils/utils_spell_check.py | 2 ++
 source/blender/makesrna/intern/rna_smoke.c                 | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/release/scripts/modules/bl_i18n_utils/utils_spell_check.py b/release/scripts/modules/bl_i18n_utils/utils_spell_check.py
index fa42778b53f..f749cf82bb9 100644
--- a/release/scripts/modules/bl_i18n_utils/utils_spell_check.py
+++ b/release/scripts/modules/bl_i18n_utils/utils_spell_check.py
@@ -163,6 +163,7 @@ class SpellChecker:
         "runtime",
         "scanline",
         "screencast", "screenshot", "screenshots",
+        "seekability",
         "selfcollision",
         "shadowbuffer", "shadowbuffers",
         "singletexture",
@@ -184,6 +185,7 @@ class SpellChecker:
         "timestamp", "timestamps",
         "timestep", "timesteps",
         "todo",
+        "tradeoff",
         "un",
         "unbake",
         "uncomment",
diff --git a/source/blender/makesrna/intern/rna_smoke.c b/source/blender/makesrna/intern/rna_smoke.c
index 40a45416aca..b4ba306df3f 100644
--- a/source/blender/makesrna/intern/rna_smoke.c
+++ b/source/blender/makesrna/intern/rna_smoke.c
@@ -790,7 +790,7 @@ static void rna_def_smoke_domain_settings(BlenderRNA *brna)
 
 	prop = RNA_def_property(srna, "draw_velocity", PROP_BOOLEAN, PROP_NONE);
 	RNA_def_property_boolean_sdna(prop, NULL, "draw_velocity", 0);
-	RNA_def_property_ui_text(prop, "Draw Velocity", "Toggle visualation of the velocity field as needles");
+	RNA_def_property_ui_text(prop, "Draw Velocity", "Toggle visualization of the velocity field as needles");
 	RNA_def_property_update(prop, NC_OBJECT | ND_DRAW, NULL);
 
 	prop = RNA_def_property(srna, "vector_draw_type", PROP_ENUM, PROP_NONE);

From 78817ae95c8834588bf2acf64591ed9216fb7b97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sybren=20A=2E=20St=C3=BCvel?= <sybren@stuvel.eu>
Date: Thu, 13 Oct 2016 17:06:24 +0200
Subject: [PATCH 25/27] Prevent problems when appending scene with referenced
 proxy

Such a "referenced proxy" could be a proxy that is used in a constraint on
another object. Brings back part of 1cdc54dc7db85766 but without the
memory leak.
---
 source/blender/blenkernel/intern/library.c | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/source/blender/blenkernel/intern/library.c b/source/blender/blenkernel/intern/library.c
index a687d16a910..a2802d2b5d8 100644
--- a/source/blender/blenkernel/intern/library.c
+++ b/source/blender/blenkernel/intern/library.c
@@ -1713,6 +1713,40 @@ void BKE_library_make_local(Main *bmain, const Library *lib, const bool untagged
 				if (id->newid) {
 					bool is_local = false, is_lib = false;
 
+					/* Attempt to re-link copied proxy objects. This allows appending of an entire scene
+					 * from another blend file into this one, even when that blend file contains proxified
+					 * armatures that have local references. Since the proxified object needs to be linked
+					 * (not local), this will only work when the "Localize all" checkbox is disabled.
+					 * TL;DR: this is a dirty hack on top of an already weak feature (proxies). */
+					if (GS(id->name) == ID_OB && ((Object *)id)->proxy != NULL) {
+						Object *ob = (Object *)id;
+						Object *ob_new = (Object *)id->newid;
+
+						/* Proxies only work when the proxified object is linked-in from a library. */
+						if (ob->proxy->id.lib == NULL) {
+							printf("Warning, proxy object %s will loose its link to %s, because the "
+							       "proxified object is local.\n", id->newid->name, ob->proxy->id.name);
+						}
+						/* We can only switch the proxy'ing to a made-local proxy if it is no longer
+						 * referred to from a library. Not checking for local use; if new local proxy
+						 * was not used locally would be a nasty bug! */
+						else if (is_local || is_lib) {
+							printf("Warning, made-local proxy object %s will loose its link to %s, "
+							       "because the linked-in proxy is referenced (is_local=%i, is_lib=%i).\n",
+							       id->newid->name, ob->proxy->id.name, is_local, is_lib);
+						}
+						else {
+							/* we can switch the proxy'ing from the linked-in to the made-local proxy.
+							 * BKE_object_make_proxy() shouldn't be used here, as it allocates memory that
+							 * was already allocated by BKE_object_make_local_ex() (which called BKE_object_copy_ex). */
+							ob_new->proxy = ob->proxy;
+							ob_new->proxy_group = ob->proxy_group;
+							ob_new->proxy_from = ob->proxy_from;
+							ob_new->proxy->proxy_from = ob_new;
+							ob->proxy = ob->proxy_from = ob->proxy_group = NULL;
+						}
+					}
+
 					BKE_library_ID_test_usages(bmain, id, &is_local, &is_lib);
 					if (!is_local && !is_lib) {
 						BKE_libblock_free(bmain, id);

From 02a1f15416494542896384236ef94e8ecbb42743 Mon Sep 17 00:00:00 2001
From: Aaron Carlisle <carlisle.b3d@gmail.com>
Date: Thu, 13 Oct 2016 17:53:30 +0200
Subject: [PATCH 26/27] Fix OLD pre-git links in the API, add missing
 factory-startup option to blender executions.

Reviewers: mont29

Reviewed By: mont29

Tags: #bf_blender, #python, #infrastructure:_websites

Differential Revision: https://developer.blender.org/D2290
---
 doc/python_api/sphinx_changelog_gen.py |  4 ++--
 doc/python_api/sphinx_doc_gen.py       | 28 ++++++++++++--------------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/doc/python_api/sphinx_changelog_gen.py b/doc/python_api/sphinx_changelog_gen.py
index 4cbb418e326..8758590dbae 100644
--- a/doc/python_api/sphinx_changelog_gen.py
+++ b/doc/python_api/sphinx_changelog_gen.py
@@ -27,7 +27,7 @@ output from this tool should be added into "doc/python_api/rst/change_log.rst"
 blender --background --python doc/python_api/sphinx_changelog_gen.py -- --dump
 
 # create changelog
-blender --background --python doc/python_api/sphinx_changelog_gen.py -- \
+blender --background --factory-startup --python doc/python_api/sphinx_changelog_gen.py -- \
         --api_from blender_2_63_0.py \
         --api_to   blender_2_64_0.py \
         --api_out changes.rst
@@ -331,7 +331,7 @@ def main():
 
     # When --help or no args are given, print this help
     usage_text = "Run blender in background mode with this script: "
-    "blender --background --python %s -- [options]" % os.path.basename(__file__)
+    "blender --background --factory-startup --python %s -- [options]" % os.path.basename(__file__)
 
     epilog = "Run this before releases"
 
diff --git a/doc/python_api/sphinx_doc_gen.py b/doc/python_api/sphinx_doc_gen.py
index 4a109a44ec0..2fbbd16a461 100644
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -26,16 +26,16 @@ API dump in RST files
 ---------------------
   Run this script from Blender's root path once you have compiled Blender
 
-    ./blender.bin --background -noaudio --python doc/python_api/sphinx_doc_gen.py
+    blender --background --factory-startup -noaudio --python doc/python_api/sphinx_doc_gen.py
 
   This will generate python files in doc/python_api/sphinx-in/
-  providing ./blender.bin is or links to the blender executable
+  providing ./blender is or links to the blender executable
 
   To choose sphinx-in directory:
-    ./blender.bin --background --python doc/python_api/sphinx_doc_gen.py -- --output ../python_api
+    blender --background --factory-startup --python doc/python_api/sphinx_doc_gen.py -- --output ../python_api
 
   For quick builds:
-    ./blender.bin --background --python doc/python_api/sphinx_doc_gen.py -- --partial bmesh.*
+    blender --background --factory-startup --python doc/python_api/sphinx_doc_gen.py -- --partial bmesh.*
 
 
 Sphinx: HTML generation
@@ -46,8 +46,6 @@ Sphinx: HTML generation
     cd doc/python_api
     sphinx-build sphinx-in sphinx-out
 
-  This requires sphinx 1.0.7 to be installed.
-
 
 Sphinx: PDF generation
 ----------------------
@@ -68,7 +66,7 @@ except ImportError:
     import sys
     sys.exit()
 
-import rna_info     # Blender module
+import rna_info  # Blender module
 
 
 def rna_info_BuildRNAInfo_cache():
@@ -86,7 +84,7 @@ import shutil
 import logging
 
 from platform import platform
-PLATFORM = platform().split('-')[0].lower()    # 'linux', 'darwin', 'windows'
+PLATFORM = platform().split('-')[0].lower()  # 'linux', 'darwin', 'windows'
 
 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
 
@@ -208,12 +206,12 @@ BPY_LOGGER.setLevel(logging.DEBUG)
 """
 # for quick rebuilds
 rm -rf /b/doc/python_api/sphinx-* && \
-./blender.bin -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py && \
+./blender -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py && \
 sphinx-build doc/python_api/sphinx-in doc/python_api/sphinx-out
 
 or
 
-./blender.bin -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py -- -f -B
+./blender -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py -- -f -B
 """
 
 # Switch for quick testing so doc-builds don't take so long
@@ -420,7 +418,7 @@ MODULE_GROUPING = {
 
 blender_version_strings = [str(v) for v in bpy.app.version]
 
-# converting bytes to strings, due to #30154
+# converting bytes to strings, due to T30154
 BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
 BLENDER_DATE = str(bpy.app.build_date, 'utf_8')
 
@@ -1567,9 +1565,9 @@ def pyrna2sphinx(basepath):
 
     # operators
     def write_ops():
-        API_BASEURL = "http://svn.blender.org/svnroot/bf-blender/trunk/blender/release/scripts"
-        API_BASEURL_ADDON = "http://svn.blender.org/svnroot/bf-extensions/trunk/py/scripts"
-        API_BASEURL_ADDON_CONTRIB = "http://svn.blender.org/svnroot/bf-extensions/contrib/py/scripts"
+        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts/ "
+        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA/"
+        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC/"
 
         op_modules = {}
         for op in ops.values():
@@ -1645,7 +1643,7 @@ def write_sphinx_conf_py(basepath):
 
     if ARGS.sphinx_theme == "blender-org":
         fw("html_theme_path = ['../']\n")
-        # copied with the theme, exclude else we get an error [#28873]
+        # copied with the theme, exclude else we get an error [T28873]
         fw("html_favicon = 'favicon.ico'\n")    # in <theme>/static/
 
     # not helpful since the source is generated, adds to upload size.

From 625b504b23beb11e8a02408e7474e38fa525e8ee Mon Sep 17 00:00:00 2001
From: Sergey Sharybin <sergey.vfx@gmail.com>
Date: Thu, 13 Oct 2016 18:31:53 +0200
Subject: [PATCH 27/27] Fix T49534: 2.78 Wrong texture scaling in material
 viewport

Seems to be a bug in original implementation of a830280: code was always
using tangent space instead of UV map because it had the same name. Now
prefer UVMap over tangent because this is how Cycles works. At least it's
closer to that.

Not sure if the save+reload issue is still relevant after this fix, that
needs to be double-checked.

Thanks @dfelinto for looking into the report and simplifying the case.

Should be included into 2.78a.
---
 source/blender/blenkernel/intern/DerivedMesh.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/blender/blenkernel/intern/DerivedMesh.c b/source/blender/blenkernel/intern/DerivedMesh.c
index 8168817491f..f75b3c0df85 100644
--- a/source/blender/blenkernel/intern/DerivedMesh.c
+++ b/source/blender/blenkernel/intern/DerivedMesh.c
@@ -3675,15 +3675,15 @@ void DM_vertex_attributes_from_gpu(DerivedMesh *dm, GPUVertexAttribs *gattribs,
 			 * We do it based on the specified name.
 			 */
 			if (gattribs->layer[b].name[0]) {
-				layer = CustomData_get_named_layer_index(&dm->loopData, CD_TANGENT, gattribs->layer[b].name);
-				type = CD_TANGENT;
+				layer = CustomData_get_named_layer_index(ldata, CD_MLOOPUV, gattribs->layer[b].name);
+				type = CD_MTFACE;
 				if (layer == -1) {
 					layer = CustomData_get_named_layer_index(ldata, CD_MLOOPCOL, gattribs->layer[b].name);
 					type = CD_MCOL;
 				}
 				if (layer == -1) {
-					layer = CustomData_get_named_layer_index(ldata, CD_MLOOPUV, gattribs->layer[b].name);
-					type = CD_MTFACE;
+					layer = CustomData_get_named_layer_index(&dm->loopData, CD_TANGENT, gattribs->layer[b].name);
+					type = CD_TANGENT;
 				}
 				if (layer == -1) {
 					continue;