Merge branch 'master' into cycles_disney_brdf

2016-10-20 10:41:50 +02:00
parent 243a0e3eb8 2cd6a89d07
commit 4dfcf455f7
192 changed files with 5915 additions and 4774 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -404,7 +404,7 @@ option(WITH_CYCLES_STANDALONE_GUI	"Build Cycles standalone with GUI" OFF)
 option(WITH_CYCLES_OSL				"Build Cycles with OSL support" ${_init_CYCLES_OSL})
 option(WITH_CYCLES_OPENSUBDIV		"Build Cycles with OpenSubdiv support" ${_init_CYCLES_OPENSUBDIV})
 option(WITH_CYCLES_CUDA_BINARIES	"Build Cycles CUDA binaries" OFF)
-set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 CACHE STRING "CUDA architectures to build binaries for")
+set(CYCLES_CUDA_BINARIES_ARCH sm_20 sm_21 sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 CACHE STRING "CUDA architectures to build binaries for")
 mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
 unset(PLATFORM_DEFAULT)
 option(WITH_CYCLES_LOGGING	"Build Cycles with logging support" ON)
@@ -629,9 +629,21 @@ if(APPLE)
 		set(CMAKE_FIND_ROOT_PATH ${CMAKE_OSX_SYSROOT})
 	endif()

-	if(NOT CMAKE_OSX_DEPLOYMENT_TARGET)
-		# 10.6 is our min. target, if you use higher sdk, weak linking happens
-		set(CMAKE_OSX_DEPLOYMENT_TARGET "10.6" CACHE STRING "" FORCE)
+	if(WITH_CXX11)
+		# 10.9 is our min. target, if you use higher sdk, weak linking happens
+		if(CMAKE_OSX_DEPLOYMENT_TARGET)
+			if(${CMAKE_OSX_DEPLOYMENT_TARGET} VERSION_LESS 10.9)
+				message(STATUS "Setting deployment target to 10.9, lower versions are incompatible with WITH_CXX11")
+				set(CMAKE_OSX_DEPLOYMENT_TARGET "10.9" CACHE STRING "" FORCE)
+			endif()
+		else()
+			set(CMAKE_OSX_DEPLOYMENT_TARGET "10.9" CACHE STRING "" FORCE)
+		endif()
+	else()
+		if(NOT CMAKE_OSX_DEPLOYMENT_TARGET)
+			# 10.6 is our min. target, if you use higher sdk, weak linking happens
+			set(CMAKE_OSX_DEPLOYMENT_TARGET "10.6" CACHE STRING "" FORCE)
+		endif()
 	endif()
 	
 	if(NOT ${CMAKE_GENERATOR} MATCHES "Xcode")
@@ -974,7 +986,7 @@ if(SUPPORT_SSE_BUILD)
 	add_definitions(-D__SSE__ -D__MMX__)
 endif()
 if(SUPPORT_SSE2_BUILD)
-	set(PLATFORM_CFLAGS " ${COMPILER_SSE2_FLAG} ${PLATFORM_CFLAGS}")
+	set(PLATFORM_CFLAGS " ${PLATFORM_CFLAGS} ${COMPILER_SSE2_FLAG}")
 	add_definitions(-D__SSE2__)
 	if(NOT SUPPORT_SSE_BUILD) # dont double up
 		add_definitions(-D__MMX__)
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -25,7 +25,8 @@
 ARGS=$( \
 getopt \
 -o s:i:t:h \
--long source:,install:,tmp:,info:,threads:,help,show-deps,no-sudo,no-build,no-confirm,with-all,with-opencollada,\
+--long source:,install:,tmp:,info:,threads:,help,show-deps,no-sudo,no-build,no-confirm,use-cxx11,\
+with-all,with-opencollada,\
 ver-ocio:,ver-oiio:,ver-llvm:,ver-osl:,ver-osd:,ver-openvdb:,\
 force-all,force-python,force-numpy,force-boost,\
 force-ocio,force-openexr,force-oiio,force-llvm,force-osl,force-osd,force-openvdb,\
@@ -103,6 +104,11 @@ ARGUMENTS_INFO="\"COMMAND LINE ARGUMENTS:
    --no-confirm
        Disable any interaction with user (suitable for automated run).

+    --use-cxx11
+        Build all libraries in cpp11 'mode' (will be mandatory soon in blender2.8 branch).
+        NOTE: If your compiler is gcc-6.0 or above, you probably *want* to enable this option (since it's default
+              standard starting from this version).
+
    --with-all
        By default, a number of optional and not-so-often needed libraries are not installed.
        This option will try to install them, at the cost of potential conflicts (depending on
@@ -281,6 +287,7 @@ SUDO="sudo"

 NO_BUILD=false
 NO_CONFIRM=false
+USE_CXX11=false

 PYTHON_VERSION="3.5.1"
 PYTHON_VERSION_MIN="3.5"
@@ -492,6 +499,9 @@ while true; do
    --no-confirm)
      NO_CONFIRM=true; shift; continue
    ;;
+    --use-cxx11)
+      USE_CXX11=true; shift; continue
+    ;;
    --with-all)
      WITH_ALL=true; shift; continue
    ;;
@@ -766,7 +776,18 @@ OPENCOLLADA_REPO_BRANCH="master"

 FFMPEG_SOURCE=( "http://ffmpeg.org/releases/ffmpeg-$FFMPEG_VERSION.tar.bz2" )

+CXXFLAGS_BACK=$CXXFLAGS  # saved so the original CXXFLAGS can be restored at the end of the script
+if [ "$USE_CXX11" = true ]; then  # set by --use-cxx11; keep backticks out of the "..." message (command substitution)
+  WARNING "You are trying to use c++11, this *should* go smoothely with any very recent distribution
+However, if you are experiencing linking errors (also when building Blender itself), please try the following:
+    * Re-run this script with '--build-all --force-all' options.
+    * Ensure your gcc version is at the very least 4.8, if possible you should really rather use gcc-5.1 or above.

+Please note that until the transition to C++11-built libraries if completed in your distribution, situation will
+remain fuzzy and incompatibilities may happen..."
+  CXXFLAGS="$CXXFLAGS -std=c++11"  # append, so any user-provided flags are kept
+  export CXXFLAGS
+fi

 #### Show Dependencies ####

@@ -779,7 +800,7 @@ Those libraries should be available as packages in all recent distributions (opt
    * libjpeg, libpng, libtiff, [libopenjpeg], [libopenal].
    * libx11, libxcursor, libxi, libxrandr, libxinerama (and other libx... as needed).
    * libsqlite3, libbz2, libssl, libfftw3, libxml2, libtinyxml, yasm, libyaml-cpp.
-    * libsdl1.2, libglew, libglewmx.\""
+    * libsdl1.2, libglew, [libglewmx].\""

 DEPS_SPECIFIC_INFO="\"BUILDABLE DEPENDENCIES:

@@ -953,7 +974,7 @@ prepare_opt() {

 # Check whether the current package needs to be recompiled, based on a dummy file containing a magic number in its name...
 magic_compile_check() {
-  if [ -f $INST/.$1-magiccheck-$2 ]; then
+  if [ -f $INST/.$1-magiccheck-$2-$USE_CXX11 ]; then
    return 0
  else
    return 1
@@ -962,7 +983,7 @@ magic_compile_check() {

 magic_compile_set() {
  rm -f $INST/.$1-magiccheck-*
-  touch $INST/.$1-magiccheck-$2
+  touch $INST/.$1-magiccheck-$2-$USE_CXX11
 }

 # Note: should clean nicely in $INST, but not in $SRC, when we switch to a new version of a lib...
@@ -1622,6 +1643,10 @@ compile_OIIO() {
 #    fi
    cmake_d="$cmake_d -D USE_OCIO=OFF"

+    if [ "$USE_CXX11" = true ]; then
+      cmake_d="$cmake_d -D OIIO_BUILD_CPP11=ON"
+    fi
+
    if file /bin/cp | grep -q '32-bit'; then
      cflags="-fPIC -m32 -march=i686"
    else
@@ -2562,8 +2587,9 @@ install_DEB() {
             git libfreetype6-dev libx11-dev flex bison libtbb-dev libxxf86vm-dev \
             libxcursor-dev libxi-dev wget libsqlite3-dev libxrandr-dev libxinerama-dev \
             libbz2-dev libncurses5-dev libssl-dev liblzma-dev libreadline-dev $OPENJPEG_DEV \
-             libopenal-dev libglew-dev libglewmx-dev yasm $THEORA_DEV $VORBIS_DEV $OGG_DEV \
+             libopenal-dev libglew-dev yasm $THEORA_DEV $VORBIS_DEV $OGG_DEV \
             libsdl1.2-dev libfftw3-dev patch bzip2 libxml2-dev libtinyxml-dev libjemalloc-dev"
+             # libglewmx-dev  (broken in deb testing currently...)

  OPENJPEG_USE=true
  VORBIS_USE=true
@@ -4164,6 +4190,12 @@ print_info() {
  _buildargs="$_buildargs -U *OPENCOLORIO* -U *OPENEXR* -U *OPENIMAGEIO* -U *LLVM* -U *CYCLES*"
  _buildargs="$_buildargs -U *OPENSUBDIV* -U *OPENVDB* -U *COLLADA* -U *FFMPEG* -U *ALEMBIC*"

+  if [ "$USE_CXX11" = true ]; then
+    _1="-D WITH_CXX11=ON"
+    PRINT "  $_1"
+    _buildargs="$_buildargs $_1"
+  fi
+
  _1="-D WITH_CODEC_SNDFILE=ON"
  PRINT "  $_1"
  _buildargs="$_buildargs $_1"
@@ -4327,3 +4359,6 @@ PRINT ""
 # Switch back to user language.
 LANG=LANG_BACK
 export LANG
+
+CXXFLAGS=$CXXFLAGS_BACK
+export CXXFLAGS
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -72,8 +72,7 @@ if 'cmake' in builder:
        # Set up OSX architecture
        if builder.endswith('x86_64_10_6_cmake'):
            cmake_extra_options.append('-DCMAKE_OSX_ARCHITECTURES:STRING=x86_64')
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-hack/bin/nvcc')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda8-hack/bin/nvcc')
        cmake_extra_options.append('-DWITH_CODEC_QUICKTIME=OFF')
        cmake_extra_options.append('-DCMAKE_OSX_DEPLOYMENT_TARGET=10.6')
        build_cubins = False
@@ -94,8 +93,7 @@ if 'cmake' in builder:
            elif builder.startswith('win32'):
                bits = 32
                cmake_options.extend(['-G', 'Visual Studio 12 2013'])
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v7.5/bin/nvcc.exe')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')

    elif builder.startswith('linux'):
        tokens = builder.split("_")
@@ -115,8 +113,7 @@ if 'cmake' in builder:
            cuda_chroot_name = 'buildbot_' + deb_name + '_x86_64'
            targets = ['player', 'blender', 'cuda']

-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-7.5/bin/nvcc')
-        cmake_extra_options.append('-DCUDA_NVCC8_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')
+        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')

    cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file))

--- a/build_files/cmake/packaging.cmake
+++ b/build_files/cmake/packaging.cmake
@@ -97,6 +97,8 @@ if(WIN32)
 endif()

 set(CPACK_PACKAGE_EXECUTABLES "blender" "blender")
+set(CPACK_CREATE_DESKTOP_LINKS "blender" "blender")
+
 include(CPack)

 # Target for build_archive.py script, to automatically pass along
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -24,7 +24,11 @@
 # Libraries configuration for Apple.

 if(NOT DEFINED LIBDIR)
-	set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-9.x.universal)
+	if(WITH_CXX11)
+		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin)
+	else()
+		set(LIBDIR ${CMAKE_SOURCE_DIR}/../lib/darwin-9.x.universal)
+	endif()
 else()
 	message(STATUS "Using pre-compiled LIBDIR: ${LIBDIR}")
 endif()
@@ -74,7 +78,7 @@ if(WITH_CODEC_SNDFILE)
 	set(SNDFILE ${LIBDIR}/sndfile)
 	set(SNDFILE_INCLUDE_DIRS ${SNDFILE}/include)
 	set(SNDFILE_LIBRARIES sndfile FLAC ogg vorbis vorbisenc)
-	set(SNDFILE_LIBPATH ${SNDFILE}/lib ${FFMPEG}/lib)  # TODO, deprecate
+	set(SNDFILE_LIBPATH ${SNDFILE}/lib ${LIBDIR}/ffmpeg/lib)  # TODO, deprecate
 endif()

 if(WITH_PYTHON)
@@ -132,7 +136,17 @@ if(WITH_IMAGE_OPENEXR)
 	set(OPENEXR ${LIBDIR}/openexr)
 	set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
 	set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${OPENEXR}/include/OpenEXR)
-	set(OPENEXR_LIBRARIES Iex Half IlmImf Imath IlmThread)
+	if(WITH_CXX11)
+		set(OPENEXR_POSTFIX -2_2)
+	else()
+		set(OPENEXR_POSTFIX)
+	endif()
+	set(OPENEXR_LIBRARIES
+		Iex${OPENEXR_POSTFIX}
+		Half
+		IlmImf${OPENEXR_POSTFIX}
+		Imath${OPENEXR_POSTFIX}
+		IlmThread${OPENEXR_POSTFIX})
 	set(OPENEXR_LIBPATH ${OPENEXR}/lib)
 endif()

@@ -143,9 +157,22 @@ if(WITH_CODEC_FFMPEG)
 		avcodec avdevice avformat avutil
 		mp3lame swscale x264 xvidcore theora theoradec theoraenc vorbis vorbisenc vorbisfile ogg
 	)
+	if(WITH_CXX11)
+		set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} schroedinger orc vpx)
+	endif()
 	set(FFMPEG_LIBPATH ${FFMPEG}/lib)
 endif()

+if(WITH_OPENJPEG OR WITH_CODEC_FFMPEG)
+	# use openjpeg from libdir that is linked into ffmpeg
+	if(WITH_CXX11)
+		set(OPENJPEG ${LIBDIR}/openjpeg)
+		set(WITH_SYSTEM_OPENJPEG ON)
+		set(OPENJPEG_INCLUDE_DIRS ${OPENJPEG}/include)
+		set(OPENJPEG_LIBRARIES ${OPENJPEG}/lib/libopenjpeg.a)
+	endif()
+endif()
+
 find_library(SYSTEMSTUBS_LIBRARY
 	NAMES
 	SystemStubs
@@ -223,7 +250,11 @@ if(WITH_SDL)
 	set(SDL_INCLUDE_DIR ${SDL}/include)
 	set(SDL_LIBRARY SDL2)
 	set(SDL_LIBPATH ${SDL}/lib)
-	set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -lazy_framework ForceFeedback")
+	if(WITH_CXX11)
+		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -framework ForceFeedback")
+	else()
+		set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -lazy_framework ForceFeedback")
+	endif()
 endif()

 set(PNG "${LIBDIR}/png")
@@ -244,22 +275,27 @@ endif()
 if(WITH_BOOST)
 	set(BOOST ${LIBDIR}/boost)
 	set(BOOST_INCLUDE_DIR ${BOOST}/include)
+	if(WITH_CXX11)
+		set(BOOST_POSTFIX)
+	else()
+		set(BOOST_POSTFIX -mt)
+	endif()
 	set(BOOST_LIBRARIES
-		boost_date_time-mt
-		boost_filesystem-mt
-		boost_regex-mt
-		boost_system-mt
-		boost_thread-mt
-		boost_wave-mt
+		boost_date_time${BOOST_POSTFIX}
+		boost_filesystem${BOOST_POSTFIX}
+		boost_regex${BOOST_POSTFIX}
+		boost_system${BOOST_POSTFIX}
+		boost_thread${BOOST_POSTFIX}
+		boost_wave${BOOST_POSTFIX}
 	)
 	if(WITH_INTERNATIONAL)
-		list(APPEND BOOST_LIBRARIES boost_locale-mt)
+		list(APPEND BOOST_LIBRARIES boost_locale${BOOST_POSTFIX})
 	endif()
 	if(WITH_CYCLES_NETWORK)
-		list(APPEND BOOST_LIBRARIES boost_serialization-mt)
+		list(APPEND BOOST_LIBRARIES boost_serialization${BOOST_POSTFIX})
 	endif()
 	if(WITH_OPENVDB)
-		list(APPEND BOOST_LIBRARIES boost_iostreams-mt)
+		list(APPEND BOOST_LIBRARIES boost_iostreams${BOOST_POSTFIX})
 	endif()
 	set(BOOST_LIBPATH ${BOOST}/lib)
 	set(BOOST_DEFINITIONS)
--- a/build_files/utils/build_tgz.sh
+++ b/build_files/utils/build_tgz.sh
@@ -6,10 +6,10 @@
 BASE_DIR="$PWD"

 blender_srcdir=$(dirname -- $0)/../..
-blender_version=$(grep "BLENDER_VERSION\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender.h" | awk '{print $3}')
-blender_version_char=$(grep "BLENDER_VERSION_CHAR\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender.h" | awk '{print $3}')
-blender_version_cycle=$(grep "BLENDER_VERSION_CYCLE\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender.h" | awk '{print $3}')
-blender_subversion=$(grep "BLENDER_SUBVERSION\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender.h" | awk '{print $3}')
+blender_version=$(grep "BLENDER_VERSION\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender_version.h" | awk '{print $3}')
+blender_version_char=$(grep "BLENDER_VERSION_CHAR\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender_version.h" | awk '{print $3}')
+blender_version_cycle=$(grep "BLENDER_VERSION_CYCLE\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender_version.h" | awk '{print $3}')
+blender_subversion=$(grep "BLENDER_SUBVERSION\s" "$blender_srcdir/source/blender/blenkernel/BKE_blender_version.h" | awk '{print $3}')

 if [ "$blender_version_cycle" = "release" ] ; then
 	VERSION=$(expr $blender_version / 100).$(expr $blender_version % 100)$blender_version_char
--- a/doc/python_api/examples/bpy.app.handlers.1.py
+++ b/doc/python_api/examples/bpy.app.handlers.1.py
@@ -4,7 +4,7 @@ Persistent Handler Example

 By default handlers are freed when loading new files, in some cases you may
 wan't the handler stay running across multiple files (when the handler is
-part of an addon for example).
+part of an add-on for example).

 For this the :data:`bpy.app.handlers.persistent` decorator needs to be used.
 """
--- a/doc/python_api/examples/bpy.app.translations.py
+++ b/doc/python_api/examples/bpy.app.translations.py
@@ -5,7 +5,7 @@ Intro
 .. warning::

   Most of this object should only be useful if you actually manipulate i18n stuff from Python.
-   If you are a regular addon, you should only bother about :const:`contexts` member,
+   If you are a regular add-on, you should only bother about :const:`contexts` member,
   and the :func:`register`/:func:`unregister` functions! The :func:`pgettext` family of functions
   should only be used in rare, specific cases (like e.g. complex "composited" UI strings...).

@@ -21,7 +21,7 @@ Intro
 Then, call ``bpy.app.translations.register(__name__, your_dict)`` in your ``register()`` function, and
 ``bpy.app.translations.unregister(__name__)`` in your ``unregister()`` one.

-The ``Manage UI translations`` addon has several functions to help you collect strings to translate, and
+The ``Manage UI translations`` add-on has several functions to help you collect strings to translate, and
 generate the needed python code (the translation dictionary), as well as optional intermediary po files
 if you want some... See
 `How to Translate Blender <http://wiki.blender.org/index.php/Dev:Doc/Process/Translate_Blender>`_ and
--- a/doc/python_api/examples/bpy.types.AddonPreferences.1.py
+++ b/doc/python_api/examples/bpy.types.AddonPreferences.1.py
@@ -1,10 +1,10 @@
 bl_info = {
-    "name": "Example Addon Preferences",
+    "name": "Example Add-on Preferences",
    "author": "Your Name Here",
    "version": (1, 0),
    "blender": (2, 65, 0),
-    "location": "SpaceBar Search -> Addon Preferences Example",
-    "description": "Example Addon",
+    "location": "SpaceBar Search -> Add-on Preferences Example",
+    "description": "Example Add-on",
    "warning": "",
    "wiki_url": "",
    "tracker_url": "",
@@ -18,7 +18,7 @@ from bpy.props import StringProperty, IntProperty, BoolProperty


 class ExampleAddonPreferences(AddonPreferences):
-    # this must match the addon name, use '__package__'
+    # this must match the add-on name, use '__package__'
    # when defining this in a submodule of a python package.
    bl_idname = __name__

@@ -37,7 +37,7 @@ class ExampleAddonPreferences(AddonPreferences):

    def draw(self, context):
        layout = self.layout
-        layout.label(text="This is a preferences view for our addon")
+        layout.label(text="This is a preferences view for our add-on")
        layout.prop(self, "filepath")
        layout.prop(self, "number")
        layout.prop(self, "boolean")
@@ -46,7 +46,7 @@ class ExampleAddonPreferences(AddonPreferences):
 class OBJECT_OT_addon_prefs_example(Operator):
    """Display example preferences"""
    bl_idname = "object.addon_prefs_example"
-    bl_label = "Addon Preferences Example"
+    bl_label = "Add-on Preferences Example"
    bl_options = {'REGISTER', 'UNDO'}

    def execute(self, context):
--- a/doc/python_api/examples/bpy.types.Menu.2.py
+++ b/doc/python_api/examples/bpy.types.Menu.2.py
@@ -2,9 +2,9 @@
 Extending Menus
 +++++++++++++++

-When creating menus for addons you can't reference menus in Blender's default
-scripts.
-Instead, the addon can add menu items to existing menus.
+When creating menus for add-ons you can't reference menus
+in Blender's default scripts.
+Instead, the add-on can add menu items to existing menus.

 The function menu_draw acts like :class:`Menu.draw`.
 """
--- a/doc/python_api/examples/bpy.types.PropertyGroup.py
+++ b/doc/python_api/examples/bpy.types.PropertyGroup.py
@@ -13,7 +13,7 @@ be animated, accessed from the user interface and from python.
   definitions are not, this means whenever you load blender the class needs
   to be registered too.

-   This is best done by creating an addon which loads on startup and registers
+   This is best done by creating an add-on which loads on startup and registers
   your properties.

 .. note::
--- a/doc/python_api/rst/info_overview.rst
+++ b/doc/python_api/rst/info_overview.rst
@@ -77,22 +77,22 @@ To run as modules:
 - The obvious way, ``import some_module`` command from the text window or interactive console.
 - Open as a text block and tick "Register" option, this will load with the blend file.
 - copy into one of the directories ``scripts/startup``, where they will be automatically imported on startup.
- define as an addon, enabling the addon will load it as a Python module.
+- define as an add-on, enabling the add-on will load it as a Python module.


-Addons
+Add-ons
 ------

 Some of Blenders functionality is best kept optional,
-alongside scripts loaded at startup we have addons which are kept in their own directory ``scripts/addons``,
+alongside scripts loaded at startup we have add-ons which are kept in their own directory ``scripts/addons``,
 and only load on startup if selected from the user preferences.

-The only difference between addons and built-in Python modules is that addons must contain a ``bl_info``
+The only difference between add-ons and built-in Python modules is that add-ons must contain a ``bl_info``
 variable which Blender uses to read metadata such as name, author, category and URL.

-The user preferences addon listing uses **bl_info** to display information about each addon.
+The User Preferences add-on listing uses **bl_info** to display information about each add-on.

-`See Addons <http://wiki.blender.org/index.php/Dev:2.5/Py/Scripts/Guidelines/Addons>`__
+`See Add-ons <http://wiki.blender.org/index.php/Dev:2.5/Py/Scripts/Guidelines/Addons>`__
 for details on the ``bl_info`` dictionary.


@@ -223,7 +223,7 @@ These functions usually appear at the bottom of the script containing class regi
 You can also use them for internal purposes setting up data for your own tools but take care
 since register won't re-run when a new blend file is loaded.

-The register/unregister calls are used so it's possible to toggle addons and reload scripts while Blender runs.
+The register/unregister calls are used so it's possible to toggle add-ons and reload scripts while Blender runs.
 If the register calls were placed in the body of the script, registration would be called on import,
 meaning there would be no distinction between importing a module or loading its classes into Blender.

--- a/doc/python_api/rst/info_tutorial_addon.rst
+++ b/doc/python_api/rst/info_tutorial_addon.rst
@@ -1,6 +1,6 @@

-Addon Tutorial
-##############
+Add-on Tutorial
+###############

 ************
 Introduction
@@ -36,6 +36,7 @@ Suggested reading before starting this tutorial.
 To best troubleshoot any error message Python prints while writing scripts you run blender with from a terminal,
 see :ref:`Use The Terminal <use_the_terminal>`.

+
 Documentation Links
 ===================

@@ -46,51 +47,48 @@ While going through the tutorial you may want to look into our reference documen
 - :mod:`bpy.context` api reference. -
  *Handy to have a list of available items your script may operate on.*
 - :class:`bpy.types.Operator`. -
-  *The following addons define operators, these docs give details and more examples of operators.*
+  *The following add-ons define operators, these docs give details and more examples of operators.*


-******
-Addons
-******
+*******
+Add-ons
+*******

+What is an Add-on?
+==================

-What is an Addon?
-=================
-
-An addon is simply a Python module with some additional requirements so Blender can display it in a list with useful
+An add-on is simply a Python module with some additional requirements so Blender can display it in a list with useful
 information.

-To give an example, here is the simplest possible addon.
-
+To give an example, here is the simplest possible add-on.

 .. code-block:: python

-   bl_info = {"name": "My Test Addon", "category": "Object"}
+   bl_info = {"name": "My Test Add-on", "category": "Object"}
   def register():
       print("Hello World")
   def unregister():
       print("Goodbye World")


- ``bl_info`` is a dictionary containing addon meta-data such as the title, version and author to be displayed in the
-  user preferences addon list.
- ``register`` is a function which only runs when enabling the addon, this means the module can be loaded without
-  activating the addon.
- ``unregister`` is a function to unload anything setup by ``register``, this is called when the addon is disabled.
+- ``bl_info`` is a dictionary containing add-on metadata such as the title,
+  version and author to be displayed in the user preferences add-on list.
+- ``register`` is a function which only runs when enabling the add-on,
+  this means the module can be loaded without activating the add-on.
+- ``unregister`` is a function to unload anything setup by ``register``, this is called when the add-on is disabled.


+Notice this add-on does not do anything related to Blender (the :mod:`bpy` module is not imported, for example).

-Notice this addon does not do anything related to Blender, (the :mod:`bpy` module is not imported for example).
+This is a contrived example of an add-on that serves to illustrate the point
+that the base requirements of an add-on are simple.

-This is a contrived example of an addon that serves to illustrate the point
-that the base requirements of an addon are simple.
-
-An addon will typically register operators, panels, menu items etc, but its worth noting that _any_ script can do this,
+An add-on will typically register operators, panels, menu items etc, but it's worth noting that *any* script can do this,
 when executed from the text editor or even the interactive console - there is nothing inherently different about an
-addon that allows it to integrate with Blender, such functionality is just provided by the :mod:`bpy` module for any
+add-on that allows it to integrate with Blender, such functionality is just provided by the :mod:`bpy` module for any
 script to access.

-So an addon is just a way to encapsulate a Python module in a way a user can easily utilize.
+So an add-on is just a way to encapsulate a Python module in a way a user can easily utilize.

 .. note::

@@ -99,14 +97,14 @@ So an addon is just a way to encapsulate a Python module in a way a user can eas
   Messages will be printed when enabling and disabling.


-Your First Addon
-================
+Your First Add-on
+=================

-The simplest possible addon above was useful as an example but not much else.
-This next addon is simple but shows how to integrate a script into Blender using an ``Operator``
+The simplest possible add-on above is useful as an example but not much else.
+This next add-on is simple but shows how to integrate a script into Blender using an ``Operator``
 which is the typical way to define a tool accessed from menus, buttons and keyboard shortcuts.

-For the first example we'll make a script that simply moves all objects in a scene.
+For the first example we will make a script that simply moves all objects in a scene.


 Write The Script
@@ -130,13 +128,13 @@ Add the following script to the text editor in Blender.
   :alt: Run Script button

 Click the Run Script button, all objects in the active scene are moved by 1.0 Blender unit.
-Next we'll make this script into an addon.
+Next we will make this script into an add-on.


-Write the Addon (Simple)
------------------------
+Write the Add-on (Simple)
+-------------------------

-This addon takes the body of the script above, and adds them to an operator's ``execute()`` function.
+This add-on takes the body of the script above, and adds it to an operator's ``execute()`` function.


 .. code-block:: python
@@ -173,7 +171,7 @@ This addon takes the body of the script above, and adds them to an operator's ``


   # This allows you to run the script directly from blenders text editor
-   # to test the addon without having to install it.
+   # to test the add-on without having to install it.
   if __name__ == "__main__":
       register()

@@ -206,33 +204,33 @@ Do this by pressing :kbd:`Spacebar` to bring up the operator search dialog and t

 The objects should move as before.

-*Keep this addon open in Blender for the next step - Installing.*
+*Keep this add-on open in Blender for the next step - Installing.*


-Install The Addon
-----------------
+Install The Add-on
+------------------

-Once you have your addon within in Blender's text editor,
+Once you have your add-on within Blender's text editor,
 you will want to be able to install it so it can be enabled in the user preferences to load on startup.

-Even though the addon above is a test, lets go through the steps anyway so you know how to do it for later.
+Even though the add-on above is a test, let's go through the steps anyway so you know how to do it for later.

-To install the Blender text as an addon you will first have to save it to disk, take care to obey the naming
+To install the Blender text as an add-on you will first have to save it to disk, take care to obey the naming
 restrictions that apply to Python modules and end with a ``.py`` extension.

-Once the file is on disk, you can install it as you would for an addon downloaded online.
+Once the file is on disk, you can install it as you would for an add-on downloaded online.

-Open the user :menuselection:`File -> User Preferences`,
-Select the *Addon* section, press *Install Addon...* and select the file. 
+Open :menuselection:`File --> User Preferences`,
+select the *Add-on* section, press *Install Add-on...* and select the file.

-Now the addon will be listed and you can enable it by pressing the check-box,
+Now the add-on will be listed and you can enable it by pressing the check-box,
 if you want it to be enabled on restart, press *Save as Default*.

 .. note::

-   The destination of the addon depends on your Blender configuration.
-   When installing an addon the source and destination path are printed in the console.
-   You can also find addon path locations by running this in the Python console.
+   The destination of the add-on depends on your Blender configuration.
+   When installing an add-on the source and destination path are printed in the console.
+   You can also find add-on path locations by running this in the Python console.

   .. code-block:: python

@@ -243,17 +241,17 @@ if you want it to be enabled on restart, press *Save as Default*.
   `Directory Layout <https://www.blender.org/manual/getting_started/installing_blender/directorylayout.html>`_


-Your Second Addon
-=================
+Your Second Add-on
+==================

-For our second addon, we will focus on object instancing - this is - to make linked copies of an object in a
+For our second add-on, we will focus on object instancing - that is, to make linked copies of an object in a
 similar way to what you may have seen with the array modifier.


 Write The Script
 ----------------

-As before, first we will start with a script, develop it, then convert into an addon.
+As before, first we will start with a script, develop it, then convert into an add-on.

 .. code-block:: python

@@ -324,17 +322,17 @@ allows vectors to be multiplied by numbers and matrices.

 If you are interested in this area, read into :class:`mathutils.Vector` - there are many handy utility functions
 such as getting the angle between vectors, cross product, dot products
-as well as more advanced functions in :mod:`mathutils.geometry` such as bezier spline interpolation and
+as well as more advanced functions in :mod:`mathutils.geometry` such as Bézier Spline interpolation and
 ray-triangle intersection.

-For now we'll focus on making this script an addon, but its good to know that this 3D math module is available and
+For now we will focus on making this script an add-on, but it's good to know that this 3D math module is available and
 can help you with more advanced functionality later on.


-Write the Addon
---------------
+Write the Add-on
+----------------

-The first step is to convert the script as-is into an addon.
+The first step is to convert the script as-is into an add-on.


 .. code-block:: python
@@ -381,7 +379,7 @@ The first step is to convert the script as-is into an addon.
       register()


-Everything here has been covered in the previous steps, you may want to try run the addon still
+Everything here has been covered in the previous steps, you may still want to try running the add-on
 and consider what could be done to make it more useful.


@@ -434,7 +432,7 @@ however the link above includes examples of more advanced property usage.
 Menu Item
 ^^^^^^^^^

-Addons can add to the user interface of existing panels, headers and menus defined in Python.
+Add-ons can add to the user interface of existing panels, headers and menus defined in Python.

 For this example we'll add to an existing menu.

@@ -464,7 +462,7 @@ For docs on extending menus see: :doc:`bpy.types.Menu`.
 Keymap
 ^^^^^^

-In Blender addons have their own key-maps so as not to interfere with Blenders built in key-maps.
+In Blender, add-ons have their own keymaps so as not to interfere with Blender's built-in keymaps.

 In the example below, a new object-mode :class:`bpy.types.KeyMap` is added,
 then a :class:`bpy.types.KeyMapItem` is added to the key-map which references our newly added operator,
@@ -502,7 +500,7 @@ this allows you to have multiple keys accessing the same operator with different

 .. note::

-   While :kbd:`Ctrl-Shift-Space` isn't a default Blender key shortcut, its hard to make sure addons won't
+   While :kbd:`Ctrl-Shift-Space` isn't a default Blender key shortcut, it's hard to make sure add-ons won't
   overwrite each others keymaps, At least take care when assigning keys that they don't
   conflict with important functionality within Blender.

@@ -606,14 +604,14 @@ After selecting it from the menu, you can choose how many instance of the cube y
 .. note::

   Directly executing the script multiple times will add the menu each time too.
-   While not useful behavior, theres nothing to worry about since addons won't register them selves multiple
+   While not useful behavior, there's nothing to worry about since add-ons won't register themselves multiple
   times when enabled through the user preferences.


 Conclusions
 ===========

-Addons can encapsulate certain functionality neatly for writing tools to improve your work-flow or for writing utilities
+Add-ons can encapsulate certain functionality neatly for writing tools to improve your work-flow or for writing utilities
 for others to use.

 While there are limits to what Python can do within Blender, there is certainly a lot that can be achieved without
@@ -636,7 +634,7 @@ Here are some sites you might like to check on after completing this tutorial.
  *For more background details on Blender/Python integration.*
 - `How to Think Like a Computer Scientist <http://interactivepython.org/courselib/static/thinkcspy/index.html>`_ -
  *Great info for those who are still learning Python.*
- `Blender Development (Wiki) <http://wiki.blender.org/index.php/Dev:Contents>`_ -
+- `Blender Development (Wiki) <https://wiki.blender.org/index.php/Dev:Contents>`_ -
  *Blender Development, general information and helpful links.*
 - `Blender Artists (Coding Section) <http://blenderartists.org/forum/forumdisplay.php?47-Coding>`_ -
  *forum where people ask Python development questions*
--- a/doc/python_api/sphinx_changelog_gen.py
+++ b/doc/python_api/sphinx_changelog_gen.py
@@ -27,7 +27,7 @@ output from this tool should be added into "doc/python_api/rst/change_log.rst"
 blender --background --python doc/python_api/sphinx_changelog_gen.py -- --dump

 # create changelog
-blender --background --python doc/python_api/sphinx_changelog_gen.py -- \
+blender --background --factory-startup --python doc/python_api/sphinx_changelog_gen.py -- \
        --api_from blender_2_63_0.py \
        --api_to   blender_2_64_0.py \
        --api_out changes.rst
@@ -331,7 +331,7 @@ def main():

    # When --help or no args are given, print this help
    usage_text = "Run blender in background mode with this script: "
-    "blender --background --python %s -- [options]" % os.path.basename(__file__)
+    "blender --background --factory-startup --python %s -- [options]" % os.path.basename(__file__)

    epilog = "Run this before releases"

--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -26,16 +26,16 @@ API dump in RST files
 ---------------------
  Run this script from Blender's root path once you have compiled Blender

-    ./blender.bin --background -noaudio --python doc/python_api/sphinx_doc_gen.py
+    blender --background --factory-startup -noaudio --python doc/python_api/sphinx_doc_gen.py

  This will generate python files in doc/python_api/sphinx-in/
-  providing ./blender.bin is or links to the blender executable
+  providing ./blender is or links to the blender executable

  To choose sphinx-in directory:
-    ./blender.bin --background --python doc/python_api/sphinx_doc_gen.py -- --output ../python_api
+    blender --background --factory-startup --python doc/python_api/sphinx_doc_gen.py -- --output ../python_api

  For quick builds:
-    ./blender.bin --background --python doc/python_api/sphinx_doc_gen.py -- --partial bmesh.*
+    blender --background --factory-startup --python doc/python_api/sphinx_doc_gen.py -- --partial bmesh.*


 Sphinx: HTML generation
@@ -46,8 +46,6 @@ Sphinx: HTML generation
    cd doc/python_api
    sphinx-build sphinx-in sphinx-out

-  This requires sphinx 1.0.7 to be installed.
-

 Sphinx: PDF generation
 ----------------------
@@ -68,7 +66,7 @@ except ImportError:
    import sys
    sys.exit()

-import rna_info     # Blender module
+import rna_info  # Blender module


 def rna_info_BuildRNAInfo_cache():
@@ -86,7 +84,7 @@ import shutil
 import logging

 from platform import platform
-PLATFORM = platform().split('-')[0].lower()    # 'linux', 'darwin', 'windows'
+PLATFORM = platform().split('-')[0].lower()  # 'linux', 'darwin', 'windows'

 SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))

@@ -208,12 +206,12 @@ BPY_LOGGER.setLevel(logging.DEBUG)
 """
 # for quick rebuilds
 rm -rf /b/doc/python_api/sphinx-* && \
-./blender.bin -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py && \
+./blender -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py && \
 sphinx-build doc/python_api/sphinx-in doc/python_api/sphinx-out

 or

-./blender.bin -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py -- -f -B
+./blender -b -noaudio --factory-startup -P doc/python_api/sphinx_doc_gen.py -- -f -B
 """

 # Switch for quick testing so doc-builds don't take so long
@@ -365,7 +363,7 @@ INFO_DOCS = (
    ("info_overview.rst",
     "Blender/Python API Overview: a more complete explanation of Python integration"),
    ("info_tutorial_addon.rst",
-     "Blender/Python Addon Tutorial: a step by step guide on how to write an addon from scratch"),
+     "Blender/Python Add-on Tutorial: a step by step guide on how to write an add-on from scratch"),
    ("info_api_reference.rst",
     "Blender/Python API Reference Usage: examples of how to use the API reference docs"),
    ("info_best_practice.rst",
@@ -420,7 +418,7 @@ MODULE_GROUPING = {

 blender_version_strings = [str(v) for v in bpy.app.version]

-# converting bytes to strings, due to #30154
+# converting bytes to strings, due to T30154
 BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
 BLENDER_DATE = str(bpy.app.build_date, 'utf_8')

@@ -1567,9 +1565,9 @@ def pyrna2sphinx(basepath):

    # operators
    def write_ops():
-        API_BASEURL = "http://svn.blender.org/svnroot/bf-blender/trunk/blender/release/scripts"
-        API_BASEURL_ADDON = "http://svn.blender.org/svnroot/bf-extensions/trunk/py/scripts"
-        API_BASEURL_ADDON_CONTRIB = "http://svn.blender.org/svnroot/bf-extensions/contrib/py/scripts"
+        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts/ "
+        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA/"
+        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC/"

        op_modules = {}
        for op in ops.values():
@@ -1645,7 +1643,7 @@ def write_sphinx_conf_py(basepath):

    if ARGS.sphinx_theme == "blender-org":
        fw("html_theme_path = ['../']\n")
-        # copied with the theme, exclude else we get an error [#28873]
+        # copied with the theme, exclude else we get an error [T28873]
        fw("html_favicon = 'favicon.ico'\n")    # in <theme>/static/

    # not helpful since the source is generated, adds to upload size.
--- a/extern/cuew/README
+++ b/extern/cuew/README
@@ -4,7 +4,7 @@ for determining which CUDA functions and extensions extensions are supported
 on the target platform.

 CUDA core and extension functionality is exposed in a single header file.
-GUEW has been tested on a variety of operating systems, including Windows,
+CUEW has been tested on a variety of operating systems, including Windows,
 Linux, Mac OS X.

 LICENSE
--- a/extern/cuew/README.blender
+++ b/extern/cuew/README.blender
@@ -1,5 +1,5 @@
 Project: Cuda Wrangler
 URL: https://github.com/CudaWrangler/cuew
 License: Apache 2.0
-Upstream version: e2e0315
+Upstream version: 63d2a0f
 Local modifications: None
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,6 +36,15 @@ set(SRC
 	device_task.cpp
 )

+set(SRC_OPENCL
+	opencl/opencl.h
+
+	opencl/opencl_base.cpp
+	opencl/opencl_mega.cpp
+	opencl/opencl_split.cpp
+	opencl/opencl_util.cpp
+)
+
 if(WITH_CYCLES_NETWORK)
 	list(APPEND SRC
 		device_network.cpp
@@ -67,4 +76,4 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})

-add_library(cycles_device ${SRC} ${SRC_HEADERS})
+add_library(cycles_device ${SRC} ${SRC_OPENCL} ${SRC_HEADERS})
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -993,7 +993,7 @@ public:
 				cuda_assert(cuCtxSynchronize());

 				if(task.get_cancel()) {
-					canceled = false;
+					canceled = true;
 					break;
 				}
 			}
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -0,0 +1,403 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "clew.h"
+
+#include "device.h"
+
+#include "util_map.h"
+#include "util_param.h"
+#include "util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
+ * ray-bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+struct OpenCLPlatformDevice {
+	OpenCLPlatformDevice(cl_platform_id platform_id,
+	                     const string& platform_name,
+	                     cl_device_id device_id,
+	                     cl_device_type device_type,
+	                     const string& device_name)
+	  : platform_id(platform_id),
+	    platform_name(platform_name),
+	    device_id(device_id),
+	    device_type(device_type),
+	    device_name(device_name) {}
+	cl_platform_id platform_id;
+	string platform_name;
+	cl_device_id device_id;
+	cl_device_type device_type;
+	string device_name;
+};
+
+/* Contains all static OpenCL helper functions. */
+class OpenCLInfo
+{
+public:
+	static cl_device_type device_type();
+	static bool use_debug();
+	static bool kernel_use_advanced_shading(const string& platform_name);
+	static bool kernel_use_split(const string& platform_name,
+	                             const cl_device_type device_type);
+	static bool device_supported(const string& platform_name,
+	                             const cl_device_id device_id);
+	static bool platform_version_check(cl_platform_id platform,
+	                                   string *error = NULL);
+	static bool device_version_check(cl_device_id device,
+	                                 string *error = NULL);
+	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
+	                               bool force_all = false);
+};
+
+/* Thread safe cache for contexts and programs.
+ */
+class OpenCLCache
+{
+	struct Slot
+	{
+		struct ProgramEntry
+		{
+			ProgramEntry();
+			ProgramEntry(const ProgramEntry& rhs);
+			~ProgramEntry();
+			cl_program program;
+			thread_mutex *mutex;
+		};
+
+		Slot();
+		Slot(const Slot& rhs);
+		~Slot();
+
+		thread_mutex *context_mutex;
+		cl_context context;
+		typedef map<ustring, ProgramEntry> EntryMap;
+		EntryMap programs;
+
+	};
+
+	/* key is combination of platform ID and device ID */
+	typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
+
+	/* map of Slot objects */
+	typedef map<PlatformDevicePair, Slot> CacheMap;
+	CacheMap cache;
+
+	/* MD5 hash of the kernel source. */
+	string kernel_md5;
+
+	thread_mutex cache_lock;
+	thread_mutex kernel_md5_lock;
+
+	/* lazy instantiate */
+	static OpenCLCache& global_instance();
+
+public:
+
+	enum ProgramName {
+		OCL_DEV_BASE_PROGRAM,
+		OCL_DEV_MEGAKERNEL_PROGRAM,
+	};
+
+	/* Lookup context in the cache. If this returns NULL, slot_locker
+	 * will be holding a lock for the cache. slot_locker should refer to a
+	 * default constructed thread_scoped_lock. */
+	static cl_context get_context(cl_platform_id platform,
+	                              cl_device_id device,
+	                              thread_scoped_lock& slot_locker);
+	/* Same as above. */
+	static cl_program get_program(cl_platform_id platform,
+	                              cl_device_id device,
+	                              ustring key,
+	                              thread_scoped_lock& slot_locker);
+
+	/* Store context in the cache. You MUST have tried to get the item before storing to it. */
+	static void store_context(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_context context,
+	                          thread_scoped_lock& slot_locker);
+	/* Same as above. */
+	static void store_program(cl_platform_id platform,
+	                          cl_device_id device,
+	                          cl_program program,
+	                          ustring key,
+	                          thread_scoped_lock& slot_locker);
+
+	static string get_kernel_md5();
+};
+
+#define opencl_assert(stmt) \
+	{ \
+		cl_int err = stmt; \
+		\
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
+			if(error_msg == "") \
+				error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
+class OpenCLDeviceBase : public Device
+{
+public:
+	DedicatedTaskPool task_pool;
+	cl_context cxContext;
+	cl_command_queue cqCommandQueue;
+	cl_platform_id cpPlatform;
+	cl_device_id cdDevice;
+	cl_int ciErr;
+
+	class OpenCLProgram {
+	public:
+		OpenCLProgram() : loaded(false), device(NULL) {}
+		OpenCLProgram(OpenCLDeviceBase *device,
+		              string program_name,
+		              string kernel_name,
+		              string kernel_build_options,
+		              bool use_stdout = true);
+		~OpenCLProgram();
+
+		void add_kernel(ustring name);
+		void load();
+
+		bool is_loaded()    { return loaded; }
+		string get_log()    { return log; }
+		void report_error();
+
+		cl_kernel operator()();
+		cl_kernel operator()(ustring name);
+
+		void release();
+
+	private:
+		bool build_kernel(const string *debug_src);
+		bool compile_kernel(const string *debug_src);
+		bool load_binary(const string& clbin, const string *debug_src = NULL);
+		bool save_binary(const string& clbin);
+
+		void add_log(string msg, bool is_debug);
+		void add_error(string msg);
+
+		bool loaded;
+		cl_program program;
+		OpenCLDeviceBase *device;
+
+		/* Used for the OpenCLCache key. */
+		string program_name;
+
+		string kernel_file, kernel_build_options, device_md5;
+
+		bool use_stdout;
+		string log, error_msg;
+		string compile_output;
+
+		map<ustring, cl_kernel> kernels;
+	};
+
+	OpenCLProgram base_program;
+
+	typedef map<string, device_vector<uchar>*> ConstMemMap;
+	typedef map<string, device_ptr> MemMap;
+
+	ConstMemMap const_mem_map;
+	MemMap mem_map;
+	device_ptr null_mem;
+
+	bool device_initialized;
+	string platform_name;
+
+	bool opencl_error(cl_int err);
+	void opencl_error(const string& message);
+	void opencl_assert_err(cl_int err, const char* where);
+
+	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_);
+	~OpenCLDeviceBase();
+
+	static void CL_CALLBACK context_notify_callback(const char *err_info,
+		const void * /*private_info*/, size_t /*cb*/, void *user_data);
+
+	bool opencl_version_check();
+
+	string device_md5_hash(string kernel_custom_build_options = "");
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+
+	/* Has to be implemented by the real device classes.
+	 * The base device will then load all these programs. */
+	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLProgram*> &programs) = 0;
+
+	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_copy_to(device_memory& mem);
+	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
+	void mem_zero(device_memory& mem);
+	void mem_free(device_memory& mem);
+	void const_copy_to(const char *name, void *host, size_t size);
+	void tex_alloc(const char *name,
+	               device_memory& mem,
+	               InterpolationType /*interpolation*/,
+	               ExtensionType /*extension*/);
+	void tex_free(device_memory& mem);
+
+	size_t global_size_round_up(int group_size, int global_size);
+	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
+	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+
+	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
+	void shader(DeviceTask& task);
+
+	class OpenCLDeviceTask : public DeviceTask {
+	public:
+		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+		: DeviceTask(task)
+		{
+			run = function_bind(&OpenCLDeviceBase::thread_run,
+			                    device,
+			                    this);
+		}
+	};
+
+	int get_split_task_count(DeviceTask& /*task*/)
+	{
+		return 1;
+	}
+
+	void task_add(DeviceTask& task)
+	{
+		task_pool.push(new OpenCLDeviceTask(this, task));
+	}
+
+	void task_wait()
+	{
+		task_pool.wait();
+	}
+
+	void task_cancel()
+	{
+		task_pool.cancel();
+	}
+
+	virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+	string kernel_build_options(const string *debug_src = NULL);
+
+	class ArgumentWrapper {
+	public:
+		ArgumentWrapper() : size(0), pointer(NULL) {}
+		template <typename T>
+		ArgumentWrapper(T& argument) : size(sizeof(argument)),
+		                               pointer(&argument) { }
+		ArgumentWrapper(int argument) : size(sizeof(int)),
+		                                int_value(argument),
+		                                pointer(&int_value) { }
+		ArgumentWrapper(float argument) : size(sizeof(float)),
+		                                  float_value(argument),
+		                                  pointer(&float_value) { }
+		size_t size;
+		int int_value;
+		float float_value;
+		void *pointer;
+	};
+
+	/* TODO(sergey): In the future we can use variadic templates, once
+	 * C++0x is allowed. Should allow to clean this up a bit.
+	 */
+	int kernel_set_args(cl_kernel kernel,
+	                    int start_argument_index,
+	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
+	                    const ArgumentWrapper& arg33 = ArgumentWrapper());
+
+	void release_kernel_safe(cl_kernel kernel);
+	void release_mem_object_safe(cl_mem mem);
+	void release_program_safe(cl_program program);
+
+	/* ** These methods are for working around some compiler-specific bugs ** */
+
+	virtual cl_program load_cached_kernel(
+	        ustring key,
+	        thread_scoped_lock& cache_locker);
+
+	virtual void store_cached_kernel(
+	        cl_program program,
+	        ustring key,
+	        thread_scoped_lock& cache_locker);
+
+	virtual string build_options_for_base_program(
+	        const DeviceRequestedFeatures& /*requested_features*/);
+};
+
+Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
+Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background);
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -0,0 +1,738 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "kernel_types.h"
+
+#include "util_foreach.h"
+#include "util_logging.h"
+#include "util_md5.h"
+#include "util_path.h"
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool OpenCLDeviceBase::opencl_error(cl_int err)
+{
+	/* Success is the common case: nothing is recorded or printed. */
+	if(err == CL_SUCCESS) {
+		return false;
+	}
+
+	/* The string overload keeps the first error in error_msg and
+	 * prints the message to stderr. */
+	opencl_error(string_printf("OpenCL error (%d): %s", err, clewErrorString(err)));
+	return true;
+}
+
+void OpenCLDeviceBase::opencl_error(const string& message)
+{
+	if(error_msg == "")  /* only the first reported error is stored */
+		error_msg = message;
+	fprintf(stderr, "%s\n", message.c_str());  /* every error is printed */
+}
+
+void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
+{
+	if(err == CL_SUCCESS) {
+		return;
+	}
+	/* Record and print the failure together with the call site name. */
+	opencl_error(string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where));
+#ifndef NDEBUG
+	/* Builds without NDEBUG stop at the first failing call. */
+	abort();
+#endif
+}
+
+OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
+: Device(info, stats, background_)
+{
+	cpPlatform = NULL;
+	cdDevice = NULL;
+	cxContext = NULL;
+	cqCommandQueue = NULL;
+	null_mem = 0;
+	device_initialized = false;
+
+	vector<OpenCLPlatformDevice> usable_devices;
+	OpenCLInfo::get_usable_devices(&usable_devices);
+	if(usable_devices.size() == 0) {
+		opencl_error("OpenCL: no devices found.");
+		return;
+	}
+	/* Runtime check: an assert() alone is compiled out when NDEBUG is set. */
+	if((size_t)info.num >= usable_devices.size()) {
+		opencl_error("OpenCL: device index out of range.");
+		return;
+	}
+	OpenCLPlatformDevice& platform_device = usable_devices[info.num];
+	cpPlatform = platform_device.platform_id;
+	cdDevice = platform_device.device_id;
+	platform_name = platform_device.platform_name;
+	VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name
+	        << ", device " << platform_device.device_name << ".";
+
+	{
+		/* try to use cached context */
+		thread_scoped_lock cache_locker;
+		cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
+
+		if(cxContext == NULL) {
+			/* create context properties array to specify platform */
+			const cl_context_properties context_props[] = {
+				CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
+				0, 0
+			};
+			/* create context */
+			cxContext = clCreateContext(context_props, 1, &cdDevice,
+				context_notify_callback, cdDevice, &ciErr);
+			if(opencl_error(ciErr)) {
+				opencl_error("OpenCL: clCreateContext failed");
+				return;
+			}
+			/* cache it */
+			OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
+		}
+	}
+
+	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
+	if(opencl_error(ciErr))
+		return;
+
+	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
+	if(opencl_error(ciErr))
+		return;
+
+	fprintf(stderr, "Device init success\n");
+	device_initialized = true;
+}
+
+OpenCLDeviceBase::~OpenCLDeviceBase()
+{
+	/* Stop the task pool before any of the resources below are released. */
+	task_pool.stop();
+
+	if(null_mem)
+		clReleaseMemObject(CL_MEM_PTR(null_mem));
+
+	/* Free the device side of every constant buffer, then the host object. */
+	foreach(ConstMemMap::value_type& entry, const_mem_map) {
+		mem_free(*entry.second);
+		delete entry.second;
+	}
+	base_program.release();
+	if(cqCommandQueue != NULL)
+		clReleaseCommandQueue(cqCommandQueue);
+	if(cxContext != NULL)
+		clReleaseContext(cxContext);
+}
+
+void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
+	const void * /*private_info*/, size_t /*cb*/, void *user_data)
+{
+	/* Start from an empty name: a failed query may never write the buffer. */
+	char name[256] = "";
+	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+}
+
+bool OpenCLDeviceBase::opencl_version_check()
+{
+	/* The platform is checked first, then the device. Evaluation stops at
+	 * the first failing check, whose message is then reported. */
+	string error;
+	const bool version_ok =
+	        OpenCLInfo::platform_version_check(cpPlatform, &error) &&
+	        OpenCLInfo::device_version_check(cdDevice, &error);
+	if(!version_ok) {
+		opencl_error(error);
+	}
+	return version_ok;
+}
+
+string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
+{
+	MD5Hash md5;
+	/* Empty defaults: strlen() below must not read indeterminate bytes if a query fails. */
+	char version[256] = "", driver[256] = "", name[256] = "", vendor[256] = "";
+	clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
+	clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+	clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+	clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
+
+	md5.append((uint8_t*)vendor, strlen(vendor));
+	md5.append((uint8_t*)version, strlen(version));
+	md5.append((uint8_t*)name, strlen(name));
+	md5.append((uint8_t*)driver, strlen(driver));
+
+	string options = kernel_build_options();
+	options += kernel_custom_build_options;
+	md5.append((uint8_t*)options.c_str(), options.size());
+
+	return md5.get_hex();
+}
+
+bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+	/* Verify if device was initialized. */
+	if(!device_initialized) {
+		fprintf(stderr, "OpenCL: failed to initialize device.\n");
+		return false;
+	}
+
+	/* Verify we have right opencl version. */
+	if(!opencl_version_check())
+		return false;
+
+	base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
+	base_program.add_kernel(ustring("convert_to_byte"));
+	base_program.add_kernel(ustring("convert_to_half_float"));
+	base_program.add_kernel(ustring("shader"));
+	base_program.add_kernel(ustring("bake"));
+
+	vector<OpenCLProgram*> programs;
+	programs.push_back(&base_program);
+	/* Call actual class to fill the vector with its programs. */
+	load_kernels(requested_features, programs);
+
+	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
+	 * serialize the calls internally, so it's not much use right now.
+	 * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
+	 * should be set to false as well. */
+#if 0
+	TaskPool task_pool;
+	foreach(OpenCLProgram *program, programs) {
+		task_pool.push(function_bind(&OpenCLProgram::load, program));
+	}
+	task_pool.wait_work();
+	foreach(OpenCLProgram *program, programs) {
+		VLOG(2) << program->get_log();
+		if(!program->is_loaded()) {
+			program->report_error();
+			return false;
+		}
+	}
+#else
+	foreach(OpenCLProgram *program, programs) {
+		program->load();
+		if(!program->is_loaded())  /* a failed load must not be reported as success */
+			return false;
+	}
+#endif
+	return true;
+}
+
+void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+{
+	size_t size = mem.memory_size();
+
+	cl_mem_flags mem_flag;
+	void *mem_ptr = NULL;  /* no host pointer is handed to clCreateBuffer */
+
+	if(type == MEM_READ_ONLY)
+		mem_flag = CL_MEM_READ_ONLY;
+	else if(type == MEM_WRITE_ONLY)
+		mem_flag = CL_MEM_WRITE_ONLY;
+	else
+		mem_flag = CL_MEM_READ_WRITE;
+
+	/* Zero-size allocation might be invoked by render, but not really
+	 * supported by OpenCL. Using NULL as device pointer also doesn't really
+	 * work for some reason, so for the time being we'll use special case
+	 * with null_mem buffer.
+	 */
+	if(size != 0) {
+		mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
+		                                                mem_flag,
+		                                                size,
+		                                                mem_ptr,
+		                                                &ciErr);
+		opencl_assert_err(ciErr, "clCreateBuffer");
+	}
+	else {
+		mem.device_pointer = null_mem;
+	}
+
+	stats.mem_alloc(size);  /* also recorded for zero-size requests */
+	mem.device_size = size;
+}
+
+void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
+{
+	const size_t size = mem.memory_size();
+	if(size == 0) {
+		/* Nothing to upload for an empty buffer. */
+		return;
+	}
+	/* CL_TRUE makes this a blocking write. */
+	opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+	                                   CL_MEM_PTR(mem.device_pointer),
+	                                   CL_TRUE,
+	                                   0, size,
+	                                   (void*)mem.data_pointer,
+	                                   0, NULL, NULL));
+}
+
+void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+{
+	/* Widen before multiplying so large buffers cannot overflow int arithmetic. */
+	size_t offset = (size_t)elem*y*w;
+	size_t size = (size_t)elem*w*h;
+	assert(size != 0);
+	opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
+	                                  CL_MEM_PTR(mem.device_pointer),
+	                                  CL_TRUE,
+	                                  offset,
+	                                  size,
+	                                  (uchar*)mem.data_pointer + offset,
+	                                  0, NULL, NULL));
+}
+
+void OpenCLDeviceBase::mem_zero(device_memory& mem)
+{
+	if(!mem.device_pointer)
+		return;  /* no device buffer: nothing to clear */
+	memset((void*)mem.data_pointer, 0, mem.memory_size());  /* zero the host copy... */
+	mem_copy_to(mem);  /* ...and upload it over the device buffer */
+}
+
+void OpenCLDeviceBase::mem_free(device_memory& mem)
+{
+	if(!mem.device_pointer) {
+		return;
+	}
+
+	/* The null_mem placeholder is not a per-allocation object,
+	 * so it must not be released here.
+	 */
+	const bool is_null_placeholder = (mem.device_pointer == null_mem);
+	if(!is_null_placeholder) {
+		opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+	}
+	mem.device_pointer = 0;
+
+	/* Keep memory statistics in sync with the released size. */
+	stats.mem_free(mem.device_size);
+	mem.device_size = 0;
+}
+
+void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
+{
+	/* Constant memory is kept in one device_vector per name, which is
+	 * created and allocated on first use and re-filled afterwards.
+	 */
+	device_vector<uchar> *data = NULL;
+	ConstMemMap::iterator found = const_mem_map.find(name);
+
+	if(found != const_mem_map.end()) {
+		data = found->second;
+		data->copy((uchar*)host, size);
+	}
+	else {
+		data = new device_vector<uchar>();
+		data->copy((uchar*)host, size);
+
+		mem_alloc(*data, MEM_READ_ONLY);
+		const_mem_map.insert(ConstMemMap::value_type(name, data));
+	}
+
+	/* Upload the refreshed host copy. */
+	mem_copy_to(*data);
+}
+
+void OpenCLDeviceBase::tex_alloc(const char *name,
+               device_memory& mem,
+               InterpolationType /*interpolation*/,
+               ExtensionType /*extension*/)
+{
+	VLOG(1) << "Texture allocate: " << name << ", "
+	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+	        << string_human_readable_size(mem.memory_size()) << ")";
+
+	/* Textures are plain read-only buffers: allocate and upload. */
+	mem_alloc(mem, MEM_READ_ONLY);
+	mem_copy_to(mem);
+
+	/* Register the buffer under its name so it can be bound as kernel
+	 * argument later on. A name is expected to be registered only once.
+	 */
+	assert(mem_map.count(name) == 0);
+	const MemMap::value_type entry(name, mem.device_pointer);
+	mem_map.insert(entry);
+}
+
+void OpenCLDeviceBase::tex_free(device_memory& mem)
+{
+	if(!mem.device_pointer) {
+		return;
+	}
+
+	/* Drop the first name registered for this device pointer. */
+	for(MemMap::iterator it = mem_map.begin(); it != mem_map.end(); ++it) {
+		if(it->second == mem.device_pointer) {
+			mem_map.erase(it);
+			break;
+		}
+	}
+
+	mem_free(mem);
+}
+
+size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
+{
+	/* Pad global_size up to the next multiple of group_size
+	 * (unchanged when it already is a multiple).
+	 */
+	const int remainder = global_size % group_size;
+	if(remainder == 0) {
+		return global_size;
+	}
+	return global_size + (group_size - remainder);
+}
+
+/* Enqueue a 2D launch of `kernel` covering at least w x h work items.
+ *
+ * The global size is rounded up to a multiple of a work-group shape derived
+ * from the kernel's CL_KERNEL_WORK_GROUP_SIZE and the device's per-dimension
+ * work item limits. The local size is only used for this rounding: NULL is
+ * passed as local work size to clEnqueueNDRangeKernel. The command queue is
+ * flushed afterwards, but this function does not wait for completion.
+ */
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
+{
+	size_t workgroup_size, max_work_items[3];
+
+	clGetKernelWorkGroupInfo(kernel, cdDevice,
+		CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
+	clGetDeviceInfo(cdDevice,
+		CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
+
+	/* Try to divide evenly over 2 dimensions. */
+	size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+	size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+
+	/* Some implementations have max size 1 on 2nd dimension. */
+	if(local_size[1] > max_work_items[1]) {
+		local_size[0] = workgroup_size/max_work_items[1];
+		local_size[1] = max_work_items[1];
+	}
+
+	size_t global_size[2] = {global_size_round_up(local_size[0], w),
+	                         global_size_round_up(local_size[1], h)};
+
+	/* Vertical size of 1 is coming from bake/shade kernels where we should
+	 * not round anything up because otherwise we'll either be doing too
+	 * much work per pixel (if we don't check global ID on Y axis) or will
+	 * be checking for global ID to always have Y of 0.
+	 */
+	if(h == 1) {
+		/* Index the Y dimension explicitly; indexing with `h` only worked
+		 * because its value happens to be 1 in this branch.
+		 */
+		global_size[1] = 1;
+	}
+
+	/* run kernel */
+	opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+	opencl_assert(clFlush(cqCommandQueue));
+}
+
+void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
+{
+	MemMap::iterator found = mem_map.find(name);
+
+	/* Bind the buffer registered under this name. Names without a buffer
+	 * get the null_mem placeholder, as a work around for NULL not working
+	 * even though the spec says otherwise.
+	 */
+	cl_mem ptr = (found != mem_map.end())? CL_MEM_PTR(found->second)
+	                                     : CL_MEM_PTR(null_mem);
+
+	/* The argument index is advanced for the caller. */
+	opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
+}
+
+/* Run the film conversion kernel for the region described by `task`.
+ *
+ * When `rgba_byte` is non-zero the "convert_to_byte" kernel writes into it,
+ * otherwise the "convert_to_half_float" kernel writes into `rgba_half`.
+ * The kernel is enqueued over task.w x task.h work items; this function does
+ * not wait for it to finish.
+ */
+void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
+{
+	/* cast arguments to cl types */
+	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+	cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
+	cl_mem d_buffer = CL_MEM_PTR(buffer);
+	cl_int d_x = task.x;
+	cl_int d_y = task.y;
+	cl_int d_w = task.w;
+	cl_int d_h = task.h;
+	/* Scale passed to the kernel is 1/(sample + 1). */
+	cl_float d_sample_scale = 1.0f/(task.sample + 1);
+	cl_int d_offset = task.offset;
+	cl_int d_stride = task.stride;
+
+
+	cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
+
+	/* Leading arguments: constant data, destination and source buffer. */
+	cl_uint start_arg_index =
+		kernel_set_args(ckFilmConvertKernel,
+		                0,
+		                d_data,
+		                d_rgba,
+		                d_buffer);
+
+	/* One memory argument per KERNEL_TEX entry of kernel_textures.h,
+	 * in the order they are listed there.
+	 */
+#define KERNEL_TEX(type, ttype, name) \
+set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+	/* Trailing scalar arguments. */
+	start_arg_index += kernel_set_args(ckFilmConvertKernel,
+	                                   start_arg_index,
+	                                   d_sample_scale,
+	                                   d_x,
+	                                   d_y,
+	                                   d_w,
+	                                   d_h,
+	                                   d_offset,
+	                                   d_stride);
+
+	enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
+}
+
+/* Run the shader evaluation or bake kernel for `task`.
+ *
+ * Evaluation types >= SHADER_EVAL_BAKE use the "bake" kernel, all others the
+ * "shader" kernel. The two kernels take slightly different argument lists:
+ * only "shader" gets the luma output, only "bake" gets the filter argument.
+ * The kernel is launched once per sample and waited for after each launch.
+ */
+void OpenCLDeviceBase::shader(DeviceTask& task)
+{
+	/* cast arguments to cl types */
+	cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+	cl_mem d_input = CL_MEM_PTR(task.shader_input);
+	cl_mem d_output = CL_MEM_PTR(task.shader_output);
+	cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
+	cl_int d_shader_eval_type = task.shader_eval_type;
+	cl_int d_shader_filter = task.shader_filter;
+	cl_int d_shader_x = task.shader_x;
+	cl_int d_shader_w = task.shader_w;
+	cl_int d_offset = task.offset;
+
+	cl_kernel kernel;
+
+	if(task.shader_eval_type >= SHADER_EVAL_BAKE)
+		kernel = base_program(ustring("bake"));
+	else
+		kernel = base_program(ustring("shader"));
+
+	/* Arguments shared by both kernels. */
+	cl_uint start_arg_index =
+		kernel_set_args(kernel,
+		                0,
+		                d_data,
+		                d_input,
+		                d_output);
+
+	/* Luma output only exists for the "shader" kernel. */
+	if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_output_luma);
+	}
+
+	/* One memory argument per KERNEL_TEX entry of kernel_textures.h. */
+#define KERNEL_TEX(type, ttype, name) \
+	set_kernel_arg_mem(kernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+	start_arg_index += kernel_set_args(kernel,
+	                                   start_arg_index,
+	                                   d_shader_eval_type);
+	/* Filter argument only exists for the "bake" kernel. */
+	if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
+		start_arg_index += kernel_set_args(kernel,
+		                                   start_arg_index,
+		                                   d_shader_filter);
+	}
+	start_arg_index += kernel_set_args(kernel,
+	                                   start_arg_index,
+	                                   d_shader_x,
+	                                   d_shader_w,
+	                                   d_offset);
+
+	for(int sample = 0; sample < task.num_samples; sample++) {
+
+		if(task.get_cancel())
+			break;
+
+		/* The sample index is the last argument; start_arg_index is not
+		 * advanced here, so the same slot is overwritten on every iteration.
+		 */
+		kernel_set_args(kernel, start_arg_index, sample);
+
+		/* Height of 1: see the special case in enqueue_kernel(). */
+		enqueue_kernel(kernel, task.shader_w, 1);
+
+		/* Wait for this sample before reporting progress. */
+		clFinish(cqCommandQueue);
+
+		task.update_progress(NULL);
+	}
+}
+
+/* Compose the compiler options shared by all OpenCL programs of this device.
+ *
+ * Always enables -cl-fast-relaxed-math and adds a platform specific define
+ * based on platform_name. `debug_src` is optional; it is only used on the
+ * Intel platform with OpenCL debugging enabled, where it is passed to the
+ * compiler as the source file for source level debugging.
+ */
+string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
+{
+	string build_options = "-cl-fast-relaxed-math ";
+
+	if(platform_name == "NVIDIA CUDA") {
+		build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
+		                 "-cl-nv-maxrregcount=32 "
+		                 "-cl-nv-verbose ";
+
+		/* Use cl_uint to match the size passed to the query, and start from
+		 * zero so a failing query does not leave indeterminate values in the
+		 * define below.
+		 */
+		cl_uint compute_capability_major = 0, compute_capability_minor = 0;
+		clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+		                sizeof(cl_uint), &compute_capability_major, NULL);
+		clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+		                sizeof(cl_uint), &compute_capability_minor, NULL);
+
+		/* Encoded as major*100 + minor*10. */
+		build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
+		                               compute_capability_major * 100 +
+		                               compute_capability_minor * 10);
+	}
+
+	else if(platform_name == "Apple")
+		build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+
+	else if(platform_name == "AMD Accelerated Parallel Processing")
+		build_options += "-D__KERNEL_OPENCL_AMD__ ";
+
+	else if(platform_name == "Intel(R) OpenCL") {
+		build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
+
+		/* Options for gdb source level kernel debugging.
+		 * this segfaults on linux currently.
+		 */
+		if(OpenCLInfo::use_debug() && debug_src)
+			build_options += "-g -s \"" + *debug_src + "\" ";
+	}
+
+	if(OpenCLInfo::use_debug())
+		build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+
+#ifdef WITH_CYCLES_DEBUG
+	build_options += "-D__KERNEL_DEBUG__ ";
+#endif
+
+	return build_options;
+}
+
+/* Set up to 33 consecutive kernel arguments in one call.
+ *
+ * Arguments are bound to `kernel` at indices start_argument_index,
+ * start_argument_index + 1, ... in the order they are passed. Processing
+ * stops at the first wrapper whose pointer is NULL.
+ * NOTE(review): presumably the declaration gives every argument a default
+ * wrapper with a NULL pointer, which is what terminates the list - confirm
+ * in the header.
+ *
+ * Returns the number of arguments which were set, so callers can advance
+ * their running argument index by the return value.
+ *
+ * TODO(sergey): In the future we can use variadic templates, once
+ * C++0x is allowed. Should allow to clean this up a bit.
+ */
+int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
+                    int start_argument_index,
+                    const ArgumentWrapper& arg1,
+                    const ArgumentWrapper& arg2,
+                    const ArgumentWrapper& arg3,
+                    const ArgumentWrapper& arg4,
+                    const ArgumentWrapper& arg5,
+                    const ArgumentWrapper& arg6,
+                    const ArgumentWrapper& arg7,
+                    const ArgumentWrapper& arg8,
+                    const ArgumentWrapper& arg9,
+                    const ArgumentWrapper& arg10,
+                    const ArgumentWrapper& arg11,
+                    const ArgumentWrapper& arg12,
+                    const ArgumentWrapper& arg13,
+                    const ArgumentWrapper& arg14,
+                    const ArgumentWrapper& arg15,
+                    const ArgumentWrapper& arg16,
+                    const ArgumentWrapper& arg17,
+                    const ArgumentWrapper& arg18,
+                    const ArgumentWrapper& arg19,
+                    const ArgumentWrapper& arg20,
+                    const ArgumentWrapper& arg21,
+                    const ArgumentWrapper& arg22,
+                    const ArgumentWrapper& arg23,
+                    const ArgumentWrapper& arg24,
+                    const ArgumentWrapper& arg25,
+                    const ArgumentWrapper& arg26,
+                    const ArgumentWrapper& arg27,
+                    const ArgumentWrapper& arg28,
+                    const ArgumentWrapper& arg29,
+                    const ArgumentWrapper& arg30,
+                    const ArgumentWrapper& arg31,
+                    const ArgumentWrapper& arg32,
+                    const ArgumentWrapper& arg33)
+{
+	int current_arg_index = 0;
+	/* Bind one argument and advance, or return from the enclosing function
+	 * with the count so far when the wrapper carries no pointer.
+	 */
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+	do { \
+		if(arg.pointer != NULL) { \
+			opencl_assert(clSetKernelArg( \
+				kernel, \
+				start_argument_index + current_arg_index, \
+				arg.size, arg.pointer)); \
+			++current_arg_index; \
+		} \
+		else { \
+			return current_arg_index; \
+		} \
+	} while(false)
+	FAKE_VARARG_HANDLE_ARG(arg1);
+	FAKE_VARARG_HANDLE_ARG(arg2);
+	FAKE_VARARG_HANDLE_ARG(arg3);
+	FAKE_VARARG_HANDLE_ARG(arg4);
+	FAKE_VARARG_HANDLE_ARG(arg5);
+	FAKE_VARARG_HANDLE_ARG(arg6);
+	FAKE_VARARG_HANDLE_ARG(arg7);
+	FAKE_VARARG_HANDLE_ARG(arg8);
+	FAKE_VARARG_HANDLE_ARG(arg9);
+	FAKE_VARARG_HANDLE_ARG(arg10);
+	FAKE_VARARG_HANDLE_ARG(arg11);
+	FAKE_VARARG_HANDLE_ARG(arg12);
+	FAKE_VARARG_HANDLE_ARG(arg13);
+	FAKE_VARARG_HANDLE_ARG(arg14);
+	FAKE_VARARG_HANDLE_ARG(arg15);
+	FAKE_VARARG_HANDLE_ARG(arg16);
+	FAKE_VARARG_HANDLE_ARG(arg17);
+	FAKE_VARARG_HANDLE_ARG(arg18);
+	FAKE_VARARG_HANDLE_ARG(arg19);
+	FAKE_VARARG_HANDLE_ARG(arg20);
+	FAKE_VARARG_HANDLE_ARG(arg21);
+	FAKE_VARARG_HANDLE_ARG(arg22);
+	FAKE_VARARG_HANDLE_ARG(arg23);
+	FAKE_VARARG_HANDLE_ARG(arg24);
+	FAKE_VARARG_HANDLE_ARG(arg25);
+	FAKE_VARARG_HANDLE_ARG(arg26);
+	FAKE_VARARG_HANDLE_ARG(arg27);
+	FAKE_VARARG_HANDLE_ARG(arg28);
+	FAKE_VARARG_HANDLE_ARG(arg29);
+	FAKE_VARARG_HANDLE_ARG(arg30);
+	FAKE_VARARG_HANDLE_ARG(arg31);
+	FAKE_VARARG_HANDLE_ARG(arg32);
+	FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+	/* All 33 slots were used. */
+	return current_arg_index;
+}
+
+void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
+{
+	/* Tolerate handles which were never created. */
+	if(!kernel) {
+		return;
+	}
+	clReleaseKernel(kernel);
+}
+
+void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
+{
+	/* Tolerate handles which were never created. */
+	if(mem == NULL) {
+		return;
+	}
+	clReleaseMemObject(mem);
+}
+
+void OpenCLDeviceBase::release_program_safe(cl_program program)
+{
+	/* Tolerate handles which were never created. */
+	if(!program) {
+		return;
+	}
+	clReleaseProgram(program);
+}
+
+/* ** Those guys are for working around some compiler-specific bugs ** */
+
+cl_program OpenCLDeviceBase::load_cached_kernel(
+        ustring key,
+        thread_scoped_lock& cache_locker)
+{
+	/* Look the program up in the shared cache, using this device's
+	 * platform and device as the slot.
+	 */
+	cl_program cached_program = OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
+	return cached_program;
+}
+
+void OpenCLDeviceBase::store_cached_kernel(
+        cl_program program,
+        ustring key,
+        thread_scoped_lock& cache_locker)
+{
+	/* Publish the program in the shared cache, under this device's
+	 * platform and device slot.
+	 */
+	OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
+}
+
+string OpenCLDeviceBase::build_options_for_base_program(
+        const DeviceRequestedFeatures& /*requested_features*/)
+{
+	/* TODO(sergey): The requested features are ignored for now and all
+	 * features are compiled in, which means the mega kernel does not get
+	 * any feature-based optimizations.
+	 *
+	 * Ideally the kernel is always compiled with as few features enabled
+	 * as possible, to keep performance at its maximum.
+	 */
+	return string();
+}
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "buffers.h"
+
+#include "kernel_types.h"
+
+#include "util_md5.h"
+#include "util_path.h"
+#include "util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OpenCL device which renders with a single "path_trace" kernel.
+ *
+ * Film conversion and shader evaluation are delegated to the
+ * OpenCLDeviceBase implementations; only path tracing is added here.
+ */
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
+{
+public:
+	/* Program built from kernel.cl which holds the path_trace kernel. */
+	OpenCLProgram path_trace_program;
+
+	OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+	: OpenCLDeviceBase(info, stats, background_),
+	  path_trace_program(this, "megakernel", "kernel.cl", "-D__COMPILE_ONLY_MEGAKERNEL__ ")
+	{
+	}
+
+	/* Register the path_trace kernel and append the program to `programs`.
+	 * The requested features are not used.
+	 */
+	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	                          vector<OpenCLProgram*> &programs)
+	{
+		path_trace_program.add_kernel(ustring("path_trace"));
+		programs.push_back(&path_trace_program);
+	}
+
+	~OpenCLDeviceMegaKernel()
+	{
+		/* Stop the task pool before releasing the program. */
+		task_pool.stop();
+		path_trace_program.release();
+	}
+
+	/* Enqueue the path tracing kernel for one sample of the tile.
+	 * Does not wait for the kernel to finish.
+	 */
+	void path_trace(RenderTile& rtile, int sample)
+	{
+		/* Cast arguments to cl types. */
+		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+		cl_int d_x = rtile.x;
+		cl_int d_y = rtile.y;
+		cl_int d_w = rtile.w;
+		cl_int d_h = rtile.h;
+		cl_int d_offset = rtile.offset;
+		cl_int d_stride = rtile.stride;
+
+		/* Sample arguments. */
+		cl_int d_sample = sample;
+
+		cl_kernel ckPathTraceKernel = path_trace_program(ustring("path_trace"));
+
+		/* Leading arguments: constant data, render buffer and RNG state. */
+		cl_uint start_arg_index =
+			kernel_set_args(ckPathTraceKernel,
+			                0,
+			                d_data,
+			                d_buffer,
+			                d_rng_state);
+
+		/* One memory argument per KERNEL_TEX entry of kernel_textures.h. */
+#define KERNEL_TEX(type, ttype, name) \
+		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+		/* Trailing scalar arguments. */
+		start_arg_index += kernel_set_args(ckPathTraceKernel,
+		                                   start_arg_index,
+		                                   d_sample,
+		                                   d_x,
+		                                   d_y,
+		                                   d_w,
+		                                   d_h,
+		                                   d_offset,
+		                                   d_stride);
+
+		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+	}
+
+	/* Execute one device task, dispatching on its type. */
+	void thread_run(DeviceTask *task)
+	{
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				int start_sample = tile.start_sample;
+				int end_sample = tile.start_sample + tile.num_samples;
+
+				for(int sample = start_sample; sample < end_sample; sample++) {
+					/* On cancel, stop sampling unless the task asks for
+					 * the queue to be finished.
+					 */
+					if(task->get_cancel()) {
+						if(task->need_finish_queue == false)
+							break;
+					}
+
+					path_trace(tile, sample);
+
+					/* Record how many samples were issued for the tile. */
+					tile.sample = sample + 1;
+
+					task->update_progress(&tile);
+				}
+
+				/* Complete kernel execution before release tile */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
+
+				task->release_tile(tile);
+			}
+		}
+	}
+};
+
+Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
+{
+	/* Factory for the mega kernel flavor of the OpenCL device. */
+	Device *device = new OpenCLDeviceMegaKernel(info, stats, background);
+	return device;
+}
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -0,0 +1,800 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "opencl.h"
+
+#include "util_logging.h"
+#include "util_path.h"
+#include "util_time.h"
+
+using std::cerr;
+using std::endl;
+
+CCL_NAMESPACE_BEGIN
+
+/* Empty entry: no program cached yet and no mutex allocated. */
+OpenCLCache::Slot::ProgramEntry::ProgramEntry()
+ : program(NULL),
+   mutex(NULL)
+{
+}
+
+/* Copies the program handle only. The mutex is deliberately not copied:
+ * the destructor deletes it, so sharing the pointer between copies would
+ * delete it twice. The copy starts without a mutex.
+ */
+OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs)
+ : program(rhs.program),
+   mutex(NULL)
+{
+}
+
+/* Frees the mutex owned by this entry. The program handle itself is not
+ * released here.
+ */
+OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
+{
+	delete mutex;
+}
+
+/* Empty slot: no context cached yet and no mutex allocated. */
+OpenCLCache::Slot::Slot()
+ : context_mutex(NULL),
+   context(NULL)
+{
+}
+
+/* Copies the program entries only. Neither the context mutex (which is
+ * deleted by the destructor) nor the context handle of `rhs` is carried
+ * over; both start as NULL in the copy.
+ */
+OpenCLCache::Slot::Slot(const Slot& rhs)
+ : context_mutex(NULL),
+   context(NULL),
+   programs(rhs.programs)
+{
+}
+
+/* Frees the mutex owned by this slot. The context handle itself is not
+ * released here.
+ */
+OpenCLCache::Slot::~Slot()
+{
+	delete context_mutex;
+}
+
+/* Access the single cache instance, a function-local static which is
+ * created on first call.
+ */
+OpenCLCache& OpenCLCache::global_instance()
+{
+	static OpenCLCache instance;
+	return instance;
+}
+
+/* Look up the cached context for a platform/device pair.
+ *
+ * Returns the cached context with its reference count incremented, and
+ * `slot_locker` released. When nothing is cached yet NULL is returned and
+ * `slot_locker` is left holding the slot's mutex.
+ * NOTE(review): the caller is then presumably expected to create the context
+ * and hand it to store_context(), which unlocks the slot - confirm against
+ * callers.
+ */
+cl_context OpenCLCache::get_context(cl_platform_id platform,
+                                    cl_device_id device,
+                                    thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+
+	OpenCLCache& self = global_instance();
+
+	thread_scoped_lock cache_lock(self.cache_lock);
+
+	/* Find the slot for this pair, inserting an empty one if missing. */
+	pair<CacheMap::iterator,bool> ins = self.cache.insert(
+		CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+
+	Slot &slot = ins.first->second;
+
+	/* create slot lock only while holding cache lock */
+	if(!slot.context_mutex)
+		slot.context_mutex = new thread_mutex;
+
+	/* need to unlock cache before locking slot, to allow store to complete */
+	cache_lock.unlock();
+
+	/* lock the slot */
+	slot_locker = thread_scoped_lock(*slot.context_mutex);
+
+	/* If the thing isn't cached */
+	if(slot.context == NULL) {
+		/* return with the caller's lock holder holding the slot lock */
+		return NULL;
+	}
+
+	/* the item was already cached, release the slot lock */
+	slot_locker.unlock();
+
+	/* The caller gets its own reference to the context. */
+	cl_int ciErr = clRetainContext(slot.context);
+	assert(ciErr == CL_SUCCESS);
+	/* Silence unused variable warning when asserts are compiled out. */
+	(void)ciErr;
+
+	return slot.context;
+}
+
+/* Look up the cached program stored under `key` for a platform/device pair.
+ *
+ * Returns the cached program with its reference count incremented, and
+ * `slot_locker` released. When nothing is cached yet NULL is returned and
+ * `slot_locker` is left holding the entry's mutex.
+ * NOTE(review): the caller is then presumably expected to build the program
+ * and hand it to store_program(), which unlocks the entry - confirm against
+ * callers.
+ */
+cl_program OpenCLCache::get_program(cl_platform_id platform,
+                                    cl_device_id device,
+                                    ustring key,
+                                    thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+
+	OpenCLCache& self = global_instance();
+
+	thread_scoped_lock cache_lock(self.cache_lock);
+
+	/* Find the slot for this pair, inserting an empty one if missing. */
+	pair<CacheMap::iterator,bool> ins = self.cache.insert(
+		CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+
+	Slot &slot = ins.first->second;
+
+	/* Same for the program entry inside the slot. */
+	pair<Slot::EntryMap::iterator,bool> ins2 = slot.programs.insert(
+		Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
+
+	Slot::ProgramEntry &entry = ins2.first->second;
+
+	/* create slot lock only while holding cache lock */
+	if(!entry.mutex)
+		entry.mutex = new thread_mutex;
+
+	/* need to unlock cache before locking slot, to allow store to complete */
+	cache_lock.unlock();
+
+	/* lock the slot */
+	slot_locker = thread_scoped_lock(*entry.mutex);
+
+	/* If the thing isn't cached */
+	if(entry.program == NULL) {
+		/* return with the caller's lock holder holding the slot lock */
+		return NULL;
+	}
+
+	/* the item was already cached, release the slot lock */
+	slot_locker.unlock();
+
+	/* The caller gets its own reference to the program. */
+	cl_int ciErr = clRetainProgram(entry.program);
+	assert(ciErr == CL_SUCCESS);
+	/* Silence unused variable warning when asserts are compiled out. */
+	(void)ciErr;
+
+	return entry.program;
+}
+
+/* Store a newly created context in the slot of a platform/device pair.
+ *
+ * The slot must already exist and must not hold a context yet.
+ * `slot_locker` is unlocked once the context is stored, and the context's
+ * reference count is incremented on behalf of the cache.
+ * NOTE(review): presumably `slot_locker` is the lock which get_context()
+ * left held when it returned NULL - confirm against callers.
+ */
+void OpenCLCache::store_context(cl_platform_id platform,
+                                cl_device_id device,
+                                cl_context context,
+                                thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+	assert(device != NULL);
+	assert(context != NULL);
+
+	OpenCLCache &self = global_instance();
+
+	thread_scoped_lock cache_lock(self.cache_lock);
+	CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+
+	/* Sanity check, which has to happen before the iterator is
+	 * dereferenced to be of any use.
+	 */
+	assert(i != self.cache.end());
+	Slot &slot = i->second;
+
+	cache_lock.unlock();
+
+	assert(slot.context == NULL);
+
+	slot.context = context;
+
+	/* unlock the slot */
+	slot_locker.unlock();
+
+	/* increment reference count in OpenCL.
+	 * The caller is going to release the object when done with it. */
+	cl_int ciErr = clRetainContext(context);
+	assert(ciErr == CL_SUCCESS);
+	/* Silence unused variable warning when asserts are compiled out. */
+	(void)ciErr;
+}
+
+void OpenCLCache::store_program(cl_platform_id platform,
+                                cl_device_id device,
+                                cl_program program,
+                                ustring key,
+                                thread_scoped_lock& slot_locker)
+{
+	assert(platform != NULL);
+	assert(device != NULL);
+	assert(program != NULL);
+
+	OpenCLCache &cache_instance = global_instance();
+
+	/* Locate the entry while holding the global cache lock. Both the slot
+	 * and the program entry are expected to exist already, with no program
+	 * stored in the entry yet.
+	 */
+	thread_scoped_lock cache_guard(cache_instance.cache_lock);
+
+	const PlatformDevicePair slot_key(platform, device);
+	CacheMap::iterator slot_it = cache_instance.cache.find(slot_key);
+	assert(slot_it != cache_instance.cache.end());
+
+	Slot &slot = slot_it->second;
+	Slot::EntryMap::iterator entry_it = slot.programs.find(key);
+	assert(entry_it != slot.programs.end());
+
+	Slot::ProgramEntry &entry = entry_it->second;
+	assert(entry.program == NULL);
+
+	cache_guard.unlock();
+
+	/* Publish the program, then release the entry lock held by the caller. */
+	entry.program = program;
+	slot_locker.unlock();
+
+	/* The cache keeps its own reference: the caller is going to release
+	 * the object when done with it.
+	 */
+	cl_int retain_err = clRetainProgram(program);
+	assert(retain_err == CL_SUCCESS);
+	(void)retain_err;
+}
+
+string OpenCLCache::get_kernel_md5()
+{
+	OpenCLCache &cache_instance = global_instance();
+	thread_scoped_lock md5_guard(cache_instance.kernel_md5_lock);
+
+	/* The hash of the kernel directory is computed once, on first request,
+	 * and reused for all later calls.
+	 */
+	const bool need_compute = cache_instance.kernel_md5.empty();
+	if(need_compute) {
+		cache_instance.kernel_md5 = path_files_md5_hash(path_get("kernel"));
+	}
+
+	return cache_instance.kernel_md5;
+}
+
+/* Describe a program without building anything yet.
+ *
+ * Only the settings are stored here; the program starts out as not loaded
+ * and without an OpenCL program object.
+ */
+OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
+                                               string program_name,
+                                               string kernel_file,
+                                               string kernel_build_options,
+                                               bool use_stdout)
+ : device(device),
+   program_name(program_name),
+   kernel_file(kernel_file),
+   kernel_build_options(kernel_build_options),
+   use_stdout(use_stdout)
+{
+	loaded = false;
+	program = NULL;
+}
+
+/* Releases the kernels and the program; release() resets the handles to
+ * NULL, so an earlier explicit release() is fine.
+ */
+OpenCLDeviceBase::OpenCLProgram::~OpenCLProgram()
+{
+	release();
+}
+
+void OpenCLDeviceBase::OpenCLProgram::release()
+{
+	/* Release every kernel which was created. The names stay registered,
+	 * with their handles reset to NULL.
+	 */
+	typedef map<ustring, cl_kernel>::iterator KernelIterator;
+	for(KernelIterator it = kernels.begin(); it != kernels.end(); ++it) {
+		cl_kernel &handle = it->second;
+		if(handle != NULL) {
+			clReleaseKernel(handle);
+			handle = NULL;
+		}
+	}
+
+	/* Then release the program itself. */
+	if(program != NULL) {
+		clReleaseProgram(program);
+		program = NULL;
+	}
+}
+
+void OpenCLDeviceBase::OpenCLProgram::add_log(string msg, bool debug)
+{
+	if(use_stdout) {
+		/* Direct output: debug messages go to the verbose log,
+		 * everything else to stdout.
+		 */
+		if(debug) {
+			VLOG(2) << msg;
+		}
+		else {
+			printf("%s\n", msg.c_str());
+		}
+	}
+	else {
+		/* Otherwise collect all messages in the program's own log. */
+		log += msg + "\n";
+	}
+}
+
+void OpenCLDeviceBase::OpenCLProgram::add_error(string msg)
+{
+	if(use_stdout) {
+		fprintf(stderr, "%s\n", msg.c_str());
+	}
+	if(error_msg == "") {
+		error_msg += "\n";
+	}
+	error_msg += msg;
+}
+
+void OpenCLDeviceBase::OpenCLProgram::add_kernel(ustring name)
+{
+	/* Register the kernel name with an empty handle. Names which are
+	 * already registered keep their current handle.
+	 */
+	if(kernels.find(name) == kernels.end()) {
+		kernels[name] = NULL;
+	}
+}
+
+/* Build `program` with the device options followed by the program's own
+ * options, and collect the compiler output.
+ *
+ * Returns false when clBuildProgram reported an error, true otherwise.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
+{
+	string build_options;
+	build_options = device->kernel_build_options(debug_src) + kernel_build_options;
+
+	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
+
+	/* show warnings even if build is successful */
+	size_t ret_val_size = 0;
+
+	/* First query only the size of the build log. */
+	clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+
+	if(ret_val_size > 1) {
+		/* One extra byte for the terminator written below. */
+		vector<char> build_log(ret_val_size + 1);
+		clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+
+		build_log[ret_val_size] = '\0';
+		/* Skip meaningless empty output from the NVidia compiler. */
+		if(!(ret_val_size == 2 && build_log[0] == '\n')) {
+			/* NOTE(review): this is recorded as an error for any non-empty
+			 * log, also when the build succeeded with warnings only.
+			 */
+			add_error("OpenCL build failed: errors in console");
+			if(use_stdout) {
+				fprintf(stderr, "OpenCL kernel build output:\n%s\n", &build_log[0]);
+			}
+			else {
+				/* Kept for report_error(). */
+				compile_output = string(&build_log[0]);
+			}
+		}
+	}
+
+	if(ciErr != CL_SUCCESS) {
+		add_error(string("OpenCL build failed: ") + clewErrorString(ciErr));
+		return false;
+	}
+
+	return true;
+}
+
+/* Create `program` from source and build it.
+ *
+ * When `debug_src` is given, the processed source is also written to that
+ * path. Returns false when creating or building the program failed, in
+ * which case an error has been recorded.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
+{
+	/* The source is a single include of the kernel file, with the hash of
+	 * the kernel directory in a trailing comment.
+	 */
+	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	/* We compile kernels consisting of many files. unfortunately OpenCL
+	 * kernel caches do not seem to recognize changes in included files.
+	 * so we force recompile on changes by adding the md5 hash of all files.
+	 */
+	/* NOTE(review): presumably this expands the includes in place, relative
+	 * to the kernel directory - confirm in util_path.
+	 */
+	source = path_source_replace_includes(source, path_get("kernel"));
+
+	if(debug_src) {
+		path_write_text(*debug_src, source);
+	}
+
+	size_t source_len = source.size();
+	const char *source_str = source.c_str();
+	cl_int ciErr;
+
+	program = clCreateProgramWithSource(device->cxContext,
+	                                   1,
+	                                   &source_str,
+	                                   &source_len,
+	                                   &ciErr);
+
+	if(ciErr != CL_SUCCESS) {
+		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
+		return false;
+	}
+
+	/* Measure and log how long the build takes. */
+	double starttime = time_dt();
+	add_log(string("Compiling OpenCL program ") + program_name.c_str(), false);
+	add_log(string("Build flags: ") + kernel_build_options, true);
+
+	if(!build_kernel(debug_src))
+		return false;
+
+	add_log(string("Kernel compilation of ") + program_name + " finished in " + string_printf("%.2lfs.\n", time_dt() - starttime), false);
+
+	return true;
+}
+
+/* Create `program` from a cached binary file and build it.
+ *
+ * Returns false when the file can not be read, is empty, is rejected by
+ * the driver, or fails to build; an error is recorded in each case.
+ */
+bool OpenCLDeviceBase::OpenCLProgram::load_binary(const string& clbin,
+                                                  const string *debug_src)
+{
+	/* read binary into memory */
+	vector<uint8_t> binary;
+
+	if(!path_read_binary(clbin, binary)) {
+		add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
+		return false;
+	}
+
+	/* An empty file can never be a valid program, and taking the address
+	 * of the first element of an empty vector is not allowed.
+	 */
+	if(binary.empty()) {
+		add_error(string_printf("OpenCL cached binary %s is empty.", clbin.c_str()));
+		return false;
+	}
+
+	/* create program */
+	cl_int status, ciErr;
+	size_t size = binary.size();
+	const uint8_t *bytes = &binary[0];
+
+	program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice,
+		&size, &bytes, &status, &ciErr);
+
+	/* `status` is the load result for the single device, `ciErr` the
+	 * result of the call itself.
+	 */
+	if(status != CL_SUCCESS || ciErr != CL_SUCCESS) {
+		add_error(string("OpenCL failed create program from cached binary ") + clbin + ": "
+		                 + clewErrorString(status) + " " + clewErrorString(ciErr));
+		return false;
+	}
+
+	/* Programs created from binaries still need to be built. */
+	return build_kernel(debug_src);
+}
+
+bool OpenCLDeviceBase::OpenCLProgram::save_binary(const string& clbin)
+{
+	/* Ask for the size of the binary first; space for exactly one size and
+	 * one binary pointer is passed to the queries.
+	 */
+	size_t binary_size = 0;
+	clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
+
+	if(binary_size == 0) {
+		return false;
+	}
+
+	/* Fetch the binary into a buffer of that size and write it out. */
+	vector<uint8_t> binary(binary_size);
+	uint8_t *binary_data = &binary[0];
+
+	clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &binary_data, NULL);
+
+	return path_write_binary(clbin, binary);
+}
+
+/* Make the program and all of its registered kernels ready for use.
+ *
+ * The program is taken from, in order of preference: the in-memory cache,
+ * a binary saved on disk, or a fresh compilation of the sources (which is
+ * then saved to disk). A program obtained from disk or compilation is
+ * stored in the in-memory cache.
+ *
+ * On success `loaded` is set to true. On failure the function returns
+ * early with `loaded` left false and an error recorded.
+ */
+void OpenCLDeviceBase::OpenCLProgram::load()
+{
+	assert(device);
+
+	loaded = false;
+
+	string device_md5 = device->device_md5_hash(kernel_build_options);
+
+	/* Try to use cached kernel. */
+	thread_scoped_lock cache_locker;
+	ustring cache_key(program_name + device_md5);
+	program = device->load_cached_kernel(cache_key,
+	                                     cache_locker);
+
+	if(!program) {
+		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
+
+		/* File name depends on program, device/options and kernel sources. */
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		basename = path_cache_get(path_join("kernels", basename));
+		string clbin = basename + ".clbin";
+
+		/* path to preprocessed source for debugging */
+		string clsrc, *debug_src = NULL;
+
+		if(OpenCLInfo::use_debug()) {
+			clsrc = basename + ".cl";
+			debug_src = &clsrc;
+		}
+
+		/* If binary kernel exists already, try use it. */
+		if(path_exists(clbin) && load_binary(clbin)) {
+			/* Kernel loaded from binary, nothing to do. */
+			add_log(string("Loaded program from ") + clbin + ".", true);
+		}
+		else {
+			add_log(string("Kernel file ") + clbin + " either doesn't exist or failed to be loaded by driver.", true);
+
+			/* If does not exist or loading binary failed, compile kernel. */
+			if(!compile_kernel(debug_src)) {
+				return;
+			}
+
+			/* Save binary for reuse. */
+			if(!save_binary(clbin)) {
+				add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
+			}
+		}
+
+		/* Cache the program. */
+		device->store_cached_kernel(program,
+		                            cache_key,
+		                            cache_locker);
+	}
+	else {
+		add_log(string("Found cached OpenCL program ") + program_name + ".", true);
+	}
+
+	/* Create a kernel object for every registered name; the entry point in
+	 * the program is the name prefixed with "kernel_ocl_".
+	 */
+	for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) {
+		assert(kernel->second == NULL);
+		cl_int ciErr;
+		string name = "kernel_ocl_" + kernel->first.string();
+		kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
+		if(device->opencl_error(ciErr)) {
+			add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr));
+			return;
+		}
+	}
+
+	loaded = true;
+}
+
+/* Print the stored error message, followed by any kernel build output, to
+ * stderr. Nothing is printed when loading succeeded or use_stdout is set.
+ */
+void OpenCLDeviceBase::OpenCLProgram::report_error()
+{
+	/* With loaded set there was no error; with use_stdout set the error was
+	 * already reported.
+	 */
+	if(loaded || use_stdout) {
+		return;
+	}
+
+	cerr << error_msg << endl;
+	if(compile_output.empty()) {
+		return;
+	}
+	cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
+	cerr << compile_output << endl;
+}
+
+/* Return the kernel of a program that holds exactly one kernel. */
+cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()()
+{
+	assert(kernels.size() == 1);
+	map<ustring, cl_kernel>::iterator only_kernel = kernels.begin();
+	return only_kernel->second;
+}
+
+/* Return the kernel registered under `name`.
+ *
+ * An unknown name trips the assert in debug builds. In builds without
+ * asserts NULL is returned, and (unlike map::operator[]) the lookup never
+ * inserts a new entry into `kernels`.
+ */
+cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()(ustring name)
+{
+	assert(kernels.count(name));
+	map<ustring, cl_kernel>::iterator found = kernels.find(name);
+	if(found == kernels.end()) {
+		return NULL;
+	}
+	return found->second;
+}
+
+/* Translate the OpenCL device type debug flag into a cl_device_type.
+ * DEVICE_NONE maps to 0; DEVICE_ALL and any unlisted value map to
+ * CL_DEVICE_TYPE_ALL.
+ */
+cl_device_type OpenCLInfo::device_type()
+{
+	cl_device_type type = CL_DEVICE_TYPE_ALL;
+	switch(DebugFlags().opencl.device_type)
+	{
+		case DebugFlags::OpenCL::DEVICE_NONE:
+			type = 0;
+			break;
+		case DebugFlags::OpenCL::DEVICE_DEFAULT:
+			type = CL_DEVICE_TYPE_DEFAULT;
+			break;
+		case DebugFlags::OpenCL::DEVICE_CPU:
+			type = CL_DEVICE_TYPE_CPU;
+			break;
+		case DebugFlags::OpenCL::DEVICE_GPU:
+			type = CL_DEVICE_TYPE_GPU;
+			break;
+		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
+			type = CL_DEVICE_TYPE_ACCELERATOR;
+			break;
+		case DebugFlags::OpenCL::DEVICE_ALL:
+		default:
+			/* Keep CL_DEVICE_TYPE_ALL. */
+			break;
+	}
+	return type;
+}
+
+/* Report whether the OpenCL debug flag is set. */
+bool OpenCLInfo::use_debug()
+{
+	const bool debug_enabled = DebugFlags().opencl.debug;
+	return debug_enabled;
+}
+
+/* Tell whether advanced shading is enabled for the given platform name.
+ *
+ * Only the platform names listed below qualify, so officially unsupported
+ * OpenCL platforms are never set up to use advanced shading.
+ */
+bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
+{
+	/* keep this in sync with kernel_types.h! */
+	return platform == "NVIDIA CUDA" ||
+	       platform == "Apple" ||
+	       platform == "AMD Accelerated Parallel Processing" ||
+	       platform == "Intel(R) OpenCL";
+}
+
+/* Decide whether the split kernel is used for a platform/device type pair.
+ *
+ * The kernel type debug flag wins when it is KERNEL_SPLIT or KERNEL_MEGA.
+ * Otherwise the split kernel is used only for GPUs on the AMD platform.
+ */
+bool OpenCLInfo::kernel_use_split(const string& platform_name,
+                                  const cl_device_type device_type)
+{
+	switch(DebugFlags().opencl.kernel_type) {
+		case DebugFlags::OpenCL::KERNEL_SPLIT:
+			VLOG(1) << "Forcing split kernel to use.";
+			return true;
+		case DebugFlags::OpenCL::KERNEL_MEGA:
+			VLOG(1) << "Forcing mega kernel to use.";
+			return false;
+		default:
+			/* Nothing forced, decide from platform and device below. */
+			break;
+	}
+	/* TODO(sergey): Replace string lookups with more enum-like API,
+	 * similar to device/vendor checks blender's gpu.
+	 */
+	const bool is_amd_gpu =
+		(platform_name == "AMD Accelerated Parallel Processing") &&
+		(device_type == CL_DEVICE_TYPE_GPU);
+	return is_amd_gpu;
+}
+
+/* Tell whether a device is on the officially supported list.
+ *
+ * Supported are GPU devices on the "AMD Accelerated Parallel Processing"
+ * and "Apple" platforms. A device whose type can not be queried is reported
+ * as unsupported instead of being judged on an uninitialized value.
+ */
+bool OpenCLInfo::device_supported(const string& platform_name,
+                                  const cl_device_id device_id)
+{
+	cl_device_type device_type;
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_TYPE,
+	                   sizeof(cl_device_type),
+	                   &device_type,
+	                   NULL) != CL_SUCCESS)
+	{
+		/* Type query failed, device_type holds no meaningful value. */
+		return false;
+	}
+	if(platform_name == "AMD Accelerated Parallel Processing" &&
+	   device_type == CL_DEVICE_TYPE_GPU)
+	{
+		return true;
+	}
+	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
+		return true;
+	}
+	return false;
+}
+
+/* Check that the platform reports at least the required OpenCL version.
+ *
+ * platform: platform to query for CL_PLATFORM_VERSION.
+ * error:    optional; receives a description on failure and is cleared on
+ *           success.
+ *
+ * Returns true when the reported version is req_major.req_minor or newer,
+ * false when it is older or can not be queried or parsed.
+ */
+bool OpenCLInfo::platform_version_check(cl_platform_id platform,
+                                        string *error)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	/* Zero-initialized so the buffer is never parsed or printed while
+	 * holding indeterminate bytes.
+	 */
+	char version[256] = "";
+	if(clGetPlatformInfo(platform,
+	                     CL_PLATFORM_VERSION,
+	                     sizeof(version),
+	                     &version,
+	                     NULL) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = "OpenCL: failed to query platform version string.";
+		}
+		return false;
+	}
+	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			/* Required version comes from the constants above so the message
+			 * can not drift from the actual check.
+			 */
+			*error = string_printf("OpenCL: platform version %d.%d or later required, found %d.%d",
+			                       req_major, req_minor, major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
+
+/* Check that the device reports at least the required OpenCL C version.
+ *
+ * device: device to query for CL_DEVICE_OPENCL_C_VERSION.
+ * error:  optional; receives a description on failure and is cleared on
+ *         success.
+ *
+ * Returns true when the reported version is req_major.req_minor or newer,
+ * false when it is older or can not be queried or parsed.
+ */
+bool OpenCLInfo::device_version_check(cl_device_id device,
+                                      string *error)
+{
+	const int req_major = 1, req_minor = 1;
+	int major, minor;
+	/* Zero-initialized so the buffer is never parsed or printed while
+	 * holding indeterminate bytes.
+	 */
+	char version[256] = "";
+	if(clGetDeviceInfo(device,
+	                   CL_DEVICE_OPENCL_C_VERSION,
+	                   sizeof(version),
+	                   &version,
+	                   NULL) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = "OpenCL: failed to query OpenCL C version string.";
+		}
+		return false;
+	}
+	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+		if(error != NULL) {
+			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
+		}
+		return false;
+	}
+	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
+		if(error != NULL) {
+			/* Required version comes from the constants above so the message
+			 * can not drift from the actual check.
+			 */
+			*error = string_printf("OpenCL: C version %d.%d or later required, found %d.%d",
+			                       req_major, req_minor, major, minor);
+		}
+		return false;
+	}
+	if(error != NULL) {
+		*error = "";
+	}
+	return true;
+}
+
+/* Collect the OpenCL devices Cycles may use, across all platforms.
+ *
+ * usable_devices: cleared, then filled with one entry per accepted device.
+ * force_all:      when true, or when the kernel type debug flag is not
+ *                 KERNEL_DEFAULT, the device_supported() check is bypassed.
+ *
+ * Platforms and devices failing a query or a version check are skipped.
+ * Diagnostics are only logged during the first call.
+ */
+void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
+                                    bool force_all)
+{
+	const bool force_all_platforms = force_all ||
+		(DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
+	/* Device type filter requested through the debug flags. */
+	const cl_device_type device_type = OpenCLInfo::device_type();
+	static bool first_time = true;
+#define FIRST_VLOG(severity) if(first_time) VLOG(severity)
+
+	usable_devices->clear();
+
+	if(device_type == 0) {
+		FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
+		first_time = false;
+		return;
+	}
+
+	vector<cl_device_id> device_ids;
+	cl_uint num_devices = 0;
+	vector<cl_platform_id> platform_ids;
+	cl_uint num_platforms = 0;
+
+	/* Get devices. */
+	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
+	   num_platforms == 0)
+	{
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+		first_time = false;
+		return;
+	}
+	platform_ids.resize(num_platforms);
+	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
+		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+		first_time = false;
+		return;
+	}
+	/* Devices are numbered consecutively across platforms. */
+	for(cl_uint platform = 0; platform < num_platforms; platform++) {
+		cl_platform_id platform_id = platform_ids[platform];
+		char pname[256];
+		if(clGetPlatformInfo(platform_id,
+		                     CL_PLATFORM_NAME,
+		                     sizeof(pname),
+		                     &pname,
+		                     NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
+			continue;
+		}
+		string platform_name = pname;
+		FIRST_VLOG(2) << "Enumerating devices for platform "
+		              << platform_name << ".";
+		if(!platform_version_check(platform_id)) {
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << " due to too old compiler version.";
+			continue;
+		}
+		/* First query only the device count, then fetch the actual IDs. */
+		num_devices = 0;
+		cl_int ciErr;
+		if((ciErr = clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  0,
+		                  NULL,
+		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			continue;
+		}
+		device_ids.resize(num_devices);
+		if(clGetDeviceIDs(platform_id,
+		                  device_type,
+		                  num_devices,
+		                  &device_ids[0],
+		                  NULL) != CL_SUCCESS)
+		{
+			FIRST_VLOG(2) << "Ignoring platform " << platform_name
+			              << ", failed to fetch devices list.";
+			continue;
+		}
+		for(cl_uint num = 0; num < num_devices; num++) {
+			cl_device_id device_id = device_ids[num];
+			char device_name[1024] = "\0";
+			if(clGetDeviceInfo(device_id,
+			                   CL_DEVICE_NAME,
+			                   sizeof(device_name),
+			                   &device_name,
+			                   NULL) != CL_SUCCESS)
+			{
+				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+				continue;
+			}
+			if(!device_version_check(device_id)) {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << " due to old compiler version.";
+				continue;
+			}
+			if(force_all_platforms ||
+			   device_supported(platform_name, device_id))
+			{
+				/* Actual type of this device; named differently from the
+				 * requested type filter above to avoid shadowing it.
+				 */
+				cl_device_type actual_device_type;
+				if(clGetDeviceInfo(device_id,
+				                   CL_DEVICE_TYPE,
+				                   sizeof(cl_device_type),
+				                   &actual_device_type,
+				                   NULL) != CL_SUCCESS)
+				{
+					FIRST_VLOG(2) << "Ignoring device " << device_name
+					              << ", failed to fetch device type.";
+					continue;
+				}
+				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
+				                                               platform_name,
+				                                               device_id,
+				                                               actual_device_type,
+				                                               device_name));
+			}
+			else {
+				FIRST_VLOG(2) << "Ignoring device " << device_name
+				              << ", not officially supported yet.";
+			}
+		}
+	}
+	first_time = false;
+	/* The helper macro is local to this function, don't leak it further. */
+#undef FIRST_VLOG
+}
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -217,11 +217,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	set(CUDA_VERSION "${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}")

 	# warn for other versions
-	if(CUDA_VERSION MATCHES "75")
+	if(CUDA_VERSION MATCHES "80")
 	else()
 		message(WARNING
 			"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-			"build may succeed but only CUDA 7.5 is officially supported")
+			"build may succeed but only CUDA 8.0 is officially supported")
 	endif()

 	# build for each arch
@@ -253,11 +253,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_nvcc_command ${CUDA_NVCC_EXECUTABLE})
 		set(cuda_nvcc_version ${CUDA_VERSION})

-		if(DEFINED CUDA_NVCC8_EXECUTABLE  AND ((${arch} STREQUAL "sm_60") OR (${arch} STREQUAL "sm_61")))
-			set(cuda_nvcc_command ${CUDA_NVCC8_EXECUTABLE})
-			set(cuda_nvcc_version "80")
-		endif()
-
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")

--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -1,6 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -158,8 +157,9 @@ CCL_NAMESPACE_BEGIN
 #undef BVH_NAME_EVAL
 #undef BVH_FUNCTION_FULL_NAME

+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
-                                          const Ray *ray,
+                                          const Ray ray,
                                          const uint visibility,
                                          Intersection *isect,
                                          uint *lcg_state,
@@ -170,32 +170,32 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 	if(kernel_data.bvh.have_motion) {
 #  ifdef __HAIR__
 		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_hair_motion(kg, ray, isect, visibility, lcg_state, difl, extmax);
+			return bvh_intersect_hair_motion(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #  endif /* __HAIR__ */

-		return bvh_intersect_motion(kg, ray, isect, visibility);
+		return bvh_intersect_motion(kg, &ray, isect, visibility);
 	}
 #endif /* __OBJECT_MOTION__ */

 #ifdef __HAIR__
 	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_hair(kg, ray, isect, visibility, lcg_state, difl, extmax);
+		return bvh_intersect_hair(kg, &ray, isect, visibility, lcg_state, difl, extmax);
 #endif /* __HAIR__ */

 #ifdef __KERNEL_CPU__

 #  ifdef __INSTANCING__
 	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_instancing(kg, ray, isect, visibility);
+		return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */

-	return bvh_intersect(kg, ray, isect, visibility);
+	return bvh_intersect(kg, &ray, isect, visibility);
 #else /* __KERNEL_CPU__ */

 #  ifdef __INSTANCING__
-	return bvh_intersect_instancing(kg, ray, isect, visibility);
+	return bvh_intersect_instancing(kg, &ray, isect, visibility);
 #  else
-	return bvh_intersect(kg, ray, isect, visibility);
+	return bvh_intersect(kg, &ray, isect, visibility);
 #  endif /* __INSTANCING__ */

 #endif /* __KERNEL_CPU__ */
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@

 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
-ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
                                                           int node_addr,
                                                           int child)
 {
@@ -30,7 +30,7 @@ ccl_device_inline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
 }

 #if !defined(__KERNEL_SSE2__)
-ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
                                                 const float3 P,
                                                 const float3 idir,
                                                 const float t,
@@ -77,7 +77,7 @@ ccl_device_inline int bvh_aligned_node_intersect(KernelGlobals *kg,
 #endif
 }

-ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
                                                        const float3 P,
                                                        const float3 idir,
                                                        const float t,
@@ -139,7 +139,7 @@ ccl_device_inline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 #endif
 }

-ccl_device_inline bool bvh_unaligned_node_intersect_child(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
@@ -166,7 +166,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child(
 	return tnear <= tfar;
 }

-ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
        KernelGlobals *kg,
        const float3 P,
        const float3 dir,
@@ -202,7 +202,7 @@ ccl_device_inline bool bvh_unaligned_node_intersect_child_robust(
 	}
 }

-ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const float3 idir,
@@ -232,7 +232,7 @@ ccl_device_inline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	return mask;
 }

-ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const float3 idir,
@@ -264,7 +264,7 @@ ccl_device_inline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
 	return mask;
 }

-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3 P,
                                         const float3 dir,
                                         const float3 idir,
@@ -295,7 +295,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
 	}
 }

-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3 P,
                                                const float3 dir,
                                                const float3 idir,
@@ -333,7 +333,7 @@ ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
 }
 #else  /* !defined(__KERNEL_SSE2__) */

-int ccl_device_inline bvh_aligned_node_intersect(
+int ccl_device_forceinline bvh_aligned_node_intersect(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
@@ -377,7 +377,7 @@ int ccl_device_inline bvh_aligned_node_intersect(
 #  endif
 }

-int ccl_device_inline bvh_aligned_node_intersect_robust(
+ccl_device_forceinline int bvh_aligned_node_intersect_robust(
        KernelGlobals *kg,
        const float3& P,
        const float3& dir,
@@ -441,7 +441,7 @@ int ccl_device_inline bvh_aligned_node_intersect_robust(
 #  endif
 }

-int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
                                                   const float3 P,
                                                   const float3 dir,
                                                   const ssef& isect_near,
@@ -502,7 +502,7 @@ int ccl_device_inline bvh_unaligned_node_intersect(KernelGlobals *kg,
 #  endif
 }

-int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
                                                          const float3 P,
                                                          const float3 dir,
                                                          const ssef& isect_near,
@@ -573,7 +573,7 @@ int ccl_device_inline bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
 #  endif
 }

-ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
                                         const float3& P,
                                         const float3& dir,
                                         const ssef& isect_near,
@@ -611,7 +611,7 @@ ccl_device_inline int bvh_node_intersect(KernelGlobals *kg,
 	}
 }

-ccl_device_inline int bvh_node_intersect_robust(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                const float3& P,
                                                const float3& dir,
                                                const ssef& isect_near,
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -40,21 +40,16 @@
 *
 */

-#ifndef __KERNEL_GPU__
-ccl_device
-#else
-ccl_device_inline
-#endif
-bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                 const Ray *ray,
-                                 Intersection *isect,
-                                 const uint visibility
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                 , uint *lcg_state,
-                                 float difl,
-                                 float extmax
+                                                     , uint *lcg_state,
+                                                     float difl,
+                                                     float extmax
 #endif
-                                 )
+                                                     )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -1,6 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,7 +21,7 @@ CCL_NAMESPACE_BEGIN

 /* Don't inline intersect functions on GPU, this is faster */
 #ifdef __KERNEL_GPU__
-#  define ccl_device_intersect ccl_device_noinline
+#  define ccl_device_intersect ccl_device_forceinline
 #else
 #  define ccl_device_intersect ccl_device_inline
 #endif
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -12,6 +12,8 @@
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ *
+ * Aligned nodes intersection SSE code is adapted from Embree.
 */

 struct QBVHStackItem {
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -1,8 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -1,8 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -1,8 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -1,8 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -1,8 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation,
- * and code copyright 2009-2012 Intel Corporation
- *
- * Modifications Copyright 2011-2014, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -38,7 +38,7 @@

 CCL_NAMESPACE_BEGIN

-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
                                  ShaderData *sd,
                                  const ShaderClosure *sc,
                                  float randu,
@@ -159,7 +159,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
 #ifndef __KERNEL_CUDA__
 ccl_device
 #else
-ccl_device_inline
+ccl_device_forceinline
 #endif
 float3 bsdf_eval(KernelGlobals *kg,
                 ShaderData *sd,
@@ -401,6 +401,8 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
 		default:
 			return false;
 	}
+#else
+	return false;
 #endif
 }

--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -62,7 +62,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 	return 2.0f / (roughness*roughness) - 2.0f;
 }

-ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+ccl_device_forceinline float3 bsdf_ashikhmin_shirley_eval_reflect(
        const ShaderClosure *sc,
        const float3 I,
        const float3 omega_in,
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -184,7 +184,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(
 	*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x)*(*slope_x));
 }

-ccl_device_inline float3 microfacet_sample_stretched(
+ccl_device_forceinline float3 microfacet_sample_stretched(
 	KernelGlobals *kg, const float3 omega_i,
 	const float alpha_x, const float alpha_y,
 	const float randu, const float randv,
@@ -277,7 +277,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
 	       ((!bsdf_a->extra && !bsdf_b->extra) ||
-            ((bsdf_a->extra && bsdf_b->extra) &&
+	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }

--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 /* === GGX Microfacet distribution functions === */

 /* Isotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx(float3 wm, float alpha)
+ccl_device_forceinline float D_ggx(float3 wm, float alpha)
 {
 	wm.z *= wm.z;
 	alpha *= alpha;
@@ -30,7 +30,7 @@ ccl_device_inline float D_ggx(float3 wm, float alpha)
 }

 /* Anisotropic GGX microfacet distribution */
-ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
+ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 {
 	float slope_x = -wm.x/alpha.x;
 	float slope_y = -wm.y/alpha.y;
@@ -40,7 +40,7 @@ ccl_device_inline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }

 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
 		const float r = sqrtf(randU.x / (1.0f - randU.x));
@@ -78,7 +78,7 @@ ccl_device_inline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 }

 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
@@ -94,7 +94,7 @@ ccl_device_inline float3 mf_sample_vndf(const float3 wi, const float2 alpha, con
 /* === Phase functions: Glossy, Diffuse and Glass === */

 /* Phase function for reflective materials, either without a fresnel term (for compatibility) or with the conductive fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
+ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, float3 *k, float3 *weight, const float3 wm)
 {
 	if(n && k)
 		*weight *= fresnel_conductor(dot(wi, wm), *n, *k);
@@ -102,7 +102,7 @@ ccl_device_inline float3 mf_sample_phase_glossy(const float3 wi, float3 *n, floa
 	return -wi + 2.0f * wm * dot(wi, wm);
 }

-ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
+ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w, const float lambda, const float3 wo, const float2 alpha, float3 *n, float3 *k)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -132,7 +132,7 @@ ccl_device_inline float3 mf_eval_phase_glossy(const float3 w, const float lambda
 }

 /* Phase function for rough lambertian diffuse surfaces. */
-ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
+ccl_device_forceinline float3 mf_sample_phase_diffuse(const float3 wm, const float randu, const float randv)
 {
 	float3 tm, bm;
 	make_orthonormals(wm, &tm, &bm);
@@ -141,14 +141,14 @@ ccl_device_inline float3 mf_sample_phase_diffuse(const float3 wm, const float ra
 	return disk.x*tm + disk.y*bm + safe_sqrtf(1.0f - disk.x*disk.x - disk.y*disk.y)*wm;
 }

-ccl_device_inline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
+ccl_device_forceinline float3 mf_eval_phase_diffuse(const float3 w, const float3 wm)
 {
 	const float v = max(0.0f, dot(w, wm)) * M_1_PI_F;
 	return make_float3(v, v, v);
 }

 /* Phase function for dielectric transmissive materials, including both reflection and refraction according to the dielectric fresnel term. */
-ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
+ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi, const float eta, const float3 wm, const float randV, bool *outside)
 {
 	float cosI = dot(wi, wm);
 	float f = fresnel_dielectric_cos(cosI, eta);
@@ -162,7 +162,7 @@ ccl_device_inline float3 mf_sample_phase_glass(const float3 wi, const float eta,
 	return normalize(wm*(cosI*inv_eta + cosT) - wi*inv_eta);
 }

-ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
+ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w, const float lambda, const float3 wo, const bool wo_outside, const float2 alpha, const float eta)
 {
 	if(w.z > 0.9999f)
 		return make_float3(0.0f, 0.0f, 0.0f);
@@ -195,7 +195,7 @@ ccl_device_inline float3 mf_eval_phase_glass(const float3 w, const float lambda,
 /* === Utility functions for the random walks === */

 /* Smith Lambda function for GGX (based on page 12 of the supplemental implementation). */
-ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
+ccl_device_forceinline float mf_lambda(const float3 w, const float2 alpha)
 {
 	if(w.z > 0.9999f)
 		return 0.0f;
@@ -212,18 +212,18 @@ ccl_device_inline float mf_lambda(const float3 w, const float2 alpha)
 }

 /* Height distribution CDF (based on page 4 of the supplemental implementation). */
-ccl_device_inline float mf_invC1(const float h)
+ccl_device_forceinline float mf_invC1(const float h)
 {
 	return 2.0f * saturate(h) - 1.0f;
 }

-ccl_device_inline float mf_C1(const float h)
+ccl_device_forceinline float mf_C1(const float h)
 {
 	return saturate(0.5f * (h + 1.0f));
 }

 /* Masking function (based on page 16 of the supplemental implementation). */
-ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda)
+ccl_device_forceinline float mf_G1(const float3 w, const float C1, const float lambda)
 {
 	if(w.z > 0.9999f)
 		return 1.0f;
@@ -233,7 +233,7 @@ ccl_device_inline float mf_G1(const float3 w, const float C1, const float lambda
 }

 /* Sampling from the visible height distribution (based on page 17 of the supplemental implementation). */
-ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
+ccl_device_forceinline bool mf_sample_height(const float3 w, float *h, float *C1, float *G1, float *lambda, const float U)
 {
 	if(w.z > 0.9999f)
 		return false;
@@ -262,14 +262,14 @@ ccl_device_inline bool mf_sample_height(const float3 w, float *h, float *C1, flo

 /* Approximation for the albedo of the single-scattering GGX distribution,
 * the missing energy is then approximated as a diffuse reflection for the PDF. */
-ccl_device_inline float mf_ggx_albedo(float r)
+ccl_device_forceinline float mf_ggx_albedo(float r)
 {
 	float albedo = 0.806495f*expf(-1.98712f*r*r) + 0.199531f;
 	albedo -= ((((((1.76741f*r - 8.43891f)*r + 15.784f)*r - 14.398f)*r + 6.45221f)*r - 1.19722f)*r + 0.027803f)*r + 0.00568739f;
 	return saturate(albedo);
 }

-ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
+ccl_device_forceinline float mf_ggx_pdf(const float3 wi, const float3 wo, const float alpha)
 {
 	float D = D_ggx(normalize(wi+wo), alpha);
 	float lambda = mf_lambda(wi, make_float2(alpha, alpha));
@@ -277,17 +277,17 @@ ccl_device_inline float mf_ggx_pdf(const float3 wi, const float3 wo, const float
 	return 0.25f * D / max((1.0f + lambda) * wi.z, 1e-7f) + (1.0f - albedo) * wo.z;
 }

-ccl_device_inline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
+ccl_device_forceinline float mf_ggx_aniso_pdf(const float3 wi, const float3 wo, const float2 alpha)
 {
 	return 0.25f * D_ggx_aniso(normalize(wi+wo), alpha) / ((1.0f + mf_lambda(wi, alpha)) * wi.z) + (1.0f - mf_ggx_albedo(sqrtf(alpha.x*alpha.y))) * wo.z;
 }

-ccl_device_inline float mf_diffuse_pdf(const float3 wo)
+ccl_device_forceinline float mf_diffuse_pdf(const float3 wo)
 {
 	return M_1_PI_F * wo.z;
 }

-ccl_device_inline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
+ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, const float alpha, const float eta)
 {
 	float3 wh;
 	float fresnel;
@@ -404,7 +404,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
 		*pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
-	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->extra->cspec0, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, bsdf->extra->use_fresnel);
+	return mf_eval_glossy(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->cspec0);
 }

 ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -430,7 +430,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;

-	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->extra->cspec0, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, bsdf->extra->use_fresnel);
+	*eval = mf_sample_glossy(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, NULL, NULL, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->cspec0);
 	if(is_aniso)
 		*pdf = mf_ggx_aniso_pdf(localI, localO, make_float2(bsdf->alpha_x, bsdf->alpha_y));
 	else
@@ -447,7 +447,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC

 /* Multiscattering GGX Glass closure */

-ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf, bool use_fresnel = false, bool initial_outside = true)
+ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf, bool use_fresnel = false)
 {
 	bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
 	bsdf->alpha_y = bsdf->alpha_x;
@@ -459,7 +459,6 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(MicrofacetBsdf *bsdf, bool
 	bsdf->extra->cspec0.x = saturate(bsdf->extra->cspec0.x);
 	bsdf->extra->cspec0.y = saturate(bsdf->extra->cspec0.y);
 	bsdf->extra->cspec0.z = saturate(bsdf->extra->cspec0.z);
-	bsdf->extra->initial_outside = initial_outside;

 	bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;

@@ -481,7 +480,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_transmit(const ShaderClos
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));

 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->extra->cspec0, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
+	return mf_eval_glass(localI, localO, false, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior);
 }

 ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf, ccl_addr_space uint *lcg_state) {
@@ -499,7 +498,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
 	float3 localO = make_float3(dot(omega_in, X), dot(omega_in, Y), dot(omega_in, Z));

 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
-	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->extra->cspec0, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->initial_outside);
+	return mf_eval_glass(localI, localO, true, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->cspec0);
 }

 ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const ShaderClosure *sc, float3 Ng, float3 I, float3 dIdx, float3 dIdy, float randu, float randv, float3 *eval, float3 *omega_in, float3 *domega_in_dx, float3 *domega_in_dy, float *pdf, ccl_addr_space uint *lcg_state)
@@ -546,7 +545,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg, const S
 	float3 localI = make_float3(dot(I, X), dot(I, Y), dot(I, Z));
 	float3 localO;

-	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->extra->cspec0, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->initial_outside);
+	*eval = mf_sample_glass(localI, &localO, bsdf->extra->color, bsdf->alpha_x, bsdf->alpha_y, lcg_state, bsdf->ior, bsdf->extra->use_fresnel, bsdf->extra->cspec0);
 	*pdf = mf_glass_pdf(localI, localO, bsdf->alpha_x, bsdf->ior);
 	*eval *= *pdf;

--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -25,23 +25,23 @@
 * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
 * the balance heuristic isn't necessarily optimal anymore.
 */
-ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
        float3 wi,
        float3 wo,
        const bool wo_outside,
 		const float3 color,
-		const float3 cspec0,
        const float alpha_x,
        const float alpha_y,
         ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
        , const float eta
 		, bool use_fresnel = false
-		, bool initial_outside = true
+		, const float3 cspec0 = make_float3(1.0f, 1.0f, 1.0f)
 #elif defined(MF_MULTI_GLOSSY)
 		 , float3 *n, float3 *k
 		 , const float eta = 1.0f
 		 , bool use_fresnel = false
+		 , const float3 cspec0 = make_float3(1.0f, 1.0f, 1.0f)
 #endif
 )
 {
@@ -88,8 +88,8 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 	float3 throughput2 = make_float3(1.0f, 1.0f, 1.0f);
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	float F0_norm = 1.0f / (1.0f - F0);
-	if (use_fresnel/* && initial_outside*/) {
-		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wo)), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi, normalize(wi + wo))); //
+	if (use_fresnel) {
+		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wo)), eta) - F0) * F0_norm;
 		throughput2 = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 		eval2 = throughput2 * eval;
@@ -118,7 +118,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	float F0_norm = 1.0f / (1.0f - F0);
 	if (use_fresnel) {
-		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wo)), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi, normalize(wi + wo))); //
+		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wo)), eta) - F0) * F0_norm;
 		throughput2 = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 		eval2 = throughput2 * val;
@@ -167,7 +167,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			else
 				phase = mf_eval_phase_glass(wr, lambda_r, -wo, !wo_outside, alpha, 1.0f/eta);

-			if (use_fresnel/* && initial_outside*/)
+			if (use_fresnel)
 				eval2 += throughput2 * phase * mf_G1(wo_outside ? wo : -wo, mf_C1((outside == wo_outside) ? hr : -hr), shadowing_lambda);
 #elif defined(MF_MULTI_DIFFUSE)
 			phase = mf_eval_phase_diffuse(wo, wm);
@@ -194,8 +194,8 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			if (use_fresnel && !next_outside) {
 				throughput2 *= color;
 			}
-			else if (use_fresnel/* && initial_outside && outside && next_outside*/) {
-				float FH = (fresnel_dielectric_cos(dot(wi_prev, wm), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi_prev, wm)); //
+			else if (use_fresnel) {
+				float FH = (fresnel_dielectric_cos(dot(wi_prev, wm), eta) - F0) * F0_norm;
 				t_color = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 				if (order > 0)
@@ -207,7 +207,7 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 			                             lcg_step_float_addrspace(lcg_state));
 #else /* MF_MULTI_GLOSSY */
 			if (use_fresnel) {
-				float FH = (fresnel_dielectric_cos(dot(-wr, wm), eta) - F0) * F0_norm; //schlick_fresnel(dot(-wr, wm)); //
+				float FH = (fresnel_dielectric_cos(dot(-wr, wm), eta) - F0) * F0_norm;
 				t_color = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 				if (order > 0)
@@ -246,15 +246,16 @@ ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 * escaped the surface in wo. The function returns the throughput between wi and wo.
 * Without reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
 */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float3 cspec0, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
+ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
 	, const float eta
 	, bool use_fresnel = false
-	, bool initial_outside = true
+	, const float3 cspec0 = make_float3(1.0f, 1.0f, 1.0f)
 #elif defined(MF_MULTI_GLOSSY)
 	, float3 *n, float3 *k
 	, const float eta = 1.0f
 	, bool use_fresnel = false
+	, const float3 cspec0 = make_float3(1.0f, 1.0f, 1.0f)
 #endif
 )
 {
@@ -272,8 +273,8 @@ ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const
 	float3 throughput2 = make_float3(1.0f, 1.0f, 1.0f);
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	float F0_norm = 1.0f / (1.0f - F0);
-	if (use_fresnel/* && initial_outside*/) {
-		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wr)), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi, normalize(wi + wr))); //
+	if (use_fresnel) {
+		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wr)), eta) - F0) * F0_norm;
 		throughput2 = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
 	}
 #elif defined(MF_MULTI_GLOSSY)
@@ -282,7 +283,7 @@ ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const
 	float F0 = fresnel_dielectric_cos(1.0f, eta);
 	float F0_norm = 1.0f / (1.0f - F0);
 	if (use_fresnel) {
-		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wr)), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi, normalize(wi + wr))); //
+		float FH = (fresnel_dielectric_cos(dot(wi, normalize(wi + wr)), eta) - F0) * F0_norm;
 		throughput2 = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
 	}
 #endif
@@ -322,8 +323,8 @@ ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const
 			if (!next_outside) {
 				throughput2 *= color;
 			}
-			else if (/*initial_outside && outside && next_outside*/true) {
-				float FH = (fresnel_dielectric_cos(dot(wi_prev, wm), eta) - F0) * F0_norm; //schlick_fresnel(dot(wi_prev, wm)); //
+			else {
+				float FH = (fresnel_dielectric_cos(dot(wi_prev, wm), eta) - F0) * F0_norm;
 				t_color = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 				if (order == 0)
@@ -338,7 +339,7 @@ ccl_device float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi, float3 *wo, const
 		                             lcg_step_float_addrspace(lcg_state));
 #else /* MF_MULTI_GLOSSY */
 		if (use_fresnel) {
-			float FH = (fresnel_dielectric_cos(dot(-wr, wm), eta) - F0) * F0_norm; //schlick_fresnel(dot(-wr, wm)); //
+			float FH = (fresnel_dielectric_cos(dot(-wr, wm), eta) - F0) * F0_norm;
 			t_color = cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;

 			if (order == 0)
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -143,7 +143,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }

 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -257,7 +257,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
 * Returns scaled radius, meaning the result is to be scaled up by d.
 * Since there's no closed form solution we do Newton-Raphson method to find it.
 */
-ccl_device_inline float bssrdf_burley_root_find(float xi)
+ccl_device_forceinline float bssrdf_burley_root_find(float xi)
 {
 	const float tolerance = 1e-6f;
 	const int max_iteration_count = 10;
@@ -412,7 +412,7 @@ ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float
 		bssrdf_burley_sample(sc, xi, r, h);
 }

-ccl_device_inline float bssrdf_pdf(const ShaderClosure *sc, float r)
+ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
 {
 	if(sc->type == CLOSURE_BSSRDF_CUBIC_ID)
 		return bssrdf_cubic_pdf(sc, r);
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -1,6 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -222,10 +222,10 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)

 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
-ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
@@ -621,7 +621,7 @@ ccl_device_inline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
 	return hit;
 }

-ccl_device_inline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -1,6 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -1,6 +1,5 @@
 /*
- * Adapted from code Copyright 2009-2010 NVIDIA Corporation
- * Modifications Copyright 2011, Blender Foundation.
+ * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -107,6 +107,67 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,

 	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, triAddr);
+
+#if defined(__KERNEL_AVX2__)
+	const avxf avxf_P(P.m128, P.m128);
+
+	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
+	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
+
+	const avxf AB = tri_ab - avxf_P;
+	const avxf BC = tri_bc - avxf_P;
+
+	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
+
+	const avxf AB_k = shuffle(AB, permuteMask);
+	const avxf BC_k = shuffle(BC, permuteMask);
+
+	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
+	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
+
+	/* Akx, Aky, Bkx, Bky, Bkx, Bky, Ckx, Cky */
+	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
+
+	const avxf Sxy(Sy, Sx, Sy, Sx);
+
+	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
+	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
+
+	float ABBC_kz_array[8];
+	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
+
+	const float A_kz = ABBC_kz_array[0];
+	const float B_kz = ABBC_kz_array[2];
+	const float C_kz = ABBC_kz_array[6];
+
+	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
+	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
+
+	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
+
+	/* W           U                             V
+	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
+	 */
+	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
+
+	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
+
+	/* Calculate scaled barycentric coordinates. */
+	float WUVW_array[4];
+	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
+
+	const float W = WUVW_array[0];
+	const float U = WUVW_array[1];
+	const float V = WUVW_array[2];
+
+	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
+	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
+	                                               _mm256_setzero_ps(), 0));
+
+	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
+		return false;
+	}
+#else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
@@ -135,6 +196,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 	{
 		return false;
 	}
+#endif

 	/* Calculate determinant. */
 	float det = U + V + W;
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -54,13 +54,7 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 	}
 }

-/* TODO(sergey): This is just a workaround for annoying 6.5 compiler bug. */
-#if !defined(__KERNEL_CUDA__) || __CUDA_ARCH__ < 500
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
-void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -221,14 +221,6 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,

 	/* create ray form raster position */
 	ray->P = make_float3(0.0f, 0.0f, 0.0f);
-
-#ifdef __CAMERA_CLIPPING__
-	/* clipping */
-	ray->t = kernel_data.cam.cliplength;
-#else
-	ray->t = FLT_MAX;
-#endif
-
 	ray->D = panorama_to_direction(kg, Pcamera.x, Pcamera.y);

 	/* indicates ray should not receive any light, outside of the lens */
@@ -302,6 +294,14 @@ ccl_device_inline void camera_sample_panorama(KernelGlobals *kg,
 	ray->dD.dy = spherical_stereo_direction(kg, tD, tP, Pcamera) - Ddiff;
 	/* dP.dy is zero, since the omnidirectional panorama only shift the eyes horizontally */
 #endif
+
+#ifdef __CAMERA_CLIPPING__
+	/* clipping */
+	ray->P += kernel_data.cam.nearclip*ray->D;
+	ray->t = kernel_data.cam.cliplength;
+#else
+	ray->t = FLT_MAX;
+#endif
 }

 /* Common */
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -71,6 +71,20 @@ template<typename T> struct texture  {
 		return data[index];
 	}

+#ifdef __KERNEL_AVX__
+	/* Reads 256 bits but indexes in blocks of 128 bits to maintain
+	 * compatibility with existing indices and data structures.
+	 */
+	ccl_always_inline avxf fetch_avxf(const int index)
+	{
+		kernel_assert(index >= 0 && (index+1) < width);
+		ssef *ssefData = (ssef*)data;
+		ssef *ssefNodeData = &ssefData[index];
+		return _mm256_loadu_ps((float *)ssefNodeData);
+	}
+
+#endif
+
 #ifdef __KERNEL_SSE2__
 	ccl_always_inline ssef fetch_ssef(int index)
 	{
@@ -506,6 +520,7 @@ typedef texture_image<half4> texture_image_half4;
 /* Macros to handle different memory storage on different devices */

 #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
 #define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
 #define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
 #define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -37,6 +37,7 @@
 /* Qualifier wrappers for different names on different devices */

 #define ccl_device  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
 #  define ccl_device_inline  __device__ __forceinline__
 #else
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -33,6 +33,7 @@
 /* in opencl all functions are device functions, so leave this empty */
 #define ccl_device
 #define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device ccl_noinline
 #define ccl_may_alias
 #define ccl_constant __constant
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -69,7 +69,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		Intersection isect;
 		uint visibility = path_state_ray_visibility(kg, state);
 		bool hit = scene_intersect(kg,
-		                           ray,
+		                           *ray,
 		                           visibility,
 		                           &isect,
 		                           NULL,
@@ -655,9 +655,9 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
 		}

-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif

 #ifdef __KERNEL_DEBUG__
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -282,9 +282,9 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
 		}

-		bool hit = scene_intersect(kg, &ray, visibility, &isect, &lcg_state, difl, extmax);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-		bool hit = scene_intersect(kg, &ray, visibility, &isect, NULL, 0.0f, 0.0f);
+		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif

 #ifdef __KERNEL_DEBUG__
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }

-ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -132,13 +132,7 @@ ccl_device_inline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng,
 #endif
 }

-/* Temporary workaround for Pascal cards, otherwise AA does not work properly. */
-#if defined(__KERNEL_GPU__) && __CUDA_ARCH__ >= 600
-__device__ __forceinline__
-#else
-ccl_device_inline
-#endif
-void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -199,7 +193,7 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG

 /* Linear Congruential Generator */

-ccl_device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
 {
 	/* implicit mod 2^32 */
 	rng = (1103515245*(rng) + 12345);
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -851,11 +851,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 #ifdef __SVM__
 		svm_eval_nodes(kg, sd, state, SHADER_TYPE_SURFACE, path_flag);
 #else
-		ccl_fetch_array(sd, closure, 0)->weight = make_float3(0.8f, 0.8f, 0.8f);
-		ccl_fetch_array(sd, closure, 0)->N = ccl_fetch(sd, N);
-		ccl_fetch_array(sd, closure, 0)->data0 = 0.0f;
-		ccl_fetch_array(sd, closure, 0)->data1 = 0.0f;
-		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(ccl_fetch_array(sd, closure, 0));
+		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
+		                                             sizeof(DiffuseBsdf),
+		                                             make_float3(0.8f, 0.8f, 0.8f));
+		bsdf->N = ccl_fetch(sd, N);
+		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
 #endif
 	}

--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -155,7 +155,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd,
 	}
 	else {
 		Intersection isect;
-		blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
+		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
 	}

 #ifdef __VOLUME__
@@ -205,7 +205,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 	Intersection *isect = &isect_object;
 #endif

-	bool blocked = scene_intersect(kg, ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
+	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);

 #ifdef __TRANSPARENT_SHADOWS__
 	if(blocked && kernel_data.integrator.transparent_shadows) {
@@ -221,7 +221,7 @@ ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
 				if(bounce >= kernel_data.integrator.transparent_max_bounce)
 					return true;

-				if(!scene_intersect(kg, ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
+				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
 				{
 #ifdef __VOLUME__
 					/* attenuation for last line segment towards light */
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -85,16 +85,11 @@ ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, Sha
 	return NULL;
 }

-#ifndef __KERNEL_GPU__
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-float3 subsurface_scatter_eval(ShaderData *sd,
-                               ShaderClosure *sc,
-                               float disk_r,
-                               float r,
-                               bool all)
+ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
+                                                 ShaderClosure *sc,
+                                                 float disk_r,
+                                                 float r,
+                                                 bool all)
 {
 #ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
@@ -240,14 +235,9 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 /* Subsurface scattering step, from a point on the surface to other
 * nearby points on the same object.
 */
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_inline
-#endif
-int subsurface_scatter_multi_intersect(
+ccl_device_inline int subsurface_scatter_multi_intersect(
        KernelGlobals *kg,
-        SubsurfaceIntersection* ss_isect,
+        SubsurfaceIntersection *ss_isect,
        ShaderData *sd,
        ShaderClosure *sc,
        uint *lcg_state,
@@ -347,6 +337,10 @@ int subsurface_scatter_multi_intersect(
 			                                          verts);
 		}
 #endif  /* __OBJECT_MOTION__ */
+		else {
+			ss_isect->weight[hit] = make_float3(0.0f, 0.0f, 0.0f);
+			continue;
+		}

 		float3 hit_Ng = ss_isect->Ng[hit];
 		if(ss_isect->hits[hit].object != OBJECT_NONE) {
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -45,6 +45,7 @@
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
+#    define __KERNEL_SSE__
 #    define __KERNEL_AVX2__
 #  endif
 #endif
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -20,6 +20,7 @@

 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #  define __KERNEL_SSE3__
 #  define __KERNEL_SSSE3__
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -1153,7 +1153,7 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->sd.osl_globals = sd->osl_globals;

 	/* raytrace */
-	return scene_intersect(sd->osl_globals, &ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+	return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
 }


--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -109,9 +109,9 @@ ccl_device void kernel_scene_intersect(
 		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
 	}

-	bool hit = scene_intersect(kg, &ray, visibility, isect, &lcg_state, difl, extmax);
+	bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
 #else
-	bool hit = scene_intersect(kg, &ray, visibility, isect, NULL, 0.0f, 0.0f);
+	bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
 #endif

 #ifdef __KERNEL_DEBUG__
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -302,7 +302,6 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				if (transp > CLOSURE_WEIGHT_CUTOFF) {
 					float3 glass_weight = weight * transp;
 					float3 cspec0 = baseColor * specularTint + make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specularTint);
-					bool frontfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) == 0;

 					if (roughness <= 5e-2f || distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
 						float refl_roughness = roughness;
@@ -382,7 +381,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							bsdf->extra->cspec0 = cspec0;

 							/* setup bsdf */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf, true, frontfacing);
+							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf, true);
 						}
 					}
 				}
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -164,6 +164,9 @@ ccl_device float3 svm_math_blackbody_color(float t) {

 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
 {
+	if(gamma == 0.0f)
+		return make_float3(1.0f, 1.0f, 1.0f);
+
 	if(color.x > 0.0f)
 		color.x = powf(color.x, gamma);
 	if(color.y > 0.0f)
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -89,6 +89,19 @@ void ConstantFolder::make_zero() const
 	}
 }

+void ConstantFolder::make_one() const
+{
+	if(output->type() == SocketType::FLOAT) {
+		make_constant(1.0f);
+	}
+	else if(SocketType::is_float3(output->type())) {
+		make_constant(make_float3(1.0f, 1.0f, 1.0f));
+	}
+	else {
+		assert(0);
+	}
+}
+
 void ConstantFolder::bypass(ShaderOutput *new_output) const
 {
 	assert(new_output);
@@ -321,6 +334,16 @@ void ConstantFolder::fold_math(NodeMath type, bool clamp) const
 				make_zero();
 			}
 			break;
+		case NODE_MATH_POWER:
+			/* 1 ^ X == X ^ 0 == 1 */
+			if(is_one(value1_in) || is_zero(value2_in)) {
+				make_one();
+			}
+			/* X ^ 1 == X */
+			else if(is_one(value2_in)) {
+				try_bypass_or_make_constant(value1_in, clamp);
+			}
+			break;
 		default:
 			break;
 	}
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -43,6 +43,7 @@ public:
 	void make_constant_clamp(float value, bool clamp) const;
 	void make_constant_clamp(float3 value, bool clamp) const;
 	void make_zero() const;
+	void make_one() const;

 	/* Bypass node, relinking to another output socket. */
 	void bypass(ShaderOutput *output) const;
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -321,8 +321,8 @@ void ShaderGraph::finalize(Scene *scene,
 	 * modified afterwards. */

 	if(!finalized) {
-		clean(scene);
 		default_inputs(do_osl);
+		clean(scene);
 		refine_bump_nodes();

 		if(do_bump)
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -109,7 +109,7 @@ namespace Far {

 	template<>
 	void TopologyRefinerFactory<ccl::Mesh>::reportInvalidTopology(TopologyError /*err_code*/,
-		char const */*msg*/, ccl::Mesh const& /*mesh*/)
+		char const * /*msg*/, ccl::Mesh const& /*mesh*/)
 	{
 	}
 } /* namespace Far */
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -4049,6 +4049,19 @@ void GammaNode::constant_fold(const ConstantFolder& folder)
 	if(folder.all_inputs_constant()) {
 		folder.make_constant(svm_math_gamma_color(color, gamma));
 	}
+	else {
+		ShaderInput *color_in = input("Color");
+		ShaderInput *gamma_in = input("Gamma");
+
+		/* 1 ^ X == X ^ 0 == 1 */
+		if(folder.is_one(color_in) || folder.is_zero(gamma_in)) {
+			folder.make_one();
+		}
+		/* X ^ 1 == X */
+		else if(folder.is_one(gamma_in)) {
+			folder.try_bypass_or_make_constant(color_in, false);
+		}
+	}
 }

 void GammaNode::compile(SVMCompiler& compiler)
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -930,6 +930,72 @@ TEST(render_graph, constant_fold_gamma)
 	graph.finalize(&scene);
 }

+/*
+ * Tests: Gamma with one constant 0 input.
+ */
+TEST(render_graph, constant_fold_gamma_part_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	INVALID_INFO_MESSAGE(log, "Folding Gamma_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to constant (1, 1, 1).");
+
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_Cx")
+		          .set("Color", make_float3(0.0f, 0.0f, 0.0f)))
+		.add_connection("Attribute::Fac", "Gamma_Cx::Gamma")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_xC")
+		          .set("Gamma", 0.0f))
+		.add_connection("Attribute::Color", "Gamma_xC::Color")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<MixNode>("Out")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Gamma_Cx::Color", "Out::Color1")
+		.add_connection("Gamma_xC::Color", "Out::Color2")
+		.output_color("Out::Color");
+
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: Gamma with one constant 1 input.
+ */
+TEST(render_graph, constant_fold_gamma_part_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_Cx::Color to constant (1, 1, 1).");
+	CORRECT_INFO_MESSAGE(log, "Folding Gamma_xC::Color to socket Attribute::Color.");
+
+	builder
+		.add_attribute("Attribute")
+		/* constant on the left */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_Cx")
+		          .set("Color", make_float3(1.0f, 1.0f, 1.0f)))
+		.add_connection("Attribute::Fac", "Gamma_Cx::Gamma")
+		/* constant on the right */
+		.add_node(ShaderNodeBuilder<GammaNode>("Gamma_xC")
+		          .set("Gamma", 1.0f))
+		.add_connection("Attribute::Color", "Gamma_xC::Color")
+		/* output sum */
+		.add_node(ShaderNodeBuilder<MixNode>("Out")
+		          .set(&MixNode::type, NODE_MIX_ADD)
+		          .set(&MixNode::use_clamp, true)
+		          .set("Fac", 1.0f))
+		.add_connection("Gamma_Cx::Color", "Out::Color1")
+		.add_connection("Gamma_xC::Color", "Out::Color2")
+		.output_color("Out::Color");
+
+	graph.finalize(&scene);
+}
+
 /*
 * Tests: BrightnessContrast with all constant inputs.
 */
@@ -1142,6 +1208,40 @@ TEST(render_graph, constant_fold_part_math_div_0)
 	graph.finalize(&scene);
 }

+/*
+ * Tests: partial folding for Math Power with known 0.
+ */
+TEST(render_graph, constant_fold_part_math_pow_0)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* X ^ 0 == 1 */
+	INVALID_INFO_MESSAGE(log, "Folding Math_Cx::");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to constant (1).");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_POWER, 0.0f);
+	graph.finalize(&scene);
+}
+
+/*
+ * Tests: partial folding for Math Power with known 1.
+ */
+TEST(render_graph, constant_fold_part_math_pow_1)
+{
+	DEFINE_COMMON_VARIABLES(builder, log);
+
+	EXPECT_ANY_MESSAGE(log);
+	/* 1 ^ X == 1; X ^ 1 == X */
+	CORRECT_INFO_MESSAGE(log, "Folding Math_Cx::Value to constant (1)");
+	CORRECT_INFO_MESSAGE(log, "Folding Math_xC::Value to socket Attribute::Fac.");
+	INVALID_INFO_MESSAGE(log, "Folding Out::");
+
+	build_math_partial_test_graph(builder, NODE_MATH_POWER, 1.0f);
+	graph.finalize(&scene);
+}
+
 /*
 * Tests: Vector Math with all constant inputs.
 */
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
 	util_sky_model.cpp
 	util_sky_model.h
 	util_sky_model_data.h
+	util_avxf.h
 	util_sseb.h
 	util_ssef.h
 	util_ssei.h
--- a/intern/cycles/util/util_avxf.h
+++ b/intern/cycles/util/util_avxf.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2016 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_AVXF_H__
+#define __UTIL_AVXF_H__
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __KERNEL_AVX__
+struct avxf
+{
+	typedef avxf Float;
+
+	enum { size = 8 };  /* Number of SIMD elements. */
+
+	union {
+		__m256 m256;
+		float f[8];
+		int i[8];
+	};
+
+	__forceinline avxf           () {}
+	__forceinline avxf           (const avxf& other) { m256 = other.m256; }
+	__forceinline avxf& operator=(const avxf& other) { m256 = other.m256; return *this; }
+
+	__forceinline avxf(const __m256 a) : m256(a) {}
+	__forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps (a)) {}
+
+	__forceinline operator const __m256&(void) const { return m256; }
+	__forceinline operator       __m256&(void)       { return m256; }
+
+	__forceinline avxf          (float a) : m256(_mm256_set1_ps(a)) {}
+
+	__forceinline avxf(float high32x4, float low32x4) :
+	   m256(_mm256_set_ps(high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4)) {}
+
+	__forceinline avxf(float a3, float a2, float a1, float a0) :
+	   m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0)) {}
+
+	__forceinline avxf(float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0) :
+		m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0)) {}
+
+
+	__forceinline avxf(int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+
+	__forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
+	{
+		const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
+		m256 = _mm256_castsi256_ps(foo);
+	}
+
+	__forceinline avxf(__m128 a, __m128 b)
+	{
+		const __m256 foo = _mm256_castps128_ps256(a);
+		m256 = _mm256_insertf128_ps(foo, b, 1);
+	}
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Unary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf mm256_sqrt(const avxf& a) { return _mm256_sqrt_ps(a.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Binary Operators
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf operator +(const avxf& a, const avxf& b) { return _mm256_add_ps(a.m256, b.m256); }
+__forceinline const avxf operator +(const avxf& a, const float& b) { return a + avxf(b); }
+__forceinline const avxf operator +(const float& a, const avxf& b) { return avxf(a) + b; }
+
+__forceinline const avxf operator -(const avxf& a, const avxf& b) { return _mm256_sub_ps(a.m256, b.m256); }
+__forceinline const avxf operator -(const avxf& a, const float& b) { return a - avxf(b); }
+__forceinline const avxf operator -(const float& a, const avxf& b) { return avxf(a) - b; }
+
+__forceinline const avxf operator *(const avxf& a, const avxf& b) { return _mm256_mul_ps(a.m256, b.m256); }
+__forceinline const avxf operator *(const avxf& a, const float& b) { return a * avxf(b); }
+__forceinline const avxf operator *(const float& a, const avxf& b) { return avxf(a) * b; }
+
+__forceinline const avxf operator /(const avxf& a, const avxf& b) { return _mm256_div_ps(a.m256,b.m256); }
+__forceinline const avxf operator /(const avxf& a, const float& b) { return a/avxf(b); }
+__forceinline const avxf operator /(const float& a, const avxf& b) { return avxf(a)/b; }
+
+__forceinline const avxf operator|(const avxf& a, const avxf& b) { return _mm256_or_ps(a.m256,b.m256); }
+
+__forceinline const avxf operator^(const avxf& a, const avxf& b) { return _mm256_xor_ps(a.m256,b.m256); }
+
+__forceinline const avxf operator&(const avxf& a, const avxf& b) { return _mm256_and_ps(a.m256,b.m256); }
+
+////////////////////////////////////////////////////////////////////////////////
+/// Movement/Shifting/Shuffling Functions
+////////////////////////////////////////////////////////////////////////////////
+
+__forceinline const avxf shuffle(const avxf& a, const __m256i &shuf) {
+	return _mm256_permutevar_ps(a, shuf);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf shuffle(const avxf& a) {
+	return _mm256_permutevar_ps(a, _mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
+}
+
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
+}
+template<size_t i0, size_t i1, size_t i2, size_t i3> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0,i1,i2,i3>(a,a);
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a, const avxf& b) {
+	return shuffle<i0,i0,i0,i0>(a, b);
+}
+template<size_t i0> __forceinline const avxf shuffle(const avxf& a) {
+	return shuffle<i0>(a,a);
+}
+
+template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7> __forceinline const avxf permute(const avxf& a) {
+#ifdef __KERNEL_AVX2__
+	return  _mm256_permutevar8x32_ps(a,_mm256_set_epi32( i7,i6,i5,i4 ,i3,i2,i1,i0));
+#else
+	float temp[8];
+	_mm256_storeu_ps((float*)&temp, a);
+	return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
+#endif
+}
+
+template<int S0, int S1, int S2, int S3,int S4,int S5,int S6, int S7>
+ccl_device_inline const avxf set_sign_bit(const avxf &a)
+{
+	return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31,S2 << 31,S1 << 31,S0 << 31);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3,size_t S4,size_t S5,size_t S6, size_t S7>
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return _mm256_blend_ps(a,b,S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
+}
+
+template<size_t S0, size_t S1, size_t S2, size_t S3 >
+ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
+{
+	return blend<S0,S1,S2,S3,S0,S1,S2,S3>(a,b);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+/// Ternary Operators
+////////////////////////////////////////////////////////////////////////////////
+__forceinline const avxf madd (const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fmadd_ps(a,b,c);
+#else
+	return c+(a*b);
+#endif
+}
+
+__forceinline const avxf nmadd(const avxf& a, const avxf& b, const avxf& c) {
+#ifdef __KERNEL_AVX2__
+	return _mm256_fnmadd_ps(a, b, c);
+#else
+	return c-(a*b);
+#endif
+}
+#endif
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -233,7 +233,7 @@ ccl_device_inline int mod(int x, int m)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline bool is_zero(const float2 a)
+ccl_device_inline bool is_zero(const float2& a)
 {
 	return (a.x == 0.0f && a.y == 0.0f);
 }
@@ -242,7 +242,7 @@ ccl_device_inline bool is_zero(const float2 a)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float average(const float2 a)
+ccl_device_inline float average(const float2& a)
 {
 	return (a.x + a.y)*(1.0f/2.0f);
 }
@@ -251,58 +251,58 @@ ccl_device_inline float average(const float2 a)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float2 operator-(const float2 a)
+ccl_device_inline float2 operator-(const float2& a)
 {
 	return make_float2(-a.x, -a.y);
 }

-ccl_device_inline float2 operator*(const float2 a, const float2 b)
+ccl_device_inline float2 operator*(const float2& a, const float2& b)
 {
 	return make_float2(a.x*b.x, a.y*b.y);
 }

-ccl_device_inline float2 operator*(const float2 a, float f)
+ccl_device_inline float2 operator*(const float2& a, float f)
 {
 	return make_float2(a.x*f, a.y*f);
 }

-ccl_device_inline float2 operator*(float f, const float2 a)
+ccl_device_inline float2 operator*(float f, const float2& a)
 {
 	return make_float2(a.x*f, a.y*f);
 }

-ccl_device_inline float2 operator/(float f, const float2 a)
+ccl_device_inline float2 operator/(float f, const float2& a)
 {
 	return make_float2(f/a.x, f/a.y);
 }

-ccl_device_inline float2 operator/(const float2 a, float f)
+ccl_device_inline float2 operator/(const float2& a, float f)
 {
 	float invf = 1.0f/f;
 	return make_float2(a.x*invf, a.y*invf);
 }

-ccl_device_inline float2 operator/(const float2 a, const float2 b)
+ccl_device_inline float2 operator/(const float2& a, const float2& b)
 {
 	return make_float2(a.x/b.x, a.y/b.y);
 }

-ccl_device_inline float2 operator+(const float2 a, const float2 b)
+ccl_device_inline float2 operator+(const float2& a, const float2& b)
 {
 	return make_float2(a.x+b.x, a.y+b.y);
 }

-ccl_device_inline float2 operator-(const float2 a, const float2 b)
+ccl_device_inline float2 operator-(const float2& a, const float2& b)
 {
 	return make_float2(a.x-b.x, a.y-b.y);
 }

-ccl_device_inline float2 operator+=(float2& a, const float2 b)
+ccl_device_inline float2 operator+=(float2& a, const float2& b)
 {
 	return a = a + b;
 }

-ccl_device_inline float2 operator*=(float2& a, const float2 b)
+ccl_device_inline float2 operator*=(float2& a, const float2& b)
 {
 	return a = a * b;
 }
@@ -312,7 +312,7 @@ ccl_device_inline float2 operator*=(float2& a, float f)
 	return a = a * f;
 }

-ccl_device_inline float2 operator/=(float2& a, const float2 b)
+ccl_device_inline float2 operator/=(float2& a, const float2& b)
 {
 	return a = a / b;
 }
@@ -324,12 +324,12 @@ ccl_device_inline float2 operator/=(float2& a, float f)
 }


-ccl_device_inline float dot(const float2 a, const float2 b)
+ccl_device_inline float dot(const float2& a, const float2& b)
 {
 	return a.x*b.x + a.y*b.y;
 }

-ccl_device_inline float cross(const float2 a, const float2 b)
+ccl_device_inline float cross(const float2& a, const float2& b)
 {
 	return (a.x*b.y - a.y*b.x);
 }
@@ -343,59 +343,59 @@ ccl_device_inline bool operator==(const int2 a, const int2 b)
 	return (a.x == b.x && a.y == b.y);
 }

-ccl_device_inline float len(const float2 a)
+ccl_device_inline float len(const float2& a)
 {
 	return sqrtf(dot(a, a));
 }

-ccl_device_inline float2 normalize(const float2 a)
+ccl_device_inline float2 normalize(const float2& a)
 {
 	return a/len(a);
 }

-ccl_device_inline float2 normalize_len(const float2 a, float *t)
+ccl_device_inline float2 normalize_len(const float2& a, float *t)
 {
 	*t = len(a);
 	return a/(*t);
 }

-ccl_device_inline float2 safe_normalize(const float2 a)
+ccl_device_inline float2 safe_normalize(const float2& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }

-ccl_device_inline bool operator==(const float2 a, const float2 b)
+ccl_device_inline bool operator==(const float2& a, const float2& b)
 {
 	return (a.x == b.x && a.y == b.y);
 }

-ccl_device_inline bool operator!=(const float2 a, const float2 b)
+ccl_device_inline bool operator!=(const float2& a, const float2& b)
 {
 	return !(a == b);
 }

-ccl_device_inline float2 min(float2 a, float2 b)
+ccl_device_inline float2 min(const float2& a, const float2& b)
 {
 	return make_float2(min(a.x, b.x), min(a.y, b.y));
 }

-ccl_device_inline float2 max(float2 a, float2 b)
+ccl_device_inline float2 max(const float2& a, const float2& b)
 {
 	return make_float2(max(a.x, b.x), max(a.y, b.y));
 }

-ccl_device_inline float2 clamp(float2 a, float2 mn, float2 mx)
+ccl_device_inline float2 clamp(const float2& a, const float2& mn, const float2& mx)
 {
 	return min(max(a, mn), mx);
 }

-ccl_device_inline float2 fabs(float2 a)
+ccl_device_inline float2 fabs(const float2& a)
 {
 	return make_float2(fabsf(a.x), fabsf(a.y));
 }

-ccl_device_inline float2 as_float2(const float4 a)
+ccl_device_inline float2 as_float2(const float4& a)
 {
 	return make_float2(a.x, a.y);
 }
@@ -413,7 +413,7 @@ ccl_device_inline void print_float2(const char *label, const float2& a)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float2 interp(float2 a, float2 b, float t)
+ccl_device_inline float2 interp(const float2& a, const float2& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -424,58 +424,93 @@ ccl_device_inline float2 interp(float2 a, float2 b, float t)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float3 operator-(const float3 a)
+ccl_device_inline float3 operator-(const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
+#else
 	return make_float3(-a.x, -a.y, -a.z);
+#endif
 }

-ccl_device_inline float3 operator*(const float3 a, const float3 b)
+ccl_device_inline float3 operator*(const float3& a, const float3& b)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,b.m128));
+#else
 	return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+#endif
 }

-ccl_device_inline float3 operator*(const float3 a, float f)
+ccl_device_inline float3 operator*(const float3& a, const float f)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128,_mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }

-ccl_device_inline float3 operator*(float f, const float3 a)
+ccl_device_inline float3 operator*(const float f, const float3& a)
 {
+#ifdef __KERNEL_SSE__
+	return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
+#else
 	return make_float3(a.x*f, a.y*f, a.z*f);
+#endif
 }

-ccl_device_inline float3 operator/(float f, const float3 a)
+ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	return make_float3(f/a.x, f/a.y, f/a.z);
+	/* TODO(sergey): Currently disabled, gives speedup but makes intersection tests non-watertight. */
+// #ifdef __KERNEL_SSE__
+// 	__m128 rc = _mm_rcp_ps(a.m128);
+// 	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+// #else
+	return make_float3(f / a.x, f / a.y, f / a.z);
+// #endif
 }

-ccl_device_inline float3 operator/(const float3 a, float f)
+ccl_device_inline float3 operator/(const float3& a, const float f)
 {
 	float invf = 1.0f/f;
-	return make_float3(a.x*invf, a.y*invf, a.z*invf);
+	return a * invf;
 }

-ccl_device_inline float3 operator/(const float3 a, const float3 b)
+ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
+#ifdef __KERNEL_SSE__
+	/* Exact division: _mm_rcp_ps() is only a ~12 bit approximation and would diverge from the scalar path. */
+	return float3(_mm_div_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+#endif
 }

-ccl_device_inline float3 operator+(const float3 a, const float3 b)
+ccl_device_inline float3 operator+(const float3& a, const float3& b)
 {
-	return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_add_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+#endif
 }

-ccl_device_inline float3 operator-(const float3 a, const float3 b)
+ccl_device_inline float3 operator-(const float3& a, const float3& b)
 {
-	return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+#ifdef __KERNEL_SSE__
+	return float3(_mm_sub_ps(a.m128, b.m128));
+#else
+	return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+#endif
 }

-ccl_device_inline float3 operator+=(float3& a, const float3 b)
+ccl_device_inline float3 operator+=(float3& a, const float3& b)
 {
 	return a = a + b;
 }

-ccl_device_inline float3 operator*=(float3& a, const float3 b)
+ccl_device_inline float3 operator*=(float3& a, const float3& b)
 {
 	return a = a * b;
 }
@@ -485,7 +520,7 @@ ccl_device_inline float3 operator*=(float3& a, float f)
 	return a = a * f;
 }

-ccl_device_inline float3 operator/=(float3& a, const float3 b)
+ccl_device_inline float3 operator/=(float3& a, const float3& b)
 {
 	return a = a / b;
 }
@@ -496,7 +531,7 @@ ccl_device_inline float3 operator/=(float3& a, float f)
 	return a = a * invf;
 }

-ccl_device_inline float dot(const float3 a, const float3 b)
+ccl_device_inline float dot(const float3& a, const float3& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -505,7 +540,16 @@ ccl_device_inline float dot(const float3 a, const float3 b)
 #endif
 }

-ccl_device_inline float dot(const float4 a, const float4 b)
+ccl_device_inline float dot_xy(const float3& a, const float3& b)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+	return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a,b),b));
+#else
+	return a.x*b.x + a.y*b.y;
+#endif
+}
+
+ccl_device_inline float dot(const float4& a, const float4& b)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
@@ -514,7 +558,7 @@ ccl_device_inline float dot(const float4 a, const float4 b)
 #endif
 }

-ccl_device_inline float3 cross(const float3 a, const float3 b)
+ccl_device_inline float3 cross(const float3& a, const float3& b)
 {
 	float3 r = make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
 	return r;
@@ -538,12 +582,12 @@ ccl_device_inline float len_squared(const float3 a)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float len_squared(const float4 a)
+ccl_device_inline float len_squared(const float4& a)
 {
 	return dot(a, a);
 }

-ccl_device_inline float3 normalize(const float3 a)
+ccl_device_inline float3 normalize(const float3& a)
 {
 #if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
 	__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -563,13 +607,14 @@ ccl_device_inline float3 saturate3(float3 a)
 ccl_device_inline float3 normalize_len(const float3 a, float *t)
 {
 	*t = len(a);
-	return a/(*t);
+	float x = 1.0f / *t;
+	return a*x;
 }

 ccl_device_inline float3 safe_normalize(const float3 a)
 {
 	float t = len(a);
-	return (t != 0.0f)? a/t: a;
+	return (t != 0.0f)? a * (1.0f/t) : a;
 }

 ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)
@@ -580,7 +625,7 @@ ccl_device_inline float3 safe_normalize_len(const float3 a, float *t)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline bool operator==(const float3 a, const float3 b)
+ccl_device_inline bool operator==(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -589,12 +634,12 @@ ccl_device_inline bool operator==(const float3 a, const float3 b)
 #endif
 }

-ccl_device_inline bool operator!=(const float3 a, const float3 b)
+ccl_device_inline bool operator!=(const float3& a, const float3& b)
 {
 	return !(a == b);
 }

-ccl_device_inline float3 min(float3 a, float3 b)
+ccl_device_inline float3 min(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -603,7 +648,7 @@ ccl_device_inline float3 min(float3 a, float3 b)
 #endif
 }

-ccl_device_inline float3 max(float3 a, float3 b)
+ccl_device_inline float3 max(const float3& a, const float3& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -612,12 +657,12 @@ ccl_device_inline float3 max(float3 a, float3 b)
 #endif
 }

-ccl_device_inline float3 clamp(float3 a, float3 mn, float3 mx)
+ccl_device_inline float3 clamp(const float3& a, const float3& mn, const float3& mx)
 {
 	return min(max(a, mn), mx);
 }

-ccl_device_inline float3 fabs(float3 a)
+ccl_device_inline float3 fabs(const float3& a)
 {
 #ifdef __KERNEL_SSE__
 	__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
@@ -670,7 +715,7 @@ ccl_device_inline float3 interp(float3 a, float3 b, float t)

 #ifndef __KERNEL_OPENCL__

-ccl_device_inline float3 mix(float3 a, float3 b, float t)
+ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 {
 	return a + t*(b - a);
 }
@@ -833,7 +878,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
 #endif
 }

-ccl_device_inline int4 operator>=(float4 a, float4 b)
+ccl_device_inline int4 operator>=(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)); /* todo: avoid cvt */
@@ -851,7 +896,7 @@ ccl_device_inline int4 operator<=(const float4& a, const float4& b)
 #endif
 }

-ccl_device_inline bool operator==(const float4 a, const float4 b)
+ccl_device_inline bool operator==(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -893,23 +938,23 @@ ccl_device_inline float average(const float4& a)
 	return reduce_add(a) * 0.25f;
 }

-ccl_device_inline float len(const float4 a)
+ccl_device_inline float len(const float4& a)
 {
 	return sqrtf(dot(a, a));
 }

-ccl_device_inline float4 normalize(const float4 a)
+ccl_device_inline float4 normalize(const float4& a)
 {
 	return a/len(a);
 }

-ccl_device_inline float4 safe_normalize(const float4 a)
+ccl_device_inline float4 safe_normalize(const float4& a)
 {
 	float t = len(a);
 	return (t != 0.0f)? a/t: a;
 }

-ccl_device_inline float4 min(float4 a, float4 b)
+ccl_device_inline float4 min(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_min_ps(a.m128, b.m128);
@@ -918,7 +963,7 @@ ccl_device_inline float4 min(float4 a, float4 b)
 #endif
 }

-ccl_device_inline float4 max(float4 a, float4 b)
+ccl_device_inline float4 max(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
 	return _mm_max_ps(a.m128, b.m128);
@@ -1190,7 +1235,7 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)

 /* Triangle */

-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
+ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const float3& v3)
 {
 	return len(cross(v3 - v2, v1 - v2))*0.5f;
 }
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -778,7 +778,9 @@ static string line_directive(const string& path, int line)
 }


-string path_source_replace_includes(const string& source, const string& path)
+string path_source_replace_includes(const string& source,
+                                    const string& path,
+                                    const string& source_filename)
 {
 	/* Our own little c preprocessor that replaces #includes with the file
 	 * contents, to work around issue of opencl drivers not supporting
@@ -807,12 +809,12 @@ string path_source_replace_includes(const string& source, const string& path)
 						 * and avoids having list of include directories.x
 						 */
 						text = path_source_replace_includes(
-						        text, path_dirname(filepath));
-						text = path_source_replace_includes(text, path);
+						        text, path_dirname(filepath), filename);
+						text = path_source_replace_includes(text, path, filename);
 						/* Use line directives for better error messages. */
 						line = line_directive(filepath, 1)
 						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(path, i);
+						     + line_directive(path_join(path, source_filename), i);
 					}
 				}
 			}
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -66,7 +66,9 @@ bool path_read_text(const string& path, string& text);
 bool path_remove(const string& path);

 /* source code utility */
-string path_source_replace_includes(const string& source, const string& path);
+string path_source_replace_includes(const string& source,
+                                    const string& path,
+                                    const string& source_filename="");

 /* cache utility */
 void path_cache_clear_except(const string& name, const set<string>& except);
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -71,7 +71,7 @@ __forceinline operator          int      ( ) const { return std::numeric_limits<
 #define _lzcnt_u64 __lzcnt64
 #endif

-#if defined(_WIN32) && !defined(__MINGW32__)
+#if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)

 __forceinline int __popcnt(int in) {
  return _mm_popcnt_u32(in);
@@ -455,6 +455,7 @@ CCL_NAMESPACE_END
 #include "util_sseb.h"
 #include "util_ssei.h"
 #include "util_ssef.h"
+#include "util_avxf.h"

 #endif /* __UTIL_SIMD_TYPES_H__ */

--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -42,6 +42,7 @@

 #if defined(_WIN32) && !defined(FREE_WINDOWS)
 #define ccl_device_inline static __forceinline
+#define ccl_device_forceinline static __forceinline
 #define ccl_align(...) __declspec(align(__VA_ARGS__))
 #ifdef __KERNEL_64_BIT__
 #define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -56,6 +57,7 @@
 #else

 #define ccl_device_inline static inline __attribute__((always_inline))
+#define ccl_device_forceinline static inline __attribute__((always_inline))
 #define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
 #ifndef FREE_WINDOWS64
 #define __forceinline inline __attribute__((always_inline))
@@ -172,6 +174,9 @@ struct ccl_try_align(16) int3 {
 	__forceinline int3(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	__forceinline int3(const int3& a) : m128(a.m128) {}  /* copy the whole SSE register */
+	__forceinline int3& operator =(const int3& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -191,6 +196,9 @@ struct ccl_try_align(16) int4 {
 	__forceinline int4(const __m128i a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
+
+	int4(const int4& a) : m128(a.m128) {}
+	int4& operator=(const int4& a) { m128 = a.m128; return *this; }
 #else
 	int x, y, z, w;
 #endif
@@ -235,9 +243,12 @@ struct ccl_try_align(16) float3 {
 	};

 	__forceinline float3() {}
-	__forceinline float3(const __m128 a) : m128(a) {}
+	__forceinline float3(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float3(const float3& a) : m128(a.m128) {}
+	__forceinline float3& operator =(const float3& a) { m128 = a.m128; return *this; }
 #else
 	float x, y, z, w;
 #endif
@@ -257,6 +268,10 @@ struct ccl_try_align(16) float4 {
 	__forceinline float4(const __m128 a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
+
+	__forceinline float4(const float4& a) : m128(a.m128) {}
+	__forceinline float4& operator =(const float4& a) { m128 = a.m128; return *this; }
+
 #else
 	float x, y, z, w;
 #endif
--- a/intern/elbeem/intern/solver_class.h
+++ b/intern/elbeem/intern/solver_class.h
@@ -332,7 +332,7 @@ class LbmFsgrSolver :
 		void debugMarkCellCall(int level, int vi,int vj,int vk);
 		
 		// loop over grid, stream&collide update
-		void mainLoop(int lev);
+		void mainLoop(const int lev);
 		// change time step size
 		void adaptTimestep();
 		//! init mObjectSpeeds for current parametrization
--- a/intern/elbeem/intern/solver_main.cpp
+++ b/intern/elbeem/intern/solver_main.cpp
@@ -355,7 +355,7 @@ void LbmFsgrSolver::fineAdvance()
 //! fine step function
 /*****************************************************************************/
 void 
-LbmFsgrSolver::mainLoop(int lev)
+LbmFsgrSolver::mainLoop(const int lev)
 {
 	// loops over _only inner_ cells  -----------------------------------------------------------------------------------
 	
@@ -376,13 +376,16 @@ LbmFsgrSolver::mainLoop(int lev)
  // main loop region
 	const bool doReduce = true;
 	const int gridLoopBound=1;
+	int calcNumInvIfCells = 0;
+	LbmFloat calcInitialMass = 0;
 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
  reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
-		calcNumUsedCells )
+		calcNumUsedCells,calcNumInvIfCells,calcInitialMass)
 	GRID_REGION_START();
 #else // PARALLEL==1
 	GRID_REGION_START();
@@ -468,7 +471,7 @@ LbmFsgrSolver::mainLoop(int lev)
 				calcCurrentMass += iniRho; 
 				calcCurrentVolume += 1.0; 
 				calcNumUsedCells++;
-				mInitialMass += iniRho;
+				calcInitialMass += iniRho;
 				// dont treat cell until next step
 				continue;
 			} 
@@ -479,7 +482,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			if(isnotValid) {
 				// remove fluid cells, shouldnt be here anyway
 				LbmFloat fluidRho = m[0]; FORDF1 { fluidRho += m[l]; }
-				mInitialMass -= fluidRho;
+				calcInitialMass -= fluidRho;
 				const LbmFloat iniRho = 0.0;
 				RAC(tcel, dMass) = RAC(tcel, dFfrac) = iniRho;
 				RAC(tcel, dFlux) = FLUX_INIT;
@@ -608,8 +611,8 @@ LbmFsgrSolver::mainLoop(int lev)
 		// read distribution funtions of adjacent cells = stream step
 		DEFAULT_STREAM;

-		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; mNumInvIfCells++; }
-		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; mNumInvIfCells++; }
+		if((nbored & CFFluid)==0) { newFlag |= CFNoNbFluid; calcNumInvIfCells++; }
+		if((nbored & CFEmpty)==0) { newFlag |= CFNoNbEmpty; calcNumInvIfCells++; }

 		// calculate mass exchange for interface cells 
 		LbmFloat myfrac = RAC(ccel,dFfrac);
@@ -809,7 +812,7 @@ LbmFsgrSolver::mainLoop(int lev)
 			// fill if cells in inflow region
 			if(myfrac<0.5) { 
 				mass += 0.25; 
-				mInitialMass += 0.25;
+				calcInitialMass += 0.25;
 			}
 			const int OId = oldFlag>>24;
 			const LbmVec vel(mObjectSpeeds[OId]);
@@ -865,10 +868,8 @@ LbmFsgrSolver::mainLoop(int lev)
 			// physical drop model
 			if(mPartUsePhysModel) {
 				LbmFloat realWorldFac = (mLevel[lev].simCellSize / mLevel[lev].timestep);
-				LbmFloat rux = (ux * realWorldFac);
-				LbmFloat ruy = (uy * realWorldFac);
-				LbmFloat ruz = (uz * realWorldFac);
-				LbmFloat rl = norm(ntlVec3Gfx(rux,ruy,ruz));
+				LbmVec ru(ux * realWorldFac, uy * realWorldFac, uz * realWorldFac);
+				LbmFloat rl = norm(ru);
 				basethresh *= rl;

 				// reduce probability in outer region?
@@ -960,14 +961,15 @@ LbmFsgrSolver::mainLoop(int lev)
 				// average normal & velocity 
 				// -> mostly along velocity dir, many into surface
 				// fluid velocity (not normalized!)
-				LbmVec flvelVel = LbmVec(ux,uy,uz);
+				LbmVec flvelVel(ux,uy,uz);
 				LbmFloat flvelLen = norm(flvelVel);
 				// surface normal
-				LbmVec normVel = LbmVec(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
+				LbmVec normVel(surfaceNormal[0],surfaceNormal[1],surfaceNormal[2]);
 				normalize(normVel);
 				LbmFloat normScale = (0.01+flvelLen);
 				// jitter vector, 0.2 * flvel
-				LbmVec jittVel = LbmVec(jx,jy,jz)*(0.05+flvelLen)*0.1;
+				LbmVec jittVel(jx,jy,jz);
+				jittVel *= (0.05+flvelLen)*0.1;
 				// weighten velocities
 				const LbmFloat flvelWeight = 0.9;
 				LbmVec newpartVel = normVel*normScale*(1.-flvelWeight) + flvelVel*(flvelWeight) + jittVel; 
@@ -1013,7 +1015,7 @@ LbmFsgrSolver::mainLoop(int lev)
 		if( (mass) <= (rho * (   -FSGR_MAGICNR)) ) { ifemptied = 1; }

 		if(oldFlag & (CFMbndOutflow)) {
-			mInitialMass -= mass;
+			calcInitialMass -= mass;
 			mass = myfrac = 0.0;
 			iffilled = 0; ifemptied = 1;
 		}
@@ -1105,6 +1107,8 @@ LbmFsgrSolver::mainLoop(int lev)
 	mNumFilledCells  = calcCellsFilled;
 	mNumEmptiedCells = calcCellsEmptied;
 	mNumUsedCells = calcNumUsedCells;
+	mNumInvIfCells += calcNumInvIfCells;
+	mInitialMass += calcInitialMass;
 }


@@ -1121,7 +1125,8 @@ LbmFsgrSolver::preinitGrids()
 	
 		GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
  reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
@@ -1158,7 +1163,8 @@ LbmFsgrSolver::standingFluidPreinit()

 	GRID_REGION_INIT();
 #if PARALLEL==1
-#pragma omp parallel default(shared) num_threads(mNumOMPThreads) \
+	const int gDebugLevel = ::gDebugLevel;
+#pragma omp parallel default(none) num_threads(mNumOMPThreads) \
  reduction(+: \
 	  calcCurrentMass,calcCurrentVolume, \
 		calcCellsFilled,calcCellsEmptied, \
--- a/intern/ghost/intern/GHOST_NDOFManager.cpp
+++ b/intern/ghost/intern/GHOST_NDOFManager.cpp
@@ -324,14 +324,14 @@ bool GHOST_NDOFManager::setDevice(unsigned short vendor_id, unsigned short produ
 	return m_deviceType != NDOF_UnknownDevice;
 }

-void GHOST_NDOFManager::updateTranslation(const short t[3], GHOST_TUns64 time)
+void GHOST_NDOFManager::updateTranslation(const int t[3], GHOST_TUns64 time)
 {
 	memcpy(m_translation, t, sizeof(m_translation));
 	m_motionTime = time;
 	m_motionEventPending = true;
 }

-void GHOST_NDOFManager::updateRotation(const short r[3], GHOST_TUns64 time)
+void GHOST_NDOFManager::updateRotation(const int r[3], GHOST_TUns64 time)
 {
 	memcpy(m_rotation, r, sizeof(m_rotation));
 	m_motionTime = time;
--- a/intern/ghost/intern/GHOST_NDOFManager.h
+++ b/intern/ghost/intern/GHOST_NDOFManager.h
@@ -138,8 +138,8 @@ public:
 	//       rotations are + when CCW, - when CW
 	// each platform is responsible for getting axis data into this form
 	// these values should not be scaled (just shuffled or flipped)
-	void updateTranslation(const short t[3], GHOST_TUns64 time);
-	void updateRotation(const short r[3], GHOST_TUns64 time);
+	void updateTranslation(const int t[3], GHOST_TUns64 time);
+	void updateRotation(const int r[3], GHOST_TUns64 time);

 	// the latest raw button data from the device
 	// use HID button encoding (not NDOF_ButtonT)
@@ -163,8 +163,8 @@ private:
 	int m_buttonMask;
 	const NDOF_ButtonT *m_hidMap;

-	short m_translation[3];
-	short m_rotation[3];
+	int m_translation[3];
+	int m_rotation[3];
 	int m_buttons; // bit field

 	GHOST_TUns64 m_motionTime; // in milliseconds
--- a/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm
+++ b/intern/ghost/intern/GHOST_NDOFManagerCocoa.mm
@@ -200,8 +200,8 @@ static void DeviceEvent(uint32_t unused, uint32_t msg_type, void* msg_arg)
 				case kConnexionCmdHandleAxis:
 				{
 					// convert to blender view coordinates
-					const short t[3] = {s->axis[0], -(s->axis[2]), s->axis[1]};
-					const short r[3] = {-(s->axis[3]), s->axis[5], -(s->axis[4])};
+					const int t[3] = {s->axis[0], -(s->axis[2]), s->axis[1]};
+					const int r[3] = {-(s->axis[3]), s->axis[5], -(s->axis[4])};

 					ndof_manager->updateTranslation(t, now);
 					ndof_manager->updateRotation(r, now);
--- a/intern/ghost/intern/GHOST_NDOFManagerUnix.cpp
+++ b/intern/ghost/intern/GHOST_NDOFManagerUnix.cpp
@@ -107,8 +107,8 @@ bool GHOST_NDOFManagerUnix::processEvents()
 				{
 					/* convert to blender view coords */
 					GHOST_TUns64 now = m_system.getMilliSeconds();
-					const short t[3] = {(short)e.motion.x, (short)e.motion.y, (short)-e.motion.z};
-					const short r[3] = {(short)-e.motion.rx, (short)-e.motion.ry, (short)e.motion.rz};
+					const int t[3] = {(int)e.motion.x, (int)e.motion.y, (int)-e.motion.z};
+					const int r[3] = {(int)-e.motion.rx, (int)-e.motion.ry, (int)e.motion.rz};

 					updateTranslation(t, now);
 					updateRotation(r, now);
@@ -128,7 +128,7 @@ bool GHOST_NDOFManagerUnix::processEvents()
 #ifdef USE_FINISH_GLITCH_WORKAROUND
 		if (motion_test_prev == true && motion_test == false) {
 			GHOST_TUns64 now = m_system.getMilliSeconds();
-			const short v[3] = {0, 0, 0};
+			const int v[3] = {0, 0, 0};

 			updateTranslation(v, now);
 			updateRotation(v, now);
--- a/intern/ghost/intern/GHOST_SystemCocoa.mm
+++ b/intern/ghost/intern/GHOST_SystemCocoa.mm
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -712,18 +712,26 @@ GHOST_EventCursor *GHOST_SystemWin32::processCursorEvent(GHOST_TEventType type,
 }


-GHOST_EventWheel *GHOST_SystemWin32::processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam)
+void GHOST_SystemWin32::processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam)
 {
-	// short fwKeys = LOWORD(wParam);			// key flags
-	int zDelta = (short) HIWORD(wParam);    // wheel rotation
-	
-	// zDelta /= WHEEL_DELTA;
-	// temporary fix below: microsoft now has added more precision, making the above division not work
-	zDelta = (zDelta <= 0) ? -1 : 1;
+	GHOST_SystemWin32 *system = (GHOST_SystemWin32 *)getSystem();

-	// short xPos = (short) LOWORD(lParam);	// horizontal position of pointer
-	// short yPos = (short) HIWORD(lParam);	// vertical position of pointer
-	return new GHOST_EventWheel(getSystem()->getMilliSeconds(), window, zDelta);
+	int acc = system->m_wheelDeltaAccum;
+	int delta = GET_WHEEL_DELTA_WPARAM(wParam);
+	
+	if (acc * delta < 0) {
+		// scroll direction reversed.
+		acc = 0;
+	}
+	acc += delta;
+	int direction = (acc >= 0) ? 1 : -1;
+	acc = abs(acc);
+	
+	while (acc >= WHEEL_DELTA) {
+		system->pushEvent(new GHOST_EventWheel(system->getMilliSeconds(), window, direction));
+		acc -= WHEEL_DELTA;
+	}
+	system->m_wheelDeltaAccum = acc * direction;
 }


@@ -871,12 +879,12 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const &raw)
 		{
 			const short *axis = (short *)(data + 1);
 			// massage into blender view coords (same goes for rotation)
-			const short t[3] = {axis[0], -axis[2], axis[1]};
+			const int t[3] = {axis[0], -axis[2], axis[1]};
 			m_ndofManager->updateTranslation(t, now);

 			if (raw.data.hid.dwSizeHid == 13) {
 				// this report also includes rotation
-				const short r[3] = {-axis[3], axis[5], -axis[4]};
+				const int r[3] = {-axis[3], axis[5], -axis[4]};
 				m_ndofManager->updateRotation(r, now);

 				// I've never gotten one of these, has anyone else?
@@ -887,7 +895,7 @@ bool GHOST_SystemWin32::processNDOF(RAWINPUT const &raw)
 		case 2: // rotation
 		{
 			const short *axis = (short *)(data + 1);
-			const short r[3] = {-axis[0], axis[2], -axis[1]};
+			const int r[3] = {-axis[0], axis[2], -axis[1]};
 			m_ndofManager->updateRotation(r, now);
 			break;
 		}
@@ -1137,14 +1145,9 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 					POINT mouse_pos = {GET_X_LPARAM(lParam), GET_Y_LPARAM(lParam)};
 					HWND mouse_hwnd = ChildWindowFromPoint(HWND_DESKTOP, mouse_pos);
 					GHOST_WindowWin32 *mouse_window = (GHOST_WindowWin32 *)::GetWindowLongPtr(mouse_hwnd, GWLP_USERDATA);
-					if (mouse_window != NULL) {
-						event = processWheelEvent(mouse_window, wParam, lParam);
-					}
-					else {
-						/* Happens when mouse is not over any of blender's windows. */
-						event = processWheelEvent(window, wParam, lParam);
-					}
-
+					
+					processWheelEvent(mouse_window ? mouse_window : window , wParam, lParam);
+					eventHandled = true;
 #ifdef BROKEN_PEEK_TOUCHPAD
 					PostMessage(hwnd, WM_USER, 0, 0);
 #endif
@@ -1203,6 +1206,7 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 					GHOST_ModifierKeys modifiers;
 					modifiers.clear();
 					system->storeModifierKeys(modifiers);
+					system->m_wheelDeltaAccum = 0;
 					event = processWindowEvent(LOWORD(wParam) ? GHOST_kEventWindowActivate : GHOST_kEventWindowDeactivate, window);
 					/* WARNING: Let DefWindowProc handle WM_ACTIVATE, otherwise WM_MOUSEWHEEL
 					 * will not be dispatched to OUR active window if we minimize one of OUR windows. */
--- a/intern/ghost/intern/GHOST_SystemWin32.h
+++ b/intern/ghost/intern/GHOST_SystemWin32.h
@@ -264,12 +264,12 @@ protected:
 	static GHOST_EventCursor *processCursorEvent(GHOST_TEventType type, GHOST_WindowWin32 *window);

 	/**
-	 * Creates a mouse wheel event.
+	 * Handles a mouse wheel event.
 	 * \param window	The window receiving the event (the active window).
 	 * \param wParam	The wParam from the wndproc
 	 * \param lParam	The lParam from the wndproc
 	 */
-	static GHOST_EventWheel *processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam);
+	static void processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam);

 	/**
 	 * Creates a key event and updates the key data stored locally (m_modifierKeys).
@@ -376,6 +376,9 @@ protected:

 	/** Console status */
 	int m_consoleStatus;
+
+	/** Wheel delta accumulator **/
+	int m_wheelDeltaAccum;
 };

 inline void GHOST_SystemWin32::retrieveModifierKeys(GHOST_ModifierKeys& keys) const
--- a/intern/mikktspace/mikktspace.c
+++ b/intern/mikktspace/mikktspace.c
@@ -579,13 +579,16 @@ static void MergeVertsFast(int piTriList_in_and_out[], STmpVert pTmpVert[], cons
 {
 	// make bbox
 	int c=0, l=0, channel=0;
-	float fvMin[3] = {INFINITY, INFINITY, INFINITY};
-	float fvMax[3] = {-INFINITY, -INFINITY, -INFINITY};
+	float fvMin[3], fvMax[3];
 	float dx=0, dy=0, dz=0, fSep=0;
-	for (l=iL_in; l<=iR_in; l++)
-		for (c=0; c<3; c++)
+	for (c=0; c<3; c++)
+	{	fvMin[c]=pTmpVert[iL_in].vert[c]; fvMax[c]=fvMin[c];	}
+	for (l=(iL_in+1); l<=iR_in; l++) {
+		for (c=0; c<3; c++) {
 			if (fvMin[c]>pTmpVert[l].vert[c]) fvMin[c]=pTmpVert[l].vert[c];
-			else if (fvMax[c]<pTmpVert[l].vert[c]) fvMax[c]=pTmpVert[l].vert[c];
+			if (fvMax[c]<pTmpVert[l].vert[c]) fvMax[c]=pTmpVert[l].vert[c];
+		}
+	}

 	dx = fvMax[0]-fvMin[0];
 	dy = fvMax[1]-fvMin[1];
--- a/make.bat
+++ b/make.bat
@@ -6,7 +6,17 @@ setlocal ENABLEEXTENSIONS
 set BLENDER_DIR=%~dp0
 set BUILD_DIR=%BLENDER_DIR%..\build_windows
 set BUILD_TYPE=Release
+rem reset all variables so they do not accidentally get carried over from previous builds
 set BUILD_CMAKE_ARGS=
+set BUILD_ARCH=
+set BUILD_VS_VER=
+set BUILD_VS_YEAR=
+set KEY_NAME=
+set MSBUILD_PLATFORM=
+set MUST_CLEAN=
+set NOBUILD=
+set TARGET=
+set WINDOWS_ARCH=

 :argv_loop
 if NOT "%1" == "" (
@@ -18,54 +28,34 @@ if NOT "%1" == "" (

 	REM Build Types
 	if "%1" == "debug" (
-		set BUILD_DIR=%BUILD_DIR%_debug
 		set BUILD_TYPE=Debug
-
 	REM Build Configurations
 	) else if "%1" == "full" (
-		set TARGET_SET=1
-		set BUILD_DIR=%BUILD_DIR%_full
+		set TARGET=Full
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
 		    -C"%BLENDER_DIR%\build_files\cmake\config\blender_full.cmake"
 	) else if "%1" == "lite" (
-		set TARGET_SET=1
-		set BUILD_DIR=%BUILD_DIR%_lite
+		set TARGET=Lite
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
 		    -C"%BLENDER_DIR%\build_files\cmake\config\blender_lite.cmake"
 	) else if "%1" == "cycles" (
-		set TARGET_SET=1
-		set BUILD_DIR=%BUILD_DIR%_cycles
+		set TARGET=Cycles
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
 		    -C"%BLENDER_DIR%\build_files\cmake\config\cycles_standalone.cmake"
 	) else if "%1" == "headless" (
-		set TARGET_SET=1
-		set BUILD_DIR=%BUILD_DIR%_headless
+		set TARGET=Headless
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
 		    -C"%BLENDER_DIR%\build_files\cmake\config\blender_headless.cmake"
 	) else if "%1" == "bpy" (
-		set TARGET_SET=1
-		set BUILD_DIR=%BUILD_DIR%_bpy
+		set TARGET=Bpy
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
 		    -C"%BLENDER_DIR%\build_files\cmake\config\bpy_module.cmake"
 	) else if "%1" == "release" (
-		set TARGET_SET=1
-		if "%CUDA_PATH_V7_5%"=="" (
-			echo Cuda 7.5 Not found, aborting!
-			goto EOF
-		)
-		if "%CUDA_PATH_V8_0%"=="" (
-			echo Cuda 8.0 Not found, aborting!
-			goto EOF
-		)
-		set BUILD_DIR=%BUILD_DIR%_Release
-		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
-		    -C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake" -DCUDA_NVCC_EXECUTABLE:FILEPATH=%CUDA_PATH_V7_5%/bin/nvcc.exe -DCUDA_NVCC8_EXECUTABLE:FILEPATH=%CUDA_PATH_V8_0%/bin/nvcc.exe  
+		set TARGET=Release
 	)	else if "%1" == "x86" (
 		set BUILD_ARCH=x86
-		set BUILD_DIR=%BUILD_DIR%_x86
 	)	else if "%1" == "x64" (
 		set BUILD_ARCH=x64
-		set BUILD_DIR=%BUILD_DIR%_x64
 	)	else if "%1" == "2015" (
 	set BUILD_VS_VER=14
 	set BUILD_VS_YEAR=2015
@@ -105,10 +95,13 @@ if NOT "%1" == "" (
 if "%BUILD_ARCH%"=="" (
 	if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
 		set WINDOWS_ARCH= Win64
+		set BUILD_ARCH=x64
 	) else if "%PROCESSOR_ARCHITEW6432%" == "AMD64" (
 		set WINDOWS_ARCH= Win64
+		set BUILD_ARCH=x64
 	) else (
 		set WINDOWS_ARCH=
+		set BUILD_ARCH=x86
 	)
 ) else if "%BUILD_ARCH%"=="x64" (
 		set WINDOWS_ARCH= Win64
@@ -121,8 +114,27 @@ if "%BUILD_VS_VER%"=="" (
 	set BUILD_VS_YEAR=2013
 )

-set BUILD_DIR=%BUILD_DIR%_vc%BUILD_VS_VER%
+if "%BUILD_ARCH%"=="x64" (
+	set MSBUILD_PLATFORM=x64
+	) else if "%BUILD_ARCH%"=="x86" (
+		set MSBUILD_PLATFORM=win32
+)

+
+set BUILD_DIR=%BUILD_DIR%_%TARGET%_%BUILD_ARCH%_vc%BUILD_VS_VER%_%BUILD_TYPE%
+
+
+if "%target%"=="Release" (
+		rem the release target needs the cuda toolkit, only CUDA_PATH is checked here
+		if "%CUDA_PATH%"=="" (
+			echo Cuda Not found, aborting!
+			goto EOF
+		)
+		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
+		-C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake" 
+)
+
+:DetectMSVC
 REM Detect MSVC Installation
 if DEFINED VisualStudioVersion goto msvc_detect_finally
 set VALUE_NAME=ProductDir
@@ -140,10 +152,18 @@ if DEFINED MSVC_VC_DIR call "%MSVC_VC_DIR%\vcvarsall.bat"
 REM Sanity Checks
 where /Q msbuild
 if %ERRORLEVEL% NEQ 0 (
-	echo Error: "MSBuild" command not in the PATH.
-	echo You must have MSVC installed and run this from the "Developer Command Prompt"
-	echo ^(available from Visual Studio's Start menu entry^), aborting!
-	goto EOF
+	if "%BUILD_VS_VER%"=="12" (
+		rem vc12 is Visual Studio 2013, retry detection with vc14. NOTE: BUILD_DIR was already named with vc12
+		echo Visual Studio 2013 not found, trying Visual Studio 2015.
+		set BUILD_VS_VER=14
+		set BUILD_VS_YEAR=2015
+		goto DetectMSVC
+	)	else	(
+		echo Error: "MSBuild" command not in the PATH.
+		echo You must have MSVC installed and run this from the "Developer Command Prompt"
+		echo ^(available from Visual Studio's Start menu entry^), aborting!
+		goto EOF
+	)
 )
 where /Q cmake
 if %ERRORLEVEL% NEQ 0 (
@@ -156,7 +176,7 @@ if NOT EXIST %BLENDER_DIR%..\lib\nul (
 	echo This is needed for building, aborting!
 	goto EOF
 )
-if NOT "%TARGET_SET%"=="1" (
+if "%TARGET%"=="" (
 	echo Error: Convenience target not set
 	echo This is required for building, aborting!
 	echo . 
@@ -173,7 +193,9 @@ if "%MUST_CLEAN%"=="1" (
 			%BUILD_DIR%\Blender.sln ^
 			/target:clean ^
 			/property:Configuration=%BUILD_TYPE% ^
-			/verbosity:minimal
+			/verbosity:minimal ^
+			/p:platform=%MSBUILD_PLATFORM%
+
 		if %ERRORLEVEL% NEQ 0 (
 			echo Cleaned "%BUILD_DIR%"
 		)
@@ -202,7 +224,8 @@ msbuild ^
 	/target:build ^
 	/property:Configuration=%BUILD_TYPE% ^
 	/maxcpucount ^
-	/verbosity:minimal
+	/verbosity:minimal ^
+	/p:platform=%MSBUILD_PLATFORM%

 if %ERRORLEVEL% NEQ 0 (
 	echo "Build Failed"
@@ -212,7 +235,8 @@ if %ERRORLEVEL% NEQ 0 (
 msbuild ^
 	%BUILD_DIR%\INSTALL.vcxproj ^
 	/property:Configuration=%BUILD_TYPE% ^
-	/verbosity:minimal
+	/verbosity:minimal ^
+	/p:platform=%MSBUILD_PLATFORM%

 echo.
 echo At any point you can optionally modify your build configuration by editing:
@@ -224,10 +248,9 @@ goto EOF
 :HELP
 		echo.
 		echo Convenience targets
-		echo - release 
-		echo - debug
-		echo - full
-		echo - lite
+		echo - release ^(identical to the official blender.org builds^)
+		echo - full ^(same as release minus the cuda kernels^)
+		echo - lite 
 		echo - headless
 		echo - cycles
 		echo - bpy
@@ -239,6 +262,7 @@ goto EOF
 		echo - showhash ^(Show git hashes of source tree^)
 		echo.
 		echo Configuration options
+		echo - debug ^(Build an unoptimized debuggable build^)
 		echo - packagename [newname] ^(override default cpack package name^)
 		echo - x86 ^(override host autodetect and build 32 bit code^)
 		echo - x64 ^(override host autodetect and build 64 bit code^)
--- a/release/scripts/modules/bl_i18n_utils/utils_spell_check.py
+++ b/release/scripts/modules/bl_i18n_utils/utils_spell_check.py
@@ -163,6 +163,7 @@ class SpellChecker:
        "runtime",
        "scanline",
        "screencast", "screenshot", "screenshots",
+        "seekability",
        "selfcollision",
        "shadowbuffer", "shadowbuffers",
        "singletexture",
@@ -184,6 +185,7 @@ class SpellChecker:
        "timestamp", "timestamps",
        "timestep", "timesteps",
        "todo",
+        "tradeoff",
        "un",
        "unbake",
        "uncomment",
--- a/release/scripts/modules/bl_previews_utils/bl_previews_render.py
+++ b/release/scripts/modules/bl_previews_utils/bl_previews_render.py
@@ -236,8 +236,8 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
        return success

    def objects_render_engine_guess(obs):
-        for obname in obs:
-            ob = bpy.data.objects[obname, None]
+        for obname, libpath in obs:
+            ob = bpy.data.objects[obname, libpath]
            for matslot in ob.material_slots:
                mat = matslot.material
                if mat and mat.use_nodes and mat.node_tree:
@@ -247,10 +247,20 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
        return 'BLENDER_RENDER'

    def object_bbox_merge(bbox, ob, ob_space, offset_matrix):
-        if ob.bound_box:
+        # Take group instances into account (including linked one in this case).
+        if ob.type == 'EMPTY' and ob.dupli_type == 'GROUP':
+            grp_objects = tuple((ob.name, ob.library.filepath if ob.library else None) for ob in ob.dupli_group.objects)
+            if (len(grp_objects) == 0):
+                ob_bbox = ob.bound_box
+            else:
+                coords = objects_bbox_calc(ob_space, grp_objects,
+                                           Matrix.Translation(ob.dupli_group.dupli_offset).inverted())
+                ob_bbox = ((coords[0], coords[1], coords[2]), (coords[21], coords[22], coords[23]))
+        elif ob.bound_box:
            ob_bbox = ob.bound_box
        else:
            ob_bbox = ((-ob.scale.x, -ob.scale.y, -ob.scale.z), (ob.scale.x, ob.scale.y, ob.scale.z))
+
        for v in ob_bbox:
            v = offset_matrix * Vector(v) if offset_matrix is not None else Vector(v)
            v = ob_space.matrix_world.inverted() * ob.matrix_world * v
@@ -269,8 +279,8 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):

    def objects_bbox_calc(camera, objects, offset_matrix):
        bbox = (Vector((1e9, 1e9, 1e9)), Vector((-1e9, -1e9, -1e9)))
-        for obname in objects:
-            ob = bpy.data.objects[obname, None]
+        for obname, libpath in objects:
+            ob = bpy.data.objects[obname, libpath]
            object_bbox_merge(bbox, ob, camera, offset_matrix)
        # Our bbox has been generated in camera local space, bring it back in world one
        bbox[0][:] = camera.matrix_world * bbox[0]
@@ -333,7 +343,7 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
                continue
            if root.type not in OBJECT_TYPES_RENDER:
                continue
-            objects = (root.name,)
+            objects = ((root.name, None),)

            render_engine = objects_render_engine_guess(objects)
            render_context = render_contexts.get(render_engine, None)
@@ -344,8 +354,8 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
            scene = bpy.data.scenes[render_context.scene, None]
            bpy.context.screen.scene = scene

-            for obname in objects:
-                ob = bpy.data.objects[obname, None]
+            for obname, libpath in objects:
+                ob = bpy.data.objects[obname, libpath]
                if obname not in scene.objects:
                    scene.objects.link(ob)
                ob.hide_render = False
@@ -363,8 +373,8 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
            #         OverflowError: Python int too large to convert to C long
            #    ... :(
            scene = bpy.data.scenes[render_context.scene, None]
-            for obname in objects:
-                ob = bpy.data.objects[obname, None]
+            for obname, libpath in objects:
+                ob = bpy.data.objects[obname, libpath]
                scene.objects.unlink(ob)
                ob.hide_render = True

@@ -377,7 +387,8 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
        for grp in ids_nolib(bpy.data.groups):
            if grp.name in groups_ignored:
                continue
-            objects = tuple(ob.name for ob in grp.objects)
+            # Here too, we do want to keep linked objects members of local group...
+            objects = tuple((ob.name, ob.library.filepath if ob.library else None) for ob in grp.objects)

            render_engine = objects_render_engine_guess(objects)
            render_context = render_contexts.get(render_engine, None)
@@ -415,7 +426,7 @@ def do_previews(do_objects, do_groups, do_scenes, do_data_intern):
            objects = None
            if not has_camera:
                # We had to add a temp camera, now we need to place it to see interesting objects!
-                objects = tuple(ob.name for ob in scene.objects
+                objects = tuple((ob.name, ob.library.filepath if ob.library else None) for ob in scene.objects
                                        if (not ob.hide_render) and (ob.type in OBJECT_TYPES_RENDER))

            preview_render_do(render_context, 'scenes', scene.name, objects)
--- a/release/scripts/modules/bpy_extras/keyconfig_utils.py
+++ b/release/scripts/modules/bpy_extras/keyconfig_utils.py
@@ -136,6 +136,7 @@ KM_HIERARCHY = [

    ('Standard Modal Map', 'EMPTY', 'WINDOW', []),
    ('Transform Modal Map', 'EMPTY', 'WINDOW', []),
+    ('Eyedropper Modal Map', 'EMPTY', 'WINDOW', []),
 ]


--- a/release/scripts/modules/bpy_extras/object_utils.py
+++ b/release/scripts/modules/bpy_extras/object_utils.py
@@ -145,8 +145,12 @@ def object_data_add(context, obdata, operator=None, use_active_layer=True, name=
                base.layers_from_view(context.space_data)
                base.layers[scene.active_layer] = True
            else:
-                base.layers = [True if i == scene.active_layer
-                               else False for i in range(len(scene.layers))]
+                if v3d and not v3d.lock_camera_and_layers:
+                    base.layers = [True if i == v3d.active_layer
+                                   else False for i in range(len(v3d.layers))]
+                else:
+                    base.layers = [True if i == scene.active_layer
+                                   else False for i in range(len(scene.layers))]
        else:
            if v3d:
                base.layers_from_view(context.space_data)
--- a/release/scripts/presets/interface_theme/back_to_black.xml
+++ b/release/scripts/presets/interface_theme/back_to_black.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#354d66"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#b1b1b1"
-                        active_file_text="#ffffff">
+      <ThemeFileBrowser selected_file="#354d66">
        <space>
          <ThemeSpaceGeneric back="#000000"
                             title="#5d5d5d"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#181818"
-                                 list_title="#9e9e9e"
-                                 list_text="#5d5d5d"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/blender_24x.xml
+++ b/release/scripts/presets/interface_theme/blender_24x.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#a0a0d0"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#828282"
-                        active_file_text="#ffffff">
+      <ThemeFileBrowser selected_file="#a0a0d0">
        <space>
          <ThemeSpaceGeneric back="#808080"
                             title="#000000"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#b4b4b4"
-                                 list_title="#000000"
-                                 list_text="#000000"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/elsyiun.xml
+++ b/release/scripts/presets/interface_theme/elsyiun.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#755129"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#828282"
-                        active_file_text="#fafafa">
+      <ThemeFileBrowser selected_file="#755129">
        <space>
          <ThemeSpaceGeneric back="#4b4b4b"
                             title="#8b8b8b"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#3b3b3b"
-                                 list_title="#8b8b8b"
-                                 list_text="#8b8b8b"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/flatty_light.xml
+++ b/release/scripts/presets/interface_theme/flatty_light.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#ff8c19"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#828282"
-                        active_file_text="#fafafa">
+      <ThemeFileBrowser selected_file="#ff8c19">
        <space>
          <ThemeSpaceGeneric back="#404040"
                             title="#000000"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#808080"
-                                 list_title="#000000"
-                                 list_text="#000000"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/graph.xml
+++ b/release/scripts/presets/interface_theme/graph.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#607f9e"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#ffffff"
-                        active_file_text="#ffffff">
+      <ThemeFileBrowser selected_file="#607f9e">
        <space>
          <ThemeSpaceGeneric back="#4b4b4b"
                             title="#e4e4e4"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#3b3b3b"
-                                 list_title="#8b8b8b"
-                                 list_text="#8b8b8b"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/hexagon.xml
+++ b/release/scripts/presets/interface_theme/hexagon.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#69a5be"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#859cb9"
-                        active_file_text="#fafafa">
+      <ThemeFileBrowser selected_file="#69a5be">
        <space>
          <ThemeSpaceGeneric back="#646875"
                             title="#000000"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#6c717f"
-                                 list_title="#d7d7d7"
-                                 list_text="#d7d7d7"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/rtheme.xml
+++ b/release/scripts/presets/interface_theme/rtheme.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#ff8c19"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#828282"
-                        active_file_text="#fafafa">
+      <ThemeFileBrowser selected_file="#ff8c19">
        <space>
          <ThemeSpaceGeneric back="#9098a0"
                             title="#000000"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#484848"
-                                 list_title="#000000"
-                                 list_text="#000000"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/release/scripts/presets/interface_theme/science_lab.xml
+++ b/release/scripts/presets/interface_theme/science_lab.xml
@@ -398,11 +398,7 @@
      </ThemeGraphEditor>
    </graph_editor>
    <file_browser>
-      <ThemeFileBrowser selected_file="#517da1"
-                        scrollbar="#a0a0a0"
-                        scroll_handle="#7f7070"
-                        active_file="#828282"
-                        active_file_text="#fafafa">
+      <ThemeFileBrowser selected_file="#517da1">
        <space>
          <ThemeSpaceGeneric back="#363636"
                             title="#646464"
@@ -428,13 +424,6 @@
            </panelcolors>
          </ThemeSpaceGeneric>
        </space>
-        <space_list>
-          <ThemeSpaceListGeneric list="#444444"
-                                 list_title="#c0c0c0"
-                                 list_text="#999999"
-                                 list_text_hi="#ffffff">
-          </ThemeSpaceListGeneric>
-        </space_list>
      </ThemeFileBrowser>
    </file_browser>
    <nla_editor>
--- a/Show More
+++ b/Show More