Animation: Move Graph Editor settings to User Preferences #104532

Merged
Christoph Lendenfeld merged 14 commits from ChrisLend/blender:user_pref_only_selected_keys into main 2023-03-09 14:15:36 +01:00
902 changed files with 40225 additions and 28773 deletions
Showing only changes of commit c0c36f0a2b

View File

@ -0,0 +1,5 @@
${CommitTitle}
${CommitBody}
Pull Request #${PullRequestIndex}

View File

@ -0,0 +1,3 @@
${PullRequestTitle}
Pull Request #${PullRequestIndex}

View File

@ -15,7 +15,3 @@ body:
attributes:
label: "Description"
hide_label: true
value: |
Description of the problem that is addressed in the patch.
Description of the proposed solution and its implementation.

View File

@ -1,5 +1,4 @@
This repository is only used as a mirror of git.blender.org. Blender development happens on
https://developer.blender.org.
This repository is only used as a mirror. Blender development happens on projects.blender.org.
To get started with contributing code, please see:
https://wiki.blender.org/wiki/Process/Contributing_Code

.github/stale.yml vendored
View File

@ -15,8 +15,7 @@ staleLabel: stale
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
This issue has been automatically closed, because this repository is only
used as a mirror of git.blender.org. Blender development happens on
developer.blender.org.
used as a mirror. Blender development happens on projects.blender.org.
To get started contributing code, please read:
https://wiki.blender.org/wiki/Process/Contributing_Code

.gitmodules vendored
View File

@ -1,20 +1,16 @@
[submodule "release/scripts/addons"]
path = release/scripts/addons
url = ../blender-addons.git
branch = master
ignore = all
branch = main
[submodule "release/scripts/addons_contrib"]
path = release/scripts/addons_contrib
url = ../blender-addons-contrib.git
branch = master
ignore = all
branch = main
[submodule "release/datafiles/locale"]
path = release/datafiles/locale
url = ../blender-translations.git
branch = master
ignore = all
branch = main
[submodule "source/tools"]
path = source/tools
url = ../blender-dev-tools.git
branch = master
ignore = all
branch = main

View File

@ -1241,13 +1241,6 @@ if(WITH_OPENGL)
add_definitions(-DWITH_OPENGL)
endif()
#-----------------------------------------------------------------------------
# Configure Vulkan.
if(WITH_VULKAN_BACKEND)
list(APPEND BLENDER_GL_LIBRARIES ${VULKAN_LIBRARIES})
endif()
# -----------------------------------------------------------------------------
# Configure Metal

View File

@ -71,6 +71,13 @@ Static Source Code Checking
* check_mypy: Checks all Python scripts using mypy,
see: source/tools/check_source/check_mypy_config.py scripts which are included.
Documentation Checking
* check_wiki_file_structure:
Check that the WIKI documentation of the source-tree's file structure
matches Blender's source code.
See: https://wiki.blender.org/wiki/Source/File_Structure
Spell Checkers
This runs the spell checker from the developer tools repository.
@ -481,6 +488,10 @@ check_smatch: .FORCE
check_mypy: .FORCE
@$(PYTHON) "$(BLENDER_DIR)/source/tools/check_source/check_mypy.py"
check_wiki_file_structure: .FORCE
@PYTHONIOENCODING=utf_8 $(PYTHON) \
"$(BLENDER_DIR)/source/tools/check_wiki/check_wiki_file_structure.py"
check_spelling_py: .FORCE
@cd "$(BUILD_DIR)" ; \
PYTHONIOENCODING=utf_8 $(PYTHON) \

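For illustration, a minimal Python sketch of what the new check_wiki_file_structure target runs (assumptions: the script path and PYTHONIOENCODING handling are taken from the Makefile lines above; the run_wiki_check helper name is hypothetical):

import os
import subprocess
import sys

def run_wiki_check(blender_dir: str) -> None:
    # Force UTF-8 I/O, mirroring the PYTHONIOENCODING=utf_8 prefix in the make target.
    env = dict(os.environ, PYTHONIOENCODING="utf_8")
    script = os.path.join(blender_dir, "source", "tools", "check_wiki", "check_wiki_file_structure.py")
    subprocess.run([sys.executable, script], env=env, check=True)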
View File

@ -24,7 +24,7 @@ Development
-----------
- [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://developer.blender.org)
- [Code Review & Bug Tracker](https://projects.blender.org)
- [Developer Forum](https://devtalk.blender.org)
- [Developer Documentation](https://wiki.blender.org)

View File

@ -2,7 +2,7 @@
# LLVM does not switch over to cpp17 until llvm 16 and building earlier versions with
# MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
# for now.
# for now.
string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " DPCPP_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")
if(WIN32)

View File

@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
)
add_dependencies(
external_igc_llvm
@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
CONFIGURE_COMMAND echo .
BUILD_COMMAND echo .
INSTALL_COMMAND echo .
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
)
add_dependencies(
external_igc_spirv_translator

View File

@ -42,7 +42,7 @@ endif()
# LLVM does not switch over to cpp17 until llvm 16 and building earlier versions with
# MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
# for now.
# for now.
string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " LLVM_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")
# short project name due to long filename issues on windows

View File

@ -88,6 +88,19 @@ else()
export LDFLAGS=${PYTHON_LDFLAGS} &&
export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)
# NOTE: untested on APPLE so far.
if(NOT APPLE)
set(PYTHON_CONFIGURE_EXTRA_ARGS
${PYTHON_CONFIGURE_EXTRA_ARGS}
# Used on most release Linux builds (Fedora for e.g.),
# increases build times noticeably with the benefit of a modest speedup at runtime.
--enable-optimizations
# While LTO is OK when building on the same system, it's incompatible across GCC versions,
# making it impractical for developers to build against, so keep it disabled.
# `--with-lto`
)
endif()
ExternalProject_Add(external_python
URL file://${PACKAGE_DIR}/${PYTHON_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}

View File

@ -10,9 +10,9 @@ if(WIN32)
DOWNLOAD_DIR ${DOWNLOAD_DIR}
URL_HASH ${SSL_HASH_TYPE}=${SSL_HASH}
PREFIX ${BUILD_DIR}/ssl
CONFIGURE_COMMAND echo "."
BUILD_COMMAND echo "."
INSTALL_COMMAND echo "."
CONFIGURE_COMMAND echo "."
BUILD_COMMAND echo "."
INSTALL_COMMAND echo "."
INSTALL_DIR ${LIBDIR}/ssl
)
else()
@ -46,4 +46,4 @@ else()
INSTALL_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/ssl/src/external_ssl/ && make install
INSTALL_DIR ${LIBDIR}/ssl
)
endif()
endif()

View File

@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
# compiler, the versions used are taken from the following location
# https://github.com/intel/intel-graphics-compiler/releases
set(IGC_VERSION 1.0.12149.1)
set(IGC_VERSION 1.0.13064.7)
set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
set(IGC_HASH_TYPE SHA256)
set(IGC_FILE igc-${IGC_VERSION}.tar.gz)
@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
#
# WARNING WARNING WARNING
set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)
set(IGC_VCINTRINSICS_VERSION v0.5.0)
set(IGC_VCINTRINSICS_VERSION v0.11.0)
set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)
@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
### Intel Graphics Compiler DEPS END ###
########################################
set(GMMLIB_VERSION intel-gmmlib-22.1.8)
set(GMMLIB_VERSION intel-gmmlib-22.3.0)
set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
set(GMMLIB_HASH_TYPE SHA256)
set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
set(OCLOC_VERSION 22.38.24278)
set(OCLOC_VERSION 22.49.25018.21)
set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
set(OCLOC_HASH_TYPE SHA256)
set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-or-later
if(WIN32)
set(XML2_EXTRA_ARGS
set(XML2_EXTRA_ARGS
-DLIBXML2_WITH_ZLIB=OFF
-DLIBXML2_WITH_LZMA=OFF
-DLIBXML2_WITH_PYTHON=OFF

View File

@ -1,7 +1,7 @@
diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
--- external_igc_opencl_clang.orig/CMakeLists.txt 2022-03-16 05:51:10 -0600
+++ external_igc_opencl_clang/CMakeLists.txt 2022-05-23 10:40:09 -0600
@@ -126,22 +126,24 @@
@@ -147,22 +147,24 @@
)
endif()

View File

@ -24,7 +24,7 @@ SET(_moltenvk_SEARCH_DIRS
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/vulkan/MoltenVK)
SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/moltenvk)
ENDIF()
FIND_PATH(MOLTENVK_INCLUDE_DIR

View File

@ -0,0 +1,63 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2023 Blender Foundation.
# - Find ShaderC libraries
# Find the ShaderC includes and libraries
# This module defines
# SHADERC_INCLUDE_DIRS, where to find ShaderC headers, Set when
# SHADERC_INCLUDE_DIR is found.
# SHADERC_LIBRARIES, libraries to link against to use ShaderC.
# SHADERC_ROOT_DIR, The base directory to search for ShaderC.
# This can also be an environment variable.
# SHADERC_FOUND, If false, do not try to use ShaderC.
#
# If SHADERC_ROOT_DIR was defined in the environment, use it.
IF(NOT SHADERC_ROOT_DIR AND NOT $ENV{SHADERC_ROOT_DIR} STREQUAL "")
SET(SHADERC_ROOT_DIR $ENV{SHADERC_ROOT_DIR})
ENDIF()
SET(_shaderc_SEARCH_DIRS
${SHADERC_ROOT_DIR}
)
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_shaderc_SEARCH_DIRS ${_shaderc_SEARCH_DIRS} ${LIBDIR}/shaderc)
ENDIF()
FIND_PATH(SHADERC_INCLUDE_DIR
NAMES
shaderc/shaderc.h
HINTS
${_shaderc_SEARCH_DIRS}
PATH_SUFFIXES
include
)
FIND_LIBRARY(SHADERC_LIBRARY
NAMES
shaderc_combined
HINTS
${_shaderc_SEARCH_DIRS}
PATH_SUFFIXES
lib
)
# handle the QUIETLY and REQUIRED arguments and set SHADERC_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(ShaderC DEFAULT_MSG SHADERC_LIBRARY SHADERC_INCLUDE_DIR)
IF(SHADERC_FOUND)
SET(SHADERC_LIBRARIES ${SHADERC_LIBRARY})
SET(SHADERC_INCLUDE_DIRS ${SHADERC_INCLUDE_DIR})
ENDIF()
MARK_AS_ADVANCED(
SHADERC_INCLUDE_DIR
SHADERC_LIBRARY
)
UNSET(_shaderc_SEARCH_DIRS)

View File

@ -0,0 +1,63 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2023 Blender Foundation.
# - Find Vulkan libraries
# Find the Vulkan includes and libraries
# This module defines
# VULKAN_INCLUDE_DIRS, where to find Vulkan headers, Set when
# VULKAN_INCLUDE_DIR is found.
# VULKAN_LIBRARIES, libraries to link against to use Vulkan.
# VULKAN_ROOT_DIR, The base directory to search for Vulkan.
# This can also be an environment variable.
# VULKAN_FOUND, If false, do not try to use Vulkan.
#
# If VULKAN_ROOT_DIR was defined in the environment, use it.
IF(NOT VULKAN_ROOT_DIR AND NOT $ENV{VULKAN_ROOT_DIR} STREQUAL "")
SET(VULKAN_ROOT_DIR $ENV{VULKAN_ROOT_DIR})
ENDIF()
SET(_vulkan_SEARCH_DIRS
${VULKAN_ROOT_DIR}
)
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_vulkan_SEARCH_DIRS ${_vulkan_SEARCH_DIRS} ${LIBDIR}/vulkan)
ENDIF()
FIND_PATH(VULKAN_INCLUDE_DIR
NAMES
vulkan/vulkan.h
HINTS
${_vulkan_SEARCH_DIRS}
PATH_SUFFIXES
include
)
FIND_LIBRARY(VULKAN_LIBRARY
NAMES
vulkan
HINTS
${_vulkan_SEARCH_DIRS}
PATH_SUFFIXES
lib
)
# handle the QUIETLY and REQUIRED arguments and set VULKAN_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Vulkan DEFAULT_MSG VULKAN_LIBRARY VULKAN_INCLUDE_DIR)
IF(VULKAN_FOUND)
SET(VULKAN_LIBRARIES ${VULKAN_LIBRARY})
SET(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR})
ENDIF()
MARK_AS_ADVANCED(
VULKAN_INCLUDE_DIR
VULKAN_LIBRARY
)
UNSET(_vulkan_SEARCH_DIRS)

View File

@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)
if(MY_WC_BRANCH STREQUAL "HEAD")
# Detached HEAD, check whether commit hash is reachable
# in the master branch
# in the main branch
execute_process(COMMAND git rev-parse --short=12 HEAD
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE MY_WC_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE _git_contains_check
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _git_contains_check STREQUAL "")
set(MY_WC_BRANCH "master")
set(MY_WC_BRANCH "main")
else()
execute_process(COMMAND git show-ref --tags -d
WORKING_DIRECTORY ${SOURCE_DIR}
@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(_git_tag_hashes MATCHES "${_git_head_hash}")
set(MY_WC_BRANCH "master")
set(MY_WC_BRANCH "main")
else()
execute_process(COMMAND git branch --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}

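A rough Python equivalent of the branch detection above, as a hypothetical standalone helper (it is not part of the build and is simplified: the CMake script additionally falls back to tag and other-branch checks, and the rev-parse --abbrev-ref call is an assumption, not taken from the diff):

import subprocess

def resolve_branch(source_dir: str) -> str:
    def git(*args: str) -> str:
        return subprocess.check_output(["git", *args], cwd=source_dir, text=True).strip()

    branch = git("rev-parse", "--abbrev-ref", "HEAD")
    if branch != "HEAD":
        return branch
    # Detached HEAD: report "main" when the commit is reachable from main or a release branch.
    commit = git("rev-parse", "--short=12", "HEAD")
    contained = git("branch", "--list", "main", "blender-v*", "--contains", commit)
    return "main" if contained else "HEAD"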
View File

@ -11,11 +11,11 @@
mkdir ~/blender-git
cd ~/blender-git
git clone http://git.blender.org/blender.git
git clone https://projects.blender.org/blender/blender.git
cd blender
git submodule update --init --recursive
git submodule foreach git checkout master
git submodule foreach git pull --rebase origin master
git submodule foreach git checkout main
git submodule foreach git pull --rebase origin main
# create build dir
mkdir ~/blender-git/build-cmake
@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
echo ""
echo "* Useful Commands *"
echo " Run Blender: ~/blender-git/blender/blender.bin"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
echo " Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
echo " Build Blender: cd ~/blender-git/build-cmake ; make"
echo ""

View File

@ -97,20 +97,8 @@ add_bundled_libraries(materialx/lib)
if(WITH_VULKAN_BACKEND)
find_package(MoltenVK REQUIRED)
if(EXISTS ${LIBDIR}/vulkan)
set(VULKAN_FOUND On)
set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan/macOS)
set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/libvulkan.1.dylib)
set(SHADERC_LIBRARY ${VULKAN_ROOT_DIR}/lib/libshaderc_combined.a)
set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR} ${MOLTENVK_INCLUDE_DIRS})
set(VULKAN_LIBRARIES ${VULKAN_LIBRARY} ${SHADERC_LIBRARY} ${MOLTENVK_LIBRARIES})
else()
message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
set(WITH_VULKAN_BACKEND OFF)
endif()
find_package(ShaderC REQUIRED)
find_package(Vulkan REQUIRED)
endif()
if(WITH_OPENSUBDIV)

View File

@ -111,6 +111,7 @@ find_package_wrapper(Epoxy REQUIRED)
if(WITH_VULKAN_BACKEND)
find_package_wrapper(Vulkan REQUIRED)
find_package_wrapper(ShaderC REQUIRED)
endif()
function(check_freetype_for_brotli)

View File

@ -5,16 +5,16 @@
update-code:
git:
submodules:
- branch: master
- branch: main
commit_id: HEAD
path: release/scripts/addons
- branch: master
- branch: main
commit_id: HEAD
path: release/scripts/addons_contrib
- branch: master
- branch: main
commit_id: HEAD
path: release/datafiles/locale
- branch: master
- branch: main
commit_id: HEAD
path: source/tools
svn:

View File

@ -24,7 +24,7 @@ import os
import re
import platform
import string
import setuptools # type: ignore
import setuptools
import sys
from typing import (
@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
## Source Code
* [Releases](https://download.blender.org/source/)
* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)
## Credits
@ -208,7 +208,7 @@ def main() -> None:
return paths
# Ensure this wheel is marked platform specific.
class BinaryDistribution(setuptools.dist.Distribution): # type: ignore
class BinaryDistribution(setuptools.dist.Distribution):
def has_ext_modules(self) -> bool:
return True

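A hedged sketch of why the BinaryDistribution override above exists: reporting extension modules forces setuptools to tag the wheel as platform-specific instead of pure Python (the commented setup() call is a placeholder, not Blender's actual packaging configuration):

import setuptools

class BinaryDistribution(setuptools.dist.Distribution):
    def has_ext_modules(self) -> bool:
        # Pretend compiled extension modules are present so the wheel is tagged per platform.
        return True

# Example usage (placeholder metadata):
# setuptools.setup(name="example", version="0.0.1", distclass=BinaryDistribution)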
View File

@ -13,10 +13,10 @@ import sys
import make_utils
from make_utils import call
# Parse arguments
# Parse arguments.
def parse_arguments():
def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--ctest-command", default="ctest")
parser.add_argument("--cmake-command", default="cmake")

View File

@ -170,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
return "rebase or merge in progress, complete it first"
# Abort if uncommitted changes.
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
if len(changes) != 0:
return "you have unstaged changes"
@ -202,8 +202,8 @@ def submodules_update(
sys.exit(1)
# Update submodules to appropriate given branch,
# falling back to master if none is given and/or found in a sub-repository.
branch_fallback = "master"
# falling back to main if none is given and/or found in a sub-repository.
branch_fallback = "main"
if not branch:
branch = branch_fallback

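As a rough illustration of the updated uncommitted-changes check, a standalone sketch using plain subprocess rather than the make_utils helpers (the function name is hypothetical):

import subprocess

def has_local_changes(git_command: str = "git") -> bool:
    # --ignore-submodules was added above so submodule drift no longer blocks the update.
    out = subprocess.check_output(
        [git_command, "status", "--porcelain", "--untracked-files=no", "--ignore-submodules"])
    return len(out.strip()) != 0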
View File

@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
if not "%GIT%" == "" (
"%GIT%" submodule update --init --recursive --progress
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git checkout master
"%GIT%" submodule foreach git checkout main
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git pull --rebase origin master
"%GIT%" submodule foreach git pull --rebase origin main
if errorlevel 1 goto FAIL
goto EOF
) else (

View File

@ -37,7 +37,7 @@ def draw_callback_px(self, context):
# BLF drawing routine
font_id = font_info["font_id"]
blf.position(font_id, 2, 80, 0)
blf.size(font_id, 50, 72)
blf.size(font_id, 50)
blf.draw(font_id, "Hello World")

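A minimal standalone sketch of the updated call, assuming the current blf API in which the DPI argument has been removed from blf.size(); like the template above, it is meant to run inside a draw callback:

import blf

font_id = 0  # 0 is the default built-in font
blf.position(font_id, 2, 80, 0)
blf.size(font_id, 50)  # only the point size; the old third DPI argument (72) is gone
blf.draw(font_id, "Hello World")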
View File

@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):
# operators
def write_ops():
API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"
op_modules = {}
op = None

View File

@ -156,7 +156,7 @@ var Popover = function() {
},
getNamed : function(v) {
$.each(all_versions, function(ix, title) {
if (ix === "master" || ix === "latest") {
if (ix === "master" || ix === "main" || ix === "latest") {
var m = title.match(/\d\.\d[\w\d\.]*/)[0];
if (parseFloat(m) == v) {
v = ix;

View File

@ -1,5 +1,5 @@
Project: Blender
URL: https://git.blender.org/blender.git
URL: https://projects.blender.org/blender/blender.git
License: Apache 2.0
Upstream version: N/A
Local modifications: None

View File

@ -7,6 +7,7 @@ set(INC
set(INC_SYS
${VULKAN_INCLUDE_DIRS}
${MOLTENVK_INCLUDE_DIRS}
)
set(SRC

View File

@ -0,0 +1,15 @@
diff --git a/extern/vulkan_memory_allocator/vk_mem_alloc.h b/extern/vulkan_memory_allocator/vk_mem_alloc.h
index 60f572038c0..63a9994ba46 100644
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
@@ -13371,8 +13371,8 @@ bool VmaDefragmentationContext_T::IncrementCounters(VkDeviceSize bytes)
// Early return when max found
if (++m_PassStats.allocationsMoved >= m_MaxPassAllocations || m_PassStats.bytesMoved >= m_MaxPassBytes)
{
- VMA_ASSERT(m_PassStats.allocationsMoved == m_MaxPassAllocations ||
- m_PassStats.bytesMoved == m_MaxPassBytes && "Exceeded maximal pass threshold!");
+ VMA_ASSERT((m_PassStats.allocationsMoved == m_MaxPassAllocations ||
+ m_PassStats.bytesMoved == m_MaxPassBytes) && "Exceeded maximal pass threshold!");
return true;
}
return false;

File diff suppressed because it is too large

View File

@ -12,6 +12,7 @@ from bpy.props import (
PointerProperty,
StringProperty,
)
from bpy.app.translations import pgettext_iface as iface_
from math import pi
@ -1664,30 +1665,48 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.label(text="No compatible GPUs found for Cycles", icon='INFO')
if device_type == 'CUDA':
col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
compute_capability = "3.0"
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
icon='BLANK1', translate=False)
elif device_type == 'OPTIX':
col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
compute_capability = "5.0"
driver_version = "470"
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
icon='BLANK1', translate=False)
col.label(text="and NVIDIA driver version %s or newer" % driver_version,
icon='BLANK1', translate=False)
elif device_type == 'HIP':
import sys
if sys.platform[:3] == "win":
driver_version = "21.Q4"
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
icon='BLANK1', translate=False)
elif sys.platform.startswith("linux"):
driver_version = "22.10"
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
translate=False)
elif device_type == 'ONEAPI':
import sys
if sys.platform.startswith("win"):
driver_version = "101.4032"
col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
col.label(text="and Windows driver version 101.4032 or newer", icon='BLANK1')
col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
icon='BLANK1', translate=False)
elif sys.platform.startswith("linux"):
driver_version = "1.3.24931"
col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
col.label(text=" - intel-level-zero-gpu version 1.3.24931 or newer", icon='BLANK1')
col.label(text=iface_(" - intel-level-zero-gpu version %s or newer") % driver_version,
icon='BLANK1', translate=False)
col.label(text=" - oneAPI Level-Zero Loader", icon='BLANK1')
elif device_type == 'METAL':
col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
silicon_mac_version = "12.2"
amd_mac_version = "12.3"
col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
icon='BLANK1', translate=False)
col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
translate=False)
return
for device in devices:
@ -1723,12 +1742,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
if compute_device_type == 'METAL':
import platform
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
if platform.machine() == 'arm64':
import re
is_navi_2 = False
for device in devices:
if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
is_navi_2 = True
break
# MetalRT only works on Apple Silicon and Navi2.
is_arm64 = platform.machine() == 'arm64'
if is_arm64 or is_navi_2:
col = layout.column()
col.use_property_split = True
col.prop(self, "kernel_optimization_level")
# Kernel specialization is only supported on Apple Silicon
if is_arm64:
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
def draw(self, context):

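The label changes above all follow one pattern: translate the format string with pgettext_iface, interpolate the untranslated value, then pass translate=False so the result is not translated a second time. A minimal sketch of that pattern (the function name and default value are illustrative only):

from bpy.app.translations import pgettext_iface as iface_

def draw_compute_requirement(col, compute_capability="3.0"):
    # Translate first, then substitute, and disable automatic label translation.
    col.label(
        text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
        icon='BLANK1',
        translate=False,
    )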
View File

@ -48,6 +48,8 @@ void BlenderSync::sync_light(BL::Object &b_parent,
case BL::Light::type_SPOT: {
BL::SpotLight b_spot_light(b_light);
light->set_size(b_spot_light.shadow_soft_size());
light->set_axisu(transform_get_column(&tfm, 0));
light->set_axisv(transform_get_column(&tfm, 1));
light->set_light_type(LIGHT_SPOT);
light->set_spot_angle(b_spot_light.spot_size());
light->set_spot_smooth(b_spot_light.spot_blend());

View File

@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
first_error = true;
cuDevId = info.num;
@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
int value;
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
can_map_host = value != 0;
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
void CUDADevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep is free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(cuda_mem_map_mutex);
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another devices would only be caught in this mutex
* if it so happens to do an allocation at the same time as well. */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if (mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
}
if (mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(cuda_mem_map_mutex);
CUDAMem *cmem = &cuda_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A Call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void CUDADevice::generic_copy_to(device_memory &mem)
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
CUDAContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::generic_free(device_memory &mem)
void CUDADevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
CUDAContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
cuMemFreeHost(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuda_assert(cuMemFree(mem.device_pointer));
}
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
CUresult mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
return mem_alloc_result == CUDA_SUCCESS;
}
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
void CUDADevice::free_host(void *shared_pointer)
{
CUDAContextScope scope(this);
cuMemFreeHost(shared_pointer);
}
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
CUDAContextScope scope(this);
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
return true;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
void CUDADevice::mem_alloc(device_memory &mem)
@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}
CUDAMem *cmem = NULL;
Mem *cmem = NULL;
CUarray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class CUDADevice : public Device {
class CUDADevice : public GPUDevice {
friend class CUDAContextScope;
@ -29,36 +29,11 @@ class CUDADevice : public Device {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}
CUtexObject texobject;
CUarray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
CUDADeviceKernels kernels;
static bool have_precompiled_kernels();
@ -88,17 +63,13 @@ class CUDADevice : public Device {
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}
GPUDevice::~GPUDevice() noexcept(false)
{
}
bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}
void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory headroom lower than the texture headroom so that some
* space is left after all texture memory allocations. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another devices would only be caught in this mutex
* if it so happens to do an allocation at the same time as well. */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
bool mem_alloc_result = false;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
get_device_memory_info(total, free);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);
assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}
if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}
if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A Call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
}
}
void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}
/* DeviceInfo */
CCL_NAMESPACE_END

View File

@ -309,6 +309,93 @@ class Device {
static uint devices_initialized_mask;
};
/* A GPU device, with common functionality shared between GPU backends. */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}
public:
virtual ~GPUDevice() noexcept(false);
/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();
protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;
bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}
texMemObject texobject;
arrayMemObject array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter that tracks the amount of device memory currently in use. */
size_t device_mem_in_use;
virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);
/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);
/* total - amount of device memory, free - amount of available device memory */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
virtual void free_device(void *device_pointer) = 0;
virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
virtual void free_host(void *shared_pointer) = 0;
/* This function should return the device pointer corresponding to the shared pointer, which
* is a host buffer allocated in `alloc_host`. The function should return `true` if such an
* address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_H__ */
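For orientation, a hypothetical backend only has to fill in the memory hooks on top of GPUDevice. The FooDevice class and foo* driver calls below are made-up placeholders (a real backend also implements the rest of the Device interface); this is a sketch, not part of the patch:

/* Hypothetical backend, for illustration only. */
class FooDevice : public GPUDevice {
 public:
  FooDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      : GPUDevice(info, stats, profiler)
  {
  }

 protected:
  void get_device_memory_info(size_t &total, size_t &free) override
  {
    /* Used by generic_alloc() to decide between device and mapped-host memory. */
    fooMemGetInfo(&free, &total);
  }
  bool alloc_device(void *&device_pointer, size_t size) override
  {
    return fooMalloc(&device_pointer, size) == FOO_SUCCESS;
  }
  void free_device(void *device_pointer) override
  {
    fooFree(device_pointer);
  }
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    return fooHostMalloc(&shared_pointer, size) == FOO_SUCCESS;
  }
  void free_host(void *shared_pointer) override
  {
    fooHostFree(shared_pointer);
  }
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    /* Return the device-visible address of the mapped host buffer. */
    return fooHostGetDevicePointer(&device_pointer, shared_pointer) == FOO_SUCCESS;
  }
  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    fooMemcpyHtoD(device_pointer, host_pointer, size);
  }
};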

View File

@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));
first_error = true;
hipDevId = info.num;
@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize HIP. */
@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;
hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}
void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep is free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
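This limit computation now lives in GPUDevice::init_host_memory; its effect is easiest to see with concrete numbers. The helper below is a hypothetical standalone sketch (not part of the patch), with illustrative RAM sizes in the comment:

#include <cstddef>

size_t compute_map_host_limit(size_t system_ram)
{
  const size_t default_limit = 4 * 1024 * 1024 * 1024LL;
  if (system_ram == 0) {
    return 0; /* Unknown RAM size: mapped host memory stays disabled. */
  }
  /* E.g. 16 GiB of RAM -> 12 GiB limit, 6 GiB of RAM -> 3 GiB limit. */
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}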
void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another devices would only be caught in this mutex
* if it so happens to do an allocation at the same time as well. */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
{
HIPContextScope scope(this);
hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}
if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A Call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void HIPDevice::generic_copy_to(device_memory &mem)
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
HIPContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
}
void HIPDevice::generic_free(device_memory &mem)
void HIPDevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
HIPContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
hip_mem_map.erase(hip_mem_map.find(&mem));
}
return mem_alloc_result == hipSuccess;
}
void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);
hipHostFree(shared_pointer);
}
bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);
hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}
void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);
hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
}
void HIPDevice::mem_alloc(device_memory &mem)
@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}
HIPMem *cmem = NULL;
Mem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(cmem.array);
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();

View File

@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public Device {
class HIPDevice : public GPUDevice {
friend class HIPContextScope;
@ -26,36 +26,11 @@ class HIPDevice : public Device {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
@ -81,17 +56,13 @@ class HIPDevice : public Device {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;

View File

@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

View File

@ -247,6 +247,8 @@ class device_memory {
bool is_resident(Device *sub_device) const;
protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;

View File

@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;

View File

@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// Allocate lookup buffer for the worst-case scenario.
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;

View File

@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
devices.push_back(info);
device_index++;
}

View File

@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@ -105,6 +110,8 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

View File

@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// Prepare the BLAS argument encoder.
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// allocate required buffers for BLAS array
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT->blas_lookup.data(),
blas_lookup_buffer.allocatedSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();
@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

View File

@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all Accelerations resources as used */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
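The threadgroup-memory sizing for the two new sort kernels is easy to check in isolation. The helper below is a hypothetical sketch with made-up shader counts in the comment:

#include <cassert>
#include <cstddef>

/* Sketch of the sizing rule used above (illustrative only). */
size_t sort_kernel_threadgroup_bytes(const size_t max_shaders)
{
  /* round_up(key_count * sizeof(int), 16), matching the Metal queue code. */
  const size_t bytes = (max_shaders * sizeof(int) + 15) & ~size_t(15);
  assert(bytes <= 32 * 1024); /* E.g. 512 shaders -> 2048 bytes; the 32 KiB cap is reached at 8192 shaders. */
  return bytes;
}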

View File

@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
}
# endif

View File

@ -377,7 +377,7 @@ void OneapiDevice::tex_alloc(device_texture &mem)
generic_alloc(mem);
generic_copy_to(mem);
/* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
const uint slot = mem.slot;
if (slot >= texture_info_.size()) {
texture_info_.resize(slot + 128);

View File

@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();

View File

@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

View File

@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
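As a quick sanity check of the two allocation paths, the sizes compare as sketched below; the shader and partition counts are made up for illustration and not from the patch:

/* Illustrative sizing only. */
#include <cstdio>

int main()
{
  const int max_shaders = 64, num_sort_partitions = 4;
  /* Local-atomic path: one offset per shader plus one total-count slot per partition. */
  const int key_offsets = (max_shaders + 1) * num_sort_partitions;
  /* Counter path: one bucket per shader per partition, plus a prefix-sum array of the same size. */
  const int sort_buckets = max_shaders * num_sort_partitions;
  printf("%d key offsets vs %d sort buckets\n", key_offsets, sort_buckets); /* 260 vs 256 */
  return 0;
}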
@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}
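The dispatch sizes for the bucket and write passes can be sketched the same way: GPU_PARALLEL_SORT_BLOCK_SIZE is 1024, so each partition is handled by exactly one threadgroup. The values below are illustrative, not from the patch:

/* Illustrative dispatch sizing. */
#include <cstdio>

int main()
{
  const int max_num_paths = 1 << 20, num_sort_partitions = 16;
  /* Matches integrator_state_gpu_.sort_partition_divisor = divide_up(max_num_paths_, num_sort_partitions_). */
  const int partition_size = (max_num_paths + num_sort_partitions - 1) / num_sort_partitions;
  const int work_size = 1024 * num_sort_partitions; /* One 1024-thread group per partition. */
  printf("partition_size=%d, sort threads=%d\n", partition_size, work_size);
  return 0;
}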

View File

@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

View File

@ -661,4 +661,38 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to
* be below 1. The point of this function is to return a best-effort estimation of their albedo,
* meaning the amount of reflected/refracted light that would be expected when illuminated by a
* uniform white background.
* This is used for the denoising albedo pass and diffuse/glossy/transmission color passes.
* NOTE: This should always match the sample_weight of the closure - as in, if there's an albedo
* adjustment in here, the sample_weight should also be reduced accordingly.
* TODO(lukas): Consider calling this function to determine the sample_weight? Would be a bit of
* extra overhead though. */
#if defined(__SVM__) || defined(__OSL__)
switch (sc->type) {
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
albedo *= microfacet_fresnel((ccl_private const MicrofacetBsdf *)sc, sd->wi, sc->N);
break;
case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
break;
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
albedo *= bsdf_principled_hair_albedo(sc);
break;
default:
break;
}
#endif
return albedo;
}
CCL_NAMESPACE_END

View File

@ -23,8 +23,6 @@ enum MicrofacetType {
typedef struct MicrofacetExtra {
Spectrum color, cspec0;
Spectrum fresnel_color;
float clearcoat;
} MicrofacetExtra;
typedef struct MicrofacetBsdf {
@ -184,26 +182,25 @@ ccl_device_forceinline float3 microfacet_ggx_sample_vndf(const float3 wi,
*
* Else it is simply white
*/
ccl_device_forceinline Spectrum reflection_color(ccl_private const MicrofacetBsdf *bsdf,
float3 L,
float3 H)
ccl_device_forceinline Spectrum microfacet_fresnel(ccl_private const MicrofacetBsdf *bsdf,
float3 wi,
float3 H)
{
Spectrum F = one_spectrum();
bool use_clearcoat = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || use_clearcoat);
if (use_fresnel) {
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type)) {
return interpolate_fresnel_color(wi, H, bsdf->ior, bsdf->extra->cspec0);
}
if (use_clearcoat) {
F *= 0.25f * bsdf->extra->clearcoat;
else if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
return make_spectrum(fresnel_dielectric_cos(dot(wi, H), bsdf->ior));
}
else {
return one_spectrum();
}
}
return F;
ccl_device_forceinline void bsdf_microfacet_adjust_weight(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
bsdf->sample_weight *= average(microfacet_fresnel(bsdf, sd->wi, bsdf->N));
}
/* Generalized Trowbridge-Reitz for clearcoat. */
@ -292,22 +289,6 @@ ccl_device_inline float bsdf_aniso_D(float alpha_x, float alpha_y, float3 H)
}
}
ccl_device_forceinline void bsdf_microfacet_fresnel_color(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
kernel_assert(CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type));
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
bsdf->extra->fresnel_color = interpolate_fresnel_color(
sd->wi, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0);
if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
bsdf->extra->fresnel_color *= 0.25f * bsdf->extra->clearcoat;
}
bsdf->sample_weight *= average(bsdf->extra->fresnel_color);
}
template<MicrofacetType m_type>
ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
const float3 Ng,
@ -380,8 +361,7 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
*pdf = common / (1.0f + lambdaI);
const Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, wo, H);
const Spectrum F = microfacet_fresnel(bsdf, wo, H);
return F * common / (1.0f + lambdaO + lambdaI);
}
@ -463,14 +443,7 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
label |= LABEL_SINGULAR;
/* Some high number for MIS. */
*pdf = 1e6f;
*eval = make_spectrum(1e6f);
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
if (use_fresnel && !m_refractive) {
*eval *= reflection_color(bsdf, *wo, H);
}
*eval = make_spectrum(1e6f) * microfacet_fresnel(bsdf, *wo, H);
}
else {
label |= LABEL_GLOSSY;
@ -511,8 +484,7 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
*pdf = common / (1.0f + lambdaI);
Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, *wo, H);
Spectrum F = microfacet_fresnel(bsdf, *wo, H);
*eval = F * common / (1.0f + lambdaI + lambdaO);
}
@ -547,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
@ -565,7 +529,7 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@ -573,14 +537,12 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
ccl_device int bsdf_microfacet_ggx_clearcoat_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
bsdf->alpha_x = saturatef(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@ -643,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);

View File

@ -401,7 +401,7 @@ ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(ccl_private MicrofacetBsd
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@ -575,7 +575,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}

View File

@ -73,9 +73,8 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
eval = make_spectrum(val);
#endif
float F0 = fresnel_dielectric_cos(1.0f, eta);
if (use_fresnel) {
throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
throughput = interpolate_fresnel_color(wi, wh, eta, cspec0);
eval *= throughput;
}
@ -144,11 +143,11 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
throughput *= color;
}
else if (use_fresnel && order > 0) {
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel && order > 0) {
throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
@ -192,8 +191,6 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
float G1_r = 0.0f;
bool outside = true;
float F0 = fresnel_dielectric_cos(1.0f, eta);
int order;
for (order = 0; order < 10; order++) {
/* Sample microfacet height. */
@ -229,22 +226,12 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
throughput *= color;
}
else {
Spectrum t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
if (order == 0)
throughput = t_color;
else
throughput *= t_color;
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
}
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel) {
Spectrum t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
if (order == 0)
throughput = t_color;
else
throughput *= t_color;
throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif

View File

@ -89,19 +89,21 @@ ccl_device float schlick_fresnel(float u)
return m2 * m2 * m; // pow(m, 5)
}
/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
ccl_device_forceinline Spectrum
interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, Spectrum cspec0)
/* Calculate the fresnel color, which is a blend between white and the F0 color */
ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
float3 H,
float ior,
Spectrum F0)
{
/* Calculate the fresnel interpolation factor
* The value from fresnel_dielectric_cos(...) has to be normalized because
* the cspec0 keeps the F0 color
*/
float F0_norm = 1.0f / (1.0f - F0);
float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
/* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
* The reason why we use this remapping instead of directly doing the
* Schlick approximation lerp(F0, 1.0, (1.0-cosLH)^5) is that for cases
* with similar IORs (e.g. ice in water), the relative IOR can be close
* enough to 1.0 that the Schlick approximation becomes inaccurate. */
float real_F = fresnel_dielectric_cos(dot(L, H), ior);
float real_F0 = fresnel_dielectric_cos(1.0f, ior);
/* Blend between white and a specular color with respect to the fresnel */
return cspec0 * (1.0f - FH) + make_spectrum(FH);
return mix(F0, one_spectrum(), inverse_lerp(real_F0, 1.0f, real_F));
}
ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
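A standalone sketch of the remapping in interpolate_fresnel_color() above; fresnel_dielectric_cos is re-implemented locally for self-containment and the IOR/F0 values are made up, so this is illustrative only:

#include <cmath>
#include <cstdio>

static float fresnel_dielectric_cos(float cosi, float eta)
{
  /* Unpolarized dielectric Fresnel at a given cosine of the incident angle. */
  const float c = std::fabs(cosi);
  float g = eta * eta - 1.0f + c * c;
  if (g > 0.0f) {
    g = std::sqrt(g);
    const float A = (g - c) / (g + c);
    const float B = (c * (g + c) - 1.0f) / (c * (g - c) + 1.0f);
    return 0.5f * A * A * (1.0f + B * B);
  }
  return 1.0f; /* Total internal reflection. */
}

int main()
{
  const float ior = 1.02f, F0 = 0.04f, cos_LH = 0.5f;
  const float real_F = fresnel_dielectric_cos(cos_LH, ior);
  const float real_F0 = fresnel_dielectric_cos(1.0f, ior);
  /* Remap real_F from [real_F0, 1] to [F0, 1], as the kernel code does. */
  const float t = (real_F - real_F0) / (1.0f - real_F0);
  printf("F = %.4f\n", F0 + t * (1.0f - F0));
  return 0;
}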

View File

@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

View File

@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)
# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \

View File

@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}
ccl_gpu_syncthreads();
/* Determine bucket sizes within the partitions. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}
ccl_gpu_syncthreads();
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}
/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
* partition. */
if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
ccl_gpu_syncthreads();
/* Write the sorted active indices. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
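The two device functions above are the bucket and scatter halves of a partitioned counting sort. A serial CPU sketch of the same idea (illustrative only, without partitions or atomics) reads:

#include <cstdint>
#include <vector>

std::vector<int> sort_states_by_key(const std::vector<uint32_t> &shader_sort_key,
                                    const int max_shaders)
{
  /* Bucket pass: count how many states use each key. */
  std::vector<int> offsets(max_shaders + 1, 0);
  for (const uint32_t key : shader_sort_key) {
    offsets[(key % max_shaders) + 1]++;
  }
  /* Prefix sum turns the counts into per-key start offsets. */
  for (int i = 1; i <= max_shaders; i++) {
    offsets[i] += offsets[i - 1];
  }
  /* Write pass: scatter state indices to their sorted positions. */
  std::vector<int> indices(shader_sort_key.size());
  for (int state = 0; state < int(shader_sort_key.size()); state++) {
    const uint32_t key = shader_sort_key[state] % max_shaders;
    indices[offsets[key]++] = state;
  }
  return indices;
}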
template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,

View File

@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif
MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;
@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;
typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else
metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
// If we know we are going to get at most one hit, as for random-walk SSS, we can
// optimize by accepting the first hit.
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}
int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
// Transform the ray into the object's local space.
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);
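/* Note: transform_point() applies the full affine inverse transform (rotation/scale plus
 * translation) to the origin, while transform_direction() applies only its linear part to
 * the direction, which expresses the world-space ray in the object's local space before
 * intersecting that object's BLAS. */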
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif
if (lcg_state) {
View File
@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */
typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif
#endif /* __METALRT__ */
@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};
#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
@ -291,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};
View File
@ -34,7 +34,7 @@ class MetalKernelContext {
kernel_assert(0);
return 0;
}
#ifdef __KERNEL_METAL_INTEL__
template<typename TextureType, typename CoordsType>
inline __attribute__((__always_inline__))
@ -55,7 +55,7 @@ class MetalKernelContext {
}
}
#endif
// texture2d
template<>
inline __attribute__((__always_inline__))
View File
@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}
[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
// The instance_id (a.k.a. the user_id) has been removed. If this function is reached, the SSS
// traversal has been optimized to start from a primitive acceleration structure instead of the
// root of the global AS, so we always intersect the correct object and no user_id check is needed.
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}
[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}
template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
View File
@ -195,9 +195,9 @@ using sycl::half;
#define fmodf(x, y) sycl::fmod((x), (y))
#define lgammaf(x) sycl::lgamma((x))
/* sycl::native::cos precision is not sufficient and -ffast-math lets
* the current DPC++ compiler overload sycl::cos with it.
* We work around this issue by directly calling the spirv implementation which
/* `sycl::native::cos` precision is not sufficient and `-ffast-math` lets
* the current DPC++ compiler overload `sycl::cos` with it.
* We work around this issue by directly calling the SPIRV implementation which
* provides greater precision. */
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
# define cosf(x) __spirv_ocl_cos(((float)(x)))
View File
@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
View File
@ -58,23 +58,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
Spectrum closure_albedo = sc->weight;
/* Closures that include a Fresnel term typically have weights close to 1 even though their
* actual contribution is significantly lower.
* To account for this, we scale their weight by the average fresnel factor (the same is also
* done for the sample weight in the BSDF setup, so we don't need to scale that here). */
if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
closure_albedo *= bsdf->extra->fresnel_color;
}
else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
closure_albedo *= bsdf->avg_value;
}
else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
closure_albedo *= bsdf_principled_hair_albedo(sc);
}
else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
/* BSSRDF already accounts for weight, retro-reflection would double up. */
ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
sc;
@ -83,6 +67,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
}
}
Spectrum closure_albedo = bsdf_albedo(sd, sc);
if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
diffuse_albedo += closure_albedo;
sum_nonspecular_weight += sc->sample_weight;
View File
@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;
/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;
View File
@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
View File
@ -621,7 +621,7 @@ ccl_device Spectrum surface_shader_diffuse(KernelGlobals kg, ccl_private const S
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;
@ -635,7 +635,7 @@ ccl_device Spectrum surface_shader_glossy(KernelGlobals kg, ccl_private const Sh
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;
@ -649,7 +649,7 @@ ccl_device Spectrum surface_shader_transmission(KernelGlobals kg, ccl_private co
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;
View File
@ -7,24 +7,13 @@
CCL_NAMESPACE_BEGIN
ccl_device float spot_light_attenuation(float3 dir,
float cos_half_spot_angle,
float spot_smooth,
float3 N)
ccl_device float spot_light_attenuation(const ccl_global KernelSpotLight *spot, float3 ray)
{
float attenuation = dot(dir, N);
const float3 scaled_ray = safe_normalize(
make_float3(dot(ray, spot->axis_u), dot(ray, spot->axis_v), dot(ray, spot->dir)) /
spot->len);
if (attenuation <= cos_half_spot_angle) {
attenuation = 0.0f;
}
else {
float t = attenuation - cos_half_spot_angle;
if (t < spot_smooth && spot_smooth != 0.0f)
attenuation *= smoothstepf(t / spot_smooth);
}
return attenuation;
return smoothstepf((scaled_ray.z - spot->cos_half_spot_angle) / spot->spot_smooth);
}
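/* Concretely (assuming an unscaled lamp, so spot->len is (1, 1, 1)): scaled_ray.z is just the
 * cosine of the angle between the ray and the spot axis, so the attenuation is
 * smoothstepf((cos_theta - cos_half_spot_angle) / spot_smooth): 0 on and outside the cone
 * boundary, ramping to 1 once cos_theta reaches cos_half_spot_angle + spot_smooth. */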
template<bool in_volume_segment>
@ -57,8 +46,7 @@ ccl_device_inline bool spot_light_sample(const ccl_global KernelLight *klight,
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);
if (!in_volume_segment && ls->eval_fac == 0.0f) {
return false;
}
@ -87,8 +75,7 @@ ccl_device_forceinline void spot_light_update_position(const ccl_global KernelLi
ls->pdf = invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, ls->Ng);
ls->eval_fac *= spot_light_attenuation(&klight->spot, ls->Ng);
}
ccl_device_inline bool spot_light_intersect(const ccl_global KernelLight *klight,
@ -129,8 +116,7 @@ ccl_device_inline bool spot_light_sample_from_intersection(
ls->pdf = invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);
if (ls->eval_fac == 0.0f) {
return false;
View File
@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
/* Beckmann */
else {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@ -345,7 +331,6 @@ ccl_device void osl_closure_microfacet_ggx_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -383,7 +368,6 @@ ccl_device void osl_closure_microfacet_ggx_aniso_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -426,7 +410,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -467,7 +450,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -508,7 +490,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -551,7 +532,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -592,7 +572,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -633,7 +612,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -660,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
@ -865,27 +843,18 @@ ccl_device void osl_closure_principled_clearcoat_setup(
float3 weight,
ccl_private const PrincipledClearcoatClosure *closure)
{
weight *= 0.25f * closure->clearcoat;
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
if (!bsdf) {
return;
}
MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
if (!extra) {
return;
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->clearcoat_roughness;
bsdf->alpha_y = closure->clearcoat_roughness;
bsdf->ior = 1.5f;
bsdf->extra = extra;
bsdf->extra->color = zero_spectrum();
bsdf->extra->cspec0 = make_spectrum(0.04f);
bsdf->extra->clearcoat = closure->clearcoat;
bsdf->T = zero_float3();
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
View File
@ -161,7 +161,10 @@ ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
/* shadeindex = */ 0);
# endif
if (globals.Ci) {
if constexpr (type == SHADER_TYPE_DISPLACEMENT) {
sd->P = globals.P;
}
else if (globals.Ci) {
flatten_closure_tree(kg, sd, path_flag, globals.Ci);
}
}
View File
@ -46,17 +46,8 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
float tint = saturatef((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias));
float min_dist = min(min(x, y), min(brick_width - x, row_height - y));
float mortar;
if (min_dist >= mortar_size) {
mortar = 0.0f;
}
else if (mortar_smooth == 0.0f) {
mortar = 1.0f;
}
else {
min_dist = 1.0f - min_dist / mortar_size;
mortar = (min_dist < mortar_smooth) ? smoothstepf(min_dist / mortar_smooth) : 1.0f;
}
min_dist = 1.0f - min_dist / mortar_size;
float mortar = smoothstepf(min_dist / mortar_smooth);
return make_float2(tint, mortar);
}
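/* Quick check with illustrative values (mortar_size = 0.1, mortar_smooth = 0.5): a point at
 * min_dist = 0.08 gives 1 - 0.8 = 0.2 and smoothstepf(0.2 / 0.5) = smoothstepf(0.4) = 0.352,
 * while any point with min_dist >= mortar_size produces an argument <= 0 and therefore zero
 * mortar, thanks to the clamping added to smoothstepf in this change. */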
View File
@ -333,7 +333,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->cspec0 = rgb_to_spectrum(
(specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic);
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
if (distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID ||
@ -383,7 +382,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
@ -440,7 +438,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
@ -455,30 +452,20 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
#ifdef __CAUSTICS_TRICKS__
if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
#endif
if (clearcoat > CLOSURE_WEIGHT_CUTOFF) {
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), weight);
ccl_private MicrofacetExtra *extra =
(bsdf != NULL) ?
(ccl_private MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) :
NULL;
Spectrum clearcoat_weight = 0.25f * clearcoat * weight;
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), clearcoat_weight);
if (bsdf && extra) {
bsdf->N = clearcoat_normal;
bsdf->T = zero_float3();
bsdf->ior = 1.5f;
bsdf->extra = extra;
if (bsdf) {
bsdf->N = clearcoat_normal;
bsdf->T = zero_float3();
bsdf->ior = 1.5f;
bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
bsdf->extra->color = zero_spectrum();
bsdf->extra->cspec0 = make_spectrum(0.04f);
bsdf->extra->clearcoat = clearcoat;
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
}
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
}
#ifdef __CAUSTICS_TRICKS__
}
@ -584,7 +571,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
if (bsdf->extra) {
bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.w));
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
}
}
@ -724,7 +710,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
kernel_assert(stack_valid(data_node.z));
bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.z));
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
View File
@ -489,8 +489,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BSDF_MICROFACET_FRESNEL(type) \
(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
(type >= CLOSURE_BSSRDF_BURLEY_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
View File
@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* TODO: solve internal compiler errors and enable light tree on HIP. */
#ifdef __KERNEL_HIP__
/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
# undef __LIGHT_TREE__
#endif
@ -1290,12 +1291,14 @@ typedef struct KernelCurveSegment {
static_assert_align(KernelCurveSegment, 8);
typedef struct KernelSpotLight {
packed_float3 axis_u;
float radius;
packed_float3 axis_v;
float invarea;
float cos_half_spot_angle;
float spot_smooth;
packed_float3 dir;
float pad;
float cos_half_spot_angle;
packed_float3 len;
float spot_smooth;
} KernelSpotLight;
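/* Note: each packed_float3 above is paired with a single float, presumably so that every
 * pair fills one 16-byte slot (packed_float3 is 12 bytes in the GPU layout) and the members
 * stay naturally aligned. */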
/* PointLight is SpotLight with only radius and invarea being used. */
@ -1506,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
View File
@ -13,6 +13,7 @@
#include "scene/light.h"
#include "scene/mesh.h"
#include "scene/object.h"
#include "scene/osl.h"
#include "scene/pointcloud.h"
#include "scene/scene.h"
#include "scene/shader.h"
@ -23,7 +24,9 @@
#include "subd/patch_table.h"
#include "subd/split.h"
#include "kernel/osl/globals.h"
#ifdef WITH_OSL
# include "kernel/osl/globals.h"
#endif
#include "util/foreach.h"
#include "util/log.h"
@ -306,6 +309,11 @@ void GeometryManager::update_osl_globals(Device *device, Scene *scene)
{
#ifdef WITH_OSL
OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
if (og == nullptr) {
/* Can happen when rendering with multiple GPUs, but no CPU (in which case the name maps filled
* below are not used anyway) */
return;
}
og->object_name_map.clear();
og->object_names.clear();
@ -1666,6 +1674,7 @@ void GeometryManager::device_update_displacement_images(Device *device,
TaskPool pool;
ImageManager *image_manager = scene->image_manager;
set<int> bump_images;
bool has_osl_node = false;
foreach (Geometry *geom, scene->geometry) {
if (geom->is_modified()) {
/* Geometry-level check for hair shadow transparency.
@ -1685,6 +1694,9 @@ void GeometryManager::device_update_displacement_images(Device *device,
continue;
}
foreach (ShaderNode *node, shader->graph->nodes) {
if (node->special_type == SHADER_SPECIAL_TYPE_OSL) {
has_osl_node = true;
}
if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
continue;
}
@ -1700,6 +1712,15 @@ void GeometryManager::device_update_displacement_images(Device *device,
}
}
}
#ifdef WITH_OSL
/* If any OSL node is used for displacement, it may reference a texture. But it's
* unknown which ones, so we have to load them all. */
if (has_osl_node) {
OSLShaderManager::osl_image_slots(device, image_manager, bump_images);
}
#endif
foreach (int slot, bump_images) {
pool.push(function_bind(
&ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
View File
@ -1076,23 +1076,31 @@ void LightManager::device_update_lights(Device *device, DeviceScene *dscene, Sce
else if (light->light_type == LIGHT_SPOT) {
shader_id &= ~SHADER_AREA_LIGHT;
float3 len;
float3 axis_u = normalize_len(light->axisu, &len.x);
float3 axis_v = normalize_len(light->axisv, &len.y);
float3 dir = normalize_len(light->dir, &len.z);
if (len.z == 0.0f) {
dir = zero_float3();
}
float radius = light->size;
float invarea = (radius > 0.0f) ? 1.0f / (M_PI_F * radius * radius) : 1.0f;
float cos_half_spot_angle = cosf(light->spot_angle * 0.5f);
float spot_smooth = (1.0f - cos_half_spot_angle) * light->spot_smooth;
float3 dir = light->dir;
dir = safe_normalize(dir);
if (light->use_mis && radius > 0.0f)
shader_id |= SHADER_USE_MIS;
klights[light_index].co = co;
klights[light_index].spot.axis_u = axis_u;
klights[light_index].spot.radius = radius;
klights[light_index].spot.axis_v = axis_v;
klights[light_index].spot.invarea = invarea;
klights[light_index].spot.cos_half_spot_angle = cos_half_spot_angle;
klights[light_index].spot.spot_smooth = spot_smooth;
klights[light_index].spot.dir = dir;
klights[light_index].spot.cos_half_spot_angle = cos_half_spot_angle;
klights[light_index].spot.len = len;
klights[light_index].spot.spot_smooth = spot_smooth;
}
klights[light_index].shader_id = shader_id;
View File
@ -156,7 +156,13 @@ LightTreePrimitive::LightTreePrimitive(Scene *scene, int prim_id, int object_id)
}
else if (type == LIGHT_SPOT) {
bcone.theta_o = 0;
bcone.theta_e = lamp->get_spot_angle() * 0.5f;
const float unscaled_theta_e = lamp->get_spot_angle() * 0.5f;
const float len_u = len(lamp->get_axisu());
const float len_v = len(lamp->get_axisv());
const float len_w = len(lamp->get_dir());
bcone.theta_e = fast_atanf(fast_tanf(unscaled_theta_e) * fmaxf(len_u, len_v) / len_w);
/* Point and spot lights can emit light from any point within their radius. */
const float3 radius = make_float3(size);
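/* Worked example of the rescaled cone (assuming the lamp is scaled by 2 along its u and v axes
 * and unscaled along its direction): a 30-degree half-angle becomes
 * atan(tan(30°) * 2) ≈ 49.1°, so the light-tree bound widens to match what the non-uniform
 * scale does to the actual spot cone. */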
View File
@ -665,6 +665,27 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
return node;
}
/* Static function, so only this file needs to be compiled with RTTI. */
void OSLShaderManager::osl_image_slots(Device *device,
ImageManager *image_manager,
set<int> &image_slots)
{
set<OSLRenderServices *> services_shared;
device->foreach_device([&services_shared](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
services_shared.insert(og->services);
});
for (OSLRenderServices *services : services_shared) {
for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
if (it->second->handle.get_manager() == image_manager) {
const int slot = it->second->handle.svm_slot();
image_slots.insert(slot);
}
}
}
}
/* Graph Compiler */
OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
@ -1241,6 +1262,7 @@ void OSLCompiler::compile(OSLGlobals *og, Shader *shader)
shader->has_surface = false;
shader->has_surface_transparent = false;
shader->has_surface_raytrace = false;
shader->has_surface_bssrdf = false;
shader->has_bump = has_bump;
shader->has_bssrdf_bump = has_bump;
View File
@ -92,6 +92,9 @@ class OSLShaderManager : public ShaderManager {
const std::string &bytecode_hash = "",
const std::string &bytecode = "");
/* Get image slots used by OSL services on device. */
static void osl_image_slots(Device *device, ImageManager *image_manager, set<int> &image_slots);
private:
void texture_system_init();
void texture_system_free();
View File
@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}
# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
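/* With the threadgroup overloads above, the local atomic sort can build its histograms in
 * threadgroup memory through the same helper it already uses for device memory, e.g.
 * (illustrative, mirroring the bucket pass earlier in this change):
 *
 *   ccl_gpu_shared int *buckets = ...;              // threadgroup address space
 *   atomic_fetch_and_add_uint32(&buckets[key], 1);  // picks the threadgroup overload
 *
 * while the same call on a ccl_global pointer still resolves to the device overload. */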
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,
View File
@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}
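/* Example: running with CYCLES_METAL_LOCAL_ATOMIC_SORT=0 in the environment disables the
 * local atomic sort (atoi() yields 0), while any non-zero value, or leaving the variable
 * unset, keeps the built-in default of enabled. */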
DebugFlags::OptiX::OptiX()
View File
@ -97,6 +97,9 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;
/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};
/* Get instance of debug flags registry. */
View File
@ -483,6 +483,12 @@ ccl_device_inline float compatible_signf(float f)
ccl_device_inline float smoothstepf(float f)
{
if (f <= 0.0f) {
return 0.0f;
}
if (f >= 1.0f) {
return 1.0f;
}
float ff = f * f;
return (3.0f * ff - 2.0f * ff * f);
}
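/* Sanity check of the clamped version: smoothstepf(-0.5f) and smoothstepf(1.5f) now return
 * exactly 0 and 1, while interior values are unchanged, e.g.
 * smoothstepf(0.25f) = 3 * 0.0625 - 2 * 0.015625 = 0.15625. */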
View File
@ -74,7 +74,7 @@ ccl_device float fast_sinf(float x)
*
* Results on: [-2pi,2pi].
*
* Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
* Examined 2173837240 values of sin: 0.00662760244 avg ULP diff, 2 max ULP,
* 1.19209e-07 max error
*/
int q = fast_rint(x * M_1_PI_F);
@ -256,11 +256,11 @@ ccl_device float fast_acosf(float x)
/* clamp and crush denormals. */
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
/* Based on http://www.pouet.net/topic.php?which=9132&page=2
* 85% accurate (ulp 0)
* 85% accurate (ULP 0)
* Examined 2130706434 values of acos:
* 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
* 15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
* Examined 2130706434 values of acos:
* 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
* 15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
*/
const float a = sqrtf(1.0f - m) *
(1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
@ -270,9 +270,8 @@ ccl_device float fast_acosf(float x)
ccl_device float fast_asinf(float x)
{
/* Based on acosf approximation above.
* Max error is 4.51133e-05 (ulps are higher because we are consistently off
* by a little amount).
*/
* Max error is 4.51133e-05 (ULPS are higher because we are consistently off
* by a little amount). */
const float f = fabsf(x);
/* Clamp and crush denormals. */
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
@ -290,9 +289,9 @@ ccl_device float fast_atanf(float x)
const float t = s * s;
/* http://mathforum.org/library/drmath/view/62672.html
* Examined 4278190080 values of atan:
* 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
* 2.36864877 avg ULP diff, 302 max ULP, 6.55651e-06 max error // (with denormals)
* Examined 4278190080 values of atan:
* 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
* 171160502 avg ULP diff, 855638016 max ULP, 6.55651e-06 max error // (crush denormals)
*/
float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
if (a > 1.0f) {
@ -343,8 +342,8 @@ ccl_device float fast_log2f(float x)
int exponent = (int)(bits >> 23) - 127;
float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
* 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
* ulp histogram:
* 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
* ULP histogram:
* 0 = 97.46%
* 1 = 2.29%
* 2 = 0.11%
@ -363,7 +362,7 @@ ccl_device float fast_log2f(float x)
ccl_device_inline float fast_logf(float x)
{
/* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
* 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
* 0.313865375 avg ULP diff, 5148137 max ULP, 7.62939e-06 max error.
*/
return fast_log2f(x) * M_LN2_F;
}
@ -371,7 +370,7 @@ ccl_device_inline float fast_logf(float x)
ccl_device_inline float fast_log10(float x)
{
/* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
* 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
* 0.631237033 avg ULP diff, 4471615 max ULP, 3.8147e-06 max error.
*/
return fast_log2f(x) * M_LN2_F / M_LN10_F;
}
@ -392,12 +391,12 @@ ccl_device float fast_exp2f(float x)
/* Range reduction. */
int m = (int)x;
x -= m;
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ULPS!). */
/* 5th degree polynomial generated with sollya
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
* 232 max ulp.
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ULP diff,
* 232 max ULP.
*
* ulp histogram:
* ULP histogram:
* 0 = 87.81%
* 1 = 4.18%
*/
@ -415,7 +414,7 @@ ccl_device float fast_exp2f(float x)
ccl_device_inline float fast_expf(float x)
{
/* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
* 2.6666452 avg ulp diff, 230 max ulp.
* 2.6666452 avg ULP diff, 230 max ULP.
*/
return fast_exp2f(x / M_LN2_F);
}
@ -454,7 +453,7 @@ ccl_device_inline float4 fast_expf4(float4 x)
ccl_device_inline float fast_exp10(float x)
{
/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
* 2.71732409 avg ulp diff, 232 max ulp.
* 2.71732409 avg ULP diff, 232 max ULP.
*/
return fast_exp2f(x * M_LN10_F / M_LN2_F);
}
@ -475,7 +474,7 @@ ccl_device float fast_sinhf(float x)
float a = fabsf(x);
if (a > 1.0f) {
/* Examined 53389559 values of sinh on [1,87.3300018]:
* 33.6886442 avg ulp diff, 178 max ulp. */
* 33.6886442 avg ULP diff, 178 max ULP. */
float e = fast_expf(a);
return copysignf(0.5f * e - 0.5f / e, x);
}
@ -495,7 +494,7 @@ ccl_device float fast_sinhf(float x)
ccl_device_inline float fast_coshf(float x)
{
/* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
* 1.78256726 avg ulp diff, 178 max ulp.
* 1.78256726 avg ULP diff, 178 max ULP.
*/
float e = fast_expf(fabsf(x));
return 0.5f * e + 0.5f / e;
@ -506,7 +505,7 @@ ccl_device_inline float fast_tanhf(float x)
/* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
* 3.12924e-06 max error.
*/
/* NOTE: ulp error is high because of sub-optimal handling around the origin. */
/* NOTE: ULP error is high because of sub-optimal handling around the origin. */
float e = fast_expf(2.0f * fabsf(x));
return copysignf(1.0f - 2.0f / (1.0f + e), x);
}
@ -579,7 +578,7 @@ ccl_device_inline float fast_erfcf(float x)
{
/* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
*
* ulp histogram:
* ULP histogram:
*
* 0 = 80.30%
*/
View File
@ -85,10 +85,12 @@ if(WITH_VULKAN_BACKEND)
list(APPEND INC_SYS
${VULKAN_INCLUDE_DIRS}
${MOLTENVK_INCLUDE_DIRS}
)
list(APPEND LIB
${VULKAN_LIBRARIES}
${MOLTENVK_LIBRARIES}
)
add_definitions(-DWITH_VULKAN_BACKEND)
View File
@ -1201,7 +1201,7 @@ void GHOST_GetVulkanHandles(GHOST_ContextHandle context,
void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly);
uint32_t *r_graphic_queue_family);
/**
* Return VULKAN back-buffer resources handles for the given window.
View File
@ -1203,10 +1203,10 @@ void GHOST_GetVulkanHandles(GHOST_ContextHandle contexthandle,
void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly)
uint32_t *r_graphic_queue_family)
{
GHOST_IContext *context = (GHOST_IContext *)contexthandle;
context->getVulkanHandles(r_instance, r_physical_device, r_device, r_graphic_queue_familly);
context->getVulkanHandles(r_instance, r_physical_device, r_device, r_graphic_queue_family);
}
void GHOST_GetVulkanBackbuffer(GHOST_WindowHandle windowhandle,
View File
@ -142,7 +142,7 @@ class GHOST_Context : public GHOST_IContext {
virtual GHOST_TSuccess getVulkanHandles(void * /*r_instance*/,
void * /*r_physical_device*/,
void * /*r_device*/,
uint32_t * /*r_graphic_queue_familly*/) override
uint32_t * /*r_graphic_queue_family*/) override
{
return GHOST_kFailure;
};
Some files were not shown because too many files have changed in this diff.