
Compare commits


71 Commits

Author SHA1 Message Date
8ea7063a71 Merge remote-tracking branch 'origin' into temp-pbvh-split 2023-01-23 09:14:43 -08:00
495094d65b temp-pbvh-split: fix compile error 2023-01-23 09:11:57 -08:00
3f923b0d7e temp-pbvh-split: Attempt to fix submodules 2023-01-23 08:47:46 -08:00
365bd78cba Merge branch 'master' into temp-pbvh-split 2023-01-23 08:45:43 -08:00
14fa20d797 Merge branch 'master' into temp-pbvh-split 2023-01-12 19:33:03 -08:00
afbfa1352b temp-pbvh-split: make requested patch changes 2023-01-03 10:57:29 -08:00
7cb31b649f Merge branch 'master' into temp-pbvh-split 2023-01-03 10:37:06 -08:00
98bdd08507 Merge branch 'master' into temp-pbvh-split 2022-12-30 21:38:01 -08:00
78b008d20d temp-pbvh-split: Fix various crashes and memory corruption 2022-10-15 00:55:45 -07:00
aa1f2f243d Merge branch 'master' into temp-pbvh-split 2022-10-14 23:22:01 -07:00
278a2137f9 Merge remote-tracking branch 'origin' into temp-pbvh-split 2022-10-07 12:28:55 -07:00
92964a29b5 temp-pbvh-split: fix improper use of mesh->mvert 2022-10-07 12:28:25 -07:00
adb0e5e054 temp-pbvh-split: fix merge error 2022-10-07 02:32:49 -07:00
6a0a92c587 Merge branch 'master' into temp-pbvh-split 2022-10-06 14:14:55 -07:00
35bff5f4e0 Merge branch 'master' into temp-pbvh-split 2022-06-22 08:34:35 -07:00
f90aeb0152 temp-pbvh-split: Remove debug macro 2022-06-02 16:47:03 -07:00
6851971501 temp-pbvh-split: Use task API correctly.
BLI_task_pool_push could use a comment clarifying that
you can call it from within worker threads, and also that there is
no overhead to doing so.
2022-06-02 16:43:37 -07:00
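
For context, a minimal sketch of the pattern this commit relies on: a task body pushing follow-up work onto its own pool. The BLI_task_pool signatures are paraphrased from BLI_task.h and should be treated as an assumption; the node-splitting logic is made up for illustration.

```
/* Illustrative only: signatures paraphrased from BLI_task.h (an assumption),
 * node-splitting logic made up. The point is that BLI_task_pool_push may be
 * called from inside a running task, with no extra overhead. */
#include "BLI_task.h"

struct SplitWork {
  int node_index;
};

static void split_node_task(TaskPool *__restrict pool, void *taskdata)
{
  SplitWork *work = static_cast<SplitWork *>(taskdata);

  /* Made-up split criterion: stop at some node index. */
  if (work->node_index < 15) {
    /* Push follow-up work from inside this worker. */
    for (const int child : {2 * work->node_index + 1, 2 * work->node_index + 2}) {
      BLI_task_pool_push(pool, split_node_task, new SplitWork{child}, false, nullptr);
    }
  }
  delete work; /* Freed manually because `free_taskdata` was passed as false. */
}

void split_all_nodes()
{
  TaskPool *pool = BLI_task_pool_create(nullptr, TASK_PRIORITY_HIGH);
  BLI_task_pool_push(pool, split_node_task, new SplitWork{0}, false, nullptr);
  BLI_task_pool_work_and_wait(pool);
  BLI_task_pool_free(pool);
}
```
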
d55dc9a3b1 temp-pbvh-split: Remove debug printf 2022-06-02 16:15:49 -07:00
6047cb7708 Merge remote-tracking branch 'origin' into temp-pbvh-split 2022-06-02 16:13:51 -07:00
f82036b6e2 temp-pbvh-split: Finish merge and fix paint undo 2022-06-02 16:13:19 -07:00
f967fbea65 Merge branch 'master' into temp-pbvh-split 2022-06-02 03:22:27 -07:00
f2d39b810b Enable inlining on Apple Silicon. Use new process-wide ShaderCache in order to safely re-enable binary archives
This patch is the same as D14763, but with a fix for unit test failures caused by ShaderCache fetch logic not working in the non-MetalRT case:

```
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index ad268ae7057..6aa1a56056e 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -203,9 +203,12 @@ bool kernel_has_intersection(DeviceKernel device_kernel)

   /* metalrt options */
   request.pipeline->use_metalrt = device->use_metalrt;
-  request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
+  request.pipeline->metalrt_hair = device->use_metalrt &&
+                                   (device->kernel_features & KERNEL_FEATURE_HAIR);
+  request.pipeline->metalrt_hair_thick = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  request.pipeline->metalrt_pointcloud = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);

   {
     thread_scoped_lock lock(cache_mutex);
@@ -225,9 +228,9 @@ bool kernel_has_intersection(DeviceKernel device_kernel)

   /* metalrt options */
   bool use_metalrt = device->use_metalrt;
-  bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
-  bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
+  bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR);
+  bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);

   MetalKernelPipeline *best_pipeline = nullptr;
   for (auto &pipeline : collection) {

```

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D14923
2022-06-02 02:56:36 -07:00
9549df341c UI: Update rest of UI code for increased button flag bitfield
Needed after 98a04ed452.
2022-06-02 02:56:36 -07:00
afabb990ee Fix T95710: Make Single User > Object Data Animation broken
The operator now not only checks `ob->data` for Actions to duplicate,
but also passes `ob->data` to the duplication function (instead of `ob`).
2022-06-02 02:56:36 -07:00
052073e9e4 Fix T97947: USD will fail to export without file extension
Now add a default ".usdc" file extension if no (or the wrong) extension
is given instead of presenting the user with the error that "no suitable
USD plugin to write is found".

This is in line with how other exporters do this.

Maniphest Tasks: T97947

Differential Revision: https://developer.blender.org/D14895
2022-06-02 02:56:36 -07:00
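
A standalone sketch of the behaviour described above (not the exporter's actual code); it only covers the missing-extension case and the helper name is made up.

```
#include <algorithm>
#include <array>
#include <cctype>
#include <iostream>
#include <string>

/* Sketch only: append ".usdc" when the chosen output path carries no
 * recognized USD extension. */
static std::string ensure_usd_extension(std::string filepath)
{
  const std::array<std::string, 4> usd_exts = {".usd", ".usda", ".usdc", ".usdz"};

  auto ends_with_icase = [](const std::string &s, const std::string &suffix) {
    return s.size() >= suffix.size() &&
           std::equal(suffix.rbegin(), suffix.rend(), s.rbegin(), [](char a, char b) {
             return std::tolower(static_cast<unsigned char>(a)) ==
                    std::tolower(static_cast<unsigned char>(b));
           });
  };

  for (const std::string &ext : usd_exts) {
    if (ends_with_icase(filepath, ext)) {
      return filepath; /* Already a usable USD extension. */
    }
  }
  return filepath + ".usdc"; /* Default, in line with how other exporters behave. */
}

int main()
{
  std::cout << ensure_usd_extension("scene") << "\n";      /* scene.usdc */
  std::cout << ensure_usd_extension("scene.USDA") << "\n"; /* unchanged */
}
```
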
1ebfd4294e Curves: Adjust sculpt mode UI layouts
This patch adjusts the UI layouts for the tool header and the tool
properties in sculpt mode in a few ways. The goals are to better group
related settings, keep fundamental settings easily accessible, fix the
availability of some options, and make better use of space.

1. Remove ID template in tool header
2. Rename "Add Amount" to "Count" for add brush
3. Add "use pressure" toggles to radius and strength sliders
4. Move strength falloff to a popover
5. Move many "Add" brush settings to popover called "Curve Shape"
6. Move two "Grow/Shrink" options to a popover called "Scaling"
7. Don't display "Falloff" panel in properties when it has no effect

See the differential revision for screenshots and more reasoning.

Differential Revision: https://developer.blender.org/D14922
2022-06-02 02:56:35 -07:00
8414e79b71 Fix: Crash with empty curves add interpolate points
The neighbors for an added curve can be empty.
In that case use the constant value instead of interpolating.
2022-06-02 02:56:35 -07:00
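
A generic sketch of the fallback described above; the names and the weighting scheme are made up for illustration.

```
#include <cassert>
#include <utility>
#include <vector>

/* Sketch only: when a newly added curve has no neighbors to interpolate from,
 * fall back to a constant value instead of dividing by a zero total weight. */
static float interpolate_from_neighbors(const std::vector<std::pair<float, float>> &neighbors,
                                        const float constant_fallback)
{
  if (neighbors.empty()) {
    return constant_fallback;
  }
  float value_sum = 0.0f;
  float weight_sum = 0.0f;
  for (const auto &[value, weight] : neighbors) {
    value_sum += value * weight;
    weight_sum += weight;
  }
  assert(weight_sum > 0.0f);
  return value_sum / weight_sum;
}
```
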
73f1f51627 Fix T97330: GPU Subdiv compiler error.
GLSL has a different maximum number of SSBO blocks per shader stage.
This patch checks if the number of compute ssbo blocks matches
our requirements for the GPU Subdiv, before enabling it.

Some platforms allow more SSBO bindings than blocks per stage.
2022-06-02 02:56:35 -07:00
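
The check can be pictured as a query of the per-stage limit; the required block count below is a made-up placeholder, not Blender's actual requirement.

```
/* Illustrative only: GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS is the per-stage
 * block limit, which can be smaller than the number of SSBO bindings a
 * platform advertises. */
#include <epoxy/gl.h>

static bool gpu_subdiv_ssbo_limit_ok()
{
  const GLint required_compute_ssbo_blocks = 12; /* Hypothetical requirement. */

  GLint max_compute_ssbo_blocks = 0;
  glGetIntegerv(GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS, &max_compute_ssbo_blocks);

  /* Checking bindings alone is not enough: some platforms expose more SSBO
   * bindings than per-stage blocks, so the block limit is what matters here. */
  return max_compute_ssbo_blocks >= required_compute_ssbo_blocks;
}
```
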
6bd0176ebc Fix: Build error on windows.
Issue introduced by rBeef98e66cf9e

BLI_math_rotation.h uses M_PI which
gets defined inside BLI_math_base.h
2022-06-02 02:56:35 -07:00
91a488ad18 Fix (unreported) crash in Outliner Overrides Properties view in invalid cases.
We cannot try to get RNA info when the rna path of an override property
is invalid.
2022-06-02 02:56:35 -07:00
c6445efe72 Cleanup: rename BLI_str_format_attribute_domain_size
This is useful without any functionality specific to attribute domains,
rename to `BLI_str_format_decimal_unit` to follow naming of a similar
function `BLI_str_format_byte_unit`.
2022-06-02 02:56:35 -07:00
d98e2b20d4 Fix T97895: Eevee support for Geometry Nodes Color Attributes.
Geometry nodes can generate color attributes that aren't on point or corner domain.
When not found in these domains, they will be processed as common attributes.
2022-06-02 02:56:35 -07:00
Pablo Vazquez
5fa64f6565 Mesh: Add Auto Smooth option to Shade Smooth operator
Add a property to the **Shade Smooth** operator to quickly enable the Mesh `use_auto_smooth` option.

The `Angle` property is exposed in the **Adjust Last Operation** panel to make it easy to tweak on multiple objects without having to go to the Properties editor.

The operator is exposed in the `Object` menu and `Object Context Menu`.

=== Demo ===

{F13066173, size=full}

Regarding the implementation, there are multiple ways to go about this (like making a whole new operator altogether), but I think a property is the cleanest/simplest.

I imagine there are simpler ways to achieve this without duplicating the `use_auto_smooth` property in the operator itself (getting it from the Mesh props?), but I couldn't find other operators doing something similar.

Reviewed By: #modeling, mont29

Differential Revision: https://developer.blender.org/D14894
2022-06-02 02:56:35 -07:00
1cc6e754f3 Fix "Open Clip" operator in Clip Editor broken
Steps to reproduce were:
- Open Clip Editor
- Call "Open Clip" (e.g. Alt+O)
- Select video file

The file wouldn't be loaded into the Clip Editor.

Caused by 7849b56c3c.
2022-06-02 02:56:35 -07:00
a92413b0db Revert "Gizmo: optimize intersection tests, fix selection bias"
Manually revert commit [0] as it caused problems on macOS (reported in T96435).

- Includes fixes from [1] & [2].
- T98037 TODO has been created to keep track of this feature.

Thanks to @jbakker & @sergey for investigating this issue as I wasn't
able to reproduce the bug.

[0]: 0cb5eae9d0
[1]: cb986446e2
[2]: cc8fe1a1cb
2022-06-02 02:56:35 -07:00
83a3443ea5 Fix T97173: Color Attributes shading turns black after switching mode.
Sculpt colors tagged the custom data as already created (cd_used), but
should have been tagged as being requested (cd_needed).
2022-06-02 02:56:35 -07:00
3915e57a47 Cleanup: Use single quotes for Python enum string 2022-06-02 02:56:35 -07:00
a74cd17030 Fix: Hide empty panel in curves sculpt mode tool settings
This panel is empty after rB5b24291be1e0
2022-06-02 02:56:35 -07:00
d1c080b737 Fix: Spline parameter node broken for Catmull Rom curves
Subtracting one from the evaluated index could make the index -1.
That was only necessary for Bezier curves due to the specifics of
the "bezier_evaluated_offsets".
2022-06-02 02:56:35 -07:00
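
A generic sketch of the off-by-one described above (not the node's actual code): the "subtract one" step only applies to Bezier evaluated offsets.

```
#include <algorithm>

/* Sketch only: applying the Bezier-specific offset adjustment unconditionally
 * can produce index -1 for Catmull-Rom curves. */
enum class CurveType { Bezier, CatmullRom };

static int previous_point_index(const int evaluated_index, const CurveType type)
{
  if (type == CurveType::Bezier) {
    /* Bezier evaluated offsets need the preceding entry; clamp for safety. */
    return std::max(evaluated_index - 1, 0);
  }
  /* Catmull-Rom evaluated offsets are used as-is. */
  return evaluated_index;
}
```
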
16fa4c5347 Outliner: Remove the 'Remap data-block usages' operation.
This feature is very advanced, and the way it was exposed in the
Outliner was very confusing at best.

It remains available through the Python API, e.g. `ID.user_remap`.
2022-06-02 02:56:35 -07:00
1b4cc8c5f3 Outliner: Remove 'rename library' feature.
This was historically the only way to change/fix paths of library files
in Blender. However, only changing the path then required a manual
reload of the library, which could be skipped by the user, or a save/reload
of the working .blend file, which could lead to corruption of advanced
library usages like overrides.

The preferred, modern way to change the path of a library is to use the
Relocate operation instead. Direct path modification remains possible
through RNA (Python console or the Data API view in the Outliner).
2022-06-02 02:56:34 -07:00
9184beada7 Update Ceres to latest upstream version 2.1.0
This release deprecated the Parameterization API and the new Manifolds
API is to be used instead. This is what was done in Libmv as part
of this change.

Additionally, remove the bundling scripts. Nowadays those are only
leading to a duplicated work to maintain.

No measurable changes on the user side are expected.
2022-06-02 02:56:34 -07:00
331cc4b66e Cleanup: spelling in comments/strings
D14918 from @linux_dr with some other changes included.
2022-06-02 02:56:34 -07:00
Jun Mizutani
07ac2445b9 Fix: Curves interpolate point count option missing from panels
Added in 8852191b77

Differential Revision: https://developer.blender.org/D14919
2022-06-02 02:56:34 -07:00
e34bfbc13b Fix T97153: Knife project crashes
Knife projection BVH-tree lookup could use invalid indices since the
mesh being cut is also used for BVH intersection tests.

Solve by storing triangle indices when knife project is used so a
triangle index can always be used to look up original coordinates of a
triangle.
2022-06-02 02:56:34 -07:00
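
A generic sketch of the idea (not Blender's actual data structures): snapshot the original triangle coordinates keyed by triangle index, so a BVH hit can still be resolved while the mesh is being cut.

```
#include <array>
#include <vector>

/* Sketch only: cache original triangle coordinates before cutting starts.
 * A BVH hit then resolves through the stored index instead of reading from
 * the mesh that is being modified. */
struct TriangleSnapshot {
  std::array<std::array<float, 3>, 3> coords; /* Three vertex positions. */
};

struct KnifeProjectCache {
  std::vector<TriangleSnapshot> triangles; /* Indexed by original triangle index. */

  const TriangleSnapshot &lookup(const int triangle_index) const
  {
    return triangles[triangle_index];
  }
};
```
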
86214740ca Fix knife tool use-after free on completion
Regression in [0] accessed knife data after it had been freed.

[0]: f87029f7b1
2022-06-02 02:56:34 -07:00
9f76a10260 Fix T96892 Overlay: Hiding all of a mesh in edit mode causes visual glitch
This is caused by the geometry shader used by the edit mode line drawing.
If the drawcall uses indexed drawing and if the index buffer only contains
restart indices, it seems the result is 1 glitchy invocation of the
geometry shader.

Workaround by tagging these special case index buffers and bypassing
their drawcall.
2022-06-02 02:56:34 -07:00
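
A generic sketch of the workaround (not the draw manager's actual code): detect an index buffer that holds only restart indices and skip its draw call.

```
#include <cstdint>
#include <vector>

/* Sketch only: an index buffer containing nothing but primitive-restart
 * indices would trigger one bogus geometry-shader invocation, so tag it and
 * bypass the draw call. */
constexpr std::uint32_t RESTART_INDEX = 0xFFFFFFFFu;

static bool index_buffer_is_effectively_empty(const std::vector<std::uint32_t> &indices)
{
  for (const std::uint32_t index : indices) {
    if (index != RESTART_INDEX) {
      return false;
    }
  }
  return true; /* Only restart indices (or no indices at all): nothing to draw. */
}

/* Caller side: if (index_buffer_is_effectively_empty(ibo)) { skip the draw call; } */
```
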
166ef650cb Fix T97945: Cycles baking max distance is wrong
It was effectively sqrt(max_distance) before this fix.

Thanks to Omar Emara for identifying the solution.
2022-06-02 02:56:34 -07:00
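
One plausible way such a symptom arises, shown as an illustrative snippet (this is not the actual Cycles code, just the shape of the bug): comparing a squared distance against an unsquared threshold clips at sqrt(max_distance).

```
/* Illustrative only. */
static bool within_bake_distance_wrong(const float dist_squared, const float max_distance)
{
  return dist_squared <= max_distance; /* Effective radius: sqrt(max_distance). */
}

static bool within_bake_distance_fixed(const float dist_squared, const float max_distance)
{
  return dist_squared <= max_distance * max_distance;
}
```
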
ac1ffa2420 Fix T97908: Cycles missing motion on pointcloud generated by geometry nodes
Assume geometry is always potentially animated, since we can't use our heuristic
to detect if the object is potentially animated by looking at modifiers on the
object.

The main original reason for this check was to avoid evaluating subdivision
surfaces for many static objects, which is not happening here anyway.
2022-06-02 02:56:34 -07:00
Mikhail Matrosov
f8db581a22 Fix T97966: Cycles shadow terminator offset wrong for scaled object instances
Differential Revision: https://developer.blender.org/D14893
2022-06-02 02:56:34 -07:00
Olivier Maury
2a570c1f4b Fix T97056: Cycles MNEE not working with glass and pure refraction BSDFs
Differential Revision: https://developer.blender.org/D14901
2022-06-02 02:56:33 -07:00
9f3fe0583a Fix part of T97895: Cycles not rendering edge domain attributes
These aren't really ideal for rendering, but better to show something. Edge
values are averaged at vertices.
2022-06-02 02:56:33 -07:00
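
A generic sketch of the averaging described above (not Cycles' actual attribute code): every vertex receives the mean of the values of its incident edges.

```
#include <array>
#include <cstddef>
#include <vector>

/* Sketch only: average an edge-domain attribute onto vertices. */
static std::vector<float> average_edge_values_to_vertices(
    const std::vector<std::array<int, 2>> &edges, /* Vertex index pairs. */
    const std::vector<float> &edge_values,
    const int verts_num)
{
  std::vector<float> vert_values(verts_num, 0.0f);
  std::vector<int> counts(verts_num, 0);

  for (std::size_t i = 0; i < edges.size(); i++) {
    for (const int v : edges[i]) {
      vert_values[v] += edge_values[i];
      counts[v]++;
    }
  }
  for (int v = 0; v < verts_num; v++) {
    if (counts[v] > 0) {
      vert_values[v] /= float(counts[v]);
    }
  }
  return vert_values;
}
```
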
5e338f88d5 Cleanup: use 'num' / 'size' suffix instead of 'sz'
GPU code used `sz` as an abbreviation for size, as well as a few other
places. Use size where this represents a size in bytes, see: T85728.
2022-06-02 02:56:33 -07:00
18159bd6c4 Cleanup: use '_num' / '_count' suffix instead of '_ct'
Use num & count (for counters), in drawing code, see: T85728.
2022-06-02 02:56:33 -07:00
17d5450eaf Cleanup: use '_num' suffix, mostly for curves & spline code
Replace tot/amount & size with num, in keeping with T85728.
2022-06-02 02:56:33 -07:00
01d33feed5 WM: clear wmEvent.flag for file-select events
Harmless but could cause file-select events to have WM_EVENT_IS_REPEAT
set which logged a warning as this is only intended for keyboard events.
2022-06-02 02:56:33 -07:00
d46039a8b2 Cleanup: spelling in comments
Revert change from [0] that assumed UNORM was a mis-spelling of UNIFORM.

[0]: 2c75857f9f
2022-06-02 02:56:33 -07:00
844bed2ec6 Cleanup: use '_num' suffix instead of '_size' for CurveGeometry
Follow conventions from T85728.
2022-06-02 02:56:33 -07:00
1d8f2aeb74 Cleanup: format 2022-06-02 02:56:33 -07:00
db23f908ad Cleanup: use doxy sections for node_edit.cc 2022-06-02 02:56:33 -07:00
7e63c035b8 Fix cursor snap not acting on selected UVs
Regression in rBd2271cf939.
2022-06-02 02:56:32 -07:00
09e05193c7 temp-pbvh-split: Use TaskPool API for texture node splitting 2022-05-10 18:39:22 -07:00
0d24e8af62 temp-pbvh-split: reuse triangles array. 2022-05-10 18:05:54 -07:00
b617e66e36 Merge branch 'master' into temp-pbvh-split 2022-05-10 17:10:37 -07:00
d6da131c40 Merge remote-tracking branch 'origin/master' into temp-pbvh-split 2022-05-09 18:31:18 -07:00
6e4daa7d3c temp-pbvh-split: Multithread pbvh texture building
I had to use the original threading API and ThreadQueue for this.
Threads pull nodes to split from a
thread queue and push any new nodes onto the queue
for other threads to further split.

I'm thinking of trying this approach out for PBVH building in
general.  It cut the build time for texture leaves in half.
2022-05-09 18:11:22 -07:00
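
A standard-C++ illustration of the pattern described above, using std::thread and a mutex-guarded queue rather than Blender's ThreadQueue API; the split criterion is a placeholder.

```
#include <algorithm>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

/* Sketch only: workers pop a node, split it, and push any children back onto
 * the shared queue for other workers to split further. The queue drains once
 * no worker holds an in-flight node and the queue is empty. */
struct Node {
  int depth;
};

class SplitQueue {
 public:
  void push(Node node)
  {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      queue_.push(node);
      pending_++;
    }
    cv_.notify_one();
  }

  /* Returns false once all work (including in-flight work) is finished. */
  bool pop(Node &out)
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [&] { return !queue_.empty() || pending_ == 0; });
    if (queue_.empty()) {
      return false;
    }
    out = queue_.front();
    queue_.pop();
    return true;
  }

  /* Call once per popped node, after its children (if any) were pushed. */
  void done()
  {
    std::lock_guard<std::mutex> lock(mutex_);
    if (--pending_ == 0) {
      cv_.notify_all();
    }
  }

 private:
  std::queue<Node> queue_;
  std::mutex mutex_;
  std::condition_variable cv_;
  int pending_ = 0; /* Nodes pushed but not yet fully processed. */
};

int main()
{
  SplitQueue queue;
  queue.push(Node{0});

  std::vector<std::thread> workers;
  const unsigned workers_num = std::max(2u, std::thread::hardware_concurrency());
  for (unsigned i = 0; i < workers_num; i++) {
    workers.emplace_back([&queue] {
      Node node;
      while (queue.pop(node)) {
        if (node.depth < 4) { /* Placeholder split criterion. */
          queue.push(Node{node.depth + 1});
          queue.push(Node{node.depth + 1});
        }
        queue.done();
      }
    });
  }
  for (std::thread &t : workers) {
    t.join();
  }
}
```
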
27fd506501 temp-pbvh-split: Fix pixel row splitting 2022-05-09 16:35:39 -07:00
867fe19770 Merge remote-tracking branch 'origin/master' into temp-pbvh-split 2022-05-09 11:05:32 -07:00
17c7f2e41b Merge remote-tracking branch 'origin/master' into temp-pbvh-split 2022-05-06 16:12:59 -07:00
d2fa1daea6 temp-pbvh-split: PBVH texpaint node splitting
* Texture paint now has its own special
  PBVH leaf node flag, PBVH_TexLeaf.
* There is a new version of BKE_pbvh_search_gather
  (BKE_pbvh_search_gather_ex) that takes the leaf
  test flag as an extra argument.
2022-05-06 16:08:20 -07:00
1c77f259fd temp-pbvh-split: Support pbvh debug node box drawing outside of pbvh
draw mode
2022-05-04 20:58:20 -07:00
1672 changed files with 40516 additions and 62301 deletions

8
.arcconfig Normal file
View File

@@ -0,0 +1,8 @@
{
"project_id" : "Blender",
"conduit_uri" : "https://developer.blender.org/",
"phabricator.uri" : "https://developer.blender.org/",
"git.default-relative-commit" : "origin/master",
"arc.land.update.default" : "rebase",
"arc.land.onto.default" : "master"
}

View File

@@ -236,8 +236,6 @@ ForEachMacros:
- LOOP_UNSELECTED_POINTS
- LOOP_VISIBLE_KEYS
- LOOP_VISIBLE_POINTS
- LIGHT_FOREACH_BEGIN_DIRECTIONAL
- LIGHT_FOREACH_BEGIN_LOCAL
- LISTBASE_CIRCULAR_BACKWARD_BEGIN
- LISTBASE_CIRCULAR_FORWARD_BEGIN
- LISTBASE_FOREACH

View File

@@ -1,5 +0,0 @@
${CommitTitle}
${CommitBody}
Pull Request #${PullRequestIndex}

View File

@@ -1,3 +0,0 @@
${PullRequestTitle}
Pull Request #${PullRequestIndex}

View File

@@ -1,15 +1,13 @@
name: Bug Report
about: File a bug report
labels:
- "type::Report"
- "status::Needs Triage"
- "priority::Normal"
- bug
body:
- type: markdown
attributes:
value: |
### Instructions
First time reporting? See [tips](https://wiki.blender.org/wiki/Process/Bug_Reports).
First time reporting? See [tips](https://wiki.blender.org/wiki/Process/Bug_Reports) and [walkthrough video](https://www.youtube.com/watch?v=JTD0OJq_rF4).
* Use **Help > Report a Bug** in Blender to fill system information and exact Blender version.
* Test [daily builds](https://builder.blender.org/) to verify if the issue is already fixed.
@@ -21,7 +19,6 @@ body:
id: body
attributes:
label: "Description"
hide_label: true
value: |
**System Information**
Operating system:

View File

@@ -1,10 +1,9 @@
name: Design
about: Create a design task (for developers only)
labels:
- "type::Design"
- design
body:
- type: textarea
id: body
attributes:
label: "Description"
hide_label: true

View File

@@ -1,10 +1,9 @@
name: To Do
about: Create a to do task (for developers only)
labels:
- "type::To Do"
- todo
body:
- type: textarea
id: body
attributes:
label: "Description"
hide_label: true

View File

@@ -14,4 +14,7 @@ body:
id: body
attributes:
label: "Description"
hide_label: true
value: |
Description of the problem that is addressed in the patch.
Description of the proposed solution and its implementation.

View File

@@ -1,4 +1,5 @@
This repository is only used as a mirror. Blender development happens on projects.blender.org.
This repository is only used as a mirror of git.blender.org. Blender development happens on
https://developer.blender.org.
To get started with contributing code, please see:
https://wiki.blender.org/wiki/Process/Contributing_Code

3
.github/stale.yml vendored
View File

@@ -15,7 +15,8 @@ staleLabel: stale
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
This issue has been automatically closed, because this repository is only
used as a mirror. Blender development happens on projects.blender.org.
used as a mirror of git.blender.org. Blender development happens on
developer.blender.org.
To get started contributing code, please read:
https://wiki.blender.org/wiki/Process/Contributing_Code

8
.gitmodules vendored
View File

@@ -1,20 +1,20 @@
[submodule "release/scripts/addons"]
path = release/scripts/addons
url = ../blender-addons.git
branch = main
branch = master
ignore = all
[submodule "release/scripts/addons_contrib"]
path = release/scripts/addons_contrib
url = ../blender-addons-contrib.git
branch = main
branch = master
ignore = all
[submodule "release/datafiles/locale"]
path = release/datafiles/locale
url = ../blender-translations.git
branch = main
branch = master
ignore = all
[submodule "source/tools"]
path = source/tools
url = ../blender-dev-tools.git
branch = main
branch = master
ignore = all

View File

@@ -167,26 +167,14 @@ get_blender_version()
option(WITH_BLENDER "Build blender (disable to build only the blender player)" ON)
mark_as_advanced(WITH_BLENDER)
if(WIN32)
option(WITH_BLENDER_THUMBNAILER "\
Build \"BlendThumb.dll\" helper for Windows explorer integration to support extracting \
thumbnails from `.blend` files."
ON
)
if(APPLE)
# In future, can be used with `quicklookthumbnailing/qlthumbnailreply` to create file
# thumbnails for say Finder. Turn it off for now.
option(WITH_BLENDER_THUMBNAILER "Build \"blender-thumbnailer\" thumbnail extraction utility" OFF)
elseif(WIN32)
option(WITH_BLENDER_THUMBNAILER "Build \"BlendThumb.dll\" helper for Windows explorer integration" ON)
else()
set(_option_default ON)
if(APPLE)
# In future, can be used with `quicklookthumbnailing/qlthumbnailreply`
# to create file thumbnails for say Finder.
# Turn it off for now, even though it can build on APPLE, it's not likely to be useful.
set(_option_default OFF)
endif()
option(WITH_BLENDER_THUMBNAILER "\
Build stand-alone \"blender-thumbnailer\" command-line thumbnail extraction utility, \
intended for use by file-managers to extract PNG images from `.blend` files."
${_option_default}
)
unset(_option_default)
option(WITH_BLENDER_THUMBNAILER "Build \"blender-thumbnailer\" thumbnail extraction utility" ON)
endif()
option(WITH_INTERNATIONAL "Enable I18N (International fonts and text)" ON)
@@ -226,19 +214,14 @@ option(WITH_BULLET "Enable Bullet (Physics Engine)" ON)
option(WITH_SYSTEM_BULLET "Use the systems bullet library (currently unsupported due to missing features in upstream!)" )
mark_as_advanced(WITH_SYSTEM_BULLET)
option(WITH_OPENCOLORIO "Enable OpenColorIO color management" ON)
set(_option_default ON)
if(APPLE)
# There's no OpenXR runtime in sight for macOS, neither is code well
# tested there -> disable it by default.
set(_option_default OFF)
endif()
option(WITH_XR_OPENXR "Enable VR features through the OpenXR specification" ${_option_default})
if(APPLE)
option(WITH_XR_OPENXR "Enable VR features through the OpenXR specification" OFF)
mark_as_advanced(WITH_XR_OPENXR)
else()
option(WITH_XR_OPENXR "Enable VR features through the OpenXR specification" ON)
endif()
unset(_option_default)
option(WITH_GMP "Enable features depending on GMP (Exact Boolean)" ON)
# Compositor
@@ -370,13 +353,12 @@ else()
set(WITH_COREAUDIO OFF)
endif()
if(NOT WIN32)
set(_option_default ON)
if(APPLE)
set(_option_default OFF)
option(WITH_JACK "Enable JACK Support (http://www.jackaudio.org)" OFF)
else()
option(WITH_JACK "Enable JACK Support (http://www.jackaudio.org)" ON)
endif()
option(WITH_JACK "Enable JACK Support (http://www.jackaudio.org)" ${_option_default})
unset(_option_default)
option(WITH_JACK_DYNLOAD "Enable runtime dynamic JACK libraries loading" OFF)
option(WITH_JACK_DYNLOAD "Enable runtime dynamic JACK libraries loading" OFF)
else()
set(WITH_JACK OFF)
endif()
@@ -524,7 +506,7 @@ endif()
if(NOT APPLE)
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" ON)
option(WITH_CYCLES_HIP_BINARIES "Build Cycles AMD HIP binaries" OFF)
set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
endif()
@@ -625,10 +607,8 @@ mark_as_advanced(
# Vulkan
option(WITH_VULKAN_BACKEND "Enable Vulkan as graphics backend (only for development)" OFF)
option(WITH_VULKAN_GUARDEDALLOC "Use guardedalloc for host allocations done inside Vulkan (development option)" OFF)
mark_as_advanced(
WITH_VULKAN_BACKEND
WITH_VULKAN_GUARDEDALLOC
)
# Metal
@@ -1243,6 +1223,13 @@ if(WITH_OPENGL)
add_definitions(-DWITH_OPENGL)
endif()
#-----------------------------------------------------------------------------
# Configure Vulkan.
if(WITH_VULKAN_BACKEND)
list(APPEND BLENDER_GL_LIBRARIES ${VULKAN_LIBRARIES})
endif()
# -----------------------------------------------------------------------------
# Configure Metal

View File

@@ -71,13 +71,6 @@ Static Source Code Checking
* check_mypy: Checks all Python scripts using mypy,
see: source/tools/check_source/check_mypy_config.py scripts which are included.
Documentation Checking
* check_wiki_file_structure:
Check the WIKI documentation for the source-tree's file structure
matches Blender's source-code.
See: https://wiki.blender.org/wiki/Source/File_Structure
Spell Checkers
This runs the spell checker from the developer tools repositor.
@@ -299,11 +292,7 @@ else
ifneq ("$(wildcard $(DEPS_BUILD_DIR)/build.ninja)","")
DEPS_BUILD_COMMAND:=ninja
else
ifeq ($(OS), Darwin)
DEPS_BUILD_COMMAND:=make -s
else
DEPS_BUILD_COMMAND:="$(BLENDER_DIR)/build_files/build_environment/linux/make_deps_wrapper.sh" -s
endif
DEPS_BUILD_COMMAND:=make -s
endif
endif
@@ -402,7 +391,7 @@ endif
deps: .FORCE
@echo
@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\", install to \"$(DEPS_INSTALL_DIR)\"
@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\"
@cmake -H"$(DEPS_SOURCE_DIR)" \
-B"$(DEPS_BUILD_DIR)" \
@@ -492,10 +481,6 @@ check_smatch: .FORCE
check_mypy: .FORCE
@$(PYTHON) "$(BLENDER_DIR)/source/tools/check_source/check_mypy.py"
check_wiki_file_structure: .FORCE
@PYTHONIOENCODING=utf_8 $(PYTHON) \
"$(BLENDER_DIR)/source/tools/check_wiki/check_wiki_file_structure.py"
check_spelling_py: .FORCE
@cd "$(BUILD_DIR)" ; \
PYTHONIOENCODING=utf_8 $(PYTHON) \

View File

@@ -24,7 +24,7 @@ Development
-----------
- [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://projects.blender.org)
- [Code Review & Bug Tracker](https://developer.blender.org)
- [Developer Forum](https://devtalk.blender.org)
- [Developer Documentation](https://wiki.blender.org)

View File

@@ -2,7 +2,7 @@
# LLVM does not switch over to cpp17 until llvm 16 and building ealier versions with
# MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
# for now.
# for now.
string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " DPCPP_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")
if(WIN32)

View File

@@ -10,7 +10,7 @@ ExternalProject_Add(external_epoxy
URL_HASH ${EPOXY_HASH_TYPE}=${EPOXY_HASH}
PREFIX ${BUILD_DIR}/epoxy
PATCH_COMMAND ${PATCH_CMD} -p 1 -N -d ${BUILD_DIR}/epoxy/src/external_epoxy/ < ${PATCH_DIR}/epoxy.diff
CONFIGURE_COMMAND ${CONFIGURE_ENV} && ${MESON} setup --prefix ${LIBDIR}/epoxy --default-library ${EPOXY_LIB_TYPE} --libdir lib ${BUILD_DIR}/epoxy/src/external_epoxy-build ${BUILD_DIR}/epoxy/src/external_epoxy -Dtests=false ${MESON_BUILD_TYPE}
CONFIGURE_COMMAND ${CONFIGURE_ENV} && ${MESON} setup --prefix ${LIBDIR}/epoxy --default-library ${EPOXY_LIB_TYPE} --libdir lib ${BUILD_DIR}/epoxy/src/external_epoxy-build ${BUILD_DIR}/epoxy/src/external_epoxy -Dtests=false
BUILD_COMMAND ninja
INSTALL_COMMAND ninja install
)

View File

@@ -9,7 +9,7 @@ ExternalProject_Add(external_fribidi
URL_HASH ${FRIBIDI_HASH_TYPE}=${FRIBIDI_HASH}
DOWNLOAD_DIR ${DOWNLOAD_DIR}
PREFIX ${BUILD_DIR}/fribidi
CONFIGURE_COMMAND ${MESON} setup --prefix ${LIBDIR}/fribidi ${MESON_BUILD_TYPE} -Ddocs=false --default-library static --libdir lib ${BUILD_DIR}/fribidi/src/external_fribidi-build ${BUILD_DIR}/fribidi/src/external_fribidi
CONFIGURE_COMMAND ${MESON} setup --prefix ${LIBDIR}/fribidi -Ddocs=false --default-library static --libdir lib ${BUILD_DIR}/fribidi/src/external_fribidi-build ${BUILD_DIR}/fribidi/src/external_fribidi
BUILD_COMMAND ninja
INSTALL_COMMAND ninja install
INSTALL_DIR ${LIBDIR}/fribidi

View File

@@ -22,7 +22,7 @@ elseif(UNIX AND NOT APPLE)
)
endif()
# Boolean crashes with Arm assembly, see #103423.
# Boolean crashes with Arm assembly, see T103423.
if(BLENDER_PLATFORM_ARM)
set(GMP_OPTIONS
${GMP_OPTIONS}

View File

@@ -21,7 +21,6 @@ set(HARFBUZZ_EXTRA_OPTIONS
# Only used for command line utilities,
# disable as this would add an addition & unnecessary build-dependency.
-Dcairo=disabled
${MESON_BUILD_TYPE}
)
ExternalProject_Add(external_harfbuzz
@@ -60,10 +59,3 @@ if(BUILD_MODE STREQUAL Release AND WIN32)
DEPENDEES install
)
endif()
if(BUILD_MODE STREQUAL Debug AND WIN32)
ExternalProject_Add_Step(external_harfbuzz after_install
COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/harfbuzz/lib/libharfbuzz.a ${HARVEST_TARGET}/harfbuzz/lib/libharfbuzz_d.lib
DEPENDEES install
)
endif()

View File

@@ -40,8 +40,7 @@ ExternalProject_Add(external_igc_llvm
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
)
add_dependencies(
external_igc_llvm
@@ -56,6 +55,9 @@ ExternalProject_Add(external_igc_spirv_translator
CONFIGURE_COMMAND echo .
BUILD_COMMAND echo .
INSTALL_COMMAND echo .
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
)
add_dependencies(
external_igc_spirv_translator

View File

@@ -42,7 +42,7 @@ endif()
# LLVM does not switch over to cpp17 until llvm 16 and building ealier versions with
# MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
# for now.
# for now.
string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " LLVM_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")
# short project name due to long filename issues on windows

View File

@@ -15,7 +15,7 @@ llvm-config = '${LIBDIR}/llvm/bin/llvm-config'"
)
set(MESA_EXTRA_FLAGS
${MESON_BUILD_TYPE}
-Dbuildtype=release
-Dc_args=${MESA_CFLAGS}
-Dcpp_args=${MESA_CXXFLAGS}
-Dc_link_args=${MESA_LDFLAGS}

View File

@@ -16,10 +16,8 @@ message("BuildMode = ${BUILD_MODE}")
if(BUILD_MODE STREQUAL "Debug")
set(LIBDIR ${CMAKE_CURRENT_BINARY_DIR}/Debug)
set(MESON_BUILD_TYPE -Dbuildtype=debug)
else()
set(LIBDIR ${CMAKE_CURRENT_BINARY_DIR}/Release)
set(MESON_BUILD_TYPE -Dbuildtype=release)
endif()
set(DOWNLOAD_DIR "${CMAKE_CURRENT_BINARY_DIR}/downloads" CACHE STRING "Path for downloaded files")

View File

@@ -88,19 +88,6 @@ else()
export LDFLAGS=${PYTHON_LDFLAGS} &&
export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)
# NOTE: untested on APPLE so far.
if(NOT APPLE)
set(PYTHON_CONFIGURE_EXTRA_ARGS
${PYTHON_CONFIGURE_EXTRA_ARGS}
# Used on most release Linux builds (Fedora for e.g.),
# increases build times noticeably with the benefit of a modest speedup at runtime.
--enable-optimizations
# While LTO is OK when building on the same system, it's incompatible across GCC versions,
# making it impractical for developers to build against, so keep it disabled.
# `--with-lto`
)
endif()
ExternalProject_Add(external_python
URL file://${PACKAGE_DIR}/${PYTHON_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}

View File

@@ -10,9 +10,9 @@ if(WIN32)
DOWNLOAD_DIR ${DOWNLOAD_DIR}
URL_HASH ${SSL_HASH_TYPE}=${SSL_HASH}
PREFIX ${BUILD_DIR}/ssl
CONFIGURE_COMMAND echo "."
BUILD_COMMAND echo "."
INSTALL_COMMAND echo "."
CONFIGURE_COMMAND echo "."
BUILD_COMMAND echo "."
INSTALL_COMMAND echo "."
INSTALL_DIR ${LIBDIR}/ssl
)
else()
@@ -46,4 +46,4 @@ else()
INSTALL_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/ssl/src/external_ssl/ && make install
INSTALL_DIR ${LIBDIR}/ssl
)
endif()
endif()

View File

@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
# compiler, the versions used are taken from the following location
# https://github.com/intel/intel-graphics-compiler/releases
set(IGC_VERSION 1.0.13064.7)
set(IGC_VERSION 1.0.12149.1)
set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
set(IGC_HASH_TYPE SHA256)
set(IGC_FILE igc-${IGC_VERSION}.tar.gz)
@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
#
# WARNING WARNING WARNING
set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)
set(IGC_VCINTRINSICS_VERSION v0.11.0)
set(IGC_VCINTRINSICS_VERSION v0.5.0)
set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)
@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
### Intel Graphics Compiler DEPS END ###
########################################
set(GMMLIB_VERSION intel-gmmlib-22.3.0)
set(GMMLIB_VERSION intel-gmmlib-22.1.8)
set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
set(GMMLIB_HASH_TYPE SHA256)
set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
set(OCLOC_VERSION 22.49.25018.21)
set(OCLOC_VERSION 22.38.24278)
set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
set(OCLOC_HASH_TYPE SHA256)
set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

View File

@@ -13,7 +13,7 @@ ExternalProject_Add(external_wayland
# NOTE: `-lm` is needed for `libxml2` which is a static library that uses `libm.so`,
# without this, math symbols such as `floor` aren't found.
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env PKG_CONFIG_PATH=${LIBDIR}/expat/lib/pkgconfig:${LIBDIR}/xml2/lib/pkgconfig:${LIBDIR}/ffi/lib/pkgconfig:$PKG_CONFIG_PATH
${MESON} --prefix ${LIBDIR}/wayland ${MESON_BUILD_TYPE} -Ddocumentation=false -Dtests=false -D "c_link_args=-L${LIBDIR}/ffi/lib -lm" . ../external_wayland
${MESON} --prefix ${LIBDIR}/wayland -Ddocumentation=false -Dtests=false -D "c_link_args=-L${LIBDIR}/ffi/lib -lm" . ../external_wayland
BUILD_COMMAND ninja
INSTALL_COMMAND ninja install
)

View File

@@ -7,7 +7,7 @@ ExternalProject_Add(external_wayland_protocols
PREFIX ${BUILD_DIR}/wayland-protocols
# Use `-E` so the `PKG_CONFIG_PATH` can be defined to link against our own WAYLAND.
CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env PKG_CONFIG_PATH=${LIBDIR}/wayland/lib64/pkgconfig:$PKG_CONFIG_PATH
${MESON} --prefix ${LIBDIR}/wayland-protocols ${MESON_BUILD_TYPE} . ../external_wayland_protocols -Dtests=false
${MESON} --prefix ${LIBDIR}/wayland-protocols . ../external_wayland_protocols -Dtests=false
BUILD_COMMAND ninja
INSTALL_COMMAND ninja install
)

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-or-later
if(WIN32)
set(XML2_EXTRA_ARGS
set(XML2_EXTRA_ARGS
-DLIBXML2_WITH_ZLIB=OFF
-DLIBXML2_WITH_LZMA=OFF
-DLIBXML2_WITH_PYTHON=OFF

View File

@@ -1,74 +0,0 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: GPL-2.0-or-later
# This script ensures:
# - One dependency is built at a time.
# - That dependency uses all available cores.
#
# Without this, simply calling `make -j$(nproc)` from the `${CMAKE_BUILD_DIR}/deps/`
# directory will build many projects at once.
#
# This is undesirable for the following reasons:
#
# - The output from projects is mixed together,
# making it difficult to track down the cause of a build failure.
#
# - Larger dependencies such as LLVM can bottleneck the build process,
# making it necessary to cancel the build and manually run build commands in each directory.
#
# - Building many projects at once means canceling (Control-C) can lead to the build being in an undefined state.
# It's possible canceling happens as a patch is being applied or files are being copied.
# (steps that aren't part of the compilation process where it's typically safe to cancel).
if [[ -z "$MY_MAKE_CALL_LEVEL" ]]; then
export MY_MAKE_CALL_LEVEL=0
export MY_MAKEFLAGS=$MAKEFLAGS
# Extract the jobs argument (`-jN`, `-j N`, `--jobs=N`).
add_next=0
for i in "$@"; do
case $i in
-j*)
export MY_JOBS_ARG=$i
if [ "$MY_JOBS_ARG" = "-j" ]; then
add_next=1
fi
;;
--jobs=*)
shift # past argument=value
MY_JOBS_ARG=$i
;;
*)
if (( add_next == 1 )); then
MY_JOBS_ARG="$MY_JOBS_ARG $i"
add_next=0
fi
;;
esac
done
unset i add_next
if [[ -z "$MY_JOBS_ARG" ]]; then
MY_JOBS_ARG="-j$(nproc)"
fi
export MY_JOBS_ARG
# Support user defined `MAKEFLAGS`.
export MAKEFLAGS="$MY_MAKEFLAGS -j1"
else
export MY_MAKE_CALL_LEVEL=$(( MY_MAKE_CALL_LEVEL + 1 ))
if (( MY_MAKE_CALL_LEVEL == 1 )); then
# Important to set jobs to 1, otherwise user defined jobs argument is used.
export MAKEFLAGS="$MY_MAKEFLAGS -j1"
elif (( MY_MAKE_CALL_LEVEL == 2 )); then
# This is the level used by each sub-project.
export MAKEFLAGS="$MY_MAKEFLAGS $MY_JOBS_ARG"
fi
# Else leave `MY_MAKEFLAGS` flags as-is, avoids setting a high number of jobs on recursive
# calls (which may easily run out of memory). Let the job-server handle the rest.
fi
# Useful for troubleshooting the wrapper.
# echo "Call level: $MY_MAKE_CALL_LEVEL, args=$@".
# Call actual make but ensure recursive calls run via this script.
exec make MAKE="$0" "$@"

View File

@@ -1,7 +1,7 @@
diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
--- external_igc_opencl_clang.orig/CMakeLists.txt 2022-03-16 05:51:10 -0600
+++ external_igc_opencl_clang/CMakeLists.txt 2022-05-23 10:40:09 -0600
@@ -147,22 +147,24 @@
@@ -126,22 +126,24 @@
)
endif()

View File

@@ -24,7 +24,7 @@ SET(_moltenvk_SEARCH_DIRS
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/moltenvk)
SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/vulkan/MoltenVK)
ENDIF()
FIND_PATH(MOLTENVK_INCLUDE_DIR

View File

@@ -1,63 +0,0 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2023 Blender Foundation.
# - Find ShaderC libraries
# Find the ShaderC includes and libraries
# This module defines
# SHADERC_INCLUDE_DIRS, where to find MoltenVK headers, Set when
# SHADERC_INCLUDE_DIR is found.
# SHADERC_LIBRARIES, libraries to link against to use ShaderC.
# SHADERC_ROOT_DIR, The base directory to search for ShaderC.
# This can also be an environment variable.
# SHADERC_FOUND, If false, do not try to use ShaderC.
#
# If SHADERC_ROOT_DIR was defined in the environment, use it.
IF(NOT SHADERC_ROOT_DIR AND NOT $ENV{SHADERC_ROOT_DIR} STREQUAL "")
SET(SHADERC_ROOT_DIR $ENV{SHADERC_ROOT_DIR})
ENDIF()
SET(_shaderc_SEARCH_DIRS
${SHADERC_ROOT_DIR}
)
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_shaderc_SEARCH_DIRS ${_shaderc_SEARCH_DIRS} ${LIBDIR}/shaderc)
ENDIF()
FIND_PATH(SHADERC_INCLUDE_DIR
NAMES
shaderc/shaderc.h
HINTS
${_shaderc_SEARCH_DIRS}
PATH_SUFFIXES
include
)
FIND_LIBRARY(SHADERC_LIBRARY
NAMES
shaderc_combined
HINTS
${_shaderc_SEARCH_DIRS}
PATH_SUFFIXES
lib
)
# handle the QUIETLY and REQUIRED arguments and set SHADERC_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(ShaderC DEFAULT_MSG SHADERC_LIBRARY SHADERC_INCLUDE_DIR)
IF(SHADERC_FOUND)
SET(SHADERC_LIBRARIES ${SHADERC_LIBRARY})
SET(SHADERC_INCLUDE_DIRS ${SHADERC_INCLUDE_DIR})
ENDIF()
MARK_AS_ADVANCED(
SHADERC_INCLUDE_DIR
SHADERC_LIBRARY
)
UNSET(_shaderc_SEARCH_DIRS)

View File

@@ -1,63 +0,0 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright 2023 Blender Foundation.
# - Find Vulkan libraries
# Find the Vulkan includes and libraries
# This module defines
# VULKAN_INCLUDE_DIRS, where to find Vulkan headers, Set when
# VULKAN_INCLUDE_DIR is found.
# VULKAN_LIBRARIES, libraries to link against to use Vulkan.
# VULKAN_ROOT_DIR, The base directory to search for Vulkan.
# This can also be an environment variable.
# VULKAN_FOUND, If false, do not try to use Vulkan.
#
# If VULKAN_ROOT_DIR was defined in the environment, use it.
IF(NOT VULKAN_ROOT_DIR AND NOT $ENV{VULKAN_ROOT_DIR} STREQUAL "")
SET(VULKAN_ROOT_DIR $ENV{VULKAN_ROOT_DIR})
ENDIF()
SET(_vulkan_SEARCH_DIRS
${VULKAN_ROOT_DIR}
)
# FIXME: These finder modules typically don't use LIBDIR,
# this should be set by `./build_files/cmake/platform/` instead.
IF(DEFINED LIBDIR)
SET(_vulkan_SEARCH_DIRS ${_vulkan_SEARCH_DIRS} ${LIBDIR}/vulkan)
ENDIF()
FIND_PATH(VULKAN_INCLUDE_DIR
NAMES
vulkan/vulkan.h
HINTS
${_vulkan_SEARCH_DIRS}
PATH_SUFFIXES
include
)
FIND_LIBRARY(VULKAN_LIBRARY
NAMES
vulkan
HINTS
${_vulkan_SEARCH_DIRS}
PATH_SUFFIXES
lib
)
# handle the QUIETLY and REQUIRED arguments and set VULKAN_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(Vulkan DEFAULT_MSG VULKAN_LIBRARY VULKAN_INCLUDE_DIR)
IF(VULKAN_FOUND)
SET(VULKAN_LIBRARIES ${VULKAN_LIBRARY})
SET(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR})
ENDIF()
MARK_AS_ADVANCED(
VULKAN_INCLUDE_DIR
VULKAN_LIBRARY
)
UNSET(_vulkan_SEARCH_DIRS)

View File

@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)
if(MY_WC_BRANCH STREQUAL "HEAD")
# Detached HEAD, check whether commit hash is reachable
# in the main branch
# in the master branch
execute_process(COMMAND git rev-parse --short=12 HEAD
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE MY_WC_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE _git_contains_check
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _git_contains_check STREQUAL "")
set(MY_WC_BRANCH "main")
set(MY_WC_BRANCH "master")
else()
execute_process(COMMAND git show-ref --tags -d
WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(_git_tag_hashes MATCHES "${_git_head_hash}")
set(MY_WC_BRANCH "main")
set(MY_WC_BRANCH "master")
else()
execute_process(COMMAND git branch --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}

View File

@@ -85,7 +85,7 @@ if(NOT APPLE)
set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES_HIP_BINARIES OFF CACHE BOOL "" FORCE)
set(WITH_CYCLES_HIP_BINARIES ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_DEVICE_ONEAPI ON CACHE BOOL "" FORCE)
set(WITH_CYCLES_ONEAPI_BINARIES ON CACHE BOOL "" FORCE)
endif()

View File

@@ -11,11 +11,11 @@
mkdir ~/blender-git
cd ~/blender-git
git clone https://projects.blender.org/blender/blender.git
git clone http://git.blender.org/blender.git
cd blender
git submodule update --init --recursive
git submodule foreach git checkout main
git submodule foreach git pull --rebase origin main
git submodule foreach git checkout master
git submodule foreach git pull --rebase origin master
# create build dir
mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
echo ""
echo "* Useful Commands *"
echo " Run Blender: ~/blender-git/blender/blender.bin"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
echo " Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
echo " Build Blender: cd ~/blender-git/build-cmake ; make"
echo ""

View File

@@ -544,7 +544,7 @@ endfunction()
function(setup_platform_linker_libs
target
)
# jemalloc must be early in the list, to be before pthread (see #57998).
# jemalloc must be early in the list, to be before pthread (see T57998)
if(WITH_MEM_JEMALLOC)
target_link_libraries(${target} ${JEMALLOC_LIBRARIES})
endif()

View File

@@ -97,8 +97,20 @@ add_bundled_libraries(materialx/lib)
if(WITH_VULKAN_BACKEND)
find_package(MoltenVK REQUIRED)
find_package(ShaderC REQUIRED)
find_package(Vulkan REQUIRED)
if(EXISTS ${LIBDIR}/vulkan)
set(VULKAN_FOUND On)
set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan/macOS)
set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/libvulkan.1.dylib)
set(SHADERC_LIBRARY ${VULKAN_ROOT_DIR}/lib/libshaderc_combined.a)
set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR} ${MOLTENVK_INCLUDE_DIRS})
set(VULKAN_LIBRARIES ${VULKAN_LIBRARY} ${SHADERC_LIBRARY} ${MOLTENVK_LIBRARIES})
else()
message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
set(WITH_VULKAN_BACKEND OFF)
endif()
endif()
if(WITH_OPENSUBDIV)
@@ -440,7 +452,7 @@ string(APPEND PLATFORM_LINKFLAGS " -stdlib=libc++")
# Make stack size more similar to Embree, required for Embree.
string(APPEND PLATFORM_LINKFLAGS_EXECUTABLE " -Wl,-stack_size,0x100000")
# Suppress ranlib "has no symbols" warnings (workaround for #48250).
# Suppress ranlib "has no symbols" warnings (workaround for T48250)
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
# llvm-ranlib doesn't support this flag. Xcode's libtool does.

View File

@@ -111,7 +111,6 @@ find_package_wrapper(Epoxy REQUIRED)
if(WITH_VULKAN_BACKEND)
find_package_wrapper(Vulkan REQUIRED)
find_package_wrapper(ShaderC REQUIRED)
endif()
function(check_freetype_for_brotli)

View File

@@ -121,7 +121,7 @@ if(WITH_WINDOWS_BUNDLE_CRT)
include(InstallRequiredSystemLibraries)
# ucrtbase(d).dll cannot be in the manifest, due to the way windows 10 handles
# redirects for this dll, for details see #88813.
# redirects for this dll, for details see T88813.
foreach(lib ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS})
string(FIND ${lib} "ucrtbase" pos)
if(NOT pos EQUAL -1)
@@ -295,7 +295,7 @@ unset(MATERIALX_LIB_FOLDER_EXISTS)
if(NOT MSVC_CLANG AND # Available with MSVC 15.7+ but not for CLANG.
NOT WITH_WINDOWS_SCCACHE AND # And not when sccache is enabled
NOT VS_CLANG_TIDY) # Clang-tidy does not like these options
add_compile_options(/experimental:external /external:I "${LIBDIR}" /external:W0)
add_compile_options(/experimental:external /external:templates- /external:I "${LIBDIR}" /external:W0)
endif()
# Add each of our libraries to our cmake_prefix_path so find_package() could work

View File

@@ -5,16 +5,16 @@
update-code:
git:
submodules:
- branch: main
- branch: master
commit_id: HEAD
path: release/scripts/addons
- branch: main
- branch: master
commit_id: HEAD
path: release/scripts/addons_contrib
- branch: main
- branch: master
commit_id: HEAD
path: release/datafiles/locale
- branch: main
- branch: master
commit_id: HEAD
path: source/tools
svn:
@@ -63,7 +63,7 @@ buildbot:
optix:
version: '7.3.0'
ocloc:
version: '101.4032'
version: '101.3430'
cmake:
default:
version: any

View File

@@ -24,7 +24,7 @@ import os
import re
import platform
import string
import setuptools
import setuptools # type: ignore
import sys
from typing import (
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
## Source Code
* [Releases](https://download.blender.org/source/)
* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)
* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
## Credits
@@ -208,7 +208,7 @@ def main() -> None:
return paths
# Ensure this wheel is marked platform specific.
class BinaryDistribution(setuptools.dist.Distribution):
class BinaryDistribution(setuptools.dist.Distribution): # type: ignore
def has_ext_modules(self) -> bool:
return True

View File

@@ -13,10 +13,10 @@ import sys
import make_utils
from make_utils import call
# Parse arguments.
# Parse arguments
def parse_arguments() -> argparse.Namespace:
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--ctest-command", default="ctest")
parser.add_argument("--cmake-command", default="cmake")

View File

@@ -42,7 +42,6 @@ def parse_arguments() -> argparse.Namespace:
parser.add_argument("--svn-branch", default=None)
parser.add_argument("--git-command", default="git")
parser.add_argument("--use-linux-libraries", action="store_true")
parser.add_argument("--architecture", type=str, choices=("x86_64", "amd64", "arm64",))
return parser.parse_args()
@@ -52,17 +51,6 @@ def get_blender_git_root() -> str:
# Setup for precompiled libraries and tests from svn.
def get_effective_architecture(args: argparse.Namespace):
if args.architecture:
return args.architecture
# Check platform.version to detect arm64 with x86_64 python binary.
if "ARM64" in platform.version():
return "arm64"
return platform.machine().lower()
def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None:
svn_non_interactive = [args.svn_command, '--non-interactive']
@@ -70,11 +58,11 @@ def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None
svn_url = make_utils.svn_libraries_base_url(release_version, args.svn_branch)
# Checkout precompiled libraries
architecture = get_effective_architecture(args)
if sys.platform == 'darwin':
if architecture == 'arm64':
# Check platform.version to detect arm64 with x86_64 python binary.
if platform.machine() == 'arm64' or ('ARM64' in platform.version()):
lib_platform = "darwin_arm64"
elif architecture == 'x86_64':
elif platform.machine() == 'x86_64':
lib_platform = "darwin"
else:
lib_platform = None
@@ -182,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
return "rebase or merge in progress, complete it first"
# Abort if uncommitted changes.
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
if len(changes) != 0:
return "you have unstaged changes"
@@ -214,8 +202,8 @@ def submodules_update(
sys.exit(1)
# Update submodules to appropriate given branch,
# falling back to main if none is given and/or found in a sub-repository.
branch_fallback = "main"
# falling back to master if none is given and/or found in a sub-repository.
branch_fallback = "master"
if not branch:
branch = branch_fallback
@@ -268,15 +256,14 @@ if __name__ == "__main__":
blender_skip_msg = ""
submodules_skip_msg = ""
blender_version = make_utils.parse_blender_version()
if blender_version.cycle != 'alpha':
major = blender_version.version // 100
minor = blender_version.version % 100
branch = f"blender-v{major}.{minor}-release"
release_version = f"{major}.{minor}"
else:
branch = 'main'
release_version = None
# Test if we are building a specific release version.
branch = make_utils.git_branch(args.git_command)
if branch == 'HEAD':
sys.stderr.write('Blender git repository is in detached HEAD state, must be in a branch\n')
sys.exit(1)
tag = make_utils.git_tag(args.git_command)
release_version = make_utils.git_branch_release_version(branch, tag)
if not args.no_libraries:
svn_update(args, release_version)

View File

@@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
if not "%GIT%" == "" (
"%GIT%" submodule update --init --recursive --progress
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git checkout main
"%GIT%" submodule foreach git checkout master
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git pull --rebase origin main
"%GIT%" submodule foreach git pull --rebase origin master
if errorlevel 1 goto FAIL
goto EOF
) else (

View File

@@ -38,7 +38,7 @@ PROJECT_NAME = Blender
# could be handy for archiving the generated documentation or if some version
# control system is used.
PROJECT_NUMBER = V3.6
PROJECT_NUMBER = V3.5
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a

View File

@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
# BLF drawing routine
font_id = font_info["font_id"]
blf.position(font_id, 2, 80, 0)
blf.size(font_id, 50)
blf.size(font_id, 50, 72)
blf.draw(font_id, "Hello World")

View File

@@ -476,7 +476,7 @@ MODULE_GROUPING = {
# -------------------------------BLENDER----------------------------------------
# Converting bytes to strings, due to #30154.
# converting bytes to strings, due to T30154
BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
BLENDER_REVISION_TIMESTAMP = bpy.app.build_commit_timestamp
@@ -487,7 +487,7 @@ BLENDER_VERSION_DOTS = "%d.%d" % (bpy.app.version[0], bpy.app.version[1])
if BLENDER_REVISION != "Unknown":
# SHA1 Git hash
BLENDER_VERSION_HASH = BLENDER_REVISION
BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://projects.blender.org/blender/blender/commit/%s>%s</a>" % (
BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://developer.blender.org/rB%s>%s</a>" % (
BLENDER_VERSION_HASH, BLENDER_VERSION_HASH,
)
BLENDER_VERSION_DATE = time.strftime("%d/%m/%Y", time.localtime(BLENDER_REVISION_TIMESTAMP))
@@ -647,7 +647,7 @@ def undocumented_message(module_name, type_name, identifier):
module_name, type_name, identifier,
)
return "Undocumented, consider `contributing <https://developer.blender.org/>`__."
return "Undocumented, consider `contributing <https://developer.blender.org/T51061>`__."
def range_str(val):
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):
# operators
def write_ops():
API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"
API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
op_modules = {}
op = None
@@ -2200,7 +2200,7 @@ def write_rst_enum_items(basepath, key, key_no_prefix, enum_items):
Write a single page for a static enum in RST.
This helps avoiding very large lists being in-lined in many places which is an issue
especially with icons in ``bpy.types.UILayout``. See #87008.
especially with icons in ``bpy.types.UILayout``. See T87008.
"""
filepath = os.path.join(basepath, "%s.rst" % key_no_prefix)
with open(filepath, "w", encoding="utf-8") as fh:

View File

@@ -156,7 +156,7 @@ var Popover = function() {
},
getNamed : function(v) {
$.each(all_versions, function(ix, title) {
if (ix === "master" || ix === "main" || ix === "latest") {
if (ix === "master" || ix === "latest") {
var m = title.match(/\d\.\d[\w\d\.]*/)[0];
if (parseFloat(m) == v) {
v = ix;

View File

@@ -1,5 +1,5 @@
Project: Blender
URL: https://projects.blender.org/blender/blender.git
URL: https://git.blender.org/blender.git
License: Apache 2.0
Upstream version: N/A
Local modifications: None


@@ -7,7 +7,6 @@ set(INC
set(INC_SYS
${VULKAN_INCLUDE_DIRS}
${MOLTENVK_INCLUDE_DIRS}
)
set(SRC


@@ -1,15 +0,0 @@
diff --git a/extern/vulkan_memory_allocator/vk_mem_alloc.h b/extern/vulkan_memory_allocator/vk_mem_alloc.h
index 60f572038c0..63a9994ba46 100644
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
@@ -13371,8 +13371,8 @@ bool VmaDefragmentationContext_T::IncrementCounters(VkDeviceSize bytes)
// Early return when max found
if (++m_PassStats.allocationsMoved >= m_MaxPassAllocations || m_PassStats.bytesMoved >= m_MaxPassBytes)
{
- VMA_ASSERT(m_PassStats.allocationsMoved == m_MaxPassAllocations ||
- m_PassStats.bytesMoved == m_MaxPassBytes && "Exceeded maximal pass threshold!");
+ VMA_ASSERT((m_PassStats.allocationsMoved == m_MaxPassAllocations ||
+ m_PassStats.bytesMoved == m_MaxPassBytes) && "Exceeded maximal pass threshold!");
return true;
}
return false;

File diff suppressed because it is too large.


@@ -12,7 +12,6 @@ from bpy.props import (
PointerProperty,
StringProperty,
)
from bpy.app.translations import pgettext_iface as iface_
from math import pi
@@ -1665,51 +1664,30 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.label(text="No compatible GPUs found for Cycles", icon='INFO')
if device_type == 'CUDA':
compute_capability = "3.0"
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
icon='BLANK1', translate=False)
col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
elif device_type == 'OPTIX':
compute_capability = "5.0"
driver_version = "470"
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
icon='BLANK1', translate=False)
col.label(text="and NVIDIA driver version %s or newer" % driver_version,
icon='BLANK1', translate=False)
col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
elif device_type == 'HIP':
if True:
col.label(text="HIP temporarily disabled due to compiler bugs", icon='BLANK1')
else:
import sys
if sys.platform[:3] == "win":
driver_version = "21.Q4"
col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
icon='BLANK1', translate=False)
elif sys.platform.startswith("linux"):
driver_version = "22.10"
col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
translate=False)
import sys
if sys.platform[:3] == "win":
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
elif sys.platform.startswith("linux"):
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
elif device_type == 'ONEAPI':
import sys
if sys.platform.startswith("win"):
driver_version = "101.4032"
col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
icon='BLANK1', translate=False)
col.label(text="and Windows driver version 101.3430 or newer", icon='BLANK1')
elif sys.platform.startswith("linux"):
driver_version = "1.3.24931"
col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
col.label(text=iface_(" - intel-level-zero-gpu version %s or newer") % driver_version,
icon='BLANK1', translate=False)
col.label(text=" - intel-level-zero-gpu version 1.3.23904 or newer", icon='BLANK1')
col.label(text=" - oneAPI Level-Zero Loader", icon='BLANK1')
elif device_type == 'METAL':
silicon_mac_version = "12.2"
amd_mac_version = "12.3"
col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
icon='BLANK1', translate=False)
col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
translate=False)
col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
return
for device in devices:
@@ -1745,21 +1723,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
if compute_device_type == 'METAL':
import platform
import re
is_navi_2 = False
for device in devices:
if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
is_navi_2 = True
break
# MetalRT only works on Apple Silicon and Navi2.
is_arm64 = platform.machine() == 'arm64'
if is_arm64 or is_navi_2:
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
if platform.machine() == 'arm64':
col = layout.column()
col.use_property_split = True
# Kernel specialization is only supported on Apple Silicon
if is_arm64:
col.prop(self, "kernel_optimization_level")
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
def draw(self, context):


@@ -20,7 +20,7 @@ class CyclesPresetPanel(PresetPanel, Panel):
@staticmethod
def post_cb(context):
# Modify an arbitrary built-in scene property to force a depsgraph
# update, because add-on properties don't. (see #62325)
# update, because add-on properties don't. (see T62325)
render = context.scene.render
render.filter_size = render.filter_size


@@ -105,12 +105,11 @@ GPUShader *BlenderFallbackDisplayShader::bind(int width, int height)
/* Bind shader now to enable uniform assignment. */
GPU_shader_bind(shader_program_);
int slot = 0;
GPU_shader_uniform_int_ex(shader_program_, image_texture_location_, 1, 1, &slot);
GPU_shader_uniform_int(shader_program_, image_texture_location_, 0);
float size[2];
size[0] = width;
size[1] = height;
GPU_shader_uniform_float_ex(shader_program_, fullscreen_location_, 2, 1, size);
GPU_shader_uniform_vector(shader_program_, fullscreen_location_, 2, 1, size);
return shader_program_;
}


@@ -20,7 +20,7 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
: b_image(b_image),
frame(frame),
tile_number(tile_number),
/* Don't free cache for preview render to avoid race condition from #93560, to be fixed
/* Don't free cache for preview render to avoid race condition from T93560, to be fixed
* properly later as we are close to release. */
free_cache(!is_preview_render && !b_image.has_data())
{
@@ -72,7 +72,7 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat
metadata.colorspace = u_colorspace_raw;
}
else {
/* In some cases (e.g. #94135), the colorspace setting in Blender gets updated as part of the
/* In some cases (e.g. T94135), the colorspace setting in Blender gets updated as part of the
* metadata queries in this function, so update the colorspace setting here. */
PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
metadata.colorspace = get_enum_identifier(colorspace_ptr, "name");


@@ -24,7 +24,7 @@ void BlenderSync::sync_light(BL::Object &b_parent,
Light *light = light_map.find(key);
/* Check if the transform was modified, in case a linked collection is moved we do not get a
* specific depsgraph update (#88515). This also mimics the behavior for Objects. */
* specific depsgraph update (T88515). This also mimics the behavior for Objects. */
const bool tfm_updated = (light && light->get_tfm() != tfm);
/* Update if either object or light data changed. */
@@ -48,8 +48,6 @@ void BlenderSync::sync_light(BL::Object &b_parent,
case BL::Light::type_SPOT: {
BL::SpotLight b_spot_light(b_light);
light->set_size(b_spot_light.shadow_soft_size());
light->set_axisu(transform_get_column(&tfm, 0));
light->set_axisv(transform_get_column(&tfm, 1));
light->set_light_type(LIGHT_SPOT);
light->set_spot_angle(b_spot_light.spot_size());
light->set_spot_smooth(b_spot_light.spot_blend());


@@ -94,7 +94,7 @@ void python_thread_state_restore(void **python_thread_state)
*python_thread_state = NULL;
}
static const char *PyC_UnicodeAsBytes(PyObject *py_str, PyObject **coerce)
static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
{
const char *result = PyUnicode_AsUTF8(py_str);
if (result) {
@@ -131,8 +131,8 @@ static PyObject *init_func(PyObject * /*self*/, PyObject *args)
}
PyObject *path_coerce = nullptr, *user_path_coerce = nullptr;
path_init(PyC_UnicodeAsBytes(path, &path_coerce),
PyC_UnicodeAsBytes(user_path, &user_path_coerce));
path_init(PyC_UnicodeAsByte(path, &path_coerce),
PyC_UnicodeAsByte(user_path, &user_path_coerce));
Py_XDECREF(path_coerce);
Py_XDECREF(user_path_coerce);


@@ -404,7 +404,7 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
* point we know that we've got everything to render current view layer.
*/
/* At the moment we only free if we are not doing multi-view
* (or if we are rendering the last view). See #58142/D4239 for discussion.
* (or if we are rendering the last view). See T58142/D4239 for discussion.
*/
if (view_index == num_views - 1) {
free_blender_memory_if_possible();


@@ -766,7 +766,7 @@ void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
(BlenderSession::headless || is_interface_locked) &&
/* Baking re-uses the depsgraph multiple times, clearing crashes
* reading un-evaluated mesh data which isn't aligned with the
* geometry we're baking, see #71012. */
* geometry we're baking, see T71012. */
!scene->bake_manager->get_baking() &&
* Persistent data must maintain caches for performance and correctness. */
!is_persistent_data;


@@ -42,15 +42,12 @@ endif()
###########################################################################
if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
set(WITH_CYCLES_HIP_BINARIES OFF)
message(STATUS "HIP temporarily disabled due to compiler bugs")
find_package(HIP)
set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
# find_package(HIP)
# set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
# if(HIP_FOUND)
# message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
# endif()
if(HIP_FOUND)
message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
endif()
endif()
if(NOT WITH_HIP_DYNLOAD)


@@ -53,12 +53,8 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: GPUDevice(info, stats, profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
first_error = true;
cuDevId = info.num;
@@ -69,6 +65,12 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@@ -89,9 +91,8 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
int value;
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
can_map_host = value != 0;
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -498,57 +499,311 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
void CUDADevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(cuda_mem_map_mutex);
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if (mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
}
if (mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(cuda_mem_map_mutex);
CUDAMem *cmem = &cuda_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
void CUDADevice::generic_copy_to(device_memory &mem)
{
CUDAContextScope scope(this);
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
return mem_alloc_result == CUDA_SUCCESS;
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
}
void CUDADevice::free_device(void *device_pointer)
void CUDADevice::generic_free(device_memory &mem)
{
CUDAContextScope scope(this);
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
cuMemFreeHost(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuda_assert(cuMemFree(mem.device_pointer));
}
CUresult mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
return mem_alloc_result == CUDA_SUCCESS;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
void CUDADevice::free_host(void *shared_pointer)
{
CUDAContextScope scope(this);
cuMemFreeHost(shared_pointer);
}
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
CUDAContextScope scope(this);
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
return true;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
}
void CUDADevice::mem_alloc(device_memory &mem)
@@ -613,8 +868,8 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
@@ -739,19 +994,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}
Mem *cmem = NULL;
CUDAMem *cmem = NULL;
CUarray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
cmem->array = array_3d;
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -795,10 +1050,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
cmem->texobject = 0;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
cmem->array = array_3d;
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -882,8 +1137,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -898,9 +1153,9 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -909,16 +1164,16 @@ void CUDADevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
cuArrayDestroy(cmem.array);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
else {
lock.unlock();
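
As a reading aid for the long CUDADevice::generic_alloc() listing above, the following stand-alone C++ sketch condenses its decision order (device memory first, then mapped host memory, then failure). Every name in it, including AllocPlacement and plan_allocation, is hypothetical and not part of the Cycles sources; the texture-eviction retry step is only noted in a comment.

```cpp
#include <cstddef>

/* Hypothetical condensation of the fallback order in CUDADevice::generic_alloc() above;
 * none of these names exist in the Cycles sources. */
enum class AllocPlacement { Device, MappedHost, Failed };

struct AllocState {
  std::size_t free_device = 0;    /* from cuMemGetInfo() */
  std::size_t headroom = 0;       /* device_texture_headroom or device_working_headroom */
  std::size_t map_host_used = 0;  /* running total of mapped host memory */
  std::size_t map_host_limit = 0; /* computed in init_host_memory() */
  bool can_map_host = false;      /* CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY */
  bool device_only = false;       /* mem.type == MEM_DEVICE_ONLY */
};

inline AllocPlacement plan_allocation(const AllocState &s, std::size_t size)
{
  /* 1. Prefer device memory while the configured headroom stays free. */
  if (size + s.headroom < s.free_device) {
    return AllocPlacement::Device;
  }
  /* 2. (The real code first tries to evict textures to host memory and
   *    re-queries free device memory; that retry is omitted here.) */
  /* 3. Fall back to page-locked, device-mapped host memory if allowed. */
  if (s.can_map_host && !s.device_only && s.map_host_used + size < s.map_host_limit) {
    return AllocPlacement::MappedHost;
  }
  /* 4. Otherwise the allocation fails and an out-of-memory error is set. */
  return AllocPlacement::Failed;
}
```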


@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class CUDADevice : public GPUDevice {
class CUDADevice : public Device {
friend class CUDAContextScope;
@@ -29,11 +29,36 @@ class CUDADevice : public GPUDevice {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}
CUtexObject texobject;
CUarray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
CUDADeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -63,13 +88,17 @@ class CUDADevice : public GPUDevice {
void reserve_local_memory(const uint kernel_features);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
void mem_alloc(device_memory &mem) override;


@@ -452,320 +452,6 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}
GPUDevice::~GPUDevice() noexcept(false)
{
}
bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}
void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower than the texture one so there
* is space left for it. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
bool mem_alloc_result = false;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
get_device_memory_info(total, free);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);
assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}
if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}
if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
}
}
void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}
/* DeviceInfo */
CCL_NAMESPACE_END
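
The init_host_memory() listing above derives the mapped-host limit from the rule of leaving at least half of system memory or 4 GB free, whichever is smaller. The short stand-alone sketch below restates that arithmetic with two worked values; the mapped_host_limit helper is hypothetical, only the rule itself comes from the code above.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

/* Hypothetical helper restating the rule from init_host_memory() above:
 * keep min(system_ram / 2, 4 GiB) of system memory free, map the rest. */
static uint64_t mapped_host_limit(uint64_t system_ram)
{
  const uint64_t four_gib = 4ull * 1024 * 1024 * 1024;
  return system_ram - std::min(system_ram / 2, four_gib);
}

int main()
{
  /* 32 GiB of system RAM -> 28 GiB may be mapped (4 GiB stays free).
   *  6 GiB of system RAM ->  3 GiB may be mapped (half stays free). */
  printf("%llu GiB\n", (unsigned long long)(mapped_host_limit(32ull << 30) >> 30));
  printf("%llu GiB\n", (unsigned long long)(mapped_host_limit(6ull << 30) >> 30));
  return 0;
}
```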


@@ -309,93 +309,6 @@ class Device {
static uint devices_initialized_mask;
};
/* Device, which is GPU, with some common functionality for GPU backends */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}
public:
virtual ~GPUDevice() noexcept(false);
/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();
protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;
bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}
texMemObject texobject;
arrayMemObject array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter which will try to track amount of used device memory */
size_t device_mem_in_use;
virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);
/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);
/* total - amount of device memory, free - amount of available device memory */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
virtual void free_device(void *device_pointer) = 0;
virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
virtual void free_host(void *shared_pointer) = 0;
/* This function should return device pointer corresponding to shared pointer, which
* is the host buffer allocated in `alloc_host`. The function should return `true` if such
* an address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_H__ */
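
The device.h hunk above declares a GPUDevice base class whose backends only have to supply a handful of raw-memory hooks (alloc_device, alloc_host, transform_host_pointer, copy_host_to_device and friends), while the shared generic_alloc/generic_free/generic_copy_to logic stays in the base class. The self-contained C++ sketch below illustrates that shape with plain heap memory standing in for both sides; the MemoryHooks and HostOnlyBackend types are hypothetical and are not Cycles code.

```cpp
#include <cstddef>
#include <cstdlib>
#include <cstring>

/* Mirror of the hook set declared in the GPUDevice class above,
 * reduced to a stand-alone interface for illustration. */
struct MemoryHooks {
  virtual ~MemoryHooks() = default;
  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
  virtual void free_device(void *device_pointer) = 0;
  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
  virtual void free_host(void *shared_pointer) = 0;
  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};

/* Hypothetical backend that satisfies the hooks with ordinary heap memory.
 * A real backend (CUDA, HIP, ...) would call cuMemAlloc / hipMalloc etc. instead. */
struct HostOnlyBackend : MemoryHooks {
  bool alloc_device(void *&device_pointer, size_t size) override
  {
    device_pointer = std::malloc(size);
    return device_pointer != nullptr;
  }
  void free_device(void *device_pointer) override
  {
    std::free(device_pointer);
  }
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    shared_pointer = std::malloc(size);
    return shared_pointer != nullptr;
  }
  void free_host(void *shared_pointer) override
  {
    std::free(shared_pointer);
  }
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    /* With no separate address space, the "device" pointer is the host pointer. */
    device_pointer = shared_pointer;
    return true;
  }
  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    std::memcpy(device_pointer, host_pointer, size);
  }
};
```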


@@ -53,12 +53,8 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: GPUDevice(info, stats, profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));
first_error = true;
hipDevId = info.num;
@@ -69,6 +65,12 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize HIP. */
@@ -89,9 +91,7 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,58 +460,305 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
HIPContextScope scope(this);
hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}
if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
void HIPDevice::generic_copy_to(device_memory &mem)
{
HIPContextScope scope(this);
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
}
void HIPDevice::free_device(void *device_pointer)
void HIPDevice::generic_free(device_memory &mem)
{
HIPContextScope scope(this);
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
return mem_alloc_result == hipSuccess;
}
void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);
hipHostFree(shared_pointer);
}
bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);
hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}
void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);
hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
hip_mem_map.erase(hip_mem_map.find(&mem));
}
}
void HIPDevice::mem_alloc(device_memory &mem)
@@ -576,8 +823,8 @@ void HIPDevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@@ -704,19 +951,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}
Mem *cmem = NULL;
HIPMem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
cmem->array = array_3d;
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -760,10 +1007,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
cmem->texobject = 0;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
cmem->array = array_3d;
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -848,8 +1095,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@@ -864,9 +1111,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@@ -875,16 +1122,16 @@ void HIPDevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
device_mem_map.erase(device_mem_map.find(&mem));
hip_mem_map.erase(hip_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
hipArrayDestroy(cmem.array);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
hip_mem_map.erase(hip_mem_map.find(&mem));
}
else {
lock.unlock();
@@ -906,7 +1153,7 @@ bool HIPDevice::should_use_graphics_interop()
* possible, but from empirical measurements it can be considerably slower than using a naive
* pixel copy. */
/* Disable graphics interop for now, because of driver bug in 21.40. See #92972 */
/* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
# if 0
HIPContextScope scope(this);

View File

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public GPUDevice {
class HIPDevice : public Device {
friend class HIPContextScope;
@@ -26,11 +26,36 @@ class HIPDevice : public GPUDevice {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
@@ -56,13 +81,17 @@ class HIPDevice : public GPUDevice {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
void mem_alloc(device_memory &mem) override;

View File

@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
return (major >= 9);
return (major >= 10);
}
CCL_NAMESPACE_END

View File

@@ -73,10 +73,6 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

View File

@@ -247,8 +247,6 @@ class device_memory {
bool is_resident(Device *sub_device) const;
protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;

View File

@@ -21,7 +21,6 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;

View File

@@ -816,11 +816,6 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// allocate look-up buffer for the worst case scenario
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@@ -848,15 +843,12 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@@ -902,7 +894,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
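The lookup buffer built in this hunk maps each object's user instance ID (its device index) to the index of its BLAS, and is sized for the worst case of every object being traceable. A minimal standalone sketch of that mapping, with hypothetical types and field names (not the BVHMetal code itself):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

struct FakeObject {
  bool traceable;          /* non-traceable objects are skipped */
  uint32_t device_index;   /* user ID written into the instance descriptor */
  uint32_t blas_index;     /* acceleration structure this object refers to */
};

// Build a worst-case sized table: one slot per object, indexed by user ID.
std::vector<uint32_t> build_blas_lookup(const std::vector<FakeObject> &objects) {
  std::vector<uint32_t> lookup(objects.size(), 0u);
  for (const FakeObject &ob : objects) {
    if (!ob.traceable) {
      continue;  /* skipped objects simply leave their slot at the default */
    }
    assert(ob.device_index < lookup.size());
    lookup[ob.device_index] = ob.blas_index;
  }
  return lookup;
}
```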

View File

@@ -55,10 +55,6 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
devices.push_back(info);
device_index++;
}

View File

@@ -74,11 +74,6 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@@ -110,8 +105,6 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

View File

@@ -105,7 +105,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
}
case METAL_GPU_AMD: {
max_threads_per_threadgroup = 128;
use_metalrt = info.use_metalrt;
break;
}
case METAL_GPU_APPLE: {
@@ -193,10 +192,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -209,28 +204,14 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// preparing the blas arg encoder
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@@ -290,11 +271,6 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@@ -302,10 +278,6 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@@ -586,7 +558,7 @@ void MetalDevice::erase_allocation(device_memory &mem)
if (it != metal_mem_map.end()) {
MetalMem *mmem = it->second.get();
/* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
/* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
if (mmem->pointer_index >= 0) {
device_ptr *pointers = (device_ptr *)&launch_params;
pointers[mmem->pointer_index] = 0;
@@ -1259,33 +1231,6 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// allocate required buffers for BLAS array
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT -> blas_lookup.data(),
blas_lookup_buffer.allocatedSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@@ -19,8 +19,6 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@@ -30,13 +28,7 @@ enum {
METALRT_FUNC_NUM
};
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@@ -87,9 +87,6 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();
@@ -524,8 +521,6 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@@ -616,17 +611,11 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@@ -25,7 +25,6 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

View File

@@ -315,11 +315,6 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -482,12 +477,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@@ -538,10 +527,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all Accelerations resources as used */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@@ -568,24 +553,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
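The sort bucket/write cases above reserve one `int` of threadgroup memory per shader key, rounded up to 16 bytes, and assert that this stays within a 32 KB budget. A small sketch of that sizing arithmetic; `round_up` here is an assumed stand-in for Cycles' helper of the same name:

```cpp
#include <cassert>
#include <cstddef>

// Round `x` up to a multiple of `alignment` (assumed to match Cycles' round_up()).
static size_t round_up(size_t x, size_t alignment) {
  return ((x + alignment - 1) / alignment) * alignment;
}

// Threadgroup memory needed by the local-atomic sort passes: one int per shader key.
static int sort_pass_shared_mem_bytes(int max_shaders) {
  int bytes = (int)round_up(max_shaders * sizeof(int), 16);
  assert(bytes <= 32 * 1024);  /* threadgroup memory budget assumed above */
  return bytes;
}
```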

View File

@@ -64,12 +64,6 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@@ -102,15 +96,6 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@@ -126,10 +111,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
# if defined(MAC_OS_VERSION_13_0)
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
# endif

View File

@@ -377,7 +377,7 @@ void OneapiDevice::tex_alloc(device_texture &mem)
generic_alloc(mem);
generic_copy_to(mem);
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
/* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
const uint slot = mem.slot;
if (slot >= texture_info_.size()) {
texture_info_.resize(slot + 128);
@@ -631,9 +631,9 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
/* Compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
/* The same min compute-runtime version is currently required across Windows and Linux.
* For Windows driver 101.4032, compute-runtime version is 24931. */
static const int lowest_supported_driver_version_win = 1014032;
static const int lowest_supported_driver_version_neo = 24931;
* For Windows driver 101.3430, compute-runtime version is 23904. */
static const int lowest_supported_driver_version_win = 1013430;
static const int lowest_supported_driver_version_neo = 23904;
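The comment above pairs a Windows driver release (encoded as an integer, e.g. 101.3430 → 1013430) with the compute-runtime (NEO) build number reported on Linux, and treats that pair as the minimum. A hedged sketch of such a gate, using the constants from one side of this hunk and a hypothetical helper name:

```cpp
// Hypothetical version gate; the constants are the pair shown in one side of the
// hunk above, not an authoritative minimum.
static bool driver_is_supported(bool reports_windows_style_version, int driver_build_version) {
  const int lowest_supported_driver_version_win = 1013430; /* Windows driver 101.3430 */
  const int lowest_supported_driver_version_neo = 23904;   /* matching compute-runtime build */

  if (reports_windows_style_version) {
    return driver_build_version >= lowest_supported_driver_version_win;
  }
  return driver_build_version >= lowest_supported_driver_version_neo;
}
```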
int OneapiDevice::parse_driver_build_version(const sycl::device &device)
{

View File

@@ -854,14 +854,12 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@@ -909,15 +907,13 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();

View File

@@ -112,13 +112,6 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

View File

@@ -71,8 +71,6 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@@ -209,45 +207,33 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
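The two branches above size their buffers differently: the local-atomic path needs `(max_shaders + 1) * num_sort_partitions` key offsets, while the prefix-sum path needs `max_shaders * num_sort_partitions` bucket counters per shading kernel. A minimal sketch of that sizing arithmetic (a hypothetical helper, not the PathTraceWorkGPU code):

```cpp
#include <cstddef>

struct SortSizes {
  size_t partition_key_offsets;  /* local-atomic partitioned sort */
  size_t bucket_counters;        /* counting + prefix-sum sort */
};

static SortSizes integrator_sort_sizes(int max_shaders, int num_sort_partitions) {
  SortSizes sizes;
  /* One offset per shader key per partition, plus one terminator per partition. */
  sizes.partition_key_offsets = size_t(max_shaders + 1) * num_sort_partitions;
  /* One counter per shader key per partition for the counting/prefix-sum path. */
  sizes.bucket_counters = size_t(max_shaders) * num_sort_partitions;
  return sizes;
}
```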
@@ -465,7 +451,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(kernel, num_paths_limit);
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@@ -524,26 +511,11 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@@ -580,7 +552,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
queue_->enqueue(kernel, work_size, args);
}
}
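The bucket pass plus write pass enqueued above amount to a counting sort over shader keys. A rough CPU analogue (a sketch only, not the GPU kernels): count keys per bucket, prefix-sum the counts into start offsets, then scatter path indices into their bucket slots.

```cpp
#include <vector>

// `keys[i]` is the shader sort key of path i, in [0, num_buckets).
// Returns path indices grouped by key.
static std::vector<int> counting_sort_paths(const std::vector<int> &keys, int num_buckets) {
  std::vector<int> offsets(num_buckets + 1, 0);

  /* Bucket pass: count how many paths fall into each key. */
  for (int key : keys) {
    offsets[key + 1]++;
  }
  /* Exclusive prefix sum turns counts into per-bucket start offsets. */
  for (int b = 1; b <= num_buckets; b++) {
    offsets[b] += offsets[b - 1];
  }
  /* Write pass: scatter each path index into its bucket slot. */
  std::vector<int> sorted(keys.size());
  std::vector<int> cursor(offsets.begin(), offsets.end() - 1);
  for (int i = 0; i < (int)keys.size(); i++) {
    sorted[cursor[keys[i]]++] = i;
  }
  return sorted;
}
```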

View File

@@ -70,7 +70,9 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@@ -133,7 +135,6 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

View File

@@ -886,7 +886,7 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
{
/* Special trick for fast navigation: schedule multiple samples during fast navigation
* (which will prefer to use lower resolution to keep up with refresh rate). This gives more
* usable visual feedback for artists. */
* usable visual feedback for artists. There are a couple of tricks though. */
if (is_denoise_active_during_update()) {
/* When denoising is used during navigation prefer using a higher resolution with less samples
@@ -896,12 +896,25 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
return 1;
}
/* Schedule samples equal to the resolution divider up to a maximum of 4.
* The idea is to have enough information on the screen by increasing the sample count as the
* resolution is decreased. */
/* NOTE: Changing this formula will change the formula in
* `RenderScheduler::calculate_resolution_divider_for_time()`. */
return min(max(1, resolution_divider / pixel_size_), 4);
if (resolution_divider <= pixel_size_) {
/* When the resolution divider is at or below the pixel size, schedule one sample. This doesn't
* affect the sample count at this resolution division, but instead assists in the calculation of
* the resolution divider. */
return 1;
}
if (resolution_divider == pixel_size_ * 2) {
/* When the resolution divider is one step above the final resolution, schedule two samples.
* This is so that rendering at the lower resolution does not exceed the time it takes to render
* the first sample at the full resolution. */
return 2;
}
/* Always render 4 samples, even if the scene is configured for less.
* The idea here is to have enough information on the screen. A resolution divider of 2 allows us
* to have 4 times as many samples, so the overall worst-case timing is the same as the final
* resolution at one sample. */
return 4;
}
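Both versions of this function express the same schedule: more samples as the resolution divider grows, capped at 4. A small sketch of the two variants side by side (standalone helpers, assuming `pixel_size` divides the divider as in the code above):

```cpp
#include <algorithm>

/* Clamped formula variant: divider 1 -> 1 sample, 2 -> 2, 4 and above -> 4. */
static int navigation_samples_formula(int resolution_divider, int pixel_size) {
  return std::min(std::max(1, resolution_divider / pixel_size), 4);
}

/* Explicit-case variant from the other side of the diff. */
static int navigation_samples_cases(int resolution_divider, int pixel_size) {
  if (resolution_divider <= pixel_size) {
    return 1;
  }
  if (resolution_divider == pixel_size * 2) {
    return 2;
  }
  return 4;
}
```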
bool RenderScheduler::work_need_adaptive_filter() const
@@ -1087,10 +1100,9 @@ void RenderScheduler::update_start_resolution_divider()
/* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
* render time is somewhere on a boundary between two resolutions. */
/* Don't let resolution drop below the desired one. It's better to be slow than provide an
* unreadable viewport render. */
start_resolution_divider_ = min(resolution_divider_for_update,
default_start_resolution_divider_);
/* Never increase resolution to higher than the pixel size (which is possible if the scene is
* simple and compute device is fast). */
start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
VLOG_WORK << "Calculated resolution divider is " << start_resolution_divider_;
}
@@ -1175,24 +1187,24 @@ void RenderScheduler::check_time_limit_reached()
int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
{
const double ratio_between_times = actual_time / desired_time;
/* TODO(sergey): There should be a non-iterative analytical formula here. */
/* We can pass `ratio_between_times` to `get_num_samples_during_navigation()` to get our
* navigation samples because the equation for calculating the resolution divider is as follows:
* `actual_time / desired_time = sqr(resolution_divider) / sample_count`.
* While `resolution_divider` is less than or equal to 4, `resolution_divider = sample_count`
* (This relationship is determined in `get_num_samples_during_navigation()`). With some
* substitution we end up with `actual_time / desired_time = resolution_divider` while the
* resolution divider is less than or equal to 4. Once the resolution divider increases above 4,
* the relationship of `actual_time / desired_time = resolution_divider` is no longer true,
* however the sample count retrieved from `get_num_samples_during_navigation()` is still
* accurate if we continue using this assumption. Note that the interaction between `pixel_size`,
* sample count, and resolution divider is automatically accounted for, which is why `pixel_size`
* isn't included in any of the equations. */
const int navigation_samples = get_num_samples_during_navigation(
ceil_to_int(ratio_between_times));
int resolution_divider = 1;
return ceil_to_int(sqrt(navigation_samples * ratio_between_times));
/* This algorithm iterates through resolution dividers until a divider is found that achieves
* the desired render time. A limit of default_start_resolution_divider_ is put in place as the
* maximum resolution divider to avoid an unreadable viewport due to a low resolution.
* pre_resolution_division_samples and post_resolution_division_samples are used in this
* calculation to better predict the performance impact of changing resolution divisions as
* the sample count can also change between resolution divisions. */
while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
resolution_divider = resolution_divider * 2;
int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
}
return resolution_divider;
}
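Both sides of this hunk rely on the relation `actual_time / desired_time ≈ resolution_divider² / samples`: one solves it analytically from the navigation sample count, the other doubles the divider until the predicted time fits. A hedged sketch of the two computations (standalone helpers, not the RenderScheduler members):

```cpp
#include <cmath>

/* Analytic variant: solve actual/desired = divider^2 / samples for the divider. */
static int divider_from_time_ratio(double actual_time, double desired_time, int navigation_samples) {
  const double ratio = actual_time / desired_time;
  return (int)std::ceil(std::sqrt(navigation_samples * ratio));
}

/* Iterative variant: double the divider until the predicted time fits, accounting for
 * the sample count changing between resolution steps (mirrors the loop above). */
static int divider_by_doubling(double actual_time, double desired_time,
                               int max_divider, int (*samples_at)(int divider)) {
  int divider = 1;
  while (actual_time > desired_time && divider < max_divider) {
    const int samples_before = samples_at(divider);
    divider *= 2;
    const int samples_after = samples_at(divider);
    /* Halving resolution in each dimension is roughly 4x faster per sample. */
    actual_time /= 4.0 * samples_before / samples_after;
  }
  return divider;
}
```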
int calculate_resolution_divider_for_resolution(int width, int height, int resolution)

View File

@@ -412,12 +412,11 @@ if(WITH_CYCLES_CUDA_BINARIES)
# warn for other versions
if((CUDA_VERSION STREQUAL "101") OR
(CUDA_VERSION STREQUAL "102") OR
(CUDA_VERSION_MAJOR STREQUAL "11") OR
(CUDA_VERSION_MAJOR STREQUAL "12"))
(CUDA_VERSION_MAJOR STREQUAL "11"))
else()
message(WARNING
"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
"build may succeed but only CUDA 12, 11, 10.2 and 10.1 have been tested")
"build may succeed but only CUDA 11, 10.2 and 10.1 have been tested")
endif()
# build for each arch
@@ -515,16 +514,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
else()
message(STATUS "CUDA binaries for ${arch} require CUDA 10 or earlier, skipped.")
endif()
elseif(${arch} MATCHES ".*_3.")
if(DEFINED CUDA11_NVCC_EXECUTABLE)
set(cuda_nvcc_executable ${CUDA11_NVCC_EXECUTABLE})
set(cuda_toolkit_root_dir ${CUDA11_TOOLKIT_ROOT_DIR})
elseif("${CUDA_VERSION}" LESS 120) # Support for sm_35, sm_37 was removed in CUDA 12
set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
set(cuda_toolkit_root_dir ${CUDA_TOOLKIT_ROOT_DIR})
else()
message(STATUS "CUDA binaries for ${arch} require CUDA 11 or earlier, skipped.")
endif()
elseif(${arch} MATCHES ".*_7." AND "${CUDA_VERSION}" LESS 100)
message(STATUS "CUDA binaries for ${arch} require CUDA 10.0+, skipped.")
elseif(${arch} MATCHES ".*_8.")

View File

@@ -170,7 +170,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
label = bsdf_microfacet_ggx_sample(
sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
kg, sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
break;
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
@@ -185,7 +185,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
label = bsdf_microfacet_beckmann_sample(
sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
kg, sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
break;
case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
label = bsdf_ashikhmin_shirley_sample(
@@ -661,38 +661,4 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to
* be below 1. The point of this function is to return a best-effort estimation of their albedo,
* meaning the amount of reflected/refracted light that would be expected when illuminated by a
* uniform white background.
* This is used for the denoising albedo pass and diffuse/glossy/transmission color passes.
* NOTE: This should always match the sample_weight of the closure - as in, if there's an albedo
* adjustment in here, the sample_weight should also be reduced accordingly.
* TODO(lukas): Consider calling this function to determine the sample_weight? Would be a bit of
* extra overhead though. */
#if defined(__SVM__) || defined(__OSL__)
switch (sc->type) {
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
albedo *= microfacet_fresnel((ccl_private const MicrofacetBsdf *)sc, sd->wi, sc->N);
break;
case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
break;
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
albedo *= bsdf_principled_hair_albedo(sd, sc);
break;
default:
break;
}
#endif
return albedo;
}
CCL_NAMESPACE_END

View File

@@ -41,6 +41,11 @@ static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledHairBSDF),
static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledHairExtra),
"PrincipledHairExtra is too large!");
ccl_device_inline float cos_from_sin(const float s)
{
return safe_sqrtf(1.0f - s * s);
}
/* Gives the change in direction in the normal plane for the given angles and p-th-order
* scattering. */
ccl_device_inline float delta_phi(int p, float gamma_o, float gamma_t)
@@ -478,18 +483,10 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}
ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
{
ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
const float cos_theta_o = cos_from_sin(dot(sd->wi, safe_normalize(sd->dPdu)));
const float cos_gamma_o = cos_from_sin(bsdf->extra->geom.w);
const float f = fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta);
const float roughness_scale = bsdf_principled_hair_albedo_roughness_scale(bsdf->v);
/* TODO(lukas): Adding the Fresnel term here as a workaround until the proper refactor. */
return exp(-sqrt(bsdf->sigma) * roughness_scale) + make_spectrum(f);
return exp(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
}
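The two return statements above differ only in whether a dielectric Fresnel term is added to the hair albedo. Summarized as an equation (with $\sigma$ the absorption coefficient, $v$ the roughness and $s(v)$ the roughness-scale polynomial above):

```latex
A_{\text{hair}} = \exp\!\left(-\sqrt{\sigma}\, s(v)\right) + F,
\qquad
F = F_{\text{dielectric}}\!\left(\cos\theta_o \cos\gamma_o,\ \eta\right)
```

The variant without the Fresnel workaround simply drops the $F$ term.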
ccl_device_inline Spectrum

View File

@@ -23,6 +23,8 @@ enum MicrofacetType {
typedef struct MicrofacetExtra {
Spectrum color, cspec0;
Spectrum fresnel_color;
float clearcoat;
} MicrofacetExtra;
typedef struct MicrofacetBsdf {
@@ -35,99 +37,190 @@ typedef struct MicrofacetBsdf {
static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf is too large!");
/* Beckmann VNDF importance sampling algorithm from:
* Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
* Eric Heitz and Eugene d'Eon, EGSR 2014.
* https://hal.inria.fr/hal-00996995v2/document */
/* Beckmann and GGX microfacet importance sampling. */
ccl_device_forceinline float3 microfacet_beckmann_sample_vndf(const float3 wi,
const float alpha_x,
const float alpha_y,
const float randu,
const float randv)
ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals kg,
const float cos_theta_i,
const float sin_theta_i,
float randu,
float randv,
ccl_private float *slope_x,
ccl_private float *slope_y,
ccl_private float *G1i)
{
/* Special case (normal incidence). */
if (cos_theta_i >= 0.99999f) {
const float r = sqrtf(-logf(randu));
const float phi = M_2PI_F * randv;
*slope_x = r * cosf(phi);
*slope_y = r * sinf(phi);
*G1i = 1.0f;
return;
}
/* Precomputations. */
const float tan_theta_i = sin_theta_i / cos_theta_i;
const float inv_a = tan_theta_i;
const float cot_theta_i = 1.0f / tan_theta_i;
const float erf_a = fast_erff(cot_theta_i);
const float exp_a2 = expf(-cot_theta_i * cot_theta_i);
const float SQRT_PI_INV = 0.56418958354f;
const float Lambda = 0.5f * (erf_a - 1.0f) + (0.5f * SQRT_PI_INV) * (exp_a2 * inv_a);
const float G1 = 1.0f / (1.0f + Lambda); /* masking */
*G1i = G1;
/* Based on the paper by Wenzel Jakob:
* An Improved Visible Normal Sampling Routine for the Beckmann Distribution
*
* http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
*
* Reformulation from OpenShadingLanguage which avoids using inverse
* trigonometric functions.
*/
/* Sample slope X.
*
* Compute a coarse approximation using the approximation:
* exp(-ierf(x)^2) ~= 1 - x * x
* solve y = 1 + b + K * (1 - b * b)
*/
const float K = tan_theta_i * SQRT_PI_INV;
const float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
const float y_exact = randu * (1.0f + erf_a + K * exp_a2);
float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
float inv_erf = fast_ierff(b);
float2 begin = make_float2(-1.0f, -y_exact);
float2 end = make_float2(erf_a, 1.0f + erf_a + K * exp_a2 - y_exact);
float2 current = make_float2(b, 1.0f + b + K * expf(-sqr(inv_erf)) - y_exact);
/* Find the root in a monotonic interval using Newton's method, with the given precision and
* maximum number of iterations. Falls back to bisection if the Newton step produces a result
* outside of the valid interval. */
const float precision = 1e-6f;
const int max_iter = 3;
int iter = 0;
while (fabsf(current.y) > precision && iter++ < max_iter) {
if (signf(begin.y) == signf(current.y)) {
begin.x = current.x;
begin.y = current.y;
}
else {
end.x = current.x;
}
const float newton_x = current.x - current.y / (1.0f - inv_erf * tan_theta_i);
current.x = (newton_x >= begin.x && newton_x <= end.x) ? newton_x : 0.5f * (begin.x + end.x);
inv_erf = fast_ierff(current.x);
current.y = 1.0f + current.x + K * expf(-sqr(inv_erf)) - y_exact;
}
*slope_x = inv_erf;
*slope_y = fast_ierff(2.0f * randv - 1.0f);
}
/* GGX microfacet importance sampling from:
*
* Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
* E. Heitz and E. d'Eon, EGSR 2014
*/
ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
const float sin_theta_i,
float randu,
float randv,
ccl_private float *slope_x,
ccl_private float *slope_y,
ccl_private float *G1i)
{
/* Special case (normal incidence). */
if (cos_theta_i >= 0.99999f) {
const float r = sqrtf(randu / (1.0f - randu));
const float phi = M_2PI_F * randv;
*slope_x = r * cosf(phi);
*slope_y = r * sinf(phi);
*G1i = 1.0f;
return;
}
/* Precomputations. */
const float tan_theta_i = sin_theta_i / cos_theta_i;
const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i * tan_theta_i));
*G1i = 1.0f / G1_inv;
/* Sample slope_x. */
const float A = 2.0f * randu * G1_inv - 1.0f;
const float AA = A * A;
const float tmp = 1.0f / (AA - 1.0f);
const float B = tan_theta_i;
const float BB = B * B;
const float D = safe_sqrtf(BB * (tmp * tmp) - (AA - BB) * tmp);
const float slope_x_1 = B * tmp - D;
const float slope_x_2 = B * tmp + D;
*slope_x = (A < 0.0f || slope_x_2 * tan_theta_i > 1.0f) ? slope_x_1 : slope_x_2;
/* Sample slope_y. */
float S;
if (randv > 0.5f) {
S = 1.0f;
randv = 2.0f * (randv - 0.5f);
}
else {
S = -1.0f;
randv = 2.0f * (0.5f - randv);
}
const float z = (randv * (randv * (randv * 0.27385f - 0.73369f) + 0.46341f)) /
(randv * (randv * (randv * 0.093073f + 0.309420f) - 1.000000f) + 0.597999f);
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
}
template<MicrofacetType m_type>
ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals kg,
const float3 wi,
const float alpha_x,
const float alpha_y,
const float randu,
const float randv,
ccl_private float *G1i)
{
/* 1. stretch wi */
float3 wi_ = make_float3(alpha_x * wi.x, alpha_y * wi.y, wi.z);
wi_ = normalize(wi_);
/* Compute polar coordinates of wi_. */
float costheta_ = 1.0f;
float sintheta_ = 0.0f;
float cosphi_ = 1.0f;
float sinphi_ = 0.0f;
if (wi_.z < 0.99999f) {
costheta_ = wi_.z;
sintheta_ = safe_sqrtf(1.0f - costheta_ * costheta_);
float invlen = 1.0f / sintheta_;
cosphi_ = wi_.x * invlen;
sinphi_ = wi_.y * invlen;
}
/* 2. sample P22_{wi}(x_slope, y_slope, 1, 1) */
float slope_x, slope_y;
float cos_phi_i = 1.0f;
float sin_phi_i = 0.0f;
if (wi_.z >= 0.99999f) {
/* Special case (normal incidence). */
const float r = sqrtf(-logf(randu));
const float phi = M_2PI_F * randv;
slope_x = r * cosf(phi);
slope_y = r * sinf(phi);
if (m_type == MicrofacetType::BECKMANN) {
microfacet_beckmann_sample_slopes(
kg, costheta_, sintheta_, randu, randv, &slope_x, &slope_y, G1i);
}
else {
/* Precomputations. */
const float cos_theta_i = wi_.z;
const float sin_theta_i = sin_from_cos(cos_theta_i);
const float tan_theta_i = sin_theta_i / cos_theta_i;
const float cot_theta_i = 1.0f / tan_theta_i;
const float erf_a = fast_erff(cot_theta_i);
const float exp_a2 = expf(-cot_theta_i * cot_theta_i);
const float SQRT_PI_INV = 0.56418958354f;
float invlen = 1.0f / sin_theta_i;
cos_phi_i = wi_.x * invlen;
sin_phi_i = wi_.y * invlen;
/* Based on the paper by Wenzel Jakob:
* An Improved Visible Normal Sampling Routine for the Beckmann Distribution
*
* http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
*
* Reformulation from OpenShadingLanguage which avoids using inverse
* trigonometric functions.
*/
/* Sample slope X.
*
* Compute a coarse approximation using the approximation:
* exp(-ierf(x)^2) ~= 1 - x * x
* solve y = 1 + b + K * (1 - b * b)
*/
const float K = tan_theta_i * SQRT_PI_INV;
const float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
const float y_exact = randu * (1.0f + erf_a + K * exp_a2);
float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
float inv_erf = fast_ierff(b);
float2 begin = make_float2(-1.0f, -y_exact);
float2 end = make_float2(erf_a, 1.0f + erf_a + K * exp_a2 - y_exact);
float2 current = make_float2(b, 1.0f + b + K * expf(-sqr(inv_erf)) - y_exact);
/* Find the root in a monotonic interval using Newton's method, with the given precision and
* maximum number of iterations. Falls back to bisection if the Newton step produces a result
* outside of the valid interval. */
const float precision = 1e-6f;
const int max_iter = 3;
int iter = 0;
while (fabsf(current.y) > precision && iter++ < max_iter) {
if (signf(begin.y) == signf(current.y)) {
begin.x = current.x;
begin.y = current.y;
}
else {
end.x = current.x;
}
const float newton_x = current.x - current.y / (1.0f - inv_erf * tan_theta_i);
current.x = (newton_x >= begin.x && newton_x <= end.x) ? newton_x : 0.5f * (begin.x + end.x);
inv_erf = fast_ierff(current.x);
current.y = 1.0f + current.x + K * expf(-sqr(inv_erf)) - y_exact;
}
slope_x = inv_erf;
slope_y = fast_ierff(2.0f * randv - 1.0f);
microfacet_ggx_sample_slopes(costheta_, sintheta_, randu, randv, &slope_x, &slope_y, G1i);
}
/* 3. rotate */
float tmp = cos_phi_i * slope_x - sin_phi_i * slope_y;
slope_y = sin_phi_i * slope_x + cos_phi_i * slope_y;
float tmp = cosphi_ * slope_x - sinphi_ * slope_y;
slope_y = sinphi_ * slope_x + cosphi_ * slope_y;
slope_x = tmp;
/* 4. unstretch */
@@ -138,43 +231,6 @@ ccl_device_forceinline float3 microfacet_beckmann_sample_vndf(const float3 wi,
return normalize(make_float3(-slope_x, -slope_y, 1.0f));
}
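The stretched sampler above follows the usual slope-space visible-normal recipe. As a compact summary of its four numbered steps (with $\omega_i$ the incoming direction, $\alpha_x, \alpha_y$ the roughnesses and $\phi_i$ the azimuth of the stretched direction):

```latex
\begin{aligned}
&\text{1. stretch:}   && \omega_i' = \mathrm{normalize}\big(\alpha_x \omega_{i,x},\ \alpha_y \omega_{i,y},\ \omega_{i,z}\big)\\
&\text{2. sample:}    && (x_m, y_m) \sim P_{22}\big(\,\cdot \mid \omega_i',\ \alpha = 1\big)\\
&\text{3. rotate:}    && (x_m, y_m) \leftarrow \big(\cos\phi_i\, x_m - \sin\phi_i\, y_m,\ \ \sin\phi_i\, x_m + \cos\phi_i\, y_m\big)\\
&\text{4. unstretch:} && (x_m, y_m) \leftarrow (\alpha_x x_m,\ \alpha_y y_m)
\end{aligned}
```

The sampled microfacet normal is then $m = \mathrm{normalize}(-x_m, -y_m, 1)$, matching the return statement above.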
/* GGX VNDF importance sampling algorithm from:
* Sampling the GGX Distribution of Visible Normals.
* Eric Heitz, JCGT Vol. 7, No. 4, 2018.
* https://jcgt.org/published/0007/04/01/ */
ccl_device_forceinline float3 microfacet_ggx_sample_vndf(const float3 wi,
const float alpha_x,
const float alpha_y,
const float randu,
const float randv)
{
/* Section 3.2: Transforming the view direction to the hemisphere configuration. */
float3 wi_ = normalize(make_float3(alpha_x * wi.x, alpha_y * wi.y, wi.z));
/* Section 4.1: Orthonormal basis. */
float lensq = sqr(wi_.x) + sqr(wi_.y);
float3 T1, T2;
if (lensq > 1e-7f) {
T1 = make_float3(-wi_.y, wi_.x, 0.0f) * inversesqrtf(lensq);
T2 = cross(wi_, T1);
}
else {
/* Normal incidence, any basis is fine. */
T1 = make_float3(1.0f, 0.0f, 0.0f);
T2 = make_float3(0.0f, 1.0f, 0.0f);
}
/* Section 4.2: Parameterization of the projected area. */
float2 t = concentric_sample_disk(randu, randv);
t.y = mix(safe_sqrtf(1.0f - sqr(t.x)), t.y, 0.5f * (1.0f + wi_.z));
/* Section 4.3: Reprojection onto hemisphere. */
float3 H_ = t.x * T1 + t.y * T2 + safe_sqrtf(1.0f - len_squared(t)) * wi_;
/* Section 3.4: Transforming the normal back to the ellipsoid configuration. */
return normalize(make_float3(alpha_x * H_.x, alpha_y * H_.y, max(0.0f, H_.z)));
}
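For reference, the published GGX VNDF sampler that the code above cites (Heitz, JCGT 2018) is short enough to restate as a standalone sketch. This is not the Cycles helper itself; the small vector type and helpers are purely illustrative, and `wi` is assumed to be in the local frame with +Z as the surface normal.

```cpp
#include <algorithm>
#include <cmath>

struct Vec3 { float x, y, z; };

static Vec3 normalize3(Vec3 v) {
  const float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
  return {v.x / len, v.y / len, v.z / len};
}
static Vec3 cross3(Vec3 a, Vec3 b) {
  return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
}

// GGX VNDF sampling (Heitz 2018). u1 and u2 are uniform random numbers in [0, 1).
static Vec3 sample_ggx_vndf(Vec3 wi, float alpha_x, float alpha_y, float u1, float u2) {
  const float kPi = 3.14159265358979323846f;

  /* 1. Transform the view direction to the hemisphere configuration. */
  Vec3 wi_h = normalize3({alpha_x * wi.x, alpha_y * wi.y, wi.z});

  /* 2. Orthonormal basis around wi_h (any basis works at normal incidence). */
  const float lensq = wi_h.x * wi_h.x + wi_h.y * wi_h.y;
  Vec3 T1 = lensq > 1e-7f ? Vec3{-wi_h.y / std::sqrt(lensq), wi_h.x / std::sqrt(lensq), 0.0f}
                          : Vec3{1.0f, 0.0f, 0.0f};
  Vec3 T2 = cross3(wi_h, T1);

  /* 3. Sample a point on a disk, warped toward the projected hemisphere. */
  const float r = std::sqrt(u1);
  const float phi = 2.0f * kPi * u2;
  float t1 = r * std::cos(phi);
  float t2 = r * std::sin(phi);
  const float s = 0.5f * (1.0f + wi_h.z);
  t2 = (1.0f - s) * std::sqrt(std::max(0.0f, 1.0f - t1 * t1)) + s * t2;

  /* 4. Reproject onto the hemisphere and transform back to the ellipsoid. */
  const float t3 = std::sqrt(std::max(0.0f, 1.0f - t1 * t1 - t2 * t2));
  Vec3 Nh = {t1 * T1.x + t2 * T2.x + t3 * wi_h.x,
             t1 * T1.y + t2 * T2.y + t3 * wi_h.y,
             t1 * T1.z + t2 * T2.z + t3 * wi_h.z};
  return normalize3({alpha_x * Nh.x, alpha_y * Nh.y, std::max(0.0f, Nh.z)});
}
```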
/* Calculate the reflection color
*
* If fresnel is used, the color is an interpolation of the F0 color and white
@@ -182,25 +238,26 @@ ccl_device_forceinline float3 microfacet_ggx_sample_vndf(const float3 wi,
*
* Else it is simply white
*/
ccl_device_forceinline Spectrum microfacet_fresnel(ccl_private const MicrofacetBsdf *bsdf,
float3 wi,
float3 H)
ccl_device_forceinline Spectrum reflection_color(ccl_private const MicrofacetBsdf *bsdf,
float3 L,
float3 H)
{
if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type)) {
return interpolate_fresnel_color(wi, H, bsdf->ior, bsdf->extra->cspec0);
}
else if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
return make_spectrum(fresnel_dielectric_cos(dot(wi, H), bsdf->ior));
}
else {
return one_spectrum();
}
}
Spectrum F = one_spectrum();
ccl_device_forceinline void bsdf_microfacet_adjust_weight(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
bsdf->sample_weight *= average(microfacet_fresnel(bsdf, sd->wi, bsdf->N));
bool use_clearcoat = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || use_clearcoat);
if (use_fresnel) {
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
}
if (use_clearcoat) {
F *= 0.25f * bsdf->extra->clearcoat;
}
return F;
}
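Both variants of this helper compute the same quantity: a blend between the F0 (`cspec0`) color and white driven by the dielectric Fresnel at the half-vector, with the clearcoat case reduced to a plain dielectric Fresnel (scaled by 0.25 times the clearcoat weight in one of the variants). Roughly, with $F_d$ the dielectric Fresnel and $F_0 = F_d(1, \eta)$:

```latex
F(\omega, h) = \operatorname{lerp}\!\left(c_{\text{spec0}},\ \mathbf{1},\
\frac{F_d(\omega \cdot h,\ \eta) - F_0}{1 - F_0}\right),
\qquad
F_{\text{clearcoat}} = 0.25\, w_{\text{clearcoat}}\, F_d(\omega \cdot h,\ \eta)
```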
/* Generalized Trowbridge-Reitz for clearcoat. */
@@ -214,48 +271,37 @@ ccl_device_forceinline float bsdf_clearcoat_D(float alpha2, float cos_NH)
return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
}
/* Smith shadowing-masking term, here in the non-separable form.
* For details, see:
* Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs.
* Eric Heitz, JCGT Vol. 3, No. 2, 2014.
* https://jcgt.org/published/0003/02/03/ */
/* Monodirectional shadowing-masking term. */
template<MicrofacetType m_type>
ccl_device_inline float bsdf_lambda_from_sqr_alpha_tan_n(float sqr_alpha_tan_n)
ccl_device_inline float bsdf_G1_from_sqr_alpha_tan_n(float sqr_alpha_tan_n)
{
if (m_type == MicrofacetType::GGX) {
/* Equation 72. */
return 0.5f * (sqrtf(1.0f + sqr_alpha_tan_n) - 1.0f);
return 2.0f / (1.0f + sqrtf(1.0f + sqr_alpha_tan_n));
}
else {
/* m_type == MicrofacetType::BECKMANN
* Approximation from below Equation 69. */
if (sqr_alpha_tan_n < 0.39f) {
/* Equivalent to a >= 1.6f, but also handles sqr_alpha_tan_n == 0.0f cleanly. */
return 0.0f;
}
/* m_type == MicrofacetType::BECKMANN */
const float a = inversesqrtf(sqr_alpha_tan_n);
return ((0.396f * a - 1.259f) * a + 1.0f) / ((2.181f * a + 3.535f) * a);
return (a > 1.6f) ? 1.0f : ((2.181f * a + 3.535f) * a) / ((2.577f * a + 2.276f) * a + 1.0f);
}
}
template<MicrofacetType m_type> ccl_device_inline float bsdf_lambda(float alpha2, float cos_N)
template<MicrofacetType m_type> ccl_device_inline float bsdf_G1(float alpha2, float cos_N)
{
return bsdf_lambda_from_sqr_alpha_tan_n<m_type>(alpha2 * fmaxf(1.0f / sqr(cos_N) - 1.0f, 0.0f));
return bsdf_G1_from_sqr_alpha_tan_n<m_type>(alpha2 * fmaxf(1.0f / (cos_N * cos_N) - 1.0f, 0.0f));
}
template<MicrofacetType m_type>
ccl_device_inline float bsdf_aniso_lambda(float alpha_x, float alpha_y, float3 V)
ccl_device_inline float bsdf_aniso_G1(float alpha_x, float alpha_y, float3 V)
{
const float sqr_alpha_tan_n = (sqr(alpha_x * V.x) + sqr(alpha_y * V.y)) / sqr(V.z);
return bsdf_lambda_from_sqr_alpha_tan_n<m_type>(sqr_alpha_tan_n);
return bsdf_G1_from_sqr_alpha_tan_n<m_type>((sqr(alpha_x * V.x) + sqr(alpha_y * V.y)) /
sqr(V.z));
}
/* Combined shadowing-masking term. */
/* Smith's separable shadowing-masking term. */
template<MicrofacetType m_type>
ccl_device_inline float bsdf_G(float alpha2, float cos_NI, float cos_NO)
{
return 1.0f / (1.0f + bsdf_lambda<m_type>(alpha2, cos_NI) + bsdf_lambda<m_type>(alpha2, cos_NO));
return bsdf_G1<m_type>(alpha2, cos_NI) * bsdf_G1<m_type>(alpha2, cos_NO);
}
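The hunk above switches between Smith's separable masking-shadowing form and the height-correlated form built from the $\Lambda$ function. For reference (Heitz 2014), with $a^2 = \alpha^2 \tan^2\theta$ in the isotropic case:

```latex
\Lambda_{\text{GGX}} = \frac{-1 + \sqrt{1 + a^2}}{2},
\qquad
G_1 = \frac{1}{1 + \Lambda},
\qquad
G_{\text{separable}} = G_1(\omega_i)\, G_1(\omega_o),
\qquad
G_{\text{height-corr}} = \frac{1}{1 + \Lambda(\omega_i) + \Lambda(\omega_o)}
```

The Beckmann branch replaces the exact $\Lambda$ with the rational approximation shown in the code.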
/* Normal distribution function. */
@@ -289,6 +335,22 @@ ccl_device_inline float bsdf_aniso_D(float alpha_x, float alpha_y, float3 H)
}
}
ccl_device_forceinline void bsdf_microfacet_fresnel_color(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
kernel_assert(CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type));
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
bsdf->extra->fresnel_color = interpolate_fresnel_color(
sd->wi, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0);
if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
bsdf->extra->fresnel_color *= 0.25f * bsdf->extra->clearcoat;
}
bsdf->sample_weight *= average(bsdf->extra->fresnel_color);
}
template<MicrofacetType m_type>
ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
const float3 Ng,
@@ -320,7 +382,7 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
H *= inv_len_H;
const float cos_NH = dot(N, H);
float D, lambdaI, lambdaO;
float D, G1i, G1o;
/* TODO: add support for anisotropic transmission. */
if (alpha_x == alpha_y || m_refractive) { /* Isotropic. */
@@ -337,8 +399,8 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
D = bsdf_D<m_type>(alpha2, cos_NH);
}
lambdaI = bsdf_lambda<m_type>(alpha2, cos_NI);
lambdaO = bsdf_lambda<m_type>(alpha2, cos_NO);
G1i = bsdf_G1<m_type>(alpha2, cos_NI);
G1o = bsdf_G1<m_type>(alpha2, cos_NO);
}
else { /* Anisotropic. */
float3 X, Y;
@@ -350,23 +412,25 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
D = bsdf_aniso_D<m_type>(alpha_x, alpha_y, local_H);
lambdaI = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_I);
lambdaO = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_O);
G1i = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_I);
G1o = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_O);
}
const float common = D / cos_NI *
const float common = G1i * D / cos_NI *
(m_refractive ?
sqr(bsdf->ior * inv_len_H) * fabsf(dot(H, wi) * dot(H, wo)) :
0.25f);
*pdf = common / (1.0f + lambdaI);
*pdf = common;
const Spectrum F = microfacet_fresnel(bsdf, wo, H);
return F * common / (1.0f + lambdaO + lambdaI);
const Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, wo, H);
return F * G1o * common;
}
template<MicrofacetType m_type>
ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
ccl_device int bsdf_microfacet_sample(KernelGlobals kg,
ccl_private const ShaderClosure *sc,
float3 Ng,
float3 wi,
float randu,
@@ -402,15 +466,10 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
/* Importance sampling with distribution of visible normals. Vectors are transformed to local
* space before and after sampling. */
float G1i;
const float3 local_I = make_float3(dot(X, wi), dot(Y, wi), cos_NI);
float3 local_H;
if (m_type == MicrofacetType::GGX) {
local_H = microfacet_ggx_sample_vndf(local_I, alpha_x, alpha_y, randu, randv);
}
else {
/* m_type == MicrofacetType::BECKMANN */
local_H = microfacet_beckmann_sample_vndf(local_I, alpha_x, alpha_y, randu, randv);
}
const float3 local_H = microfacet_sample_stretched<m_type>(
kg, local_I, alpha_x, alpha_y, randu, randv, &G1i);
const float3 H = X * local_H.x + Y * local_H.y + N * local_H.z;
const float cos_NH = local_H.z;
@@ -443,12 +502,19 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
label |= LABEL_SINGULAR;
/* Some high number for MIS. */
*pdf = 1e6f;
*eval = make_spectrum(1e6f) * microfacet_fresnel(bsdf, *wo, H);
*eval = make_spectrum(1e6f);
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
if (use_fresnel && !m_refractive) {
*eval *= reflection_color(bsdf, *wo, H);
}
}
else {
label |= LABEL_GLOSSY;
float cos_NO = dot(N, *wo);
float D, lambdaI, lambdaO;
float D, G1o;
/* TODO: add support for anisotropic transmission. */
if (alpha_x == alpha_y || m_refractive) { /* Isotropic. */
@@ -460,32 +526,34 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
/* The masking-shadowing term for clearcoat has a fixed alpha of 0.25
* => alpha2 = 0.25 * 0.25 */
alpha2 = 0.0625f;
/* Recalculate G1i. */
G1i = bsdf_G1<m_type>(alpha2, cos_NI);
}
else {
D = bsdf_D<m_type>(alpha2, cos_NH);
}
lambdaO = bsdf_lambda<m_type>(alpha2, cos_NO);
lambdaI = bsdf_lambda<m_type>(alpha2, cos_NI);
G1o = bsdf_G1<m_type>(alpha2, cos_NO);
}
else { /* Anisotropic. */
const float3 local_O = make_float3(dot(X, *wo), dot(Y, *wo), cos_NO);
D = bsdf_aniso_D<m_type>(alpha_x, alpha_y, local_H);
lambdaO = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_O);
lambdaI = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_I);
G1o = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_O);
}
const float cos_HO = dot(H, *wo);
const float common = D / cos_NI *
const float common = G1i * D / cos_NI *
(m_refractive ? fabsf(cos_HI * cos_HO) / sqr(cos_HO + cos_HI / m_eta) :
0.25f);
*pdf = common / (1.0f + lambdaI);
*pdf = common;
Spectrum F = microfacet_fresnel(bsdf, *wo, H);
*eval = F * common / (1.0f + lambdaI + lambdaO);
Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, *wo, H);
*eval = G1o * common * F;
}
*sampled_roughness = make_float2(alpha_x, alpha_y);
@@ -519,6 +587,14 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
@@ -529,7 +605,7 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
bsdf_microfacet_adjust_weight(sd, bsdf);
bsdf_microfacet_fresnel_color(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@@ -537,12 +613,14 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
ccl_device int bsdf_microfacet_ggx_clearcoat_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
bsdf->alpha_x = saturatef(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bsdf_microfacet_adjust_weight(sd, bsdf);
bsdf_microfacet_fresnel_color(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@@ -576,7 +654,8 @@ ccl_device Spectrum bsdf_microfacet_ggx_eval(ccl_private const ShaderClosure *sc
return bsdf_microfacet_eval<MicrofacetType::GGX>(sc, Ng, wi, wo, pdf);
}
ccl_device int bsdf_microfacet_ggx_sample(ccl_private const ShaderClosure *sc,
ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
ccl_private const ShaderClosure *sc,
float3 Ng,
float3 wi,
float randu,
@@ -588,7 +667,7 @@ ccl_device int bsdf_microfacet_ggx_sample(ccl_private const ShaderClosure *sc,
ccl_private float *eta)
{
return bsdf_microfacet_sample<MicrofacetType::GGX>(
sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
kg, sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
}
/* Beckmann microfacet with Smith shadow-masking from:
@@ -605,6 +684,14 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);
@@ -631,7 +718,8 @@ ccl_device Spectrum bsdf_microfacet_beckmann_eval(ccl_private const ShaderClosur
return bsdf_microfacet_eval<MicrofacetType::BECKMANN>(sc, Ng, wi, wo, pdf);
}
- ccl_device int bsdf_microfacet_beckmann_sample(ccl_private const ShaderClosure *sc,
+ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
+ ccl_private const ShaderClosure *sc,
float3 Ng,
float3 wi,
float randu,
@@ -643,7 +731,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(ccl_private const ShaderClosure *
ccl_private float *eta)
{
return bsdf_microfacet_sample<MicrofacetType::BECKMANN>(
- sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
+ kg, sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
}
CCL_NAMESPACE_END

View File

@@ -43,7 +43,7 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI,
return make_float2(r * cosf(phi), r * sinf(phi));
}
- const float sinI = sin_from_cos(cosI);
+ const float sinI = safe_sqrtf(1.0f - cosI * cosI);
const float tanI = sinI / cosI;
const float projA = 0.5f * (cosI + 1.0f);
if (projA < 0.0001f)
@@ -401,7 +401,7 @@ ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(ccl_private MicrofacetBsd
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
bsdf_microfacet_adjust_weight(sd, bsdf);
bsdf_microfacet_fresnel_color(sd, bsdf);
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@@ -575,7 +575,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
bsdf_microfacet_adjust_weight(sd, bsdf);
bsdf_microfacet_fresnel_color(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}

View File

@@ -73,8 +73,9 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
eval = make_spectrum(val);
#endif
float F0 = fresnel_dielectric_cos(1.0f, eta);
if (use_fresnel) {
- throughput = interpolate_fresnel_color(wi, wh, eta, cspec0);
+ throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
eval *= throughput;
}
@@ -143,11 +144,11 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
throughput *= color;
}
else if (use_fresnel && order > 0) {
- throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
+ throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel && order > 0) {
- throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
+ throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
@@ -191,6 +192,8 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
float G1_r = 0.0f;
bool outside = true;
float F0 = fresnel_dielectric_cos(1.0f, eta);
int order;
for (order = 0; order < 10; order++) {
/* Sample microfacet height. */
@@ -226,12 +229,22 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
throughput *= color;
}
else {
- throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
+ Spectrum t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+ if (order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
}
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel) {
- throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
+ Spectrum t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+ if (order == 0)
+ throughput = t_color;
+ else
+ throughput *= t_color;
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
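As a side note on the `order == 0` special case above: on the first bounce the Fresnel tint replaces the running throughput instead of multiplying into it, presumably so the value the walk was seeded with is not combined with the first-bounce tint twice; later bounces accumulate multiplicatively. A minimal standalone sketch, where `fresnel_tint` is a made-up placeholder for `interpolate_fresnel_color`:
```
#include <cstdio>

/* Placeholder per-bounce Fresnel factor; the kernel derives this from the
 * previous direction, the sampled microfacet normal, eta, F0 and cspec0. */
static float fresnel_tint(int order)
{
  return order == 0 ? 0.9f : 0.95f;
}

int main()
{
  float throughput = 1.0f; /* Seed value from the caller. */
  for (int order = 0; order < 3; order++) {
    const float t_color = fresnel_tint(order);
    if (order == 0)
      throughput = t_color; /* Replace on the first bounce. */
    else
      throughput *= t_color; /* Accumulate afterwards. */
  }
  std::printf("throughput after 3 bounces = %f\n", throughput);
  return 0;
}
```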

View File

@@ -89,21 +89,19 @@ ccl_device float schlick_fresnel(float u)
return m2 * m2 * m; // pow(m, 5)
}
- /* Calculate the fresnel color, which is a blend between white and the F0 color */
- ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
- float3 H,
- float ior,
- Spectrum F0)
+ /* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
+ ccl_device_forceinline Spectrum
+ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, Spectrum cspec0)
{
- /* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
- * The reason why we use this remapping instead of directly doing the
- * Schlick approximation lerp(F0, 1.0, (1.0-cosLH)^5) is that for cases
- * with similar IORs (e.g. ice in water), the relative IOR can be close
- * enough to 1.0 that the Schlick approximation becomes inaccurate. */
- float real_F = fresnel_dielectric_cos(dot(L, H), ior);
- float real_F0 = fresnel_dielectric_cos(1.0f, ior);
+ /* Calculate the fresnel interpolation factor
+ * The value from fresnel_dielectric_cos(...) has to be normalized because
+ * the cspec0 keeps the F0 color
+ */
+ float F0_norm = 1.0f / (1.0f - F0);
+ float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
- return mix(F0, one_spectrum(), inverse_lerp(real_F0, 1.0f, real_F));
+ /* Blend between white and a specular color with respect to the fresnel */
+ return cspec0 * (1.0f - FH) + make_spectrum(FH);
}
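For clarity, a standalone sketch of the FH-based variant above: the dielectric Fresnel value is renormalized from the [F0, 1] range down to [0, 1] and then used to blend the F0 color (`cspec0`) toward white. The `fresnel_dielectric_cos` below is a local re-derivation of the unpolarized dielectric Fresnel for illustration, not a copy of the kernel's helper.
```
#include <cmath>
#include <cstdio>

static float fresnel_dielectric_cos(float cosi, float eta)
{
  /* Unpolarized dielectric Fresnel for cos(theta_i) and relative IOR eta. */
  const float c = std::fabs(cosi);
  float g = eta * eta - 1.0f + c * c;
  if (g <= 0.0f)
    return 1.0f; /* Total internal reflection. */
  g = std::sqrt(g);
  const float A = (g - c) / (g + c);
  const float B = (c * (g + c) - 1.0f) / (c * (g - c) + 1.0f);
  return 0.5f * A * A * (1.0f + B * B);
}

int main()
{
  const float ior = 1.45f;
  const float cos_LH = 0.6f;

  const float F0 = fresnel_dielectric_cos(1.0f, ior);  /* Fresnel at normal incidence. */
  const float F = fresnel_dielectric_cos(cos_LH, ior); /* Fresnel at the actual angle. */
  const float FH = (F - F0) / (1.0f - F0);             /* Remap F0..1 -> 0..1. */

  /* Per-channel blend would be cspec0 * (1 - FH) + FH; here just print FH. */
  std::printf("F0 = %f  F = %f  FH = %f\n", F0, F, FH);
  return 0;
}
```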
ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)

View File

@@ -88,7 +88,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, ccl_priva
}
}
- float sin_theta = sin_from_cos(cos_theta);
+ float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
float phi = M_2PI_F * randv;
float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
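A quick standalone sketch of the direction construction in the two lines above: the sine is recovered from the cosine with a clamped square root (the role played by `safe_sqrtf` / `sin_from_cos`), so rounding error near |cos_theta| = 1 cannot produce a NaN, and the azimuth is spread uniformly over 2*pi.
```
#include <cmath>
#include <cstdio>

struct Vec3 {
  float x, y, z;
};

static float safe_sqrtf(float v)
{
  return std::sqrt(std::fmax(v, 0.0f)); /* Guard against tiny negative inputs. */
}

static Vec3 direction_from_cos_theta(float cos_theta, float randv)
{
  const float kTwoPi = 6.28318530717958647692f;
  const float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
  const float phi = kTwoPi * randv;
  return {sin_theta * std::cos(phi), sin_theta * std::sin(phi), cos_theta};
}

int main()
{
  const Vec3 d = direction_from_cos_theta(0.3f, 0.75f);
  std::printf("dir = (%f, %f, %f), length^2 = %f\n",
              d.x, d.y, d.z, d.x * d.x + d.y * d.y + d.z * d.z);
  return 0;
}
```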

View File

@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
- /* SSE optimization disabled for now on 32 bit, see bug #36316. */
+ /* SSE optimization disabled for now on 32 bit, see bug T36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__

View File

@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
- /* SSE optimization disabled for now on 32 bit, see bug #36316. */
+ /* SSE optimization disabled for now on 32 bit, see bug T36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif

View File

@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
- /* SSE optimization disabled for now on 32 bit, see bug #36316. */
+ /* SSE optimization disabled for now on 32 bit, see bug T36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__

View File

@@ -401,72 +401,6 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
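The `integrator_sort_bucket_pass` / `integrator_sort_write_pass` kernels above implement a per-partition bucket sort of integrator state indices by shader sort key: the bucket pass counts states per key and prefix-sums the counts into `key_offsets`, and the write pass scatters each state index into its bucket. A serial CPU sketch of that data flow (the real kernels use threadgroup atomics behind `__KERNEL_LOCAL_ATOMIC_SORT__`; the toy input below is invented):
```
#include <cstdio>
#include <vector>

int main()
{
  const int max_shaders = 4;
  /* Shader sort key per active integrator state (toy data). */
  const std::vector<int> shader_sort_key = {2, 0, 3, 2, 1, 0, 2};
  const int num_states = int(shader_sort_key.size());

  /* Pass 1: bucket counts -> exclusive prefix sum = key_offsets. */
  std::vector<int> key_offsets(max_shaders + 1, 0);
  for (int state = 0; state < num_states; state++)
    key_offsets[shader_sort_key[state] + 1]++;
  for (int key = 0; key < max_shaders; key++)
    key_offsets[key + 1] += key_offsets[key];

  /* Pass 2: scatter state indices into their buckets. */
  std::vector<int> indices(num_states);
  std::vector<int> cursor = key_offsets;
  for (int state = 0; state < num_states; state++)
    indices[cursor[shader_sort_key[state]]++] = state;

  for (int i = 0; i < num_states; i++)
    std::printf("%d ", indices[i]);
  std::printf("\n"); /* States grouped by shader key: 1 5 4 0 3 6 2 */
  return 0;
}
```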
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,
@@ -645,7 +579,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
const int y,
const half4 half_pixel)
{
- /* Work around HIP issue with half float display, see #92972. */
+ /* Work around HIP issue with half float display, see T92972. */
#ifdef __KERNEL_HIP__
ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
out[0] = half_pixel.x;

Some files were not shown because too many files have changed in this diff.